Doxygen
Loading...
Searching...
No Matches
regex.h
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2021 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16#ifndef FREGEX_H
17#define FREGEX_H
18
19#include <memory>
20#include <string>
21#include <string_view>
22#include <vector>
23#include <iterator>
24
25#include "construct.h"
26
27/** Namespace for the regular expression functions */
28namespace reg
29{
30
31class Match;
32
33/** Class representing a regular expression.
34 *
35 * It has a similar API as `std::regex`,
36 * but is much faster (and also somewhat more limited).
37 */
38class Ex
39{
40 public:
41 /** Matching algorithm */
42 enum class Mode
43 {
44 RegEx, /**< full regular expression. */
45 Wildcard /**< simple globbing pattern. */
46 };
47 /** Creates a regular expression object given the pattern as a string.
48 * Two modes of matching are supported: RegEx and Wildcard
49 *
50 * The following special characters are supported in Mode::RegEx mode.
51 * - `c` matches character `c`
52 * - `.` matches any character
53 * - `^` matches the start of the input
54 * - `$` matches the end of the input
55 * - `<` matches the start of a word
56 * - `>` matches the end of a word
57 * - `[]` matches a set of characters
58 * - `x*` matches a sequence of zero or more `x`'s
59 * - `x+` matches a sequence of one or more `x`'s
60 * - `x?` matches an optional `x`
61 * - `(` matches the start of a capture range
62 * - `)` matches the end of a capture range
63 * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc.
64 * - `\t` matches a tab character
65 * - `\n` matches a newline character
66 * - `\r` matches a return character
67 * - `\s` matches any whitespace as defined by `std::isspace()`
68 * - `\d` matches any digit as defined by `std::isdigit()`
69 * - `\a` matches any alphabetical character, same as `[a-z_A-Z\x80-\xFF]`
70 * - `\w` matches any alphanumeric character, same as `[a-z_A-Z0-9\x80-\xFF]`
71 * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160.
72 *
73 * A character range can be used to match a character that falls inside a range
74 * (or set of ranges).
75 * Within the opening `[` and closing `]` brackets of a character range the following
76 * is supported:
77 * - `^` if at the start of the range, a character matches if it is \e not in the range,
78 * e.g. `[^\d]` matches any character not a digit
79 * - `-` when placed between 2 characters it defines a range from the first character to the second.
80 * Any character that falls in the range will match, e.g. `[0-9]` matches the digits from 0 to 9.
81 * - `\s`, `\d`, `\a`, and `\w` as explained above.
82 *
83 * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
84 * meaning in a character range. `^` only has a special meaning as the first character.
85 *
86 * @note capture ranges can be nested. Quantifiers (`*`, `+`, `?`) on entire capture ranges
87 * are not supported.
88 *
89 * In Wildcard mode `*` is used to match any sequence of zero or more characters.
90 * The character `?` can be used to match an optional character. Character ranges are
91 * also supported, but other characters like `$` and `+` are just treated as
92 * literal characters.
93 *
94 */
95 Ex(std::string_view pattern, Mode mode=Mode::RegEx);
96
97 /** Destroys the regular expression object. Frees resources. */
98 ~Ex();
99
100 /** Check if a given string matches this regular expression.
101 * @param str The input string to match against.
102 * @param match The match object to hold the matching results.
103 * @param pos The position in the string at which to start the match.
104 * @returns true iff a match is found. Details are stored in the match object.
105 */
106 bool match(std::string_view str,Match &match,size_t pos=0) const;
107 bool isValid() const;
108 private:
110
111 class Private;
112 std::unique_ptr<Private> p;
113};
114
115/** Object representing the match results of a capture range. */
117{
118 public:
119 /** Creates a match for a single capture range given a non-owning pointer to the string. */
120 SubMatch(std::string_view str) : m_str(str) {}
121
122 /** Returns the position in the string at which the match starts. */
123 size_t position() const { return m_pos; }
124
125 /** Returns the length of the matching part. */
126 size_t length() const { return m_len; }
127
128 /** Returns the matching part as a string */
129 std::string str() const { return std::string{m_str.substr(m_pos,m_len)}; }
130
131 private:
132 friend class Match;
133 void setStart(size_t pos) { m_pos=pos; }
134 void setEnd(size_t pos) { m_len=pos-m_pos; }
135 void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }
136 size_t m_pos = std::string::npos;
137 size_t m_len = std::string::npos;
138 std::string_view m_str;
139};
140
141/** Object representing the matching results. It consists of an array of
142 * SubMatch objects. The first entry of the array represents the whole match, any
143 * next elements represent each of the capture ranges.
144 *
145 * For example string `@42` and expression `@(\\d+)` will have two
146 * Submatches, match[0] will point to the input string as a whole, and
147 * match[1] will point to the number 42 only.
148 *
149 */
150class Match
151{
152 public:
153 /** Creates an empty match object */
154 Match() {}
155
156 /** Returns the position of the match or std::string::npos if no position is set. */
157 size_t position() const { return m_subMatches[0].position(); }
158
159 /** Returns the length of the match or std::string::npos if no length is set. */
160 size_t length() const { return m_subMatches[0].length(); }
161
162 /** Return a string representing the matching part. */
163 std::string str() const { return std::string{m_subMatches[0].str()}; }
164
165 /** Return the part of the string before the match */
166 SubMatch prefix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; }
167
168 /** Return the part of the string after the match */
170 {
171 SubMatch m(m_str);
172 if (!m_str.empty())
173 {
174 size_t e = position()+length();
175 m.setMatch(e,m_str.length()-e);
176 }
177 return m;
178 }
179
180 /** Returns the number of sub matches available in this match. */
181 size_t size() const { return m_subMatches.size(); }
182
183 /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
184 * representing the whole match.
185 */
186 const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }
187
188 private:
189 friend class Ex;
190 void init(std::string_view str,size_t captureCount)
191 {
192 m_subMatches.clear();
193 m_subMatches.reserve(captureCount+1);
194 for (size_t i=0;i<captureCount+1;i++)
195 {
196 m_subMatches.emplace_back(str);
197 }
198 m_str = str;
199 }
200 void startCapture(size_t groupId,size_t index)
201 {
202 if (groupId < m_subMatches.size())
203 {
204 m_subMatches[groupId].setStart(index);
205 }
206 }
207 void endCapture(size_t groupId,size_t index)
208 {
209 if (groupId < m_subMatches.size())
210 {
211 if (index>m_subMatches[groupId].position())
212 {
213 m_subMatches[groupId].setEnd(index);
214 }
215 }
216 }
217 void setMatch(size_t pos,size_t len)
218 {
219 // Always set the whole match
220 m_subMatches[0].setMatch(pos,len);
221 }
222
223 std::vector<SubMatch> m_subMatches;
224 std::string_view m_str;
225};
226
227/** Class to iterate through matches.
228 */
230{
231 public:
233 using difference_type = std::ptrdiff_t;
236 using iterator_category = std::forward_iterator_tag;
237
238 /** Creates an end-of-sequence iterator */
240
241 /** Creates an iterator for input string \a str, using regular expression \a re to search.
242 * @note the string and regular expression objects should remain valid while iterating.
243 */
244 Iterator(std::string_view str, const Ex &re, size_t pos=0)
245 : m_str(str), m_re(&re), m_pos(pos) { findNext(); }
246
247 // Iterator holds pointers, so prevent temporaries from being passed as the string or
248 // regular expression.
249 Iterator(std::string &&str, const Ex &re) = delete;
250 Iterator(const std::string &str, Ex &&re) = delete;
251 Iterator(std::string &&str, Ex &&re) = delete;
252
253 /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
254 bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }
255
256 /** Returns true if the iterators are not pointing to the same match */
257 bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }
258
259 /** Returns a reference to the current match */
260 const value_type &operator*() const { return m_match; }
261
262 /** Returns a pointer to the current match */
263 const value_type *operator->() const { return &m_match; }
264
265 /** Advances the iterator to the next match. */
266 Iterator &operator++() { findNext(); return *this; }
267
268 private:
269 void findNext()
270 {
271 if (!m_re || m_str.empty()) { m_pos=std::string::npos; return; } // end marker
272 if (m_re->match(m_str,m_match,m_pos))
273 {
274 m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
275 }
276 else // no more matches, make the iterator point to the 'end-of-sequence'
277 {
278 m_pos=std::string::npos;
279 }
280 }
281 std::string_view m_str;
282 const Ex *m_re = nullptr;
283 size_t m_pos = std::string::npos;
285};
286
287/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
288 * Returns true iff a match was found.
289 * Details of what part of the string has matched is returned via the \a match object.
290 *
291 * An example to show how to match all identifiers in a string.
292 * @code
293 * static reg::Ex re(R"(\a\w*)");
294 * std::string str = u8"void(Func是<B_C::Códe42>(42));";
295 * while (reg::search(str,match,re,pos))
296 * {
297 * std::cout << match.str() << std::endl;
298 * pos=match.position()+match.length();
299 * }
300 * @endcode
301 * produces:
302 * @code
303 * void
304 * Func是
305 * B_C
306 * Códe42
307 * @endcode
308 *
309 * @see Ex::Ex() for details on the regular expression patterns.
310 */
311bool search(std::string_view str,Match &match,const Ex &re,size_t pos=0);
312
313/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
314 * Returns true iff a match was found.
315 */
316bool search(std::string_view str,const Ex &re,size_t pos=0);
317
318/** Matches a given string \a str against regular expression \a re.
319 * Returns true iff a match was found for the whole string.
320 * Any capture groups are returned via the \a match object.
321 */
322bool match(std::string_view str,Match &match,const Ex &re);
323
324/** Matches a given string \a str against regular expression \a re.
325 * Returns true iff a match was found for the whole string.
326 */
327bool match(std::string_view str,const Ex &re);
328
329/** Searches a given input string \a str for parts that match regular expression \a re and
330 * replaces those parts by string \a replacement.
331 */
332std::string replace(std::string_view str,const Ex &re,std::string_view replacement);
333
334} // namespace
335
336#endif
Private members of a regular expression.
Definition regex.cpp:170
Class representing a regular expression.
Definition regex.h:39
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
Definition regex.h:112
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:736
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition regex.cpp:724
Mode
Matching algorithm.
Definition regex.h:43
@ RegEx
full regular expression.
Definition regex.h:44
@ Wildcard
simple globbing pattern.
Definition regex.h:45
bool isValid() const
Definition regex.cpp:771
Iterator(std::string &&str, const Ex &re)=delete
const value_type & operator*() const
Returns a reference to the current match.
Definition regex.h:260
value_type & reference
Definition regex.h:235
Iterator(std::string &&str, Ex &&re)=delete
Iterator & operator++()
Advances the iterator to the next match.
Definition regex.h:266
size_t m_pos
Definition regex.h:283
Iterator(const std::string &str, Ex &&re)=delete
std::forward_iterator_tag iterator_category
Definition regex.h:236
std::ptrdiff_t difference_type
Definition regex.h:233
Iterator()
Creates an end-of-sequence iterator.
Definition regex.h:239
Match value_type
Definition regex.h:232
const value_type * operator->() const
Returns a pointer to the current match.
Definition regex.h:263
std::string_view m_str
Definition regex.h:281
Match m_match
Definition regex.h:284
Iterator(std::string_view str, const Ex &re, size_t pos=0)
Creates an iterator for input string str, using regular expression re to search.
Definition regex.h:244
bool operator==(const Iterator &rhs) const
Returns true if the iterators point to the same match (or both are end-of-sequence iterators).
Definition regex.h:254
value_type * pointer
Definition regex.h:234
const Ex * m_re
Definition regex.h:282
void findNext()
Definition regex.h:269
bool operator!=(const Iterator &rhs) const
Returns true if the iterators are not pointing to the same match.
Definition regex.h:257
Object representing the matching results.
Definition regex.h:151
friend class Ex
Definition regex.h:189
void endCapture(size_t groupId, size_t index)
Definition regex.h:207
Match()
Creates an empty match object.
Definition regex.h:154
std::string_view m_str
Definition regex.h:224
void init(std::string_view str, size_t captureCount)
Definition regex.h:190
void startCapture(size_t groupId, size_t index)
Definition regex.h:200
const SubMatch & operator[](size_t index) const
Returns the n-th SubMatch object.
Definition regex.h:186
size_t size() const
Returns the number of sub matches available in this match.
Definition regex.h:181
SubMatch prefix() const
Return the part of the string before the match.
Definition regex.h:166
size_t position() const
Returns the position of the match or std::string::npos if no position is set.
Definition regex.h:157
std::string str() const
Return a string representing the matching part.
Definition regex.h:163
std::vector< SubMatch > m_subMatches
Definition regex.h:223
void setMatch(size_t pos, size_t len)
Definition regex.h:217
SubMatch suffix() const
Return the part of the string after the match.
Definition regex.h:169
size_t length() const
Returns the length of the match or std::string::npos if no length is set.
Definition regex.h:160
Object representing the match results of a capture range.
Definition regex.h:117
size_t length() const
Returns the length of the matching part.
Definition regex.h:126
void setEnd(size_t pos)
Definition regex.h:134
void setStart(size_t pos)
Definition regex.h:133
void setMatch(size_t pos, size_t len)
Definition regex.h:135
size_t m_len
Definition regex.h:137
size_t m_pos
Definition regex.h:136
std::string_view m_str
Definition regex.h:138
friend class Match
Definition regex.h:132
SubMatch(std::string_view str)
Creates a match for a single capture range given a non-owning pointer to the string.
Definition regex.h:120
size_t position() const
Returns the position in the string at which the match starts.
Definition regex.h:123
std::string str() const
Returns the matching part as a string.
Definition regex.h:129
#define NON_COPYABLE(cls)
Macro to help implementing the rule of 5 for a non-copyable & movable class.
Definition construct.h:37
Namespace for the regular expression functions.
Definition regex.cpp:31
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition regex.cpp:778
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
Definition regex.cpp:800
bool match(std::string_view str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition regex.cpp:789