Doxygen
Loading...
Searching...
No Matches
regex.h
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2021 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16#ifndef FREGEX_H
17#define FREGEX_H
18
19#include <memory>
20#include <string>
21#include <string_view>
22#include <vector>
23#include <iterator>
24
25#include "construct.h"
26
27/** Namespace for the regular expression functions */
28namespace reg
29{
30
31class Match;
32
33/** Class representing a regular expression.
34 *
35 * It has a similar API as `std::regex`,
36 * but is much faster (and also somewhat more limited).
37 */
38class Ex
39{
40 public:
41 /** Matching algorithm */
42 enum class Mode
43 {
44 RegEx, /**< full regular expression. */
45 Wildcard /**< simple globbing pattern. */
46 };
47 /** Creates a regular expression object given the pattern as a string.
48 * Two modes of matching are supported: RegEx and Wildcard
49 *
50 * The following special characters are supported in Mode::RegEx mode.
51 * - `c` matches character `c`
52 * - `.` matches any character
53 * - `^` matches the start of the input
54 * - `$` matches the end of the input
55 * - `<` matches the start of a word
56 * - `>` matches the end of a word
57 * - `[]` matches a set of characters
58 * - `x*` matches a sequence of zero or more `x`'s
59 * - `x+` matches a sequence of one or more `x`'s
60 * - `x?` matches an optional `x`
61 * - `(` matches the start of a capture range
62 * - `)` matches the end of a capture range
63 * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc.
64 * - `\t` matches a tab character
65 * - `\n` matches a newline character
66 * - `\r` matches a return character
67 * - `\s` matches any whitespace as defined by `std::isspace()`
68 * - `\d` matches any digit as defined by `std::isdigit()`
69 * - `\a` matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]`
70 * - `\w` matches any alpha numerical character, same as `[a-z_A-Z0-9\x80-\xFF]`
71 * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160.
72 *
73 * A character range can be used to match a character that falls inside a range
74 * (or set of ranges).
75 * Within the opening `[` and closing `]` brackets of a character range the following
76 * is supported:
77 * - `^` if at the start of the range, a character matches if it is \e not in the range,
78 * e.g. `[^\d]` matches any character not a digit
79 * - `-` when placed between 2 characters it defines a range from the first character to the second.
80 * Any character that falls in the range will match, e.g. `[0-9]` matches the digits from 0 to 9.
81 * - `\s`, `\d`, `\a`, and `\w` as explained above.
82 *
83 * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
84 * meaning in a character range. `^` only has a special meaning as the first character.
85 *
86 * @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
87 * capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
88 * specified then some character has to be in between them,
89 * e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
90 *
91 * In Wildcard mode `*` is used to match any sequence of zero or more characters.
92 * The character `?` can be used to match an optional character. Character ranges are
93 * also supported, but other characters like `$` and `+` are just treated as
94 * literal characters.
95 *
96 */
97 Ex(std::string_view pattern, Mode mode=Mode::RegEx);
98
99 /** Destroys the regular expression object. Frees resources. */
101
102 /** Check if a given string matches this regular expression.
103 * @param str The input string to match against.
104 * @param match The match object to hold the matching results.
105 * @param pos The position in the string at which to start the match.
106 * @returns true iff a match is found. Details are stored in the match object.
107 */
108 bool match(std::string_view str,Match &match,size_t pos=0) const;
109 bool isValid() const;
110 private:
112
113 class Private;
114 std::unique_ptr<Private> p;
115};
116
117/** Object representing the match results of a capture range. */
119{
120 public:
121 /** Creates a match for a single capture range given a non-owning pointer to the string. */
122 SubMatch(std::string_view str) : m_str(str) {}
123
124 /** Returns the position in the string at which the match starts. */
125 size_t position() const { return m_pos; }
126
127 /** Returns the length of the matching part. */
128 size_t length() const { return m_len; }
129
130 /** Returns the matching part as a string */
131 std::string str() const { return std::string{m_str.substr(m_pos,m_len)}; }
132
133 private:
134 friend class Match;
135 void setStart(size_t pos) { m_pos=pos; }
136 void setEnd(size_t pos) { m_len=pos-m_pos; }
137 void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }
138 size_t m_pos = std::string::npos;
139 size_t m_len = std::string::npos;
140 std::string_view m_str;
141};
142
143/** Object representing the matching results. It consists of an array of
144 * SubMatch objects. The first entry of the array represents the whole match, any
145 * next elements represent each of the capture ranges.
146 *
147 * For example string `@42` and expression `@(\\d+)` will have two
148 * Submatches, match[0] will point to the input string as a whole, and
149 * match[1] will point to the number 42 only.
150 *
151 */
152class Match
153{
154 public:
155 /** Creates an empty match object */
156 Match() {}
157
158 /** Returns the position of the match or std::string::npos if no position is set. */
159 size_t position() const { return m_subMatches[0].position(); }
160
161 /** Returns the length of the match or std::string::npos if no length is set. */
162 size_t length() const { return m_subMatches[0].length(); }
163
164 /** Return a string representing the matching part. */
165 std::string str() const { return std::string{m_subMatches[0].str()}; }
166
167 /** Return the part of the string before the match */
168 SubMatch prefix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; }
169
170 /** Return the part of the string after the match */
172 {
173 SubMatch m(m_str);
174 if (!m_str.empty())
175 {
176 size_t e = position()+length();
177 m.setMatch(e,m_str.length()-e);
178 }
179 return m;
180 }
181
182 /** Returns the number of sub matches available in this match. */
183 size_t size() const { return m_subMatches.size(); }
184
185 /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
186 * representing the whole match.
187 */
188 const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }
189
190 private:
191 friend class Ex;
192 void init(std::string_view str)
193 {
194 m_subMatches.clear();
195 m_subMatches.emplace_back(str);
196 m_str = str;
197 }
198 void startCapture(size_t index)
199 {
200 if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times
201 // only update the index, example `\s*(x)`
202 {
204 m_subMatches.emplace_back(m_str);
205 m_insideCapture = true;
206 }
207 m_subMatches.back().setStart(index);
208 }
209 void endCapture(size_t index)
210 {
211 if (index>m_subMatches.back().position())
212 {
214 m_subMatches.back().setEnd(index);
215 m_insideCapture = false;
216 }
217 }
218 void setMatch(size_t pos,size_t len)
219 {
220 m_subMatches[m_captureIndex].setMatch(pos,len);
221 }
222
223 std::vector<SubMatch> m_subMatches;
225 std::string_view m_str;
226 bool m_insideCapture=false;
227};
228
229/** Iterator class to iterate through matches.
230 */
232{
233 public:
235 using difference_type = std::ptrdiff_t;
238 using iterator_category = std::forward_iterator_tag;
239
240 /** Creates an end-of-sequence iterator */
242
243 /** Creates an iterator for input string \a str, using regular expression \a re to search.
244 * @note the string and regular expression objects should remain valid while iterating.
245 */
246 Iterator(std::string_view str, const Ex &re, size_t pos=0)
247 : m_str(str), m_re(&re), m_pos(pos) { findNext(); }
248
249 // Iterator holds pointers, so prevent temporaries from being passed as the string or
250 // the regular expression
251 Iterator(std::string &&str, const Ex &re) = delete;
252 Iterator(const std::string &str, Ex &&re) = delete;
253 Iterator(std::string &&str, Ex &&re) = delete;
254
255 /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
256 bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }
257
258 /** Returns true if the iterators are not pointing to the same match */
259 bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }
260
261 /** Returns a reference to the current match */
262 const value_type &operator*() const { return m_match; }
263
264 /** Returns a pointer to the current match */
265 const value_type *operator->() const { return &m_match; }
266
267 /** Advances the iterator to the next match. */
268 Iterator &operator++() { findNext(); return *this; }
269
270 private:
271 void findNext()
272 {
273 if (!m_re || m_str.empty()) { m_pos=std::string::npos; return; } // end marker
274 if (m_re->match(m_str,m_match,m_pos))
275 {
276 m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
277 }
278 else // no more matches, make the iterator point to the 'end-of-sequence'
279 {
280 m_pos=std::string::npos;
281 }
282 }
283 std::string_view m_str;
284 const Ex *m_re = nullptr;
285 size_t m_pos = std::string::npos;
287};
288
289/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
290 * Returns true iff a match was found.
291 * Details of which part of the string has matched are returned via the \a match object.
292 *
293 * An example to show how to match all identifiers in a string.
294 * @code
295 * static reg::Ex re(R"(\a\w*)");
296 * std::string str = u8"void(Func是<B_C::Códe42>(42));";
297 * while (reg::search(str,match,re,pos))
298 * {
299 * std::cout << match.str() << std::endl;
300 * pos=match.position()+match.length();
301 * }
302 * @endcode
303 * produces:
304 * @code
305 * void
306 * Func是
307 * B_C
308 * Códe42
309 * @endcode
310 *
311 * @see Ex::Ex() for details on the regular expression patterns.
312 */
313bool search(std::string_view str,Match &match,const Ex &re,size_t pos=0);
314
315/** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
316 * Returns true iff a match was found.
317 */
318bool search(std::string_view str,const Ex &re,size_t pos=0);
319
320/** Matches a given string \a str for a match against regular expression \a re.
321 * Returns true iff a match was found for the whole string.
322 * Any capture groups are returned via the \a match object.
323 */
324bool match(std::string_view str,Match &match,const Ex &re);
325
326/** Matches a given string \a str for a match against regular expression \a re.
327 * Returns true iff a match was found for the whole string.
328 */
329bool match(std::string_view str,const Ex &re);
330
331/** Searches in a given input string \a str for parts that match regular expression \a re and
332 * replaces those parts by string \a replacement.
333 */
334std::string replace(std::string_view str,const Ex &re,std::string_view replacement);
335
336} // namespace
337
338#endif
Private members of a regular expression.
Definition regex.cpp:170
Class representing a regular expression.
Definition regex.h:39
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
Definition regex.h:114
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:706
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition regex.cpp:694
Mode
Matching algorithm.
Definition regex.h:43
@ RegEx
full regular expression.
Definition regex.h:44
@ Wildcard
simple globbing pattern.
Definition regex.h:45
bool isValid() const
Definition regex.cpp:741
Iterator(std::string &&str, const Ex &re)=delete
const value_type & operator*() const
Returns a reference to the current match.
Definition regex.h:262
value_type & reference
Definition regex.h:237
Iterator(std::string &&str, Ex &&re)=delete
Iterator & operator++()
Advances the iterator to the next match.
Definition regex.h:268
size_t m_pos
Definition regex.h:285
Iterator(const std::string &str, Ex &&re)=delete
std::forward_iterator_tag iterator_category
Definition regex.h:238
std::ptrdiff_t difference_type
Definition regex.h:235
Iterator()
Creates an end-of-sequence iterator.
Definition regex.h:241
Match value_type
Definition regex.h:234
const value_type * operator->() const
Returns a pointer to the current match.
Definition regex.h:265
std::string_view m_str
Definition regex.h:283
Match m_match
Definition regex.h:286
Iterator(std::string_view str, const Ex &re, size_t pos=0)
Creates an iterator for input string str, using regular expression re to search.
Definition regex.h:246
bool operator==(const Iterator &rhs) const
Returns true if the iterators point to the same match (or both are end-of-sequence iterators)
Definition regex.h:256
value_type * pointer
Definition regex.h:236
const Ex * m_re
Definition regex.h:284
void findNext()
Definition regex.h:271
bool operator!=(const Iterator &rhs) const
Returns true if the iterators are not pointing to the same match.
Definition regex.h:259
Object representing the matching results.
Definition regex.h:153
bool m_insideCapture
Definition regex.h:226
friend class Ex
Definition regex.h:191
Match()
Creates an empty match object.
Definition regex.h:156
std::string_view m_str
Definition regex.h:225
const SubMatch & operator[](size_t index) const
Returns the n-th SubMatch object.
Definition regex.h:188
void startCapture(size_t index)
Definition regex.h:198
size_t size() const
Returns the number of sub matches available in this match.
Definition regex.h:183
void endCapture(size_t index)
Definition regex.h:209
SubMatch prefix() const
Return the part of the string before the match.
Definition regex.h:168
size_t position() const
Returns the position of the match or std::string::npos if no position is set.
Definition regex.h:159
std::string str() const
Return a string representing the matching part.
Definition regex.h:165
void init(std::string_view str)
Definition regex.h:192
std::vector< SubMatch > m_subMatches
Definition regex.h:223
void setMatch(size_t pos, size_t len)
Definition regex.h:218
SubMatch suffix() const
Return the part of the string after the match.
Definition regex.h:171
size_t m_captureIndex
Definition regex.h:224
size_t length() const
Returns the length of the match or std::string::npos if no length is set.
Definition regex.h:162
Object representing the match results of a capture range.
Definition regex.h:119
size_t length() const
Returns the length of the matching part.
Definition regex.h:128
void setEnd(size_t pos)
Definition regex.h:136
void setStart(size_t pos)
Definition regex.h:135
void setMatch(size_t pos, size_t len)
Definition regex.h:137
size_t m_len
Definition regex.h:139
size_t m_pos
Definition regex.h:138
std::string_view m_str
Definition regex.h:140
friend class Match
Definition regex.h:134
SubMatch(std::string_view str)
Creates a match for a single capture range given a non-owning pointer to the string.
Definition regex.h:122
size_t position() const
Returns the position in the string at which the match starts.
Definition regex.h:125
std::string str() const
Returns the matching part as a string.
Definition regex.h:131
#define NON_COPYABLE(cls)
Macro to help implementing the rule of 5 for a non-copyable & movable class.
Definition construct.h:37
Namespace for the regular expression functions.
Definition regex.cpp:31
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition regex.cpp:748
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
Definition regex.cpp:770
bool match(std::string_view str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition regex.cpp:759