Doxygen
Loading...
Searching...
No Matches
regex.cpp
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2021 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16#include "regex.h"
17#include <cstdint>
18#include <vector>
19#include <cctype>
20#include <cassert>
21#include <algorithm>
22
23#define ENABLE_DEBUG 0
24#if ENABLE_DEBUG
25#define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0)
26#else
27#define DBG(fmt,...) do {} while(0)
28#endif
29
30namespace reg
31{
32
/** Returns true when \a c is a space, tab, newline or carriage return.
 *  Locale independent replacement for the C library function.
 */
static inline bool isspace(char c)
{
  switch (c)
  {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
37
/** Returns true when \a c is an ASCII letter or any byte with the high bit set
 *  (i.e. a value of 128 or above when viewed as unsigned).
 */
static inline bool isalpha(char c)
{
  if (static_cast<unsigned char>(c)>=128) return true;
  const bool lowerCase = c>='a' && c<='z';
  const bool upperCase = c>='A' && c<='Z';
  return lowerCase || upperCase;
}
42
/** Returns true when \a c is one of the ASCII digits '0'..'9'. */
static inline bool isdigit(char c)
{
  return !(c<'0' || c>'9');
}
47
48static inline bool isalnum(char c)
49{
50 return isalpha(c) || isdigit(c);
51}
52
53
/** Class representing a token in the compiled regular expression token stream.
 *  A token has a kind and an optional value whose meaning depends on the kind.
 *  It is also possible to store a (from,to) character range in a token.
 *
 *  Both pieces of information are packed in a single 32 bit word: the upper
 *  16 bits hold the kind (or the 'from' part of a range), the lower 16 bits
 *  hold the value (or the 'to' part of a range).
 */
class PToken
{
  public:
    /** The kind of token.
     *
     *  Ranges per bit mask:
     *  - `0x00FF` from part of a range, except for `0x0000` which is the End marker
     *  - `0x1FFF` built-in ranges
     *  - `0x2FFF` user defined ranges
     *  - `0x4FFF` special operations
     *  - `0x8000` literal character
     */
    enum class Kind : uint16_t
    {
      End          = 0x0000,
      WhiteSpace   = 0x1001, // \s      range [ \t\r\n]
      Digit        = 0x1002, // \d      range [0-9]
      Alpha        = 0x1003, // \a      range [a-z_A-Z\x80-\xFF]
      AlphaNum     = 0x1004, // \w      range [a-z_A-Z0-9\x80-\xFF]
      CharClass    = 0x2001, // []
      NegCharClass = 0x2002, // [^]
      BeginOfLine  = 0x4001, // ^
      EndOfLine    = 0x4002, // $
      BeginOfWord  = 0x4003, // <
      EndOfWord    = 0x4004, // >
      BeginCapture = 0x4005, // (
      EndCapture   = 0x4006, // )
      Any          = 0x4007, // .
      Star         = 0x4008, // *
      Optional     = 0x4009, // ?
      Character    = 0x8000  // c
    };

    /** returns a string representation of the tokens kind (useful for debugging). */
    const char *kindStr() const
    {
      if ((m_rep>>16)>=0x1000 || m_rep==0)
      {
        switch(static_cast<Kind>((m_rep>>16)))
        {
          case Kind::End:          return "End";
          case Kind::Alpha:        return "Alpha";
          case Kind::AlphaNum:     return "AlphaNum";
          case Kind::WhiteSpace:   return "WhiteSpace";
          case Kind::Digit:        return "Digit";
          case Kind::CharClass:    return "CharClass";
          case Kind::NegCharClass: return "NegCharClass";
          case Kind::Character:    return "Character";
          case Kind::BeginOfLine:  return "BeginOfLine";
          case Kind::EndOfLine:    return "EndOfLine";
          case Kind::BeginOfWord:  return "BeginOfWord";
          case Kind::EndOfWord:    return "EndOfWord";
          case Kind::BeginCapture: return "BeginCapture";
          case Kind::EndCapture:   return "EndCapture";
          case Kind::Any:          return "Any";
          case Kind::Star:         return "Star";
          case Kind::Optional:     return "Optional";
        }
        // the upper bits do not correspond to any enumerator; make sure we
        // still return something instead of running off the end of the function
        return "Unknown";
      }
      else
      {
        return "Range";
      }
    }

    /** Creates a token of kind 'End' */
    PToken() : m_rep(0) {}

    /** Creates a token of the given kind \a k */
    explicit PToken(Kind k) : m_rep(static_cast<uint32_t>(k)<<16) {}

    /** Create a token for a single character.
     *
     *  The character is first narrowed to 16 bits so that a negative `char`
     *  (a byte >= 0x80 where `char` is signed) cannot spill into the upper
     *  16 bits and overwrite the Kind::Character marker. The lower 16 bits
     *  are the same as static_cast<uint16_t>(c).
     */
    PToken(char c) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
                           static_cast<uint32_t>(static_cast<uint16_t>(c))) {}

    /** Create a token for a byte of an UTF-8 character */
    PToken(uint16_t v) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
                               static_cast<uint32_t>(v)) {}

    /** Create a token representing a range from one character \a from to another character \a to */
    PToken(uint16_t from,uint16_t to) : m_rep(static_cast<uint32_t>(from)<<16 | to) {}

    /** Sets the value for a token */
    void setValue(uint16_t value) { m_rep = (m_rep & 0xFFFF0000) | value; }

    /** Returns the kind of the token */
    Kind kind() const { return static_cast<Kind>(m_rep>>16); }

    /** Returns the 'from' part of the character range. Only valid if this token represents a range */
    uint16_t from() const { return m_rep>>16; }

    /** Returns the 'to' part of the character range. Only valid if this token represents a range */
    uint16_t to() const { return m_rep & 0xFFFF; }

    /** Returns the value for this token */
    uint16_t value() const { return m_rep & 0xFFFF; }

    /** Returns the value for this token as a ASCII character */
    char asciiValue() const { return static_cast<char>(m_rep); }

    /** Returns true iff this token represents a range of characters */
    bool isRange() const { return m_rep!=0 && from()<=to(); }

    /** Returns true iff this token is a positive or negative character class */
    bool isCharClass() const { return kind()==Kind::CharClass || kind()==Kind::NegCharClass; }

  private:
    uint32_t m_rep; //!< packed representation: (kind or from)<<16 | (value or to)
};
167
168/** Private members of a regular expression */
170{
171 public:
172 /** Creates the private part */
173 Private(std::string_view pat) : pattern(pat)
174 {
175 data.reserve(100);
176 }
177 void compile();
178#if ENABLE_DEBUG
179 void dump();
180#endif
181 bool matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,
182 Match &match,size_t pos,int level) const;
183
184 /** Flag indicating the expression was successfully compiled */
185 bool error = false;
186
187 /** The token stream representing the compiled regular expression. */
188 std::vector<PToken> data; // compiled pattern
189
190 /** The pattern string as passed by the user */
191 std::string pattern;
192};
193
194/** Compiles a regular expression passed as a string into a stream of tokens that can be used for
195 * efficient searching.
196 */
198{
199 error = false;
200 data.clear();
201 if (pattern.empty()) return;
202 const char *start = pattern.c_str();
203 const char *ps = start;
204 char c = 0;
205
206 int prevTokenPos=-1;
207 int tokenPos=0;
208
209 auto addToken = [&](PToken tok)
210 {
211 tokenPos++;
212 data.emplace_back(tok);
213 };
214
215 auto getNextCharacter = [&]() -> PToken
216 {
217 char cs=*ps;
218 PToken result = PToken(cs);
219 if (cs=='\\') // escaped character
220 {
221 ps++;
222 cs=*ps;
223 switch (cs)
224 {
225 case 'n': result = PToken('\n'); break;
226 case 'r': result = PToken('\r'); break;
227 case 't': result = PToken('\t'); break;
228 case 's': result = PToken(PToken::Kind::WhiteSpace); break;
229 case 'a': result = PToken(PToken::Kind::Alpha); break;
230 case 'w': result = PToken(PToken::Kind::AlphaNum); break;
231 case 'd': result = PToken(PToken::Kind::Digit); break;
232 case '<': result = PToken(PToken::Kind::BeginOfWord); break;
233 case '>': result = PToken(PToken::Kind::EndOfWord); break;
234 case 'x':
235 case 'X':
236 {
237 uint16_t v=0;
238 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
239 {
240 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
241 (cs>='A' && cs<='F') ? cs-'A'+10 :
242 (cs>='0' && cs<='9') ? cs-'0' :
243 -1;
244 if (d>=0) { v<<=4; v|=d; ps++; } else break;
245 }
246 result = PToken(v);
247 }
248 break;
249 case '\0': ps--; break; // backslash at the end of the pattern
250 default:
251 result = PToken(cs);
252 break;
253 }
254 }
255 return result;
256 };
257
258 while ((c=*ps))
259 {
260 switch (c)
261 {
262 case '^': // beginning of line (if first character of the pattern)
263 prevTokenPos = tokenPos;
264 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
265 PToken(c));
266 break;
267 case '$': // end of the line (if last character of the pattern)
268 prevTokenPos = tokenPos;
269 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
270 PToken(c));
271 break;
272 case '.': // any character
273 prevTokenPos = tokenPos;
274 addToken(PToken(PToken::Kind::Any));
275 break;
276 case '(': // begin of capture group
277 prevTokenPos = tokenPos;
278 addToken(PToken(PToken::Kind::BeginCapture));
279 break;
280 case ')': // end of capture group
281 prevTokenPos = tokenPos;
282 addToken(PToken(PToken::Kind::EndCapture));
283 break;
284 case '[': // character class
285 {
286 prevTokenPos = tokenPos;
287 ps++;
288 if (*ps==0) { error=true; return; }
289 bool esc = *ps=='\\';
290 PToken tok = getNextCharacter();
291 ps++;
292 if (!esc && tok.kind()==PToken::Kind::Character &&
293 tok.asciiValue()=='^') // negated character class
294 {
295 addToken(PToken(PToken::Kind::NegCharClass));
296 if (*ps==0) { error=true; return; }
297 tok = getNextCharacter();
298 ps++;
299 }
300 else
301 {
302 addToken(PToken(PToken::Kind::CharClass));
303 }
304 uint16_t numTokens=0;
305 while ((c=*ps))
306 {
307 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
308 {
309 getNextCharacter();
310 ps++;
311 PToken endTok = getNextCharacter();
312 ps++;
313 if (tok.value()>endTok.value())
314 {
315 addToken(PToken(endTok.value(),tok.value())); // swap start and end
316 }
317 else
318 {
319 addToken(PToken(tok.value(),endTok.value()));
320 }
321 numTokens++;
322 }
323 else // single char, from==to
324 {
325 if (tok.kind()==PToken::Kind::Character)
326 {
327 addToken(PToken(tok.value(),tok.value()));
328 }
329 else // special token, add as-is since from>to
330 {
331 addToken(tok);
332 }
333 numTokens++;
334 }
335 if (*ps==0) { error=true; return; } // expected at least a ]
336 esc = *ps=='\\';
337 tok = getNextCharacter();
338 if (!esc && tok.kind()==PToken::Kind::Character &&
339 tok.value()==static_cast<uint16_t>(']'))
340 {
341 break; // end of character class
342 }
343 if (*ps==0) { error=true; return; } // no ] found
344 ps++;
345 }
346 // set the value of either NegCharClass or CharClass
347 data[prevTokenPos].setValue(numTokens);
348 }
349 break;
350 case '*': // 0 or more
351 case '+': // 1 or more
352 case '?': // optional: 0 or 1
353 {
354 if (prevTokenPos==-1)
355 {
356 error=true;
357 return;
358 }
359 switch (data[prevTokenPos].kind())
360 {
361 case PToken::Kind::BeginOfLine: // $* or $+ or $?
362 case PToken::Kind::BeginOfWord: // <* or <+ or <?
363 case PToken::Kind::EndOfWord: // >* or >+ or >?
364 case PToken::Kind::Star: // ** or *+ or *?
365 case PToken::Kind::Optional: // ?* or ?+ or ??
366 error=true;
367 return;
368 default: // ok
369 break;
370 }
371 int ddiff = static_cast<int>(tokenPos-prevTokenPos);
372 if (*ps=='+') // convert <pat>+ -> <pat><pat>*
373 {
374 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
375 // ddiff=n ^prevTokenPos
376 data.resize(data.size()+ddiff);
377 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
378 prevTokenPos+=ddiff;
379 tokenPos+=ddiff;
380 }
381 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
382 {
383 // find the beginning of the capture range
384 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
385 {
386 prevTokenPos--;
387 }
388 }
389 data.insert(data.begin()+prevTokenPos,
390 c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
391 tokenPos++;
392 addToken(PToken(PToken::Kind::End));
393 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
394 // ^prevTokenPos
395 // same for 'T?'.
396 }
397 break;
398 default:
399 prevTokenPos = tokenPos;
400 addToken(getNextCharacter());
401 break;
402 }
403 ps++;
404 }
405 //addToken(PToken(PToken::Kind::End));
406}
407
#if ENABLE_DEBUG
/** Dump the compiled token stream for this regular expression. For debugging purposes.
 *
 *  Each token is printed as `[kind:value]`. The entries that follow a character class token
 *  are printed as `[from(c)-to(c)]` when they are ranges, and as a regular token otherwise.
 */
void Ex::Private::dump()
{
  size_t l = data.size();
  size_t i =0;
  DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
  while (i<l)
  {
    DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
    if (data[i].kind()==PToken::Kind::CharClass || data[i].kind()==PToken::Kind::NegCharClass)
    {
      uint16_t num = data[i].value();
      // i is incremented before data[i] is read, so require that a next element exists
      while (num>0 && i+1<l)
      {
        i++;
        if (data[i].isRange()) // from-to range
        {
          DBG("[%04x(%c)-%04x(%c)]\n",data[i].from(),data[i].from(),data[i].to(),data[i].to());
        }
        else // special character like \n or \s
        {
          DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
        }
        num--;
      }
    }
    i++;
  }
}
#endif
439
440/** Internal matching routine.
441 * @param tokenPos Offset into the token stream.
442 * @param tokenLen The length of the token stream.
443 * @param str The input string to match against.
444 * @param match The object used to store the matching results.
445 * @param pos The position in the input string to start with matching
446 * @param level Recursion level (used for debugging)
447 */
448bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,Match &match,const size_t pos,int level) const
449{
450 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);
451 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
452 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
453 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
454 {
455 PToken tok = data[tp];
456 bool negate = tok.kind()==PToken::Kind::NegCharClass;
457 uint16_t numFields = tok.value();
458 bool found = false;
459 for (uint16_t i=0;i<numFields;i++)
460 {
461 tok = data[++tp];
462 // first check for built-in ranges
463 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
464 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
465 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
466 (tok.kind()==PToken::Kind::Digit && isdigit(c))
467 )
468 {
469 found=true;
470 break;
471 }
472 else // user specified range
473 {
474 uint16_t v = static_cast<uint16_t>(c);
475 if (tok.from()<=v && v<=tok.to())
476 {
477 found=true;
478 break;
479 }
480 }
481 }
482 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
483 return negate ? !found : found;
484 };
485 size_t index = pos;
486 enum SequenceType { Star, Optional, OptionalRange };
487 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
488 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
489 {
490 size_t startIndex = index;
491 size_t len = str.length();
492 PToken tok = data[++tokenPos];
493 if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
494 {
495 char c_tok = tok.asciiValue();
496 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
497 tokenPos++;
498 }
499 else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
500 {
501 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
502 tokenPos+=tok.value()+1; // skip over character ranges + end token
503 }
504 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
505 {
506 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
507 tokenPos++;
508 }
509 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
510 {
511 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
512 tokenPos++;
513 }
514 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
515 {
516 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
517 tokenPos++;
518 }
519 else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
520 {
521 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
522 tokenPos++;
523 }
524 else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
525 {
526 if (type==Optional) index++; else index = str.length();
527 tokenPos++;
528 }
529 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
530 {
531 size_t tokenStart = ++tokenPos;
532 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
533 Match rangeMatch;
534 rangeMatch.init(str);
535 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
536 if (found)
537 {
538 index+=rangeMatch.length(); // (abc)? matches -> eat all
539 }
540 tokenPos++; // skip over EndCapture
541 }
542 tokenPos++; // skip over end marker
543 while (index>=startIndex)
544 {
545 // pattern 'x*xy' should match 'xy' and 'xxxxy'
546 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
547 if (found)
548 {
549 match.setMatch(pos,index-pos+match.length());
550 return true;
551 }
552 if (index==0) break;
553 index--;
554 }
555 return false;
556 };
557
558 while (tokenPos<tokenLen)
559 {
560 PToken tok = data[tokenPos];
561 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
562 if (tok.kind()==PToken::Kind::Character) // match literal character
563 {
564 char c_tok = tok.asciiValue();
565 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
566 index++,tokenPos++;
567 }
568 else if (tok.isCharClass())
569 {
570 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
571 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
572 }
573 else
574 {
575 switch (tok.kind())
576 {
578 if (index>=str.length() || !isStartIdChar(str[index])) return false;
579 index++;
580 break;
582 if (index>=str.length() || !isIdChar(str[index])) return false;
583 index++;
584 break;
586 if (index>=str.length() || !isspace(str[index])) return false;
587 index++;
588 break;
590 if (index>=str.length() || !isdigit(str[index])) return false;
591 index++;
592 break;
594 if (index!=pos) return false;
595 break;
597 if (index<str.length()) return false;
598 break;
600 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
601 index,str[index],isIdChar(str[index]),
602 index>0?str[index]-1:0,
603 index>0?isIdChar(str[index-1]):-1);
604 if (index>=str.length() ||
605 !isIdChar(str[index]) ||
606 (index>0 && isIdChar(str[index-1]))) return false;
607 break;
609 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
610 index,pos,str[index],isIdChar(str[index]),
611 index==0 ? 0 : str[index-1],
612 index==0 ? -1 : isIdChar(str[index-1]));
613 if (index<str.length() &&
614 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
615 break;
617 DBG("BeginCapture(%zu)\n",index);
618 match.startCapture(index);
619 break;
621 DBG("EndCapture(%zu)\n",index);
622 match.endCapture(index);
623 break;
625 if (index>=str.length()) return false;
626 index++;
627 break;
629 return processSequence(Star);
631 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
632 {
633 return processSequence(OptionalRange); // (...)?
634 }
635 else
636 {
637 return processSequence(Optional); // x?
638 }
639 default:
640 return false;
641 }
642 tokenPos++;
643 }
644 }
645 match.setMatch(pos,index-pos);
646 return true;
647}
648
/** Converts a wildcard \a pattern into an equivalent regular expression.
 *
 *  - `*` becomes `.*` and `?` becomes `.`
 *  - the characters `. + \ $ ^ ( )` are escaped with a backslash
 *  - a `^` directly following `[` is copied unescaped
 *  - all other characters are copied as-is
 *
 *  The result is enclosed in `^` and `$`.
 */
static std::string wildcard2regex(std::string_view pattern)
{
  const size_t count = pattern.length();
  std::string regex="^"; // match start of input
  regex.reserve(count);
  size_t idx=0;
  while (idx<count)
  {
    const char ch = pattern[idx];
    if (ch=='*') // '*' => '.*'
    {
      regex+=".*";
    }
    else if (ch=='?') // '?' => '.'
    {
      regex+='.';
    }
    else if (ch=='.' || ch=='+' || ch=='\\' || ch=='$' ||
             ch=='^' || ch=='(' || ch==')') // escape
    {
      regex+='\\';
      regex+=ch;
    }
    else if (ch=='[' && idx+1<count && pattern[idx+1]=='^') // don't escape ^ after [
    {
      regex+="[^";
      idx++;
    }
    else // just copy
    {
      regex+=ch;
    }
    idx++;
  }
  regex+='$'; // match end of input
  return regex;
}
692
693
694Ex::Ex(std::string_view pattern, Mode mode)
695 : p(std::make_unique<Private>(mode==Mode::RegEx ? pattern : wildcard2regex(pattern)))
696{
697 p->compile();
698#if ENABLE_DEBUG
699 p->dump();
700 assert(!p->error);
701#endif
702}
703
704Ex::~Ex() = default;
705
706bool Ex::match(std::string_view str,Match &match,size_t pos) const
707{
708 bool found=false;
709 if (p->data.size()==0 || p->error) return found;
710 match.init(str);
711
712 PToken tok = p->data[0];
713 if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position
714 {
715 found = p->matchAt(0,p->data.size(),str,match,pos,0);
716 }
717 else
718 {
719 if (tok.kind()==PToken::Kind::Character) // search for the start character
720 {
721 size_t index = str.find(tok.asciiValue(),pos);
722 if (index==std::string::npos)
723 {
724 DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.asciiValue());
725 return false;
726 }
727 DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.asciiValue(),index);
728 pos=index;
729 }
730 while (pos<str.length()) // search for a match starting at pos
731 {
732 found = p->matchAt(0,p->data.size(),str,match,pos,0);
733 if (found) break;
734 pos++;
735 }
736 }
737 DBG("Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,found);
738 return found;
739}
740
741bool Ex::isValid() const
742{
743 return !p->pattern.empty() && !p->error;
744}
745
746//----------------------------------------------------------------------------------------
747
748bool search(std::string_view str,Match &match,const Ex &re,size_t pos)
749{
750 return re.match(str,match,pos);
751}
752
753bool search(std::string_view str,const Ex &re,size_t pos)
754{
755 Match match;
756 return re.match(str,match,pos);
757}
758
759bool match(std::string_view str,Match &match,const Ex &re)
760{
761 return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
762}
763
764bool match(std::string_view str,const Ex &re)
765{
766 Match match;
767 return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
768}
769
770std::string replace(std::string_view str,const Ex &re,std::string_view replacement)
771{
772 std::string result;
773 Match match;
774 size_t p=0;
775 while (re.match(str,match,p))
776 {
777 size_t i=match.position();
778 size_t l=match.length();
779 if (i>p) result+=str.substr(p,i-p);
780 result+=replacement;
781 p=i+l;
782 }
783 if (p<str.length()) result+=str.substr(p);
784 return result;
785}
786
787}
Private members of a regular expression.
Definition regex.cpp:170
bool error
Flag indicating the expression was successfully compiled.
Definition regex.cpp:185
void compile()
Compiles a regular expression passed as a string into a stream of tokens that can be used for efficie...
Definition regex.cpp:197
std::string pattern
The pattern string as passed by the user.
Definition regex.cpp:191
Private(std::string_view pat)
Creates the private part.
Definition regex.cpp:173
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition regex.cpp:448
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition regex.cpp:188
Class representing a regular expression.
Definition regex.h:39
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
Definition regex.h:114
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:706
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition regex.cpp:694
Mode
Matching algorithm.
Definition regex.h:43
@ RegEx
full regular expression.
Definition regex.h:44
bool isValid() const
Definition regex.cpp:741
Object representing the matching results.
Definition regex.h:153
void init(std::string_view str)
Definition regex.h:192
size_t length() const
Returns the position of the match or std::string::npos if no length is set.
Definition regex.h:162
Class representing a token in the compiled regular expression token stream.
Definition regex.cpp:59
uint16_t to() const
Returns the 'to' part of the character range.
Definition regex.cpp:150
char asciiValue() const
Returns the value for this token as a ASCII character.
Definition regex.cpp:156
PToken(Kind k)
Creates a token of the given kind k.
Definition regex.cpp:127
PToken(char c)
Create a token for an ASCII character.
Definition regex.cpp:130
bool isRange() const
Returns true iff this token represents a range of characters.
Definition regex.cpp:159
Kind kind() const
Returns the kind of the token.
Definition regex.cpp:144
PToken()
Creates a token of kind 'End'.
Definition regex.cpp:124
uint16_t from() const
Returns the 'from' part of the character range.
Definition regex.cpp:147
const char * kindStr() const
returns a string representation of the tokens kind (useful for debugging).
Definition regex.cpp:92
Kind
The kind of token.
Definition regex.cpp:71
uint32_t m_rep
Definition regex.cpp:165
void setValue(uint16_t value)
Sets the value for a token.
Definition regex.cpp:141
uint16_t value() const
Returns the value for this token.
Definition regex.cpp:153
PToken(uint16_t v)
Create a token for a byte of an UTF-8 character.
Definition regex.cpp:134
bool isCharClass() const
Returns true iff this token is a positive or negative character class.
Definition regex.cpp:162
PToken(uint16_t from, uint16_t to)
Create a token representing a range from one character from to another character to.
Definition regex.cpp:138
#define DBG(x)
Definition dotrunner.cpp:53
#define isIdChar(c)
Definition markdown.cpp:77
Namespace for the regular expression functions.
Definition regex.cpp:31
static bool isalpha(char c)
Definition regex.cpp:38
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition regex.cpp:748
static std::string wildcard2regex(std::string_view pattern)
Definition regex.cpp:649
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
Definition regex.cpp:770
bool match(std::string_view str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition regex.cpp:759
static bool isspace(char c)
Definition regex.cpp:33
static bool isalnum(char c)
Definition regex.cpp:48
static bool isdigit(char c)
Definition regex.cpp:43
bool found
Definition util.cpp:984