Doxygen
Loading...
Searching...
No Matches
regex.cpp
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2025 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16#include "regex.h"
17#include <cstdint>
18#include <vector>
19#include <cctype>
20#include <cassert>
21#include <algorithm>
22
23#define ENABLE_DEBUG 0
24#if ENABLE_DEBUG
25#define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0)
26#else
27#define DBG(fmt,...) do {} while(0)
28#endif
29
30namespace reg
31{
32
33static inline bool isspace(char c)
34{
35 return c==' ' || c=='\t' || c=='\n' || c=='\r';
36}
37
38static inline bool isalpha(char c)
39{
40 return static_cast<unsigned char>(c)>=128 || (c>='a' && c<='z') || (c>='A' && c<='Z');
41}
42
43static inline bool isdigit(char c)
44{
45 return c>='0' && c<='9';
46}
47
48static inline bool isalnum(char c)
49{
50 return isalpha(c) || isdigit(c);
51}
52
53
54/** Class representing a token in the compiled regular expression token stream.
55 * A token has a kind and an optional value whose meaning depends on the kind.
56 * It is also possible to store a (from,to) character range in a token.
57 */
58class PToken
59{
60 public:
61 /** The kind of token.
62 *
63 * Ranges per bit mask:
64 * - `0x00FF` from part of a range, except for `0x0000` which is the End marker
65 * - `0x1FFF` built-in ranges
66 * - `0x2FFF` user defined ranges
67 * - `0x4FFF` special operations
68 * - `0x8000` literal character
69 */
70 enum class Kind : uint16_t
71 {
72 End = 0x0000,
73 WhiteSpace = 0x1001, // \s range [ \t\r\n]
74 Digit = 0x1002, // \d range [0-9]
75 Alpha = 0x1003, // \a range [a-z_A-Z\x80-\xFF]
76 AlphaNum = 0x1004, // \w range [a-Z_A-Z0-9\x80-\xFF]
77 CharClass = 0x2001, // []
78 NegCharClass = 0x2002, // [^]
79 BeginOfLine = 0x4001, // ^
80 EndOfLine = 0x4002, // $
81 BeginOfWord = 0x4003, // <
82 EndOfWord = 0x4004, // >
83 BeginCapture = 0x4005, // (
84 EndCapture = 0x4006, // )
85 Any = 0x4007, // .
86 Star = 0x4008, // *
87 Optional = 0x4009, // ?
88 Character = 0x8000 // c
89 };
90
91 /** returns a string representation of the tokens kind (useful for debugging). */
92 const char *kindStr() const
93 {
94 if ((m_rep>>16)>=0x1000 || m_rep==0)
95 {
96 switch(static_cast<Kind>((m_rep>>16)))
97 {
98 case Kind::End: return "End";
99 case Kind::Alpha: return "Alpha";
100 case Kind::AlphaNum: return "AlphaNum";
101 case Kind::WhiteSpace: return "WhiteSpace";
102 case Kind::Digit: return "Digit";
103 case Kind::CharClass: return "CharClass";
104 case Kind::NegCharClass: return "NegCharClass";
105 case Kind::Character: return "Character";
106 case Kind::BeginOfLine: return "BeginOfLine";
107 case Kind::EndOfLine: return "EndOfLine";
108 case Kind::BeginOfWord: return "BeginOfWord";
109 case Kind::EndOfWord: return "EndOfWord";
110 case Kind::BeginCapture: return "BeginCapture";
111 case Kind::EndCapture: return "EndCapture";
112 case Kind::Any: return "Any";
113 case Kind::Star: return "Star";
114 case Kind::Optional: return "Optional";
115 }
116 }
117 else
118 {
119 return "Range";
120 }
121 }
122
123 /** Creates a token of kind 'End' */
124 PToken() : m_rep(0) {}
125
126 /** Creates a token of the given kind \a k */
127 explicit PToken(Kind k) : m_rep(static_cast<uint32_t>(k)<<16) {}
128
129 /** Create a token for an ASCII character */
130 PToken(char c) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
131 static_cast<uint32_t>(c)) {}
132
133 /** Create a token for a byte of an UTF-8 character */
134 PToken(uint16_t v) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
135 static_cast<uint32_t>(v)) {}
136
137 /** Create a token representing a range from one character \a from to another character \a to */
138 PToken(uint16_t from,uint16_t to) : m_rep(static_cast<uint32_t>(from)<<16 | to) {}
139
140 /** Sets the value for a token */
141 void setValue(uint16_t value) { m_rep = (m_rep & 0xFFFF0000) | value; }
142
143 /** Returns the kind of the token */
144 Kind kind() const { return static_cast<Kind>(m_rep>>16); }
145
146 /** Returns the 'from' part of the character range. Only valid if this token represents a range */
147 uint16_t from() const { return m_rep>>16; }
148
149 /** Returns the 'to' part of the character range. Only valid if this token represents a range */
150 uint16_t to() const { return m_rep & 0xFFFF; }
151
152 /** Returns the value for this token */
153 uint16_t value() const { return m_rep & 0xFFFF; }
154
155 /** Returns the value for this token as a ASCII character */
156 char asciiValue() const { return static_cast<char>(m_rep); }
157
158 /** Returns true iff this token represents a range of characters */
159 bool isRange() const { return m_rep!=0 && from()<=to(); }
160
161 /** Returns true iff this token is a positive or negative character class */
162 bool isCharClass() const { return kind()==Kind::CharClass || kind()==Kind::NegCharClass; }
163
164 private:
165 uint32_t m_rep;
166};
167
168/** Private members of a regular expression */
170{
171 public:
172 /** Creates the private part */
173 Private(std::string_view pat) : pattern(pat)
174 {
175 data.reserve(100);
176 }
177 void compile();
178#if ENABLE_DEBUG
179 void dump();
180#endif
181 bool matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,
182 Match &match,size_t pos,int level) const;
183
184 /** Flag indicating the expression was successfully compiled */
185 bool error = false;
186
187 /** The token stream representing the compiled regular expression. */
188 std::vector<PToken> data; // compiled pattern
189
190 /** The pattern string as passed by the user */
191 std::string pattern;
192
193 /** Number of capture groups in the pattern (excluding the whole match) */
194 size_t captureCount = 0;
195};
196
197/** Compiles a regular expression passed as a string into a stream of tokens that can be used for
198 * efficient searching.
199 */
201{
202 error = false;
203 data.clear();
204 captureCount = 0;
205 if (pattern.empty()) return;
206 const char *start = pattern.c_str();
207 const char *ps = start;
208 char c = 0;
209
210 int prevTokenPos=-1;
211 int tokenPos=0;
212
213 // capture group assignment
214 std::vector<size_t> captureStack;
215 size_t nextCaptureId = 0;
216
217 auto addToken = [&](PToken tok)
218 {
219 tokenPos++;
220 data.emplace_back(tok);
221 };
222
223 auto getNextCharacter = [&]() -> PToken
224 {
225 char cs=*ps;
226 PToken result = PToken(cs);
227 if (cs=='\\') // escaped character
228 {
229 ps++;
230 cs=*ps;
231 switch (cs)
232 {
233 case 'n': result = PToken('\n'); break;
234 case 'r': result = PToken('\r'); break;
235 case 't': result = PToken('\t'); break;
236 case 's': result = PToken(PToken::Kind::WhiteSpace); break;
237 case 'a': result = PToken(PToken::Kind::Alpha); break;
238 case 'w': result = PToken(PToken::Kind::AlphaNum); break;
239 case 'd': result = PToken(PToken::Kind::Digit); break;
240 case '<': result = PToken(PToken::Kind::BeginOfWord); break;
241 case '>': result = PToken(PToken::Kind::EndOfWord); break;
242 case 'x':
243 case 'X':
244 {
245 uint16_t v=0;
246 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
247 {
248 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
249 (cs>='A' && cs<='F') ? cs-'A'+10 :
250 (cs>='0' && cs<='9') ? cs-'0' :
251 -1;
252 if (d>=0) { v<<=4; v|=d; ps++; } else break;
253 }
254 result = PToken(v);
255 }
256 break;
257 case '\0': ps--; break; // backslash at the end of the pattern
258 default:
259 result = PToken(cs);
260 break;
261 }
262 }
263 return result;
264 };
265
266 while ((c=*ps))
267 {
268 switch (c)
269 {
270 case '^': // beginning of line (if first character of the pattern)
271 prevTokenPos = tokenPos;
272 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
273 PToken(c));
274 break;
275 case '$': // end of the line (if last character of the pattern)
276 prevTokenPos = tokenPos;
277 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
278 PToken(c));
279 break;
280 case '.': // any character
281 prevTokenPos = tokenPos;
282 addToken(PToken(PToken::Kind::Any));
283 break;
284 case '(': // begin of capture group
285 {
286 prevTokenPos = tokenPos;
288 size_t id = ++nextCaptureId; // groups start at 1, 0 is whole match
289 data.back().setValue(id);
290 captureStack.push_back(id);
291 }
292 break;
293 case ')': // end of capture group
294 {
295 prevTokenPos = tokenPos;
296 if (captureStack.empty())
297 {
298 error=true;
299 return;
300 }
301 size_t id = captureStack.back();
302 captureStack.pop_back();
304 data.back().setValue(id);
305 }
306 break;
307 case '[': // character class
308 {
309 prevTokenPos = tokenPos;
310 ps++;
311 if (*ps==0) { error=true; return; }
312 bool esc = *ps=='\\';
313 PToken tok = getNextCharacter();
314 ps++;
315 if (!esc && tok.kind()==PToken::Kind::Character &&
316 tok.asciiValue()=='^') // negated character class
317 {
319 if (*ps==0) { error=true; return; }
320 tok = getNextCharacter();
321 ps++;
322 }
323 else
324 {
326 }
327 uint16_t numTokens=0;
328 while ((c=*ps))
329 {
330 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
331 {
332 getNextCharacter();
333 ps++;
334 PToken endTok = getNextCharacter();
335 ps++;
336 if (tok.value()>endTok.value())
337 {
338 addToken(PToken(endTok.value(),tok.value())); // swap start and end
339 }
340 else
341 {
342 addToken(PToken(tok.value(),endTok.value()));
343 }
344 numTokens++;
345 }
346 else // single char, from==to
347 {
348 if (tok.kind()==PToken::Kind::Character)
349 {
350 addToken(PToken(tok.value(),tok.value()));
351 }
352 else // special token, add as-is since from>to
353 {
354 addToken(tok);
355 }
356 numTokens++;
357 }
358 if (*ps==0) { error=true; return; } // expected at least a ]
359 esc = *ps=='\\';
360 tok = getNextCharacter();
361 if (!esc && tok.kind()==PToken::Kind::Character &&
362 tok.value()==static_cast<uint16_t>(']'))
363 {
364 break; // end of character class
365 }
366 if (*ps==0) { error=true; return; } // no ] found
367 ps++;
368 }
369 // set the value of either NegCharClass or CharClass
370 data[prevTokenPos].setValue(numTokens);
371 }
372 break;
373 case '*': // 0 or more
374 case '+': // 1 or more
375 case '?': // optional: 0 or 1
376 {
377 if (prevTokenPos==-1)
378 {
379 error=true;
380 return;
381 }
382 switch (data[prevTokenPos].kind())
383 {
384 case PToken::Kind::BeginOfLine: // $* or $+ or $?
385 case PToken::Kind::BeginOfWord: // <* or <+ or <?
386 case PToken::Kind::EndOfWord: // >* or >+ or >?
387 case PToken::Kind::Star: // ** or *+ or *?
388 case PToken::Kind::Optional: // ?* or ?+ or ??
389 error=true;
390 return;
391 default: // ok
392 break;
393 }
394 int ddiff = static_cast<int>(tokenPos-prevTokenPos);
395 if (*ps=='+') // convert <pat>+ -> <pat><pat>*
396 {
397 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
398 // ddiff=n ^prevTokenPos
399 data.resize(data.size()+ddiff);
400 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
401 prevTokenPos+=ddiff;
402 tokenPos+=ddiff;
403 }
404 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
405 {
406 // find the beginning of the capture range
407 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
408 {
409 prevTokenPos--;
410 }
411 }
412 data.insert(data.begin()+prevTokenPos,
414 tokenPos++;
415 addToken(PToken(PToken::Kind::End));
416 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
417 // ^prevTokenPos
418 // same for 'T?'.
419 }
420 break;
421 default:
422 prevTokenPos = tokenPos;
423 addToken(getNextCharacter());
424 break;
425 }
426 ps++;
427 }
428 if (!captureStack.empty()) // Unmatched '('?
429 {
430 error=true;
431 return;
432 }
433 captureCount = nextCaptureId;
434 //addToken(PToken(PToken::Kind::End));
435}
436
437#if ENABLE_DEBUG
438/** Dump the compiled token stream for this regular expression. For debugging purposes. */
439void Ex::Private::dump()
440{
441 size_t l = data.size();
442 size_t i =0;
443 DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
444 DBG("captureCount=%zu\n",captureCount);
445 while (i<l)
446 {
447 DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
448 if (data[i].kind()==PToken::Kind::CharClass || data[i].kind()==PToken::Kind::NegCharClass)
449 {
450 uint16_t num = data[i].value();
451 while (num>0 && i<l)
452 {
453 i++;
454 if (data[i].isRange()) // from-to range
455 {
456 DBG("[%04x(%c)-%04x(%c)]\n",data[i].from(),data[i].from(),data[i].to(),data[i].to());
457 }
458 else // special character like \n or \s
459 {
460 DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
461 }
462 num--;
463 }
464 }
465 i++;
466 }
467}
468#endif
469
470/** Internal matching routine.
471 * @param tokenPos Offset into the token stream.
472 * @param tokenLen The length of the token stream.
473 * @param str The input string to match against.
474 * @param match The object used to store the matching results.
475 * @param pos The position in the input string to start with matching
476 * @param level Recursion level (used for debugging)
477 */
478bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,Match &match,const size_t pos,int level) const
479{
480 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);
481 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
482 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
483 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
484 {
485 PToken tok = data[tp];
486 bool negate = tok.kind()==PToken::Kind::NegCharClass;
487 uint16_t numFields = tok.value();
488 bool found = false;
489 for (uint16_t i=0;i<numFields;i++)
490 {
491 tok = data[++tp];
492 // first check for built-in ranges
493 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
494 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
495 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
496 (tok.kind()==PToken::Kind::Digit && isdigit(c))
497 )
498 {
499 found=true;
500 break;
501 }
502 else // user specified range
503 {
504 uint16_t v = static_cast<uint16_t>(c);
505 if (tok.from()<=v && v<=tok.to())
506 {
507 found=true;
508 break;
509 }
510 }
511 }
512 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
513 return negate ? !found : found;
514 };
515 size_t index = pos;
516 enum SequenceType { Star, Optional, OptionalRange };
517 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
518 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
519 {
520 size_t startIndex = index;
521 size_t len = str.length();
522 PToken tok = data[++tokenPos];
523
524 // Special handling for an optional capture group: (...)?
525 if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
526 {
527 size_t groupId = tok.value();
528 size_t innerStart = tokenPos + 1;
529
530 // Find matching EndCapture, accounting for nesting depth
531 size_t tp = innerStart;
532 int depth = 1;
533 while (tp<tokenLen && depth>0)
534 {
535 if (data[tp].kind()==PToken::Kind::BeginCapture) depth++;
536 else if (data[tp].kind()==PToken::Kind::EndCapture) depth--;
537 tp++;
538 }
539 if (depth!=0) return false; // malformed, unmatched ')'
540 size_t endCapturePos = tp - 1; // position of EndCapture
541 size_t afterSeqPos = endCapturePos + 2; // skip EndCapture and End marker
542
543 // Try with the group present
544 Match tmp;
545 tmp.init(str, /*captureCount*/ captureCount);
546 bool innerOk = matchAt(innerStart,endCapturePos,str,tmp,index,level+1);
547 if (innerOk)
548 {
549 size_t capLen = tmp.length();
550
551 // Copy nested captures from tmp (they may exist inside the group)
552 for (size_t gid=1; gid<tmp.size(); gid++)
553 {
554 size_t sp = tmp[gid].position();
555 size_t sl = tmp[gid].length();
556 if (sp!=std::string::npos && sl!=std::string::npos)
557 {
558 match.startCapture(gid,sp);
559 match.endCapture(gid,sp+sl);
560 }
561 }
562 // Set the outer group's capture
563 match.startCapture(groupId,index);
564 match.endCapture(groupId,index+capLen);
565
566 bool ok = matchAt(afterSeqPos,tokenLen,str,match,index+capLen,level+1);
567 if (ok)
568 {
569 match.setMatch(pos,(index+capLen)-pos+match.length());
570 return true;
571 }
572 }
573
574 // Try with the group absent (empty capture)
575 match.startCapture(groupId,index);
576 match.endCapture(groupId,index); // zero-length
577
578 bool ok2 = matchAt(afterSeqPos,tokenLen,str,match,index,level+1);
579 if (ok2)
580 {
581 match.setMatch(pos,index-pos+match.length());
582 return true;
583 }
584 return false;
585 }
586
587 if (tok.kind()==PToken::Kind::Character) // 'x*' or 'x?'
588 {
589 char c_tok = tok.asciiValue();
590 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
591 tokenPos++;
592 }
593 else if (tok.isCharClass()) // '[a-f0-4]*' or '[...]?' -> eat matching characters
594 {
595 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
596 tokenPos+=tok.value()+1; // skip over character ranges + end token
597 }
598 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' or '\a?' -> eat start id characters
599 {
600 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
601 tokenPos++;
602 }
603 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' or '\w?' -> eat id characters
604 {
605 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
606 tokenPos++;
607 }
608 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' or '\s?' -> eat spaces
609 {
610 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
611 tokenPos++;
612 }
613 else if (tok.kind()==PToken::Kind::Digit) // '\d*' or '\d?' -> eat digits
614 {
615 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
616 tokenPos++;
617 }
618 else if (tok.kind()==PToken::Kind::Any) // '.*' or '.?' -> eat all
619 {
620 if (type==Optional) index++; else index = str.length();
621 tokenPos++;
622 }
623 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
624 {
625 size_t tokenStart = ++tokenPos;
626 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
627 Match rangeMatch;
628 rangeMatch.init(str,0);
629 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
630 if (found)
631 {
632 index+=rangeMatch.length(); // (abc)? matches -> eat all
633 }
634 tokenPos++; // skip over EndCapture
635 }
636 tokenPos++; // skip over end marker
637 while (index>=startIndex)
638 {
639 // pattern 'x*xy' should match 'xy' and 'xxxxy'
640 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
641 if (found)
642 {
643 match.setMatch(pos,index-pos+match.length());
644 return true;
645 }
646 if (index==0) break;
647 index--;
648 }
649 return false;
650 };
651
652 while (tokenPos<tokenLen)
653 {
654 PToken tok = data[tokenPos];
655 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
656 if (tok.kind()==PToken::Kind::Character) // match literal character
657 {
658 char c_tok = tok.asciiValue();
659 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
660 index++,tokenPos++;
661 }
662 else if (tok.isCharClass())
663 {
664 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
665 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
666 }
667 else
668 {
669 switch (tok.kind())
670 {
672 if (index>=str.length() || !isStartIdChar(str[index])) return false;
673 index++;
674 break;
676 if (index>=str.length() || !isIdChar(str[index])) return false;
677 index++;
678 break;
680 if (index>=str.length() || !isspace(str[index])) return false;
681 index++;
682 break;
684 if (index>=str.length() || !isdigit(str[index])) return false;
685 index++;
686 break;
688 if (index!=pos) return false;
689 break;
691 if (index<str.length()) return false;
692 break;
694 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
695 index,str[index],isIdChar(str[index]),
696 index>0?str[index]-1:0,
697 index>0?isIdChar(str[index-1]):-1);
698 if (index>=str.length() ||
699 !isIdChar(str[index]) ||
700 (index>0 && isIdChar(str[index-1]))) return false;
701 break;
703 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
704 index,pos,str[index],isIdChar(str[index]),
705 index==0 ? 0 : str[index-1],
706 index==0 ? -1 : isIdChar(str[index-1]));
707 if (index<str.length() &&
708 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
709 break;
711 DBG("BeginCapture(%zu) gid=%u\n",index,tok.value());
712 match.startCapture(tok.value(),index);
713 break;
715 DBG("EndCapture(%zu) gid=%u\n",index,tok.value());
716 match.endCapture(tok.value(),index);
717 break;
719 if (index>=str.length()) return false;
720 index++;
721 break;
723 return processSequence(Star);
725 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
726 {
727 return processSequence(OptionalRange); // (...)?
728 }
729 else
730 {
731 return processSequence(Optional); // x?
732 }
733 default:
734 return false;
735 }
736 tokenPos++;
737 }
738 }
739 match.setMatch(pos,index-pos);
740 return true;
741}
742
743static std::string wildcard2regex(std::string_view pattern)
744{
745 std::string result="^"; // match start of input
746 result.reserve(pattern.length());
747 for (size_t i=0;i<pattern.length();i++)
748 {
749 char c=pattern[i];
750 switch(c)
751 {
752 case '*':
753 result+=".*";
754 break; // '*' => '.*'
755 case '?':
756 result+='.';
757 break; // '?' => '.'
758 case '.':
759 case '+':
760 case '\\':
761 case '$':
762 case '^':
763 case '(':
764 case ')':
765 result+='\\'; result+=c; // escape
766 break;
767 case '[':
768 if (i<pattern.length()-1 && pattern[i+1]=='^') // don't escape ^ after [
769 {
770 result+="[^";
771 i++;
772 }
773 else
774 {
775 result+=c;
776 }
777 break;
778 default: // just copy
779 result+=c;
780 break;
781 }
782 }
783 result+='$'; // match end of input
784 return result;
785}
786
787
788Ex::Ex(std::string_view pattern, Mode mode)
789 : p(std::make_unique<Private>(mode==Mode::RegEx ? pattern : wildcard2regex(pattern)))
790{
791 p->compile();
792#if ENABLE_DEBUG
793 p->dump();
794 assert(!p->error);
795#endif
796}
797
798Ex::~Ex() = default;
799
800bool Ex::match(std::string_view str,Match &match,size_t pos) const
801{
802 bool found=false;
803 if (p->data.size()==0 || p->error) return found;
804 match.init(str,p->captureCount);
805
806 PToken tok = p->data[0];
807 if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position
808 {
809 found = p->matchAt(0,p->data.size(),str,match,pos,0);
810 }
811 else
812 {
813 if (tok.kind()==PToken::Kind::Character) // search for the start character
814 {
815 size_t index = str.find(tok.asciiValue(),pos);
816 if (index==std::string::npos)
817 {
818 DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",std::string(str).c_str(),pos,tok.asciiValue());
819 return false;
820 }
821 DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,std::string(str).c_str(),tok.asciiValue(),index);
822 pos=index;
823 }
824 while (pos<str.length()) // search for a match starting at pos
825 {
826 found = p->matchAt(0,p->data.size(),str,match,pos,0);
827 if (found) break;
828 pos++;
829 }
830 }
831 DBG("Ex::match(str='%s',pos=%zu)=%d\n",std::string(str).c_str(),pos,found);
832 return found;
833}
834
835bool Ex::isValid() const
836{
837 return !p->pattern.empty() && !p->error;
838}
839
840//----------------------------------------------------------------------------------------
841
842bool search(std::string_view str,Match &match,const Ex &re,size_t pos)
843{
844 return re.match(str,match,pos);
845}
846
847bool search(std::string_view str,const Ex &re,size_t pos)
848{
849 Match match;
850 return re.match(str,match,pos);
851}
852
853bool match(std::string_view str,Match &match,const Ex &re)
854{
855 return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
856}
857
858bool match(std::string_view str,const Ex &re)
859{
860 Match match;
861 return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
862}
863
864std::string replace(std::string_view str,const Ex &re,std::string_view replacement)
865{
866 std::string result;
867 Match match;
868 size_t p=0;
869 while (re.match(str,match,p))
870 {
871 size_t i=match.position();
872 size_t l=match.length();
873 if (i>p) result+=str.substr(p,i-p);
874 result+=replacement;
875 p=i+l;
876 }
877 if (p<str.length()) result+=str.substr(p);
878 return result;
879}
880
881}
Private members of a regular expression.
Definition regex.cpp:170
size_t captureCount
Number of capture groups in the pattern (excluding the whole match).
Definition regex.cpp:194
bool error
Flag indicating the expression was successfully compiled.
Definition regex.cpp:185
void compile()
Compiles a regular expression passed as a string into a stream of tokens that can be used for efficie...
Definition regex.cpp:200
std::string pattern
The pattern string as passed by the user.
Definition regex.cpp:191
Private(std::string_view pat)
Creates the private part.
Definition regex.cpp:173
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition regex.cpp:478
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition regex.cpp:188
Class representing a regular expression.
Definition regex.h:39
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
Definition regex.h:112
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:800
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition regex.cpp:788
Mode
Matching algorithm.
Definition regex.h:43
@ RegEx
full regular expression.
Definition regex.h:44
bool isValid() const
Definition regex.cpp:835
Object representing the matching results.
Definition regex.h:151
void init(std::string_view str, size_t captureCount)
Definition regex.h:190
size_t size() const
Returns the number of sub matches available in this match.
Definition regex.h:181
size_t position() const
Returns the position of the match or std::string::npos if no position is set.
Definition regex.h:157
size_t length() const
Returns the position of the match or std::string::npos if no length is set.
Definition regex.h:160
Class representing a token in the compiled regular expression token stream.
Definition regex.cpp:59
uint16_t to() const
Returns the 'to' part of the character range.
Definition regex.cpp:150
char asciiValue() const
Returns the value for this token as a ASCII character.
Definition regex.cpp:156
PToken(Kind k)
Creates a token of the given kind k.
Definition regex.cpp:127
PToken(char c)
Create a token for an ASCII character.
Definition regex.cpp:130
bool isRange() const
Returns true iff this token represents a range of characters.
Definition regex.cpp:159
Kind kind() const
Returns the kind of the token.
Definition regex.cpp:144
PToken()
Creates a token of kind 'End'.
Definition regex.cpp:124
uint16_t from() const
Returns the 'from' part of the character range.
Definition regex.cpp:147
const char * kindStr() const
returns a string representation of the tokens kind (useful for debugging).
Definition regex.cpp:92
Kind
The kind of token.
Definition regex.cpp:71
uint32_t m_rep
Definition regex.cpp:165
void setValue(uint16_t value)
Sets the value for a token.
Definition regex.cpp:141
uint16_t value() const
Returns the value for this token.
Definition regex.cpp:153
PToken(uint16_t v)
Create a token for a byte of an UTF-8 character.
Definition regex.cpp:134
bool isCharClass() const
Returns true iff this token is a positive or negative character class.
Definition regex.cpp:162
PToken(uint16_t from, uint16_t to)
Create a token representing a range from one character from to another character to.
Definition regex.cpp:138
#define DBG(x)
Definition dotrunner.cpp:63
#define isIdChar(c)
Definition markdown.cpp:77
Namespace for the regular expression functions.
Definition regex.cpp:31
static bool isalpha(char c)
Definition regex.cpp:38
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition regex.cpp:842
static std::string wildcard2regex(std::string_view pattern)
Definition regex.cpp:743
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
Definition regex.cpp:864
bool match(std::string_view str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition regex.cpp:853
static bool isspace(char c)
Definition regex.cpp:33
static bool isalnum(char c)
Definition regex.cpp:48
static bool isdigit(char c)
Definition regex.cpp:43