Doxygen
Loading...
Searching...
No Matches
reg::Ex::Private Class Reference

Private members of a regular expression. More...

Public Member Functions

 Private (std::string_view pat)
 Creates the private part.
void compile ()
 Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.
bool matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
 Internal matching routine.

Public Attributes

bool error = false
 Flag indicating the expression was successfully compiled.
std::vector< PTokendata
 The token stream representing the compiled regular expression.
std::string pattern
 The pattern string as passed by the user.
size_t captureCount = 0
 Number of capture groups in the pattern (excluding the whole match).

Detailed Description

Private members of a regular expression.

Definition at line 169 of file regex.cpp.

Constructor & Destructor Documentation

◆ Private()

reg::Ex::Private::Private ( std::string_view pat)
inline

Creates the private part.

Definition at line 173 of file regex.cpp.

173 : pattern(pat)
174 {
175 data.reserve(100);
176 }
std::string pattern
The pattern string as passed by the user.
Definition regex.cpp:191
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition regex.cpp:188

References data, and pattern.

Member Function Documentation

◆ compile()

void reg::Ex::Private::compile ( )

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

Definition at line 200 of file regex.cpp.

201{
202 error = false;
203 data.clear();
204 captureCount = 0;
205 if (pattern.empty()) return;
206 const char *start = pattern.c_str();
207 const char *ps = start;
208 char c = 0;
209
210 int prevTokenPos=-1;
211 int tokenPos=0;
212
213 // capture group assignment
214 std::vector<size_t> captureStack;
215 size_t nextCaptureId = 0;
216
217 auto addToken = [&](PToken tok)
218 {
219 tokenPos++;
220 data.emplace_back(tok);
221 };
222
223 auto getNextCharacter = [&]() -> PToken
224 {
225 char cs=*ps;
226 PToken result = PToken(cs);
227 if (cs=='\\') // escaped character
228 {
229 ps++;
230 cs=*ps;
231 switch (cs)
232 {
233 case 'n': result = PToken('\n'); break;
234 case 'r': result = PToken('\r'); break;
235 case 't': result = PToken('\t'); break;
236 case 's': result = PToken(PToken::Kind::WhiteSpace); break;
237 case 'a': result = PToken(PToken::Kind::Alpha); break;
238 case 'w': result = PToken(PToken::Kind::AlphaNum); break;
239 case 'd': result = PToken(PToken::Kind::Digit); break;
240 case '<': result = PToken(PToken::Kind::BeginOfWord); break;
241 case '>': result = PToken(PToken::Kind::EndOfWord); break;
242 case 'x':
243 case 'X':
244 {
245 uint16_t v=0;
246 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
247 {
248 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
249 (cs>='A' && cs<='F') ? cs-'A'+10 :
250 (cs>='0' && cs<='9') ? cs-'0' :
251 -1;
252 if (d>=0) { v<<=4; v|=d; ps++; } else break;
253 }
254 result = PToken(v);
255 }
256 break;
257 case '\0': ps--; break; // backslash at the end of the pattern
258 default:
259 result = PToken(cs);
260 break;
261 }
262 }
263 return result;
264 };
265
266 while ((c=*ps))
267 {
268 switch (c)
269 {
270 case '^': // beginning of line (if first character of the pattern)
271 prevTokenPos = tokenPos;
272 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
273 PToken(c));
274 break;
275 case '$': // end of the line (if last character of the pattern)
276 prevTokenPos = tokenPos;
277 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
278 PToken(c));
279 break;
280 case '.': // any character
281 prevTokenPos = tokenPos;
282 addToken(PToken(PToken::Kind::Any));
283 break;
284 case '(': // begin of capture group
285 {
286 prevTokenPos = tokenPos;
287 addToken(PToken(PToken::Kind::BeginCapture));
288 size_t id = ++nextCaptureId; // groups start at 1, 0 is whole match
289 data.back().setValue(id);
290 captureStack.push_back(id);
291 }
292 break;
293 case ')': // end of capture group
294 {
295 prevTokenPos = tokenPos;
296 if (captureStack.empty())
297 {
298 error=true;
299 return;
300 }
301 size_t id = captureStack.back();
302 captureStack.pop_back();
303 addToken(PToken(PToken::Kind::EndCapture));
304 data.back().setValue(id);
305 }
306 break;
307 case '[': // character class
308 {
309 prevTokenPos = tokenPos;
310 ps++;
311 if (*ps==0) { error=true; return; }
312 bool esc = *ps=='\\';
313 PToken tok = getNextCharacter();
314 ps++;
315 if (!esc && tok.kind()==PToken::Kind::Character &&
316 tok.asciiValue()=='^') // negated character class
317 {
318 addToken(PToken(PToken::Kind::NegCharClass));
319 if (*ps==0) { error=true; return; }
320 tok = getNextCharacter();
321 ps++;
322 }
323 else
324 {
325 addToken(PToken(PToken::Kind::CharClass));
326 }
327 uint16_t numTokens=0;
328 while ((c=*ps))
329 {
330 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
331 {
332 getNextCharacter();
333 ps++;
334 PToken endTok = getNextCharacter();
335 ps++;
336 if (tok.value()>endTok.value())
337 {
338 addToken(PToken(endTok.value(),tok.value())); // swap start and end
339 }
340 else
341 {
342 addToken(PToken(tok.value(),endTok.value()));
343 }
344 numTokens++;
345 }
346 else // single char, from==to
347 {
348 if (tok.kind()==PToken::Kind::Character)
349 {
350 addToken(PToken(tok.value(),tok.value()));
351 }
352 else // special token, add as-is since from>to
353 {
354 addToken(tok);
355 }
356 numTokens++;
357 }
358 if (*ps==0) { error=true; return; } // expected at least a ]
359 esc = *ps=='\\';
360 tok = getNextCharacter();
361 if (!esc && tok.kind()==PToken::Kind::Character &&
362 tok.value()==static_cast<uint16_t>(']'))
363 {
364 break; // end of character class
365 }
366 if (*ps==0) { error=true; return; } // no ] found
367 ps++;
368 }
369 // set the value of either NegCharClass or CharClass
370 data[prevTokenPos].setValue(numTokens);
371 }
372 break;
373 case '*': // 0 or more
374 case '+': // 1 or more
375 case '?': // optional: 0 or 1
376 {
377 if (prevTokenPos==-1)
378 {
379 error=true;
380 return;
381 }
382 switch (data[prevTokenPos].kind())
383 {
384 case PToken::Kind::BeginOfLine: // $* or $+ or $?
385 case PToken::Kind::BeginOfWord: // <* or <+ or <?
386 case PToken::Kind::EndOfWord: // >* or >+ or >?
387 case PToken::Kind::Star: // ** or *+ or *?
388 case PToken::Kind::Optional: // ?* or ?+ or ??
389 error=true;
390 return;
391 default: // ok
392 break;
393 }
394 int ddiff = static_cast<int>(tokenPos-prevTokenPos);
395 if (*ps=='+') // convert <pat>+ -> <pat><pat>*
396 {
397 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
398 // ddiff=n ^prevTokenPos
399 data.resize(data.size()+ddiff);
400 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
401 prevTokenPos+=ddiff;
402 tokenPos+=ddiff;
403 }
404 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
405 {
406 // find the beginning of the capture range
407 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
408 {
409 prevTokenPos--;
410 }
411 }
412 data.insert(data.begin()+prevTokenPos,
413 c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
414 tokenPos++;
415 addToken(PToken(PToken::Kind::End));
416 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
417 // ^prevTokenPos
418 // same for 'T?'.
419 }
420 break;
421 default:
422 prevTokenPos = tokenPos;
423 addToken(getNextCharacter());
424 break;
425 }
426 ps++;
427 }
428 if (!captureStack.empty()) // Unmatched '('?
429 {
430 error=true;
431 return;
432 }
433 captureCount = nextCaptureId;
434 //addToken(PToken(PToken::Kind::End));
435}
size_t captureCount
Number of capture groups in the pattern (excluding the whole match).
Definition regex.cpp:194
bool error
Flag indicating the expression was successfully compiled.
Definition regex.cpp:185

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, captureCount, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind(), reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value(), and reg::PToken::WhiteSpace.

◆ matchAt()

bool reg::Ex::Private::matchAt ( size_t tokenPos,
size_t tokenLen,
std::string_view str,
Match & match,
size_t pos,
int level ) const

Internal matching routine.

Parameters
tokenPosOffset into the token stream.
tokenLenThe length of the token stream.
strThe input string to match against.
matchThe object used to store the matching results.
posThe position in the input string to start with matching
levelRecursion level (used for debugging)

Definition at line 478 of file regex.cpp.

479{
480 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);
481 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
482 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
483 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
484 {
485 PToken tok = data[tp];
486 bool negate = tok.kind()==PToken::Kind::NegCharClass;
487 uint16_t numFields = tok.value();
488 bool found = false;
489 for (uint16_t i=0;i<numFields;i++)
490 {
491 tok = data[++tp];
492 // first check for built-in ranges
493 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
494 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
495 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
496 (tok.kind()==PToken::Kind::Digit && isdigit(c))
497 )
498 {
499 found=true;
500 break;
501 }
502 else // user specified range
503 {
504 uint16_t v = static_cast<uint16_t>(c);
505 if (tok.from()<=v && v<=tok.to())
506 {
507 found=true;
508 break;
509 }
510 }
511 }
512 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
513 return negate ? !found : found;
514 };
515 size_t index = pos;
516 enum SequenceType { Star, Optional, OptionalRange };
517 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
518 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
519 {
520 size_t startIndex = index;
521 size_t len = str.length();
522 PToken tok = data[++tokenPos];
523
524 // Special handling for an optional capture group: (...)?
525 if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
526 {
527 size_t groupId = tok.value();
528 size_t innerStart = tokenPos + 1;
529
530 // Find matching EndCapture, accounting for nesting depth
531 size_t tp = innerStart;
532 int depth = 1;
533 while (tp<tokenLen && depth>0)
534 {
535 if (data[tp].kind()==PToken::Kind::BeginCapture) depth++;
536 else if (data[tp].kind()==PToken::Kind::EndCapture) depth--;
537 tp++;
538 }
539 if (depth!=0) return false; // malformed, unmatched ')'
540 size_t endCapturePos = tp - 1; // position of EndCapture
541 size_t afterSeqPos = endCapturePos + 2; // skip EndCapture and End marker
542
543 // Try with the group present
544 Match tmp;
545 tmp.init(str, /*captureCount*/ captureCount);
546 bool innerOk = matchAt(innerStart,endCapturePos,str,tmp,index,level+1);
547 if (innerOk)
548 {
549 size_t capLen = tmp.length();
550
551 // Copy nested captures from tmp (they may exist inside the group)
552 for (size_t gid=1; gid<tmp.size(); gid++)
553 {
554 size_t sp = tmp[gid].position();
555 size_t sl = tmp[gid].length();
556 if (sp!=std::string::npos && sl!=std::string::npos)
557 {
558 match.startCapture(gid,sp);
559 match.endCapture(gid,sp+sl);
560 }
561 }
562 // Set the outer group's capture
563 match.startCapture(groupId,index);
564 match.endCapture(groupId,index+capLen);
565
566 bool ok = matchAt(afterSeqPos,tokenLen,str,match,index+capLen,level+1);
567 if (ok)
568 {
569 match.setMatch(pos,(index+capLen)-pos+match.length());
570 return true;
571 }
572 }
573
574 // Try with the group absent (empty capture)
575 match.startCapture(groupId,index);
576 match.endCapture(groupId,index); // zero-length
577
578 bool ok2 = matchAt(afterSeqPos,tokenLen,str,match,index,level+1);
579 if (ok2)
580 {
581 match.setMatch(pos,index-pos+match.length());
582 return true;
583 }
584 return false;
585 }
586
587 if (tok.kind()==PToken::Kind::Character) // 'x*' or 'x?'
588 {
589 char c_tok = tok.asciiValue();
590 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
591 tokenPos++;
592 }
593 else if (tok.isCharClass()) // '[a-f0-4]*' or '[...]?' -> eat matching characters
594 {
595 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
596 tokenPos+=tok.value()+1; // skip over character ranges + end token
597 }
598 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' or '\a?' -> eat start id characters
599 {
600 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
601 tokenPos++;
602 }
603 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' or '\w?' -> eat id characters
604 {
605 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
606 tokenPos++;
607 }
608 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' or '\s?' -> eat spaces
609 {
610 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
611 tokenPos++;
612 }
613 else if (tok.kind()==PToken::Kind::Digit) // '\d*' or '\d?' -> eat digits
614 {
615 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
616 tokenPos++;
617 }
618 else if (tok.kind()==PToken::Kind::Any) // '.*' or '.?' -> eat all
619 {
620 if (type==Optional) index++; else index = str.length();
621 tokenPos++;
622 }
623 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
624 {
625 size_t tokenStart = ++tokenPos;
626 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
627 Match rangeMatch;
628 rangeMatch.init(str,0);
629 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
630 if (found)
631 {
632 index+=rangeMatch.length(); // (abc)? matches -> eat all
633 }
634 tokenPos++; // skip over EndCapture
635 }
636 tokenPos++; // skip over end marker
637 while (index>=startIndex)
638 {
639 // pattern 'x*xy' should match 'xy' and 'xxxxy'
640 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
641 if (found)
642 {
643 match.setMatch(pos,index-pos+match.length());
644 return true;
645 }
646 if (index==0) break;
647 index--;
648 }
649 return false;
650 };
651
652 while (tokenPos<tokenLen)
653 {
654 PToken tok = data[tokenPos];
655 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
656 if (tok.kind()==PToken::Kind::Character) // match literal character
657 {
658 char c_tok = tok.asciiValue();
659 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
660 index++,tokenPos++;
661 }
662 else if (tok.isCharClass())
663 {
664 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
665 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
666 }
667 else
668 {
669 switch (tok.kind())
670 {
672 if (index>=str.length() || !isStartIdChar(str[index])) return false;
673 index++;
674 break;
676 if (index>=str.length() || !isIdChar(str[index])) return false;
677 index++;
678 break;
680 if (index>=str.length() || !isspace(str[index])) return false;
681 index++;
682 break;
684 if (index>=str.length() || !isdigit(str[index])) return false;
685 index++;
686 break;
688 if (index!=pos) return false;
689 break;
691 if (index<str.length()) return false;
692 break;
694 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
695 index,str[index],isIdChar(str[index]),
696 index>0?str[index]-1:0,
697 index>0?isIdChar(str[index-1]):-1);
698 if (index>=str.length() ||
699 !isIdChar(str[index]) ||
700 (index>0 && isIdChar(str[index-1]))) return false;
701 break;
703 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
704 index,pos,str[index],isIdChar(str[index]),
705 index==0 ? 0 : str[index-1],
706 index==0 ? -1 : isIdChar(str[index-1]));
707 if (index<str.length() &&
708 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
709 break;
711 DBG("BeginCapture(%zu) gid=%u\n",index,tok.value());
712 match.startCapture(tok.value(),index);
713 break;
715 DBG("EndCapture(%zu) gid=%u\n",index,tok.value());
716 match.endCapture(tok.value(),index);
717 break;
719 if (index>=str.length()) return false;
720 index++;
721 break;
723 return processSequence(Star);
725 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
726 {
727 return processSequence(OptionalRange); // (...)?
728 }
729 else
730 {
731 return processSequence(Optional); // x?
732 }
733 default:
734 return false;
735 }
736 tokenPos++;
737 }
738 }
739 match.setMatch(pos,index-pos);
740 return true;
741}
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition regex.cpp:478
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:800
#define DBG(x)
Definition dotrunner.cpp:63
#define isIdChar(c)
Definition markdown.cpp:77
static bool isalpha(char c)
Definition regex.cpp:38
static bool isspace(char c)
Definition regex.cpp:33
static bool isalnum(char c)
Definition regex.cpp:48
static bool isdigit(char c)
Definition regex.cpp:43

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, captureCount, reg::PToken::Character, data, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, reg::PToken::from(), reg::Match::init(), reg::isalnum(), reg::isalpha(), reg::PToken::isCharClass(), reg::isdigit(), isIdChar, reg::isspace(), reg::PToken::kind(), reg::PToken::kindStr(), reg::Match::length(), reg::Ex::match(), matchAt(), reg::PToken::NegCharClass, reg::PToken::Optional, reg::Match::position(), reg::Match::size(), reg::PToken::Star, reg::PToken::to(), reg::PToken::value(), and reg::PToken::WhiteSpace.

Referenced by matchAt().

Member Data Documentation

◆ captureCount

size_t reg::Ex::Private::captureCount = 0

Number of capture groups in the pattern (excluding the whole match).

Definition at line 194 of file regex.cpp.

Referenced by compile(), and matchAt().

◆ data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

Definition at line 188 of file regex.cpp.

Referenced by compile(), matchAt(), and Private().

◆ error

bool reg::Ex::Private::error = false

Flag indicating the expression was successfully compiled.

Definition at line 185 of file regex.cpp.

Referenced by compile().

◆ pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user.

Definition at line 191 of file regex.cpp.

Referenced by compile(), and Private().


The documentation for this class was generated from the following file: