Doxygen
Loading...
Searching...
No Matches
reg::Ex::Private Class Reference

Private members of a regular expression. More...

Public Member Functions

 Private (std::string_view pat)
 Creates the private part.
 
void compile ()
 Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.
 
bool matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
 Internal matching routine.
 

Public Attributes

bool error = false
 Flag indicating whether an error occurred while compiling the expression (false means it compiled successfully).
 
std::vector< PToken > data
 The token stream representing the compiled regular expression.
 
std::string pattern
 The pattern string as passed by the user.
 

Detailed Description

Private members of a regular expression.

Definition at line 169 of file regex.cpp.

Constructor & Destructor Documentation

◆ Private()

reg::Ex::Private::Private ( std::string_view pat)
inline

Creates the private part: stores the pattern string and reserves space for the token stream.

Definition at line 173 of file regex.cpp.

173 : pattern(pat)
174 {
175 data.reserve(100);
176 }
std::string pattern
The pattern string as passed by the user.
Definition regex.cpp:191
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition regex.cpp:188

References data and pattern.

Member Function Documentation

◆ compile()

void reg::Ex::Private::compile ( )

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

Definition at line 197 of file regex.cpp.

198{
199 error = false;
200 data.clear();
201 if (pattern.empty()) return;
202 const char *start = pattern.c_str();
203 const char *ps = start;
204 char c = 0;
205
206 int prevTokenPos=-1;
207 int tokenPos=0;
208
209 auto addToken = [&](PToken tok)
210 {
211 tokenPos++;
212 data.emplace_back(tok);
213 };
214
215 auto getNextCharacter = [&]() -> PToken
216 {
217 char cs=*ps;
218 PToken result = PToken(cs);
219 if (cs=='\\') // escaped character
220 {
221 ps++;
222 cs=*ps;
223 switch (cs)
224 {
225 case 'n': result = PToken('\n'); break;
226 case 'r': result = PToken('\r'); break;
227 case 't': result = PToken('\t'); break;
228 case 's': result = PToken(PToken::Kind::WhiteSpace); break;
229 case 'a': result = PToken(PToken::Kind::Alpha); break;
230 case 'w': result = PToken(PToken::Kind::AlphaNum); break;
231 case 'd': result = PToken(PToken::Kind::Digit); break;
232 case '<': result = PToken(PToken::Kind::BeginOfWord); break;
233 case '>': result = PToken(PToken::Kind::EndOfWord); break;
234 case 'x':
235 case 'X':
236 {
237 uint16_t v=0;
238 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
239 {
240 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
241 (cs>='A' && cs<='F') ? cs-'A'+10 :
242 (cs>='0' && cs<='9') ? cs-'0' :
243 -1;
244 if (d>=0) { v<<=4; v|=d; ps++; } else break;
245 }
246 result = PToken(v);
247 }
248 break;
249 case '\0': ps--; break; // backslash at the end of the pattern
250 default:
251 result = PToken(cs);
252 break;
253 }
254 }
255 return result;
256 };
257
258 while ((c=*ps))
259 {
260 switch (c)
261 {
262 case '^': // beginning of line (if first character of the pattern)
263 prevTokenPos = tokenPos;
264 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
265 PToken(c));
266 break;
267 case '$': // end of the line (if last character of the pattern)
268 prevTokenPos = tokenPos;
269 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
270 PToken(c));
271 break;
272 case '.': // any character
273 prevTokenPos = tokenPos;
274 addToken(PToken(PToken::Kind::Any));
275 break;
276 case '(': // begin of capture group
277 prevTokenPos = tokenPos;
278 addToken(PToken(PToken::Kind::BeginCapture));
279 break;
280 case ')': // end of capture group
281 prevTokenPos = tokenPos;
282 addToken(PToken(PToken::Kind::EndCapture));
283 break;
284 case '[': // character class
285 {
286 prevTokenPos = tokenPos;
287 ps++;
288 if (*ps==0) { error=true; return; }
289 bool esc = *ps=='\\';
290 PToken tok = getNextCharacter();
291 ps++;
292 if (!esc && tok.kind()==PToken::Kind::Character &&
293 tok.asciiValue()=='^') // negated character class
294 {
295 addToken(PToken(PToken::Kind::NegCharClass));
296 if (*ps==0) { error=true; return; }
297 tok = getNextCharacter();
298 ps++;
299 }
300 else
301 {
302 addToken(PToken(PToken::Kind::CharClass));
303 }
304 uint16_t numTokens=0;
305 while ((c=*ps))
306 {
307 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
308 {
309 getNextCharacter();
310 ps++;
311 PToken endTok = getNextCharacter();
312 ps++;
313 if (tok.value()>endTok.value())
314 {
315 addToken(PToken(endTok.value(),tok.value())); // swap start and end
316 }
317 else
318 {
319 addToken(PToken(tok.value(),endTok.value()));
320 }
321 numTokens++;
322 }
323 else // single char, from==to
324 {
325 if (tok.kind()==PToken::Kind::Character)
326 {
327 addToken(PToken(tok.value(),tok.value()));
328 }
329 else // special token, add as-is since from>to
330 {
331 addToken(tok);
332 }
333 numTokens++;
334 }
335 if (*ps==0) { error=true; return; } // expected at least a ]
336 esc = *ps=='\\';
337 tok = getNextCharacter();
338 if (!esc && tok.kind()==PToken::Kind::Character &&
339 tok.value()==static_cast<uint16_t>(']'))
340 {
341 break; // end of character class
342 }
343 if (*ps==0) { error=true; return; } // no ] found
344 ps++;
345 }
346 // set the value of either NegCharClass or CharClass
347 data[prevTokenPos].setValue(numTokens);
348 }
349 break;
350 case '*': // 0 or more
351 case '+': // 1 or more
352 case '?': // optional: 0 or 1
353 {
354 if (prevTokenPos==-1)
355 {
356 error=true;
357 return;
358 }
359 switch (data[prevTokenPos].kind())
360 {
361 case PToken::Kind::BeginOfLine: // $* or $+ or $?
362 case PToken::Kind::BeginOfWord: // <* or <+ or <?
363 case PToken::Kind::EndOfWord: // >* or >+ or >?
364 case PToken::Kind::Star: // ** or *+ or *?
365 case PToken::Kind::Optional: // ?* or ?+ or ??
366 error=true;
367 return;
368 default: // ok
369 break;
370 }
371 int ddiff = static_cast<int>(tokenPos-prevTokenPos);
372 if (*ps=='+') // convert <pat>+ -> <pat><pat>*
373 {
374 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
375 // ddiff=n ^prevTokenPos
376 data.resize(data.size()+ddiff);
377 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
378 prevTokenPos+=ddiff;
379 tokenPos+=ddiff;
380 }
381 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
382 {
383 // find the beginning of the capture range
384 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
385 {
386 prevTokenPos--;
387 }
388 }
389 data.insert(data.begin()+prevTokenPos,
390 c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
391 tokenPos++;
392 addToken(PToken(PToken::Kind::End));
393 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
394 // ^prevTokenPos
395 // same for 'T?'.
396 }
397 break;
398 default:
399 prevTokenPos = tokenPos;
400 addToken(getNextCharacter());
401 break;
402 }
403 ps++;
404 }
405 //addToken(PToken(PToken::Kind::End));
406}
bool error
Flag indicating whether an error occurred while compiling the expression (false means it compiled successfully).
Definition regex.cpp:185

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind(), reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value(), and reg::PToken::WhiteSpace.

◆ matchAt()

bool reg::Ex::Private::matchAt ( size_t tokenPos,
size_t tokenLen,
std::string_view str,
Match & match,
size_t pos,
int level ) const

Internal matching routine.

Parameters
tokenPos: Offset into the token stream.
tokenLen: The length of the token stream.
str: The input string to match against.
match: The object used to store the matching results.
pos: The position in the input string at which to start matching.
level: Recursion level (used for debugging).

Definition at line 448 of file regex.cpp.

449{
450 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);
451 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
452 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
453 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
454 {
455 PToken tok = data[tp];
456 bool negate = tok.kind()==PToken::Kind::NegCharClass;
457 uint16_t numFields = tok.value();
458 bool found = false;
459 for (uint16_t i=0;i<numFields;i++)
460 {
461 tok = data[++tp];
462 // first check for built-in ranges
463 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
464 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
465 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
466 (tok.kind()==PToken::Kind::Digit && isdigit(c))
467 )
468 {
469 found=true;
470 break;
471 }
472 else // user specified range
473 {
474 uint16_t v = static_cast<uint16_t>(c);
475 if (tok.from()<=v && v<=tok.to())
476 {
477 found=true;
478 break;
479 }
480 }
481 }
482 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
483 return negate ? !found : found;
484 };
485 size_t index = pos;
486 enum SequenceType { Star, Optional, OptionalRange };
487 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
488 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
489 {
490 size_t startIndex = index;
491 size_t len = str.length();
492 PToken tok = data[++tokenPos];
493 if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
494 {
495 char c_tok = tok.asciiValue();
496 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
497 tokenPos++;
498 }
499 else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
500 {
501 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
502 tokenPos+=tok.value()+1; // skip over character ranges + end token
503 }
504 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
505 {
506 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
507 tokenPos++;
508 }
509 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
510 {
511 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
512 tokenPos++;
513 }
514 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
515 {
516 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
517 tokenPos++;
518 }
519 else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
520 {
521 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
522 tokenPos++;
523 }
524 else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
525 {
526 if (type==Optional) index++; else index = str.length();
527 tokenPos++;
528 }
529 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
530 {
531 size_t tokenStart = ++tokenPos;
532 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
533 Match rangeMatch;
534 rangeMatch.init(str);
535 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
536 if (found)
537 {
538 index+=rangeMatch.length(); // (abc)? matches -> eat all
539 }
540 tokenPos++; // skip over EndCapture
541 }
542 tokenPos++; // skip over end marker
543 while (index>=startIndex)
544 {
545 // pattern 'x*xy' should match 'xy' and 'xxxxy'
546 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
547 if (found)
548 {
549 match.setMatch(pos,index-pos+match.length());
550 return true;
551 }
552 if (index==0) break;
553 index--;
554 }
555 return false;
556 };
557
558 while (tokenPos<tokenLen)
559 {
560 PToken tok = data[tokenPos];
561 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
562 if (tok.kind()==PToken::Kind::Character) // match literal character
563 {
564 char c_tok = tok.asciiValue();
565 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
566 index++,tokenPos++;
567 }
568 else if (tok.isCharClass())
569 {
570 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
571 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
572 }
573 else
574 {
575 switch (tok.kind())
576 {
578 if (index>=str.length() || !isStartIdChar(str[index])) return false;
579 index++;
580 break;
582 if (index>=str.length() || !isIdChar(str[index])) return false;
583 index++;
584 break;
586 if (index>=str.length() || !isspace(str[index])) return false;
587 index++;
588 break;
590 if (index>=str.length() || !isdigit(str[index])) return false;
591 index++;
592 break;
594 if (index!=pos) return false;
595 break;
597 if (index<str.length()) return false;
598 break;
600 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
601 index,str[index],isIdChar(str[index]),
602 index>0?str[index]-1:0,
603 index>0?isIdChar(str[index-1]):-1);
604 if (index>=str.length() ||
605 !isIdChar(str[index]) ||
606 (index>0 && isIdChar(str[index-1]))) return false;
607 break;
609 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
610 index,pos,str[index],isIdChar(str[index]),
611 index==0 ? 0 : str[index-1],
612 index==0 ? -1 : isIdChar(str[index-1]));
613 if (index<str.length() &&
614 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
615 break;
617 DBG("BeginCapture(%zu)\n",index);
618 match.startCapture(index);
619 break;
621 DBG("EndCapture(%zu)\n",index);
622 match.endCapture(index);
623 break;
625 if (index>=str.length()) return false;
626 index++;
627 break;
629 return processSequence(Star);
631 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
632 {
633 return processSequence(OptionalRange); // (...)?
634 }
635 else
636 {
637 return processSequence(Optional); // x?
638 }
639 default:
640 return false;
641 }
642 tokenPos++;
643 }
644 }
645 match.setMatch(pos,index-pos);
646 return true;
647}
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition regex.cpp:448
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition regex.cpp:706
#define DBG(x)
Definition dotrunner.cpp:63
#define isIdChar(c)
Definition markdown.cpp:77
static bool isalpha(char c)
Definition regex.cpp:38
static bool isspace(char c)
Definition regex.cpp:33
static bool isalnum(char c)
Definition regex.cpp:48
static bool isdigit(char c)
Definition regex.cpp:43
bool found
Definition util.cpp:984

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, data, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, found, reg::PToken::from(), reg::Match::init(), reg::isalnum(), reg::isalpha(), reg::PToken::isCharClass(), reg::isdigit(), isIdChar, reg::isspace(), reg::PToken::kind(), reg::PToken::kindStr(), reg::Match::length(), reg::Ex::match(), matchAt(), reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to(), reg::PToken::value(), and reg::PToken::WhiteSpace.

Referenced by matchAt().

Member Data Documentation

◆ data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

Definition at line 188 of file regex.cpp.

Referenced by compile(), matchAt(), and Private().

◆ error

bool reg::Ex::Private::error = false

Flag indicating whether an error occurred while compiling the expression (false means it compiled successfully).

Definition at line 185 of file regex.cpp.

Referenced by compile().

◆ pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user.

Definition at line 191 of file regex.cpp.

Referenced by compile() and Private().


The documentation for this class was generated from the following file: