25#define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0)
27#define DBG(fmt,...) do {} while(0)
35 return c==
' ' || c==
'\t' || c==
'\n' || c==
'\r';
40 return static_cast<unsigned char>(c)>=128 || (c>=
'a' && c<=
'z') || (c>=
'A' && c<=
'Z');
45 return c>=
'0' && c<=
'9';
131 static_cast<uint32_t>(c)) {}
135 static_cast<uint32_t>(v)) {}
150 uint16_t
to()
const {
return m_rep & 0xFFFF; }
181 bool matchAt(
size_t tokenPos,
size_t tokenLen,std::string_view str,
206 const char *start =
pattern.c_str();
207 const char *ps = start;
214 std::vector<size_t> captureStack;
215 size_t nextCaptureId = 0;
217 auto addToken = [&](
PToken tok)
220 data.emplace_back(tok);
223 auto getNextCharacter = [&]() ->
PToken
233 case 'n': result =
PToken(
'\n');
break;
234 case 'r': result =
PToken(
'\r');
break;
235 case 't': result =
PToken(
'\t');
break;
246 for (
int i=0;i<2 && (cs=(*(ps+1)));i++)
248 int d = (cs>=
'a' && cs<=
'f') ? cs-
'a'+10 :
249 (cs>=
'A' && cs<=
'F') ? cs-
'A'+10 :
250 (cs>=
'0' && cs<=
'9') ? cs-
'0' :
252 if (d>=0) { v<<=4; v|=d; ps++; }
else break;
257 case '\0': ps--;
break;
271 prevTokenPos = tokenPos;
276 prevTokenPos = tokenPos;
281 prevTokenPos = tokenPos;
286 prevTokenPos = tokenPos;
288 size_t id = ++nextCaptureId;
289 data.back().setValue(
id);
290 captureStack.push_back(
id);
295 prevTokenPos = tokenPos;
296 if (captureStack.empty())
301 size_t id = captureStack.back();
302 captureStack.pop_back();
304 data.back().setValue(
id);
309 prevTokenPos = tokenPos;
311 if (*ps==0) {
error=
true;
return; }
312 bool esc = *ps==
'\\';
313 PToken tok = getNextCharacter();
319 if (*ps==0) {
error=
true;
return; }
320 tok = getNextCharacter();
327 uint16_t numTokens=0;
330 if (c==
'-' && *(ps+1)!=
']' && *(ps+1)!=0)
334 PToken endTok = getNextCharacter();
358 if (*ps==0) {
error=
true;
return; }
360 tok = getNextCharacter();
362 tok.
value()==
static_cast<uint16_t
>(
']'))
366 if (*ps==0) {
error=
true;
return; }
370 data[prevTokenPos].setValue(numTokens);
377 if (prevTokenPos==-1)
382 switch (
data[prevTokenPos].kind())
394 int ddiff =
static_cast<int>(tokenPos-prevTokenPos);
400 std::copy_n(
data.begin()+prevTokenPos,ddiff,
data.begin()+tokenPos);
412 data.insert(
data.begin()+prevTokenPos,
422 prevTokenPos = tokenPos;
423 addToken(getNextCharacter());
428 if (!captureStack.empty())
439void Ex::Private::dump()
441 size_t l = data.size();
443 DBG(
"==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
444 DBG(
"captureCount=%zu\n",captureCount);
447 DBG(
"[%s:%04x]\n",data[i].kindStr(),data[i].value());
450 uint16_t num = data[i].value();
454 if (data[i].isRange())
456 DBG(
"[%04x(%c)-%04x(%c)]\n",data[i].from(),data[i].from(),data[i].to(),data[i].to());
460 DBG(
"[%s:%04x]\n",data[i].kindStr(),data[i].value());
480 DBG(
"%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() :
"",pos);
481 auto isStartIdChar = [](
char c) {
return isalpha(c) || c==
'_'; };
483 auto matchCharClass = [
this,isStartIdChar,
isIdChar](
size_t tp,
char c) ->
bool
487 uint16_t numFields = tok.
value();
489 for (uint16_t i=0;i<numFields;i++)
504 uint16_t v =
static_cast<uint16_t
>(c);
505 if (tok.
from()<=v && v<=tok.
to())
512 DBG(
"matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
513 return negate ? !found : found;
516 enum SequenceType { Star, Optional, OptionalRange };
517 auto processSequence = [
this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
518 &isStartIdChar,&
isIdChar,&
match,&level,&pos](SequenceType type) ->
bool
520 size_t startIndex = index;
521 size_t len = str.length();
527 size_t groupId = tok.
value();
528 size_t innerStart = tokenPos + 1;
531 size_t tp = innerStart;
533 while (tp<tokenLen && depth>0)
539 if (depth!=0)
return false;
540 size_t endCapturePos = tp - 1;
541 size_t afterSeqPos = endCapturePos + 2;
546 bool innerOk =
matchAt(innerStart,endCapturePos,str,tmp,index,level+1);
549 size_t capLen = tmp.
length();
552 for (
size_t gid=1; gid<tmp.
size(); gid++)
555 size_t sl = tmp[gid].
length();
556 if (sp!=std::string::npos && sl!=std::string::npos)
558 match.startCapture(gid,sp);
559 match.endCapture(gid,sp+sl);
563 match.startCapture(groupId,index);
564 match.endCapture(groupId,index+capLen);
566 bool ok =
matchAt(afterSeqPos,tokenLen,str,
match,index+capLen,level+1);
569 match.setMatch(pos,(index+capLen)-pos+
match.length());
575 match.startCapture(groupId,index);
576 match.endCapture(groupId,index);
578 bool ok2 =
matchAt(afterSeqPos,tokenLen,str,
match,index,level+1);
590 while (index<len && str[index]==c_tok) { index++;
if (type==Optional)
break; }
595 while (index<len && matchCharClass(tokenPos,str[index])) { index++;
if (type==Optional)
break; }
596 tokenPos+=tok.
value()+1;
600 while (index<len && isStartIdChar(str[index])) { index++;
if (type==Optional)
break; }
605 while (index<len &&
isIdChar(str[index])) { index++;
if (type==Optional)
break; }
610 while (index<len &&
isspace(str[index])) { index++;
if (type==Optional)
break; }
615 while (index<len &&
isdigit(str[index])) { index++;
if (type==Optional)
break; }
620 if (type==Optional) index++;
else index = str.length();
625 size_t tokenStart = ++tokenPos;
628 rangeMatch.
init(str,0);
629 bool found =
matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
632 index+=rangeMatch.
length();
637 while (index>=startIndex)
640 bool found =
matchAt(tokenPos,tokenLen,str,
match,index,level+1);
652 while (tokenPos<tokenLen)
655 DBG(
"loop tokenPos=%zu token=%s\n",tokenPos,tok.
kindStr());
659 if (index>=str.length() || str[index]!=c_tok)
return false;
664 if (index>=str.length() || !matchCharClass(tokenPos,str[index]))
return false;
665 index++,tokenPos+=tok.
value()+1;
672 if (index>=str.length() || !isStartIdChar(str[index]))
return false;
676 if (index>=str.length() || !
isIdChar(str[index]))
return false;
680 if (index>=str.length() || !
isspace(str[index]))
return false;
684 if (index>=str.length() || !
isdigit(str[index]))
return false;
688 if (index!=pos)
return false;
691 if (index<str.length())
return false;
694 DBG(
"BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
695 index,str[index],
isIdChar(str[index]),
696 index>0?str[index]-1:0,
698 if (index>=str.length() ||
700 (index>0 &&
isIdChar(str[index-1])))
return false;
703 DBG(
"EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
704 index,pos,str[index],
isIdChar(str[index]),
705 index==0 ? 0 : str[index-1],
706 index==0 ? -1 :
isIdChar(str[index-1]));
707 if (index<str.length() &&
708 (
isIdChar(str[index]) || index==0 || !
isIdChar(str[index-1])))
return false;
711 DBG(
"BeginCapture(%zu) gid=%u\n",index,tok.
value());
715 DBG(
"EndCapture(%zu) gid=%u\n",index,tok.
value());
719 if (index>=str.length())
return false;
723 return processSequence(Star);
727 return processSequence(OptionalRange);
731 return processSequence(Optional);
739 match.setMatch(pos,index-pos);
745 std::string result=
"^";
746 result.reserve(pattern.length());
747 for (
size_t i=0;i<pattern.length();i++)
765 result+=
'\\'; result+=c;
768 if (i<pattern.length()-1 && pattern[i+1]==
'^')
803 if (
p->data.size()==0 ||
p->error)
return found;
804 match.init(str,
p->captureCount);
809 found =
p->matchAt(0,
p->data.size(),str,
match,pos,0);
815 size_t index = str.find(tok.
asciiValue(),pos);
816 if (index==std::string::npos)
818 DBG(
"Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",std::string(str).c_str(),pos,tok.
asciiValue());
821 DBG(
"pos=%zu str='%s' char='%c' index=%zu\n",index,std::string(str).c_str(),tok.
asciiValue(),index);
824 while (pos<str.length())
826 found =
p->matchAt(0,
p->data.size(),str,
match,pos,0);
831 DBG(
"Ex::match(str='%s',pos=%zu)=%d\n",std::string(str).c_str(),pos,found);
837 return !
p->pattern.empty() && !
p->error;
847bool search(std::string_view str,
const Ex &re,
size_t pos)
864std::string
replace(std::string_view str,
const Ex &re,std::string_view replacement)
871 size_t i=
match.position();
872 size_t l=
match.length();
873 if (i>p) result+=str.substr(p,i-p);
877 if (p<str.length()) result+=str.substr(p);
Private members of a regular expression.
size_t captureCount
Number of capture groups in the pattern (excluding the whole match).
bool error
Flag indicating that compiling the expression failed (true when an error was detected).
void compile()
Compiles a regular expression passed as a string into a stream of tokens that can be used for efficie...
std::string pattern
The pattern string as passed by the user.
Private(std::string_view pat)
Creates the private part.
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
std::vector< PToken > data
The token stream representing the compiled regular expression.
Class representing a regular expression.
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
@ RegEx
full regular expression.
Object representing the matching results.
void init(std::string_view str, size_t captureCount)
size_t size() const
Returns the number of sub matches available in this match.
size_t position() const
Returns the position of the match or std::string::npos if no position is set.
size_t length() const
Returns the length of the match or std::string::npos if no length is set.
Class representing a token in the compiled regular expression token stream.
uint16_t to() const
Returns the 'to' part of the character range.
char asciiValue() const
Returns the value for this token as an ASCII character.
PToken(Kind k)
Creates a token of the given kind k.
PToken(char c)
Creates a token for an ASCII character.
bool isRange() const
Returns true iff this token represents a range of characters.
Kind kind() const
Returns the kind of the token.
PToken()
Creates a token of kind 'End'.
uint16_t from() const
Returns the 'from' part of the character range.
const char * kindStr() const
Returns a string representation of the token's kind (useful for debugging).
void setValue(uint16_t value)
Sets the value for a token.
uint16_t value() const
Returns the value for this token.
PToken(uint16_t v)
Creates a token for a byte of a UTF-8 character.
bool isCharClass() const
Returns true iff this token is a positive or negative character class.
PToken(uint16_t from, uint16_t to)
Creates a token representing a character range running from character 'from' to character 'to' (both inclusive).
Namespace for the regular expression functions.
static bool isalpha(char c)
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
static std::string wildcard2regex(std::string_view pattern)
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searches a given input string for parts that match regular expression re and replaces those parts...
bool match(std::string_view str, Match &match, const Ex &re)
Tests a given string str for a match against regular expression re.
static bool isspace(char c)
static bool isalnum(char c)
static bool isdigit(char c)