25#define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0)
27#define DBG(fmt,...) do {} while(0)
35 return c==
' ' || c==
'\t' || c==
'\n' || c==
'\r';
40 return static_cast<unsigned char>(c)>=128 || (c>=
'a' && c<=
'z') || (c>=
'A' && c<=
'Z');
45 return c>=
'0' && c<=
'9';
131 static_cast<uint32_t>(c)) {}
135 static_cast<uint32_t>(v)) {}
150 uint16_t
to()
const {
return m_rep & 0xFFFF; }
181 bool matchAt(
size_t tokenPos,
size_t tokenLen,std::string_view str,
202 const char *start =
pattern.c_str();
203 const char *ps = start;
209 auto addToken = [&](
PToken tok)
212 data.emplace_back(tok);
215 auto getNextCharacter = [&]() ->
PToken
225 case 'n': result =
PToken(
'\n');
break;
226 case 'r': result =
PToken(
'\r');
break;
227 case 't': result =
PToken(
'\t');
break;
238 for (
int i=0;i<2 && (cs=(*(ps+1)));i++)
240 int d = (cs>=
'a' && cs<=
'f') ? cs-
'a'+10 :
241 (cs>=
'A' && cs<=
'F') ? cs-
'A'+10 :
242 (cs>=
'0' && cs<=
'9') ? cs-
'0' :
244 if (d>=0) { v<<=4; v|=d; ps++; }
else break;
249 case '\0': ps--;
break;
263 prevTokenPos = tokenPos;
268 prevTokenPos = tokenPos;
273 prevTokenPos = tokenPos;
277 prevTokenPos = tokenPos;
281 prevTokenPos = tokenPos;
286 prevTokenPos = tokenPos;
288 if (*ps==0) {
error=
true;
return; }
289 bool esc = *ps==
'\\';
290 PToken tok = getNextCharacter();
296 if (*ps==0) {
error=
true;
return; }
297 tok = getNextCharacter();
304 uint16_t numTokens=0;
307 if (c==
'-' && *(ps+1)!=
']' && *(ps+1)!=0)
311 PToken endTok = getNextCharacter();
335 if (*ps==0) {
error=
true;
return; }
337 tok = getNextCharacter();
339 tok.
value()==
static_cast<uint16_t
>(
']'))
343 if (*ps==0) {
error=
true;
return; }
347 data[prevTokenPos].setValue(numTokens);
354 if (prevTokenPos==-1)
359 switch (
data[prevTokenPos].kind())
371 int ddiff =
static_cast<int>(tokenPos-prevTokenPos);
377 std::copy_n(
data.begin()+prevTokenPos,ddiff,
data.begin()+tokenPos);
389 data.insert(
data.begin()+prevTokenPos,
399 prevTokenPos = tokenPos;
400 addToken(getNextCharacter());
410void Ex::Private::dump()
412 size_t l = data.size();
414 DBG(
"==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
417 DBG(
"[%s:%04x]\n",data[i].kindStr(),data[i].value());
420 uint16_t num = data[i].value();
424 if (data[i].isRange())
426 DBG(
"[%04x(%c)-%04x(%c)]\n",data[i].from(),data[i].from(),data[i].to(),data[i].to());
430 DBG(
"[%s:%04x]\n",data[i].kindStr(),data[i].value());
450 DBG(
"%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() :
"",pos);
451 auto isStartIdChar = [](
char c) {
return isalpha(c) || c==
'_'; };
453 auto matchCharClass = [
this,isStartIdChar,
isIdChar](
size_t tp,
char c) ->
bool
457 uint16_t numFields = tok.
value();
459 for (uint16_t i=0;i<numFields;i++)
474 uint16_t v =
static_cast<uint16_t
>(c);
475 if (tok.
from()<=v && v<=tok.
to())
482 DBG(
"matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!
found:
found);
486 enum SequenceType { Star, Optional, OptionalRange };
487 auto processSequence = [
this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
488 &isStartIdChar,&
isIdChar,&
match,&level,&pos](SequenceType type) ->
bool
490 size_t startIndex = index;
491 size_t len = str.length();
496 while (index<len && str[index]==c_tok) { index++;
if (type==Optional)
break; }
501 while (index<len && matchCharClass(tokenPos,str[index])) { index++;
if (type==Optional)
break; }
502 tokenPos+=tok.
value()+1;
506 while (index<len && isStartIdChar(str[index])) { index++;
if (type==Optional)
break; }
511 while (index<len &&
isIdChar(str[index])) { index++;
if (type==Optional)
break; }
516 while (index<len &&
isspace(str[index])) { index++;
if (type==Optional)
break; }
521 while (index<len &&
isdigit(str[index])) { index++;
if (type==Optional)
break; }
526 if (type==Optional) index++;
else index = str.length();
531 size_t tokenStart = ++tokenPos;
534 rangeMatch.
init(str);
535 bool found =
matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
538 index+=rangeMatch.
length();
543 while (index>=startIndex)
558 while (tokenPos<tokenLen)
561 DBG(
"loop tokenPos=%zu token=%s\n",tokenPos,tok.
kindStr());
565 if (index>=str.length() || str[index]!=c_tok)
return false;
570 if (index>=str.length() || !matchCharClass(tokenPos,str[index]))
return false;
571 index++,tokenPos+=tok.
value()+1;
578 if (index>=str.length() || !isStartIdChar(str[index]))
return false;
582 if (index>=str.length() || !
isIdChar(str[index]))
return false;
586 if (index>=str.length() || !
isspace(str[index]))
return false;
590 if (index>=str.length() || !
isdigit(str[index]))
return false;
594 if (index!=pos)
return false;
597 if (index<str.length())
return false;
600 DBG(
"BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
601 index,str[index],
isIdChar(str[index]),
602 index>0?str[index]-1:0,
604 if (index>=str.length() ||
606 (index>0 &&
isIdChar(str[index-1])))
return false;
609 DBG(
"EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
610 index,pos,str[index],
isIdChar(str[index]),
611 index==0 ? 0 : str[index-1],
612 index==0 ? -1 :
isIdChar(str[index-1]));
613 if (index<str.length() &&
614 (
isIdChar(str[index]) || index==0 || !
isIdChar(str[index-1])))
return false;
617 DBG(
"BeginCapture(%zu)\n",index);
618 match.startCapture(index);
621 DBG(
"EndCapture(%zu)\n",index);
622 match.endCapture(index);
625 if (index>=str.length())
return false;
629 return processSequence(Star);
633 return processSequence(OptionalRange);
637 return processSequence(Optional);
645 match.setMatch(pos,index-pos);
651 std::string result=
"^";
652 result.reserve(pattern.length());
653 for (
size_t i=0;i<pattern.length();i++)
671 result+=
'\\'; result+=c;
674 if (i<pattern.length()-1 && pattern[i+1]==
'^')
709 if (
p->data.size()==0 ||
p->error)
return found;
721 size_t index = str.find(tok.
asciiValue(),pos);
722 if (index==std::string::npos)
724 DBG(
"Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.
asciiValue());
727 DBG(
"pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.
asciiValue(),index);
730 while (pos<str.length())
737 DBG(
"Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,
found);
743 return !
p->pattern.empty() && !
p->error;
753bool search(std::string_view str,
const Ex &re,
size_t pos)
770std::string
replace(std::string_view str,
const Ex &re,std::string_view replacement)
777 size_t i=
match.position();
778 size_t l=
match.length();
779 if (i>p) result+=str.substr(p,i-p);
783 if (p<str.length()) result+=str.substr(p);
Private members of a regular expression.
bool error
Flag indicating that the expression could not be compiled (set to true when a parse error is encountered).
void compile()
Compiles a regular expression passed as a string into a stream of tokens that can be used for efficie...
std::string pattern
The pattern string as passed by the user.
Private(std::string_view pat)
Creates the private part.
bool matchAt(size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
Internal matching routine.
std::vector< PToken > data
The token stream representing the compiled regular expression.
Class representing a regular expression.
~Ex()
Destroys the regular expression object.
std::unique_ptr< Private > p
bool match(std::string_view str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Ex(std::string_view pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
@ RegEx
full regular expression.
Object representing the matching results.
void init(std::string_view str)
size_t length() const
Returns the length of the match or std::string::npos if no length is set.
Class representing a token in the compiled regular expression token stream.
uint16_t to() const
Returns the 'to' part of the character range.
char asciiValue() const
Returns the value for this token as an ASCII character.
PToken(Kind k)
Creates a token of the given kind k.
PToken(char c)
Create a token for an ASCII character.
bool isRange() const
Returns true iff this token represents a range of characters.
Kind kind() const
Returns the kind of the token.
PToken()
Creates a token of kind 'End'.
uint16_t from() const
Returns the 'from' part of the character range.
const char * kindStr() const
Returns a string representation of the token's kind (useful for debugging).
void setValue(uint16_t value)
Sets the value for a token.
uint16_t value() const
Returns the value for this token.
PToken(uint16_t v)
Create a token for a byte of a UTF-8 character.
bool isCharClass() const
Returns true iff this token is a positive or negative character class.
PToken(uint16_t from, uint16_t to)
Create a token representing a range from one character 'from' to another character 'to'.
Namespace for the regular expression functions.
static bool isalpha(char c)
bool search(std::string_view str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
static std::string wildcard2regex(std::string_view pattern)
std::string replace(std::string_view str, const Ex &re, std::string_view replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
bool match(std::string_view str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
static bool isspace(char c)
static bool isalnum(char c)
static bool isdigit(char c)