Private members of a regular expression. More...

Public Member Functions
	Private (std::string_view pat)
	Creates the private part.

void	compile ()
	Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

bool	matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
	Internal matching routine.

Public Attributes
bool	error = false
	Flag indicating that compiling the expression failed (false when it compiled successfully).

std::vector< PToken >	data
	The token stream representing the compiled regular expression.

std::string	pattern
	The pattern string as passed by the user.

Detailed Description

Private members of a regular expression.

Definition at line 169 of file regex.cpp.

Constructor & Destructor Documentation

◆ Private()

reg::Ex::Private::Private ( std::string_view pat )

inline

Creates the private part.

Definition at line 173 of file regex.cpp.

                                : pattern(pat) // copy the caller's view into the owned std::string member
    {
      // Reserve room for 100 tokens up front (presumably so that compiling
      // a typical pattern does not need to grow the vector).
      data.reserve(100);
    }

References data and pattern.

Member Function Documentation

◆ compile()

void reg::Ex::Private::compile ( )

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

Definition at line 197 of file regex.cpp.

{
  // Translates `pattern` into the token stream `data`.
  //
  // On entry `error` is reset and `data` is cleared; an empty pattern yields
  // an empty token stream. When the pattern is malformed (a repetition
  // operator with nothing to repeat, a repetition applied to an anchor or to
  // another repetition, or an unterminated character class) `error` is set
  // to true and the function returns immediately, leaving `data` in a
  // partially filled state.
  error = false;
  data.clear();
  if (pattern.empty()) return;
  const char *start = pattern.c_str();
  const char *ps = start;      // read cursor into the NUL-terminated pattern
  char c = 0;

  int prevTokenPos=-1;         // index in `data` of the most recently started token (-1: none yet)
  int tokenPos=0;              // index in `data` where the next token will be stored

  // Appends a token to the stream and keeps tokenPos in sync.
  auto addToken = [&](PToken tok)
  {
    tokenPos++;
    data.emplace_back(tok);
  };

  // Decodes the character at `ps`, resolving backslash escapes.
  // On return `ps` points at the LAST character that was consumed, so the
  // caller still has to advance `ps` by one.
  auto getNextCharacter = [&]() -> PToken
  {
    char cs=*ps;
    PToken result = PToken(cs);
    if (cs=='\\') // escaped character
    {
      ps++;
      cs=*ps;
      switch (cs)
      {
        case 'n': result = PToken('\n');                      break;
        case 'r': result = PToken('\r');                      break;
        case 't': result = PToken('\t');                      break;
        case 's': result = PToken(PToken::Kind::WhiteSpace);  break;
        case 'a': result = PToken(PToken::Kind::Alpha);       break;
        case 'w': result = PToken(PToken::Kind::AlphaNum);    break;
        case 'd': result = PToken(PToken::Kind::Digit);       break;
        case '<': result = PToken(PToken::Kind::BeginOfWord); break;
        case '>': result = PToken(PToken::Kind::EndOfWord);   break;
        case 'x':
        case 'X':
          {
            uint16_t v=0;
            // look ahead one character at a time so ps never passes the terminator
            for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
            {
              int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
                      (cs>='A' && cs<='F') ? cs-'A'+10 :
                      (cs>='0' && cs<='9') ? cs-'0'    :
                      -1;
              if (d>=0) { v<<=4; v|=d; ps++; } else break;
            }
            result = PToken(v);
          }
          break;
        case '\0': ps--; break; // backslash at the end of the pattern
        default:
          result = PToken(cs);
          break;
      }
    }
    return result;
  };

  while ((c=*ps))
  {
    switch (c)
    {
      case '^': // beginning of line (if first character of the pattern)
        prevTokenPos = tokenPos;
        addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
                            PToken(c));
        break;
      case '$': // end of the line (if last character of the pattern)
        prevTokenPos = tokenPos;
        addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
                                PToken(c));
        break;
      case '.': // any character
        prevTokenPos = tokenPos;
        addToken(PToken(PToken::Kind::Any));
        break;
      case '(': // begin of capture group
        prevTokenPos = tokenPos;
        addToken(PToken(PToken::Kind::BeginCapture));
        break;
      case ')': // end of capture group
        prevTokenPos = tokenPos;
        addToken(PToken(PToken::Kind::EndCapture));
        break;
      case '[': // character class
        {
          prevTokenPos = tokenPos;
          ps++;
          if (*ps==0) { error=true; return; }
          bool esc = *ps=='\\';
          PToken tok = getNextCharacter();
          ps++;
          if (!esc && tok.kind()==PToken::Kind::Character &&
                      tok.asciiValue()=='^') // negated character class
          {
            addToken(PToken(PToken::Kind::NegCharClass));
            if (*ps==0) { error=true; return; }
            tok = getNextCharacter();
            ps++;
          }
          else
          {
            addToken(PToken(PToken::Kind::CharClass));
          }
          // Loop invariant: `tok` holds the class member that still has to be
          // emitted and `ps` points at the character following it.
          uint16_t numTokens=0;
          while ((c=*ps))
          {
            if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
            {
              getNextCharacter();
              ps++;
              PToken endTok = getNextCharacter();
              ps++;
              if (tok.value()>endTok.value())
              {
                addToken(PToken(endTok.value(),tok.value())); // swap start and end
              }
              else
              {
                addToken(PToken(tok.value(),endTok.value()));
              }
              numTokens++;
            }
            else // single char, from==to
            {
              if (tok.kind()==PToken::Kind::Character)
              {
                addToken(PToken(tok.value(),tok.value()));
              }
              else // special token, add as-is since from>to
              {
                addToken(tok);
              }
              numTokens++;
            }
            if (*ps==0) { error=true; return; } // expected at least a ]
            esc = *ps=='\\';
            tok = getNextCharacter();
            if (!esc && tok.kind()==PToken::Kind::Character &&
                        tok.value()==static_cast<uint16_t>(']'))
            {
              break; // end of character class
            }
            if (*ps==0) { error=true; return; } // no ] found
            ps++;
          }
          // The loop is only left with ps on the closing ']' when the class was
          // terminated properly. If ps is on the terminating NUL instead (e.g.
          // "[a" or "[ab"), the class was never closed: report an error rather
          // than letting the ps++ below step past the end of the pattern.
          if (*ps==0) { error=true; return; }
          // set the value of either NegCharClass or CharClass
          data[prevTokenPos].setValue(numTokens);
        }
        break;
      case '*': // 0 or more
      case '+': // 1 or more
      case '?': // optional: 0 or 1
        {
          if (prevTokenPos==-1) // nothing to repeat
          {
            error=true;
            return;
          }
          switch (data[prevTokenPos].kind())
          {
            case PToken::Kind::BeginOfLine:  // ^*  or  ^+ or  ^?
            case PToken::Kind::BeginOfWord:  // <* or <+ or <?
            case PToken::Kind::EndOfWord:    // >* or >+ or >?
            case PToken::Kind::Star:         // **  or  *+ or  *?
            case PToken::Kind::Optional:     // ?*  or  ?+ or  ??
              error=true;
              return;
            default: // ok
              break;
          }
          int ddiff = static_cast<int>(tokenPos-prevTokenPos);
          if (*ps=='+') // convert <pat>+ -> <pat><pat>*
          {
            // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
            //                          ddiff=n                                ^prevTokenPos
            data.resize(data.size()+ddiff);
            std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
            prevTokenPos+=ddiff;
            tokenPos+=ddiff;
          }
          if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
          {
            // find the beginning of the capture range
            while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
            {
              prevTokenPos--;
            }
          }
          data.insert(data.begin()+prevTokenPos,
                      c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
          tokenPos++;
          addToken(PToken(PToken::Kind::End));
          // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
          //                            ^prevTokenPos
          // same for 'T?'.
        }
        break;
      default: // literal or escaped character
        prevTokenPos = tokenPos;
        addToken(getNextCharacter());
        break;
    }
    ps++;
  }
  //addToken(PToken(PToken::Kind::End));
}

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind(), reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value(), and reg::PToken::WhiteSpace.

◆ matchAt()

bool reg::Ex::Private::matchAt	(	size_t	tokenPos,
		size_t	tokenLen,
		std::string_view	str,
		Match &	match,
		size_t	pos,
		int	level ) const

Internal matching routine.

Parameters

tokenPos	Offset into the token stream.
tokenLen	End offset (exclusive) into the token stream; matching stops once tokenPos reaches it.
str	The input string to match against.
match	The object used to store the matching results.
pos	The position in the input string at which to start matching.
level	Recursion level (used for debugging)

Definition at line 448 of file regex.cpp.

{
  // Tries to match the tokens data[tokenPos..tokenLen) against `str`, starting
  // at input position `pos`. On success the matched range is stored in `match`
  // via setMatch() (captures are recorded via startCapture()/endCapture()) and
  // true is returned; otherwise false is returned.
  // Star/Optional tokens are handled by processSequence(), which consumes as
  // much input as possible and then backs off one character at a time while
  // recursively matching the remainder of the token stream.
  DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? std::string(str.substr(pos)).c_str() : "",pos);
  auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
  auto isIdChar      = [](char c) { return isalnum(c) || c=='_'; };

  // Returns true when character c is accepted by the (possibly negated)
  // character class whose header token is located at data[tp]. The header's
  // value is the number of member tokens that directly follow it.
  auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
  {
    PToken tok = data[tp];
    bool negate = tok.kind()==PToken::Kind::NegCharClass;
    uint16_t numFields = tok.value();
    bool found = false;
    for (uint16_t i=0;i<numFields;i++)
    {
      tok = data[++tp];
      // first check for built-in ranges
      if ((tok.kind()==PToken::Kind::Alpha      && isStartIdChar(c)) ||
          (tok.kind()==PToken::Kind::AlphaNum   && isIdChar(c))      ||
          (tok.kind()==PToken::Kind::WhiteSpace && isspace(c))  ||
          (tok.kind()==PToken::Kind::Digit      && isdigit(c))
         )
      {
        found=true;
        break;
      }
      else // user specified range
      {
        uint16_t v = static_cast<uint16_t>(c);
        if (tok.from()<=v && v<=tok.to())
        {
          found=true;
          break;
        }
      }
    }
    DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
    return negate ? !found : found;
  };
  size_t index = pos; // current read position in str
  enum SequenceType { Star, Optional, OptionalRange };

  // Handles the repetition token at data[tokenPos] (x*, x? or (...)?).
  // First consumes input greedily for the repeated element, then tries to
  // match the rest of the token stream, giving back one character at a time
  // until the rest matches or all consumed input has been returned.
  auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
                          &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
  {
    size_t startIndex = index;
    size_t len = str.length();
    PToken tok = data[++tokenPos]; // the element being repeated
    if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
    {
      char c_tok = tok.asciiValue();
      while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
      tokenPos++;
    }
    else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
    {
      while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
      tokenPos+=tok.value()+1; // skip over character ranges + end token
    }
    else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
    {
      while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
      tokenPos++;
    }
    else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
    {
      while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
      tokenPos++;
    }
    else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
    {
      while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
      tokenPos++;
    }
    else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
    {
      while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
      tokenPos++;
    }
    else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
    {
      if (type==Optional)
      {
        // '.?' eats at most one character; at the end of the input there is
        // nothing to eat, so index must not be moved past str.length()
        // (otherwise the reported match would extend beyond the input).
        if (index<len) index++;
      }
      else
      {
        index = len;
      }
      tokenPos++;
    }
    else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
    {
      // '(...)?': match the tokens between the capture markers as a sub-expression
      size_t tokenStart = ++tokenPos;
      while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
      Match rangeMatch;
      rangeMatch.init(str);
      bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
      if (found)
      {
        index+=rangeMatch.length(); // (abc)? matches -> eat all
      }
      tokenPos++; // skip over EndCapture
    }
    tokenPos++; // skip over end marker
    while (index>=startIndex)
    {
      // pattern 'x*xy' should match 'xy' and 'xxxxy'
      bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
      if (found)
      {
        // the recursive call reported the tail starting at index; extend it
        // to cover everything consumed since pos
        match.setMatch(pos,index-pos+match.length());
        return true;
      }
      if (index==0) break; // avoid wrapping the unsigned index below zero
      index--;
    }
    return false;
  };

  while (tokenPos<tokenLen)
  {
    PToken tok = data[tokenPos];
    DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
    if (tok.kind()==PToken::Kind::Character) // match literal character
    {
      char c_tok = tok.asciiValue();
      if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
      index++,tokenPos++;
    }
    else if (tok.isCharClass())
    {
      if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
      index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
    }
    else
    {
      switch (tok.kind())
      {
        case PToken::Kind::Alpha:
          if (index>=str.length() || !isStartIdChar(str[index])) return false;
          index++;
          break;
        case PToken::Kind::AlphaNum:
          if (index>=str.length() || !isIdChar(str[index])) return false;
          index++;
          break;
        case PToken::Kind::WhiteSpace:
          if (index>=str.length() || !isspace(str[index])) return false;
          index++;
          break;
        case PToken::Kind::Digit:
          if (index>=str.length() || !isdigit(str[index])) return false;
          index++;
          break;
        case PToken::Kind::BeginOfLine: // zero-width: only matches at the start position
          if (index!=pos) return false;
          break;
        case PToken::Kind::EndOfLine:   // zero-width: only matches at the end of the input
          if (index<str.length()) return false;
          break;
        case PToken::Kind::BeginOfWord: // zero-width: id char here, no id char before
          DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
              index,str[index],isIdChar(str[index]),
              index>0?str[index-1]:0,
              index>0?isIdChar(str[index-1]):-1);
          if (index>=str.length() ||
              !isIdChar(str[index]) ||
              (index>0 && isIdChar(str[index-1]))) return false;
          break;
        case PToken::Kind::EndOfWord:   // zero-width: no id char here, id char before
          DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d  prev.isIsChar(%c)=%d\n",
              index,pos,str[index],isIdChar(str[index]),
              index==0 ? 0 : str[index-1],
              index==0 ? -1 : isIdChar(str[index-1]));
          if (index<str.length() &&
              (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
          break;
        case PToken::Kind::BeginCapture:
          DBG("BeginCapture(%zu)\n",index);
          match.startCapture(index);
          break;
        case PToken::Kind::EndCapture:
          DBG("EndCapture(%zu)\n",index);
          match.endCapture(index);
          break;
        case PToken::Kind::Any:
          if (index>=str.length()) return false;
          index++;
          break;
        case PToken::Kind::Star:
          // processSequence matches the remainder of the stream itself
          return processSequence(Star);
        case PToken::Kind::Optional:
          if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
          {
            return processSequence(OptionalRange); // (...)?
          }
          else
          {
            return processSequence(Optional); // x?
          }
        default:
          return false;
      }
      tokenPos++;
    }
  }
  // all tokens consumed: the match spans [pos,index)
  match.setMatch(pos,index-pos);
  return true;
}

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, data, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, found, reg::PToken::from(), reg::Match::init(), reg::isalnum(), reg::isalpha(), reg::PToken::isCharClass(), reg::isdigit(), isIdChar, reg::isspace(), reg::PToken::kind(), reg::PToken::kindStr(), reg::Match::length(), reg::Ex::match(), matchAt(), reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to(), reg::PToken::value(), and reg::PToken::WhiteSpace.

Referenced by matchAt().

Member Data Documentation

◆ data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

Definition at line 188 of file regex.cpp.

Referenced by compile(), matchAt(), and Private().

◆ error

bool reg::Ex::Private::error = false

Flag indicating that compiling the expression failed (false when it compiled successfully).

Definition at line 185 of file regex.cpp.

Referenced by compile().

◆ pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user.

Definition at line 191 of file regex.cpp.

Referenced by compile() and Private().

The documentation for this class was generated from the following file:

src/regex.cpp

Public Member Functions

Public Attributes