RegEx.hpp

Basic regular expression handler.

A fully (well, mostly) functional regular expression processor.

Special chars: ‘|’ - or; ‘*’ - zero or more of previous; ‘+’ - one or more of previous; ‘?’ - previous is optional; ‘.’ - match any character except newline (‘\n’).

Plus the following, which group contents (and may change translation rules): ‘(’ and ‘)’ - group contents; ‘"’ - ignore special characters in contents (quotes still need to be escaped); ‘[’ and ‘]’ - character set

— choose ONE character; ‘^’ as first char negates contents; ‘-’ indicates a range UNLESS it is the first or last char.

Additional overloads for functions in lexer_utils.h:

static NFA to_NFA(const RegEx & regex, size_t stop_id=1); static DFA to_DFA(const RegEx & regex);

Todo:

Implement ^ and $ (beginning and end of line)

Implement {n}, {n,} and {n,m} (exactly n, at least n, and n-m copies, respectively)

Implement \d (for digits), \s (for whitespace), etc.

Consider a separator (maybe backtick?) to divide up a regex expression; the result can be returned by each section as a vector of strings.

Note

Status: BETA

Functions

static NFA to_NFA(const RegEx &regex, size_t stop_id = 1)

Simple conversion of RegEx to NFA (mostly implemented in RegEx)

static DFA to_DFA(const RegEx &regex)

Conversion of RegEx to DFA, via NFA intermediate.

class RegEx
#include <RegEx.hpp>

A basic regular expression handler.

Public Functions

RegEx() = delete
inline RegEx(const std::string &r)
inline RegEx(const RegEx &r)
inline ~RegEx()
inline RegEx &operator=(const RegEx &r)

Set this RegEx equal to another.

inline std::string AsString() const

Convert the RegEx to a standard string, readable from outside this class.

inline void AddToNFA(NFA &nfa, size_t start, size_t stop) const

Add this regex to an NFA being built.

void Generate() const

Assume the RegEx is ready and set up processing for it.

inline bool Test(const std::string &str) const

Test if a string satisfies this regex.

inline void PrintInternal() const

For debugging: print the internal representation of the regex.

inline void PrintNotes() const

For debugging: print any internal notes generated about this regex.

inline void PrintDebug() const

Print general debugging information about this regex.

Private Types

using opts_t = BitSet<NUM_SYMBOLS>

Private Functions

template<typename ...T>
inline void Error(T&&... args)
inline bool EnsureNext(char x)

Make sure that there is another element in the RegEx (e.g., after a ‘|’) or else trigger an error to report the problem.

inline Ptr<re_charset> ConstructSet()

Construct a character range.

inline Ptr<re_string> ConstructString()

Construct a string, loading everything needed.

inline Ptr<re_base> ConstructSegment()

Should only be called when we know we have a single unit to produce. Build and return it.

inline Ptr<re_block> Process(Ptr<re_block> cur_block = nullptr)

Process the input regex into a tree representation.

Private Members

std::string regex

Original string to define this RegEx.

vector<std::string> notes

Any warnings or errors would be provided here.

bool valid = true

Set to false if regex cannot be processed.

size_t pos = 0

Position being read in regex.

mutable DFA dfa

DFA that this RegEx translates to.

mutable bool dfa_ready = false

Is the dfa ready? (or does it need to be generated?)

re_block head

Private Static Attributes

static constexpr size_t NUM_SYMBOLS = 128

Maximum number of symbols the RegEx can handle.

struct re_base

Internal base representation of a portion of a regex.

Public Functions

inline virtual ~re_base()
inline virtual void Print(std::ostream &os) const
inline virtual Ptr<re_block> AsBlock()
inline virtual Ptr<re_charset> AsCharSet()
inline virtual Ptr<re_parent> AsParent()
inline virtual Ptr<re_string> AsString()
inline virtual size_t GetSize() const
inline virtual bool Simplify()
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const
struct re_block : public RegEx::re_parent

Representation of a series of components…

Public Functions

inline void Print(std::ostream &os) const override
inline Ptr<re_block> AsBlock() override
inline bool Simplify() override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override
struct re_charset : public RegEx::re_base

Representation of a character set e.g., [abc].

Public Functions

inline re_charset()
inline re_charset(char x, bool neg = false)
inline re_charset(const std::string &s, bool neg = false)
inline void Print(std::ostream &os) const override
inline Ptr<re_charset> AsCharSet() override
inline size_t GetSize() const override
inline char First() const
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override

Public Members

opts_t char_set
struct re_or : public RegEx::re_parent

Representation of two options in a regex, e.g., a|b.

Public Functions

inline re_or(Ptr<re_base> l, Ptr<re_base> r)
inline void Print(std::ostream &os) const override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override
struct re_parent : public RegEx::re_base

Intermediate base class for RegEx components that have children (such as “and” and “or”)

Public Functions

inline re_parent()
inline ~re_parent()
inline void Clear()
inline virtual void push(Ptr<re_base> x)
inline Ptr<re_base> pop()
inline size_t GetSize() const override
inline Ptr<re_parent> AsParent() override
inline bool Simplify() override

Protected Attributes

vector<Ptr<re_base>> nodes
struct re_plus : public RegEx::re_parent

Representation of one-or-more instances of a component, e.g., a+.

Public Functions

inline re_plus(Ptr<re_base> c)
inline void Print(std::ostream &os) const override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override
struct re_qm : public RegEx::re_parent

Representation of zero-or-one instances of a component, e.g., a?.

Public Functions

inline re_qm(Ptr<re_base> c)
inline void Print(std::ostream &os) const override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override
struct re_star : public RegEx::re_parent

Representation of zero-or-more instances of a component, e.g., a*.

Public Functions

inline re_star(Ptr<re_base> c)
inline void Print(std::ostream &os) const override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override
struct re_string : public RegEx::re_base

Representation of strings stored in a RegEx.

Public Functions

inline re_string()
inline re_string(char c)
inline re_string(const std::string &s)
inline void Print(std::ostream &os) const override
inline Ptr<re_string> AsString() override
inline size_t GetSize() const override
inline virtual void AddToNFA(NFA &nfa, size_t start, size_t stop) const override

Public Members

std::string str