Lexer.hpp
A general-purpose, fast lexer.
Build a lexer that can convert input strings or streams into a series of provided tokens.
Use AddToken(name, regex) to list out the relevant tokens. 'name' is the unique name for this token type; 'regex' is the regular expression that describes it. AddToken() returns a unique ID associated with this token type.
IgnoreToken(name, regex) uses the same arguments, but is used for tokens that should be skipped over.
Names and IDs can be recovered later using GetTokenID(name) and GetTokenName(id).
Tokens can be retrieved one at a time with Process(string) or Process(stream); each call returns the next (non-ignored) token and removes it from the input.
Alternatively, an entire series of tokens can be processed with Tokenize().
Finally, GetLexeme() can be used to retrieve the lexeme from the most recent token found.
Note
Status: BETA
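A minimal usage sketch based on the calls documented below (the token names, regexes, and input are illustrative, and any enclosing namespace is omitted):

  #include <iostream>
  #include <string>
  #include "Lexer.hpp"

  int main() {
    Lexer lexer;
    lexer.AddToken("Integer", "[0-9]+");                     // Whole numbers.
    lexer.AddToken("Identifier", "[a-zA-Z_][a-zA-Z0-9_]*");  // Names.
    lexer.IgnoreToken("Whitespace", "[ \t\n]+");             // Skipped entirely.

    std::string input = "x1 42 hello";
    while (input.size()) {                // Each Process() call consumes one lexeme.
      Token token = lexer.Process(input);
      std::cout << lexer.GetTokenName(token) << ": " << lexer.GetLexeme() << "\n";
    }
  }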
-
struct TokenInfo
- #include <Lexer.hpp>
Information about an individual TYPE of token to be processed within a Lexer.
Public Functions
-
inline TokenInfo()
-
struct Token
- #include <Lexer.hpp>
Information about a token instance from an input stream.
Public Functions
-
inline operator int() const
Token will automatically convert to its ID if used as an int.
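This makes it easy to compare a token against the IDs handed back by AddToken(). A sketch, reusing a lexer and input set up elsewhere:

  int int_id = lexer.AddToken("Integer", "[0-9]+");
  // ... later, while processing:
  Token token = lexer.Process(input);   // Token converts to int implicitly.
  if (token == int_id) {
    // An integer was matched; its text is available via lexer.GetLexeme().
  }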
class TokenStream
- #include <Lexer.hpp>
Public Functions
-
TokenStream(const TokenStream&) = default
-
TokenStream(TokenStream&&) = default
-
TokenStream &operator=(const TokenStream&) = default
-
TokenStream &operator=(TokenStream&&) = default
-
inline size_t size() const
-
class Iterator
- #include <Lexer.hpp>
Public Functions
-
inline Iterator(const TokenStream &in_ts, size_t in_pos)
-
inline const TokenStream &GetTokenStream() const
-
inline size_t GetIndex() const
-
inline bool IsValid() const
-
inline bool AtEnd() const
-
inline operator bool() const
-
class Lexer
- #include <Lexer.hpp>
A lexer with a set of token types (and associated regular expressions)
Public Functions
-
Lexer() = default
-
~Lexer() = default
-
inline size_t GetNumTokens() const
How many types of tokens can be identified in this Lexer?
-
inline void Reset()
-
inline bool TokenOK(int id) const
-
inline int AddToken(const std::string &name, const std::string &regex, bool save_lexeme = true, bool save_token = true, const std::string &desc = "")
Add a new token, specified by a name and the regex used to identify it. Note that token IDs count down, so tokens added earlier receive higher IDs, and higher IDs have priority.
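For instance, registering keywords before the general identifier pattern gives them the higher IDs; a sketch, assuming ties between equal-length matches go to the higher-priority token:

  Lexer lexer;
  int keyword_id = lexer.AddToken("Keyword", "if|else|while");
  int ident_id   = lexer.AddToken("Identifier", "[a-zA-Z_][a-zA-Z0-9_]*");
  // keyword_id > ident_id, so "while" (matched at equal length by both
  // regexes) should lex as Keyword rather than Identifier.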
-
inline int IgnoreToken(const std::string &name, const std::string &regex, const std::string &desc = "")
Add a token to ignore, specified by a name and the regex used to identify it.
-
inline int GetTokenID(const std::string &name) const
Get the ID associated with a token type (you provide the token name)
-
inline const TokenInfo &GetTokenInfo(int id) const
Get the full information about a token (you provide the id)
-
inline std::string GetTokenName(int id) const
Get the name associated with a token type (you provide the ID)
-
inline bool GetSaveToken(int id) const
Identify if a token should be saved.
-
inline void Generate() const
Create the NFA that will identify the current set of tokens in a sequence.
-
inline Token Process(std::istream &is) const
Get the next token found in an input stream. Do so by examining one character at a time. Keep going as long as there is a chance of a valid lexeme (since we want to choose the longest one we can find). Every time we do hit a valid lexeme, store it as the current “best” and keep going. Once we hit a point where no other valid lexemes are possible, stop and return the best we’ve found so far.
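A sketch of stream-based processing; the file name is hypothetical, the lexer is assumed to be configured as in the earlier examples, and handling of unmatched input is omitted:

  #include <fstream>

  std::ifstream is("input.txt");     // Hypothetical source file.
  Token token = lexer.Process(is);   // Longest lexeme at the head of the stream.
  std::cout << lexer.GetTokenName(token) << ": " << lexer.GetLexeme() << "\n";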
-
inline Token Process(std::string &in_str) const
Shortcut to process a string rather than a stream, chopping off one token each time.
-
inline TokenStream Tokenize(std::istream &is, const std::string &name = "in_stream") const
Turn an input stream of text into a vector of tokens.
-
inline TokenStream Tokenize(std::string_view str, const std::string &name = "in_view") const
Turn an input string into a vector of tokens.
-
inline TokenStream Tokenize(const vector<std::string> &str_v, const std::string &name = "in_string vector") const
Turn a vector of strings into a vector of tokens.
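A sketch of whole-input tokenization via the string overload; the regexes and the stream name "example" are illustrative:

  Lexer lexer;
  lexer.AddToken("Number", "[0-9]+");
  lexer.IgnoreToken("Whitespace", "[ \t\n]+");

  TokenStream tokens = lexer.Tokenize("1 22 333", "example");
  std::cout << tokens.size() << " tokens found.\n";
  // Expect 3, assuming ignored tokens are excluded from the stream.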
-
inline const std::string &GetLexeme() const
Get the lexeme associated with the last token identified.
-
inline void Print(std::ostream &os = std::cout) const
Print the full information about this lexer (for debugging)
-
inline void DebugString(std::string test_string) const
Try out the lexer on a string and demonstrate how it’s tokenized.
-
inline void DebugToken(int token_id) const
Private Members
-
mutable bool generate_lexer = false
Do we need to regenerate the lexer?