Lexer.hpp
A general-purpose, fast lexer.
Build a lexer that can convert input strings or streams into a series of provided tokens.
Use AddToken(name, regex) to list out the relevant tokens. 'name' is the unique name for this token type; 'regex' is the regular expression that describes it. AddToken() returns a unique ID associated with this token type.
IgnoreToken(name, regex) uses the same arguments, but is used for tokens that should be skipped over.
Names and IDs can be recovered later using GetTokenID(name) and GetTokenName(id).
Tokens can be retrieved one at a time with Process(string) or Process(stream); each call returns the next (non-ignored) token and removes it from the input.
Alternatively, an entire series of tokens can be processed with Tokenize().
Finally, GetLexeme() can be used to retrieve the lexeme from the most recent token found.
Note
Status: BETA
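A minimal usage sketch based on the calls documented below (the token names, regexes, and input are illustrative, and any enclosing namespace is omitted):

  #include <iostream>
  #include <string>
  #include "Lexer.hpp"

  int main() {
    Lexer lexer;
    lexer.AddToken("Integer", "[0-9]+");                     // Whole numbers.
    lexer.AddToken("Identifier", "[a-zA-Z_][a-zA-Z0-9_]*");  // Names.
    lexer.IgnoreToken("Whitespace", "[ \t\n]+");             // Skipped entirely.

    std::string input = "x1 42 hello";
    while (input.size()) {                // Each Process() call consumes one lexeme.
      Token token = lexer.Process(input);
      std::cout << lexer.GetTokenName(token) << ": " << lexer.GetLexeme() << "\n";
    }
  }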
-
struct TokenInfo
- #include <Lexer.hpp>
Information about an individual TYPE of token to be processed within a Lexer.
Public Functions
-
inline TokenInfo()
-
struct Token
- #include <Lexer.hpp>
Information about a token instance from an input stream.
Public Functions
-
inline operator int() const
Token will automatically convert to its ID if used as an int.
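This makes it easy to compare a token against the IDs handed back by AddToken(). A sketch, reusing a lexer and input set up elsewhere:

  int int_id = lexer.AddToken("Integer", "[0-9]+");
  // ... later, while processing:
  Token token = lexer.Process(input);   // Token converts to int implicitly.
  if (token == int_id) {
    // An integer was matched; its text is available via lexer.GetLexeme().
  }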
class TokenStream
- #include <Lexer.hpp>
Public Functions
-
TokenStream(const TokenStream&) = default
-
TokenStream(TokenStream&&) = default
-
TokenStream &operator=(const TokenStream&) = default
-
TokenStream &operator=(TokenStream&&) = default
-
inline size_t size() const
-
class Iterator
- #include <Lexer.hpp>
Public Functions
-
inline Iterator(const TokenStream &in_ts, size_t in_pos)
-
inline const TokenStream &GetTokenStream() const
-
inline size_t GetIndex() const
-
inline bool IsValid() const
-
inline bool AtEnd() const
-
inline operator bool() const
-
class Lexer
- #include <Lexer.hpp>
A lexer with a set of token types (and associated regular expressions)
Public Functions
-
Lexer() = default
-
~Lexer() = default
-
inline size_t GetNumTokens() const
How many types of tokens can be identified in this Lexer?
-
inline void Reset()
-
inline bool TokenOK(int id) const
-
inline int AddToken(const std::string &name, const std::string &regex, bool save_lexeme = true, bool save_token = true, const std::string &desc = "")
Add a new token, specified by a name and the regex used to identify it. Note that token IDs count down, so tokens added earlier receive higher IDs, and higher IDs have priority.
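For instance, registering keywords before the general identifier pattern gives them the higher IDs; a sketch, assuming ties between equal-length matches go to the higher-priority token:

  Lexer lexer;
  int keyword_id = lexer.AddToken("Keyword", "if|else|while");
  int ident_id   = lexer.AddToken("Identifier", "[a-zA-Z_][a-zA-Z0-9_]*");
  // keyword_id > ident_id, so "while" (matched at equal length by both
  // regexes) should lex as Keyword rather than Identifier.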
-
inline int IgnoreToken(const std::string &name, const std::string &regex, const std::string &desc = "")
Add a token to ignore, specified by a name and the regex used to identify it.
-
inline int GetTokenID(const std::string &name) const
Get the ID associated with a token type (you provide the token name)
-
inline const TokenInfo &GetTokenInfo(int id) const
Get the full information about a token (you provide the id)
-
inline std::string GetTokenName(int id) const
Get the name associated with a token type (you provide the ID)
-
inline bool GetSaveToken(int id) const
Identify if a token should be saved.
-
inline void Generate() const
Create the NFA that will identify the current set of tokens in a sequence.
-
inline Token Process(std::istream &is) const
Get the next token found in an input stream. Do so by examining one character at a time. Keep going as long as there is a chance of a valid lexeme (since we want to choose the longest one we can find). Every time we do hit a valid lexeme, store it as the current “best” and keep going. Once we hit a point where no other valid lexemes are possible, stop and return the best we’ve found so far.
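A sketch of stream-based processing; the file name is hypothetical, the lexer is assumed to be configured as in the earlier examples, and handling of unmatched input is omitted:

  #include <fstream>

  std::ifstream is("input.txt");     // Hypothetical source file.
  Token token = lexer.Process(is);   // Longest lexeme at the head of the stream.
  std::cout << lexer.GetTokenName(token) << ": " << lexer.GetLexeme() << "\n";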
-
inline Token Process(std::string &in_str) const
Shortcut to process a string rather than a stream, chopping off one token each time.
-
inline TokenStream Tokenize(std::istream &is, const std::string &name = "in_stream") const
Turn an input stream of text into a vector of tokens.
-
inline TokenStream Tokenize(std::string_view str, const std::string &name = "in_view") const
Turn an input string into a vector of tokens.
-
inline TokenStream Tokenize(const vector<std::string> &str_v, const std::string &name = "in_string vector") const
Turn a vector of strings into a vector of tokens.
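A sketch of whole-input tokenization via the string overload; the regexes and the stream name "example" are illustrative:

  Lexer lexer;
  lexer.AddToken("Number", "[0-9]+");
  lexer.IgnoreToken("Whitespace", "[ \t\n]+");

  TokenStream tokens = lexer.Tokenize("1 22 333", "example");
  std::cout << tokens.size() << " tokens found.\n";
  // Expect 3, assuming ignored tokens are excluded from the stream.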
-
inline const std::string &GetLexeme() const
Get the lexeme associated with the last token identified.
-
inline void Print(std::ostream &os = std::cout) const
Print the full information about this lexer (for debugging)
-
inline void DebugString(std::string test_string) const
Try out the lexer on a string and demonstrate how it’s tokenized.
-
inline void DebugToken(int token_id) const
Private Members
-
mutable bool generate_lexer = false
Do we need to regenerate the lexer?