# standard library
import re
import logging
import operator
import functools

# application specific
from .utils import logger, enforce_types
from .exceptions import InvalidToken

logger = logger.getChild('Lexer')

class Lexer(object):
    """
    Splits source text into tokens according to a set of token definitions.

    Definitions come in two flavours: literals, which must equal the
    candidate text exactly, and regular expressions, which are applied with
    ``re.match`` (i.e. anchored at the start of the candidate only).

    :param token_defs: dictionary containing token definitions, see \
:py:meth:`load_token_definitions <tyrian.lexer.Lexer.load_token_definitions>` \
for definitions. A falsy value leaves the lexer unconfigured until \
``load_token_definitions`` is called explicitly.
    """

    def __init__(self, token_defs: dict):
        # maps matcher (anything exposing ``.match(str)``) -> token name
        self.tokens = {}

        # pad brackets and quotes with spaces so that splitting a line on
        # spaces isolates each of them as a token of its own
        self.TRANS = str.maketrans({
            '(': ' ( ',
            ')': ' ) ',
            '"': ' " ',
            "'": " ' ",
        })

        self.tokens_loaded = False
        if token_defs:
            self.load_token_definitions(token_defs)

    def match_with(self, left) -> object:
        """
        Convenience function. Builds an object whose ``match`` attribute is
        ``operator.eq`` partially applied to ``left``, so that literals can
        be queried through the same ``.match(text)`` call as compiled
        regular expressions.

        :param left: constant the returned matcher compares its argument \
against
        :rtype: object with `match` attribute
        """
        match = functools.partial(operator.eq, left)

        # staticmethod keeps ``match`` a plain one-argument callable however
        # it is looked up; a bare partial stored in a class dict is treated
        # as a method on instance access by newer Python versions
        return type('obj', (object,), {'match': staticmethod(match)})

    def load_token_definitions(self, token_defs: dict):
        """
        Iterates through the supplied token_defs dictionary, wrapping each
        literal in an equality matcher and compiling each regular
        expression, then registers them in ``self.tokens``.

        Literals are registered before regular expressions; ``_lex`` tries
        the definitions in that insertion order and stops at the first hit.

        :param token_defs: contains token definitions under the keys \
``'literal'`` and ``'regex'``; see \
:py:meth:`GrammarParser.load_token_definitions \
<tyrian.typarser.grammar_parser.GrammarParser.load_token_definitions>`\
 for format
        :raises KeyError: if either the ``'literal'`` or ``'regex'`` key is \
missing
        """
        for literal, name in token_defs['literal'].items():
            self.tokens[self.match_with(literal)] = name

        for pattern, name in token_defs['regex'].items():
            self.tokens[re.compile(pattern)] = name

        self.tokens_loaded = True

    def lex(self, content: str, filename: str = None) -> list:
        """
        Takes a string to lex according to the token definitions loaded via
        load_token_definitions.

        :param content: content of file being lexed
        :param filename: name of file being lexed, used in the emitted \
tokens and in error messages
        :returns: list of token dictionaries, in source order
        :raises AssertionError: if no token definitions have been loaded
        :raises InvalidToken: if a piece of text matches no definition
        """
        # raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged
        if not self.tokens_loaded:
            raise AssertionError(
                'Please call load_token_definitions before calling this '
                'function')

        tokens = []
        for line_no, line in enumerate(content.split('\n'), start=1):
            tokens.extend(self._lex(line, line_no, filename))

        return tokens

    def _lex(self, line: str, line_no: int, filename: str):
        """
        used internally by lex, does actual lexing

        :param line: line from source file
        :param line_no: line number of provided line
        :param filename: name of file from which the line originates
        :raises InvalidToken: if a non-blank piece of the line matches no \
token definition

        yields tokens of format

        .. code-block:: json

            {
                "name": str,
                "token": str,
                "line_no": int,
                'filename': str
            }
        """
        fragments = line.translate(self.TRANS).strip().split(' ')

        for current_token in fragments:
            for definition, name in self.tokens.items():
                # evaluate the matcher once and reuse the result for both
                # the debug output and the decision below
                matched = definition.match(current_token)
                logger.debug(
                    'matching %r against %r: %s',
                    current_token, name, bool(matched))

                if matched:
                    yield {
                        "name": name,
                        "token": current_token,
                        "line_no": line_no,
                        'filename': filename
                    }
                    break
            else:
                # no definition matched; blank fragments are a by-product of
                # splitting on single spaces and are silently dropped
                if current_token.strip():
                    msg = '"{}" on line {}'.format(current_token, line_no)
                    if filename:
                        msg += ' of file {}'.format(filename)
                    raise InvalidToken(msg)
