"""
Code to perform lexing accourding to token definitions
"""
# standard library
import re
import operator
import functools

# application specific
from .utils import logger, enforce_types
from .exceptions import InvalidToken

logger = logger.getChild('Lexer')

class Lexer(object):
    """
    Code to perform lexing according to token definitions.

    :param token_defs: dictionary containing token definitions, see \
    :py:meth:`load_token_definitions <tyrian.lexer.Lexer.load_token_definitions>` \
    for the expected format
    """

    def __init__(self, token_defs: dict):
        self.tokens = {}
        self.TRANS = str.maketrans({
            '(': ' ( ',
            ')': ' ) ',
            '"': ' " ',
            "'": " ' ",
        })

        if token_defs:
            self.load_token_definitions(token_defs)
        else:
            self.tokens_loaded = False
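
    # Note: TRANS pads the delimiter characters with spaces so that the
    # plain str.split(' ') in _lex isolates them as standalone tokens.
    # For example (illustrative only):
    #
    #     '(add 1)'.translate(self.TRANS)  ->  ' ( add 1 ) '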

    def match_with(self, left) -> object:
        """
        Convenience function.

        Returns an object whose ``match`` attribute is ``operator.eq``
        partially applied to `left`, so that ``match(right)`` tests
        ``left == right``.

        :param left: constant for the returned object's ``match`` to \
        compare against
        :rtype: object with a `match` attribute
        """
        match = functools.partial(operator.eq, left)
        return type('obj', (object,), {'match': match})
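
    # Illustrative usage (hypothetical values): the returned object
    # mimics the ``match`` method of a compiled regex, but tests
    # equality instead:
    #
    #     matcher = lexer.match_with('(')
    #     matcher.match('(')  # True
    #     matcher.match(')')  # False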

    @enforce_types
    def load_token_definitions(self, token_defs: dict):
        """
        Iterates through the supplied token_defs dictionary, creating
        matcher wrappers for literals and compiling regexes.

        :param token_defs: contains token definitions; see \
        :py:meth:`GrammarParser.load_token_definitions \
        <tyrian.typarser.grammar_parser.GrammarParser.load_token_definitions>` \
        for the expected format
        """
        for literal, name in token_defs['literal'].items():
            self.tokens[self.match_with(literal)] = name

        for pattern, name in token_defs['regex'].items():
            self.tokens[re.compile(pattern)] = name

        self.tokens_loaded = True
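
    # Expected shape of ``token_defs`` (token names here are made up for
    # illustration; the authoritative format is documented on
    # GrammarParser.load_token_definitions):
    #
    #     {
    #         'literal': {'(': 'LPAREN', ')': 'RPAREN'},
    #         'regex': {r'\d+$': 'NUMBER'},
    #     }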

    @enforce_types
    def lex(self, content: str, filename: str = None) -> list:
        """
        Takes a string and lexes it according to the token definitions
        loaded via load_token_definitions.

        :param content: content of the file being lexed
        :param filename: name of the file being lexed
        :rtype: list of token dictionaries
        """
        assert self.tokens_loaded, (
            'Please call load_token_definitions before calling this function')

        lines = content.split('\n')

        tokens = []
        for line_no, line in enumerate(lines, start=1):
            tokens.extend(self._lex(line, line_no, filename))

        return tokens

    @enforce_types
    def _lex(self, line: str, line_no: int, filename: str):
        """
        Used internally by lex; does the actual lexing.

        :param line: line from the source file
        :param line_no: line number of the provided line
        :param filename: name of the file from which the line originates

        Yields tokens of the format:

        .. code-block:: python

            {
                "name": str,
                "token": str,
                "line_no": int,
                "filename": str
            }
        """
        line = (line.translate(self.TRANS)
                    .strip()
                    .split(' '))

        for current_token in line:
            for definition, name in self.tokens.items():
                logger.debug('%r -> %s', current_token,
                             bool(definition.match(current_token)))
                if definition.match(current_token):
                    yield {
                        "name": name,
                        "token": current_token,
                        "line_no": line_no,
                        "filename": filename
                    }
                    break
            else:
                if current_token.strip():
                    msg = '"{}" on line {}'.format(current_token, line_no)
                    if filename:
                        msg += ' of file {}'.format(filename)
                    raise InvalidToken(msg)
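

# A minimal usage sketch, assuming the module is run in its package
# context (e.g. ``python -m tyrian.lexer``). The token definitions below
# are hypothetical; real ones come from the grammar files (see
# GrammarParser.load_token_definitions for the authoritative format).
if __name__ == '__main__':
    demo_defs = {
        'literal': {
            '(': 'LPAREN',
            ')': 'RPAREN',
        },
        'regex': {
            r'\d+$': 'NUMBER',
            r'[a-zA-Z_][a-zA-Z0-9_]*$': 'NAME',
        },
    }

    lexer = Lexer(demo_defs)
    for token in lexer.lex('(add 1 2)', filename='<demo>'):
        print(token)
    # each printed token is a dict like:
    # {'name': 'LPAREN', 'token': '(', 'line_no': 1, 'filename': '<demo>'}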