Source code for ptk.lexer

# -*- coding: utf-8 -*-

# (c) Jérôme Laheurte 2015-2026
# See LICENSE.txt

import inspect
import re
import collections
import warnings

from ptk.utils import Singleton, callback_by_name, chars


LexerPosition = collections.namedtuple('_LexerPosition', ['column', 'line'])

# Token declarations are collected in this module-level registry until the
# metaclass picks them up (an alternative would be __prepare__ and an
# ordered mapping).
_TOKREGISTER = []


class _LexerMeta(type):
    def __new__(mcs, name, bases, attrs):
        global _TOKREGISTER # pylint: disable=W0603
        try:
            attrs['__tokens__'] = (set(), []) # Set of token names, list of (rx, callback, defaultType)
            klass = super().__new__(mcs, name, bases, attrs)
            for func, rx, toktypes in _TOKREGISTER:
                klass.add_token_type(func.__name__, callback_by_name(func.__name__), rx, toktypes)
            return klass
        finally:
            _TOKREGISTER = []


def token(rx, types=None):
    """ The method decorator for tokens """
    def _wrap(func):
        if any(func.__name__ == aFunc.__name__ and func != aFunc for aFunc, _, _ in _TOKREGISTER):
            raise TypeError(f'Duplicate token method name "{func.__name__}"')
        _TOKREGISTER.append((func, rx, types))
        return func
    return _wrap


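# A usage sketch (illustrative only; CalcLexer and its token names are
# hypothetical): @token decorates methods of a concrete lexer subclass,
# and the method body may mutate the token before it is emitted.
#
#     class CalcLexer(ReLexer):
#         @token(r'\d+')
#         def number(self, tok):
#             tok.value = int(tok.value)
#
#         @token(r'[a-zA-Z_]\w*', types=['identifier'])
#         def word(self, tok):
#             # When *types* is given, the callback must set tok.type itself.
#             tok.type = 'identifier'

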
class SkipToken(Exception):
    """ Raise this from your consumer to ignore the token. """


class LexerError(Exception):
    """
    Unrecognized token in input

    :ivar lineno: Line in input
    :ivar colno: Column in input
    """
    def __init__(self, char, pos):
        super().__init__(f'Unrecognized token {repr(char)}')
        self.position = pos

    # Getters for compatibility with <1.3.8
    @property
    def colno(self):
        """ Column """
        return self.position.column

    @property
    def lineno(self):
        """ Line """
        return self.position.line


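# An error-handling sketch (illustrative): *position* is a LexerPosition
# named tuple, so callers can report exactly where lexing failed.
#
#     try:
#         lexer.parse('1 + $')  # assuming '$' matches no declared token
#     except LexerError as exc:
#         print(f'line {exc.position.line}, column {exc.position.column}')

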
class EOF(metaclass=Singleton):
    """ End symbol """

    __reprval__ = '$'

    @property
    def type(self):
        """Read-only attribute for Token duck-typing"""
        return self

    @property
    def value(self):
        """Read-only attribute for Token duck-typing"""
        return self


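# A duck-typing sketch (illustrative; it assumes ptk.utils.Singleton binds
# the name EOF to the class's unique instance, which is consistent with
# parse() passing EOF itself to new_token below):
#
#     assert EOF.type is EOF and EOF.value is EOF

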
class LexerBase(metaclass=_LexerMeta):
    """
    This defines the interface for lexer classes. For concrete implementations,
    see :py:class:`ProgressiveLexer` and :py:class:`ReLexer`.
    """
    Token = collections.namedtuple('Token', ['type', 'value', 'position'])

    # Shut up pychecker. Those are actually set by the metaclass.
    __tokens__ = ()

    class _MutableToken: # pylint: disable=too-few-public-methods
        def __init__(self, type_, value, position):
            self.type = type_
            self.value = value
            self.position = position

        def token(self):
            """Returns the immutable equivalent"""
            return EOF if EOF in [self.type, self.value] else LexerBase.Token(self.type, self.value, self.position)

    def __init__(self):
        super().__init__()
        self._pos = None
        self._consumer = None
        self.restart_lexer()

    def restart_lexer(self, reset_pos=True):
        if reset_pos:
            self._pos = LexerPosition(column=1, line=1)
        self._input = []
        self._consumer = None

    def restartLexer(self, resetPos=True): # pylint: disable=invalid-name
        warnings.warn('restartLexer is deprecated in favor of restart_lexer', DeprecationWarning)
        self.restart_lexer(reset_pos=resetPos)

    def position(self):
        """
        :return: The current position in the stream as a 2-tuple (column, line).
        """
        return self._pos

    def advance_column(self, count=1):
        """
        Advances the current position by *count* columns.
        """
        self._pos = self._pos._replace(column=self._pos.column + count)

    def advanceColumn(self, count=1): # pylint: disable=invalid-name
        warnings.warn('advanceColumn is deprecated in favor of advance_column', DeprecationWarning)
        self.advance_column(count=count)

    def advance_line(self, count=1):
        """
        Advances the current position by *count* lines.
        """
        self._pos = self._pos._replace(column=1, line=self._pos.line + count)

    def advanceLine(self, count=1): # pylint: disable=invalid-name
        warnings.warn('advanceLine is deprecated in favor of advance_line', DeprecationWarning)
        self.advance_line(count=count)

    @staticmethod
    def ignore(char):
        """
        Override this to ignore characters in the input stream. The default is
        to ignore spaces and tabs.

        :param char: The character to test
        :return: True if *char* should be ignored
        """
        return char in chars(' ') + chars('\t')

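    # An override sketch (illustrative; it assumes str input, so the chars()
    # bytes/str helper is not needed): also treat carriage returns as
    # ignorable.
    #
    #     class MyLexer(ReLexer):
    #         @staticmethod
    #         def ignore(char):
    #             return char in ' \t\r'
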
    def set_consumer(self, consumer):
        """
        Sets the current consumer. A consumer is an object with a *feed*
        method; all characters seen on the input stream after the consumer is
        set are passed directly to it. When the *feed* method returns a
        2-tuple (type, value), the corresponding token is generated and the
        consumer is reset to None.

        This may be handy for tokens that are not easily recognized by a
        regular expression but are easy to recognize with code; for instance
        the following lexer recognizes C strings without having to use
        negative lookahead:

        .. code-block:: python

           class MyLexer(ReLexer):
               @token('"')
               def cstring(self, tok):
                   class CString:
                       def __init__(self):
                           self.state = 0
                           self.value = io.StringIO()
                       def feed(self, char):
                           if self.state == 0:
                               if char == '"':
                                   return 'cstring', self.value.getvalue()
                               if char == '\\\\':
                                   self.state = 1
                               else:
                                   self.value.write(char)
                           elif self.state == 1:
                               self.value.write(char)
                               self.state = 0
                   self.set_consumer(CString())

        You can also raise SkipToken instead of returning a token if the token
        is to be ignored (e.g. comments).
        """
        self._consumer = consumer

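    # A consumer sketch (illustrative): per _parse below, a consumer may also
    # raise SkipToken to discard its input, e.g. to swallow line comments
    # after a '//' has been matched.
    #
    #     class MyLexer(ReLexer):
    #         @token('//')
    #         def comment(self, tok):
    #             class LineComment:
    #                 def feed(self, char):
    #                     if char == '\n':
    #                         raise SkipToken
    #             self.set_consumer(LineComment())
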
    def setConsumer(self, consumer): # pylint: disable=invalid-name
        warnings.warn('setConsumer is deprecated in favor of set_consumer', DeprecationWarning)
        self.set_consumer(consumer)

    def consumer(self):
        return self._consumer

    def parse(self, string): # pragma: no cover
        """
        Parses the whole *string*; returns the start symbol semantic value
        """
        raise NotImplementedError

    def new_token(self, tok): # pragma: no cover
        """
        This method will be invoked as soon as a token is recognized on input.

        :param tok: The token. This is a named tuple with *type*, *value* and
            *position* attributes.
        """
        raise NotImplementedError

    def newToken(self, tok): # pylint: disable=invalid-name
        warnings.warn('newToken is deprecated in favor of new_token', DeprecationWarning)
        self.new_token(tok)

    @classmethod
    def add_token_type(cls, name, callback, regex, types=None):
        for type_name in [name] if types is None else types:
            if type_name is not EOF:
                cls.__tokens__[0].add(type_name)
        cls.__tokens__[1].append((regex, callback, name if types is None else None))

    @classmethod
    def _all_tokens(cls):
        tokens = (set(), [])
        for base in inspect.getmro(cls):
            if issubclass(base, LexerBase):
                tokens[0].update(base.__tokens__[0])
                tokens[1].extend(base.__tokens__[1])
        return tokens

    @classmethod
    def token_types(cls):
        """
        :return: the set of all token names, as strings.
        """
        return cls._all_tokens()[0]

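    # An introspection sketch (illustrative, reusing the hypothetical
    # CalcLexer from above): names declared via *types* count, not the
    # decorated method's name.
    #
    #     CalcLexer.token_types()  # -> {'number', 'identifier'}
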
    @classmethod
    def tokenTypes(cls): # pylint: disable=invalid-name
        warnings.warn('tokenTypes is deprecated in favor of token_types', DeprecationWarning)
        return cls.token_types()


class ReLexer(LexerBase): # pylint: disable=W0223
    """
    Concrete lexer based on Python regular expressions.
    """
    def __init__(self):
        self._regexes = []
        for rx, callback, default_type in self._all_tokens()[1]:
            crx = re.compile((b'^' if isinstance(rx, bytes) else '^') + rx)
            self._regexes.append((crx, callback, default_type))
        super().__init__()

    def _parse(self, string, pos): # pylint: disable=too-many-nested-blocks
        while pos < len(string):
            char = string[pos]
            try:
                if self.consumer() is None:
                    if self.ignore(char):
                        pos += 1
                        continue
                    pos = self._find_match(string, pos)
                else:
                    try:
                        tok = self.consumer().feed(char)
                    except SkipToken:
                        self.set_consumer(None)
                    else:
                        if tok is not None:
                            self.set_consumer(None)
                            if tok[0] is not None:
                                self.new_token(self.Token(*tok, self.position()))
                    pos += 1
            finally:
                if char in chars('\n'):
                    self.advance_line()
                else:
                    self.advance_column()
        return pos

    def parse(self, string):
        try:
            self._parse(string, 0)
            return self.new_token(EOF)
        except LexerError:
            self.restart_lexer()
            raise

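    # An end-to-end sketch (illustrative; ListLexer is hypothetical): collect
    # what parse() emits by overriding new_token. parse() emits the EOF
    # sentinel last.
    #
    #     class ListLexer(ReLexer):
    #         def __init__(self):
    #             super().__init__()
    #             self.tokens = []
    #
    #         @token(r'\d+')
    #         def number(self, tok):
    #             tok.value = int(tok.value)
    #
    #         def new_token(self, tok):
    #             self.tokens.append(tok)
    #
    #     lexer = ListLexer()
    #     lexer.parse('12 34')
    #     [(t.type, t.value) for t in lexer.tokens if t is not EOF]
    #     # -> [('number', 12), ('number', 34)]
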
    def _find_match(self, string, pos):
        match = None
        matchlen = 0
        pos2d = self.position()
        for rx, callback, default_type in self._regexes:
            mtc = rx.match(string[pos:])
            if mtc:
                value = mtc.group(0)
                if len(value) > matchlen:
                    match = value, callback, default_type
                    matchlen = len(value)
        if match:
            value, callback, default_type = match
            tok = self._MutableToken(default_type, value, pos2d)
            callback(self, tok)
            pos += matchlen
            if self.consumer() is None and tok.type is not None:
                self.new_token(tok.token())
            self.advance_column(matchlen - 1)
            return pos
        raise LexerError(self._guess_token(string, pos), pos2d)

    def _guess_token(self, string, pos):
        start = pos
        while True:
            pos += 1
            if pos == len(string) or self.ignore(string[pos]):
                break
            for rx, _, _ in self._regexes:
                mtc = rx.match(string[pos:])
                if mtc:
                    break
            else:
                continue
            break
        return string[start:pos]


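# A longest-match sketch (illustrative; CmpLexer is hypothetical):
# _find_match keeps the longest match across all token regexes, so
# overlapping declarations behave as expected regardless of order.
#
#     class CmpLexer(ReLexer):
#         @token('==')
#         def eq(self, tok):
#             pass
#
#         @token('=')
#         def assign(self, tok):
#             pass
#
#     # On input '==', a single 2-character 'eq' token is produced rather
#     # than two 'assign' tokens.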