# BSD 3-Clause License
#
# Copyright (c) 2025, Spill-Tea
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

| 30 | +"""Customized python lexer.""" |

from collections import deque
from collections.abc import Iterator
from typing import ClassVar

from pygments.lexer import bygroups, include
from pygments.lexers.python import PythonLexer
from pygments.token import (
    Comment,
    Keyword,
    Name,
    Number,
    Punctuation,
    String,
    Text,
    Whitespace,
    _TokenType,
)
from utils import get_bracket_level
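
# NOTE: `get_bracket_level` is a project-local helper, not part of pygments.
# A minimal sketch of the assumed behavior (cycle index -> distinct Punctuation
# subtoken), for reference only:
#
#     def get_bracket_level(idx: int) -> _TokenType:
#         # Attribute access on a pygments token creates the subtoken lazily,
#         # e.g. Punctuation.Bracket0 ... Punctuation.Bracket3.
#         return getattr(Punctuation, f"Bracket{idx}")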


def _find(it, obj, key=lambda a, b: a == b) -> int:
    """Return the index of the first item in `it` matching `obj` under `key`."""
    for n, j in enumerate(it):
        if key(j, obj):
            return n
    raise IndexError("Unable to find object.")


def _get_index(n: int):
    """Build a key function comparing `b` against the n-th element of `a`."""

    def inner(a, b) -> bool:
        return a[n] == b

    return inner


root: list = [
    (r"\n", Whitespace),
    (  # single-line docstrings (edge case)
        r'^(\s*)([rRuUbB]{,2})("""(?:.)*?""")',
        bygroups(Whitespace, String.Affix, String.Doc),
    ),
    (  # Modified triple-double-quote docstrings to highlight docstring titles
        r'^(\s*)([rRuUbB]{,2})(""")',
        bygroups(Whitespace, String.Affix, String.Doc),
        "docstring-double",
    ),
    (  # Intentionally treat text enclosed within triple single quotes as String
        r"^(\s*)([rRuUbB]{,2})('''(?:.|\n)*?''')",
        bygroups(Whitespace, String.Affix, String),
    ),
    (r"\A#!.+$", Comment.Hashbang),
    (
        # Format special common keyword comments
        # NOTE: Must come before the Comment.Single token in order to be matched.
        r"(#\s*)(TODO|FIXME|NOTE|BUG|HACK|XXX)(:?)(.*$)",
        bygroups(Comment.Single, Comment.Special, Comment.Special, Comment.Single),
    ),
    (r"#.*$", Comment.Single),
    (r"\\\n", Text),
    (r"\\", Text),
    include("keywords"),
    include("soft-keywords"),
    (
        r"(def)((?:\s|\\\s)+)",
        bygroups(Keyword.Declare, Whitespace),
        "funcname",
    ),
    (
        r"(class)((?:\s|\\\s)+)",
        bygroups(Keyword.Declare, Whitespace),
        "classname",
    ),
    (
        r"(from)((?:\s|\\\s)+)",
        bygroups(Keyword.Namespace, Whitespace),
        "fromimport",
    ),
    (
        r"(import)((?:\s|\\\s)+)",
        bygroups(Keyword.Namespace, Whitespace),
        "import",
    ),
    include("expr"),
]
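
# e.g. "# TODO: refactor" is split into (Comment.Single, "# "),
# (Comment.Special, "TODO"), (Comment.Special, ":"), and (Comment.Single, " refactor"),
# so keyword comments can be styled distinctly from ordinary comments.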


python_tokens: dict[str, list] = PythonLexer.tokens.copy()
python_tokens["root"] = root
python_tokens["docstring-double"] = [
    (
        r"(?<=\n)(\s*)(Args|Attributes|Returns|Raises|"
        r"Examples|Yields|References|Notes|Equations)(:)(\s*)",
        bygroups(Whitespace, String.Doc.Title, String.Doc, Whitespace),
    ),
    (r'^\s*(?:""")', String.Doc, "#pop"),
    (r".+[\r\n]*", String.Doc),
]
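
# e.g. within a docstring, a line such as "    Args:" is emitted as
# (Whitespace, "    "), (String.Doc.Title, "Args"), (String.Doc, ":"), plus any
# trailing whitespace, so styles can render Google-style section titles distinctly.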

# Tokenize function names when used (i.e., function calls)
# NOTE: Must be inserted before the general `Name` token but after the builtin
#       name tokens.
# NOTE: Implementation limitation -> we cannot distinguish between class and
#       function calls using regex-based parsing alone (i.e., without semantic
#       analysis).
python_tokens["name"].insert(
    _find(python_tokens["name"], Name, _get_index(1)),
    (r"\b([a-zA-Z_]\w*)(?=\s*\()", Name.Function),
)
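
# e.g. in "result = helper(x)", "helper" now tokenizes as Name.Function rather
# than plain Name; "MyClass(x)" is tagged the same way, per the limitation above.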

python_tokens["numbers"] = [
    (
        r"(\d(?:_?\d)*\.(?:\d(?:_?\d)*)?|(?:\d(?:_?\d)*)?\.\d(?:_?\d)*)"
        r"([eE][+-]?\d(?:_?\d)*)?([jJ]?)",
        bygroups(Number.Float, Number.Float, Number.Other),
    ),
    (r"(\d(?:_?\d)*[eE][+-]?\d(?:_?\d)*)([jJ]?)", bygroups(Number.Float, Number.Other)),
    (r"(0[oO])((?:_?[0-7])+)", bygroups(Number.Other, Number.Oct)),
    (r"(0[bB])((?:_?[01])+)", bygroups(Number.Other, Number.Bin)),
    (r"(0[xX])((?:_?[a-fA-F0-9])+)", bygroups(Number.Other, Number.Hex)),
    (r"(\d(?:_?\d)*)([jJ]?)", bygroups(Number.Integer, Number.Other)),
]
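
# e.g. "0x1F" now yields (Number.Other, "0x") followed by (Number.Hex, "1F"),
# so the base prefix (and the imaginary suffix "j") can be styled independently
# of the digits.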


class CustomPythonLexer(PythonLexer):
    """Enhanced regex-based Python lexer.

    Notes:
        1. Implements a simple stack-based rainbow bracket colorizer.
            * Limitation: only detects errors where more brackets are closed
              than opened.
        2. Highlights docstring titles (assumes Google docstring format).
        3. Improves highlighting of function calls (with limitations).
        4. Modifies display of number components that indicate a different
           number base.

    """

    n_brackets: int
    _stack: deque[int]
    tokens: ClassVar[dict[str, list]] = python_tokens

    def __init__(self, **options) -> None:
        super().__init__(**options)
        self._stack = deque[int]()
        # Number of distinct bracket colors to cycle through.
        self.n_brackets = int(options.get("n_brackets", 4))

    def _enter(self) -> _TokenType:
        """Push the next nesting level and return its cycled bracket token."""
        idx = len(self._stack) % self.n_brackets
        self._stack.append(idx)

        return get_bracket_level(idx)

    def _exit(self) -> _TokenType:
        """Pop a nesting level and return its bracket token.

        Returns Punctuation.Error for a closing bracket with no matching opener.
        """
        try:
            idx: int = self._stack.pop()
            return get_bracket_level(idx)

        except IndexError:
            return Punctuation.Error

    def get_tokens_unprocessed(
        self,
        text,
        stack=("root",),
    ) -> Iterator[tuple[int, _TokenType, str]]:
        """Post-process the parent lexer's token stream."""
        _token: _TokenType
        for idx, token, value in super().get_tokens_unprocessed(text, stack):
            _token = token
            # Re-tag fully uppercase names as constants.
            if token is Name and value.isupper():
                _token = Name.Constant

            # Rainbow brackets: push a level on open, pop on close.
            elif token is Punctuation:
                match value:
                    case "(" | "[" | "{":
                        _token = self._enter()
                    case "}" | "]" | ")":
                        _token = self._exit()
                    case _:
                        ...

            yield idx, _token, value
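

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): render a snippet
    # with the custom lexer. TerminalFormatter is an illustrative choice; any
    # pygments formatter works, provided the active style maps the custom
    # tokens (e.g. the Punctuation subtokens from get_bracket_level and
    # String.Doc.Title) to colors.
    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    sample = "CONST = {0x1F: [func(1), 2j]}\n"
    print(highlight(sample, CustomPythonLexer(), TerminalFormatter()))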