# SimpleLexer/python_lexer.py (GitHub: MrXerios/SimpleLexer, branch main)
import builtins
import keyword
import re

from lexer.lexer_base import Lexer, bygroups, from_list, include, default, combination
from lexer.tokens import _token_type
import lexer.unistring as uni

# Regex-escaped spellings of every operator / augmented assignment the lexer
# recognises. Duplicates are removed and the list is ordered longest-first
# (ties broken alphabetically) so that an ordered alternation built from it
# prefers ``**=`` over ``**`` over ``*`` instead of splitting the longer token.
operator_list = [
    re.escape(op)
    for op in sorted(
        set(
            "+ - * ** / // % @ & | ^ ~ >> << == = := "
            "+= -= *= **= /= //= %= @= &= |= ^= >>= <<= "
            ">= <= != < >".split()
        ),
        key=lambda op: (-len(op), op),
    )
]


# Names of everything in the ``builtins`` module (functions, exceptions,
# constants). The module is queried explicitly because ``__builtins__`` is a
# CPython implementation detail: it is the module only inside ``__main__`` and
# a plain dict in imported modules, where ``dir()`` would return dict method
# names instead of builtin names.
list_builtins = dir(builtins)


# Define root of tokens
# Every token type used in this module is reached by attribute access from
# this root object (e.g. ``Syntax.String.Simple``).
Syntax = _token_type()

# Aliases
# Short module-level names for the token categories referenced by the rule
# table of PythonLexer.
Whitespace = Syntax.Whitespace
String = Syntax.String
SString = String.Simple  # used by the non-formatted string states
FString = String.Formatted  # used by the f-string states
Comment = Syntax.Comment
Keyword = Syntax.Keyword
Name = Syntax.Name
Operator = Syntax.Operator
Number = Syntax.Number
Punctuation = Syntax.Punctuation

class PythonLexer(Lexer):
    """Regex rule table for tokenizing Python source.

    ``states`` maps each state name to an ordered list of rules. A rule is
    either a tuple ``(pattern, token[, next_state])`` or the result of a
    helper imported from ``lexer.lexer_base`` (``include``, ``from_list``,
    ``bygroups``).

    NOTE(review): the ordering below assumes that rules are tried top to
    bottom with the first match winning, that ``'#pop'`` leaves the current
    state, and that the input is fed one line at a time without its trailing
    newline -- confirm against ``Lexer``.
    """

    # Regex for possible syntax for an identifier using unicode
    uni_name = f"[{uni.xid_start}][{uni.xid_continue}]*"

    states = {
        'root': [
            # for speed: immediate match on empty lines
            (r'\Z', Whitespace),

            (r'\A#!.+$', Comment.Hashbang),
            (r'#[ \t]*(?i:todo|2do|fixme).*$', Comment.Todo),
            # Comment cell. Must precede the simple-comment rule, which would
            # otherwise swallow a ``##`` line that starts at column 0.
            (r'^[ \t]*##.*$', Comment.Cell),
            (r'#.*$', Comment), # Simple comment
            (fr'(def)([ \t]+)({uni_name})', bygroups(Keyword, Whitespace, Name.Function)),
            (fr'(class)([ \t]+)({uni_name})', bygroups(Keyword, Whitespace, Name.Class)),
            (fr'@{uni_name}', Name.Decorator),
            include("expression"),
        ],
        "expression": [
            # Parenthesis
            (r'[\(\{\[]', Syntax.Open),
            (r'[\)\}\]]', Syntax.Close),

            # Each string rule below matches only the prefix; the quotes are
            # checked with a lookahead and consumed by the pushed state.

            # Non-formatted Strings
            # Multiline strings
            (r"([uUrRbB]{,2})(?=''')", String.Literal, "string-single-multiline"),
            (r'([uUrRbB]{,2})(?=""")', String.Literal, "string-double-multiline"),

            # One line strings
            (r"([uUrRbB]{,2})(?=')", String.Literal, "string-single-oneline"),
            (r'([uUrRbB]{,2})(?=")', String.Literal, "string-double-oneline"),

            # Formatted strings (and templates from python 3.14)
            # Multiline fstrings
            (r"([uUrRbBfFtT]{1,2})(?=''')", String.Literal, "fstring-single-multiline"),
            (r'([uUrRbBfFtT]{1,2})(?=""")', String.Literal, "fstring-double-multiline"),

            # One line fstring
            (r"([uUrRbBfFtT]{1,2})(?=')", String.Literal, "fstring-single-oneline"),
            (r'([uUrRbBfFtT]{1,2})(?=")', String.Literal, "fstring-double-oneline"),

            # Numbers
            # Imaginary literal, with optional exponent (1j, 2.5J, 1e3j)
            (r'\d[\d_.]*(?:[eE][+-]?\d[\d_]*)?[jJ]\b', Number.Complex),
            # A float needs a fractional part and/or an exponent; otherwise
            # every plain integer would match here and the Int rule below
            # would be unreachable. (?!\w) keeps ``1.real`` from matching.
            (r'\d[\d_]*(?:\.[\d_]*(?:[eE][+-]?\d[\d_]*)?|[eE][+-]?\d[\d_]*)(?!\w)', Number.Float),
            (r'\d[\d_]*\b', Number.Int),
            # Prefixed integers: the decimal rules above cannot match them
            # because a letter directly follows the leading 0.
            (r'0[xX][_0-9a-fA-F]+\b', Number.Hexadecimal),
            (r'0[oO][_0-7]+\b', Number.Octal),
            (r'0[bB][_01]+\b', Number.Binary),

            # Constants. Listed before the builtins because True, False and
            # None are also names of the builtins module and would otherwise
            # be tagged as Keyword.Builtin.
            from_list(['True', 'False', 'None'], Keyword.Constant, suffix=r'\b'),

            # Builtins
            from_list(list_builtins, Keyword.Builtin, suffix=r'\b'),

            # Keywords
            from_list([kw for kw in keyword.kwlist if not kw[0].isupper()], Keyword, suffix=r'\b'),

            # Soft-Keywords (ignore _ for simplicity)
            (r'^([ \t]*)(match|case)\b', bygroups(Whitespace, Keyword)), # TODO improve this

            # Operators
            from_list(operator_list, Operator),

            # Regular name
            (uni_name, Name),

            # Punctuation
            (re.escape('...'), Punctuation),
            (rf'(\.)({uni_name})', bygroups(Punctuation, Name.Attribute)),
            (r'[;:,]',Punctuation),
            (r'[ \t]+', Whitespace),

            # Invalid specifiers
            (r'\w+\b', Syntax.Invalid), # Invalid name
            (r'.+?', Syntax.Invalid), # Default in case nothing else matches
        ],
        "string-single-multiline": [
            # Search for beginning and end of string
            (r"'''.*?'''", SString.Multiline, '#pop'),
            # Search for beginning and end of line
            (r"'''.*?\Z", SString.Multiline),
            # Search for end of string
            (r".*?'''", SString.Multiline, '#pop'),
            # If previous do not match, whole line is string. Keep going.
            (r".*?\Z", SString.Multiline),
        ],
        "string-double-multiline": [
            # Same as string-single-multiline
            (r'""".*?"""', SString.Multiline, '#pop'),
            (r'""".*?\Z', SString.Multiline),
            (r'.*?"""', SString.Multiline, '#pop'),
            (r'.*?\Z', SString.Multiline),
        ],
        "string-single-oneline": [
            # Search for beginning and end of string. A backslash escapes the
            # following character, so \' does not terminate the literal.
            (r"'(?:\\.|[^'\\\n])*'", SString.Oneline, '#pop'),
            # Search for beginning and end of line with line continuation \
            (r"'.*?\\[ \t]*\Z", SString.Oneline),
            # Search for beginning and end of line without line continuation
            (r"'.*?\Z", SString.Oneline.Unterminated, '#pop'),
            # Search for end of string (continue from line continuation)
            (r"(?:\\.|[^'\\\n])*'", SString.Oneline, '#pop'),
            # If previous do not match, unterminated string.
            (r".*?\Z", SString.Oneline.Unterminated, '#pop'),
        ],
        "string-double-oneline": [
            # Same as string-single-oneline
            (r'"(?:\\.|[^"\\\n])*"', SString.Oneline, '#pop'),
            (r'".*?\\[ \t]*\Z', SString.Oneline),
            (r'".*?\Z', SString.Oneline.Unterminated, '#pop'),
            (r'(?:\\.|[^"\\\n])*"', SString.Oneline, '#pop'),
            (r'.*?\Z', SString.Oneline.Unterminated, '#pop'),
        ],
        "expression-fstring": [
            (r'![ars]', FString.Format.Specifier), # !r !s or !a
            (r':.*?(?=\})', FString.Format.Specifier, '#pop'), # {... :.2f}
            (r'(?=\})', None, '#pop'), # todo this could be better : f"{{}}" is an issue
            include("expression"),
        ],
        # NOTE: unlike the plain one-line string states, the f-string states
        # below do not treat a backslash-escaped quote specially.
        "fstring-single-multiline": [
            # Search for formatting cell
            # ((?!''').)* is here to ensure that a cell isn't found out
            # of the string
            (r"'''((?!''').)*?\{", FString.Multiline, "expression-fstring"),
            # In case of multiple formatting cells, or cell after new line
            (r"\}?((?!''').)*?\{", FString.Multiline, "expression-fstring"),
            # Search for end of cell and end of string
            (r"\}.*?'''", FString.Multiline, '#pop'),
            # Search for end of cell and no end of string
            (r"\}.*?\Z", FString.Multiline),
            # Search for beginning and end of string (no formatting cell)
            (r"'''.*?'''", FString.Multiline, '#pop'),
            # Opening line with neither cell nor end of string. Must precede
            # the next rule, which would otherwise pop on the opening quotes.
            (r"'''.*?\Z", FString.Multiline),
            # Search for end of string on a line without formatting cell
            (r".*?'''", FString.Multiline, '#pop'),
            # If previous do not match, whole line is string. Keep going.
            (r".*?\Z", FString.Multiline),
        ],
        "fstring-double-multiline": [
            # Same as fstring-single-multiline
            (r'"""((?!""").)*?\{', FString.Multiline, "expression-fstring"),
            (r'\}?((?!""").)*?\{', FString.Multiline, "expression-fstring"),
            (r'\}.*?"""', FString.Multiline, '#pop'),
            (r'\}.*?\Z', FString.Multiline),
            (r'""".*?"""', FString.Multiline, '#pop'),
            (r'""".*?\Z', FString.Multiline),
            (r'.*?"""', FString.Multiline, '#pop'),
            (r'.*?\Z', FString.Multiline),
        ],
        "fstring-single-oneline": [
            # Search for formatting cell
            (r"'((?!').)*?\{", FString.Oneline, "expression-fstring"),
            # In case of multiple formatting cells
            (r"\}((?!').)*?\{", FString.Oneline, "expression-fstring"),
            # Search for beginning and end of string
            (r"'.*?'", FString.Oneline, '#pop'),
            # Search for end of string
            (r".*?'", FString.Oneline, '#pop'),
            # Search for line continuation
            (r'.*?\\[ \t]*\Z', FString.Oneline),
            # If previous do not match, whole line is string. pop.
            (r'.*?\Z', FString.Oneline, '#pop'),
        ],
        "fstring-double-oneline": [
            # Same as fstring-single-oneline
            (r'"((?!").)*?\{', FString.Oneline, "expression-fstring"),
            (r'\}((?!").)*?\{', FString.Oneline, "expression-fstring"),
            # Beginning and end of string without formatting cell. Without
            # this rule the next one pops on the opening quote alone.
            (r'".*?"', FString.Oneline, '#pop'),
            (r'.*?"', FString.Oneline, '#pop'),
            (r'.*?\\[ \t]*\Z', FString.Oneline),
            (r'.*?\Z', FString.Oneline, '#pop'),
        ]
    }

if __name__ == "__main__":
    # Self-test: lex this very file and print every span that ended up as
    # Syntax.Invalid. ``__file__`` is used instead of a bare file name so the
    # script works from any working directory, and the encoding is pinned so
    # the result does not depend on the platform default.
    lexer = PythonLexer()
    with open(__file__, encoding="utf-8") as f:
        source = f.read()
    for t in lexer.parse(source):
        if t.type == Syntax.Invalid:
            print(t.type, ' ', t.text)