Skip to content

Commit bfc10e3

Browse files
authored
Merge pull request #38 from mdm-code/unicode-escape
Unicode escape sequence in queries
2 parents 6c4fbda + 83cf329 commit bfc10e3

File tree

5 files changed

+99
-20
lines changed

5 files changed

+99
-20
lines changed

README.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,19 @@ that these should be used such that they do not interfere with shell quoting.
8888
Commonly found characters are mapped onto frequently used escape sequences. These
8989
can be used in quoted strings mostly the same way one would use them in a TOML
9090
file though the specification for the TOML language advises against the use of
91-
funky keys unless there is a good reason to use them. Tq does not support
92-
Unicode escape sequences in quoted strings as of today, but there are plans to
93-
add it in the future.
91+
funky keys unless there is a good reason to use them.
9492

9593
```txt
96-
\b - backspace
97-
\t - tab
98-
\n - linefeed
99-
\f - form feed
100-
\r - carriage return
101-
\" - double quote
102-
\' - single quote
103-
\\ - backslash
94+
\b - backspace
95+
\t - tab
96+
\n - linefeed
97+
\f - form feed
98+
\r - carriage return
99+
\" - double quote
100+
\' - single quote
101+
\\ - backslash
102+
\uhhhh - short 16-bit hexadecimal form
103+
\Uhhhhhhhh - long 32-bit hexadecimal form
104104
```
105105

106106

internal/lexer/lexer.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,6 @@ func (l *Lexer) scanBareString() bool {
169169
}
170170
l.setToken(String, start, l.offset)
171171
return true
172-
173172
}
174173

175174
func (l *Lexer) scanString() bool {

internal/lexer/token.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package lexer
22

33
import (
4+
"strconv"
45
"strings"
56

67
"github.com/mdm-code/scanner"
@@ -87,19 +88,31 @@ func (t Token) reprString() string {
8788
}
8889
chars := make([]string, 0, size)
8990
for head != end {
90-
token := (*t.Buffer)[head]
9191
// NOTE: For quoted strings, check if the current token initiates an
9292
// escape sequence and there is at least a single token left to look up
9393
// followed by the terminating quote character. Bare strings may not
94-
// contain escape sequence characters.
95-
if token.Rune == '\\' && head+2 != end {
94+
// contain escape sequence characters, because backslash is a
95+
// disallowed character in bare strings.
96+
token := (*t.Buffer)[head]
97+
if token.Rune == '\\' && head+1 != end {
9698
v, ok := escapeSequenceMap[(*t.Buffer)[head+1].Rune]
9799
if ok {
98-
token = (*t.Buffer)[head]
99100
head += 2
100101
chars = append(chars, v)
101102
continue
102103
}
104+
if (*t.Buffer)[head+1].Rune == 'u' && head+5 != end {
105+
char := t.parseUnicode(head, 2, 6)
106+
head += 6
107+
chars = append(chars, char)
108+
continue
109+
}
110+
if (*t.Buffer)[head+1].Rune == 'U' && head+9 != end {
111+
char := t.parseUnicode(head, 2, 10)
112+
head += 10
113+
chars = append(chars, char)
114+
continue
115+
}
103116
}
104117
chars = append(chars, string(token.Rune))
105118
head++
@@ -110,6 +123,18 @@ func (t Token) reprString() string {
110123
return strings.Join(chars, "")
111124
}
112125

126+
func (t Token) parseUnicode(head, start, end int) string {
127+
size := end - start
128+
rr := make([]rune, 0, size)
129+
for _, t := range (*t.Buffer)[head+start : head+end] {
130+
rr = append(rr, t.Rune)
131+
}
132+
i, _ := strconv.ParseInt(string(rr), 16, 32)
133+
r := rune(i) // NOTE: Make sure it fits into rune/int32.
134+
result := string(r)
135+
return result
136+
}
137+
113138
func (t Token) reprDefault() string {
114139
end := t.End
115140
size := t.End - t.Start

internal/lexer/token_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,64 @@ func TestLexeme(t *testing.T) {
143143
},
144144
want: "foo\"",
145145
},
146+
{
147+
name: "escaped-unicode-short",
148+
token: Token{
149+
Buffer: &[]scanner.Token{
150+
{Pos: scanner.Pos{Rune: '"'}, Buffer: nil},
151+
{Pos: scanner.Pos{Rune: '\\'}, Buffer: nil},
152+
{Pos: scanner.Pos{Rune: 'u'}, Buffer: nil},
153+
{Pos: scanner.Pos{Rune: '3'}, Buffer: nil},
154+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
155+
{Pos: scanner.Pos{Rune: 'B'}, Buffer: nil},
156+
{Pos: scanner.Pos{Rune: 'F'}, Buffer: nil},
157+
{Pos: scanner.Pos{Rune: '\\'}, Buffer: nil},
158+
{Pos: scanner.Pos{Rune: 'u'}, Buffer: nil},
159+
{Pos: scanner.Pos{Rune: '3'}, Buffer: nil},
160+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
161+
{Pos: scanner.Pos{Rune: 'c'}, Buffer: nil},
162+
{Pos: scanner.Pos{Rune: 'f'}, Buffer: nil},
163+
{Pos: scanner.Pos{Rune: '"'}, Buffer: nil},
164+
},
165+
Type: String,
166+
Start: 0,
167+
End: 14,
168+
},
169+
want: "タハ",
170+
},
171+
{
172+
name: "escaped-unicode-long",
173+
token: Token{
174+
Buffer: &[]scanner.Token{
175+
{Pos: scanner.Pos{Rune: '"'}, Buffer: nil},
176+
{Pos: scanner.Pos{Rune: '\\'}, Buffer: nil},
177+
{Pos: scanner.Pos{Rune: 'U'}, Buffer: nil},
178+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
179+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
180+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
181+
{Pos: scanner.Pos{Rune: '1'}, Buffer: nil},
182+
{Pos: scanner.Pos{Rune: 'F'}, Buffer: nil},
183+
{Pos: scanner.Pos{Rune: '6'}, Buffer: nil},
184+
{Pos: scanner.Pos{Rune: '3'}, Buffer: nil},
185+
{Pos: scanner.Pos{Rune: '1'}, Buffer: nil},
186+
{Pos: scanner.Pos{Rune: '\\'}, Buffer: nil},
187+
{Pos: scanner.Pos{Rune: 'U'}, Buffer: nil},
188+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
189+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
190+
{Pos: scanner.Pos{Rune: '0'}, Buffer: nil},
191+
{Pos: scanner.Pos{Rune: '1'}, Buffer: nil},
192+
{Pos: scanner.Pos{Rune: 'f'}, Buffer: nil},
193+
{Pos: scanner.Pos{Rune: '6'}, Buffer: nil},
194+
{Pos: scanner.Pos{Rune: '4'}, Buffer: nil},
195+
{Pos: scanner.Pos{Rune: 'f'}, Buffer: nil},
196+
{Pos: scanner.Pos{Rune: '"'}, Buffer: nil},
197+
},
198+
Type: String,
199+
Start: 0,
200+
End: 22,
201+
},
202+
want: "😱🙏",
203+
},
146204
}
147205
for _, c := range cases {
148206
t.Run(c.name, func(t *testing.T) {

internal/parser/parser.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,7 @@ func (p *Parser) advance() lexer.Token {
171171
}
172172

173173
func (p *Parser) isAtEnd() bool {
174-
if p.current > len(p.buffer)-1 {
175-
return true
176-
}
177-
return false
174+
return p.current > len(p.buffer)-1
178175
}
179176

180177
func (p *Parser) previous() lexer.Token {

0 commit comments

Comments
 (0)