Skip to content

Commit 1e1fd66

Browse files
authored
Merge pull request #89 from yoheimuta/support-utf8-bom
Support UTF-8-BOM files
2 parents bd0bfef + d117c9d commit 1e1fd66

File tree

4 files changed

+57
-16
lines changed

4 files changed

+57
-16
lines changed

_testdata/bom.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
syntax = "proto3";

lexer/scanner/token.go

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ const (
3737
TCOMMA // ,
3838
TDOT // .
3939
TMINUS // -
40+
TBOM // Byte Order Mark
4041

4142
// Keywords
4243
TSYNTAX
@@ -64,22 +65,23 @@ const (
6465

6566
func asMiscToken(ch rune) Token {
6667
m := map[rune]Token{
67-
';': TSEMICOLON,
68-
':': TCOLON,
69-
'=': TEQUALS,
70-
'"': TQUOTE,
71-
'\'': TQUOTE,
72-
'(': TLEFTPAREN,
73-
')': TRIGHTPAREN,
74-
'{': TLEFTCURLY,
75-
'}': TRIGHTCURLY,
76-
'[': TLEFTSQUARE,
77-
']': TRIGHTSQUARE,
78-
'<': TLESS,
79-
'>': TGREATER,
80-
',': TCOMMA,
81-
'.': TDOT,
82-
'-': TMINUS,
68+
';': TSEMICOLON,
69+
':': TCOLON,
70+
'=': TEQUALS,
71+
'"': TQUOTE,
72+
'\'': TQUOTE,
73+
'(': TLEFTPAREN,
74+
')': TRIGHTPAREN,
75+
'{': TLEFTCURLY,
76+
'}': TRIGHTCURLY,
77+
'[': TLEFTSQUARE,
78+
']': TRIGHTSQUARE,
79+
'<': TLESS,
80+
'>': TGREATER,
81+
',': TCOMMA,
82+
'.': TDOT,
83+
'-': TMINUS,
84+
'\uFEFF': TBOM,
8385
}
8486
if t, ok := m[ch]; ok {
8587
return t

parser/proto.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ func (p *Proto) Accept(v Visitor) {
3333
//
3434
// See https://developers.google.com/protocol-buffers/docs/reference/proto3-spec#proto_file
3535
func (p *Parser) ParseProto() (*Proto, error) {
36+
p.parseBOM()
37+
3638
syntaxComments := p.ParseComments()
3739
syntax, err := p.ParseSyntax()
3840
if err != nil {
@@ -55,6 +57,15 @@ func (p *Parser) ParseProto() (*Proto, error) {
5557
}, nil
5658
}
5759

60+
// See https://protobuf.com/docs/language-spec#source-code-representation
61+
func (p *Parser) parseBOM() {
62+
p.lex.Next()
63+
if p.lex.Token == scanner.TBOM {
64+
return
65+
}
66+
defer p.lex.UnNext()
67+
}
68+
5869
// protoBody = { import | package | option | topLevelDef | emptyStatement }
5970
// topLevelDef = message | enum | service | extend
6071
// See https://developers.google.com/protocol-buffers/docs/reference/proto3-spec#proto_file

parser/proto_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2242,6 +2242,33 @@ message foo {
22422242
},
22432243
},
22442244
},
2245+
{
2246+
name: "parsing a UTF-8-BOM file",
2247+
input: string([]byte{
2248+
0xEF, 0xBB, 0xBF,
2249+
}) + `
2250+
syntax = "proto3";
2251+
`,
2252+
wantProto: &parser.Proto{
2253+
Syntax: &parser.Syntax{
2254+
ProtobufVersion: "proto3",
2255+
ProtobufVersionQuote: `"proto3"`,
2256+
Meta: meta.Meta{
2257+
Pos: meta.Position{
2258+
Offset: 4,
2259+
Line: 2,
2260+
Column: 1,
2261+
},
2262+
LastPos: meta.Position{
2263+
Offset: 21,
2264+
Line: 2,
2265+
Column: 18,
2266+
},
2267+
},
2268+
},
2269+
Meta: &parser.ProtoMeta{},
2270+
},
2271+
},
22452272
}
22462273

22472274
for _, test := range tests {

0 commit comments

Comments
 (0)