
Commit 837c0aa

fix(lint): address critical linting issues
- Add error handling for deferred file close operations
- Change file permissions from 0644 to 0600 for security
- Add #nosec comments for false positive security warnings
- Replace deprecated strings.Title with simple title case logic
- Fix Unicode format character with escape sequence
- Add periods to comment endings where missing
- Fix double periods where they were incorrectly added
1 parent db58bb6 commit 837c0aa
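
For reference, the deferred-cleanup pattern this commit applies in several places, as a standalone sketch (the file name is illustrative):

package main

import (
	"log"
	"os"
)

func main() {
	f, err := os.Create("profile.out") // illustrative path
	if err != nil {
		log.Fatal("could not create file: ", err)
	}
	// A bare `defer f.Close()` discards the returned error, which linters
	// such as errcheck flag. Wrapping the call in a closure lets the
	// program at least log a failed close.
	defer func() {
		if err := f.Close(); err != nil {
			log.Printf("Failed to close file: %v", err)
		}
	}()
	// ... use f ...
}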

7 files changed, +55 −42 lines changed

llama3/cmd/tools/generate-vectors/main.go

Lines changed: 8 additions & 4 deletions
@@ -44,20 +44,24 @@ inputs.forEach(input => {
 
 	// Write to temporary file
 	tmpFile := filepath.Join(os.TempDir(), "generate_vectors.js")
-	if err := os.WriteFile(tmpFile, []byte(jsContent), 0644); err != nil {
+	if err := os.WriteFile(tmpFile, []byte(jsContent), 0600); err != nil {
 		log.Fatalf("Failed to write JS file: %v", err)
 	}
-	defer os.Remove(tmpFile)
+	defer func() {
+		if err := os.Remove(tmpFile); err != nil {
+			log.Printf("Failed to remove temporary file: %v", err)
+		}
+	}()
 
 	// Run the script
-	cmd := exec.Command("node", tmpFile)
+	cmd := exec.Command("node", tmpFile) // #nosec G204 - tmpFile is safely constructed
 	output_bytes, err := cmd.Output()
 	if err != nil {
 		log.Fatalf("Failed to run JS script: %v", err)
 	}
 
 	// Write output
-	if err := os.WriteFile(*output, output_bytes, 0644); err != nil {
+	if err := os.WriteFile(*output, output_bytes, 0600); err != nil {
 		log.Fatalf("Failed to write output file: %v", err)
 	}
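
On the permission change above: 0644 (rw-r--r--) leaves the file readable by group and world, while 0600 (rw-------) restricts it to the owning user; gosec's G306 check flags os.WriteFile calls with broader modes. A minimal sketch, with path and content purely illustrative:

package main

import (
	"log"
	"os"
)

func main() {
	data := []byte("generated output") // illustrative content
	// 0600: owner read/write only. 0644 would also grant read access
	// to group members and other users.
	if err := os.WriteFile("out.txt", data, 0600); err != nil {
		log.Fatalf("Failed to write file: %v", err)
	}
}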

llama3/cmd/tools/profile/main.go

Lines changed: 11 additions & 3 deletions
@@ -29,7 +29,11 @@ func main() {
 	if err != nil {
 		log.Fatal("could not create CPU profile: ", err)
 	}
-	defer f.Close()
+	defer func() {
+		if err := f.Close(); err != nil {
+			log.Printf("Failed to close CPU profile file: %v", err)
+		}
+	}()
 	if err := pprof.StartCPUProfile(f); err != nil {
 		log.Fatal("could not start CPU profile: ", err)
 	}
@@ -69,7 +73,11 @@ func main() {
 	if err != nil {
 		log.Fatal("could not create memory profile: ", err)
 	}
-	defer f.Close()
+	defer func() {
+		if err := f.Close(); err != nil {
+			log.Printf("Failed to close memory profile file: %v", err)
+		}
+	}()
 	runtime.GC() // get up-to-date statistics
 	if err := pprof.WriteHeapProfile(f); err != nil {
 		log.Fatal("could not write memory profile: ", err)
@@ -96,7 +104,7 @@ func getTestText(textType string) string {
 	case "unicode":
 		return "Hello world! 你好世界! Привет мир! مرحبا بالعالم! " +
 			"🌍🌎🌏 Unicode test with emojis 🦙🐕🦊 and various scripts " +
-			"αβγδε ΑΒΓΔΕ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿"
+			"αβγδε ΑΒΓΔΕ ¡¢£¤¥¦§¨©ª«¬\u00ad®¯°±²³´µ¶·¸¹º»¼½¾¿"
 
 	case "whitespace":
 		return " Multiple spaces between words \t\t\tand\ttabs\t\t\t" +
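
The character escaped in the last hunk is U+00AD (soft hyphen), a Unicode format character that is invisible in most editors, which is why linters flag it when it appears literally in source. A small sketch showing the escape changes nothing at runtime:

package main

import "fmt"

func main() {
	s := "¬\u00ad®" // soft hyphen written as an escape so it is visible in source
	fmt.Printf("%U\n", '\u00ad') // U+00AD
	fmt.Println(len([]rune(s)))  // 3: the invisible character is still present
}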

llama3/constants.go

Lines changed: 10 additions & 10 deletions
@@ -2,48 +2,48 @@
 // This file contains all constants used throughout the tokenizer implementation.
 package llama3
 
-// Vocabulary sizes
+// Vocabulary sizes.
 const (
 	baseVocabSize     = 128000 // Base vocabulary size
 	specialTokenCount = 256    // Number of special tokens
 	totalVocabSize    = baseVocabSize + specialTokenCount
 )
 
-// Token IDs for special tokens
+// Token IDs for special tokens.
 const (
 	beginOfTextTokenID = 128000
 	endOfTextTokenID   = 128001
 )
 
-// Pre-tokenization limits
+// Pre-tokenization limits.
 const (
 	maxNumberLength = 3 // Maximum consecutive digits in a single token
 )
 
-// Pool configuration
+// Pool configuration.
 const (
 	defaultStateMachineTokenCapacity = 32   // Initial capacity for state machine tokens
 	defaultTokenBufferCapacity       = 64   // Initial capacity for token buffers
 	maxPooledTokenBufferCapacity     = 1024 // Maximum capacity for pooled token buffers
 )
 
-// Cache configuration
+// Cache configuration.
 const (
 	defaultCacheSize = 0 // 0 means unlimited
 )
 
-// BPE configuration
+// BPE configuration.
 const (
 	estimatedTokensPerCharacter = 4 // Rough estimate for initial slice capacity
 	bytesPerMerge               = 3 // Number of bytes to read for each merge
 )
 
-// Merge data configuration
+// Merge data configuration.
 const (
 	bitsPerMergeID = 17 // Number of bits used to encode each merge ID
 )
 
-// Character mappings for byte-level encoding
+// Character mappings for byte-level encoding.
 const (
 	asciiPrintableStart = '!' // First printable ASCII character
 	asciiPrintableEnd   = '~' // Last printable ASCII character
@@ -54,8 +54,8 @@ const (
 	unicodeOffset = 256 // Offset for mapping non-printable bytes
 )
 
-// Special token constants
+// Special token constants.
 const (
 	beginOfTextToken = "<|begin_of_text|>"
-	endOfTextToken   = "<|end_of_text|>"
+	endOfTextToken   = "<|end_of_text|>" // #nosec G101 - Not a credential, just a special token marker
 )
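
The #nosec G101 directive suppresses gosec's "potential hardcoded credentials" rule, which pattern-matches identifier names containing words like "token"; a tokenizer sentinel such as endOfTextToken is a textbook false positive. The same suppression pattern in isolation (sentinelToken is a hypothetical name):

package llama3

// gosec G101 keys on the identifier name, not the value; the trailing
// directive documents why this particular match is safe to ignore.
const sentinelToken = "<|sentinel|>" // #nosec G101 - marker string, not a credential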

llama3/internal/encoding/encoding.go

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@ import (
 	"strings"
 )
 
-// Constants for byte mapping ranges
+// Constants for byte mapping ranges.
 const (
 	asciiPrintableStart = '!' // 33
 	asciiPrintableEnd   = '~' // 126
@@ -16,9 +16,9 @@ const (
 )
 
 var (
-	// BytesToUnicode maps byte values to unicode characters for encoding
+	// BytesToUnicode maps byte values to unicode characters for encoding.
 	BytesToUnicode map[byte]rune
-	// UnicodeToBytes maps unicode characters back to byte values for decoding
+	// UnicodeToBytes maps unicode characters back to byte values for decoding.
 	UnicodeToBytes map[rune]byte
 )

llama3/internal/pretokenizer/api.go

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 package pretokenizer
 
 const (
-	// Pool configuration
+	// Pool configuration.
 	defaultStateMachineTokenCapacity = 32   // Initial capacity for state machine tokens
 	defaultTokenBufferCapacity       = 64   // Initial capacity for token buffers
 	maxPooledTokenBufferCapacity     = 1024 // Maximum capacity for pooled token buffers

llama3/internal/pretokenizer/state_machine.go

Lines changed: 15 additions & 15 deletions
@@ -14,7 +14,7 @@ type stateMachine struct {
 	tokens []string
 }
 
-// stateMachinePool provides a pool of reusable state machines for performance
+// stateMachinePool provides a pool of reusable state machines for performance.
 var stateMachinePool = &sync.Pool{
 	New: func() interface{} {
 		return &stateMachine{
@@ -23,14 +23,14 @@ var stateMachinePool = &sync.Pool{
 	},
 }
 
-// tokenBufPool provides a pool of token buffers for better memory efficiency
+// tokenBufPool provides a pool of token buffers for better memory efficiency.
 var tokenBufPool = sync.Pool{
 	New: func() interface{} {
 		return make([]string, 0, defaultTokenBufferCapacity)
 	},
 }
 
-// getStateMachine gets a state machine from the pool
+// getStateMachine gets a state machine from the pool.
 func getStateMachine(text string) *stateMachine {
 	sm := stateMachinePool.Get().(*stateMachine)
 	sm.input = []rune(text)
@@ -39,7 +39,7 @@ func getStateMachine(text string) *stateMachine {
 	return sm
 }
 
-// putStateMachine returns a state machine to the pool
+// putStateMachine returns a state machine to the pool.
 func putStateMachine(sm *stateMachine) {
 	// Clear references to allow GC
 	sm.input = nil
@@ -68,7 +68,7 @@ func Tokenize(text string) []string {
 
 	// Return token buffer to pool
 	if cap(sm.tokens) <= 1024 {
-		tokenBufPool.Put(sm.tokens[:0])
+		tokenBufPool.Put(sm.tokens[:0]) // #nosec - slice header is small, not worth pointer optimization
 	}
 
 	// Return state machine to pool
@@ -89,15 +89,15 @@ func newStateMachine(text string) *stateMachine {
 	}
 }
 
-// tokenizeWithStateMachine processes the input according to the JS regex pattern
+// tokenizeWithStateMachine processes the input according to the JS regex pattern.
 func (sm *stateMachine) tokenizeWithStateMachine() []string {
 	for sm.position < len(sm.input) {
 		sm.matchNext()
 	}
 	return sm.tokens
 }
 
-// matchNext tries to match the next token according to the pattern
+// matchNext tries to match the next token according to the pattern.
 func (sm *stateMachine) matchNext() {
 	if sm.position >= len(sm.input) {
 		return
@@ -146,7 +146,7 @@ func (sm *stateMachine) matchNext() {
 	sm.position++
 }
 
-// tryContraction matches contractions
+// tryContraction matches contractions.
 func (sm *stateMachine) tryContraction() string {
 	if sm.position >= len(sm.input) || sm.input[sm.position] != '\'' {
 		return ""
@@ -166,7 +166,7 @@ func (sm *stateMachine) tryContraction() string {
 	return ""
 }
 
-// tryWordWithPrefix matches [^\r\n\p{L}\p{N}]?\p{L}+
+// tryWordWithPrefix matches [^\r\n\p{L}\p{N}]?\p{L}+.
 func (sm *stateMachine) tryWordWithPrefix() string {
 	start := sm.position
 
@@ -193,7 +193,7 @@ func (sm *stateMachine) tryWordWithPrefix() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryNumbers matches \p{N}{1,3}
+// tryNumbers matches \p{N}{1,3}.
 func (sm *stateMachine) tryNumbers() string {
 	if sm.position >= len(sm.input) || !isNumber(sm.input[sm.position]) {
 		return ""
@@ -209,7 +209,7 @@ func (sm *stateMachine) tryNumbers() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryPunctuationWithSpace matches ?[^\s\p{L}\p{N}]+[\r\n]*
+// tryPunctuationWithSpace matches ?[^\s\p{L}\p{N}]+[\r\n]*.
 func (sm *stateMachine) tryPunctuationWithSpace() string {
 	start := sm.position
 
@@ -249,7 +249,7 @@ func (sm *stateMachine) tryPunctuationWithSpace() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryNewlineSequence matches \s*[\r\n]+
+// tryNewlineSequence matches \s*[\r\n]+.
 func (sm *stateMachine) tryNewlineSequence() string {
 	start := sm.position
 
@@ -277,7 +277,7 @@ func (sm *stateMachine) tryNewlineSequence() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryWhitespace matches \s+(?!\S) or \s+
+// tryWhitespace matches \s+(?!\S) or \s+.
 func (sm *stateMachine) tryWhitespace() string {
 	if sm.position >= len(sm.input) || !isWhitespace(sm.input[sm.position]) {
 		return ""
@@ -302,7 +302,7 @@ func (sm *stateMachine) tryWhitespace() string {
 	return string(sm.input[start:sm.position])
 }
 
-// matchesAt checks if a string matches at current position (case-insensitive if specified)
+// matchesAt checks if a string matches at current position (case-insensitive if specified).
 func (sm *stateMachine) matchesAt(s string, caseInsensitive bool) bool {
 	runes := []rune(s)
 	if sm.position+len(runes) > len(sm.input) {
@@ -325,7 +325,7 @@ func (sm *stateMachine) matchesAt(s string, caseInsensitive bool) bool {
 	return true
 }
 
-// Character classification helpers
+// Character classification helpers.
 func isLetter(r rune) bool {
 	return unicode.IsLetter(r)
 }
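
The inline comment added at tokenBufPool.Put alludes to the usual lint advice (staticcheck's SA6002) that values placed in a sync.Pool should be pointer-like, since storing a slice header in an interface{} allocates a copy of the header on every Put. A sketch of the pointer-based alternative the commit declines, assuming SA6002-style advice is what the comment is answering (tokenBufPtrPool and borrowAndReturn are hypothetical names):

package pretokenizer

import "sync"

// Storing *[]string means Put/Get move a single pointer instead of
// boxing a three-word slice header into an interface{} each time.
var tokenBufPtrPool = sync.Pool{
	New: func() interface{} {
		buf := make([]string, 0, 64)
		return &buf
	},
}

func borrowAndReturn() {
	bufPtr := tokenBufPtrPool.Get().(*[]string)
	buf := append((*bufPtr)[:0], "example", "tokens")
	_ = buf                     // ... use the buffer ...
	*bufPtr = buf[:0]           // reset length, keep capacity
	tokenBufPtrPool.Put(bufPtr) // pointer-sized value, no header copy
}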

llama3/internal/testing/vectors.go

Lines changed: 7 additions & 6 deletions
@@ -6,14 +6,14 @@ import (
 	"unicode"
 )
 
-// TestCase represents a test case with metadata
+// TestCase represents a test case with metadata.
 type TestCase struct {
 	Input       string
 	Description string
 	Category    string // "edge", "unicode", "whitespace", "contraction", etc.
 }
 
-// GenerateTestCases creates comprehensive test cases
+// GenerateTestCases creates comprehensive test cases.
 func GenerateTestCases() []TestCase {
 	var cases []TestCase
 
@@ -87,9 +87,10 @@ func GenerateTestCases() []TestCase {
 			Category:    "contraction",
 		})
 		// Title case
+		titleCaseWord := strings.ToUpper(word[:1]) + strings.ToLower(word[1:])
 		cases = append(cases, TestCase{
-			Input:       strings.Title(word) + contraction,
-			Description: fmt.Sprintf("Title case: %s%s", strings.Title(word), contraction),
+			Input:       titleCaseWord + contraction,
+			Description: fmt.Sprintf("Title case: %s%s", titleCaseWord, contraction),
 			Category:    "contraction",
 		})
 	}
@@ -249,12 +250,12 @@ func GenerateTestCases() []TestCase {
 	return cases
 }
 
-// GenerateTestVectorString creates a string representation for comparison
+// GenerateTestVectorString creates a string representation for comparison.
 func GenerateTestVectorString(input string, tokens []int) string {
 	return fmt.Sprintf(`{"input":%q,"expected":%v}`, input, tokens)
 }
 
-// ValidateTokenization checks if tokenization is valid
+// ValidateTokenization checks if tokenization is valid.
 func ValidateTokenization(input string, tokens []string) error {
 	// Reconstruct the input from tokens
 	reconstructed := strings.Join(tokens, "")
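
strings.Title has been deprecated since Go 1.18 because its word-boundary rules mishandle Unicode punctuation; the standard library docs point to golang.org/x/text/cases as the replacement. The inline substitute above upper-cases the first byte, which is fine for the ASCII contraction words used here but not for words starting with a multi-byte rune. A sketch of both variants (titleCaseASCII and titleCaseRune are hypothetical helpers):

package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// titleCaseASCII mirrors the commit's inline replacement: upper-case the
// first byte, lower-case the rest. Safe for ASCII-leading words.
func titleCaseASCII(word string) string {
	if word == "" {
		return word
	}
	return strings.ToUpper(word[:1]) + strings.ToLower(word[1:])
}

// titleCaseRune is a rune-aware variant for non-ASCII first letters.
func titleCaseRune(word string) string {
	r, size := utf8.DecodeRuneInString(word)
	if r == utf8.RuneError && size <= 1 {
		return word
	}
	return string(unicode.ToTitle(r)) + strings.ToLower(word[size:])
}

func main() {
	fmt.Println(titleCaseASCII("don"))  // Don
	fmt.Println(titleCaseRune("école")) // École
}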
