
Commit 837c0aa

fix(lint): address critical linting issues
- Add error handling for deferred file close operations
- Change file permissions from 0644 to 0600 for security
- Add #nosec comments for false positive security warnings
- Replace deprecated strings.Title with simple title case logic
- Fix Unicode format character with escape sequence
- Add periods to comment endings where missing
- Fix double periods where they were incorrectly added
1 parent db58bb6 commit 837c0aa
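
For reference, the deferred-cleanup pattern this commit applies in several places, as a standalone sketch (the file name is illustrative):

package main

import (
	"log"
	"os"
)

func main() {
	f, err := os.Create("profile.out") // illustrative path
	if err != nil {
		log.Fatal("could not create file: ", err)
	}
	// A bare `defer f.Close()` discards the returned error, which linters
	// such as errcheck flag. Wrapping the call in a closure lets the
	// program at least log a failed close.
	defer func() {
		if err := f.Close(); err != nil {
			log.Printf("Failed to close file: %v", err)
		}
	}()
	// ... use f ...
}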

7 files changed, +55 −42 lines changed

llama3/cmd/tools/generate-vectors/main.go

Lines changed: 8 additions & 4 deletions
@@ -44,20 +44,24 @@ inputs.forEach(input => {
 
 	// Write to temporary file
 	tmpFile := filepath.Join(os.TempDir(), "generate_vectors.js")
-	if err := os.WriteFile(tmpFile, []byte(jsContent), 0644); err != nil {
+	if err := os.WriteFile(tmpFile, []byte(jsContent), 0600); err != nil {
 		log.Fatalf("Failed to write JS file: %v", err)
 	}
-	defer os.Remove(tmpFile)
+	defer func() {
+		if err := os.Remove(tmpFile); err != nil {
+			log.Printf("Failed to remove temporary file: %v", err)
+		}
+	}()
 
 	// Run the script
-	cmd := exec.Command("node", tmpFile)
+	cmd := exec.Command("node", tmpFile) // #nosec G204 - tmpFile is safely constructed
 	output_bytes, err := cmd.Output()
 	if err != nil {
 		log.Fatalf("Failed to run JS script: %v", err)
 	}
 
 	// Write output
-	if err := os.WriteFile(*output, output_bytes, 0644); err != nil {
+	if err := os.WriteFile(*output, output_bytes, 0600); err != nil {
 		log.Fatalf("Failed to write output file: %v", err)
 	}
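
On the permission change above: 0644 (rw-r--r--) leaves the file readable by group and world, while 0600 (rw-------) restricts it to the owning user; gosec's G306 check flags os.WriteFile calls with broader modes. A minimal sketch, with path and content purely illustrative:

package main

import (
	"log"
	"os"
)

func main() {
	data := []byte("generated output") // illustrative content
	// 0600: owner read/write only. 0644 would also grant read access
	// to group members and other users.
	if err := os.WriteFile("out.txt", data, 0600); err != nil {
		log.Fatalf("Failed to write file: %v", err)
	}
}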

llama3/cmd/tools/profile/main.go

Lines changed: 11 additions & 3 deletions
@@ -29,7 +29,11 @@ func main() {
 	if err != nil {
 		log.Fatal("could not create CPU profile: ", err)
 	}
-	defer f.Close()
+	defer func() {
+		if err := f.Close(); err != nil {
+			log.Printf("Failed to close CPU profile file: %v", err)
+		}
+	}()
 	if err := pprof.StartCPUProfile(f); err != nil {
 		log.Fatal("could not start CPU profile: ", err)
 	}
@@ -69,7 +73,11 @@ func main() {
 	if err != nil {
 		log.Fatal("could not create memory profile: ", err)
 	}
-	defer f.Close()
+	defer func() {
+		if err := f.Close(); err != nil {
+			log.Printf("Failed to close memory profile file: %v", err)
+		}
+	}()
 	runtime.GC() // get up-to-date statistics
 	if err := pprof.WriteHeapProfile(f); err != nil {
 		log.Fatal("could not write memory profile: ", err)
@@ -96,7 +104,7 @@ func getTestText(textType string) string {
 	case "unicode":
 		return "Hello world! 你好世界! Привет мир! مرحبا بالعالم! " +
 			"🌍🌎🌏 Unicode test with emojis 🦙🐕🦊 and various scripts " +
-			"αβγδε ΑΒΓΔΕ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿"
+			"αβγδε ΑΒΓΔΕ ¡¢£¤¥¦§¨©ª«¬\u00ad®¯°±²³´µ¶·¸¹º»¼½¾¿"
 
 	case "whitespace":
 		return " Multiple spaces between words \t\t\tand\ttabs\t\t\t" +
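
The character escaped in the last hunk is U+00AD (soft hyphen), a Unicode format character that is invisible in most editors, which is why linters flag it when it appears literally in source. A small sketch showing the escape changes nothing at runtime:

package main

import "fmt"

func main() {
	s := "¬\u00ad®" // soft hyphen written as an escape so it is visible in source
	fmt.Printf("%U\n", '\u00ad') // U+00AD
	fmt.Println(len([]rune(s)))  // 3: the invisible character is still present
}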

llama3/constants.go

Lines changed: 10 additions & 10 deletions
@@ -2,48 +2,48 @@
 // This file contains all constants used throughout the tokenizer implementation.
 package llama3
 
-// Vocabulary sizes
+// Vocabulary sizes.
 const (
 	baseVocabSize     = 128000 // Base vocabulary size
 	specialTokenCount = 256    // Number of special tokens
 	totalVocabSize    = baseVocabSize + specialTokenCount
 )
 
-// Token IDs for special tokens
+// Token IDs for special tokens.
 const (
 	beginOfTextTokenID = 128000
 	endOfTextTokenID   = 128001
 )
 
-// Pre-tokenization limits
+// Pre-tokenization limits.
 const (
 	maxNumberLength = 3 // Maximum consecutive digits in a single token
 )
 
-// Pool configuration
+// Pool configuration.
 const (
 	defaultStateMachineTokenCapacity = 32   // Initial capacity for state machine tokens
 	defaultTokenBufferCapacity       = 64   // Initial capacity for token buffers
 	maxPooledTokenBufferCapacity     = 1024 // Maximum capacity for pooled token buffers
 )
 
-// Cache configuration
+// Cache configuration.
 const (
 	defaultCacheSize = 0 // 0 means unlimited
 )
 
-// BPE configuration
+// BPE configuration.
 const (
 	estimatedTokensPerCharacter = 4 // Rough estimate for initial slice capacity
 	bytesPerMerge               = 3 // Number of bytes to read for each merge
 )
 
-// Merge data configuration
+// Merge data configuration.
 const (
 	bitsPerMergeID = 17 // Number of bits used to encode each merge ID
 )
 
-// Character mappings for byte-level encoding
+// Character mappings for byte-level encoding.
 const (
 	asciiPrintableStart = '!' // First printable ASCII character
 	asciiPrintableEnd   = '~' // Last printable ASCII character
@@ -54,8 +54,8 @@ const (
 	unicodeOffset = 256 // Offset for mapping non-printable bytes
 )
 
-// Special token constants
+// Special token constants.
 const (
 	beginOfTextToken = "<|begin_of_text|>"
-	endOfTextToken   = "<|end_of_text|>"
+	endOfTextToken   = "<|end_of_text|>" // #nosec G101 - Not a credential, just a special token marker
 )
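
The #nosec G101 directive suppresses gosec's "potential hardcoded credentials" rule, which pattern-matches identifier names containing words like "token"; a tokenizer sentinel such as endOfTextToken is a textbook false positive. The same suppression pattern in isolation (sentinelToken is a hypothetical name):

package llama3

// gosec G101 keys on the identifier name, not the value; the trailing
// directive documents why this particular match is safe to ignore.
const sentinelToken = "<|sentinel|>" // #nosec G101 - marker string, not a credential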

llama3/internal/encoding/encoding.go

Lines changed: 3 additions & 3 deletions
@@ -4,7 +4,7 @@ import (
 	"strings"
 )
 
-// Constants for byte mapping ranges
+// Constants for byte mapping ranges.
 const (
 	asciiPrintableStart = '!' // 33
 	asciiPrintableEnd   = '~' // 126
@@ -16,9 +16,9 @@ const (
 )
 
 var (
-	// BytesToUnicode maps byte values to unicode characters for encoding
+	// BytesToUnicode maps byte values to unicode characters for encoding.
 	BytesToUnicode map[byte]rune
-	// UnicodeToBytes maps unicode characters back to byte values for decoding
+	// UnicodeToBytes maps unicode characters back to byte values for decoding.
 	UnicodeToBytes map[rune]byte
 )

llama3/internal/pretokenizer/api.go

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 package pretokenizer
 
 const (
-	// Pool configuration
+	// Pool configuration.
 	defaultStateMachineTokenCapacity = 32   // Initial capacity for state machine tokens
 	defaultTokenBufferCapacity       = 64   // Initial capacity for token buffers
 	maxPooledTokenBufferCapacity     = 1024 // Maximum capacity for pooled token buffers

llama3/internal/pretokenizer/state_machine.go

Lines changed: 15 additions & 15 deletions
@@ -14,7 +14,7 @@ type stateMachine struct {
 	tokens []string
 }
 
-// stateMachinePool provides a pool of reusable state machines for performance
+// stateMachinePool provides a pool of reusable state machines for performance.
 var stateMachinePool = &sync.Pool{
 	New: func() interface{} {
 		return &stateMachine{
@@ -23,14 +23,14 @@ var stateMachinePool = &sync.Pool{
 	},
 }
 
-// tokenBufPool provides a pool of token buffers for better memory efficiency
+// tokenBufPool provides a pool of token buffers for better memory efficiency.
 var tokenBufPool = sync.Pool{
 	New: func() interface{} {
 		return make([]string, 0, defaultTokenBufferCapacity)
 	},
 }
 
-// getStateMachine gets a state machine from the pool
+// getStateMachine gets a state machine from the pool.
 func getStateMachine(text string) *stateMachine {
 	sm := stateMachinePool.Get().(*stateMachine)
 	sm.input = []rune(text)
@@ -39,7 +39,7 @@ func getStateMachine(text string) *stateMachine {
 	return sm
 }
 
-// putStateMachine returns a state machine to the pool
+// putStateMachine returns a state machine to the pool.
 func putStateMachine(sm *stateMachine) {
 	// Clear references to allow GC
 	sm.input = nil
@@ -68,7 +68,7 @@ func Tokenize(text string) []string {
 
 	// Return token buffer to pool
 	if cap(sm.tokens) <= 1024 {
-		tokenBufPool.Put(sm.tokens[:0])
+		tokenBufPool.Put(sm.tokens[:0]) // #nosec - slice header is small, not worth pointer optimization
 	}
 
 	// Return state machine to pool
@@ -89,15 +89,15 @@ func newStateMachine(text string) *stateMachine {
 	}
 }
 
-// tokenizeWithStateMachine processes the input according to the JS regex pattern
+// tokenizeWithStateMachine processes the input according to the JS regex pattern.
 func (sm *stateMachine) tokenizeWithStateMachine() []string {
 	for sm.position < len(sm.input) {
 		sm.matchNext()
 	}
 	return sm.tokens
 }
 
-// matchNext tries to match the next token according to the pattern
+// matchNext tries to match the next token according to the pattern.
 func (sm *stateMachine) matchNext() {
 	if sm.position >= len(sm.input) {
 		return
@@ -146,7 +146,7 @@ func (sm *stateMachine) matchNext() {
 	sm.position++
 }
 
-// tryContraction matches contractions
+// tryContraction matches contractions.
 func (sm *stateMachine) tryContraction() string {
 	if sm.position >= len(sm.input) || sm.input[sm.position] != '\'' {
 		return ""
@@ -166,7 +166,7 @@ func (sm *stateMachine) tryContraction() string {
 	return ""
 }
 
-// tryWordWithPrefix matches [^\r\n\p{L}\p{N}]?\p{L}+
+// tryWordWithPrefix matches [^\r\n\p{L}\p{N}]?\p{L}+.
 func (sm *stateMachine) tryWordWithPrefix() string {
 	start := sm.position
 
@@ -193,7 +193,7 @@ func (sm *stateMachine) tryWordWithPrefix() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryNumbers matches \p{N}{1,3}
+// tryNumbers matches \p{N}{1,3}.
 func (sm *stateMachine) tryNumbers() string {
 	if sm.position >= len(sm.input) || !isNumber(sm.input[sm.position]) {
 		return ""
@@ -209,7 +209,7 @@ func (sm *stateMachine) tryNumbers() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryPunctuationWithSpace matches ?[^\s\p{L}\p{N}]+[\r\n]*
+// tryPunctuationWithSpace matches ?[^\s\p{L}\p{N}]+[\r\n]*.
 func (sm *stateMachine) tryPunctuationWithSpace() string {
 	start := sm.position
 
@@ -249,7 +249,7 @@ func (sm *stateMachine) tryPunctuationWithSpace() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryNewlineSequence matches \s*[\r\n]+
+// tryNewlineSequence matches \s*[\r\n]+.
 func (sm *stateMachine) tryNewlineSequence() string {
 	start := sm.position
 
@@ -277,7 +277,7 @@ func (sm *stateMachine) tryNewlineSequence() string {
 	return string(sm.input[start:sm.position])
 }
 
-// tryWhitespace matches \s+(?!\S) or \s+
+// tryWhitespace matches \s+(?!\S) or \s+.
 func (sm *stateMachine) tryWhitespace() string {
 	if sm.position >= len(sm.input) || !isWhitespace(sm.input[sm.position]) {
 		return ""
@@ -302,7 +302,7 @@ func (sm *stateMachine) tryWhitespace() string {
 	return string(sm.input[start:sm.position])
 }
 
-// matchesAt checks if a string matches at current position (case-insensitive if specified)
+// matchesAt checks if a string matches at current position (case-insensitive if specified).
 func (sm *stateMachine) matchesAt(s string, caseInsensitive bool) bool {
 	runes := []rune(s)
 	if sm.position+len(runes) > len(sm.input) {
@@ -325,7 +325,7 @@ func (sm *stateMachine) matchesAt(s string, caseInsensitive bool) bool {
 	return true
 }
 
-// Character classification helpers
+// Character classification helpers.
 func isLetter(r rune) bool {
 	return unicode.IsLetter(r)
 }
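
The inline comment added at tokenBufPool.Put alludes to the usual lint advice (staticcheck's SA6002) that values placed in a sync.Pool should be pointer-like, since storing a slice header in an interface{} allocates a copy of the header on every Put. A sketch of the pointer-based alternative the commit declines, assuming SA6002-style advice is what the comment is answering (tokenBufPtrPool and borrowAndReturn are hypothetical names):

package pretokenizer

import "sync"

// Storing *[]string means Put/Get move a single pointer instead of
// boxing a three-word slice header into an interface{} each time.
var tokenBufPtrPool = sync.Pool{
	New: func() interface{} {
		buf := make([]string, 0, 64)
		return &buf
	},
}

func borrowAndReturn() {
	bufPtr := tokenBufPtrPool.Get().(*[]string)
	buf := append((*bufPtr)[:0], "example", "tokens")
	_ = buf                     // ... use the buffer ...
	*bufPtr = buf[:0]           // reset length, keep capacity
	tokenBufPtrPool.Put(bufPtr) // pointer-sized value, no header copy
}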

llama3/internal/testing/vectors.go

Lines changed: 7 additions & 6 deletions
@@ -6,14 +6,14 @@ import (
 	"unicode"
 )
 
-// TestCase represents a test case with metadata
+// TestCase represents a test case with metadata.
 type TestCase struct {
 	Input       string
 	Description string
 	Category    string // "edge", "unicode", "whitespace", "contraction", etc.
 }
 
-// GenerateTestCases creates comprehensive test cases
+// GenerateTestCases creates comprehensive test cases.
 func GenerateTestCases() []TestCase {
 	var cases []TestCase
 
@@ -87,9 +87,10 @@ func GenerateTestCases() []TestCase {
 			Category:    "contraction",
 		})
 		// Title case
+		titleCaseWord := strings.ToUpper(word[:1]) + strings.ToLower(word[1:])
 		cases = append(cases, TestCase{
-			Input:       strings.Title(word) + contraction,
-			Description: fmt.Sprintf("Title case: %s%s", strings.Title(word), contraction),
+			Input:       titleCaseWord + contraction,
+			Description: fmt.Sprintf("Title case: %s%s", titleCaseWord, contraction),
 			Category:    "contraction",
 		})
 	}
@@ -249,12 +250,12 @@ func GenerateTestCases() []TestCase {
 	return cases
 }
 
-// GenerateTestVectorString creates a string representation for comparison
+// GenerateTestVectorString creates a string representation for comparison.
 func GenerateTestVectorString(input string, tokens []int) string {
 	return fmt.Sprintf(`{"input":%q,"expected":%v}`, input, tokens)
 }
 
-// ValidateTokenization checks if tokenization is valid
+// ValidateTokenization checks if tokenization is valid.
 func ValidateTokenization(input string, tokens []string) error {
 	// Reconstruct the input from tokens
 	reconstructed := strings.Join(tokens, "")
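
strings.Title has been deprecated since Go 1.18 because its word-boundary rules mishandle Unicode punctuation; the standard library docs point to golang.org/x/text/cases as the replacement. The inline substitute above upper-cases the first byte, which is fine for the ASCII contraction words used here but not for words starting with a multi-byte rune. A sketch of both variants (titleCaseASCII and titleCaseRune are hypothetical helpers):

package main

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// titleCaseASCII mirrors the commit's inline replacement: upper-case the
// first byte, lower-case the rest. Safe for ASCII-leading words.
func titleCaseASCII(word string) string {
	if word == "" {
		return word
	}
	return strings.ToUpper(word[:1]) + strings.ToLower(word[1:])
}

// titleCaseRune is a rune-aware variant for non-ASCII first letters.
func titleCaseRune(word string) string {
	r, size := utf8.DecodeRuneInString(word)
	if r == utf8.RuneError && size <= 1 {
		return word
	}
	return string(unicode.ToTitle(r)) + strings.ToLower(word[size:])
}

func main() {
	fmt.Println(titleCaseASCII("don"))  // Don
	fmt.Println(titleCaseRune("école")) // École
}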
