Skip to content

Commit 68bd0a6

Browse files
committed
fix(lint): address remaining linting issues
- Remove duplicate/unused constants from constants.go - Remove unused test helper functions - Add package comment for internal/testing - Use exec.CommandContext instead of exec.Command - Preallocate slice in test vector generation - Fix unused parameter warnings with _ - Use constants instead of string literals - Add nolint comment for staticcheck SA6002 Only remaining issue is cyclomatic complexity in scanner.Scan()
1 parent 3654ee1 commit 68bd0a6

File tree

7 files changed

+14
-62
lines changed

7 files changed

+14
-62
lines changed

cmd/tokenizer/root.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Common operations available for tokenizers:
4444
var versionCmd = &cobra.Command{
4545
Use: "version",
4646
Short: "Print version information",
47-
Run: func(_ *cobra.Command, args []string) {
47+
Run: func(_ *cobra.Command, _ []string) {
4848
fmt.Printf("tokenizer version %s\n", version)
4949
if commit != "none" {
5050
fmt.Printf(" commit: %s\n", commit)

llama3/cmd/llama3/info.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ configuration.`,
2626
return cmd
2727
}
2828

29-
func runInfo(_ *cobra.Command, args []string) error {
29+
func runInfo(_ *cobra.Command, _ []string) error {
3030
// Initialize tokenizer
3131
tokenizer, err := llama3.New()
3232
if err != nil {

llama3/cmd/llama3/stream.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ Input is read from stdin only.`,
6464
return cmd
6565
}
6666

67-
func runStream(_ *cobra.Command, args []string) error {
67+
func runStream(_ *cobra.Command, _ []string) error {
6868

6969
// Validate output format
70-
if streamOutput != "space" && streamOutput != "newline" {
70+
if streamOutput != outputFormatSpace && streamOutput != outputFormatNewline {
7171
return fmt.Errorf("invalid output format %q: must be 'space' or 'newline'", streamOutput)
7272
}
7373

@@ -98,7 +98,7 @@ func runStream(_ *cobra.Command, args []string) error {
9898
switch streamOutput {
9999
case outputFormatNewline:
100100
fmt.Println(token)
101-
case "space":
101+
case outputFormatSpace:
102102
if !first {
103103
fmt.Print(" ")
104104
}

llama3/cmd/tools/generate-vectors/main.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
package main
33

44
import (
5+
"context"
56
"encoding/json"
67
"flag"
78
"fmt"
89
"log"
910
"os"
1011
"os/exec"
1112
"path/filepath"
13+
"time"
1214
)
1315

1416
func main() {
@@ -54,8 +56,10 @@ inputs.forEach(input => {
5456
}
5557
}()
5658

57-
// Run the script
58-
cmd := exec.Command("node", tmpFile) // #nosec G204 - tmpFile is safely constructed
59+
// Run the script with timeout
60+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
61+
defer cancel()
62+
cmd := exec.CommandContext(ctx, "node", tmpFile) // #nosec G204 - tmpFile is safely constructed
5963
outputBytes, err := cmd.Output()
6064
if err != nil {
6165
log.Fatalf("Failed to run JS script: %v", err)

llama3/constants.go

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,6 @@ const (
99
totalVocabSize = baseVocabSize + specialTokenCount
1010
)
1111

12-
// Token IDs for special tokens.
13-
const (
14-
beginOfTextTokenID = 128000
15-
endOfTextTokenID = 128001
16-
)
17-
18-
// Pre-tokenization limits.
19-
const (
20-
maxNumberLength = 3 // Maximum consecutive digits in a single token
21-
)
22-
23-
// Pool configuration.
24-
const (
25-
defaultStateMachineTokenCapacity = 32 // Initial capacity for state machine tokens
26-
defaultTokenBufferCapacity = 64 // Initial capacity for token buffers
27-
maxPooledTokenBufferCapacity = 1024 // Maximum capacity for pooled token buffers
28-
)
29-
3012
// Cache configuration.
3113
const (
3214
defaultCacheSize = 0 // 0 means unlimited
@@ -38,22 +20,6 @@ const (
3820
bytesPerMerge = 3 // Number of bytes to read for each merge
3921
)
4022

41-
// Merge data configuration.
42-
const (
43-
bitsPerMergeID = 17 // Number of bits used to encode each merge ID
44-
)
45-
46-
// Character mappings for byte-level encoding.
47-
const (
48-
asciiPrintableStart = '!' // First printable ASCII character
49-
asciiPrintableEnd = '~' // Last printable ASCII character
50-
extendedASCIIStart1 = '¡' // First extended ASCII range start
51-
extendedASCIIEnd1 = '¬' // First extended ASCII range end
52-
extendedASCIIStart2 = '®' // Second extended ASCII range start
53-
extendedASCIIEnd2 = 'ÿ' // Second extended ASCII range end
54-
unicodeOffset = 256 // Offset for mapping non-printable bytes
55-
)
56-
5723
// Special token constants.
5824
const (
5925
beginOfTextToken = "<|begin_of_text|>"

llama3/internal/pretokenizer/state_machine.go

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func Tokenize(text string) []string {
6868

6969
// Return token buffer to pool
7070
if cap(sm.tokens) <= 1024 {
71-
tokenBufPool.Put(sm.tokens[:0]) // #nosec - slice header is small, not worth pointer optimization
71+
tokenBufPool.Put(sm.tokens[:0]) //nolint:staticcheck // slice header is small, not worth pointer optimization
7272
}
7373

7474
// Return state machine to pool
@@ -78,25 +78,6 @@ func Tokenize(text string) []string {
7878
return result
7979
}
8080

81-
// newStateMachine creates a new state machine for tokenizing the given text.
82-
// This function is primarily used for testing. In production, use Tokenize()
83-
// which uses pooled state machines for better performance.
84-
func newStateMachine(text string) *stateMachine {
85-
return &stateMachine{
86-
input: []rune(text),
87-
position: 0,
88-
tokens: make([]string, 0),
89-
}
90-
}
91-
92-
// tokenizeWithStateMachine processes the input according to the JS regex pattern.
93-
func (sm *stateMachine) tokenizeWithStateMachine() []string {
94-
for sm.position < len(sm.input) {
95-
sm.matchNext()
96-
}
97-
return sm.tokens
98-
}
99-
10081
// matchNext tries to match the next token according to the pattern.
10182
func (sm *stateMachine) matchNext() {
10283
if sm.position >= len(sm.input) {

llama3/internal/testing/vectors.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// Package testing provides test vector generation utilities.
12
package testing
23

34
import (
@@ -15,7 +16,7 @@ type TestCase struct {
1516

1617
// GenerateTestCases creates comprehensive test cases.
1718
func GenerateTestCases() []TestCase {
18-
var cases []TestCase
19+
cases := make([]TestCase, 0, 200) // Preallocate for typical test case count
1920

2021
// Edge cases
2122
cases = append(cases, []TestCase{

0 commit comments

Comments (0)