Skip to content

Commit 68bd0a6

Browse files
committed
fix(lint): address remaining linting issues
- Remove duplicate/unused constants from constants.go - Remove unused test helper functions - Add package comment for internal/testing - Use exec.CommandContext instead of exec.Command - Preallocate slice in test vector generation - Fix unused parameter warnings with _ - Use constants instead of string literals - Add nolint comment for staticcheck SA6002 Only remaining issue is cyclomatic complexity in scanner.Scan()
1 parent 3654ee1 commit 68bd0a6

File tree

7 files changed

+14
-62
lines changed

7 files changed

+14
-62
lines changed

cmd/tokenizer/root.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Common operations available for tokenizers:
4444
var versionCmd = &cobra.Command{
4545
Use: "version",
4646
Short: "Print version information",
47-
Run: func(_ *cobra.Command, args []string) {
47+
Run: func(_ *cobra.Command, _ []string) {
4848
fmt.Printf("tokenizer version %s\n", version)
4949
if commit != "none" {
5050
fmt.Printf(" commit: %s\n", commit)

llama3/cmd/llama3/info.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ configuration.`,
2626
return cmd
2727
}
2828

29-
func runInfo(_ *cobra.Command, args []string) error {
29+
func runInfo(_ *cobra.Command, _ []string) error {
3030
// Initialize tokenizer
3131
tokenizer, err := llama3.New()
3232
if err != nil {

llama3/cmd/llama3/stream.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ Input is read from stdin only.`,
6464
return cmd
6565
}
6666

67-
func runStream(_ *cobra.Command, args []string) error {
67+
func runStream(_ *cobra.Command, _ []string) error {
6868

6969
// Validate output format
70-
if streamOutput != "space" && streamOutput != "newline" {
70+
if streamOutput != outputFormatSpace && streamOutput != outputFormatNewline {
7171
return fmt.Errorf("invalid output format %q: must be 'space' or 'newline'", streamOutput)
7272
}
7373

@@ -98,7 +98,7 @@ func runStream(_ *cobra.Command, args []string) error {
9898
switch streamOutput {
9999
case outputFormatNewline:
100100
fmt.Println(token)
101-
case "space":
101+
case outputFormatSpace:
102102
if !first {
103103
fmt.Print(" ")
104104
}

llama3/cmd/tools/generate-vectors/main.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
package main
33

44
import (
5+
"context"
56
"encoding/json"
67
"flag"
78
"fmt"
89
"log"
910
"os"
1011
"os/exec"
1112
"path/filepath"
13+
"time"
1214
)
1315

1416
func main() {
@@ -54,8 +56,10 @@ inputs.forEach(input => {
5456
}
5557
}()
5658

57-
// Run the script
58-
cmd := exec.Command("node", tmpFile) // #nosec G204 - tmpFile is safely constructed
59+
// Run the script with timeout
60+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
61+
defer cancel()
62+
cmd := exec.CommandContext(ctx, "node", tmpFile) // #nosec G204 - tmpFile is safely constructed
5963
outputBytes, err := cmd.Output()
6064
if err != nil {
6165
log.Fatalf("Failed to run JS script: %v", err)

llama3/constants.go

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,6 @@ const (
99
totalVocabSize = baseVocabSize + specialTokenCount
1010
)
1111

12-
// Token IDs for special tokens.
13-
const (
14-
beginOfTextTokenID = 128000
15-
endOfTextTokenID = 128001
16-
)
17-
18-
// Pre-tokenization limits.
19-
const (
20-
maxNumberLength = 3 // Maximum consecutive digits in a single token
21-
)
22-
23-
// Pool configuration.
24-
const (
25-
defaultStateMachineTokenCapacity = 32 // Initial capacity for state machine tokens
26-
defaultTokenBufferCapacity = 64 // Initial capacity for token buffers
27-
maxPooledTokenBufferCapacity = 1024 // Maximum capacity for pooled token buffers
28-
)
29-
3012
// Cache configuration.
3113
const (
3214
defaultCacheSize = 0 // 0 means unlimited
@@ -38,22 +20,6 @@ const (
3820
bytesPerMerge = 3 // Number of bytes to read for each merge
3921
)
4022

41-
// Merge data configuration.
42-
const (
43-
bitsPerMergeID = 17 // Number of bits used to encode each merge ID
44-
)
45-
46-
// Character mappings for byte-level encoding.
47-
const (
48-
asciiPrintableStart = '!' // First printable ASCII character
49-
asciiPrintableEnd = '~' // Last printable ASCII character
50-
extendedASCIIStart1 = '¡' // First extended ASCII range start
51-
extendedASCIIEnd1 = '¬' // First extended ASCII range end
52-
extendedASCIIStart2 = '®' // Second extended ASCII range start
53-
extendedASCIIEnd2 = 'ÿ' // Second extended ASCII range end
54-
unicodeOffset = 256 // Offset for mapping non-printable bytes
55-
)
56-
5723
// Special token constants.
5824
const (
5925
beginOfTextToken = "<|begin_of_text|>"

llama3/internal/pretokenizer/state_machine.go

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func Tokenize(text string) []string {
6868

6969
// Return token buffer to pool
7070
if cap(sm.tokens) <= 1024 {
71-
tokenBufPool.Put(sm.tokens[:0]) // #nosec - slice header is small, not worth pointer optimization
71+
tokenBufPool.Put(sm.tokens[:0]) //nolint:staticcheck // slice header is small, not worth pointer optimization
7272
}
7373

7474
// Return state machine to pool
@@ -78,25 +78,6 @@ func Tokenize(text string) []string {
7878
return result
7979
}
8080

81-
// newStateMachine creates a new state machine for tokenizing the given text.
82-
// This function is primarily used for testing. In production, use Tokenize()
83-
// which uses pooled state machines for better performance.
84-
func newStateMachine(text string) *stateMachine {
85-
return &stateMachine{
86-
input: []rune(text),
87-
position: 0,
88-
tokens: make([]string, 0),
89-
}
90-
}
91-
92-
// tokenizeWithStateMachine processes the input according to the JS regex pattern.
93-
func (sm *stateMachine) tokenizeWithStateMachine() []string {
94-
for sm.position < len(sm.input) {
95-
sm.matchNext()
96-
}
97-
return sm.tokens
98-
}
99-
10081
// matchNext tries to match the next token according to the pattern.
10182
func (sm *stateMachine) matchNext() {
10283
if sm.position >= len(sm.input) {

llama3/internal/testing/vectors.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// Package testing provides test vector generation utilities.
12
package testing
23

34
import (
@@ -15,7 +16,7 @@ type TestCase struct {
1516

1617
// GenerateTestCases creates comprehensive test cases.
1718
func GenerateTestCases() []TestCase {
18-
var cases []TestCase
19+
cases := make([]TestCase, 0, 200) // Preallocate for typical test case count
1920

2021
// Edge cases
2122
cases = append(cases, []TestCase{

0 commit comments

Comments (0)