6 changes: 4 additions & 2 deletions README.md
@@ -72,7 +72,7 @@ tokenizer llama3 "Hello, world!"
# Decode tokens
tokenizer llama3 decode 128000 9906 11 1917 0 128001

# Stream large files (automatic pipe detection)
# Process large files (automatic pipe detection)
cat document.txt | tokenizer llama3

# Get tokenizer information
@@ -117,7 +117,7 @@ tokenizer llama3 "Hello, world!"
tokenizer llama3 decode 128000 9906 11 1917 0 128001
# Output: <|begin_of_text|>Hello, world!<|end_of_text|>

# Stream from files (automatic)
# Process from files (automatic)
cat document.txt | tokenizer llama3

# Get help
@@ -224,6 +224,8 @@ MIT
import "github.com/agentstation/tokenizer"
```

Package tokenizer provides a collection of high\-performance tokenizer implementations.

## Index


2 changes: 2 additions & 0 deletions cmd/tokenizer/README.md
@@ -168,6 +168,8 @@ tokenizer [tokenizer-name] [command] [options]
import "github.com/agentstation/tokenizer/cmd/tokenizer"
```

Package main provides the tokenizer CLI tool.

## Index


11 changes: 6 additions & 5 deletions cmd/tokenizer/completion.go
@@ -41,17 +41,18 @@ PowerShell:
DisableFlagsInUseLine: true,
ValidArgs: []string{"bash", "zsh", "fish", "powershell"},
Args: cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs),
Run: func(cmd *cobra.Command, args []string) {
RunE: func(cmd *cobra.Command, args []string) error {
switch args[0] {
case "bash":
cmd.Root().GenBashCompletion(os.Stdout)
return cmd.Root().GenBashCompletion(os.Stdout)
case "zsh":
cmd.Root().GenZshCompletion(os.Stdout)
return cmd.Root().GenZshCompletion(os.Stdout)
case "fish":
cmd.Root().GenFishCompletion(os.Stdout, true)
return cmd.Root().GenFishCompletion(os.Stdout, true)
case "powershell":
cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
return cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
}
return nil
},
}

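The `Run` → `RunE` change lets completion-generation failures propagate to cobra instead of being silently dropped. A minimal standalone sketch of the pattern (illustrative only, not this repository's code):

```go
package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

func main() {
	root := &cobra.Command{Use: "demo"}
	root.AddCommand(&cobra.Command{
		Use:  "completion [bash|zsh]",
		Args: cobra.ExactArgs(1),
		// RunE returns an error, which cobra surfaces from Execute();
		// with Run, a failed generator call would be ignored.
		RunE: func(cmd *cobra.Command, args []string) error {
			switch args[0] {
			case "bash":
				return cmd.Root().GenBashCompletion(os.Stdout)
			case "zsh":
				return cmd.Root().GenZshCompletion(os.Stdout)
			}
			return fmt.Errorf("unsupported shell: %s", args[0])
		},
	})
	if err := root.Execute(); err != nil {
		os.Exit(1)
	}
}
```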
Binary file modified docs/demo.gif
2 changes: 1 addition & 1 deletion docs/demo.svg
44 changes: 17 additions & 27 deletions llama3/README.md
@@ -454,8 +454,7 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
- [func \(t \*Tokenizer\) EncodeBPE\(pretoken string\) \[\]int](<#Tokenizer.EncodeBPE>)
- [func \(t \*Tokenizer\) EncodeBytes\(data \[\]byte, opts \*EncodeOptions\) \[\]int](<#Tokenizer.EncodeBytes>)
- [func \(t \*Tokenizer\) GetSpecialTokenID\(token string\) \(int, error\)](<#Tokenizer.GetSpecialTokenID>)
- [func \(t \*Tokenizer\) NewScanner\(r io.Reader\) Scanner](<#Tokenizer.NewScanner>)
- [func \(t \*Tokenizer\) NewScannerOptions\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScannerOptions>)
- [func \(t \*Tokenizer\) NewScanner\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScanner>)
- [func \(t \*Tokenizer\) OptimisticCount\(text string\) int](<#Tokenizer.OptimisticCount>)
- [func \(t \*Tokenizer\) PreTokenize\(text string\) \[\]string](<#Tokenizer.PreTokenize>)
- [func \(t \*Tokenizer\) Process\(r io.Reader, w io.Writer\) \(int64, error\)](<#Tokenizer.Process>)
@@ -469,25 +468,25 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu

## Variables

<a name="ErrDataNotFound"></a>Common errors
<a name="ErrDataNotFound"></a>Common errors.

```go
var (
// ErrDataNotFound indicates that the tokenizer data files could not be found
// ErrDataNotFound indicates that the tokenizer data files could not be found.
ErrDataNotFound = errors.New("tokenizer data not found")

// ErrInvalidToken indicates an invalid token was provided
// ErrInvalidToken indicates an invalid token was provided.
ErrInvalidToken = errors.New("invalid token")

// ErrTokenNotFound indicates a token was not found in the vocabulary
// ErrTokenNotFound indicates a token was not found in the vocabulary.
ErrTokenNotFound = errors.New("token not found")

// ErrInvalidTokenID indicates an invalid token ID was provided
// ErrInvalidTokenID indicates an invalid token ID was provided.
ErrInvalidTokenID = errors.New("invalid token ID")
)
```

<a name="WithBufferSize"></a>Scanner option functions \- these are re\-exported from the scanner package
<a name="WithBufferSize"></a>Scanner option functions \- these are re\-exported from the scanner package.

```go
var (
@@ -517,7 +516,7 @@ var (
func NewConfigError(field string, value any, err error) error
```

NewConfigError creates a new ConfigError
NewConfigError creates a new ConfigError.

<a name="NewDataError"></a>
## func [NewDataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L81>)
Expand All @@ -526,7 +525,7 @@ NewConfigError creates a new ConfigError
func NewDataError(op, path string, err error) error
```

NewDataError creates a new DataError
NewDataError creates a new DataError.

<a name="NewTokenError"></a>
## func [NewTokenError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L86>)
Expand All @@ -535,7 +534,7 @@ NewDataError creates a new DataError
func NewTokenError(op, token string, err error) error
```

NewTokenError creates a new TokenError
NewTokenError creates a new TokenError.

<a name="NewTokenIDError"></a>
## func [NewTokenIDError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L91>)
@@ -544,7 +543,7 @@ NewTokenError creates a new TokenError
func NewTokenIDError(op string, tokenID int, err error) error
```

NewTokenIDError creates a new TokenError with a token ID
NewTokenIDError creates a new TokenError with a token ID.
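These constructors pair with the sentinel errors declared above: callers wrap a sentinel with operation context, then branch with `errors.Is`. A minimal sketch, assuming the concrete error types unwrap to the sentinel they were given (as the `Unwrap` methods below suggest):

```go
package main

import (
	"errors"
	"fmt"

	"github.com/agentstation/tokenizer/llama3"
)

func main() {
	// Wrap the sentinel with the failing operation and offending ID.
	err := llama3.NewTokenIDError("decode", 999_999, llama3.ErrInvalidTokenID)

	// errors.Is walks the Unwrap chain back to the sentinel.
	if errors.Is(err, llama3.ErrInvalidTokenID) {
		fmt.Println("invalid token ID:", err)
	}
}
```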

<a name="BPE"></a>
## type [BPE](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L80-L84>)
@@ -583,7 +582,7 @@ type Cache interface {
<a name="ConfigError"></a>
## type [ConfigError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L64-L68>)

ConfigError represents an error in tokenizer configuration
ConfigError represents an error in tokenizer configuration.

```go
type ConfigError struct {
@@ -614,7 +613,7 @@ func (e *ConfigError) Unwrap() error
<a name="DataError"></a>
## type [DataError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L24-L28>)

DataError represents an error related to tokenizer data loading or processing
DataError represents an error related to tokenizer data loading or processing.

```go
type DataError struct {
@@ -809,7 +808,7 @@ type ScannerOption = scanner.Option
<a name="TokenError"></a>
## type [TokenError](<https://github.com/agentstation/tokenizer/blob/master/llama3/errors.go#L42-L47>)

TokenError represents an error related to token operations
TokenError represents an error related to token operations.

```go
type TokenError struct {
@@ -1082,22 +1081,13 @@ func main() {
</details>

<a name="Tokenizer.NewScanner"></a>
### func \(\*Tokenizer\) [NewScanner](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L65>)
### func \(\*Tokenizer\) [NewScanner](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L67>)

```go
func (t *Tokenizer) NewScanner(r io.Reader) Scanner
func (t *Tokenizer) NewScanner(r io.Reader, opts ...ScannerOption) Scanner
```

NewScanner creates a scanner for streaming tokenization with default options.

<a name="Tokenizer.NewScannerOptions"></a>
### func \(\*Tokenizer\) [NewScannerOptions](<https://github.com/agentstation/tokenizer/blob/master/llama3/scanner.go#L70>)

```go
func (t *Tokenizer) NewScannerOptions(r io.Reader, opts ...ScannerOption) Scanner
```

NewScannerOptions creates a scanner with custom options.
NewScanner creates a scanner for streaming tokenization. The scanner processes input with bounded memory usage, making it suitable for large files or continuous streams.
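A sketch of the consolidated API is below. `Process` and the re-exported `WithBufferSize` appear in this package's index, but the option's argument type, the buffer value, and how the `Tokenizer` itself is constructed are assumptions; the `Scanner`'s iteration methods are not shown in this diff.

```go
package demo

import (
	"fmt"
	"io"
	"os"
	"strings"

	"github.com/agentstation/tokenizer/llama3"
)

// streamDemo exercises both streaming entry points on an existing Tokenizer.
func streamDemo(t *llama3.Tokenizer) error {
	// One-shot streaming: tokenize a reader straight into a writer.
	n, err := t.Process(strings.NewReader("Hello, world!"), os.Stdout)
	if err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, "processed: %d\n", n)

	// Incremental scanning; WithBufferSize taking an int is an assumption.
	sc := t.NewScanner(io.LimitReader(os.Stdin, 1<<20), llama3.WithBufferSize(64*1024))
	_ = sc // iterate via the Scanner interface (methods not shown here)
	return nil
}
```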

<a name="Tokenizer.OptimisticCount"></a>
### func \(\*Tokenizer\) [OptimisticCount](<https://github.com/agentstation/tokenizer/blob/master/llama3/tokenizer.go#L373>)
63 changes: 54 additions & 9 deletions llama3/cmd/llama3/command.go
@@ -11,6 +11,16 @@ import (
// This command provides encode, decode, stream, and info subcommands
// for working with the Llama 3 tokenizer.
func Command() *cobra.Command {
// Define shared flags that can be used with implicit encoding/streaming
var (
output string
count bool
countOnly bool
bos bool
eos bool
metrics bool
)

cmd := &cobra.Command{
Use: "llama3",
Short: "Llama 3 tokenizer operations",
@@ -22,23 +32,26 @@ vocabulary of 128,256 tokens (128,000 regular tokens + 256 special tokens).
Available commands:
encode - Encode text to token IDs (default when text is provided)
decode - Decode token IDs to text
stream - Process text in streaming mode
info - Display tokenizer information`,
Example: ` # Encode text (explicit)
tokenizer llama3 encode "Hello, world!"

# Encode text (implicit - default action)
tokenizer llama3 "Hello, world!"

# Encode with flags (implicit)
tokenizer llama3 "Hello, world!" --count
tokenizer llama3 "Hello, world!" --output json

# Decode tokens
tokenizer llama3 decode 128000 9906 11 1917 0 128001

# Stream from stdin (explicit)
cat large_file.txt | tokenizer llama3 stream

# Stream from stdin (implicit - automatic)
# Encode from stdin (implicit - automatic)
cat large_file.txt | tokenizer llama3

# Encode with flags (implicit)
cat large_file.txt | tokenizer llama3 --count-only

# Show tokenizer info
tokenizer llama3 info`,
Args: cobra.ArbitraryArgs,
@@ -63,27 +76,59 @@ Available commands:
// Not a subcommand, treat as text to encode
encodeCmd := newEncodeCmd()
encodeCmd.SetArgs(args)
// Copy over parent command flags for encode
encodeCmd.SetOut(cmd.OutOrStdout())
encodeCmd.SetErr(cmd.ErrOrStderr())
encodeCmd.SetIn(cmd.InOrStdin())

// Set flags from parent command
encAddBOS = bos
encAddEOS = eos
encOutput = output
encCount = count
encCountOnly = countOnly
encMetrics = metrics

return encodeCmd.Execute()
}

// No args provided - check if stdin is piped
stat, _ := os.Stdin.Stat()
if (stat.Mode() & os.ModeCharDevice) == 0 {
// Data is being piped to stdin, use streaming mode
streamCmd := newStreamCmd()
return streamCmd.Execute()
// Data is being piped to stdin, use encode
encodeCmd := newEncodeCmd()
encodeCmd.SetOut(cmd.OutOrStdout())
encodeCmd.SetErr(cmd.ErrOrStderr())
encodeCmd.SetIn(cmd.InOrStdin())

// Set flags from parent command
encAddBOS = bos
encAddEOS = eos
encOutput = output
encCount = count
encCountOnly = countOnly
encMetrics = metrics

return encodeCmd.RunE(encodeCmd, []string{})
}

// No args and no piped input, show help
return cmd.Help()
},
}

// Add flags that work with implicit encoding/streaming
cmd.PersistentFlags().StringVarP(&output, "output", "o", "space", "Output format: space, newline, json")
cmd.PersistentFlags().BoolVar(&count, "count", false, "Show token count with output")
cmd.PersistentFlags().BoolVar(&countOnly, "count-only", false, "Show only token count (no tokens)")
cmd.PersistentFlags().BoolVar(&bos, "bos", true, "Add beginning of sequence token")
cmd.PersistentFlags().BoolVar(&eos, "eos", true, "Add end of sequence token")
cmd.PersistentFlags().BoolVar(&metrics, "metrics", false, "Show performance metrics")

// Add subcommands
cmd.AddCommand(
newEncodeCmd(),
newDecodeCmd(),
newStreamCmd(),
newInfoCmd(),
)

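The implicit-encode dispatch above turns on a single check: whether stdin is a character device. A self-contained sketch of that detection, matching the check used in the command:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	stat, err := os.Stdin.Stat()
	if err != nil {
		fmt.Fprintln(os.Stderr, "stat stdin:", err)
		os.Exit(1)
	}
	// A terminal stdin is a character device; piped or redirected
	// input clears the ModeCharDevice bit.
	if stat.Mode()&os.ModeCharDevice == 0 {
		fmt.Println("piped input: default to encode")
	} else {
		fmt.Println("interactive terminal: show help")
	}
}
```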