diff --git a/README.md b/README.md
index 0d0b22b..fb5836f 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ tokenizer llama3 "Hello, world!"
# Decode tokens
tokenizer llama3 decode 128000 9906 11 1917 0 128001
-# Stream large files (automatic pipe detection)
+# Process large files (automatic pipe detection)
cat document.txt | tokenizer llama3
# Get tokenizer information
@@ -117,7 +117,7 @@ tokenizer llama3 "Hello, world!"
tokenizer llama3 decode 128000 9906 11 1917 0 128001
# Output: <|begin_of_text|>Hello, world!<|end_of_text|>
-# Stream from files (automatic)
+# Process from files (automatic)
cat document.txt | tokenizer llama3
# Get help
@@ -224,6 +224,8 @@ MIT
import "github.com/agentstation/tokenizer"
```
+Package tokenizer provides a collection of high-performance tokenizer implementations.
+
## Index
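The README hunks above advertise "automatic pipe detection" (`cat document.txt | tokenizer llama3`). As a hedged sketch of the usual stdlib technique for this, not necessarily this CLI's exact code: stat stdin and treat a non-character-device as piped or redirected input. All names below are illustrative.

```go
package main

import (
	"fmt"
	"io"
	"os"
)

func main() {
	// Stat stdin: a character device means an interactive terminal;
	// anything else (pipe, redirected file) means input was piped in.
	info, err := os.Stdin.Stat()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if info.Mode()&os.ModeCharDevice == 0 {
		// stdin is a pipe or file: read the text to tokenize from it.
		data, err := io.ReadAll(os.Stdin)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
		fmt.Printf("read %d bytes from pipe\n", len(data))
		return
	}
	// stdin is a terminal: fall back to positional arguments.
	fmt.Println("no piped input; expecting text as an argument")
}
```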
diff --git a/cmd/tokenizer/README.md b/cmd/tokenizer/README.md
index de05bb8..c5c78d0 100644
--- a/cmd/tokenizer/README.md
+++ b/cmd/tokenizer/README.md
@@ -168,6 +168,8 @@ tokenizer [tokenizer-name] [command] [options]
import "github.com/agentstation/tokenizer/cmd/tokenizer"
```
+Package main provides the tokenizer CLI tool.
+
## Index
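The CLI's `tokenizer [tokenizer-name] [command] [options]` shape maps naturally onto nested cobra commands: one subcommand per tokenizer, each with `encode`/`decode` children. The sketch below shows that wiring under assumed names (`llama3`, the `encode` handler); it is not necessarily how `cmd/tokenizer` is actually structured.

```go
package main

import (
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

func main() {
	root := &cobra.Command{Use: "tokenizer"}

	// One subcommand per tokenizer name.
	llama3 := &cobra.Command{Use: "llama3", Short: "Llama 3 tokenizer"}
	llama3.AddCommand(&cobra.Command{
		Use:  "encode [text]",
		Args: cobra.MinimumNArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			// Real code would call the tokenizer library here.
			fmt.Println("encode:", args[0])
			return nil
		},
	})

	root.AddCommand(llama3)
	if err := root.Execute(); err != nil {
		os.Exit(1)
	}
}
```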
diff --git a/cmd/tokenizer/completion.go b/cmd/tokenizer/completion.go
index 66171b3..7288680 100644
--- a/cmd/tokenizer/completion.go
+++ b/cmd/tokenizer/completion.go
@@ -41,17 +41,18 @@ PowerShell:
DisableFlagsInUseLine: true,
ValidArgs: []string{"bash", "zsh", "fish", "powershell"},
Args: cobra.MatchAll(cobra.ExactArgs(1), cobra.OnlyValidArgs),
- Run: func(cmd *cobra.Command, args []string) {
+ RunE: func(cmd *cobra.Command, args []string) error {
switch args[0] {
case "bash":
- cmd.Root().GenBashCompletion(os.Stdout)
+ return cmd.Root().GenBashCompletion(os.Stdout)
case "zsh":
- cmd.Root().GenZshCompletion(os.Stdout)
+ return cmd.Root().GenZshCompletion(os.Stdout)
case "fish":
- cmd.Root().GenFishCompletion(os.Stdout, true)
+ return cmd.Root().GenFishCompletion(os.Stdout, true)
case "powershell":
- cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
+ return cmd.Root().GenPowerShellCompletionWithDesc(os.Stdout)
}
+ return nil
},
}
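The Run → RunE switch above matters because cobra discards nothing returned from `Run`, while errors returned from `RunE` propagate up through `Execute()`, letting `main` exit non-zero when completion generation fails. A minimal sketch of that flow (the command name and error text here are illustrative, not from this repo):

```go
package main

import (
	"errors"
	"fmt"
	"os"

	"github.com/spf13/cobra"
)

func main() {
	cmd := &cobra.Command{
		Use:           "demo",
		SilenceUsage:  true, // don't print usage text on runtime errors
		SilenceErrors: true, // we print the error ourselves below
		RunE: func(cmd *cobra.Command, args []string) error {
			// Returning an error here surfaces it to Execute().
			return errors.New("completion generation failed")
		},
	}
	if err := cmd.Execute(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```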
diff --git a/docs/demo.gif b/docs/demo.gif
index 6af5396..c478bd5 100644
Binary files a/docs/demo.gif and b/docs/demo.gif differ
diff --git a/docs/demo.svg b/docs/demo.svg
index 790c2a0..c2375d7 100644
--- a/docs/demo.svg
+++ b/docs/demo.svg
@@ -1 +1 @@
-
\ No newline at end of file
+[SVG terminal recording, frame-by-frame typing animation collapsed. It demonstrates: `echo 'def add(a, b): return a + b' | tokenizer llama3` (output: 128000 755 923 2948 11 293 1680 471 264 489 293 198 128001); `echo 'Hello world' | tokenizer llama3 encode --output json | jq length` (output: 5); `tokenizer llama3 encode --bos=false --eos=false 'Hello world'` (output: 9906 1917); and a "Test encoding/decoding" round-trip that is cut off mid-command.]
Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llam█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 d█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 d█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 de█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 dec█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 dec█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 deco█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decod█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decod█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode█|begin_of_text|>Test data<|end_of_text|>> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>> █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> #█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # T█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # To█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # To█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tok█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Toke█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Toke█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Token█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokeni█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokeni█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokeniz█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test 
data<|end_of_text|>>> # Tokenize█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenize█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer i█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer in█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer in█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer inf█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info█>> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> t█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> t█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> to█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tok█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tok█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> toke█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> token█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokeni█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokeniz█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenize█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer l█> # Test 
encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer ll█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer lla█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llam█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 i█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 in█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 inf█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info |█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | h█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | he█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | hea█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head █> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8█> # Test encoding/decoding> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8█> tokenizer llama3 'Test data' | tokenizer llama3 
decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> #█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # I█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # In█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Ins█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Inst█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Insta█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model 
Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Instal█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Instal█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install:█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: b█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: br█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: bre█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary 
Size: 128256 tokens Regular Tokens: 128000> # Install: brew i█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew in█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew ins█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew inst█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew insta█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew instal█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install █> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install a█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install ag█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: 
Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install age█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agen█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agent█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agents█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentst█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentsta█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstat█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstati█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstatio█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | 
head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/t█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/ta█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/t█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/to█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tok█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 
tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/toke█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/token█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokeni█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokeniz█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokenize█> tokenizer llama3 'Test data' | tokenizer llama3 decode<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokenizer█<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokenizer> █<|begin_of_text|>Test data<|end_of_text|>>> # Tokenizer info> tokenizer llama3 info | head -8Llama 3 Tokenizer Information=============================Model Details: Model Type: Llama 3 (Meta) Tokenizer Type: Byte-level BPE Vocabulary Size: 128256 tokens Regular Tokens: 128000> # Install: brew install agentstation/tap/tokenizer> █
\ No newline at end of file
diff --git a/llama3/README.md b/llama3/README.md
index 6124413..563be42 100644
--- a/llama3/README.md
+++ b/llama3/README.md
@@ -454,8 +454,7 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
- [func \(t \*Tokenizer\) EncodeBPE\(pretoken string\) \[\]int](<#Tokenizer.EncodeBPE>)
- [func \(t \*Tokenizer\) EncodeBytes\(data \[\]byte, opts \*EncodeOptions\) \[\]int](<#Tokenizer.EncodeBytes>)
- [func \(t \*Tokenizer\) GetSpecialTokenID\(token string\) \(int, error\)](<#Tokenizer.GetSpecialTokenID>)
- - [func \(t \*Tokenizer\) NewScanner\(r io.Reader\) Scanner](<#Tokenizer.NewScanner>)
- - [func \(t \*Tokenizer\) NewScannerOptions\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScannerOptions>)
+ - [func \(t \*Tokenizer\) NewScanner\(r io.Reader, opts ...ScannerOption\) Scanner](<#Tokenizer.NewScanner>)
- [func \(t \*Tokenizer\) OptimisticCount\(text string\) int](<#Tokenizer.OptimisticCount>)
- [func \(t \*Tokenizer\) PreTokenize\(text string\) \[\]string](<#Tokenizer.PreTokenize>)
- [func \(t \*Tokenizer\) Process\(r io.Reader, w io.Writer\) \(int64, error\)](<#Tokenizer.Process>)
@@ -469,25 +468,25 @@ Package llama3 implements the Llama 3 tokenizer in Go. This file contains the pu
## Variables
-Common errors
+Common errors.
```go
var (
- // ErrDataNotFound indicates that the tokenizer data files could not be found
+ // ErrDataNotFound indicates that the tokenizer data files could not be found.
ErrDataNotFound = errors.New("tokenizer data not found")
- // ErrInvalidToken indicates an invalid token was provided
+ // ErrInvalidToken indicates an invalid token was provided.
ErrInvalidToken = errors.New("invalid token")
- // ErrTokenNotFound indicates a token was not found in the vocabulary
+ // ErrTokenNotFound indicates a token was not found in the vocabulary.
ErrTokenNotFound = errors.New("token not found")
- // ErrInvalidTokenID indicates an invalid token ID was provided
+ // ErrInvalidTokenID indicates an invalid token ID was provided.
ErrInvalidTokenID = errors.New("invalid token ID")
)
```
-Scanner option functions \- these are re\-exported from the scanner package
+Scanner option functions \- these are re\-exported from the scanner package.
```go
var (
@@ -517,7 +516,7 @@ var (
func NewConfigError(field string, value any, err error) error
```
-NewConfigError creates a new ConfigError
+NewConfigError creates a new ConfigError.
## func [NewDataError]()
@@ -526,7 +525,7 @@ NewConfigError creates a new ConfigError
func NewDataError(op, path string, err error) error
```
-NewDataError creates a new DataError
+NewDataError creates a new DataError.
## func [NewTokenError]()
@@ -535,7 +534,7 @@ NewDataError creates a new DataError
func NewTokenError(op, token string, err error) error
```
-NewTokenError creates a new TokenError
+NewTokenError creates a new TokenError.
## func [NewTokenIDError]()
@@ -544,7 +543,7 @@ NewTokenError creates a new TokenError
func NewTokenIDError(op string, tokenID int, err error) error
```
-NewTokenIDError creates a new TokenError with a token ID
+NewTokenIDError creates a new TokenError with a token ID.
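A minimal usage sketch for the error constructors above, assuming (not shown in this diff) that TokenError implements Unwrap the same way ConfigError does, so errors.Is can still match the sentinel:

```go
package main

import (
	"errors"
	"fmt"

	"github.com/agentstation/tokenizer/llama3"
)

func main() {
	// Hypothetical call site: wrap the sentinel with operation context.
	err := llama3.NewTokenError("decode", "<|bogus|>", llama3.ErrTokenNotFound)

	// errors.Is unwraps the constructor's error back to the sentinel.
	if errors.Is(err, llama3.ErrTokenNotFound) {
		fmt.Println("unknown token:", err)
	}
}
```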
## type [BPE]()
@@ -583,7 +582,7 @@ type Cache interface {
## type [ConfigError]()
-ConfigError represents an error in tokenizer configuration
+ConfigError represents an error in tokenizer configuration.
```go
type ConfigError struct {
@@ -614,7 +613,7 @@ func (e *ConfigError) Unwrap() error
## type [DataError]()
-DataError represents an error related to tokenizer data loading or processing
+DataError represents an error related to tokenizer data loading or processing.
```go
type DataError struct {
@@ -809,7 +808,7 @@ type ScannerOption = scanner.Option
## type [TokenError]()
-TokenError represents an error related to token operations
+TokenError represents an error related to token operations.
```go
type TokenError struct {
@@ -1082,22 +1081,13 @@ func main() {
-### func \(\*Tokenizer\) [NewScanner]()
+### func \(\*Tokenizer\) [NewScanner]()
```go
-func (t *Tokenizer) NewScanner(r io.Reader) Scanner
+func (t *Tokenizer) NewScanner(r io.Reader, opts ...ScannerOption) Scanner
```
-NewScanner creates a scanner for streaming tokenization with default options.
-
-
-### func \(\*Tokenizer\) [NewScannerOptions]()
-
-```go
-func (t *Tokenizer) NewScannerOptions(r io.Reader, opts ...ScannerOption) Scanner
-```
-
-NewScannerOptions creates a scanner with custom options.
+NewScanner creates a scanner for streaming tokenization. The scanner processes input with bounded memory usage, making it suitable for large files or continuous streams.
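For review context, a minimal sketch of the consolidated API using only names that appear in this diff (llama3.New, WithBufferSize, WithEncodeOptions, Scan/Token/Err); calling NewScanner with zero options preserves the old default behavior:

```go
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/agentstation/tokenizer/llama3"
)

func main() {
	t, err := llama3.New()
	if err != nil {
		log.Fatal(err)
	}

	// Options that previously required NewScannerOptions now go
	// through the single variadic entry point.
	s := t.NewScanner(strings.NewReader("Hello, world!"),
		llama3.WithBufferSize(4096),
		llama3.WithEncodeOptions(&llama3.EncodeOptions{BOS: true, EOS: true}),
	)
	for s.Scan() {
		fmt.Println(s.Token())
	}
	if err := s.Err(); err != nil {
		log.Fatal(err)
	}
}
```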
### func \(\*Tokenizer\) [OptimisticCount]()
diff --git a/llama3/cmd/llama3/command.go b/llama3/cmd/llama3/command.go
index 725d1e4..07f6987 100644
--- a/llama3/cmd/llama3/command.go
+++ b/llama3/cmd/llama3/command.go
@@ -11,6 +11,16 @@ import (
-// This command provides encode, decode, stream, and info subcommands
+// This command provides encode, decode, and info subcommands
// for working with the Llama 3 tokenizer.
func Command() *cobra.Command {
+ // Define shared flags that can be used with implicit encoding/streaming
+ var (
+ output string
+ count bool
+ countOnly bool
+ bos bool
+ eos bool
+ metrics bool
+ )
+
cmd := &cobra.Command{
Use: "llama3",
Short: "Llama 3 tokenizer operations",
@@ -22,7 +32,6 @@ vocabulary of 128,256 tokens (128,000 regular tokens + 256 special tokens).
Available commands:
encode - Encode text to token IDs (default when text is provided)
decode - Decode token IDs to text
- stream - Process text in streaming mode
info - Display tokenizer information`,
Example: ` # Encode text (explicit)
tokenizer llama3 encode "Hello, world!"
@@ -30,15 +39,19 @@ Available commands:
# Encode text (implicit - default action)
tokenizer llama3 "Hello, world!"
+ # Encode with flags (implicit)
+ tokenizer llama3 "Hello, world!" --count
+ tokenizer llama3 "Hello, world!" --output json
+
# Decode tokens
tokenizer llama3 decode 128000 9906 11 1917 0 128001
- # Stream from stdin (explicit)
- cat large_file.txt | tokenizer llama3 stream
-
- # Stream from stdin (implicit - automatic)
+ # Encode from stdin (implicit - automatic)
cat large_file.txt | tokenizer llama3
+ # Encode with flags (implicit)
+ cat large_file.txt | tokenizer llama3 --count-only
+
# Show tokenizer info
tokenizer llama3 info`,
Args: cobra.ArbitraryArgs,
@@ -63,15 +76,40 @@ Available commands:
// Not a subcommand, treat as text to encode
encodeCmd := newEncodeCmd()
encodeCmd.SetArgs(args)
+ // Copy over parent command flags for encode
+ encodeCmd.SetOut(cmd.OutOrStdout())
+ encodeCmd.SetErr(cmd.ErrOrStderr())
+ encodeCmd.SetIn(cmd.InOrStdin())
+
+ // Set flags from parent command
+ encAddBOS = bos
+ encAddEOS = eos
+ encOutput = output
+ encCount = count
+ encCountOnly = countOnly
+ encMetrics = metrics
+
return encodeCmd.Execute()
}
// No args provided - check if stdin is piped
stat, _ := os.Stdin.Stat()
if (stat.Mode() & os.ModeCharDevice) == 0 {
- // Data is being piped to stdin, use streaming mode
- streamCmd := newStreamCmd()
- return streamCmd.Execute()
+ // Data is being piped to stdin, use encode
+ encodeCmd := newEncodeCmd()
+ encodeCmd.SetOut(cmd.OutOrStdout())
+ encodeCmd.SetErr(cmd.ErrOrStderr())
+ encodeCmd.SetIn(cmd.InOrStdin())
+
+ // Set flags from parent command
+ encAddBOS = bos
+ encAddEOS = eos
+ encOutput = output
+ encCount = count
+ encCountOnly = countOnly
+ encMetrics = metrics
+
+ return encodeCmd.RunE(encodeCmd, []string{})
}
// No args and no piped input, show help
@@ -79,11 +117,18 @@ Available commands:
},
}
+ // Add flags that work with implicit encoding/streaming
+ cmd.PersistentFlags().StringVarP(&output, "output", "o", "space", "Output format: space, newline, json")
+ cmd.PersistentFlags().BoolVar(&count, "count", false, "Show token count with output")
+ cmd.PersistentFlags().BoolVar(&countOnly, "count-only", false, "Show only token count (no tokens)")
+ cmd.PersistentFlags().BoolVar(&bos, "bos", true, "Add beginning of sequence token")
+ cmd.PersistentFlags().BoolVar(&eos, "eos", true, "Add end of sequence token")
+ cmd.PersistentFlags().BoolVar(&metrics, "metrics", false, "Show performance metrics")
+
// Add subcommands
cmd.AddCommand(
newEncodeCmd(),
newDecodeCmd(),
- newStreamCmd(),
newInfoCmd(),
)
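The implicit-encode path above hinges on the character-device check on stdin; a standalone sketch of that pipe-detection idiom:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// An interactive terminal has the character-device mode bit set;
	// a pipe or redirected file does not, which is what triggers the
	// implicit encode path.
	stat, err := os.Stdin.Stat()
	if err == nil && stat.Mode()&os.ModeCharDevice == 0 {
		fmt.Println("stdin is piped: encode it")
		return
	}
	fmt.Println("interactive stdin: show help")
}
```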
diff --git a/llama3/cmd/llama3/encode.go b/llama3/cmd/llama3/encode.go
index 42eb31e..073e45b 100644
--- a/llama3/cmd/llama3/encode.go
+++ b/llama3/cmd/llama3/encode.go
@@ -6,6 +6,7 @@ import (
"io"
"os"
"strings"
+ "time"
"github.com/spf13/cobra"
@@ -14,9 +15,12 @@ import (
var (
// Encode command flags.
- encAddBOS bool
- encAddEOS bool
- encOutput string
+ encAddBOS bool
+ encAddEOS bool
+ encOutput string
+ encCount bool
+ encCountOnly bool
+ encMetrics bool
)
// newEncodeCmd creates the encode subcommand.
@@ -46,7 +50,13 @@ The output format can be:
tokenizer llama3 encode --output json "Hello"
# Output one token per line
- tokenizer llama3 encode --output newline "Hello"`,
+ tokenizer llama3 encode --output newline "Hello"
+
+ # Show token count with output
+ tokenizer llama3 encode --count "Hello"
+
+ # Show only the token count
+ tokenizer llama3 encode --count-only "Hello"`,
RunE: runEncode,
}
@@ -54,11 +64,18 @@ The output format can be:
cmd.Flags().BoolVar(&encAddBOS, "bos", true, "Add beginning of sequence token")
cmd.Flags().BoolVar(&encAddEOS, "eos", true, "Add end of sequence token")
cmd.Flags().StringVarP(&encOutput, "output", "o", "space", "Output format: space, newline, json")
+ cmd.Flags().BoolVar(&encCount, "count", false, "Show token count with output")
+ cmd.Flags().BoolVar(&encCountOnly, "count-only", false, "Show only token count (no tokens)")
+ cmd.Flags().BoolVar(&encMetrics, "metrics", false, "Show performance metrics")
return cmd
}
func runEncode(_ *cobra.Command, args []string) error {
+ var startTime time.Time
+ if encMetrics {
+ startTime = time.Now()
+ }
// Initialize tokenizer
tokenizer, err := llama3.New()
@@ -66,39 +83,109 @@ func runEncode(_ *cobra.Command, args []string) error {
return fmt.Errorf("failed to initialize tokenizer: %w", err)
}
- // Get text to encode
- var text string
+ // Create reader based on input source
+ var reader io.Reader
+ var inputBytes int
+
if len(args) > 0 {
- text = strings.Join(args, " ")
+ text := strings.Join(args, " ")
+ inputBytes = len(text)
+ reader = strings.NewReader(text)
} else {
- // Read from stdin
- data, err := io.ReadAll(os.Stdin)
- if err != nil {
- return fmt.Errorf("failed to read from stdin: %w", err)
+ // For stdin, wrap with counting reader if metrics enabled
+ if encMetrics {
+ cr := &countingReader{Reader: os.Stdin}
+ reader = cr
+			// cr.bytesRead is read back after scanning completes (see below);
+			// a deferred assignment here would run too late for metrics output.
+ } else {
+ reader = os.Stdin
}
- text = string(data)
}
- // Encode with options
- opts := &llama3.EncodeOptions{
- BOS: encAddBOS,
- EOS: encAddEOS,
+ // Create scanner with options
+ scanner := tokenizer.NewScanner(
+ reader,
+ llama3.WithEncodeOptions(&llama3.EncodeOptions{
+ BOS: encAddBOS,
+ EOS: encAddEOS,
+ }),
+ )
+
+ // Collect all tokens (needed for JSON output and count)
+ var tokens []int
+ for scanner.Scan() {
+ tokens = append(tokens, scanner.Token())
+ }
+
+ if err := scanner.Err(); err != nil {
+ return fmt.Errorf("tokenization error: %w", err)
+ }
+
+ // Get byte count if we used counting reader
+ if cr, ok := reader.(*countingReader); ok {
+ inputBytes = cr.bytesRead
+ }
+
+ var encodeDuration time.Duration
+ if encMetrics {
+ encodeDuration = time.Since(startTime)
+ }
+
+ // Handle count-only mode
+ if encCountOnly {
+ switch encOutput {
+ case "json":
+ data, err := json.Marshal(map[string]int{"count": len(tokens)})
+ if err != nil {
+ return fmt.Errorf("failed to marshal count: %w", err)
+ }
+ fmt.Println(string(data))
+ default:
+ fmt.Println(len(tokens))
+ }
+ return nil
}
- tokens := tokenizer.Encode(text, opts)
- // Output tokens
+ // Output tokens with optional count and metrics
switch encOutput {
case "json":
- data, err := json.Marshal(tokens)
+ output := map[string]interface{}{
+ "tokens": tokens,
+ }
+ if encCount {
+ output["count"] = len(tokens)
+ }
+ if encMetrics {
+ metrics := map[string]interface{}{
+ "latency": formatLatency(encodeDuration),
+ "tps": calculateTPS(len(tokens), encodeDuration),
+ "input_bytes": inputBytes,
+ }
+ output["metrics"] = metrics
+ }
+ data, err := json.Marshal(output)
if err != nil {
- return fmt.Errorf("failed to marshal tokens: %w", err)
+ return fmt.Errorf("failed to marshal output: %w", err)
}
fmt.Println(string(data))
case "newline":
+ if encCount {
+ fmt.Printf("count: %d\n", len(tokens))
+ }
for _, token := range tokens {
fmt.Println(token)
}
+ if encMetrics {
+ fmt.Println("metrics:")
+ fmt.Printf(" latency: %s\n", formatLatency(encodeDuration))
+ fmt.Printf(" tps: %d\n", calculateTPS(len(tokens), encodeDuration))
+ fmt.Printf(" input_bytes: %d\n", inputBytes)
+ }
case "space":
+ if encCount {
+ fmt.Printf("count: %d\n", len(tokens))
+ fmt.Print("tokens: ")
+ }
for i, token := range tokens {
if i > 0 {
fmt.Print(" ")
@@ -106,9 +193,27 @@ func runEncode(_ *cobra.Command, args []string) error {
fmt.Print(token)
}
fmt.Println()
+ if encMetrics {
+ fmt.Println("metrics:")
+ fmt.Printf(" latency: %s\n", formatLatency(encodeDuration))
+ fmt.Printf(" tps: %d\n", calculateTPS(len(tokens), encodeDuration))
+ fmt.Printf(" input_bytes: %d\n", inputBytes)
+ }
default:
return fmt.Errorf("unknown output format: %s", encOutput)
}
return nil
}
+
+// countingReader wraps an io.Reader to count bytes read.
+type countingReader struct {
+ io.Reader
+ bytesRead int
+}
+
+func (cr *countingReader) Read(p []byte) (n int, err error) {
+ n, err = cr.Reader.Read(p)
+ cr.bytesRead += n
+ return
+}
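The countingReader added above is a small io.Reader decorator; a self-contained sketch of its behavior in isolation:

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

// countingReader mirrors the type introduced in encode.go above.
type countingReader struct {
	io.Reader
	bytesRead int
}

func (cr *countingReader) Read(p []byte) (n int, err error) {
	n, err = cr.Reader.Read(p)
	cr.bytesRead += n
	return
}

func main() {
	cr := &countingReader{Reader: strings.NewReader("Hello world")}
	if _, err := io.Copy(io.Discard, cr); err != nil {
		fmt.Println("read error:", err)
		return
	}
	fmt.Println(cr.bytesRead) // 11
}
```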
diff --git a/llama3/cmd/llama3/metrics.go b/llama3/cmd/llama3/metrics.go
new file mode 100644
index 0000000..d7e3a77
--- /dev/null
+++ b/llama3/cmd/llama3/metrics.go
@@ -0,0 +1,26 @@
+package llama3cmd
+
+import (
+ "fmt"
+ "time"
+)
+
+// formatLatency formats a duration into a human-readable string with appropriate units.
+func formatLatency(d time.Duration) string {
+ if d < time.Microsecond {
+ return fmt.Sprintf("%dns", d.Nanoseconds())
+ } else if d < time.Millisecond {
+ return fmt.Sprintf("%.2fμs", float64(d.Nanoseconds())/1000)
+ } else if d < time.Second {
+ return fmt.Sprintf("%.2fms", float64(d.Microseconds())/1000)
+ }
+ return fmt.Sprintf("%.2fs", d.Seconds())
+}
+
+// calculateTPS calculates tokens per second.
+func calculateTPS(tokenCount int, duration time.Duration) int {
+ if duration == 0 {
+ return 0
+ }
+ return int(float64(tokenCount) / duration.Seconds())
+}
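As a quick check of the unit boundaries in these helpers, a sketch of an example test (a hypothetical metrics_test.go in the same package) with outputs computed from the definitions above:

```go
package llama3cmd

import (
	"fmt"
	"time"
)

// Example_latencyHelpers exercises each branch of formatLatency and
// the zero-duration guard in calculateTPS.
func Example_latencyHelpers() {
	fmt.Println(formatLatency(750 * time.Nanosecond))
	fmt.Println(formatLatency(1500 * time.Nanosecond))
	fmt.Println(formatLatency(2500 * time.Microsecond))
	fmt.Println(formatLatency(1500 * time.Millisecond))
	fmt.Println(calculateTPS(500, 250*time.Millisecond))
	fmt.Println(calculateTPS(10, 0))
	// Output:
	// 750ns
	// 1.50μs
	// 2.50ms
	// 1.50s
	// 2000
	// 0
}
```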
diff --git a/llama3/cmd/llama3/stream.go b/llama3/cmd/llama3/stream.go
deleted file mode 100644
index 91aaf08..0000000
--- a/llama3/cmd/llama3/stream.go
+++ /dev/null
@@ -1,120 +0,0 @@
-package llama3cmd
-
-import (
- "fmt"
- "os"
-
- "github.com/spf13/cobra"
-
- "github.com/agentstation/tokenizer/llama3"
-)
-
-const (
- // Output format constants.
- outputFormatSpace = "space"
- outputFormatNewline = "newline"
-)
-
-var (
- // Stream command flags.
- streamBufferSize int
- streamMaxBuffer int
- streamAddBOS bool
- streamAddEOS bool
- streamOutput string
-)
-
-// newStreamCmd creates the stream subcommand.
-func newStreamCmd() *cobra.Command {
- cmd := &cobra.Command{
- Use: "stream",
- Short: "Process text in streaming mode",
- Long: `Process text in streaming mode, outputting tokens as they are generated.
-
-This command is designed for processing large files or real-time input where
-you want to see tokens as they are produced rather than waiting for the entire
-input to be processed.
-
-The streaming tokenizer uses an internal buffer to accumulate text until it
-finds a good tokenization boundary (like whitespace). This prevents splitting
-UTF-8 sequences or words unnecessarily.
-
-Input is read from stdin only.`,
- Example: ` # Stream a large file
- cat large_file.txt | tokenizer llama3 stream
-
- # Stream with custom buffer size
- cat data.txt | tokenizer llama3 stream --buffer-size 8192
-
- # Stream without special tokens
- echo "Hello world" | tokenizer llama3 stream --no-bos --no-eos
-
- # Stream with one token per line
- cat input.txt | tokenizer llama3 stream --output newline`,
- RunE: runStream,
- }
-
- // Add flags
- cmd.Flags().IntVar(&streamBufferSize, "buffer-size", 4096, "Buffer size for reading")
- cmd.Flags().IntVar(&streamMaxBuffer, "max-buffer", 1048576, "Maximum buffer size before forcing tokenization")
- cmd.Flags().BoolVar(&streamAddBOS, "bos", true, "Add beginning of sequence token")
- cmd.Flags().BoolVar(&streamAddEOS, "eos", true, "Add end of sequence token")
- cmd.Flags().StringVarP(&streamOutput, "output", "o", "space", "Output format: space, newline")
-
- return cmd
-}
-
-func runStream(_ *cobra.Command, _ []string) error {
-
- // Validate output format
- if streamOutput != outputFormatSpace && streamOutput != outputFormatNewline {
- return fmt.Errorf("invalid output format %q: must be 'space' or 'newline'", streamOutput)
- }
-
- // Initialize tokenizer
- tokenizer, err := llama3.New()
- if err != nil {
- return fmt.Errorf("failed to initialize tokenizer: %w", err)
- }
-
- // Create scanner with options
- scanner := tokenizer.NewScannerOptions(
- os.Stdin,
- llama3.WithBufferSize(streamBufferSize),
- llama3.WithMaxBuffer(streamMaxBuffer),
- llama3.WithEncodeOptions(&llama3.EncodeOptions{
- BOS: streamAddBOS,
- EOS: streamAddEOS,
- }),
- )
-
- // Process tokens
- first := true
- tokenCount := 0
- for scanner.Scan() {
- token := scanner.Token()
- tokenCount++
-
- switch streamOutput {
- case outputFormatNewline:
- fmt.Println(token)
- case outputFormatSpace:
- if !first {
- fmt.Print(" ")
- }
- fmt.Print(token)
- first = false
- }
- }
-
- if err := scanner.Err(); err != nil {
- return fmt.Errorf("streaming error: %w", err)
- }
-
- // Final newline for space-separated output
- if streamOutput == "space" && tokenCount > 0 {
- fmt.Println()
- }
-
- return nil
-}
diff --git a/llama3/scanner.go b/llama3/scanner.go
index f7b1b01..edb41e6 100644
--- a/llama3/scanner.go
+++ b/llama3/scanner.go
@@ -61,13 +61,13 @@ func (ta *tokenizerAdapter) Encode(text string, opts *scanner.EncodeOptions) []i
})
}
-// NewScanner creates a scanner for streaming tokenization with default options.
-func (t *Tokenizer) NewScanner(r io.Reader) Scanner {
- return scanner.New(&tokenizerAdapter{t}, r)
-}
-
-// NewScannerOptions creates a scanner with custom options.
-func (t *Tokenizer) NewScannerOptions(r io.Reader, opts ...ScannerOption) Scanner {
+// NewScanner creates a scanner for streaming tokenization.
+// The scanner processes input with bounded memory usage, making it suitable
+// for large files or continuous streams.
+func (t *Tokenizer) NewScanner(r io.Reader, opts ...ScannerOption) Scanner {
+ if len(opts) == 0 {
+ return scanner.New(&tokenizerAdapter{t}, r)
+ }
return scanner.NewWithOptions(&tokenizerAdapter{t}, r, opts...)
}
diff --git a/llama3/scanner_test.go b/llama3/scanner_test.go
index a33e5e5..030ca12 100644
--- a/llama3/scanner_test.go
+++ b/llama3/scanner_test.go
@@ -82,7 +82,7 @@ func TestScanner(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
reader := strings.NewReader(tt.input)
- scanner := tokenizer.NewScannerOptions(reader, WithEncodeOptions(tt.opts))
+ scanner := tokenizer.NewScanner(reader, WithEncodeOptions(tt.opts))
var tokens []int
for scanner.Scan() {
@@ -125,7 +125,7 @@ func TestScannerOptions(t *testing.T) {
input := strings.Repeat("test ", 1000)
reader := strings.NewReader(input)
- scanner := tokenizer.NewScannerOptions(reader,
+ scanner := tokenizer.NewScanner(reader,
WithBufferSize(128),
WithMaxBuffer(512),
)
@@ -197,7 +197,7 @@ func TestScannerEdgeCases(t *testing.T) {
reader := strings.NewReader(input)
// Small buffer that will force a split
- scanner := tokenizer.NewScannerOptions(reader,
+ scanner := tokenizer.NewScanner(reader,
WithBufferSize(32),
WithMaxBuffer(64), // Will hit limit in middle of long word
WithEncodeOptions(&EncodeOptions{BOS: false, EOS: false}),
@@ -226,7 +226,7 @@ func TestScannerEdgeCases(t *testing.T) {
input := strings.Repeat("a", 62) + "世界" // Will hit 64-byte limit in middle of "世"
reader := strings.NewReader(input)
- scanner := tokenizer.NewScannerOptions(reader,
+ scanner := tokenizer.NewScanner(reader,
WithBufferSize(32),
WithMaxBuffer(64),
WithEncodeOptions(&EncodeOptions{BOS: false, EOS: false}),
@@ -257,7 +257,7 @@ func TestScannerEdgeCases(t *testing.T) {
readSizes: []int{64, 10}, // Read exactly to buffer limit, splitting "世"
}
- scanner := tokenizer.NewScannerOptions(reader,
+ scanner := tokenizer.NewScanner(reader,
WithBufferSize(64),
WithMaxBuffer(64), // Exact buffer size
WithEncodeOptions(&EncodeOptions{BOS: false, EOS: false}),
@@ -295,7 +295,7 @@ func TestScannerEdgeCases(t *testing.T) {
longText := strings.Repeat("The quick brown fox jumps over the lazy dog. ", 20)
reader := strings.NewReader(longText)
- scanner := tokenizer.NewScannerOptions(reader,
+ scanner := tokenizer.NewScanner(reader,
WithBufferSize(32),
WithMaxBuffer(128), // Small enough to force multiple chunks
WithEncodeOptions(&EncodeOptions{BOS: false, EOS: false}),
diff --git a/scripts/demo.tape b/scripts/demo.tape
index 1ba50dd..b1e9cc8 100644
--- a/scripts/demo.tape
+++ b/scripts/demo.tape
@@ -44,6 +44,8 @@ Type "# Test encoding/decoding"
Enter
Type "tokenizer llama3 'Test data' | tokenizer llama3 decode"
Enter
+Sleep 500ms
+Enter
Sleep 2s
# Tokenizer info