Token Telemetry #36

Workflow file for this run

name: Token Telemetry

on:
  schedule:
    # Run daily at 6 AM UTC
    - cron: '0 6 * * *'
  workflow_dispatch: # Allow manual triggering
  push:
    branches: [ main ]
    paths:
      - 'tokenizer.py'
      - '.github/workflows/tokenizer.yml'
      - 'repos.txt'

permissions:
  contents: write
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
concurrency:
  group: "pages"
  cancel-in-progress: false
jobs:
  analyze-tokens:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        id: setup-python # referenced by the cache key below
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: latest
          virtualenvs-create: true
          virtualenvs-in-project: true

      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root
      - name: Run token analysis
        run: |
          mkdir -p _site
          poetry run python tokenizer.py --celestia-repos --output _site/index.json

      - name: Create index.html redirect
        run: |
          cat > _site/index.html << 'EOF'
          <!DOCTYPE html>
          <html>
          <head>
          <meta charset="utf-8">
          <title>Token Telemetry API</title>
          <meta http-equiv="refresh" content="0; url=index.json">
          </head>
          <body>
          <h1>🚀 Token Telemetry API</h1>
          <p>Comprehensive token analysis of CelestiaOrg repositories for AI development and context management.</p>
          <p>Redirecting to <a href="index.json">index.json</a>...</p>
          <p><small>Direct link: <code>https://celestiaorg.github.io/tokenmetry/index.json</code></small></p>
          <p><a href="https://github.com/celestiaorg/tokenmetry">📚 View Documentation on GitHub</a></p>
          </body>
          </html>
          EOF
      - name: Add metadata to JSON and update data_file to full URL
        run: |
          cat > ./add_metadata_script.py << 'EOL'
          import json
          from datetime import datetime, timezone
          from collections import OrderedDict
          import os

          # This is the meta-index file, located at the root of the deployment (_site)
          index_path = '_site/index.json'
          with open(index_path, 'r') as f:
              data = json.load(f)  # This data is already structured as a meta-index by tokenizer.py

          new_data = OrderedDict()

          # Ensure metadata is the first key
          new_data['metadata'] = {
              'generated_at': datetime.now(timezone.utc).isoformat(),
              'generator': 'celestiaorg/tokenmetry',
              'version': '1.1.0',  # Version indicating meta-index structure
              'format': 'celestia-token-telemetry-meta-v1',  # Format name for the meta-index
              'description': 'Meta-index for token count analysis of CelestiaOrg repositories. Provides summaries and links to detailed per-repository data files located in the repository_data/ directory.',
              'api_endpoint': 'https://celestiaorg.github.io/tokenmetry/index.json',  # This file itself
              'purpose': 'This JSON meta-index helps AI agents discover available repository token analyses, understand overall codebase statistics, and selectively access detailed data for specific repositories located in the repository_data/ directory.',
              'usage_instructions': {
                  'overview': 'This meta-index (`index.json`) provides a high-level summary of token counts across multiple repositories. Use `summary` fields for overall codebase statistics. The `repositories` array offers per-repository summaries and a direct URL in `data_file` to access detailed token counts for each file within that repository.',
                  'repository_discovery': 'The `repositories` array lists all processed repositories. Each entry includes `name`, `url`, `total_tokens`, `total_files`, a `by_extension` summary for that repo, and a `data_file` field which directly provides the full URL to the detailed JSON for that repository (e.g., `https://celestiaorg.github.io/tokenmetry/repository_data/repo_name.json`).',
                  'accessing_detailed_data': "To get detailed file breakdowns for a repository, directly use the full URL provided in the `data_file` field of the repository entry in this meta-index. The detailed file (a separate JSON) will contain a `files` array with individual file paths (relative to that repository root) and their token counts.",
                  'context_management': 'Use `total_tokens` from this `index.json` (meta-index) for a quick overview of repository sizes. For detailed planning or when specific file token counts are needed, fetch the individual repository JSON using its `data_file` URL. Be mindful of the size of these detailed JSONs.',
                  'language_breakdown': '`summary.by_extension_across_all_repos` in this meta-index provides an overall language mix. Per-repository language breakdowns are in `repositories[N].by_extension`.',
                  'context_window_warning': {
                      'message': 'This `index.json` (meta-index) file itself is designed to be relatively small. However, individual repository JSON files (linked via `data_file` in the `repositories` array) can be large.',
                      'recommendation': 'Always fetch and parse this `index.json` (meta-index) first. Then, selectively fetch individual repository JSONs only for the repositories relevant to your current task and context window capacity.',
                      'retrieval_strategy': [
                          '1. Fetch and parse this `index.json` (meta-index) file.',
                          '2. Analyze the `summary` and the `repositories` array to identify repositories of interest based on name, total_tokens, or language breakdown.',
                          '3. For each repository of interest, use the full URL from its `data_file` field to fetch its detailed JSON data.',
                          '4. Process the detailed JSON. Be mindful of its size and token count if you plan to include its content in your context.'
                      ]
                  },
                  'example_agent_workflow': [
                      "1. **Fetch Meta-Index:** Retrieve this `index.json` file using the URL from `metadata.api_endpoint`.",
                      "2. **Understand Scope:** Parse `index.json`. Examine `metadata.description`, `metadata.data_structure`, and the main `summary` object for an overview of available data, total token counts, and repository count.",
                      "3. **Identify Target Repositories:** Iterate through the `repositories` array. For each repository object, check its `name`, `total_tokens`, and `by_extension` summary to determine its relevance for your current task.",
                      "4. **Selective Deep Dive (If Needed):** If a repository is relevant and you require file-level token counts or specific file paths, retrieve its detailed JSON data. The full URL for this detailed JSON is provided directly in the repository object's `data_file` field.",
                      "5. **Process Detailed Data:** Parse the individual repository's JSON. This file will contain a `files` array, where each element details a source file's path (relative to its repository root) and its token count. Be mindful of the size of this detailed JSON (refer to `context_window_warning`).",
                      "6. **Iterate as Needed:** Repeat steps 3-5 for other relevant repositories based on your task requirements and available context window capacity."
                  ]
              },
              'data_structure': {
                  'summary': 'High-level aggregated statistics across all repositories (total_repositories_configured, successful_repositories_processed, total_files_across_all_repos, total_tokens_across_all_repos, by_extension_across_all_repos).',
                  'repositories': 'Array of objects, each summarizing a single repository (name, url, data_file path, total_files, total_tokens, by_extension summary for that repo, error status). The `data_file` points to a separate JSON in the `repository_data/` directory with detailed file breakdowns for that repository.',
                  'token_counting': 'Uses GPT-2 tokenizer for consistent token counting across all content.'
              }
          }

          # Add original keys from tokenizer.py output (summary, repositories) after metadata
          for key, value in data.items():
              if key not in new_data:  # don't overwrite the 'metadata' key created above
                  new_data[key] = value

          # Update data_file to be the full URL
          base_url = "https://celestiaorg.github.io/tokenmetry/"
          if 'repositories' in new_data and isinstance(new_data['repositories'], list):
              for repo_entry in new_data['repositories']:
                  if 'data_file' in repo_entry and isinstance(repo_entry['data_file'], str) and repo_entry['data_file'].startswith('repository_data/'):
                      # Overwrite data_file with the full URL
                      repo_entry['data_file'] = base_url + repo_entry['data_file']
                  # Ensure data_file_full_url is not present if it somehow existed
                  if 'data_file_full_url' in repo_entry:
                      del repo_entry['data_file_full_url']

          # Calculate approximate token count of this index.json (meta-index) file itself
          json_content_for_meta_index = json.dumps(new_data, indent=2)
          try:
              from transformers import GPT2TokenizerFast
              tokenizer_for_meta = GPT2TokenizerFast.from_pretrained('gpt2')
              meta_json_token_count = len(tokenizer_for_meta.encode(json_content_for_meta_index))
              new_data['metadata']['usage_instructions']['context_window_warning']['meta_index_file_token_count'] = meta_json_token_count
              new_data['metadata']['usage_instructions']['context_window_warning']['message'] = f'This index.json (meta-index) file itself contains approximately {meta_json_token_count:,} tokens.'
          except Exception:
              # Fall back to a rough characters/4 estimate if the tokenizer is unavailable
              char_count_meta = len(json_content_for_meta_index)
              estimated_tokens_meta = char_count_meta // 4
              new_data['metadata']['usage_instructions']['context_window_warning']['meta_index_file_token_count'] = estimated_tokens_meta
              new_data['metadata']['usage_instructions']['context_window_warning']['message'] = f'This index.json (meta-index) file itself contains approximately {estimated_tokens_meta:,} tokens (estimated).'

          with open(index_path, 'w') as f:
              json.dump(new_data, f, indent=2)
          EOL
          poetry run python ./add_metadata_script.py
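      # Illustrative sketch (not executed): after this step, _site/index.json is a meta-index
      # shaped roughly as below; all values are placeholders, per 'data_structure' above.
      #   {
      #     "metadata":  { "generated_at": "...", "api_endpoint": "...", "usage_instructions": { ... }, ... },
      #     "summary":   { "total_files_across_all_repos": 0, "total_tokens_across_all_repos": 0,
      #                    "by_extension_across_all_repos": { ".go": { "files": 0, "tokens": 0 }, ... }, ... },
      #     "repositories": [
      #       { "name": "...", "url": "...", "total_files": 0, "total_tokens": 0,
      #         "by_extension": { ".md": { "files": 0, "tokens": 0 }, ... },
      #         "data_file": "https://celestiaorg.github.io/tokenmetry/repository_data/<repo>.json" }
      #     ]
      #   }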
      - name: Update README table
        run: |
          python3 -c "
          import json
          import re
          from datetime import datetime, timezone
          import os

          # This is now the meta-index file
          index_path = '_site/index.json'
          with open(index_path, 'r') as f:
              # data from index.json is the meta-index content
              data = json.load(f)

          table_rows = []
          table_rows.append('| Repository | Total Files | Total Tokens | Go Files | Go Tokens | Markdown Files | Markdown Tokens | Rust Files | Rust Tokens | Solidity Files | Solidity Tokens |')
          table_rows.append('|------------|-------------|--------------|----------|-----------|----------------|-----------------|------------|-------------|----------------|-----------------|')

          # Iterate through repositories listed in the meta-index
          for repo_summary_in_meta in data.get('repositories', []):
              if not repo_summary_in_meta.get('error'):
                  name = repo_summary_in_meta['name']
                  total_files = repo_summary_in_meta['total_files']
                  total_tokens = f\"{repo_summary_in_meta['total_tokens']:,}\"
                  # by_extension data is directly available in the repo_summary_in_meta
                  repo_by_ext = repo_summary_in_meta.get('by_extension', {})
                  go_files = repo_by_ext.get('.go', {}).get('files', 0)
                  go_tokens = f\"{repo_by_ext.get('.go', {}).get('tokens', 0):,}\"
                  md_files = repo_by_ext.get('.md', {}).get('files', 0)
                  md_tokens = f\"{repo_by_ext.get('.md', {}).get('tokens', 0):,}\"
                  rs_files = repo_by_ext.get('.rs', {}).get('files', 0)
                  rs_tokens = f\"{repo_by_ext.get('.rs', {}).get('tokens', 0):,}\"
                  sol_files = repo_by_ext.get('.sol', {}).get('files', 0)
                  sol_tokens = f\"{repo_by_ext.get('.sol', {}).get('tokens', 0):,}\"
                  table_rows.append(f'| {name} | {total_files} | {total_tokens} | {go_files} | {go_tokens} | {md_files} | {md_tokens} | {rs_files} | {rs_tokens} | {sol_files} | {sol_tokens} |')
              else:
                  name = repo_summary_in_meta.get('name', 'Unknown Repo')
                  table_rows.append(f'| {name} | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |')  # Row for errored repos

          # Use overall summary from meta-index for the TOTAL row
          overall_summary = data['summary']  # This is the overall summary from the meta-index
          summary_total_files = overall_summary.get('total_files_across_all_repos', 0)
          summary_total_tokens = overall_summary.get('total_tokens_across_all_repos', 0)
          summary_by_ext = overall_summary.get('by_extension_across_all_repos', {})  # Use the overall by_extension summary
          summary_go_files = summary_by_ext.get('.go', {}).get('files', 0)
          summary_go_tokens = summary_by_ext.get('.go', {}).get('tokens', 0)
          summary_md_files = summary_by_ext.get('.md', {}).get('files', 0)
          summary_md_tokens = summary_by_ext.get('.md', {}).get('tokens', 0)
          summary_rs_files = summary_by_ext.get('.rs', {}).get('files', 0)
          summary_rs_tokens = summary_by_ext.get('.rs', {}).get('tokens', 0)
          summary_sol_files = summary_by_ext.get('.sol', {}).get('files', 0)
          summary_sol_tokens = summary_by_ext.get('.sol', {}).get('tokens', 0)
          table_rows.append(f'| **TOTAL** | **{summary_total_files}** | **{summary_total_tokens:,}** | **{summary_go_files}** | **{summary_go_tokens:,}** | **{summary_md_files}** | **{summary_md_tokens:,}** | **{summary_rs_files}** | **{summary_rs_tokens:,}** | **{summary_sol_files}** | **{summary_sol_tokens:,}** |')

          table_content = '\n'.join(table_rows)
          timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
          table_with_timestamp = f'{table_content}\n\n**Last Updated:** {timestamp}'

          with open('README.md', 'r') as f:
              readme_content = f.read()

          pattern = r'<!-- TOKENMETRY_TABLE_START -->.*?<!-- TOKENMETRY_TABLE_END -->'
          replacement = f'<!-- TOKENMETRY_TABLE_START -->\n{table_with_timestamp}\n<!-- TOKENMETRY_TABLE_END -->'
          updated_readme = re.sub(pattern, replacement, readme_content, flags=re.DOTALL)

          with open('README.md', 'w') as f:
              f.write(updated_readme)
          "
      - name: Commit README updates
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add README.md
          if git diff --staged --quiet; then
            echo "No changes to README.md"
          else
            git commit -m "Update README.md with latest token analysis results [skip ci]"
            git push
          fi

      - name: Setup Pages
        uses: actions/configure-pages@v4

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: './_site'
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    needs: analyze-tokens
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
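
For reference, the retrieval strategy spelled out in the workflow's metadata block can be exercised with a short consumer script. The following is a minimal sketch, not part of the workflow: it assumes only the published index.json layout produced above, and the repository name 'celestia-node' is a hypothetical example.

# consume_tokenmetry.py -- minimal consumer sketch (not part of the workflow above).
import json
from urllib.request import urlopen

META_INDEX_URL = 'https://celestiaorg.github.io/tokenmetry/index.json'

# 1. Fetch and parse the meta-index first, as the retrieval strategy recommends.
with urlopen(META_INDEX_URL) as resp:
    meta = json.load(resp)

# 2. Use the aggregated summary for a quick overview of overall size.
summary = meta['summary']
print(f"{summary.get('total_tokens_across_all_repos', 0):,} tokens across "
      f"{summary.get('successful_repositories_processed', 0)} repositories")

# 3. Selectively fetch the detailed JSON for one repository of interest.
#    'celestia-node' is a hypothetical example; pick any name listed in meta['repositories'].
for repo in meta['repositories']:
    if repo.get('error') or repo.get('name') != 'celestia-node':
        continue
    with urlopen(repo['data_file']) as resp:  # data_file is already a full URL
        detail = json.load(resp)
    print(f"{repo['name']}: {len(detail.get('files', []))} files with per-file token counts")
    break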