Token Telemetry #36
name: Token Telemetry

on:
  schedule:
    # Run daily at 6 AM UTC
    - cron: '0 6 * * *'
  workflow_dispatch: # Allow manual triggering
  push:
    branches: [ main ]
    paths:
      - 'tokenizer.py'
      - '.github/workflows/tokenizer.yml'
      - 'repos.txt'
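# contents: write lets the workflow push the refreshed README table back to the repository;
# pages: write and id-token: write are required by actions/deploy-pages in the deploy job below.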
permissions:
  contents: write
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
concurrency:
  group: "pages"
  cancel-in-progress: false
jobs:
  analyze-tokens:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Python
        id: setup-python # gives the cache key below access to steps.setup-python.outputs.python-version
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: latest
          virtualenvs-create: true
          virtualenvs-in-project: true
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction --no-root
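      # The venv cache above is keyed on runner OS, Python version, and the poetry.lock hash,
      # so `poetry install` runs only when that cache misses.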
      - name: Run token analysis
        run: |
          mkdir -p _site
          poetry run python tokenizer.py --celestia-repos --output _site/index.json
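      # tokenizer.py writes the meta-index to _site/index.json; the per-repository detail files it
      # references under repository_data/ are assumed to be emitted into _site as well so they ship
      # with the Pages artifact (that step is not shown in this workflow).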
      - name: Create index.html redirect
        run: |
          cat > _site/index.html << 'EOF'
          <!DOCTYPE html>
          <html>
          <head>
            <meta charset="utf-8">
            <title>Token Telemetry API</title>
            <meta http-equiv="refresh" content="0; url=index.json">
          </head>
          <body>
            <h1>🚀 Token Telemetry API</h1>
            <p>Comprehensive token analysis of CelestiaOrg repositories for AI development and context management.</p>
            <p>Redirecting to <a href="index.json">index.json</a>...</p>
            <p><small>Direct link: <code>https://celestiaorg.github.io/tokenmetry/index.json</code></small></p>
            <p><a href="https://github.com/celestiaorg/tokenmetry">📚 View Documentation on GitHub</a></p>
          </body>
          </html>
          EOF
      - name: Add metadata to JSON and update data_file to full URL
        run: |
          cat > ./add_metadata_script.py << 'EOL'
          import json
          from datetime import datetime, timezone
          from collections import OrderedDict
          import os

          # This is the meta-index file, located at the root of the deployment (_site)
          index_path = '_site/index.json'
          with open(index_path, 'r') as f:
              data = json.load(f)  # This data is already structured as a meta-index by tokenizer.py
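          # Illustrative shape of the tokenizer.py output loaded above (an assumption for reference;
          # field names taken from the data_structure notes later in this script):
          #   {
          #     "summary":      { ...aggregate counts across all repositories... },
          #     "repositories": [ { "name": ..., "url": ..., "data_file": "repository_data/<repo>.json",
          #                         "total_files": ..., "total_tokens": ..., "by_extension": {...} }, ... ]
          #   }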
          new_data = OrderedDict()

          # Ensure metadata is the first key
          new_data['metadata'] = {
              'generated_at': datetime.now(timezone.utc).isoformat(),
              'generator': 'celestiaorg/tokenmetry',
              'version': '1.1.0',  # Version indicating meta-index structure
              'format': 'celestia-token-telemetry-meta-v1',  # Format name for the meta-index
              'description': 'Meta-index for token count analysis of CelestiaOrg repositories. Provides summaries and links to detailed per-repository data files located in the repository_data/ directory.',
              'api_endpoint': 'https://celestiaorg.github.io/tokenmetry/index.json',  # This file itself
              'purpose': 'This JSON meta-index helps AI agents discover available repository token analyses, understand overall codebase statistics, and selectively access detailed data for specific repositories located in the repository_data/ directory.',
              'usage_instructions': {
                  'overview': 'This meta-index (`index.json`) provides a high-level summary of token counts across multiple repositories. Use `summary` fields for overall codebase statistics. The `repositories` array offers per-repository summaries and a direct URL in `data_file` to access detailed token counts for each file within that repository.',
                  'repository_discovery': 'The `repositories` array lists all processed repositories. Each entry includes `name`, `url`, `total_tokens`, `total_files`, a `by_extension` summary for that repo, and a `data_file` field which directly provides the full URL to the detailed JSON for that repository (e.g., `https://celestiaorg.github.io/tokenmetry/repository_data/repo_name.json`).',
                  'accessing_detailed_data': "To get detailed file breakdowns for a repository, directly use the full URL provided in the `data_file` field of the repository entry in this meta-index. The detailed file (a separate JSON) will contain a `files` array with individual file paths (relative to that repository root) and their token counts.",
                  'context_management': 'Use `total_tokens` from this `index.json` (meta-index) for a quick overview of repository sizes. For detailed planning or when specific file token counts are needed, fetch the individual repository JSON using its `data_file` URL. Be mindful of the size of these detailed JSONs.',
                  'language_breakdown': '`summary.by_extension_across_all_repos` in this meta-index provides an overall language mix. Per-repository language breakdowns are in `repositories[N].by_extension`.',
                  'context_window_warning': {
                      'message': 'This `index.json` (meta-index) file itself is designed to be relatively small. However, individual repository JSON files (linked via `data_file` in the `repositories` array) can be large.',
                      'recommendation': 'Always fetch and parse this `index.json` (meta-index) first. Then, selectively fetch individual repository JSONs only for the repositories relevant to your current task and context window capacity.',
                      'retrieval_strategy': [
                          '1. Fetch and parse this `index.json` (meta-index) file.',
                          '2. Analyze the `summary` and the `repositories` array to identify repositories of interest based on name, total_tokens, or language breakdown.',
                          '3. For each repository of interest, use the full URL from its `data_file` field to fetch its detailed JSON data.',
                          '4. Process the detailed JSON. Be mindful of its size and token count if you plan to include its content in your context.'
                      ]
                  },
                  'example_agent_workflow': [
                      "1. **Fetch Meta-Index:** Retrieve this `index.json` file using the URL from `metadata.api_endpoint`.",
                      "2. **Understand Scope:** Parse `index.json`. Examine `metadata.description`, `metadata.data_structure`, and the main `summary` object for an overview of available data, total token counts, and repository count.",
                      "3. **Identify Target Repositories:** Iterate through the `repositories` array. For each repository object, check its `name`, `total_tokens`, and `by_extension` summary to determine its relevance for your current task.",
                      "4. **Selective Deep Dive (If Needed):** If a repository is relevant and you require file-level token counts or specific file paths, retrieve its detailed JSON data. The full URL for this detailed JSON is provided directly in the repository object's `data_file` field.",
                      "5. **Process Detailed Data:** Parse the individual repository's JSON. This file will contain a `files` array, where each element details a source file's path (relative to its repository root) and its token count. Be mindful of the size of this detailed JSON (refer to `context_window_warning`).",
                      "6. **Iterate as Needed:** Repeat steps 3-5 for other relevant repositories based on your task requirements and available context window capacity."
                  ]
              },
              'data_structure': {
                  'summary': 'High-level aggregated statistics across all repositories (total_repositories_configured, successful_repositories_processed, total_files_across_all_repos, total_tokens_across_all_repos, by_extension_across_all_repos).',
                  'repositories': 'Array of objects, each summarizing a single repository (name, url, data_file path, total_files, total_tokens, by_extension summary for that repo, error status). The `data_file` points to a separate JSON in the `repository_data/` directory with detailed file breakdowns for that repository.',
                  'token_counting': 'Uses GPT-2 tokenizer for consistent token counting across all content.'
              }
          }
          # Add original keys from tokenizer.py output (summary, repositories) after metadata
          for key, value in data.items():
              if key not in new_data:  # Avoid overwriting if script already produced 'metadata'
                  new_data[key] = value

          # Update data_file to be the full URL
          base_url = "https://celestiaorg.github.io/tokenmetry/"
          if 'repositories' in new_data and isinstance(new_data['repositories'], list):
              for repo_entry in new_data['repositories']:
                  if 'data_file' in repo_entry and isinstance(repo_entry['data_file'], str) and repo_entry['data_file'].startswith('repository_data/'):
                      # Overwrite data_file with the full URL
                      repo_entry['data_file'] = base_url + repo_entry['data_file']
                  # Ensure data_file_full_url is not present if it somehow existed
                  if 'data_file_full_url' in repo_entry:
                      del repo_entry['data_file_full_url']
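          # Example of the rewrite above, using the sample path from `repository_discovery`:
          #   'repository_data/repo_name.json' -> 'https://celestiaorg.github.io/tokenmetry/repository_data/repo_name.json'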
          # Calculate approximate token count of this index.json (meta-index) file itself
          json_content_for_meta_index = json.dumps(new_data, indent=2)
          try:
              from transformers import GPT2TokenizerFast
              tokenizer_for_meta = GPT2TokenizerFast.from_pretrained('gpt2')
              meta_json_token_count = len(tokenizer_for_meta.encode(json_content_for_meta_index))
              new_data['metadata']['usage_instructions']['context_window_warning']['meta_index_file_token_count'] = meta_json_token_count
              new_data['metadata']['usage_instructions']['context_window_warning']['message'] = f'This index.json (meta-index) file itself contains approximately {meta_json_token_count:,} tokens.'
          except Exception:
              # Fall back to a rough characters-per-token estimate if the GPT-2 tokenizer is unavailable
              char_count_meta = len(json_content_for_meta_index)
              estimated_tokens_meta = char_count_meta // 4
              new_data['metadata']['usage_instructions']['context_window_warning']['meta_index_file_token_count'] = estimated_tokens_meta
              new_data['metadata']['usage_instructions']['context_window_warning']['message'] = f'This index.json (meta-index) file itself contains approximately {estimated_tokens_meta:,} tokens (estimated).'

          with open(index_path, 'w') as f:
              json.dump(new_data, f, indent=2)
          EOL
          poetry run python ./add_metadata_script.py
      - name: Update README table
        run: |
          python3 -c "
          import json
          import re
          from datetime import datetime, timezone
          import os

          # This is now the meta-index file
          index_path = '_site/index.json'
          with open(index_path, 'r') as f:
              # data from index.json is the meta-index content
              data = json.load(f)

          table_rows = []
          table_rows.append('| Repository | Total Files | Total Tokens | Go Files | Go Tokens | Markdown Files | Markdown Tokens | Rust Files | Rust Tokens | Solidity Files | Solidity Tokens |')
          table_rows.append('|------------|-------------|--------------|----------|-----------|----------------|-----------------|------------|-------------|----------------|-----------------|')
          # Iterate through repositories listed in the meta-index
          for repo_summary_in_meta in data.get('repositories', []):
              if not repo_summary_in_meta.get('error'):
                  name = repo_summary_in_meta['name']
                  total_files = repo_summary_in_meta['total_files']
                  total_tokens = f\"{repo_summary_in_meta['total_tokens']:,}\"
                  # by_extension data is directly available in the repo_summary_in_meta
                  repo_by_ext = repo_summary_in_meta.get('by_extension', {})
                  go_files = repo_by_ext.get('.go', {}).get('files', 0)
                  go_tokens = f\"{repo_by_ext.get('.go', {}).get('tokens', 0):,}\"
                  md_files = repo_by_ext.get('.md', {}).get('files', 0)
                  md_tokens = f\"{repo_by_ext.get('.md', {}).get('tokens', 0):,}\"
                  rs_files = repo_by_ext.get('.rs', {}).get('files', 0)
                  rs_tokens = f\"{repo_by_ext.get('.rs', {}).get('tokens', 0):,}\"
                  sol_files = repo_by_ext.get('.sol', {}).get('files', 0)
                  sol_tokens = f\"{repo_by_ext.get('.sol', {}).get('tokens', 0):,}\"
                  table_rows.append(f'| {name} | {total_files} | {total_tokens} | {go_files} | {go_tokens} | {md_files} | {md_tokens} | {rs_files} | {rs_tokens} | {sol_files} | {sol_tokens} |')
              else:
                  name = repo_summary_in_meta.get('name', 'Unknown Repo')
                  table_rows.append(f'| {name} | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |')  # Row for errored repos
          # Use overall summary from meta-index for the TOTAL row
          overall_summary = data['summary']  # This is the overall summary from the meta-index
          summary_total_files = overall_summary.get('total_files_across_all_repos', 0)
          summary_total_tokens = overall_summary.get('total_tokens_across_all_repos', 0)
          summary_by_ext = overall_summary.get('by_extension_across_all_repos', {})  # Use the overall by_extension summary
          summary_go_files = summary_by_ext.get('.go', {}).get('files', 0)
          summary_go_tokens = summary_by_ext.get('.go', {}).get('tokens', 0)
          summary_md_files = summary_by_ext.get('.md', {}).get('files', 0)
          summary_md_tokens = summary_by_ext.get('.md', {}).get('tokens', 0)
          summary_rs_files = summary_by_ext.get('.rs', {}).get('files', 0)
          summary_rs_tokens = summary_by_ext.get('.rs', {}).get('tokens', 0)
          summary_sol_files = summary_by_ext.get('.sol', {}).get('files', 0)
          summary_sol_tokens = summary_by_ext.get('.sol', {}).get('tokens', 0)
          table_rows.append(f'| **TOTAL** | **{summary_total_files}** | **{summary_total_tokens:,}** | **{summary_go_files}** | **{summary_go_tokens:,}** | **{summary_md_files}** | **{summary_md_tokens:,}** | **{summary_rs_files}** | **{summary_rs_tokens:,}** | **{summary_sol_files}** | **{summary_sol_tokens:,}** |')

          table_content = '\n'.join(table_rows)
          timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
          table_with_timestamp = f'{table_content}\n\n**Last Updated:** {timestamp}'

          with open('README.md', 'r') as f:
              readme_content = f.read()
          pattern = r'<!-- TOKENMETRY_TABLE_START -->.*?<!-- TOKENMETRY_TABLE_END -->'
          replacement = f'<!-- TOKENMETRY_TABLE_START -->\n{table_with_timestamp}\n<!-- TOKENMETRY_TABLE_END -->'
          updated_readme = re.sub(pattern, replacement, readme_content, flags=re.DOTALL)
          with open('README.md', 'w') as f:
              f.write(updated_readme)
          "
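      # README.md must already contain the <!-- TOKENMETRY_TABLE_START --> and <!-- TOKENMETRY_TABLE_END -->
      # markers; if they are missing, re.sub finds no match and the file is written back unchanged.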
      - name: Commit README updates
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add README.md
          if git diff --staged --quiet; then
            echo "No changes to README.md"
          else
            git commit -m "Update README.md with latest token analysis results [skip ci]"
            git push
          fi
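      # The commit touches only README.md, which is not in the push paths filter above, and the
      # "[skip ci]" marker additionally keeps the push from retriggering this workflow.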
      - name: Setup Pages
        uses: actions/configure-pages@v4
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: './_site'
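  # Everything under _site (index.json, index.html and, presumably, the repository_data/ detail
  # files) is uploaded as the Pages artifact and published by the deploy job below.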
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    needs: analyze-tokens
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4