Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: astral-sh/setup-uv@v6
- name: Run ruff lint
working-directory: PythonScripts
run: uv run ruff check audit_translations/
- name: Run ruff format check
working-directory: PythonScripts
run: uv run ruff format --check audit_translations/
- name: Run tests
working-directory: PythonScripts
run: uv run pytest
6 changes: 1 addition & 5 deletions PythonScripts/audit_translations/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,9 @@ uv run --project PythonScripts audit-translations --list
* `--list`: Displays all available languages.
* Region variants are shown as `lang-region` (e.g., `zz-aa`) based on subdirectories under `Rules/Languages/<lang>`.
* `--file`: Audits a single specific file instead of the whole directory.
* `--format`: Output format (`rich`, `jsonl`). `--output` is honored only for `jsonl`; rich output always prints to the console.
* `--rules-dir`: Override the Rules/Languages directory path.
* `--only`: Filter issue types (comma-separated): `missing`, `untranslated`, `extra`, `diffs`, `all`.
* `--verbose`: Show detailed output including English/translated snippets for rule differences (only affects rich format; default shows summary only).
* `--verbose`: Show detailed output including English/translated snippets for rule differences.
* **Summary Stats:** Provides a statistical summary after every run.

**Examples:**
Expand All @@ -92,9 +91,6 @@ uv run audit-translations de
# Audit only a specific file
uv run audit-translations es --file SharedRules/default.yaml

# Produce JSONL output for automation or AI workflows
uv run audit-translations es --format jsonl --output es-issues.jsonl

# Audit a regional variant (merges Rules/Languages/de and Rules/Languages/de/CH)
uv run audit-translations de-CH

Expand Down
2 changes: 1 addition & 1 deletion PythonScripts/audit_translations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sys

sys.stdout.reconfigure(encoding="utf-8")
from .cli import main
from .cli import main # noqa: E402

__all__ = [
"main",
Expand Down
107 changes: 49 additions & 58 deletions PythonScripts/audit_translations/auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,31 @@
and for performing full language audits.
"""

import json
import sys
from pathlib import Path
from typing import TextIO

from rich.panel import Panel
from rich.table import Table

from .dataclasses import RuleInfo, ComparisonResult
from .parsers import parse_yaml_file, diff_rules
from .renderer import collect_issues, console, print_warnings
from .dataclasses import ComparisonResult, RuleInfo
from .parsers import diff_rules, parse_yaml_file
from .renderer import console, print_warnings

# Re-export console so existing `from .auditor import console` callers keep working.
__all__ = ["console"]

GREEN_FILE_COUNT_THRESHOLD = 7
YELLOW_FILE_COUNT_THRESHOLD = 4


def file_count_color(file_count: int) -> str:
    """Pick the display color for a count of translated YAML files.

    Returns "green" when ``file_count`` is at least
    ``GREEN_FILE_COUNT_THRESHOLD``, "yellow" when it is at least
    ``YELLOW_FILE_COUNT_THRESHOLD``, and "red" otherwise.
    """
    # Tiers are listed from the highest floor down; the first floor reached wins.
    tiers = (
        (GREEN_FILE_COUNT_THRESHOLD, "green"),
        (YELLOW_FILE_COUNT_THRESHOLD, "yellow"),
    )
    return next((shade for floor, shade in tiers if file_count >= floor), "red")


def split_language_into_base_and_region(language: str) -> tuple[str, str | None]:
"""Split a language code into base and optional region."""
Expand Down Expand Up @@ -148,8 +158,6 @@ def merge_rules(base_rules: list[RuleInfo], region_rules: list[RuleInfo]) -> lis
def audit_language(
language: str,
specific_file: str | None = None,
output_format: str = "rich",
output_path: str | None = None,
rules_dir: str | None = None,
issue_filter: set[str] | None = None,
verbose: bool = False,
Expand Down Expand Up @@ -178,15 +186,10 @@ def audit_language(
# Get list of files to audit
files = [specific_file] if specific_file else get_yaml_files(english_dir, english_region_dir)

if output_format == "rich":
# Print header
console.print(Panel(f"MathCAT Translation Audit: {language.upper()}", style="bold cyan"))
console.print(f"\n [dim]Comparing against English (en) reference files[/]")
console.print(f" [dim]Files to check: {len(files)}[/]")

out_stream: TextIO = sys.stdout
if output_path:
out_stream = open(output_path, "w", encoding="utf-8", newline="")
# Print header
console.print(Panel(f"MathCAT Translation Audit: {language.upper()}", style="bold cyan"))
console.print("\n [dim]Comparing against English (en) reference files[/]")
console.print(f" [dim]Files to check: {len(files)}[/]")

total_issues = 0
total_missing = 0
Expand Down Expand Up @@ -214,52 +217,39 @@ def audit_language(
str(english_region_path) if english_region_path and english_region_path.exists() else None,
)

if output_format == "rich":
if result.has_issues:
issues = print_warnings(result, file_name, verbose, language)
if issues > 0:
files_with_issues += 1
total_issues += issues
else:
files_ok += 1
else:
issues_list = collect_issues(result, file_name, language)
for issue in issues_list:
out_stream.write(json.dumps(issue, ensure_ascii=False) + "\n")
if issues_list:
if result.has_issues:
issues = print_warnings(result, file_name, verbose, language)
if issues > 0:
files_with_issues += 1
total_issues += len(issues_list)
else:
files_ok += 1
total_issues += issues
else:
files_ok += 1

total_missing += len(result.missing_rules)
total_untranslated += sum(len(entries) for _, entries in result.untranslated_text)
total_extra += len(result.extra_rules)
total_differences += len(result.rule_differences)

if output_format == "rich":
# Summary
table = Table(title="SUMMARY", title_style="bold", box=None, show_header=False, padding=(0, 2))
table.add_column(width=30)
table.add_column()
for label, value, color in [
("Files checked", len(files), None),
("Files with issues", files_with_issues, "yellow" if files_with_issues else "green"),
("Files OK", files_ok, "green" if files_ok else None),
("Missing rules", total_missing, "red" if total_missing else "green"),
("Untranslated text", total_untranslated, "yellow" if total_untranslated else "green"),
("Rule differences", total_differences, "magenta" if total_differences else "green"),
("Extra rules", total_extra, "blue" if total_extra else None),
]:
table.add_row(label, f"[{color}]{value}[/]" if color else str(value))
console.print(Panel(table, style="cyan"))

if output_path:
out_stream.close()
# Summary
table = Table(title="SUMMARY", title_style="bold", box=None, show_header=False, padding=(0, 2))
table.add_column(width=30)
table.add_column()
for label, value, color in [
("Files checked", len(files), None),
("Files with issues", files_with_issues, "yellow" if files_with_issues else "green"),
("Files OK", files_ok, "green" if files_ok else None),
("Missing rules", total_missing, "red" if total_missing else "green"),
("Untranslated text", total_untranslated, "yellow" if total_untranslated else "green"),
("Rule differences", total_differences, "magenta" if total_differences else "green"),
("Extra rules", total_extra, "blue" if total_extra else None),
]:
table.add_row(label, f"[{color}]{value}[/]" if color else str(value))
console.print(Panel(table, style="cyan"))

return total_issues


def list_languages(rules_dir: str | None = None):
def list_languages(rules_dir: str | None = None) -> None:
"""List available languages for auditing"""
console.print(Panel("Available Languages", style="bold cyan"))

Expand All @@ -272,15 +262,16 @@ def list_languages(rules_dir: str | None = None):
if not lang_dir.is_dir() or lang_dir.name == "en":
continue
base_count = len(get_yaml_files(lang_dir))
color = "green" if base_count >= 7 else "yellow" if base_count >= 4 else "red"
color = file_count_color(base_count)
table.add_row(lang_dir.name, f"[{color}]{base_count}[/] files")

for region_dir in sorted(lang_dir.iterdir()):
if region_dir.is_dir():
code = f"{lang_dir.name}-{region_dir.name}"
count = len(get_yaml_files(lang_dir, region_dir))
region_color = "green" if count >= 7 else "yellow" if count >= 4 else "red"
table.add_row(code, f"[{region_color}]{count}[/] files")
if not region_dir.is_dir() or region_dir.name.lower() == "sharedrules":
continue
code = f"{lang_dir.name}-{region_dir.name}"
count = len(get_yaml_files(lang_dir, region_dir))
region_color = file_count_color(count)
table.add_row(code, f"[{region_color}]{count}[/] files")

console.print(table)
console.print("\n [dim]Reference: en (English) - base translation[/]\n")
15 changes: 3 additions & 12 deletions PythonScripts/audit_translations/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
import argparse
import sys

from .auditor import audit_language, list_languages, console
from .auditor import audit_language, console, list_languages


def main():
def main() -> None:
"""Main entry point for the audit tool"""

parser = argparse.ArgumentParser(
Expand All @@ -28,21 +28,14 @@ def main():
parser.add_argument("--file", dest="specific_file", help="Audit only a specific file (e.g., 'SharedRules/default.yaml')")
parser.add_argument("--list", action="store_true", help="List available languages")
parser.add_argument("--rules-dir", help="Override Rules/Languages directory path")
parser.add_argument(
"--format",
choices=["rich", "jsonl"],
default="rich",
help="Output format (default: rich)",
)
parser.add_argument("--output", help="Write output to a file instead of stdout")
parser.add_argument(
"--only",
help="Comma-separated issue types: missing, untranslated, extra, diffs, all",
)
parser.add_argument(
"--verbose",
action="store_true",
help="Show detailed output including rule snippets (only affects rich format)",
help="Show detailed output including rule snippets",
)

args = parser.parse_args()
Expand All @@ -68,8 +61,6 @@ def main():
audit_language(
args.language,
args.specific_file,
args.format,
args.output,
args.rules_dir,
issue_filter,
args.verbose,
Expand Down
28 changes: 25 additions & 3 deletions PythonScripts/audit_translations/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,28 @@
"""

from dataclasses import dataclass, field
from enum import StrEnum
from typing import Any


class IssueType(StrEnum):
    """Top-level issue categories used by the audit renderer.

    Because this is a ``StrEnum``, every member is also a ``str`` that
    compares equal to its value (e.g. ``IssueType.MISSING_RULE == "missing_rule"``).
    """

    # NOTE(review): these four categories appear to mirror the per-file result
    # buckets (missing rules, untranslated text, rule differences, extra rules)
    # — confirm against the renderer.
    MISSING_RULE = "missing_rule"
    UNTRANSLATED_TEXT = "untranslated_text"
    RULE_DIFFERENCE = "rule_difference"
    EXTRA_RULE = "extra_rule"


class DiffType(StrEnum):
    """Rule-difference subcategories used for fine-grained diagnostics.

    Because this is a ``StrEnum``, every member is also a ``str`` that
    compares equal to its value, so checks written against the plain
    strings (e.g. ``diff_type == "structure"``) hold for members too.
    """

    MATCH = "match"  # `match` XPath differs between English and translation.
    CONDITION = "condition"  # `if` / `test` condition expressions differ.
    VARIABLES = "variables"  # Variable names defined in `variables` differ.
    STRUCTURE = "structure"  # Control-flow block shape/order differs (if/then/else/with/replace).


@dataclass
class RuleInfo:
"""
Expand All @@ -29,8 +48,7 @@ class RuleInfo:
Parsed YAML node for the rule; used for structural diffs.
untranslated_entries : list[tuple[str, str, int | None]]
List of (key, text, line) entries extracted from lowercase translation keys.
This drives per-issue JSONL output so each untranslated string can report
the specific YAML line number where it appears.
This preserves exact text fragments and YAML line numbers for diagnostics.
line_map : dict[str, list[int]]
Mapping of element type to line numbers for rule components like match,
conditions, variables, and structural tokens. This is used to point
Expand Down Expand Up @@ -64,11 +82,15 @@ class RuleDifference:

english_rule: RuleInfo
translated_rule: RuleInfo
diff_type: str # 'match', 'condition', 'structure', 'variables'
diff_type: DiffType
description: str
english_snippet: str
translated_snippet: str

def __post_init__(self) -> None:
    """Normalize ``diff_type`` to a ``DiffType`` member after dataclass init.

    Lets callers construct the dataclass with a plain string such as
    ``"match"``. A string that is not a valid ``DiffType`` value raises
    ``ValueError`` from the enum lookup.
    """
    raw_kind = self.diff_type
    if not isinstance(raw_kind, str):
        return
    # DiffType is a StrEnum, so existing members are str instances too;
    # looking a member up by itself returns that same member unchanged.
    self.diff_type = DiffType(raw_kind)


@dataclass
class ComparisonResult:
Expand Down
20 changes: 8 additions & 12 deletions PythonScripts/audit_translations/line_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,14 @@
Maps rule diff types and structure tokens to precise YAML source line numbers.
"""

from .dataclasses import RuleInfo, RuleDifference
from .dataclasses import DiffType, RuleDifference, RuleInfo
from .parsers import extract_structure_elements


def _get_line_map_lines(rule: RuleInfo, kind: str, token: str | None = None) -> list[int]:
def _get_line_map_lines(rule: RuleInfo, kind: DiffType | str, token: str | None = None) -> list[int]:
"""Return the line-number list for a given element kind from the rule's line map."""
if kind == "match":
return rule.line_map.get("match", [])
if kind == "condition":
return rule.line_map.get("condition", [])
if kind == "variables":
return rule.line_map.get("variables", [])
if kind in ("match", "condition", "variables"):
return rule.line_map.get(kind, [])
if kind == "structure" and token:
return rule.line_map.get(f"structure:{token.rstrip(':')}", [])
return []
Expand Down Expand Up @@ -44,7 +40,7 @@ def first_structure_mismatch(

def resolve_issue_line_at_position(
rule: RuleInfo,
kind: str,
kind: DiffType | str,
token: str | None = None,
position: int = 0,
) -> int | None:
Expand All @@ -64,7 +60,7 @@ def resolve_issue_line_at_position(
return lines[position] if position < len(lines) else None


def resolve_issue_line(rule: RuleInfo, kind: str, token: str | None = None) -> int | None:
def resolve_issue_line(rule: RuleInfo, kind: DiffType | str, token: str | None = None) -> int | None:
"""
Resolve the line number for an issue within a rule.

Expand Down Expand Up @@ -150,8 +146,8 @@ def resolve_diff_lines(diff: RuleDifference) -> tuple[int | None, int | None] |
Resolve issue line numbers for a rule difference.

Returns (line_en, line_tr), or None only for unresolvable structure diffs.
This is the single entry point used by both collect_issues and print_warnings
to avoid duplicating the structure vs non-structure branching logic.
This is the single entry point used by the renderer to avoid duplicating
the structure vs non-structure branching logic.
"""
if diff.diff_type == "structure":
return resolve_structure_issue_lines(diff)
Expand Down
Loading