|
1 | 1 | import logging |
| 2 | +import os |
2 | 3 | import re |
3 | 4 | import threading |
4 | 5 | import time |
|
# Module-level logger used for structured logging
logger = logging.getLogger(__name__)

# Smallest budget_tokens value the Anthropic API accepts for extended thinking
API_MIN_BUDGET_TOKENS = 1024

# Regex patterns naming the model families that support extended thinking
THINKING_CAPABLE_MODELS = [
    r"claude-3-7",  # claude-3-7-sonnet
    r"claude-4",  # claude-4-*
    r"claude-sonnet-4",  # claude-sonnet-4-*
    r"claude-opus-4",  # claude-opus-4-*
    r"claude-haiku-4",  # claude-haiku-4-*
]
| 28 | + |
16 | 29 | # Global storage for request metadata, keyed by litellm_call_id |
17 | 30 | # Required because LiteLLM doesn't preserve custom metadata from async_pre_call_hook |
18 | 31 | # to logging callbacks - only internal fields like user_id and hidden_params survive. |
@@ -429,3 +442,202 @@ def forward_apikey(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kw |
429 | 442 | ) |
430 | 443 |
|
431 | 444 | return data |
| 445 | + |
| 446 | + |
| 447 | +def _is_thinking_capable_model(model: str | None) -> bool: |
| 448 | + """Check if the model supports extended thinking. |
| 449 | +
|
| 450 | + Args: |
| 451 | + model: Model name to check |
| 452 | +
|
| 453 | + Returns: |
| 454 | + True if the model supports extended thinking, False otherwise |
| 455 | + """ |
| 456 | + if not model: |
| 457 | + return False |
| 458 | + model_lower = model.lower() |
| 459 | + return any(re.search(pattern, model_lower) for pattern in THINKING_CAPABLE_MODELS) |
| 460 | + |
| 461 | + |
def _get_thinking_config_value(
    key: str,
    kwargs: dict[str, Any],
    default: Any = None,
) -> Any:
    """Get configuration value for thinking budget hook from kwargs or environment.

    Priority: kwargs > environment variable > default

    The environment variable is named ``THINKING_<KEY>`` (key upper-cased).
    Its string value is converted according to the type of ``default``:

    - bool default: ``"true"``, ``"1"`` and ``"yes"`` (case-insensitive,
      surrounding whitespace ignored) are True, anything else is False
    - int default: parsed with ``int()``; an unparsable value is logged and
      ``default`` is returned instead, so callers always get an int back
    - any other default: the raw string is returned unchanged

    Args:
        key: Configuration key name
        kwargs: Hook parameters from ccproxy.yaml
        default: Default value if not found elsewhere

    Returns:
        Configuration value
    """
    # Hook params from ccproxy.yaml take precedence and are returned as-is
    if key in kwargs:
        return kwargs[key]

    env_key = f"THINKING_{key.upper()}"
    env_value = os.environ.get(env_key)
    if env_value is None:
        return default

    # bool must be tested before int because bool is a subclass of int
    if isinstance(default, bool):
        return env_value.strip().lower() in ("true", "1", "yes")

    if isinstance(default, int):
        try:
            return int(env_value)
        except ValueError:
            # Returning the raw string here would break numeric comparisons
            # in the caller, so fall back to the typed default instead.
            logging.getLogger(__name__).warning(
                "Ignoring non-integer value %r for %s; using default %r",
                env_value,
                env_key,
                default,
            )
            return default

    return env_value
| 498 | + |
| 499 | + |
def thinking_budget(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
    """Hook to inject or modify thinking budget_tokens in requests.

    This hook intercepts outgoing requests to the Anthropic API and manages
    the extended thinking `budget_tokens` parameter. It can:
    - Inject a default budget_tokens when thinking is enabled but budget is missing
    - Override budget_tokens when it's below a configurable minimum
    - Optionally inject thinking configuration when not present

    Constraints enforced:
    - budget_tokens minimum is 1,024 tokens (API requirement)
    - budget_tokens is clamped to max_tokens - 1 when it would reach max_tokens
    - When this hook injects thinking, a temperature other than 1 (including 0)
      is reset to 1
    - Only applies to thinking-capable models (Claude 3.7+, Claude 4)

    Requests whose ``thinking`` value is present but is not a dict with
    ``type == "enabled"`` are passed through unchanged.

    Args:
        data: Request data from LiteLLM
        user_api_key_dict: User API key dictionary (not used by this hook)
        **kwargs: Hook parameters from ccproxy.yaml, including:
            - budget_default: Default budget to inject (default: 10000);
              raised to budget_min when configured lower
            - budget_min: Minimum budget threshold (default: 1024)
            - inject_if_missing: Whether to inject thinking if not present (default: False)
            - log_modifications: Whether to log when modifying requests (default: True)
            - model_filter: Whether to only apply to thinking-capable models (default: True)

    Returns:
        Modified request data (the same ``data`` dict, updated in place)

    Example ccproxy.yaml configuration:
        hooks:
          - hook: ccproxy.hooks.thinking_budget
            params:
              budget_default: 16000
              budget_min: 4000
              inject_if_missing: false
              log_modifications: true
    """
    # Get configuration values (hook params > THINKING_* env vars > defaults)
    budget_default = _get_thinking_config_value("budget_default", kwargs, default=10000)
    budget_min = _get_thinking_config_value("budget_min", kwargs, default=API_MIN_BUDGET_TOKENS)
    inject_if_missing = _get_thinking_config_value("inject_if_missing", kwargs, default=False)
    log_modifications = _get_thinking_config_value("log_modifications", kwargs, default=True)
    model_filter = _get_thinking_config_value("model_filter", kwargs, default=True)

    # The configured minimum can never undercut the API minimum, and the
    # default can never undercut the minimum - otherwise "raising" a budget
    # to the default could leave it below the threshold that triggered it.
    budget_min = max(budget_min, API_MIN_BUDGET_TOKENS)
    budget_default = max(budget_default, budget_min)

    # Get the request body; tolerate keys that are present but None
    request = data.get("proxy_server_request") or {}
    body = request.get("body") or {}
    if not isinstance(body, dict):
        body = {}

    # The 'thinking' field may be at the top level of data or in the body
    thinking = data.get("thinking") or body.get("thinking")
    model = data.get("model") or body.get("model")
    max_tokens = data.get("max_tokens") or body.get("max_tokens")

    # Check if model supports thinking (if model_filter is enabled)
    if model_filter and not _is_thinking_capable_model(model):
        logger.debug("Skipping thinking budget hook for non-thinking-capable model: %s", model)
        return data

    # Each change made to the request is recorded here; a non-empty list
    # means the request was modified.
    reasons: list[str] = []

    if thinking is not None:
        # Thinking is present - check if we need to adjust budget_tokens
        if isinstance(thinking, dict) and thinking.get("type") == "enabled":
            current_budget = thinking.get("budget_tokens")

            if current_budget is None:
                # No budget set - inject default
                thinking["budget_tokens"] = budget_default
                reasons.append(f"injected missing budget_tokens={budget_default}")
            elif current_budget < budget_min:
                # Budget below minimum - override
                thinking["budget_tokens"] = budget_default
                reasons.append(
                    f"increased budget_tokens from {current_budget} to {budget_default} (below minimum {budget_min})"
                )

            # Ensure budget_tokens < max_tokens (API constraint)
            if max_tokens is not None and thinking["budget_tokens"] >= max_tokens:
                if max_tokens <= API_MIN_BUDGET_TOKENS:
                    # No valid budget fits under this max_tokens; leave the
                    # request untouched and surface the problem in the log.
                    logger.warning(
                        "max_tokens=%s is too low for thinking (minimum budget_tokens is %s)",
                        max_tokens,
                        API_MIN_BUDGET_TOKENS,
                    )
                else:
                    # max_tokens > API minimum here, so max_tokens - 1 is
                    # still a valid budget.
                    adjusted_budget = max_tokens - 1
                    thinking["budget_tokens"] = adjusted_budget
                    reasons.append(f"adjusted to {adjusted_budget} to be below max_tokens={max_tokens}")

            # Update the data with modified thinking (it may have come from the body)
            data["thinking"] = thinking

    elif inject_if_missing:
        # Thinking not present - inject it if configured to do so
        injected_budget = budget_default

        # Handle max_tokens constraint for injection
        if max_tokens is not None:
            if max_tokens <= API_MIN_BUDGET_TOKENS:
                # max_tokens too low - skip injection
                logger.debug(
                    "Skipping thinking injection: max_tokens=%s is too low (minimum budget_tokens is %s)",
                    max_tokens,
                    API_MIN_BUDGET_TOKENS,
                )
                return data
            if injected_budget >= max_tokens:
                injected_budget = max_tokens - 1

        data["thinking"] = {
            "type": "enabled",
            "budget_tokens": injected_budget,
        }
        reasons.append(f"injected thinking with budget_tokens={injected_budget}")

        # Thinking was just enabled by this hook, so temperature must be 1 or
        # unset. Compare against None explicitly: temperature=0 is falsy but
        # is exactly the kind of value that has to be corrected.
        current_temp = data.get("temperature")
        if current_temp is None:
            current_temp = body.get("temperature")
        if current_temp is not None and current_temp != 1:
            data["temperature"] = 1
            reasons.append("set temperature=1 (required for thinking)")

    # Log modification if enabled
    if reasons and log_modifications:
        modification_reason = "; ".join(reasons)
        logger.info(
            "[thinking_budget] Modified request for model=%s: %s",
            model,
            modification_reason,
            extra={
                "event": "thinking_budget_modified",
                "model": model,
                "reason": modification_reason,
            },
        )

    return data
0 commit comments