feat(hooks): add thinking_budget hook for extended thinking management

nmarasoiu · claude · nmarasoiu · commit 6b20e94bb62f · 2025-12-10T10:33:32.000Z
Add a new hook that manages Claude's extended thinking budget_tokens parameter: - Inject default budget_tokens when thinking is enabled but budget is missing - Override budget_tokens when below configurable minimum threshold - Optionally inject thinking configuration when not present - Enforce API constraints (min 1024, budget < max_tokens, temperature=1) - Filter by thinking-capable models (Claude 3.7+, Claude 4) Configuration via hook params or environment variables: - budget_default (THINKING_BUDGET_DEFAULT): Default budget (10000) - budget_min (THINKING_BUDGET_MIN): Minimum threshold (1024) - inject_if_missing (THINKING_INJECT_IF_MISSING): Auto-inject thinking (false) - log_modifications (THINKING_LOG_MODIFICATIONS): Log changes (true) - model_filter (THINKING_MODEL_FILTER): Filter by model (true) Includes 43 comprehensive unit tests covering all functionality. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -232,9 +232,57 @@ general_settings:
 
 See [docs/configuration.md](docs/configuration.md) for more information on how to customize your Claude Code experience using `ccproxy`.
 
-<!-- ## Extended Thinking -->
+## Extended Thinking Budget Hook
 
-<!-- Normally, when you send a message, Claude Code does a simple keyword scan for words/phrases like "think deeply" to determine whether or not to enable thinking, as well the size of the thinking token budget. [Simply including the word "ultrathink](https://claudelog.com/mechanics/ultrathink-plus-plus/) sets the thinking token budget to the maximum of `31999`. -->
+`ccproxy` includes a `thinking_budget` hook that manages Claude's extended thinking `budget_tokens` parameter. This hook can:
+
+- Inject a default `budget_tokens` when thinking is enabled but budget is missing
+- Override `budget_tokens` when it's below a configurable minimum
+- Optionally inject thinking configuration when not present in requests
+
+### Configuration
+
+Add the hook to your `ccproxy.yaml`:
+
+```yaml
+ccproxy:
+  hooks:
+    - ccproxy.hooks.rule_evaluator
+    - ccproxy.hooks.model_router
+    - ccproxy.hooks.forward_oauth
+    # Add thinking budget hook (simple form - uses defaults)
+    - ccproxy.hooks.thinking_budget
+
+    # OR with custom parameters:
+    # - hook: ccproxy.hooks.thinking_budget
+    #   params:
+    #     budget_default: 16000      # Default budget to inject (default: 10000)
+    #     budget_min: 4000           # Override if request budget is below this (default: 1024)
+    #     inject_if_missing: false   # Inject thinking if not present (default: false)
+    #     log_modifications: true    # Log when modifying requests (default: true)
+    #     model_filter: true         # Only apply to thinking-capable models (default: true)
+```
+
+### Environment Variables
+
+Configuration can also be set via environment variables:
+
+- `THINKING_BUDGET_DEFAULT` - Default budget to inject (default: 10000)
+- `THINKING_BUDGET_MIN` - Minimum budget threshold (default: 1024)
+- `THINKING_INJECT_IF_MISSING` - Inject thinking if not present (default: false)
+- `THINKING_LOG_MODIFICATIONS` - Log when modifying requests (default: true)
+- `THINKING_MODEL_FILTER` - Only apply to thinking-capable models (default: true)
+
+Priority: hook params > environment variables > defaults
+
+### Constraints
+
+The hook enforces Anthropic API constraints:
+
+- `budget_tokens` minimum is 1,024 tokens
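+- `budget_tokens` must be less than `max_tokens`; a budget that is too large is lowered to `max_tokens - 1`
+- Thinking requires `temperature` to be 1 (or unset); when the hook injects thinking, it sets `temperature` to 1 if the request specified a different value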
+- Only applies to thinking-capable models (Claude 3.7+, Claude 4) when `model_filter` is enabled
 
 ## Routing Rules
 
diff --git a/src/ccproxy/hooks.py b/src/ccproxy/hooks.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import re
 import threading
 import time
@@ -13,6 +14,18 @@
 # Set up structured logging
 logger = logging.getLogger(__name__)
 
+# Minimum budget_tokens allowed by the Anthropic API for extended thinking.
+# thinking_budget raises its configured budget_min and budget_default up to
+# this floor before using them.
+API_MIN_BUDGET_TOKENS = 1024
+
+# Models that support extended thinking (regex patterns).
+# _is_thinking_capable_model applies each pattern with re.search to the
+# lower-cased model name, so the patterns are unanchored: a match anywhere in
+# the name counts, which tolerates prefixes and version/date suffixes.
+THINKING_CAPABLE_MODELS = [
+    r"claude-3-7",  # claude-3-7-sonnet
+    r"claude-4",  # claude-4-*
+    r"claude-sonnet-4",  # claude-sonnet-4-*
+    r"claude-opus-4",  # claude-opus-4-*
+    r"claude-haiku-4",  # claude-haiku-4-*
+]
+
 # Global storage for request metadata, keyed by litellm_call_id
 # Required because LiteLLM doesn't preserve custom metadata from async_pre_call_hook
 # to logging callbacks - only internal fields like user_id and hidden_params survive.
@@ -429,3 +442,202 @@ def forward_apikey(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kw
         )
 
     return data
+
+
+def _is_thinking_capable_model(model: str | None) -> bool:
+    """Check if the model supports extended thinking.
+
+    Args:
+        model: Model name to check
+
+    Returns:
+        True if the model supports extended thinking, False otherwise
+    """
+    if not model:
+        return False
+    model_lower = model.lower()
+    return any(re.search(pattern, model_lower) for pattern in THINKING_CAPABLE_MODELS)
+
+
+def _get_thinking_config_value(
+    key: str,
+    kwargs: dict[str, Any],
+    default: Any = None,
+) -> Any:
+    """Get configuration value for thinking budget hook from kwargs or environment.
+
+    Priority: kwargs > environment variable > default
+
+    Args:
+        key: Configuration key name
+        kwargs: Hook parameters from ccproxy.yaml
+        default: Default value if not found elsewhere
+
+    Returns:
+        Configuration value
+    """
+    # Check kwargs first (from hook params in ccproxy.yaml)
+    if key in kwargs:
+        return kwargs[key]
+
+    # Check environment variable
+    env_key = f"THINKING_{key.upper()}"
+    env_value = os.environ.get(env_key)
+    if env_value is not None:
+        # Convert string env values to appropriate types
+        if isinstance(default, bool):
+            return env_value.lower() in ("true", "1", "yes")
+        if isinstance(default, int):
+            try:
+                return int(env_value)
+            except ValueError:
+                pass
+        return env_value
+
+    return default
+
+
+def thinking_budget(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
+    """Hook to inject or modify thinking budget_tokens in requests.
+
+    This hook intercepts outgoing requests to the Anthropic API and manages
+    the extended thinking `budget_tokens` parameter. It can:
+    - Inject a default budget_tokens when thinking is enabled but budget is missing
+    - Override budget_tokens when it's below a configurable minimum
+    - Optionally inject thinking configuration when not present
+
+    Constraints enforced:
+    - budget_tokens minimum is 1,024 tokens (API requirement)
+    - budget_tokens must be less than max_tokens
+    - When thinking is enabled, temperature must be 1 (or unset)
+    - Only applies to thinking-capable models (Claude 3.7+, Claude 4)
+
+    Args:
+        data: Request data from LiteLLM
+        user_api_key_dict: User API key dictionary
+        **kwargs: Hook parameters from ccproxy.yaml, including:
+            - budget_default: Default budget to inject (default: 10000)
+            - budget_min: Minimum budget threshold (default: 1024)
+            - inject_if_missing: Whether to inject thinking if not present (default: False)
+            - log_modifications: Whether to log when modifying requests (default: True)
+            - model_filter: Whether to only apply to thinking-capable models (default: True)
+
+    Returns:
+        Modified request data
+
+    Example ccproxy.yaml configuration:
+        hooks:
+          - hook: ccproxy.hooks.thinking_budget
+            params:
+              budget_default: 16000
+              budget_min: 4000
+              inject_if_missing: false
+              log_modifications: true
+    """
+    # Get configuration values
+    budget_default = _get_thinking_config_value("budget_default", kwargs, default=10000)
+    budget_min = _get_thinking_config_value("budget_min", kwargs, default=API_MIN_BUDGET_TOKENS)
+    inject_if_missing = _get_thinking_config_value("inject_if_missing", kwargs, default=False)
+    log_modifications = _get_thinking_config_value("log_modifications", kwargs, default=True)
+    model_filter = _get_thinking_config_value("model_filter", kwargs, default=True)
+
+    # Ensure minimum budget is at least the API minimum
+    budget_min = max(budget_min, API_MIN_BUDGET_TOKENS)
+    budget_default = max(budget_default, API_MIN_BUDGET_TOKENS)
+
+    # Get the request body - check multiple locations where it might be
+    request = data.get("proxy_server_request", {})
+    body = request.get("body", {})
+
+    # The 'thinking' field may be at the top level of data or in the body
+    thinking = data.get("thinking") or body.get("thinking")
+    model = data.get("model") or body.get("model")
+    max_tokens = data.get("max_tokens") or body.get("max_tokens")
+
+    # Check if model supports thinking (if model_filter is enabled)
+    if model_filter and not _is_thinking_capable_model(model):
+        logger.debug(f"Skipping thinking budget hook for non-thinking-capable model: {model}")
+        return data
+
+    modified = False
+    modification_reason = ""
+
+    if thinking is not None:
+        # Thinking is present - check if we need to adjust budget_tokens
+        if isinstance(thinking, dict) and thinking.get("type") == "enabled":
+            current_budget = thinking.get("budget_tokens")
+
+            if current_budget is None:
+                # No budget set - inject default
+                thinking["budget_tokens"] = budget_default
+                modified = True
+                modification_reason = f"injected missing budget_tokens={budget_default}"
+            elif current_budget < budget_min:
+                # Budget below minimum - override
+                thinking["budget_tokens"] = budget_default
+                modified = True
+                modification_reason = (
+                    f"increased budget_tokens from {current_budget} to {budget_default} (below minimum {budget_min})"
+                )
+
+            # Ensure budget_tokens < max_tokens (API constraint)
+            if max_tokens is not None and thinking.get("budget_tokens", 0) >= max_tokens:
+                # Check if max_tokens is too low for any valid budget
+                if max_tokens <= API_MIN_BUDGET_TOKENS:
+                    # max_tokens too low for thinking - log warning
+                    logger.warning(
+                        f"max_tokens={max_tokens} is too low for thinking "
+                        f"(minimum budget_tokens is {API_MIN_BUDGET_TOKENS})"
+                    )
+                else:
+                    # Adjust budget to be less than max_tokens
+                    adjusted_budget = max_tokens - 1
+                    thinking["budget_tokens"] = adjusted_budget
+                    modified = True
+                    modification_reason += f"; adjusted to {adjusted_budget} to be below max_tokens={max_tokens}"
+
+            # Update the data with modified thinking
+            data["thinking"] = thinking
+
+    elif inject_if_missing:
+        # Thinking not present - inject it if configured to do so
+        new_thinking = {
+            "type": "enabled",
+            "budget_tokens": budget_default,
+        }
+
+        # Handle max_tokens constraint for injection
+        if max_tokens is not None:
+            if max_tokens <= API_MIN_BUDGET_TOKENS:
+                # max_tokens too low - skip injection
+                logger.debug(
+                    f"Skipping thinking injection: max_tokens={max_tokens} "
+                    f"is too low (minimum budget_tokens is {API_MIN_BUDGET_TOKENS})"
+                )
+                return data
+            # Adjust budget if needed
+            if budget_default >= max_tokens:
+                new_thinking["budget_tokens"] = max_tokens - 1
+
+        data["thinking"] = new_thinking
+        modified = True
+        modification_reason = f"injected thinking with budget_tokens={new_thinking['budget_tokens']}"
+
+        # When thinking is enabled, temperature must be 1 or unset
+        current_temp = data.get("temperature") or body.get("temperature")
+        if current_temp is not None and current_temp != 1:
+            data["temperature"] = 1
+            modification_reason += "; set temperature=1 (required for thinking)"
+
+    # Log modification if enabled
+    if modified and log_modifications:
+        logger.info(
+            f"[thinking_budget] Modified request for model={model}: {modification_reason}",
+            extra={
+                "event": "thinking_budget_modified",
+                "model": model,
+                "reason": modification_reason,
+            },
+        )
+
+    return data
diff --git a/tests/test_hooks.py b/tests/test_hooks.py