OpenHands
diff --git a/‎.github/run-eval/resolve_model_config.py‎
Lines changed: 24 additions & 0 deletions b/‎.github/run-eval/resolve_model_config.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎.github/workflows/assign-reviews.yml‎
Lines changed: 6 additions & 4 deletions b/‎.github/workflows/assign-reviews.yml‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎.openhands/skills/code-review.md‎
Lines changed: 2 additions & 0 deletions b/‎.openhands/skills/code-review.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.openhands/skills/run-eval.md‎
Lines changed: 66 additions & 0 deletions b/‎.openhands/skills/run-eval.md‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎.openhands/skills/write-behavior-test.md‎
Lines changed: 1 addition & 3 deletions b/‎.openhands/skills/write-behavior-test.md‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎MAINTAINERS‎
Lines changed: 11 additions & 0 deletions b/‎MAINTAINERS‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎openhands-agent-server/openhands/agent_server/bash_router.py‎
Lines changed: 9 additions & 0 deletions b/‎openhands-agent-server/openhands/agent_server/bash_router.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎openhands-agent-server/openhands/agent_server/bash_service.py‎
Lines changed: 6 additions & 0 deletions b/‎openhands-agent-server/openhands/agent_server/bash_service.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎openhands-sdk/openhands/sdk/agent/agent.py‎
Lines changed: 60 additions & 27 deletions b/‎openhands-sdk/openhands/sdk/agent/agent.py‎
Lines changed: 60 additions & 27 deletions
diff --git a/‎openhands-sdk/openhands/sdk/context/condenser/base.py‎
Lines changed: 36 additions & 3 deletions b/‎openhands-sdk/openhands/sdk/context/condenser/base.py‎
Lines changed: 36 additions & 3 deletions
@@ -29,6 +29,25 @@
         "display_name": "Kimi K2 Thinking",
         "llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
     },
+    # https://www.kimi.com/blog/kimi-k2-5.html
+    "kimi-k2.5": {
+        "id": "kimi-k2.5",
+        "display_name": "Kimi K2.5",
+        "llm_config": {
+            "model": "litellm_proxy/moonshot/kimi-k2.5",
+            "temperature": 1.0,
+            "top_p": 0.95,
+        },
+    },
+    # https://www.alibabacloud.com/help/en/model-studio/deep-thinking
+    "qwen3-max-thinking": {
+        "id": "qwen3-max-thinking",
+        "display_name": "Qwen3 Max Thinking",
+        "llm_config": {
+            "model": "litellm_proxy/dashscope/qwen3-max-2026-01-23",
+            "litellm_extra_body": {"enable_thinking": True},
+        },
+    },
     "claude-4.5-opus": {
         "id": "claude-4.5-opus",
         "display_name": "Claude 4.5 Opus",
@@ -95,6 +114,11 @@
             "temperature": 0.0,
         },
     },
+    "glm-4.7": {
+        "id": "glm-4.7",
+        "display_name": "GLM-4.7",
+        "llm_config": {"model": "litellm_proxy/openrouter/z-ai/glm-4.7"},
+    },
 }
 
 
 
@@ -59,10 +59,12 @@ jobs:
                    - Read the issue description and comments
                    - Check if it is a bug report, feature request, or question and add the appropriate label
                    - If it is a bug report and it does not have a priority label
-                     * Find an appropriate maintainer based on the issue topic and recent activity
-                     * Tag them with: "[Automatic Post]: This issue has been waiting for triage. @{maintainer}, could you please take a look and add the
-                appropriate priority label when you have
-                a chance?"
+                     * Read the MAINTAINERS file in the repository root to get the list of maintainers
+                     * Extract all usernames from lines starting with "- @" and join them with spaces, each prefixed with @
+                       (e.g., if the file contains "- @user1" and "- @user2", format as "@user1 @user2")
+                     * Tag ALL maintainers with: "[Automatic Post]: This issue has been waiting for triage. <maintainers>, could you
+                please take a look and add the appropriate priority label when you have a chance?"
+                       (Replace <maintainers> with the formatted list from the previous step)
 
                 # Need Reviewer Action
 
 
@@ -1,4 +1,6 @@
 ---
+name: code-review
+description: Structured code review covering style, readability, and security concerns with actionable feedback. Use when reviewing pull requests or merge requests to identify issues and suggest improvements.
 triggers:
 - /codereview
 ---
 
@@ -0,0 +1,66 @@
+---
+name: run-eval
+description: Trigger and monitor evaluation runs for benchmarks like SWE-bench, GAIA, and others. Use when running evaluations via GitHub Actions or monitoring eval progress through Datadog and kubectl.
+triggers:
+- run eval
+- trigger eval
+- evaluation run
+- swebench eval
+---
+
+# Running Evaluations
+
+## Trigger via GitHub API
+
+```bash
+curl -X POST \
+  -H "Authorization: token $GITHUB_TOKEN" \
+  -H "Accept: application/vnd.github+json" \
+  "https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/run-eval.yml/dispatches" \
+  -d '{
+    "ref": "main",
+    "inputs": {
+      "benchmark": "swebench",
+      "sdk_ref": "main",
+      "eval_limit": "50",
+      "model_ids": "claude-sonnet-4-5-20250929",
+      "reason": "Description of eval run",
+      "benchmarks_branch": "main"
+    }
+  }'
+```
+
+**Key parameters:**
+- `benchmark`: `swebench`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`
+- `eval_limit`: `1`, `50`, `100`, `200`, `500`
+- `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models
+- `benchmarks_branch`: Use feature branch from the benchmarks repo to test benchmark changes before merging
+
+**Note:** When running a full eval, you must select an `eval_limit` that is greater than or equal to the actual number of instances in the benchmark. If you specify a smaller limit, only that many instances will be evaluated (partial eval).
+
+## Monitoring
+
+**Datadog script** (requires the `OpenHands/evaluation` repo and the `DD_API_KEY`, `DD_APP_KEY`, and `DD_SITE` environment variables to be set):
+```bash
+DD_API_KEY=$DD_API_KEY DD_APP_KEY=$DD_APP_KEY DD_SITE=$DD_SITE \
+  python scripts/analyze_evals.py --job-prefix <EVAL_RUN_ID> --time-range 60
+# EVAL_RUN_ID format: typically the workflow run ID from GitHub Actions
+```
+
+**kubectl** (for users with cluster access - the agent does not have kubectl access):
+```bash
+kubectl logs -f job/eval-eval-<RUN_ID>-<MODEL_SLUG> -n evaluation-jobs
+```
+
+## Common Errors
+
+| Error | Cause | Fix |
+|-------|-------|-----|
+| `503 Service Unavailable` | Infrastructure overloaded | Ask user to stop some evaluation runs |
+| `429 Too Many Requests` | Rate limiting | Wait or reduce concurrency |
+| `failed after 3 retries` | Instance failures | Check Datadog logs for root cause |
+
+## Limits
+
+- Max 256 parallel runtimes (jobs will queue if this limit is exceeded)
+- Full evals typically take 1-3 hours depending on benchmark size
@@ -1,8 +1,6 @@
 ---
 name: write-behavior-test
-type: knowledge
-version: 1.0.0
-agent: CodeActAgent
+description: Guide for writing behavior tests that verify agents follow system message guidelines and avoid undesirable behaviors. Use when creating integration tests for agent behavior validation.
 triggers:
 - /write_behavior_test
 ---
 
@@ -0,0 +1,11 @@
+# Repository Maintainers
+#
+# Format: Each maintainer on a new line starting with "- @username"
+# This file is read by .github/workflows/assign-reviews.yml for automated triage
+#
+
+The following people are maintainers of this repository and are responsible for triage and review:
+
+- @xingyaoww
+- @neubig
+- @enyst
@@ -36,6 +36,14 @@ async def search_bash_events(
     command_id__eq: UUID | None = None,
     timestamp__gte: datetime | None = None,
     timestamp__lt: datetime | None = None,
+    order__gt: Annotated[
+        int | None,
+        Query(
+            title="Filter to events with order greater than this value",
+            description="Only returns BashOutput events with order > this value. "
+            "Useful for polling to fetch only new events since the last poll.",
+        ),
+    ] = None,
     sort_order: BashEventSortOrder = BashEventSortOrder.TIMESTAMP,
     page_id: Annotated[
         str | None,
@@ -55,6 +63,7 @@ async def search_bash_events(
         command_id__eq=command_id__eq,
         timestamp__gte=timestamp__gte,
         timestamp__lt=timestamp__lt,
+        order__gt=order__gt,
         sort_order=sort_order,
         page_id=page_id,
         limit=limit,
 
@@ -104,6 +104,7 @@ async def search_bash_events(
         command_id__eq: UUID | None = None,
         timestamp__gte: datetime | None = None,
         timestamp__lt: datetime | None = None,
+        order__gt: int | None = None,
         sort_order: BashEventSortOrder = BashEventSortOrder.TIMESTAMP,
         page_id: str | None = None,
         limit: int = 100,
@@ -168,6 +169,11 @@ async def search_bash_events(
         for file_path in page_files:
             event = self._load_event_from_file(file_path)
             if event is not None:
+                # Filter by order if specified. Only BashOutput events carry an
+                # order; events without one are kept regardless of the filter.
+                if order__gt is not None:
+                    event_order = getattr(event, "order", None)
+                    if event_order is not None and event_order <= order__gt:
+                        continue
                 page_events.append(event)
 
         return BashEventPage(items=page_events, next_page_id=next_page_id)
 
@@ -67,6 +67,10 @@
 logger = get_logger(__name__)
 maybe_init_laminar()
 
+# Maximum number of events to scan during init_state defensive checks.
+# SystemPromptEvent is expected at index 0 or 1; the window extends one event
+# further (index 2) so a user message preceding the system prompt is detected.
+INIT_STATE_PREFIX_SCAN_WINDOW = 3
+
 
 class Agent(AgentBase):
     """Main agent implementation for OpenHands.
@@ -102,53 +106,82 @@ def init_state(
         state: ConversationState,
         on_event: ConversationCallbackType,
     ) -> None:
+        """Initialize conversation state.
+
+        Invariants enforced by this method:
+        - If a SystemPromptEvent is already present, it must be within the first 3
+          events (index 0 or 1 in practice; index 2 is included in the scan window
+          to detect a user message appearing before the system prompt).
+        - A user MessageEvent should not appear before the SystemPromptEvent.
+
+        These invariants keep event ordering predictable for downstream components
+        (condenser, UI, etc.) and also prevent accidentally materializing the full
+        event history during initialization.
+        """
         super().init_state(state, on_event=on_event)
-        # TODO(openhands): we should add test to test this init_state will actually
-        # modify state in-place
 
         # Defensive check: Analyze state to detect unexpected initialization scenarios
         # These checks help diagnose issues related to lazy loading and event ordering
         # See: https://github.com/OpenHands/software-agent-sdk/issues/1785
-        events = list(state.events)
-        has_system_prompt = any(isinstance(e, SystemPromptEvent) for e in events)
+        #
+        # NOTE: len() is O(1) for EventLog (file-backed implementation).
+        event_count = len(state.events)
+
+        # NOTE: state.events is intentionally an EventsListBase (Sequence-like), not
+        # a plain list. Avoid materializing the full history via list(state.events)
+        # here (conversations can reach 30k+ events).
+        #
+        # Invariant: when init_state is called, SystemPromptEvent (if present) must be
+        # at index 0 or 1.
+        #
+        # Rationale:
+        # - Local conversations start empty and init_state is responsible for adding
+        #   the SystemPromptEvent as the first event.
+        # - Remote conversations may receive an initial ConversationStateUpdateEvent
+        #   from the agent-server immediately after subscription. In a typical remote
+        #   session prefix you may see:
+        #     [ConversationStateUpdateEvent, SystemPromptEvent, MessageEvent, ...]
+        #
+        # We intentionally only inspect the first few events (cheap for both local and
+        # remote) to enforce this invariant.
+        prefix_events = state.events[:INIT_STATE_PREFIX_SCAN_WINDOW]
+
+        has_system_prompt = any(isinstance(e, SystemPromptEvent) for e in prefix_events)
         has_user_message = any(
-            isinstance(e, MessageEvent) and e.source == "user" for e in events
+            isinstance(e, MessageEvent) and e.source == "user" for e in prefix_events
         )
-        has_any_llm_event = any(isinstance(e, LLMConvertibleEvent) for e in events)
-
         # Log state for debugging initialization order issues
         logger.debug(
             f"init_state called: conversation_id={state.id}, "
-            f"event_count={len(events)}, "
+            f"event_count={event_count}, "
             f"has_system_prompt={has_system_prompt}, "
-            f"has_user_message={has_user_message}, "
-            f"has_any_llm_event={has_any_llm_event}"
+            f"has_user_message={has_user_message}"
         )
 
         if has_system_prompt:
-            # SystemPromptEvent already exists - this is unexpected during normal flow
-            # but could happen in persistence/resume scenarios
-            logger.warning(
-                f"init_state called but SystemPromptEvent already exists. "
-                f"conversation_id={state.id}, event_count={len(events)}. "
-                f"This may indicate double initialization or a resume scenario."
+            # Restoring/resuming conversations is normal: a system prompt already
+            # present means this conversation was initialized previously.
+            logger.debug(
+                "init_state: SystemPromptEvent already present; skipping init. "
+                f"conversation_id={state.id}, event_count={event_count}."
             )
             return
 
-        # Assert: If there are user messages but no system prompt, something is wrong
-        # The system prompt should always be added before any user messages
+        # Assert: A user message should never appear before the system prompt.
+        #
+        # NOTE: This is a best-effort check based on the first few events only.
+        # Remote conversations can include a ConversationStateUpdateEvent near the
+        # start, so we scan a small prefix window.
         if has_user_message:
-            event_types = [type(e).__name__ for e in events]
+            event_types = [type(e).__name__ for e in prefix_events]
             logger.error(
-                f"init_state: User message exists without SystemPromptEvent! "
-                f"conversation_id={state.id}, events={event_types}"
+                f"init_state: User message found in prefix before SystemPromptEvent! "
+                f"conversation_id={state.id}, prefix_events={event_types}"
             )
-            assert not has_user_message, (
-                f"Unexpected state: User message exists before SystemPromptEvent. "
-                f"conversation_id={state.id}, event_count={len(events)}, "
-                f"event_types={event_types}. "
-                f"This indicates an initialization order bug - init_state should be "
-                f"called before any user messages are added to the conversation."
+            raise AssertionError(
+                "Unexpected state: user message exists before SystemPromptEvent. "
+                f"conversation_id={state.id}, event_count={event_count}, "
+                f"prefix_event_types={event_types}."
             )
 
         # Prepare system message
 
@@ -103,6 +103,23 @@ class RollingCondenser(PipelinableCondenserBase, ABC):
     `View` to be passed to the LLM.
     """
 
+    def hard_context_reset(
+        self,
+        view: View,  # noqa: ARG002
+        agent_llm: LLM | None = None,  # noqa: ARG002
+    ) -> Condensation | None:
+        """Perform a hard context reset, if supported by the condenser.
+
+        By default, rolling condensers do not support hard context resets. Override this
+        method to implement hard context reset logic by returning a `Condensation`
+        object.
+
+        This method is invoked when both of the following hold:
+        - A HARD condensation requirement is triggered (e.g., by user request)
+        - The condenser raises a NoCondensationAvailableException
+        """
+        return None
+
     @abstractmethod
     def condensation_requirement(
         self, view: View, agent_llm: LLM | None = None
@@ -142,9 +159,25 @@ def condense(self, view: View, agent_llm: LLM | None = None) -> View | Condensat
                     # we do so immediately.
                     return view
 
-                # Otherwise re-raise the exception.
-                else:
-                    raise e
+                elif request == CondensationRequirement.HARD:
+                    # The agent has found itself in a situation where it cannot proceed
+                    # without condensation, but the condenser cannot provide one. We'll
+                    # try to recover from this situation by performing a hard context
+                    # reset, if supported by the condenser.
+                    try:
+                        hard_reset_condensation = self.hard_context_reset(
+                            view, agent_llm=agent_llm
+                        )
+                        if hard_reset_condensation is not None:
+                            return hard_reset_condensation
+
+                    # And if something goes wrong with the hard reset make sure we keep
+                    # both errors in the stack
+                    except Exception as hard_reset_exception:
+                        raise hard_reset_exception from e
+
+                # In all other situations re-raise the exception.
+                raise e
 
         # Otherwise we're safe to just return the view.
         else: