Skip to content

Commit 3a16af2

Browse files
authored
Merge branch 'main' into gpt-5-codex-system-prompt
2 parents 1f512a5 + 8296a7f commit 3a16af2

File tree

31 files changed

+2037
-232
lines changed

31 files changed

+2037
-232
lines changed

.github/run-eval/resolve_model_config.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,25 @@
2929
"display_name": "Kimi K2 Thinking",
3030
"llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
3131
},
32+
# https://www.kimi.com/blog/kimi-k2-5.html
33+
"kimi-k2.5": {
34+
"id": "kimi-k2.5",
35+
"display_name": "Kimi K2.5",
36+
"llm_config": {
37+
"model": "litellm_proxy/moonshot/kimi-k2.5",
38+
"temperature": 1.0,
39+
"top_p": 0.95,
40+
},
41+
},
42+
# https://www.alibabacloud.com/help/en/model-studio/deep-thinking
43+
"qwen3-max-thinking": {
44+
"id": "qwen3-max-thinking",
45+
"display_name": "Qwen3 Max Thinking",
46+
"llm_config": {
47+
"model": "litellm_proxy/dashscope/qwen3-max-2026-01-23",
48+
"litellm_extra_body": {"enable_thinking": True},
49+
},
50+
},
3251
"claude-4.5-opus": {
3352
"id": "claude-4.5-opus",
3453
"display_name": "Claude 4.5 Opus",
@@ -95,6 +114,11 @@
95114
"temperature": 0.0,
96115
},
97116
},
117+
"glm-4.7": {
118+
"id": "glm-4.7",
119+
"display_name": "GLM-4.7",
120+
"llm_config": {"model": "litellm_proxy/openrouter/z-ai/glm-4.7"},
121+
},
98122
}
99123

100124

.github/workflows/assign-reviews.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,12 @@ jobs:
5959
- Read the issue description and comments
6060
- Check if it is a bug report, feature request, or question and add the appropriate label
6161
- If it is a bug report and it does not have a priority label
62-
* Find an appropriate maintainer based on the issue topic and recent activity
63-
* Tag them with: "[Automatic Post]: This issue has been waiting for triage. @{maintainer}, could you please take a look and add the
64-
appropriate priority label when you have
65-
a chance?"
62+
* Read the MAINTAINERS file in the repository root to get the list of maintainers
63+
* Extract all usernames from lines starting with "- @" and join them with spaces, each prefixed with @
64+
(e.g., if the file contains "- @user1" and "- @user2", format as "@user1 @user2")
65+
* Tag ALL maintainers with: "[Automatic Post]: This issue has been waiting for triage. <maintainers>, could you
66+
please take a look and add the appropriate priority label when you have a chance?"
67+
(Replace <maintainers> with the formatted list from the previous step)
6668
6769
# Need Reviewer Action
6870

.openhands/skills/code-review.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
---
2+
name: code-review
3+
description: Structured code review covering style, readability, and security concerns with actionable feedback. Use when reviewing pull requests or merge requests to identify issues and suggest improvements.
24
triggers:
35
- /codereview
46
---

.openhands/skills/run-eval.md

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
---
2+
name: run-eval
3+
description: Trigger and monitor evaluation runs for benchmarks like SWE-bench, GAIA, and others. Use when running evaluations via GitHub Actions or monitoring eval progress through Datadog and kubectl.
4+
triggers:
5+
- run eval
6+
- trigger eval
7+
- evaluation run
8+
- swebench eval
9+
---
10+
11+
# Running Evaluations
12+
13+
## Trigger via GitHub API
14+
15+
```bash
16+
curl -X POST \
17+
-H "Authorization: token $GITHUB_TOKEN" \
18+
-H "Accept: application/vnd.github+json" \
19+
"https://api.github.com/repos/OpenHands/software-agent-sdk/actions/workflows/run-eval.yml/dispatches" \
20+
-d '{
21+
"ref": "main",
22+
"inputs": {
23+
"benchmark": "swebench",
24+
"sdk_ref": "main",
25+
"eval_limit": "50",
26+
"model_ids": "claude-sonnet-4-5-20250929",
27+
"reason": "Description of eval run",
28+
"benchmarks_branch": "main"
29+
}
30+
}'
31+
```
32+
33+
**Key parameters:**
34+
- `benchmark`: `swebench`, `swebenchmultimodal`, `gaia`, `swtbench`, `commit0`, `multiswebench`
35+
- `eval_limit`: `1`, `50`, `100`, `200`, `500`
36+
- `model_ids`: See `.github/run-eval/resolve_model_config.py` for available models
37+
- `benchmarks_branch`: Use a feature branch from the benchmarks repo to test benchmark changes before merging
38+
39+
**Note:** When running a full eval, you must select an `eval_limit` that is greater than or equal to the actual number of instances in the benchmark. If you specify a smaller limit, only that many instances will be evaluated (partial eval).
40+
41+
## Monitoring
42+
43+
**Datadog script** (requires the `OpenHands/evaluation` repo and the DD_API_KEY, DD_APP_KEY, and DD_SITE environment variables to be set):
44+
```bash
45+
DD_API_KEY=$DD_API_KEY DD_APP_KEY=$DD_APP_KEY DD_SITE=$DD_SITE \
46+
python scripts/analyze_evals.py --job-prefix <EVAL_RUN_ID> --time-range 60
47+
# EVAL_RUN_ID format: typically the workflow run ID from GitHub Actions
48+
```
49+
50+
**kubectl** (for users with cluster access - the agent does not have kubectl access):
51+
```bash
52+
kubectl logs -f job/eval-eval-<RUN_ID>-<MODEL_SLUG> -n evaluation-jobs
53+
```
54+
55+
## Common Errors
56+
57+
| Error | Cause | Fix |
58+
|-------|-------|-----|
59+
| `503 Service Unavailable` | Infrastructure overloaded | Ask user to stop some evaluation runs |
60+
| `429 Too Many Requests` | Rate limiting | Wait or reduce concurrency |
61+
| `failed after 3 retries` | Instance failures | Check Datadog logs for root cause |
62+
63+
## Limits
64+
65+
- Max 256 parallel runtimes (jobs will queue if this limit is exceeded)
66+
- Full evals typically take 1-3 hours depending on benchmark size

.openhands/skills/write-behavior-test.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
---
22
name: write-behavior-test
3-
type: knowledge
4-
version: 1.0.0
5-
agent: CodeActAgent
3+
description: Guide for writing behavior tests that verify agents follow system message guidelines and avoid undesirable behaviors. Use when creating integration tests for agent behavior validation.
64
triggers:
75
- /write_behavior_test
86
---

MAINTAINERS

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Repository Maintainers
2+
#
3+
# Format: Each maintainer on a new line starting with "- @username"
4+
# This file is read by .github/workflows/assign-reviews.yml for automated triage
5+
#
6+
7+
The following people are maintainers of this repository and are responsible for triage and review:
8+
9+
- @xingyaoww
10+
- @neubig
11+
- @enyst

openhands-agent-server/openhands/agent_server/bash_router.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ async def search_bash_events(
3636
command_id__eq: UUID | None = None,
3737
timestamp__gte: datetime | None = None,
3838
timestamp__lt: datetime | None = None,
39+
order__gt: Annotated[
40+
int | None,
41+
Query(
42+
title="Filter to events with order greater than this value",
43+
description="Only returns BashOutput events with order > this value. "
44+
"Useful for polling to fetch only new events since the last poll.",
45+
),
46+
] = None,
3947
sort_order: BashEventSortOrder = BashEventSortOrder.TIMESTAMP,
4048
page_id: Annotated[
4149
str | None,
@@ -55,6 +63,7 @@ async def search_bash_events(
5563
command_id__eq=command_id__eq,
5664
timestamp__gte=timestamp__gte,
5765
timestamp__lt=timestamp__lt,
66+
order__gt=order__gt,
5867
sort_order=sort_order,
5968
page_id=page_id,
6069
limit=limit,

openhands-agent-server/openhands/agent_server/bash_service.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ async def search_bash_events(
104104
command_id__eq: UUID | None = None,
105105
timestamp__gte: datetime | None = None,
106106
timestamp__lt: datetime | None = None,
107+
order__gt: int | None = None,
107108
sort_order: BashEventSortOrder = BashEventSortOrder.TIMESTAMP,
108109
page_id: str | None = None,
109110
limit: int = 100,
@@ -168,6 +169,11 @@ async def search_bash_events(
168169
for file_path in page_files:
169170
event = self._load_event_from_file(file_path)
170171
if event is not None:
172+
# Filter by order if specified. Events without an `order` attribute (i.e. anything other than BashOutput events) are never dropped by this filter.
173+
if order__gt is not None:
174+
event_order = getattr(event, "order", None)
175+
if event_order is not None and event_order <= order__gt:
176+
continue
171177
page_events.append(event)
172178

173179
return BashEventPage(items=page_events, next_page_id=next_page_id)

openhands-sdk/openhands/sdk/agent/agent.py

Lines changed: 60 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@
6767
logger = get_logger(__name__)
6868
maybe_init_laminar()
6969

70+
# Maximum number of events to scan during init_state defensive checks.
71+
# SystemPromptEvent is expected at index 0 or 1; the window is one event wider so a user message preceding it can be detected.
72+
INIT_STATE_PREFIX_SCAN_WINDOW = 3
73+
7074

7175
class Agent(AgentBase):
7276
"""Main agent implementation for OpenHands.
@@ -102,53 +106,82 @@ def init_state(
102106
state: ConversationState,
103107
on_event: ConversationCallbackType,
104108
) -> None:
109+
"""Initialize conversation state.
110+
111+
Invariants enforced by this method:
112+
- If a SystemPromptEvent is already present, it must be within the first 3
113+
events (index 0 or 1 in practice; index 2 is included in the scan window
114+
to detect a user message appearing before the system prompt).
115+
- A user MessageEvent should not appear before the SystemPromptEvent.
116+
117+
These invariants keep event ordering predictable for downstream components
118+
(condenser, UI, etc.) and also prevent accidentally materializing the full
119+
event history during initialization.
120+
"""
105121
super().init_state(state, on_event=on_event)
106-
# TODO(openhands): we should add test to test this init_state will actually
107-
# modify state in-place
108122

109123
# Defensive check: Analyze state to detect unexpected initialization scenarios
110124
# These checks help diagnose issues related to lazy loading and event ordering
111125
# See: https://github.com/OpenHands/software-agent-sdk/issues/1785
112-
events = list(state.events)
113-
has_system_prompt = any(isinstance(e, SystemPromptEvent) for e in events)
126+
#
127+
# NOTE: len() is O(1) for EventLog (file-backed implementation).
128+
event_count = len(state.events)
129+
130+
# NOTE: state.events is intentionally an EventsListBase (Sequence-like), not
131+
# a plain list. Avoid materializing the full history via list(state.events)
132+
# here (conversations can reach 30k+ events).
133+
#
134+
# Invariant: when init_state is called, SystemPromptEvent (if present) must be
135+
# at index 0 or 1.
136+
#
137+
# Rationale:
138+
# - Local conversations start empty and init_state is responsible for adding
139+
# the SystemPromptEvent as the first event.
140+
# - Remote conversations may receive an initial ConversationStateUpdateEvent
141+
# from the agent-server immediately after subscription. In a typical remote
142+
# session prefix you may see:
143+
# [ConversationStateUpdateEvent, SystemPromptEvent, MessageEvent, ...]
144+
#
145+
# We intentionally only inspect the first few events (cheap for both local and
146+
# remote) to enforce this invariant.
147+
prefix_events = state.events[:INIT_STATE_PREFIX_SCAN_WINDOW]
148+
149+
has_system_prompt = any(isinstance(e, SystemPromptEvent) for e in prefix_events)
114150
has_user_message = any(
115-
isinstance(e, MessageEvent) and e.source == "user" for e in events
151+
isinstance(e, MessageEvent) and e.source == "user" for e in prefix_events
116152
)
117-
has_any_llm_event = any(isinstance(e, LLMConvertibleEvent) for e in events)
118-
119153
# Log state for debugging initialization order issues
120154
logger.debug(
121155
f"init_state called: conversation_id={state.id}, "
122-
f"event_count={len(events)}, "
156+
f"event_count={event_count}, "
123157
f"has_system_prompt={has_system_prompt}, "
124-
f"has_user_message={has_user_message}, "
125-
f"has_any_llm_event={has_any_llm_event}"
158+
f"has_user_message={has_user_message}"
126159
)
127160

128161
if has_system_prompt:
129-
# SystemPromptEvent already exists - this is unexpected during normal flow
130-
# but could happen in persistence/resume scenarios
131-
logger.warning(
132-
f"init_state called but SystemPromptEvent already exists. "
133-
f"conversation_id={state.id}, event_count={len(events)}. "
134-
f"This may indicate double initialization or a resume scenario."
162+
# Restoring/resuming conversations is normal: a system prompt already
163+
# present means this conversation was initialized previously.
164+
logger.debug(
165+
"init_state: SystemPromptEvent already present; skipping init. "
166+
f"conversation_id={state.id}, event_count={event_count}."
135167
)
136168
return
137169

138-
# Assert: If there are user messages but no system prompt, something is wrong
139-
# The system prompt should always be added before any user messages
170+
# Assert: A user message should never appear before the system prompt.
171+
#
172+
# NOTE: This is a best-effort check based on the first few events only.
173+
# Remote conversations can include a ConversationStateUpdateEvent near the
174+
# start, so we scan a small prefix window.
140175
if has_user_message:
141-
event_types = [type(e).__name__ for e in events]
176+
event_types = [type(e).__name__ for e in prefix_events]
142177
logger.error(
143-
f"init_state: User message exists without SystemPromptEvent! "
144-
f"conversation_id={state.id}, events={event_types}"
178+
f"init_state: User message found in prefix before SystemPromptEvent! "
179+
f"conversation_id={state.id}, prefix_events={event_types}"
145180
)
146-
assert not has_user_message, (
147-
f"Unexpected state: User message exists before SystemPromptEvent. "
148-
f"conversation_id={state.id}, event_count={len(events)}, "
149-
f"event_types={event_types}. "
150-
f"This indicates an initialization order bug - init_state should be "
151-
f"called before any user messages are added to the conversation."
181+
raise AssertionError(
182+
"Unexpected state: user message exists before SystemPromptEvent. "
183+
f"conversation_id={state.id}, event_count={event_count}, "
184+
f"prefix_event_types={event_types}."
152185
)
153186

154187
# Prepare system message

openhands-sdk/openhands/sdk/context/condenser/base.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,23 @@ class RollingCondenser(PipelinableCondenserBase, ABC):
103103
`View` to be passed to the LLM.
104104
"""
105105

106+
def hard_context_reset(
107+
self,
108+
view: View, # noqa: ARG002
109+
agent_llm: LLM | None = None, # noqa: ARG002
110+
) -> Condensation | None:
111+
"""Perform a hard context reset, if supported by the condenser.
112+
113+
By default, rolling condensers do not support hard context resets. Override this
114+
method to implement hard context reset logic by returning a `Condensation`
115+
object.
116+
117+
This method is invoked when:
118+
- A HARD condensation requirement is triggered (e.g., by user request)
119+
- The condenser cannot produce a condensation and raises NoCondensationAvailableException
120+
"""
121+
return None
122+
106123
@abstractmethod
107124
def condensation_requirement(
108125
self, view: View, agent_llm: LLM | None = None
@@ -142,9 +159,25 @@ def condense(self, view: View, agent_llm: LLM | None = None) -> View | Condensat
142159
# we do so immediately.
143160
return view
144161

145-
# Otherwise re-raise the exception.
146-
else:
147-
raise e
162+
elif request == CondensationRequirement.HARD:
163+
# The agent has found itself in a situation where it cannot proceed
164+
# without condensation, but the condenser cannot provide one. We'll
165+
# try to recover from this situation by performing a hard context
166+
# reset, if supported by the condenser.
167+
try:
168+
hard_reset_condensation = self.hard_context_reset(
169+
view, agent_llm=agent_llm
170+
)
171+
if hard_reset_condensation is not None:
172+
return hard_reset_condensation
173+
174+
# And if something goes wrong with the hard reset make sure we keep
175+
# both errors in the stack
176+
except Exception as hard_reset_exception:
177+
raise hard_reset_exception from e
178+
179+
# In all other situations re-raise the exception.
180+
raise e
148181

149182
# Otherwise we're safe to just return the view.
150183
else:

0 commit comments

Comments
 (0)