TrajEval Staged Evaluation
Evaluate code agent trajectories decomposed into search, edit, and verification stages, rating quality of each stage and determining overall pass/fail verdict.
Configuration File: config.yaml
# TrajEval Staged Evaluation
# Based on "TRAJEVAL: Decomposing Code Agent Trajectories for Fine-Grained Diagnosis" (Kim et al., arXiv 2026)
# Task: Evaluate code agent trajectories by scoring decomposed stages (search, edit, verification)
annotation_task_name: "TrajEval Staged Evaluation"
task_dir: "."

# Data files to load; each item supplies the template fields used in html_layout.
data_files:
  - sample-data.json

# Keys in each data item that identify the instance and its display text.
item_properties:
  id_key: "id"
  text_key: "text"

output_annotation_dir: "annotation_output/"
output_annotation_format: "json"

# Custom layout: issue header, then one collapsible panel per trajectory stage.
# {{...}} placeholders are filled from the fields of each data item.
html_layout: |
  <div class="container" style="font-family: Arial, sans-serif; max-width: 1000px; margin: 0 auto;">
  <div style="background: #e8f4fd; padding: 14px; border-radius: 8px; margin-bottom: 14px;">
  <h3 style="margin: 0 0 8px 0; color: #1a5276;">GitHub Issue</h3>
  <p style="margin: 0; font-size: 15px;">{{text}}</p>
  <p style="margin: 8px 0 0 0; font-size: 13px; color: #555;"><strong>Repository:</strong> {{repo_name}}</p>
  <p style="margin: 4px 0 0 0; font-size: 13px; color: #555;"><strong>Final Result:</strong> {{final_result}}</p>
  </div>
  <details style="margin-bottom: 10px;" open>
  <summary style="cursor: pointer; font-weight: bold; font-size: 15px; padding: 8px; background: #d5f5e3; border-radius: 6px; color: #1e8449;">Search Stage</summary>
  <div style="padding: 12px; border: 1px solid #27ae60; border-top: none; border-radius: 0 0 6px 6px; font-size: 13px; line-height: 1.7; white-space: pre-wrap; font-family: monospace; background: #f8f9fa;">{{search_stage}}</div>
  </details>
  <details style="margin-bottom: 10px;" open>
  <summary style="cursor: pointer; font-weight: bold; font-size: 15px; padding: 8px; background: #fdebd0; border-radius: 6px; color: #a04000;">Edit Stage</summary>
  <div style="padding: 12px; border: 1px solid #e67e22; border-top: none; border-radius: 0 0 6px 6px; font-size: 13px; line-height: 1.7; white-space: pre-wrap; font-family: monospace; background: #f8f9fa;">{{edit_stage}}</div>
  </details>
  <details style="margin-bottom: 10px;" open>
  <summary style="cursor: pointer; font-weight: bold; font-size: 15px; padding: 8px; background: #d6eaf8; border-radius: 6px; color: #1a5276;">Verification Stage</summary>
  <div style="padding: 12px; border: 1px solid #2980b9; border-top: none; border-radius: 0 0 6px 6px; font-size: 13px; line-height: 1.7; white-space: pre-wrap; font-family: monospace; background: #f8f9fa;">{{verify_stage}}</div>
  </details>
  </div>

annotation_schemes:
  # Five quality dimensions, each rated on the same 1-5 scale.
  - name: stage_scores
    annotation_type: multirate
    description: "Rate each dimension of the agent's trajectory quality on a 5-point scale."
    labels:
      - "1 - Very Poor"
      - "2 - Poor"
      - "3 - Average"
      - "4 - Good"
      - "5 - Excellent"
    options:
      - "Search Quality"
      - "Read Comprehension"
      - "Edit Correctness"
      - "Verification Thoroughness"
      - "Overall Efficiency"

  # Single-choice overall verdict with number-key shortcuts.
  - name: overall_verdict
    annotation_type: radio
    description: "What is the overall verdict for this code agent trajectory?"
    labels:
      - "Pass"
      - "Marginal Pass"
      - "Fail"
    keyboard_shortcuts:
      "Pass": "1"
      "Marginal Pass": "2"
      "Fail": "3"

  # Free-text rationale for the most outcome-determining stage.
  - name: stage_justification
    annotation_type: text
    description: "Justify which stage had the most impact on the outcome (positive or negative)."

# Assignment settings: any registered user may annotate; each annotator
# receives 50 instances and each instance is labeled by 2 annotators.
allow_all_users: true
instances_per_annotator: 50
annotation_per_instance: 2
Sample Data: sample-data.json
[
{
"id": "te-001",
"text": "Fix TypeError: 'NoneType' object is not subscriptable in data_loader.py when CSV file has empty header rows.",
"repo_name": "pandas-dev/pandas",
"search_stage": "$ find . -name 'data_loader.py'\n./pandas/io/data_loader.py\n\n$ grep -n 'header' pandas/io/data_loader.py\n45: def _parse_header(self, row):\n46: return row[0].split(',')\n89: header = self._parse_header(rows[0])\n\n$ cat pandas/io/data_loader.py:40-50\n40: class CSVLoader:\n41: def load(self, filepath):\n42: rows = self._read_raw(filepath)\n43: header = self._parse_header(rows[0])\n44: ...\n45: def _parse_header(self, row):\n46: return row[0].split(',')",
"edit_stage": "$ edit pandas/io/data_loader.py\n--- a/pandas/io/data_loader.py\n+++ b/pandas/io/data_loader.py\n@@ -42,7 +42,10 @@\n def load(self, filepath):\n rows = self._read_raw(filepath)\n- header = self._parse_header(rows[0])\n+ # Skip empty rows at the beginning\n+ first_row = next((r for r in rows if r and r[0]), None)\n+ if first_row is None:\n+ raise ValueError(\"CSV file has no valid header row\")\n+ header = self._parse_header(first_row)",
"verify_stage": "$ python -m pytest tests/io/test_csv_loader.py -v\ntest_load_normal_csv PASSED\ntest_load_empty_header PASSED\ntest_load_no_header PASSED\ntest_load_multiple_empty_rows PASSED\n\n$ python -c \"from pandas.io.data_loader import CSVLoader; CSVLoader().load('test_empty_header.csv')\"\nValueError: CSV file has no valid header row\n\n4 tests passed, 0 failed.",
"final_result": "PASS - All tests pass, fix handles empty headers correctly."
},
{
"id": "te-002",
"text": "Add support for custom timeout parameter in HTTP client's retry mechanism. Currently retries use hardcoded 30s timeout.",
"repo_name": "psf/requests",
"search_stage": "$ grep -rn 'timeout' requests/adapters.py\n78: DEFAULT_TIMEOUT = 30\n112: timeout=self.DEFAULT_TIMEOUT,\n145: def send(self, request, timeout=None):\n146: timeout = timeout or self.DEFAULT_TIMEOUT\n\n$ grep -rn 'retry' requests/adapters.py\n98: def _retry(self, request, retries=3):\n99: for attempt in range(retries):\n101: response = self._do_send(request, timeout=self.DEFAULT_TIMEOUT)\n\n$ cat requests/adapters.py:95-110\n95: class HTTPAdapter:\n96: DEFAULT_TIMEOUT = 30\n97:\n98: def _retry(self, request, retries=3):\n99: for attempt in range(retries):\n100: try:\n101: response = self._do_send(request, timeout=self.DEFAULT_TIMEOUT)\n102: return response\n103: except TimeoutError:\n104: if attempt == retries - 1:\n105: raise\n106: time.sleep(2 ** attempt)",
"edit_stage": "$ edit requests/adapters.py\n--- a/requests/adapters.py\n+++ b/requests/adapters.py\n@@ -98,10 +98,10 @@\n- def _retry(self, request, retries=3):\n+ def _retry(self, request, retries=3, timeout=None):\n+ timeout = timeout or self.DEFAULT_TIMEOUT\n for attempt in range(retries):\n try:\n- response = self._do_send(request, timeout=self.DEFAULT_TIMEOUT)\n+ response = self._do_send(request, timeout=timeout)\n return response\n\n@@ -145,7 +145,7 @@\n def send(self, request, timeout=None, retries=3):\n timeout = timeout or self.DEFAULT_TIMEOUT\n- return self._retry(request, retries=retries)\n+ return self._retry(request, retries=retries, timeout=timeout)",
"verify_stage": "$ python -m pytest tests/test_adapters.py -v\ntest_default_timeout PASSED\ntest_custom_timeout PASSED\ntest_retry_uses_custom_timeout PASSED\ntest_retry_default_timeout PASSED\ntest_timeout_none_uses_default PASSED\n\n5 tests passed, 0 failed.",
"final_result": "PASS - Custom timeout propagated through retry mechanism correctly."
}
]
// ... and 6 more items
Get This Design
Clone or download from the repository
Quick start:
git clone https://github.com/davidjurgens/potato-showcase.git
cd potato-showcase/agentic/trajeval-staged-evaluation
potato start config.yaml
Details
Annotation Types
Domain
Use Cases
Tags
Found an issue or want to improve this design?
Open an Issue
Related Designs
DevBench Repository Evaluation
Evaluate AI-generated repositories across the full software development lifecycle. Annotators rate architecture design, code quality, test coverage, documentation, and dependency management for generated projects.
AgentRewardBench Trajectory Scoring
Evaluate web agent trajectories by rating step-level quality across multiple dimensions, judging overall success, and identifying where automatic evaluators disagree with human judgment.
BigCodeBench Human Baseline Evaluation
Evaluate agent-generated code solutions for BigCodeBench tasks. Annotators assess correctness against test suites, rate task complexity, evaluate code quality, and provide notes on the solution approach.