Benchmark Example
Integrate SWE-Bench into our benchmark framework
SWE-Bench
Define SweBenchClient
from typing import Any, Dict

from benchflow import BenchClient  # client base class provided by the benchflow package


class SweBenchClient(BenchClient):
    def __init__(self, agent_url: str):
        super().__init__(agent_url)

    def prepare_environment(self, state_update: Dict[str, Any]) -> Dict[str, Any]:
        # Forward the raw SWE-Bench task instance to the agent as its environment info.
        return {"env_info": state_update}

    def parse_action(self, raw_action: str) -> Dict[str, Any]:
        # Treat the agent's raw output as the model patch to be evaluated.
        return {"model_patch": raw_action}
Modify the entrypoint of SWE-Bench so it can collect predictions from agents
def main(
    dataset_name: str,
    split: str,
    instance_ids: list,
    predictions_path: str,
    agent_url: str,
    max_workers: int,
    force_rebuild: bool,
    cache_level: str,
    clean: bool,
    open_file_limit: int,
    run_id: str,
    timeout: int,
    namespace: str | None,
    instance_image_tag: str = 'latest',
    rewrite_reports: bool = False,
    report_dir: str = '.',
    modal_name_or_path: str = "self_model",
    modal: bool = False,
):
    """
    Run evaluation harness for the given dataset and predictions.
    """
    # original code
    ###
    # Integrate BenchFlow here: query the agent for a patch for every task instance.
    # `full_dataset` and `predictions` are defined in the surrounding original code.
    agent = SweBenchClient(agent_url)
    for task in full_dataset:
        state_update = task
        prediction = agent.get_action(state_update)
        prediction['instance_id'] = task[KEY_INSTANCE_ID]
        prediction['model_name_or_path'] = modal_name_or_path
        predictions[task[KEY_INSTANCE_ID]] = prediction
    ###
    # original code
Add a script as the entrypoint for the Docker image
#!/bin/bash
AGENT_URL=${AGENT_URL:-"http://0.0.0.0:9002"}
MAX_WORKERS=${MAX_WORKERS:-1}
INSTANCE_IDS=${INSTANCE_IDS:-sympy__sympy-20590}
RUN_ID=${RUN_ID:-${INSTANCE_IDS}}
MODAL_NAME_OR_PATH=${MODAL_NAME_OR_PATH:-self_model}
REPORT_DIR=${REPORT_DIR:-./results}
echo "AGENT_URL: ${AGENT_URL}"
echo "MAX_WORKERS: ${MAX_WORKERS}"
echo "INSTANCE_IDS: ${INSTANCE_IDS}"
echo "RUN_ID: ${RUN_ID}"
echo "MODAL_NAME_OR_PATH: ${MODAL_NAME_OR_PATH}"
echo "REPORT_DIR: ${REPORT_DIR}"
python -m swebench.harness.bf_evaluation \
    --agent_url ${AGENT_URL} \
    --max_workers ${MAX_WORKERS} \
    --instance_ids ${INSTANCE_IDS} \
    --run_id ${RUN_ID} \
    --modal_name_or_path ${MODAL_NAME_OR_PATH} \
    --report_dir ${REPORT_DIR}
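For a quick local check, the script can also be run outside of Docker against an agent that is already serving on the agent URL. The values below are illustrative; any variable left unset falls back to the defaults above.
# Illustrative local run of the entrypoint script (assumes an agent is listening on port 9002).
export AGENT_URL="http://0.0.0.0:9002"
export INSTANCE_IDS="sympy__sympy-20590"
export REPORT_DIR="./results"
bash scripts/entrypoint.sh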
Add a Dockerfile to package the benchmark
FROM python:3.11-slim
ENV PYTHONUNBUFFERED=1
WORKDIR /app
COPY . .
RUN pip install -e .
COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
Upload the SWE-Bench image to Docker Hub as kirk2000/benchflow:swebench-v1
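Building and pushing the image uses the standard Docker CLI. A minimal sketch, assuming the Dockerfile above sits at the root of the modified SWE-Bench fork and you are logged in to a Docker Hub account with push access to the kirk2000 namespace:
# Build the benchmark image from the fork's root and push it to Docker Hub.
docker build -t kirk2000/benchflow:swebench-v1 .
docker push kirk2000/benchflow:swebench-v1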
Import the necessary packages
import json
import os
from typing import Any, Dict
from datasets import Dataset, load_dataset
from benchflow import BaseBench, BaseBenchConfig
Define the benchmark config, including the required and optional parameters for running the benchmark. The optional parameters map onto the environment variables read by the entrypoint script above.
class SwebenchConfig(BaseBenchConfig):
    required_env = []
    optional_env = ["INSTANCE_IDS", "MAX_WORKERS", "RUN_ID"]

    def __init__(self, params: Dict[str, Any], task_id: str):
        # By default, evaluate a single instance and reuse its id as the run id.
        params.setdefault("INSTANCE_IDS", task_id)
        params.setdefault("MAX_WORKERS", 1)
        params.setdefault("RUN_ID", task_id)
        super().__init__(params)
Integrate SWE-Bench into BenchFlow
class SwebenchBench(BaseBench):
    def get_config(self, params: Dict[str, Any], task_id: str) -> BaseBenchConfig:
        return SwebenchConfig(params, task_id)

    def get_image_name(self) -> str:
        return "kirk2000/benchflow:swebench-v1"

    def get_results_dir_in_container(self) -> str:
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        return "/app/logs"

    def get_result(self, task_id: str) -> Dict[str, Any]:
        # Files written by the harness under the container's results and logs directories.
        results_file = os.path.join(self.results_dir, f"self_model.{task_id}.json")
        model_prediction_file = os.path.join(
            self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/patch.diff"
        )
        report_file = os.path.join(
            self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/report.json"
        )
        try:
            with open(results_file, 'r') as f:
                result_data = json.load(f)
            total_instances = result_data.get("total_instances", 1)
            resolved_instances = result_data.get("resolved_instances", 0)
            pass_rate = resolved_instances / total_instances if total_instances else 0
            with open(model_prediction_file, 'r') as f:
                model_prediction = f.read()
            with open(report_file, 'r') as f:
                report = json.load(f)
            return {
                "is_resolved": pass_rate > 0.99,
                "score": pass_rate,
                "message": {"details": result_data},
                "log": model_prediction + "\n" + json.dumps(report),
            }
        except Exception as e:
            # Missing or unreadable result files count as an unresolved task.
            return {
                "is_resolved": False,
                "score": 0,
                "message": {"error": str(e)},
                "log": str(e),
            }

    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        # Every instance id in SWE-bench Lite becomes a BenchFlow task id.
        try:
            dataset: Dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split=split)
            dataset_ids = [instance["instance_id"] for instance in dataset]
            return {"task_ids": dataset_ids, "error_message": None}
        except Exception as e:
            return {"task_ids": [], "error_message": str(e)}

    def cleanup(self):
        pass