Benchmark Example

Integrate SWE-Bench into our benchmark framework

SWE-Bench

Define SweBenchClient

from typing import Any, Dict

from benchflow import BenchClient  # import path assumed; adjust to wherever BenchClient lives

class SweBenchClient(BenchClient):
    def __init__(self, agent_url: str):
        super().__init__(agent_url)

    def prepare_environment(self, state_update: Dict[str, Any]) -> Dict[str, Any]:
        # Forward the raw SWE-Bench task instance to the agent as its environment info
        return {"env_info": state_update}

    def parse_action(self, raw_action: str) -> Dict[str, Any]:
        # Treat the agent's raw response as the model patch for this instance
        return {"model_patch": raw_action}

Modify the entrypoint of SWE-Bench so that it can fetch predictions from agents

def main(
        dataset_name: str,
        split: str,
        instance_ids: list,
        predictions_path: str,
        agent_url: str,
        max_workers: int,
        force_rebuild: bool,
        cache_level: str,
        clean: bool,
        open_file_limit: int,
        run_id: str,
        timeout: int,
        namespace: str | None,
        instance_image_tag: str = 'latest',
        rewrite_reports: bool = False,
        report_dir: str = '.',
        modal_name_or_path: str = "self_model",
        modal: bool = False
    ):
    """
    Run evaluation harness for the given dataset and predictions.
    """
    # original code
    
    ### 
    # Integrate Benchflow Here
    agent = SweBenchClient(agent_url)
    for task in full_dataset:
        state_update = task
        prediction = agent.get_action(state_update)
        prediction['instance_id'] = task[KEY_INSTANCE_ID]
        prediction['model_name_or_path'] = modal_name_or_path
        predictions[task[KEY_INSTANCE_ID]] = prediction
    ###
    
    # original code
    

Add a script as the entrypoint for the Docker image

#!/bin/bash

AGENT_URL=${AGENT_URL:-"http://0.0.0.0:9002"}
MAX_WORKERS=${MAX_WORKERS:-1}
INSTANCE_IDS=${INSTANCE_IDS:-sympy__sympy-20590}
RUN_ID=${RUN_ID:-${INSTANCE_IDS}}
MODAL_NAME_OR_PATH=${MODAL_NAME_OR_PATH:-self_model}
REPORT_DIR=${REPORT_DIR:-./results}

echo "AGENT_URL: ${AGENT_URL}"
echo "MAX_WORKERS: ${MAX_WORKERS}"
echo "INSTANCE_IDS: ${INSTANCE_IDS}"
echo "RUN_ID: ${RUN_ID}"
echo "MODAL_NAME_OR_PATH: ${MODAL_NAME_OR_PATH}"
echo "REPORT_DIR: ${REPORT_DIR}"

python -m swebench.harness.bf_evaluation \
     --agent_url ${AGENT_URL} \
     --max_workers ${MAX_WORKERS} \
     --instance_ids ${INSTANCE_IDS} \
     --run_id ${RUN_ID} \
     --modal_name_or_path ${MODAL_NAME_OR_PATH} \
     --report_dir ${REPORT_DIR}
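
To smoke-test the script outside Docker, it can also be invoked directly. This is a hypothetical local run, assuming the modified swebench package is installed in the current environment, a Docker daemon is available for the harness, and an agent is listening on the URL below:

# Override the defaults via environment variables and run the entrypoint directly
AGENT_URL=http://localhost:9002 \
INSTANCE_IDS=sympy__sympy-20590 \
./scripts/entrypoint.sh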

Add a Dockerfile to package the benchmark

FROM python:3.11-slim

ENV PYTHONUNBUFFERED=1

WORKDIR /app

COPY . .

RUN pip install -e .

COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]

Upload the SWE-Bench image to Docker Hub as kirk2000/benchflow:swebench-v1
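
A minimal sketch of the build-and-push step with the standard Docker CLI, assuming the Dockerfile above sits at the root of the SWE-Bench checkout and that you have run docker login with push access to the kirk2000 namespace:

# Build the benchmark image from the directory containing the Dockerfile
docker build -t kirk2000/benchflow:swebench-v1 .

# Push it to Docker Hub so Benchflow can pull it by the tag returned by get_image_name() below
docker push kirk2000/benchflow:swebench-v1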

Import the necessary packages

import json
import os
from typing import Any, Dict
from datasets import Dataset, load_dataset
from benchflow import BaseBench, BaseBenchConfig

Define the benchmark config, including the required and optional parameters needed to run the benchmark.

class SwebenchConfig(BaseBenchConfig):
    required_env = []
    optional_env = ["INSTANCE_IDS", "MAX_WORKERS", "RUN_ID"]

    def __init__(self, params: Dict[str, Any], task_id: str):
        # By default, evaluate only the single requested task instance
        params.setdefault("INSTANCE_IDS", task_id)
        params.setdefault("MAX_WORKERS", 1)
        params.setdefault("RUN_ID", task_id)
        super().__init__(params)

Integrate SWE-Bench into Benchflow

class SwebenchBench(BaseBench):
    def get_config(self, params: Dict[str, Any], task_id: str) -> BaseBenchConfig:
        return SwebenchConfig(params, task_id)

    def get_image_name(self) -> str:
        return "kirk2000/benchflow:swebench-v1"

    def get_results_dir_in_container(self) -> str:
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        return "/app/logs"

    def get_result(self, task_id: str) -> Dict[str, Any]:
        # The harness writes its summary report into the results dir, and the
        # per-instance patch and report under the run_evaluation log tree
        # (run_id and instance_id are both the task id here; the model name is "self_model")
        results_file = os.path.join(self.results_dir, f"self_model.{task_id}.json")
        model_prediction_file = os.path.join(self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/patch.diff")
        report_file = os.path.join(self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/report.json")
        try:
            with open(results_file, 'r') as f:
                result_data = json.load(f)
            total_instances = result_data.get("total_instances", 1)
            resolved_instances = result_data.get("resolved_instances", 0)
            pass_rate = resolved_instances / total_instances if total_instances else 0
            with open(model_prediction_file, 'r') as f:
                model_prediction = f.read()
            with open(report_file, 'r') as f:
                report = json.load(f)
            return {
                "is_resolved": pass_rate > 0.99,
                "score": pass_rate,
                "message": {"details": result_data},
                "log": model_prediction + "\n" + json.dumps(report),
            }
        except Exception as e:
            return {
                "is_resolved": False,
                "score": 0,
                "message": {"error": str(e)},
                "log": str(e),
            }
        
    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        try:
            dataset: Dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split=split)
            dataset_ids = [instance["instance_id"] for instance in dataset]
            return {"task_ids": dataset_ids, "error_message": None}
        except Exception as e:
            return {"task_ids": [], "error_message": str(e)}
    
    def cleanup(self):
        pass
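
For reference, this is the container file layout that get_result reads, pieced together from the entrypoint defaults (REPORT_DIR=./results, RUN_ID equal to the task id, model name self_model) and the paths above; shown for the example instance sympy__sympy-20590, and subject to change if those defaults change:

/app/results/self_model.sympy__sympy-20590.json (harness summary with total_instances and resolved_instances)
/app/logs/run_evaluation/sympy__sympy-20590/self_model/sympy__sympy-20590/patch.diff (the agent's model patch)
/app/logs/run_evaluation/sympy__sympy-20590/self_model/sympy__sympy-20590/report.json (per-instance evaluation report)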
