# Benchmark Example

### SWE-Bench

**Define SweBenchClient**

```python
from typing import Any, Dict

from benchflow import BenchClient


class SweBenchClient(BenchClient):
    def __init__(self, agent_url: str):
        super().__init__(agent_url)

    def prepare_environment(self, state_update: Dict[str, Any]) -> Dict[str, Any]:
        # Forward the raw SWE-Bench task to the agent as environment info
        return {"env_info": state_update}

    def parse_action(self, raw_action: str) -> Dict[str, Any]:
        # Treat the agent's raw output as the patch to evaluate
        return {"model_patch": raw_action}
```
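`prepare_environment` forwards the raw SWE-Bench task to the agent, and `parse_action` wraps the agent's reply as a `model_patch`, the field the SWE-Bench harness expects in each prediction.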

**Modify the SWE-Bench entrypoint so it can collect predictions from the agent**

```python
def main(
        dataset_name: str,
        split: str,
        instance_ids: list,
        predictions_path: str,
        agent_url: str,
        max_workers: int,
        force_rebuild: bool,
        cache_level: str,
        clean: bool,
        open_file_limit: int,
        run_id: str,
        timeout: int,
        namespace: str | None,
        instance_image_tag: str = 'latest',
        rewrite_reports: bool = False,
        report_dir: str = '.',
        modal_name_or_path: str = "self_model",
        modal: bool = False
    ):
    """
    Run evaluation harness for the given dataset and predictions.
    """
    # original code
    
    ###
    # BenchFlow integration: ask the agent for a patch for every task,
    # then record it in the predictions dict the harness already uses.
    agent = SweBenchClient(agent_url)
    for task in full_dataset:
        state_update = task
        prediction = agent.get_action(state_update)
        prediction['instance_id'] = task[KEY_INSTANCE_ID]
        prediction['model_name_or_path'] = modal_name_or_path
        predictions[task[KEY_INSTANCE_ID]] = prediction
    ###
    
    # original code
    
```

**Add a script as the entrypoint for the Docker image**

```sh
#!/bin/bash

AGENT_URL=${AGENT_URL:-"http://0.0.0.0:9002"}
MAX_WORKERS=${MAX_WORKERS:-1}
INSTANCE_IDS=${INSTANCE_IDS:-sympy__sympy-20590}
RUN_ID=${RUN_ID:-${INSTANCE_IDS}}
MODAL_NAME_OR_PATH=${MODAL_NAME_OR_PATH:-self_model}
REPORT_DIR=${REPORT_DIR:-./results}

echo "AGENT_URL: ${AGENT_URL}"
echo "MAX_WORKERS: ${MAX_WORKERS}"
echo "INSTANCE_IDS: ${INSTANCE_IDS}"
echo "RUN_ID: ${RUN_ID}"
echo "MODAL_NAME_OR_PATH: ${MODAL_NAME_OR_PATH}"
echo "REPORT_DIR: ${REPORT_DIR}"

python -m swebench.harness.bf_evaluation \
     --agent_url ${AGENT_URL} \
     --max_workers ${MAX_WORKERS} \
     --instance_ids ${INSTANCE_IDS} \
     --run_id ${RUN_ID} \
     --modal_name_or_path ${MODAL_NAME_OR_PATH} \
     --report_dir ${REPORT_DIR}
```
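Save this script as `scripts/entrypoint.sh` so the Dockerfile below can copy it into the image.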

**Add a Dockerfile to package the benchmark**

```docker
FROM python:3.11-slim

ENV PYTHONUNBUFFERED=1

WORKDIR /app

COPY . .

RUN pip install -e .

COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
```

**Build the image and upload it to Docker Hub as `kirk2000/benchflow:swebench-v1`**
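
For example, the image can be built from the directory containing the Dockerfile and pushed with standard Docker commands; pushing assumes you are logged in to an account with push access to the `kirk2000` namespace:

```sh
# Build the benchmark image
docker build -t kirk2000/benchflow:swebench-v1 .

# Push it to Docker Hub (requires a prior `docker login`)
docker push kirk2000/benchflow:swebench-v1
```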

**Import the necessary packages**

```python
import json
import os
from typing import Any, Dict
from datasets import Dataset, load_dataset
from benchflow import BaseBench, BaseBenchConfig
```

**Define the benchmark config, listing the required and optional parameters needed to run the benchmark**

```python
class SwebenchConfig(BaseBenchConfig):
    required_env = []
    optional_env = ["INSTANCE_IDS", "MAX_WORKERS", "RUN_ID"]

    def __init__(self, params: Dict[str, Any], task_id: str):
        params.setdefault("INSTANCE_IDS", task_id)
        params.setdefault("MAX_WORKERS", 1)
        params.setdefault("RUN_ID", task_id)
        super().__init__(params)
```
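These keys correspond to the environment variables read by the container's entrypoint script above; by default a run evaluates only the selected task, with `RUN_ID` set to the task id.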

**Integrate SWE-Bench into BenchFlow**

```python
class SwebenchBench(BaseBench):
    def get_config(self, params: Dict[str, Any], task_id: str) -> BaseBenchConfig:
        return SwebenchConfig(params, task_id)

    def get_image_name(self) -> str:
        return "kirk2000/benchflow:swebench-v1"

    def get_results_dir_in_container(self) -> str:
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        return "/app/logs"

    def get_result(self, task_id: str) -> Dict[str, Any]:
        # Output paths follow the harness layout: "self_model" matches
        # MODAL_NAME_OR_PATH in the entrypoint script, and RUN_ID equals the task id.
        results_file = os.path.join(self.results_dir, f"self_model.{task_id}.json")
        model_prediction_file = os.path.join(self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/patch.diff")
        report_file = os.path.join(self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/report.json")
        try:
            with open(results_file, 'r') as f:
                result_data = json.load(f)
            total_instances = result_data.get("total_instances", 1)
            resolved_instances = result_data.get("resolved_instances", 0)
            pass_rate = resolved_instances / total_instances if total_instances else 0
            with open(model_prediction_file, 'r') as f:
                model_prediction = f.read()
            with open(report_file, 'r') as f:
                report = json.load(f)
            return {
                    "is_resolved": pass_rate > 0.99,
                    "score": pass_rate,
                    "message": {"details": result_data},
                    "log": model_prediction + "\n" + json.dumps(report),
                }
        except Exception as e:
            return {
                "is_resolved": False,
                "score": 0,
                "message": {"error": str(e)},
                "log": str(e),
            }
        
    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        try:
            dataset: Dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split=split)
            dataset_ids = [instance["instance_id"] for instance in dataset]
            return {"task_ids": dataset_ids, "error_message": None}
        except Exception as e:
            return {"task_ids": [], "error_message": str(e)}
    
    def cleanup(self):
        pass
```
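
With the class in place, BenchFlow can launch the published image and collect results from `/app/results` and `/app/logs`. The commands below are only a hypothetical local smoke test of the container, assuming an agent is already serving on port 9002 on the host (reachable as `host.docker.internal` on Docker Desktop) and that the host Docker socket is mounted in case the SWE-Bench harness needs to build instance images; in normal use BenchFlow is responsible for starting the container and supplying these variables.

```sh
# Hypothetical manual run of the packaged benchmark image.
# Assumes an agent is listening on the host at port 9002 and that the
# harness can reach the host Docker daemon through the mounted socket.
docker run --rm \
  -e AGENT_URL="http://host.docker.internal:9002" \
  -e INSTANCE_IDS="sympy__sympy-20590" \
  -e MAX_WORKERS=1 \
  -e RUN_ID="sympy__sympy-20590" \
  -v "$(pwd)/results:/app/results" \
  -v "$(pwd)/logs:/app/logs" \
  -v /var/run/docker.sock:/var/run/docker.sock \
  kirk2000/benchflow:swebench-v1
```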

