from benchflow import BenchClient
from typing import Dict, Any

class YourBenchClient(BenchClient):
    def __init__(self, agent_url: str):
        super().__init__(agent_url)

    def prepare_environment(self, state_update: Dict) -> Dict:
        """Prepare the benchmark information for agents."""
        return {
            "env_info": {
                "info1": state_update['info1'],
                "info2": state_update['info2']
            }
        }

    def parse_action(self, raw_action: str) -> str:
        """Process the agent response."""
        # convert the raw agent response into the action format your benchmark expects
        parsed_action = raw_action
        return parsed_action
2. Package and Upload Your Benchmark Docker Image
Package your benchmark logic into a Docker image.
Configure the image to read the required environment variables (such as AGENT_URL, TEST_START_IDX, etc.); a minimal entrypoint sketch follows below.
Upload the Docker image to DockerHub.
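The entrypoint below sketches how the image might pick up those variables at startup. It is only a sketch: the module path your_benchmark.client and the TEST_START_IDX default are illustrative assumptions, not part of the BenchFlow API.

import os

from your_benchmark.client import YourBenchClient  # hypothetical module holding the client above

# Read the configuration passed to the container as environment variables.
agent_url = os.environ["AGENT_URL"]                   # required: endpoint of the agent under test
start_idx = int(os.environ.get("TEST_START_IDX", 0))  # optional: first task index, defaulting to 0 here

client = YourBenchClient(agent_url)
# ...run your benchmark loop from start_idx onward, using `client` to talk to the agent...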
3. Upload Your Benchmark to Benchflow
Extend the BaseBench and BaseBenchConfig classes and upload your work to Benchflow!
from typing import Dict, Any

from benchflow import BaseBench, BaseBenchConfig

class YourbenchConfig(BaseBenchConfig):
    # benchmark-specific required params here
    required_env = []
    # benchmark-specific optional params here
    optional_env = ["INSTANCE_IDS", "MAX_WORKERS", "RUN_ID"]

    def __init__(self, params: Dict[str, Any], task_id: str):
        # benchmark-specific default params here
        params.setdefault("INSTANCE_IDS", task_id)
        params.setdefault("MAX_WORKERS", 1)
        params.setdefault("RUN_ID", task_id)
        super().__init__(params)
class YourBench(BaseBench):
    def __init__(self):
        super().__init__()

    def get_config(self, params: Dict[str, Any], task_id: str) -> BaseBenchConfig:
        """
        Return a YourbenchConfig instance that validates the input parameters.
        """
        return YourbenchConfig(params, task_id)

    def get_image_name(self) -> str:
        """
        Return the Docker image name for running your benchmark.
        """
        return "image/url"
    def get_results_dir_in_container(self) -> str:
        """
        Return the directory inside the container where benchmark results will be stored.
        """
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        """
        Return the directory inside the container where log files will be stored.
        """
        return "/app/log_files"
    def get_result(self, task_id: str) -> Dict[str, Any]:
        """
        Read and parse the benchmark result from log files.
        This method expects a file named 'log_files.txt' in the results directory.
        It reads the content of each log file listed, aggregates the logs, and extracts
        the average score and pass status.
        """
        # define how to get logs and results here
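        # A possible sketch, not the required implementation. Assumptions: the container's
        # results dir is mounted at a host path you control ("./results" below is a
        # hypothetical placeholder), log_files.txt lists one log file path per line, and
        # each log reports a final line like "score: 0.87".
        results_dir = "./results"  # hypothetical host mount of get_results_dir_in_container()
        with open(results_dir + "/log_files.txt") as f:
            log_paths = [line.strip() for line in f if line.strip()]
        log_content = "\n".join(open(results_dir + "/" + path).read() for path in log_paths)
        scores = [float(line.split(":", 1)[1]) for line in log_content.splitlines()
                  if line.startswith("score:")]
        score = sum(scores) / len(scores) if scores else 0.0
        is_resolved = score > 0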
        return {
            "is_resolved": is_resolved,
            "score": score,
            "message": {"details": "Task runs successfully."},
            "log": log_content
        }
    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        """
        Return a dictionary containing all task IDs and an optional error message.
        """
return {"task_ids": task_ids, "error_message": None}