Introduction
An AI Benchmark runtime.
What can Benchflow do?
Tons of easy-to-run benchmarks for your agent
Full-lifespan agent tracking, including prompts, cost, time, metrics, and more
Agent evaluation with over a 3x speedup
Benchmark bundles to help you stand out among your competitors
Agent Developer
Only two steps to test your agents
1
Implement your agent
2
Test your agent on benchmarks
import os

from benchflow import load_benchmark

bench = load_benchmark("benchmark_name")
your_agent = YourAgents()

run_ids = bench.run(
    agents=your_agent,
    requirements_dir="requirements.txt",
    api={"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY")},
    params={}
)

results = bench.get_results(run_ids)
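The snippet above assumes a YourAgents class that you implement yourself. A minimal sketch is shown below; the BaseAgent base class and its call_api hook are assumptions about the BenchFlow agent interface, so check the agent documentation for the exact method name and payload shape.

from benchflow import BaseAgent  # assumed import path

class YourAgents(BaseAgent):
    """Illustrative agent: turns the benchmark-provided env_info into a prompt
    and returns a plain-text action."""

    def call_api(self, task_step_inputs):
        # `task_step_inputs` is assumed to carry the "env_info" dict that the
        # benchmark prepares for each step.
        env_info = task_step_inputs.get("env_info", {})
        prompt = f"Solve the following task: {env_info}"
        # Swap this constant reply for a real LLM call (e.g. via the OpenAI SDK).
        return f"my answer to: {prompt}"

Passing YourAgents() to bench.run then works exactly as in the snippet above.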
Benchmark Developer
Three steps to integrate your benchmarks
1
Make Your Benchmark a Client
from benchflow import BenchClient
from typing import Dict, Any

class YourBenchClient(BenchClient):
    def __init__(self, agent_url: str):
        super().__init__(agent_url)

    def prepare_environment(self, state_update: Dict) -> Dict:
        """Prepare the benchmark information for agents."""
        return {
            "env_info": {
                "info1": state_update['info1'],
                "info2": state_update['info2']
            }
        }

    def parse_action(self, raw_action: str) -> str:
        """Process the agent response."""
        # Convert the raw agent output into the action format your benchmark expects.
        parsed_action = raw_action
        return parsed_action
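How you implement parse_action depends on what your benchmark expects back from agents. As a purely illustrative example, the hypothetical client below forwards an observation and a goal to the agent and strips an optional Markdown code fence from the reply; the key names and fence handling are assumptions, not part of the BenchClient API.

import re
from typing import Dict
from benchflow import BenchClient

class MyWebBenchClient(BenchClient):  # hypothetical example client
    def prepare_environment(self, state_update: Dict) -> Dict:
        # Forward the current observation and goal to the agent.
        return {
            "env_info": {
                "observation": state_update["observation"],
                "goal": state_update["goal"]
            }
        }

    def parse_action(self, raw_action: str) -> str:
        # Strip an optional ```...``` fence and surrounding whitespace so the
        # benchmark receives a bare action string.
        match = re.search(r"```(?:\w+\n)?(.*?)```", raw_action, re.DOTALL)
        return (match.group(1) if match else raw_action).strip()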
2
Extend the BaseBench
3
Upload Your Benchmark to Benchflow

Extend the BaseBench class and upload your work to benchflow!
from typing import Dict, Any

from benchflow import BaseBench, BaseBenchConfig

class YourbenchConfig(BaseBenchConfig):
    # specify required params here
    required_env = []
    # specify optional params here
    optional_env = ["INSTANCE_IDS", "MAX_WORKERS", "RUN_ID"]

    def __init__(self, params: Dict[str, Any], task_id: str):
        # set default params here
        params.setdefault("INSTANCE_IDS", task_id)
        params.setdefault("MAX_WORKERS", 1)
        params.setdefault("RUN_ID", task_id)
        super().__init__(params)
class YourBench(BaseBench):
    def __init__(self):
        super().__init__()

    def get_config(self, params: Dict[str, Any], task_id: str) -> BaseBenchConfig:
        """
        Return a YourbenchConfig instance that validates the input parameters.
        """
        return YourbenchConfig(params, task_id)

    def get_image_name(self) -> str:
        """
        Return the Docker image name used to run your benchmark.
        """
        return "image/url"

    def get_results_dir_in_container(self) -> str:
        """
        Return the directory inside the container where benchmark results will be stored.
        """
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        """
        Return the directory inside the container where log files will be stored.
        """
        return "/app/log_files"

    def get_result(self, task_id: str) -> Dict[str, Any]:
        """
        Read and parse the benchmark result from log files.
        This method expects a file named 'log_files.txt' in the results directory.
        It reads the content of each log file listed, aggregates the logs, and extracts
        the average score and pass status.
        """
        # define how to get logs and results here
        return {
            "is_resolved": is_resolved,
            "score": score,
            "message": {"details": "Task runs successfully."},
            "log": log_content
        }

    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        """
        Return a dictionary containing all task IDs and an optional error message.
        """
        # collect the task ids for the requested split here
        return {"task_ids": task_ids, "error_message": None}
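The get_result docstring above describes the expected flow: read a log_files.txt manifest from the results directory, aggregate the listed logs, and extract a score and pass status. A sketch of that flow as a standalone helper follows; the helper name, the "Average score:" log format, and the pass threshold are all assumptions for illustration, and get_result would delegate to it with whatever results directory your setup uses.

import os
import re
from typing import Dict, Any

def parse_benchmark_logs(results_dir: str) -> Dict[str, Any]:
    """Hypothetical helper: aggregate the logs listed in log_files.txt and
    extract a score plus pass/fail status."""
    manifest = os.path.join(results_dir, "log_files.txt")
    log_content = ""
    with open(manifest) as f:
        for line in f:
            log_path = line.strip()
            if not log_path:
                continue
            with open(os.path.join(results_dir, log_path)) as log_file:
                log_content += log_file.read()

    # Assumed log format: a line such as "Average score: 0.87".
    match = re.search(r"Average score:\s*([0-9.]+)", log_content)
    score = float(match.group(1)) if match else 0.0
    is_resolved = score >= 1.0  # assumed pass threshold

    return {
        "is_resolved": is_resolved,
        "score": score,
        "message": {"details": "Task runs successfully." if is_resolved
                    else "Task did not fully resolve."},
        "log": log_content
    }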