RetrieverProtocol

class RetrieverProtocol(Protocol):

Protocol for the retriever class. Any retriever class implementing this protocol can be used in the benchmark class.

retrieve

def retrieve(
    self,
    query: str,
    contents: List[str],
    **kwargs: Dict[str, Any]
):

Retrieve the relevant content for the query.

Parameters:

  • query (str): The query to retrieve the content for.
  • contents (List[str]): The list of contents to search in.
  • **kwargs (Dict[str, Any]): Additional keyword arguments.

Returns:

Dict[str, Any]: The relevant content for the query.

reset

def reset(self, **kwargs):

Reset the retriever. Some benchmarks may require resetting the retriever after each query.

Returns:

bool: True if the reset was successful, False otherwise.

DefaultGAIARetriever

class DefaultGAIARetriever(AutoRetriever):

Default retriever for the GAIA benchmark. This retriever uses AutoRetriever in camel to retrieve the content based on the query.

retrieve

def retrieve(
    self,
    query: str,
    contents: List[str],
    **kwargs: Any
):

Retrieve the content based on the query.

Parameters:

  • query (str): The query to search for.
  • contents (List[str]): The list of contents to search from.
  • **kwargs (Any): The keyword arguments to pass to the retriever.

Returns:

Dict[str, Any]: The retrieved content.

reset

def reset(self, **kwargs: Any):

Reset the retriever.

Returns:

bool: Whether the reset was successful.

GAIABenchmark

class GAIABenchmark(BaseBenchmark):

GAIA Benchmark adapted from “GAIA: a benchmark for General AI Assistants”.

Parameters:

  • data_dir (str): The directory to save the data.
  • save_to (str): The file to save the results.
  • retriever (Optional[RetrieverProtocol]): The retriever to use. (default: None)
  • processes (int, optional): The number of processes to use. (default: 1)

__init__

def __init__(
    self,
    data_dir: str,
    save_to: str,
    retriever: Optional[RetrieverProtocol] = None,
    processes: int = 1
):

Initialize the GAIA benchmark.

Parameters:

  • data_dir (str): The directory to save the data.
  • save_to (str): The file to save the results.
  • retriever (Optional[RetrieverProtocol], optional): The retriever to use. (default: None)
  • processes (int, optional): The number of processes to use for parallel processing. (default: 1)

download

def download(self):

Download the GAIA dataset.

load

def load(self, force_download = False):

Load the GAIA dataset.

Parameters:

  • force_download (bool, optional): Whether to force download the data. (default: False)

train

def train(self):

Get the training set.

run

def run(
    self,
    agent: ChatAgent,
    on: Literal['train', 'valid', 'test'],
    level: Union[int, List[int], Literal['all']],
    randomize: bool = False,
    subset: Optional[int] = None
):

Run the benchmark.

Parameters:

  • agent (ChatAgent): The agent to run the benchmark.
  • on (Literal["valid", "test"]): The dataset split to run the benchmark on.
  • level (Union[int, List[int], Literal["all"]]): The level(s) to run the benchmark on.
  • randomize (bool, optional): Whether to randomize the data. (default: False)
  • subset (Optional[int], optional): The subset of data to run. (default: None)

Returns:

Dict[str, Any]: The results of the benchmark.

_prepare_task

def _prepare_task(self, task: Dict[str, Any]):

Prepare the task by validating and enriching its data.

_create_user_message

def _create_user_message(self, task: Dict[str, Any]):

Create a user message from a task.

_process_result

def _process_result(
    self,
    agent: ChatAgent,
    task: Dict[str, Any],
    result: Any,
    file_obj: Any
):

Process and store the result of a task.

_handle_error

def _handle_error(
    self,
    task: Dict[str, Any],
    error: Exception,
    file_obj: Any
):

Handle errors encountered during task processing.

_generate_summary

def _generate_summary(self):

Generate and return a summary of the benchmark results.

question_scorer

def question_scorer(self, model_answer: str, ground_truth: str):

Scorer for the GAIA benchmark. See https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py

Parameters:

  • model_answer (str): The model answer.
  • ground_truth (str): The ground truth answer.

Returns:

bool: The score of the model answer.

normalize_number_str

def normalize_number_str(self, number_str: str):

split_string

def split_string(self, s: str, char_list: Optional[List[str]] = None):

Split a string based on a list of characters.

Parameters:

  • s (str): The string to split.
  • char_list (Optional[List[str]], optional): The list of characters to split on. (default: None)

normalize_str

def normalize_str(self, input_str, remove_punct = True):

Normalize a string.

Parameters:

  • input_str: The input string to normalize.
  • remove_punct: Whether to remove punctuation.

Returns:

str: The normalized string.

get_final_answer

def get_final_answer(self, content: str):

Get the final answer from the content.

Parameters:

  • content (str): The content to extract the final answer from.

Returns:

str: The final answer.