camel.benchmarks package#

Submodules#

camel.benchmarks.apibank module#

class camel.benchmarks.apibank.APIBankBenchmark(save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

API-Bank Benchmark adapted from API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs <AlibabaResearch/DAMO-ConvAI>.

Parameters:
  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download APIBank dataset and code from Github.

load(level: str, force_download: bool = False)[source]#

Load the APIBank Benchmark dataset.

Parameters:
  • level (str) – Level to run benchmark on.

  • force_download (bool, optional) – Whether to force download the data.

run(agent: ChatAgent, level: Literal['level-1', 'level-2'], api_test_enabled=True, randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • level (Literal['level-1', 'level-2']) – The level to run the benchmark on.

  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • api_test_enabled (bool, optional) – Whether to test API calling. (default: True)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]
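
Example (illustrative sketch based on the signatures above; the ChatAgent import path, its construction, and the file names are assumptions, not part of this module):

    from camel.agents import ChatAgent  # assumed import path for ChatAgent
    from camel.benchmarks import APIBankBenchmark

    # How the agent is configured is an assumption; pass a model/config as needed.
    agent = ChatAgent("You are a helpful assistant that can call APIs.")

    benchmark = APIBankBenchmark(save_to="apibank_results.jsonl")
    benchmark.download()                 # fetch the dataset and code from GitHub
    benchmark.load(level="level-1")
    results = benchmark.run(agent, level="level-1", subset=10)
    print(results)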

class camel.benchmarks.apibank.APIBankSample(chat_history, apis, ground_truth)[source]#

Bases: object

APIBank sample used to load the datasets.

classmethod from_chat_history(chat_history)[source]#
class camel.benchmarks.apibank.Evaluator(samples: List[APIBankSample])[source]#

Bases: object

Evaluator for APIBank benchmark.

evaluate(sample_id, model_output)[source]#
get_all_sample_ids()[source]#
get_api_description(api_name)[source]#
get_model_input(sample_id: int)[source]#
camel.benchmarks.apibank.agent_call(messages: List[Dict], agent: ChatAgent)[source]#

Add messages to agent memory and get response.

camel.benchmarks.apibank.calculate_rouge_l_score(reference, hypothesis)[source]#

Calculate rouge l score between hypothesis and reference.

camel.benchmarks.apibank.get_api_call(model_output)[source]#

Parse api call from model output.

camel.benchmarks.apibank.process_messages(chat_history: List[Dict[str, Any]], prompt: str) List[Dict[str, str]][source]#

Processes chat history into a structured format for further use.

Parameters:
  • chat_history (List[Dict[str, Any]]) – A list of dictionaries representing the chat history.

  • prompt (str) – A prompt to be set as the system message.

Returns:

A list of dictionaries representing the processed messages, where each dictionary has:

  • ’role’: The role of the message (‘system’, ‘user’, or ‘assistant’).

  • ’content’: The content of the message, including formatted API responses when applicable.

Return type:

List[Dict[str, str]]
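
For illustration, the returned structure has the following shape (the contents below are placeholders, not real API-Bank data):

    from typing import Dict, List

    # Placeholder values showing the documented return shape of process_messages.
    processed: List[Dict[str, str]] = [
        {"role": "system", "content": "You are a helpful API-calling assistant."},
        {"role": "user", "content": "What is the weather today?"},
        {"role": "assistant", "content": "[Weather API response formatted here]"},
    ]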

camel.benchmarks.apibench module#

class camel.benchmarks.apibench.APIBenchBenchmark(data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

APIBench Benchmark adapted from Gorilla: Large Language Model Connected with Massive APIs <https://huggingface.co/datasets/gorilla-llm/APIBench>.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the APIBench dataset.

load(dataset_name: str, force_download: bool = False)[source]#

Load the APIBench Benchmark dataset.

Parameters:
  • dataset_name (str) – Name of the specific dataset to be loaded.

  • force_download (bool, optional) – Whether to force download the data. (default: False)

run(agent: ChatAgent, dataset_name: Literal['huggingface', 'tensorflowhub', 'torchhub'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • (Literal["huggingface" (dataset_name) – “tensorflowhub”, “torchhub”]): The dataset to run the benchmark.

:param“tensorflowhub”, “torchhub”]):

The dataset to run the benchmark.

Parameters:
  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)
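
Example (illustrative sketch; the ChatAgent import path and construction, plus the directory and file names, are assumptions):

    from camel.agents import ChatAgent  # assumed import path for ChatAgent
    from camel.benchmarks import APIBenchBenchmark

    agent = ChatAgent("You are a coding assistant.")  # construction details are an assumption

    benchmark = APIBenchBenchmark(data_dir="datasets/apibench", save_to="apibench_results.jsonl")
    benchmark.download()
    benchmark.load(dataset_name="huggingface")
    results = benchmark.run(agent, dataset_name="huggingface", subset=5)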

camel.benchmarks.apibench.ast_check(candidate_subtree_list, base_tree_list, dataset_name)[source]#
camel.benchmarks.apibench.ast_parse(candidate)[source]#
camel.benchmarks.apibench.encode_question(question: str, dataset_name: str) str[source]#

Encode multiple prompt instructions into a single string.

camel.benchmarks.apibench.evaluate_response(response, question_id, dataset_name, api_database, qa_pairs, ast_database)[source]#
camel.benchmarks.apibench.get_all_sub_trees(root_node)[source]#
camel.benchmarks.apibench.get_args(node, dataset_name)[source]#

camel.benchmarks.base module#

class camel.benchmarks.base.BaseBenchmark(name: str, data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: ABC

Base class for benchmarks.

name#

Name of the benchmark.

Type:

str

data_dir#

Path to the data directory.

Type:

str

save_to#

Path to save the results.

Type:

str

processes#

Number of processes to use for parallel processing. (default: 1)

Type:

int

abstract download() BaseBenchmark[source]#

Download the benchmark data.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

abstract load(force_download: bool = False) BaseBenchmark[source]#

Load the benchmark data.

Parameters:

force_download (bool) – Whether to force download the data.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

property results: List[Dict[str, Any]]#

Get the results.

Returns:

The results.

Return type:

List[Dict[str, Any]]

abstract run(agent: ChatAgent, on: Literal['train', 'valid', 'test'], randomize: bool = False, subset: int | None = None, *args, **kwargs) BaseBenchmark[source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The chat agent.

  • on (str) – The data split to run the benchmark on.

  • randomize (bool) – Whether to randomize the data.

  • subset (int) – The subset of the data to run the benchmark on.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

property test: List[Dict[str, Any]]#

Get the test data.

Returns:

The test data.

Return type:

List[Dict[str, Any]]

property train: List[Dict[str, Any]]#

Get the training data.

Returns:

The training data.

Return type:

List[Dict[str, Any]]

property valid: List[Dict[str, Any]]#

Get the validation data.

Returns:

The validation data.

Return type:

List[Dict[str, Any]]
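
To implement a new benchmark, subclass BaseBenchmark and provide download(), load(), and run(). The sketch below is illustrative only; the backing attribute names (_data, _results) and the ChatAgent.step() usage are assumptions about the implementation:

    from typing import Literal, Optional

    from camel.agents import ChatAgent  # assumed import path for ChatAgent
    from camel.benchmarks import BaseBenchmark

    class ToyBenchmark(BaseBenchmark):
        """Illustrative subclass; not part of the camel package."""

        def download(self) -> "ToyBenchmark":
            # A real benchmark would fetch files into self.data_dir here.
            return self

        def load(self, force_download: bool = False) -> "ToyBenchmark":
            if force_download:
                self.download()
            # The attribute backing the train/valid/test properties is an assumption.
            self._data = {
                "train": [{"question": "1 + 1?", "answer": "2"}],
                "valid": [],
                "test": [{"question": "2 + 2?", "answer": "4"}],
            }
            return self

        def run(
            self,
            agent: ChatAgent,
            on: Literal["train", "valid", "test"],
            randomize: bool = False,
            subset: Optional[int] = None,
            *args,
            **kwargs,
        ) -> "ToyBenchmark":
            records = []
            for item in self._data[on][:subset]:
                reply = agent.step(item["question"])  # step() usage is an assumption
                records.append({"question": item["question"], "response": reply})
            # The attribute backing the results property is an assumption.
            self._results = records
            return self

    # Usage: ToyBenchmark("toy", "./data", "toy_results.jsonl").load().run(agent, on="test")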

camel.benchmarks.gaia module#

class camel.benchmarks.gaia.DefaultGAIARetriever(url_and_api_key: Tuple[str, str] | None = None, vector_storage_local_path: str | None = None, storage_type: StorageType | None = None, embedding_model: BaseEmbedding | None = None)[source]#

Bases: AutoRetriever

Default retriever for the GAIA benchmark. This retriever uses AutoRetriever in camel to retrieve the content based on the query.

reset(**kwargs: Any) bool[source]#

Reset the retriever.

Parameters:

**kwargs (Any) – The keyword arguments to pass to the retriever.

Returns:

Whether the reset was successful.

Return type:

bool

retrieve(query: str, contents: List[str], **kwargs: Any) Dict[str, Any][source]#

Retrieve the content based on the query.

Parameters:
  • query (str) – The query to search for.

  • contents (List[str]) – The list of contents to search from.

  • **kwargs (Any) – The keyword arguments to pass to the retriever.

Returns:

The retrieved content.

Return type:

Dict[str, Any]

class camel.benchmarks.gaia.GAIABenchmark(data_dir: str, save_to: str, retriever: RetrieverProtocol | None = None, processes: int = 1)[source]#

Bases: BaseBenchmark

GAIA Benchmark adapted from “GAIA: a benchmark for General AI Assistants”.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • retriever (Optional[RetrieverProtocol]) – The retriever to use. (default: None)

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the GAIA dataset.

get_final_answer(content: str) str[source]#

Get the final answer from the content.

Parameters:

content (str) – The content to extract the final answer from.

Returns:

The final answer.

Return type:

str

load(force_download=False)[source]#

Load the GAIA dataset.

Parameters:

force_download (bool, optional) – Whether to force download the data.

normalize_number_str(number_str: str) float[source]#
normalize_str(input_str, remove_punct=True) str[source]#

Normalize a string.

Parameters:
  • input_str – The input string to normalize.

  • remove_punct – Whether to remove punctuation.

Returns:

The normalized string.

Return type:

str

question_scorer(model_answer: str, ground_truth: str) bool[source]#

Scorer for the GAIA benchmark. https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py

Parameters:
  • model_answer (str) – The model answer.

  • ground_truth (str) – The ground truth answer.

Returns:

The score of the model answer.

Return type:

bool

run(agent: ChatAgent, on: Literal['train', 'valid', 'test'], level: int | List[int] | Literal['all'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • on (Literal["valid", "test"]) – The set to run the benchmark.

  • level (Union[int, List[int], Literal["all"]]) – The level to run the benchmark.

  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]
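
Example (illustrative sketch; the ChatAgent import path and construction, the retriever defaults, and the file names are assumptions):

    from camel.agents import ChatAgent  # assumed import path for ChatAgent
    from camel.benchmarks import DefaultGAIARetriever, GAIABenchmark

    agent = ChatAgent("You are a general AI assistant.")  # construction details are an assumption
    retriever = DefaultGAIARetriever()  # default construction is assumed to work with local storage

    benchmark = GAIABenchmark(
        data_dir="datasets/gaia", save_to="gaia_results.jsonl", retriever=retriever
    )
    benchmark.load()
    results = benchmark.run(agent, on="valid", level=1, subset=3)
    print(results)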

split_string(s: str, char_list: List[str] | None = None) list[str][source]#

Split a string based on a list of characters.

Parameters:
  • s (str) – The string to split.

  • char_list (Optional[List[str]], optional) – The list of characters to split on. (default: None)

property train#

Get the training set.

class camel.benchmarks.gaia.RetrieverProtocol(*args, **kwargs)[source]#

Bases: Protocol

Protocol for the retriever class. Any retriever class implementing this protocol can be used in the benchmark class.

reset(**kwargs) bool[source]#

Reset the retriever. Some benchmarks may require resetting the retriever after each query.

Parameters:

**kwargs – Additional keyword arguments.

Returns:

True if the reset was successful, False otherwise.

Return type:

bool

retrieve(query: str, contents: List[str], **kwargs: Dict[str, Any]) Dict[str, Any][source]#

Retrieve the relevant content for the query.

Parameters:
  • query (str) – The query to retrieve the content for.

  • contents (List[str]) – The list of contents to search in.

  • **kwargs (Dict[str, Any]) – Additional keyword arguments.

Returns:

The relevant content for the query.

Return type:

Dict[str, Any]
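
Since RetrieverProtocol is a structural Protocol, any object with matching reset() and retrieve() methods can be passed to the benchmark. A purely illustrative stand-in follows; the key used in the returned dictionary is an assumption:

    from typing import Any, Dict, List

    class KeywordRetriever:
        """Toy retriever satisfying RetrieverProtocol via keyword overlap."""

        def reset(self, **kwargs) -> bool:
            # Nothing to clear in this toy example.
            return True

        def retrieve(
            self, query: str, contents: List[str], **kwargs: Dict[str, Any]
        ) -> Dict[str, Any]:
            query_terms = set(query.lower().split())
            best = max(
                contents,
                key=lambda text: len(query_terms & set(text.lower().split())),
                default="",
            )
            # The exact key expected by the benchmark is an assumption.
            return {"retrieved context": best}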

camel.benchmarks.nexus module#

class camel.benchmarks.nexus.NexusBenchmark(data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

Nexus Function Calling Benchmark adapted from NexusRaven V2 Function Calling Benchmark <https://huggingface.co/collections/Nexusflow/nexusraven-v2-function-calling-benchmark-657a597fb84dbe7a09ebfc3e>.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the Nexus Function Calling Benchmark dataset.

load(dataset_name: str, force_download: bool = False)[source]#

Load the Nexus Benchmark dataset.

Parameters:
  • dataset_name (str) – Name of the specific dataset to be loaded.

  • force_download (bool) – Whether to force download the data.

run(agent: ChatAgent, task: Literal['NVDLibrary', 'VirusTotal', 'OTX', 'PlacesAPI', 'ClimateAPI', 'VirusTotal-ParallelCalls', 'VirusTotal-NestedCalls', 'NVDLibrary-NestedCalls'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • (Literal["NVDLibrary" (task)

  • "VirusTotal"

  • "OTX"

:param : :param “PlacesAPI”: :param “ClimateAPI”: :param “VirusTotal-ParallelCalls”: :param : :param “VirusTotal-NestedCalls”: :param : :param “NVDLibrary-NestedCalls”]): The task to run the benchmark. :param randomize: Whether to randomize the data.

(default: False)

Parameters:

subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]
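
Example (illustrative sketch; the ChatAgent import path and construction are assumptions, and dataset_name matching the task name is also an assumption):

    from camel.agents import ChatAgent  # assumed import path for ChatAgent
    from camel.benchmarks import NexusBenchmark

    agent = ChatAgent("You are a function-calling assistant.")  # construction is an assumption

    benchmark = NexusBenchmark(data_dir="datasets/nexus", save_to="nexus_results.jsonl")
    benchmark.load(dataset_name="NVDLibrary")
    results = benchmark.run(agent, task="NVDLibrary", subset=5)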

property train#

Get the training set.

class camel.benchmarks.nexus.NexusSample(input: str, output: str)[source]#

Bases: object

Nexus benchmark dataset sample.

input: str#
output: str#
class camel.benchmarks.nexus.NexusTool(function_calls: str, descriptions: str)[source]#

Bases: object

Nexus benchmark tool.

descriptions: str#
function_calls: str#
camel.benchmarks.nexus.compare_function_calls(agent_call: str, ground_truth_call: str) bool[source]#

Compare the function name and arguments of agent_call and ground_truth_call.

Parameters:
  • agent_call (str) – Function call by agent.

  • ground_truth_call (str) – Ground truth function call.

Returns:

  • True if the function names and arguments match.

  • False otherwise.

camel.benchmarks.nexus.construct_prompt(input: str, tools: str) str[source]#

Construct prompt from tools and input.

camel.benchmarks.nexus.construct_tool_descriptions(dataset_name: str) str[source]#

Construct tool descriptions from function definitions and descriptions.

camel.benchmarks.nexus.parse_function_call(call: str) Tuple[str | None, List[Any] | None, Dict[str, Any] | None][source]#

Parse a function call string to extract the function name, positional arguments, and keyword arguments, including nested function calls.

Parameters:

call (str) – A string in the format func(arg1, arg2, kwarg=value).

Returns:

(function_name (str), positional_args (list), keyword_args (dict)) or (None, None, None).

Return type:

tuple
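
Example (the call string is hypothetical, and the exact parsed value types are assumptions):

    from camel.benchmarks.nexus import parse_function_call

    name, args, kwargs = parse_function_call("searchcve(cve_id='CVE-2021-44228', verbose=True)")
    # Roughly: name == 'searchcve', args == [], kwargs == {'cve_id': 'CVE-2021-44228', 'verbose': True}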

camel.benchmarks.ragbench module#

class camel.benchmarks.ragbench.RAGBenchBenchmark(processes: int = 1, subset: Literal['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa'] = 'hotpotqa', split: Literal['train', 'test', 'validation'] = 'test')[source]#

Bases: BaseBenchmark

RAGBench Benchmark for evaluating RAG performance.

This benchmark uses the rungalileo/ragbench dataset to evaluate retrieval-augmented generation (RAG) systems. It measures context relevancy and faithfulness metrics as described in https://arxiv.org/abs/2407.11005.

Parameters:
  • processes (int, optional) – Number of processes for parallel processing.

  • subset (str, optional) – Dataset subset to use (e.g., “hotpotqa”).

  • split (str, optional) – Dataset split to use (e.g., “test”).

download()[source]#

Download the RAGBench dataset.

load(force_download: bool = False)[source]#

Load the RAGBench dataset.

Parameters:

force_download (bool, optional) – Whether to force download the data.

run(agent: ChatAgent, auto_retriever: AutoRetriever) Dict[str, float | None][source]#

Run the benchmark evaluation.

Parameters:
  • agent (ChatAgent) – Chat agent for generating answers.

  • auto_retriever (AutoRetriever) – Retriever for finding relevant contexts.

Returns:

Dictionary of evaluation metrics.

Return type:

Dict[str, Optional[float]]
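
Example (illustrative sketch; the ChatAgent and AutoRetriever import paths and their default constructions are assumptions):

    from camel.agents import ChatAgent          # assumed import path for ChatAgent
    from camel.benchmarks import RAGBenchBenchmark
    from camel.retrievers import AutoRetriever  # assumed import path for AutoRetriever

    agent = ChatAgent("Answer questions using the retrieved context.")  # construction is an assumption
    auto_retriever = AutoRetriever()  # default construction is an assumption

    benchmark = RAGBenchBenchmark(subset="hotpotqa", split="test")
    benchmark.load()
    metrics = benchmark.run(agent, auto_retriever)
    print(metrics)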

class camel.benchmarks.ragbench.RagasFields[source]#

Bases: object

Constants for RAGAS evaluation field names.

INPUT_ANSWER = 'answer'#
INPUT_CONTEXT = 'contexts'#
INPUT_QUESTION = 'question'#
camel.benchmarks.ragbench.annotate_dataset(dataset: Dataset, context_call: Callable[[Dict[str, Any]], List[str]] | None, answer_call: Callable[[Dict[str, Any]], str] | None) Dataset[source]#

Annotate the dataset by adding context and answers using the provided functions.

Parameters:
  • dataset (Dataset) – The input dataset to annotate.

  • context_call (Optional[Callable[[Dict[str, Any]], List[str]]]) – Function to generate context for each example.

  • answer_call (Optional[Callable[[Dict[str, Any]], str]]) – Function to generate answer for each example.

Returns:

The annotated dataset with added contexts and/or answers.

Return type:

Dataset

camel.benchmarks.ragbench.auroc(trues: Sequence[bool], preds: Sequence[float]) float[source]#

Calculate Area Under Receiver Operating Characteristic Curve (AUROC).

Parameters:
  • trues (Sequence[bool]) – Ground truth binary values.

  • preds (Sequence[float]) – Predicted probability values.

Returns:

AUROC score.

Return type:

float

camel.benchmarks.ragbench.ragas_calculate_metrics(dataset: Dataset, pred_context_relevance_field: str | None, pred_faithfulness_field: str | None, metrics_to_evaluate: List[str] | None = None, ground_truth_context_relevance_field: str = 'relevance_score', ground_truth_faithfulness_field: str = 'adherence_score') Dict[str, float | None][source]#

Calculate RAGAS evaluation metrics.

Parameters:
  • dataset (Dataset) – The dataset containing predictions and ground truth.

  • pred_context_relevance_field (Optional[str]) – Field name for predicted context relevance.

  • pred_faithfulness_field (Optional[str]) – Field name for predicted faithfulness.

  • metrics_to_evaluate (Optional[List[str]]) – List of metrics to evaluate.

  • ground_truth_context_relevance_field (str) – Field name for ground truth relevance.

  • ground_truth_faithfulness_field (str) – Field name for ground truth adherence.

Returns:

Dictionary of calculated metrics.

Return type:

Dict[str, Optional[float]]

camel.benchmarks.ragbench.ragas_evaluate_dataset(dataset: Dataset, contexts_field_name: str | None, answer_field_name: str | None, metrics_to_evaluate: List[str] | None = None) Dataset[source]#

Evaluate the dataset using RAGAS metrics.

Parameters:
  • dataset (Dataset) – Input dataset to evaluate.

  • contexts_field_name (Optional[str]) – Field name containing contexts.

  • answer_field_name (Optional[str]) – Field name containing answers.

  • metrics_to_evaluate (Optional[List[str]]) – List of metrics to evaluate.

Returns:

Dataset with added evaluation metrics.

Return type:

Dataset

camel.benchmarks.ragbench.rmse(input_trues: Sequence[float], input_preds: Sequence[float]) float | None[source]#

Calculate Root Mean Squared Error (RMSE).

Parameters:
  • input_trues (Sequence[float]) – Ground truth values.

  • input_preds (Sequence[float]) – Predicted values.

Returns:

RMSE value, or None if inputs have different lengths.

Return type:

Optional[float]
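
Example with toy values, using the documented signatures of rmse and auroc:

    from camel.benchmarks.ragbench import auroc, rmse

    trues = [0.9, 0.5, 0.7]
    preds = [0.8, 0.4, 0.9]
    print(rmse(trues, preds))            # root mean squared error over the paired values

    binary_trues = [True, False, True]
    scores = [0.9, 0.2, 0.6]
    print(auroc(binary_trues, scores))   # area under the ROC curve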

Module contents#

class camel.benchmarks.APIBankBenchmark(save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

API-Bank Benchmark adapted from API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs <AlibabaResearch/DAMO-ConvAI>.

Parameters:
  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download APIBank dataset and code from Github.

load(level: str, force_download: bool = False)[source]#

Load the APIBank Benchmark dataset.

Parameters:
  • level (str) – Level to run benchmark on.

  • force_download (bool, optional) – Whether to force download the data.

run(agent: ChatAgent, level: Literal['level-1', 'level-2'], api_test_enabled=True, randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • level (Literal['level-1', 'level-2']) – The level to run the benchmark on.

  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • api_test_enabled (bool, optional) – Whether to test API calling. (default: True)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]

class camel.benchmarks.APIBenchBenchmark(data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

APIBench Benchmark adapted from Gorilla: Large Language Model Connected with Massive APIs <https://huggingface.co/datasets/gorilla-llm/APIBench>.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the APIBench dataset.

load(dataset_name: str, force_download: bool = False)[source]#

Load the APIBench Benchmark dataset.

Parameters:
  • dataset_name (str) – Name of the specific dataset to be loaded.

  • force_download (bool, optional) – Whether to force download the data. (default: False)

run(agent: ChatAgent, dataset_name: Literal['huggingface', 'tensorflowhub', 'torchhub'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • (Literal["huggingface" (dataset_name) – “tensorflowhub”, “torchhub”]): The dataset to run the benchmark.

:param“tensorflowhub”, “torchhub”]):

The dataset to run the benchmark.

Parameters:
  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)

class camel.benchmarks.BaseBenchmark(name: str, data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: ABC

Base class for benchmarks.

name#

Name of the benchmark.

Type:

str

data_dir#

Path to the data directory.

Type:

str

save_to#

Path to save the results.

Type:

str

processes#

Number of processes to use for parallel processing. (default: 1)

Type:

int

abstract download() BaseBenchmark[source]#

Download the benchmark data.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

abstract load(force_download: bool = False) BaseBenchmark[source]#

Load the benchmark data.

Parameters:

force_download (bool) – Whether to force download the data.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

property results: List[Dict[str, Any]]#

Get the results.

Returns:

The results.

Return type:

List[Dict[str, Any]]

abstract run(agent: ChatAgent, on: Literal['train', 'valid', 'test'], randomize: bool = False, subset: int | None = None, *args, **kwargs) BaseBenchmark[source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The chat agent.

  • on (str) – The data split to run the benchmark on.

  • randomize (bool) – Whether to randomize the data.

  • subset (int) – The subset of the data to run the benchmark on.

Returns:

The benchmark instance.

Return type:

BaseBenchmark

property test: List[Dict[str, Any]]#

Get the test data.

Returns:

The test data.

Return type:

List[Dict[str, Any]]

property train: List[Dict[str, Any]]#

Get the training data.

Returns:

The training data.

Return type:

List[Dict[str, Any]]

property valid: List[Dict[str, Any]]#

Get the validation data.

Returns:

The validation data.

Return type:

List[Dict[str, Any]]

class camel.benchmarks.DefaultGAIARetriever(url_and_api_key: Tuple[str, str] | None = None, vector_storage_local_path: str | None = None, storage_type: StorageType | None = None, embedding_model: BaseEmbedding | None = None)[source]#

Bases: AutoRetriever

Default retriever for the GAIA benchmark. This retriever uses AutoRetriever in camel to retrieve the content based on the query.

reset(**kwargs: Any) bool[source]#

Reset the retriever.

Parameters:

**kwargs (Any) – The keyword arguments to pass to the retriever.

Returns:

Whether the reset was successful.

Return type:

bool

retrieve(query: str, contents: List[str], **kwargs: Any) Dict[str, Any][source]#

Retrieve the content based on the query.

Parameters:
  • query (str) – The query to search for.

  • contents (List[str]) – The list of contents to search from.

  • **kwargs (Any) – The keyword arguments to pass to the retriever.

Returns:

The retrieved content.

Return type:

Dict[str, Any]

class camel.benchmarks.GAIABenchmark(data_dir: str, save_to: str, retriever: RetrieverProtocol | None = None, processes: int = 1)[source]#

Bases: BaseBenchmark

GAIA Benchmark adapted from “GAIA: a benchmark for General AI Assistants”.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • retriever (Optional[RetrieverProtocol]) – The retriever to use. (default: None)

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the GAIA dataset.

get_final_answer(content: str) str[source]#

Get the final answer from the content.

Parameters:

content (str) – The content to extract the final answer from.

Returns:

The final answer.

Return type:

str

load(force_download=False)[source]#

Load the GAIA dataset.

Parameters:

force_download (bool, optional) – Whether to force download the data.

normalize_number_str(number_str: str) float[source]#
normalize_str(input_str, remove_punct=True) str[source]#

Normalize a string.

Parameters:
  • input_str – The input string to normalize.

  • remove_punct – Whether to remove punctuation.

Returns:

The normalized string.

Return type:

str

question_scorer(model_answer: str, ground_truth: str) bool[source]#

Scorer for the GAIA benchmark. https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py

Parameters:
  • model_answer (str) – The model answer.

  • ground_truth (str) – The ground truth answer.

Returns:

The score of the model answer.

Return type:

bool

run(agent: ChatAgent, on: Literal['train', 'valid', 'test'], level: int | List[int] | Literal['all'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • on (Literal["valid", "test"]) – The set to run the benchmark.

  • level (Union[int, List[int], Literal["all"]]) – The level to run the benchmark.

  • randomize (bool, optional) – Whether to randomize the data. (default: False)

  • subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]

split_string(s: str, char_list: List[str] | None = None) list[str][source]#

Split a string based on a list of characters.

Parameters:
  • s (str) – The string to split.

  • char_list (Optional[List[str]], optional) – The list of characters to split on. (default: None)

property train#

Get the training set.

class camel.benchmarks.NexusBenchmark(data_dir: str, save_to: str, processes: int = 1)[source]#

Bases: BaseBenchmark

Nexus Function Calling Benchmark adapted from NexusRaven V2 Function Calling Benchmark <https://huggingface.co/collections/Nexusflow/nexusraven-v2-function-calling-benchmark-657a597fb84dbe7a09ebfc3e>.

Parameters:
  • data_dir (str) – The directory to save the data.

  • save_to (str) – The file to save the results.

  • processes (int, optional) – The number of processes to use. (default: 1)

download()[source]#

Download the Nexus Function Calling Benchmark dataset.

load(dataset_name: str, force_download: bool = False)[source]#

Load the Nexus Benchmark dataset.

Parameters:
  • dataset_name (str) – Name of the specific dataset to be loaded.

  • force_download (bool) – Whether to force download the data.

run(agent: ChatAgent, task: Literal['NVDLibrary', 'VirusTotal', 'OTX', 'PlacesAPI', 'ClimateAPI', 'VirusTotal-ParallelCalls', 'VirusTotal-NestedCalls', 'NVDLibrary-NestedCalls'], randomize: bool = False, subset: int | None = None) Dict[str, Any][source]#

Run the benchmark.

Parameters:
  • agent (ChatAgent) – The agent to run the benchmark.

  • (Literal["NVDLibrary" (task)

  • "VirusTotal"

  • "OTX"

:param : :param “PlacesAPI”: :param “ClimateAPI”: :param “VirusTotal-ParallelCalls”: :param : :param “VirusTotal-NestedCalls”: :param : :param “NVDLibrary-NestedCalls”]): The task to run the benchmark. :param randomize: Whether to randomize the data.

(default: False)

Parameters:

subset (Optional[int], optional) – The subset of data to run. (default: None)

Returns:

The results of the benchmark.

Return type:

Dict[str, Any]

property train#

Get the training set.

class camel.benchmarks.RAGBenchBenchmark(processes: int = 1, subset: Literal['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa'] = 'hotpotqa', split: Literal['train', 'test', 'validation'] = 'test')[source]#

Bases: BaseBenchmark

RAGBench Benchmark for evaluating RAG performance.

This benchmark uses the rungalileo/ragbench dataset to evaluate retrieval-augmented generation (RAG) systems. It measures context relevancy and faithfulness metrics as described in https://arxiv.org/abs/2407.11005.

Parameters:
  • processes (int, optional) – Number of processes for parallel processing.

  • subset (str, optional) – Dataset subset to use (e.g., “hotpotqa”).

  • split (str, optional) – Dataset split to use (e.g., “test”).

download()[source]#

Download the RAGBench dataset.

load(force_download: bool = False)[source]#

Load the RAGBench dataset.

Parameters:

force_download (bool, optional) – Whether to force download the data.

run(agent: ChatAgent, auto_retriever: AutoRetriever) Dict[str, float | None][source]#

Run the benchmark evaluation.

Parameters:
  • agent (ChatAgent) – Chat agent for generating answers.

  • auto_retriever (AutoRetriever) – Retriever for finding relevant contexts.

Returns:

Dictionary of evaluation metrics.

Return type:

Dict[str, Optional[float]]