process_messages

def process_messages(chat_history: List[Dict[str, Any]], prompt: str):

Processes chat history into a structured format for further use.

Parameters:

  • chat_history (List[Dict[str, Any]]): A list of dictionaries representing the chat history.
  • prompt (str): A prompt to be set as the system message.

Returns:

List[Dict[str, str]]: A list of dictionaries representing the processed messages, where each dictionary has:

  • 'role': The role of the message ('system', 'user', or 'assistant').
  • 'content': The content of the message, including formatted API responses when applicable.

APIBankBenchmark

class APIBankBenchmark(BaseBenchmark):

API-Bank benchmark, adapted from "API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs" <https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/api-bank>.

Parameters:

  • save_to (str): The file to save the results.
  • processes (int, optional): The number of processes to use. (default: 1)

__init__

def __init__(self, save_to: str, processes: int = 1):

Initialize the APIBank benchmark.

Parameters:

  • save_to (str): The file to save the results.
  • processes (int, optional): The number of processes to use for parallel processing. (default: 1)

download

def download(self):

Download the APIBank dataset and code from GitHub.

load

def load(self, level: str, force_download: bool = False):

Load the APIBank Benchmark dataset.

Parameters:

  • level (str): Level to run benchmark on.
  • force_download (bool, optional): Whether to force download the data. (default: False)

run

def run(
    self,
    agent: ChatAgent,
    level: Literal['level-1', 'level-2'],
    api_test_enabled = True,
    randomize: bool = False,
    subset: Optional[int] = None
):

Run the benchmark.

Parameters:

  • agent (ChatAgent): The agent to run the benchmark.
  • level (Literal['level-1', 'level-2']): The level to run the benchmark on.
  • randomize (bool, optional): Whether to randomize the data. (default: False)
  • api_test_enabled (bool, optional): Whether to test API calling (True) or response (False). (default: True)
  • subset (Optional[int], optional): The subset of data to run. (default: None)

Returns:

Dict[str, Any]: The results of the benchmark.

agent_call

def agent_call(messages: List[Dict], agent: ChatAgent):

Add messages to agent memory and get response.

calculate_rouge_l_score

def calculate_rouge_l_score(reference, hypothesis):

Calculate the ROUGE-L score between the hypothesis and the reference.

get_api_call

def get_api_call(model_output):

Parse the API call from the model output.

APIBankSample

class APIBankSample:

APIBank sample used to load the datasets.

__init__

def __init__(
    self,
    chat_history,
    apis,
    ground_truth
):

__repr__

def __repr__(self):

from_chat_history

def from_chat_history(cls, chat_history):

Evaluator

class Evaluator:

Evaluator for APIBank benchmark.

__init__

def __init__(self, samples: List[APIBankSample]):

get_all_sample_ids

def get_all_sample_ids(self):

get_api_description

def get_api_description(self, api_name):

get_model_input

def get_model_input(self, sample_id: int):

evaluate

def evaluate(self, sample_id, model_output):