def process_messages(chat_history: List[Dict[str, Any]], prompt: str):
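A minimal usage sketch follows. The import path and the `role`/`text` keys on the chat-history entries are assumptions based on the upstream API-Bank data format, and the exact shape of the returned messages may differ.

```python
from typing import Any, Dict, List

from camel.benchmarks.apibank import process_messages  # assumed module path

# Hypothetical API-Bank-style chat history; the 'role'/'text' key names
# are an assumption here.
chat_history: List[Dict[str, Any]] = [
    {'role': 'User', 'text': 'Is the meeting room free at 3pm?'},
    {'role': 'AI', 'text': 'Let me check with the booking API.'},
]

system_prompt = 'You are a tool-augmented assistant.'

# Expected to yield OpenAI-style messages, starting with the system prompt,
# e.g. [{'role': 'system', 'content': system_prompt}, {'role': 'user', ...}]
messages = process_messages(chat_history, system_prompt)
```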
class APIBankBenchmark(BaseBenchmark):

API-Bank benchmark, adapted from "API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs"
<https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/api-bank>.

Parameters:
- save_to (str): The file to save the results to.
- processes (int, optional): The number of processes to use for parallel processing. (default: :obj:`1`)

def __init__(self, save_to: str, processes: int = 1):

def download(self):
def load(self, level: str, force_download: bool = False):
def run(
    self,
    agent: ChatAgent,
    level: Literal['level-1', 'level-2'],
    api_test_enabled: bool = True,
    randomize: bool = False,
    subset: Optional[int] = None,
):

Run the benchmark.

Parameters:
- agent (ChatAgent): The agent to run the benchmark with.
- level (Literal['level-1', 'level-2']): The level to run the benchmark on.
- api_test_enabled (bool, optional): Whether to test API calling (True) or response quality (False). (default: :obj:`True`)
- randomize (bool, optional): Whether to randomize the data. (default: :obj:`False`)
- subset (Optional[int], optional): The subset of data to run. (default: :obj:`None`)
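To show how these pieces fit together, here is a hedged end-to-end sketch. The `camel.benchmarks` and `camel.agents` import paths, the agent configuration, and the exact call sequence (for example, whether `run()` loads the data itself) are assumptions.

```python
from camel.agents import ChatAgent
from camel.benchmarks import APIBankBenchmark  # assumed import path

agent = ChatAgent(system_message='You are a helpful assistant that can call APIs.')

benchmark = APIBankBenchmark(save_to='apibank_results.jsonl', processes=1)
benchmark.download()             # fetch the API-Bank data and code from GitHub
benchmark.load(level='level-1')  # run() may also handle loading internally

results = benchmark.run(
    agent,
    level='level-1',
    api_test_enabled=True,  # score the predicted API call rather than the response text
    randomize=False,
    subset=10,              # limit to 10 samples for a quick smoke test
)
print(results)
```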
def agent_call(messages: List[Dict], agent: ChatAgent):
def calculate_rouge_l_score(reference, hypothesis):
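ROUGE-L scores the overlap between a hypothesis and a reference via their longest common subsequence (LCS). The snippet below is an illustrative re-implementation of the metric, not necessarily how `calculate_rouge_l_score` computes it (a library such as `rouge` may be used instead).

```python
def rouge_l_f1(reference: str, hypothesis: str) -> float:
    """Illustrative ROUGE-L F1 based on the longest common subsequence."""
    ref, hyp = reference.split(), hypothesis.split()
    # Dynamic-programming table for the LCS length.
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i, r in enumerate(ref, 1):
        for j, h in enumerate(hyp, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if r == h else max(dp[i - 1][j], dp[i][j - 1])
    lcs = dp[-1][-1]
    if lcs == 0:
        return 0.0
    precision, recall = lcs / len(hyp), lcs / len(ref)
    return 2 * precision * recall / (precision + recall)
```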
def get_api_call(model_output):
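API-Bank expects tool calls in a bracketed form such as `[ToolSearcher(keywords='meeting room')]`. The sketch below shows one way to pull the last such call out of a model's output with a regular expression; the exact pattern used by `get_api_call` is an assumption.

```python
import re
from typing import Optional

def extract_api_call(model_output: str) -> Optional[str]:
    """Return the last bracketed API call in the output, if any (illustrative)."""
    # Matches e.g. [ToolSearcher(keywords='meeting room availability')]
    matches = re.findall(r'\[\w+\(.*?\)\]', model_output)
    return matches[-1] if matches else None
```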
class APIBankSample:
def __init__(
self,
chat_history,
apis,
ground_truth
):
def __repr__(self):
def from_chat_history(cls, chat_history):
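A sample bundles the dialogue context, the candidate API descriptions, and the expected call. The field contents below are assumptions about the data layout, included only to illustrate how the three constructor arguments relate.

```python
# Hypothetical contents; the dict layouts are illustrative, not the dataset schema.
sample = APIBankSample(
    chat_history=[{'role': 'User', 'text': 'Book a meeting room for 3pm.'}],
    apis=[{'name': 'BookMeetingRoom', 'description': 'Books a meeting room.'}],
    ground_truth={'api_name': 'BookMeetingRoom', 'param_dict': {'time': '15:00'}},
)
print(sample)  # __repr__ gives a short summary of the sample
```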
class Evaluator:
def __init__(self, samples: List[APIBankSample]):
def get_all_sample_ids(self):
def get_api_description(self, api_name):
def get_model_input(self, sample_id: int):
def evaluate(self, sample_id, model_output):
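Putting the `Evaluator` methods together, a scoring loop might look like the sketch below. The return shapes assumed for `get_model_input` (API descriptions plus chat history), `agent_call` (the model's text output), and `evaluate` (a correctness flag plus details) are assumptions.

```python
from camel.agents import ChatAgent

def score_api_calls(evaluator: Evaluator, agent: ChatAgent) -> float:
    """Hypothetical scoring loop over all samples; return-value shapes are assumed."""
    sample_ids = evaluator.get_all_sample_ids()
    correct = 0
    for sample_id in sample_ids:
        # Assumed to return the available API descriptions and the chat history.
        api_description, chat_history = evaluator.get_model_input(sample_id)
        prompt = f'You can call the following APIs:\n{api_description}'
        messages = process_messages(chat_history, prompt)
        model_output = agent_call(messages, agent)
        # evaluate() is assumed to return (is_correct, details).
        is_correct, _ = evaluator.evaluate(sample_id, model_output)
        correct += bool(is_correct)
    return correct / max(len(sample_ids), 1)
```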