Source code for camel.data_collector.sharegpt_collector

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import json
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union

from pydantic import BaseModel
from typing_extensions import Self

from camel.agents import ChatAgent
from camel.data_collector.base import BaseDataCollector
from camel.messages import BaseMessage
from camel.messages.conversion.conversation_models import (
    ShareGPTConversation,
    ShareGPTMessage,
)
from camel.schemas import OpenAISchemaConverter
from camel.toolkits import FunctionTool

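# Map collector roles to ShareGPT "from" labels: tool observations are
# emitted as "human" turns and function calls as "gpt" turns.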
FROM_HASH = {
    "human": "human",
    "gpt": "gpt",
    "observation": "human",
    "function_call": "gpt",
}
# ruff: noqa: E501
DEFAULT_CONVERTER_PROMPTS = """
    Extract key entities and attributes from the conversations
    and convert them into a structured JSON format.
    For example:
    System: You are a helpful assistant
    Tools: [{"name": "get_release_date", "arguments": ["Portal"]}]
    User: When is the release date of the video game Portal?
    Assistant: The release date of the video game Portal is October 9, 2007.
    Your output should be:
    {
        "system": "You are a helpful assistant",
        "tools": "[{"name": "get_release_date", "arguments": ["Portal"]}]",
        "conversations": [
            {"from": "human", "value": "When is the release date of the video game Portal?"},
            {"from": "gpt", "value": "The release date of the video game Portal is October 9, 2007."}
        ]
    }
"""


class ConversationItem(BaseModel):
    r"""A single ShareGPT conversation turn."""

    from_: Literal["human", "gpt", "function_call", "observation"]
    value: str

    class Config:
        fields: ClassVar[Dict[str, str]] = {"from_": "from"}
        extra = "forbid"
class ShareGPTData(BaseModel):
    r"""Structured ShareGPT record: system prompt, tool schemas, and turns."""

    system: str
    tools: str
    conversations: List[ConversationItem]

    class Config:
        extra = "forbid"
class ShareGPTDataCollector(BaseDataCollector):
    r"""Collects agent interactions and exports them in ShareGPT format."""

    def __init__(self) -> None:
        super().__init__()
        self.system_message: Optional[BaseMessage] = None
        self.agent_name: Optional[str] = None
        self.tools: List[FunctionTool] = []

    def record(
        self,
        agent: Union[List[ChatAgent], ChatAgent],
    ) -> Self:
        r"""Inject an agent into the data collector."""
        if not self.agent_name:
            _agent = agent if isinstance(agent, ChatAgent) else agent[0]
            self.agent_name = _agent.role_name
            self.system_message = _agent._system_message
            self.tools += list(_agent.tool_dict.values())
        super().record(agent)
        return self

    def convert(self) -> Dict[str, Any]:
        r"""Convert the collected data into a dictionary."""
        if self.agent_name is None:
            raise ValueError("No agent injected")
        history = self.get_agent_history(self.agent_name)
        if not history:
            raise ValueError("No data collected.")

        data = dict(
            system=self.system_message.content if self.system_message else "",
            tools=json.dumps(
                [t.get_openai_tool_schema()["function"] for t in self.tools],
                ensure_ascii=False,
            ),
            conversations=[],
        )
        conversations: List[Any] = []
        for _data in history:
            role, message = _data.role, _data
            if role == "user":
                conversations.append(
                    {"from": "human", "value": message.message}
                )
            elif role == "assistant":
                if message.function_call:
                    conversations.append(
                        {
                            "from": "function_call",
                            "value": json.dumps(
                                message.function_call, ensure_ascii=False
                            ),
                        }
                    )
                else:
                    conversations.append(
                        {"from": "gpt", "value": message.message}
                    )
            elif role == "function" or role == "tool":
                conversations.append(
                    {
                        "from": "observation",
                        "value": json.dumps(
                            message.message,  # type: ignore[attr-defined]
                            ensure_ascii=False,
                        ),
                    }
                )

        data["conversations"] = conversations
        self.data.append(data)
        return data

    def llm_convert(
        self,
        converter: Optional[OpenAISchemaConverter] = None,
        prompt: Optional[str] = None,
    ) -> Dict[str, Any]:
        r"""Convert collected data using an LLM schema converter.

        Args:
            converter (Optional[OpenAISchemaConverter], optional):
                The converter to use.
                (default: :obj:`OpenAISchemaConverter`)
            prompt (Optional[str], optional):
                Prompt to guide the conversion.
                (default: :obj:`DEFAULT_CONVERTER_PROMPTS`)

        Returns:
            Dict[str, Any]: The converted data.

        Raises:
            ValueError: If no agent is injected or data cannot be collected.
        """
        prompt = prompt or DEFAULT_CONVERTER_PROMPTS
        converter = converter or OpenAISchemaConverter()

        system = self.system_message.content if self.system_message else ""
        context = [f"System: {system}\n"]
        context.append(
            "Tools: "
            + json.dumps(
                [t.get_openai_tool_schema()["function"] for t in self.tools],
                ensure_ascii=False,
            )
        )
        for _data in self.get_agent_history(str(self.agent_name)):
            role, message = _data.role, _data
            prefix = (
                f"{role}: "
                if role != "user"
                else "User: " + f"{_data.name}: "
            )
            if message.function_call:
                context.append(
                    prefix
                    + json.dumps(message.function_call, ensure_ascii=False)
                )
            elif role == "function" or role == "tool":
                context.append(
                    prefix
                    + json.dumps(message.message, ensure_ascii=False)  # type: ignore[attr-defined]
                )
            else:
                context.append(prefix + str(message.message))

        return converter.convert(
            "\n".join(context), ShareGPTData, prompt
        ).model_dump()

    @staticmethod
    def to_sharegpt_conversation(data: Dict[str, Any]) -> ShareGPTConversation:
        r"""Build a validated ShareGPTConversation from a converted record."""
        messages = [
            ShareGPTMessage(from_="system", value=data["system"])  # type: ignore[call-arg]
        ]
        for item in data["conversations"]:
            messages.append(
                ShareGPTMessage(  # type: ignore[call-arg]
                    from_=FROM_HASH[item["from"]],
                    value=item["value"],
                )
            )
        return ShareGPTConversation(root=messages)
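

# Example usage (illustrative sketch): the flow below assumes a configured
# model backend with API credentials; only ``record``, ``convert``,
# ``llm_convert``, and ``to_sharegpt_conversation`` are defined in this
# module, the rest (``ChatAgent.step``, ``start``) comes from ``ChatAgent``
# and ``BaseDataCollector``. ``my_function`` is a hypothetical tool callable.
#
#     agent = ChatAgent(
#         system_message="You are a helpful assistant.",
#         tools=[FunctionTool(my_function)],  # hypothetical tool
#     )
#     collector = ShareGPTDataCollector().record(agent).start()
#     agent.step("When is the release date of the video game Portal?")
#
#     sharegpt_dict = collector.convert()  # rule-based conversion
#     llm_dict = collector.llm_convert()  # LLM-assisted conversion
#     conversation = ShareGPTDataCollector.to_sharegpt_conversation(
#         sharegpt_dict
#     )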