Source code for camel.datagen.self_instruct.self_instruct

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import json
import os
import random
import time
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field

from camel.agents import ChatAgent
from camel.logger import get_logger

from .filter import RougeSimilarityFilter
from .filter.instruction_filter import InstructionFilter
from .templates import SelfInstructTemplates

logger = get_logger(__name__)


class SelfInstructPipeline:
    r"""A pipeline to generate and manage machine-generated instructions for
    tasks, combining human and machine task samples.

    Args:
        agent (ChatAgent): The agent used to interact and generate
            instructions.
        seed (str): The path to the human-written instructions.
        num_machine_instructions (int): Number of machine-generated
            instructions to generate. (default: :obj:`5`)
        data_output_path (Optional[str]): Path to save the generated data.
            If :obj:`None`, the generated data is not written to disk.
            (default: :obj:`./data_output.json`)
        human_to_machine_ratio (tuple): Ratio of human to machine tasks used
            for instruction generation. (default: :obj:`(6, 2)`)
        instruction_filter (InstructionFilter): A filter to validate
            generated instructions. When given, :obj:`filter_config` and
            :obj:`stop_on_first_failure` are ignored.
            (default: :obj:`None`)
        filter_config (Optional[Dict[str, Dict[str, Any]]]): configuration
            for the filter functions registered in FILE_REGISTRY.
            (default: :obj:`None`)
        stop_on_first_failure (bool): If True, stops checking filters
            after the first failure. (default: :obj:`False`)
    """

    def __init__(
        self,
        agent: ChatAgent,
        seed: str,
        num_machine_instructions: int = 5,
        data_output_path: Optional[str] = './data_output.json',
        human_to_machine_ratio: tuple = (6, 2),
        instruction_filter: Optional[InstructionFilter] = None,
        filter_config: Optional[Dict[str, Dict[str, Any]]] = None,
        stop_on_first_failure: bool = False,
    ):
        self.agent = agent
        self.num_machine_instructions = num_machine_instructions
        self.data_output_path = data_output_path
        self.human_to_machine_ratio = human_to_machine_ratio
        self.human_tasks: List[Dict] = []
        self.machine_tasks: List[Dict] = []
        self.load_seed(seed)

        if instruction_filter is not None:
            # A caller-supplied filter takes precedence over any config.
            self.instruction_filter = instruction_filter
        else:
            # Fall back to every built-in filter with its default settings
            # unless an explicit configuration was supplied.
            default_config: Dict[str, Dict[str, Any]] = {
                "length": {},
                "keyword": {},
                "punctuation": {},
                "non_english": {},
                "rouge_similarity": {},
            }
            config_to_use = (
                filter_config if filter_config is not None else default_config
            )
            self.instruction_filter = InstructionFilter(
                config_to_use, stop_on_first_failure
            )

    def load_seed(self, path: str) -> None:
        r"""Load seed tasks from a JSON Lines file into
        :obj:`self.human_tasks`.

        Every non-blank line of the file is parsed as one JSON document and
        appended to the list of human tasks.

        Args:
            path (str): Path to the seed file.

        Raises:
            FileNotFoundError: If the seed file does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Seed file not found at path: {path}")

        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.human_tasks.append(json.loads(line))

    def sample_human_tasks(self, count: int) -> List[dict]:
        r"""Sample a specified number of human tasks from the loaded seed.

        Args:
            count (int): Number of human tasks to sample.

        Returns:
            List[dict]: A list of sampled human tasks. Contains fewer than
                :obj:`count` items when fewer seed tasks are loaded.
        """
        return random.sample(
            self.human_tasks, min(count, len(self.human_tasks))
        )

    def sample_machine_tasks(self, count: int) -> List[dict]:
        r"""Sample a specified number of machine tasks.

        Args:
            count (int): Number of machine tasks to sample.

        Returns:
            List[dict]: A list of sampled machine tasks, with placeholders
                if insufficient tasks are available.
        """
        available_machine_tasks = len(self.machine_tasks)
        if available_machine_tasks < count:
            # Not enough generated tasks yet: use all of them and pad with
            # empty-instruction placeholders up to the requested count.
            sampled_tasks = self.machine_tasks.copy()
            placeholders_needed = count - available_machine_tasks
            sampled_tasks.extend(
                {'instruction': ""} for _ in range(placeholders_needed)
            )
            return sampled_tasks

        return random.sample(self.machine_tasks, count)

    def generate_machine_instruction(self) -> List:
        r"""Generate a machine instruction using the agent.

        Combines human and machine tasks based on the configured ratio to
        create a prompt for instruction generation.

        Returns:
            List: The prompt and a machine-generated instruction. The
                instruction is the first non-blank line of the agent's
                reply, or an empty string if the reply has no such line.
        """
        sampled_human_tasks = self.sample_human_tasks(
            self.human_to_machine_ratio[0]
        )
        sampled_machine_tasks = self.sample_machine_tasks(
            self.human_to_machine_ratio[1]
        )
        # Human tasks are numbered first, machine tasks continue the count.
        sampled_tasks = sampled_human_tasks + sampled_machine_tasks

        prompt_parts = ["Below are some tasks:\n\n"]
        prompt_parts.extend(
            f"Task {idx}: {task['instruction']}\n"
            for idx, task in enumerate(sampled_tasks, 1)
        )
        prompt_parts.append(f"Task {len(sampled_tasks) + 1}:")
        prompt_parts.append(
            "\nNow, please produce exactly one new task that fits the "
            "style of the ones above.\n Do not include any task numbering or "
            "labels like 'Task X:'. Just write the task itself.\n"
            "The task should be a single sentence.\n\n"
        )
        prompt = "".join(prompt_parts)

        response = self.agent.step(prompt)
        self.agent.reset()

        content = response.msgs[0].content if response.msgs else ""
        generated_tasks = [
            line.strip() for line in content.split("\n") if line.strip()
        ]
        # An empty or whitespace-only reply yields no candidate lines;
        # report that as an empty instruction instead of raising IndexError.
        instruction = generated_tasks[0] if generated_tasks else ""
        return [prompt, instruction]

    def identify_instruction(self, instruction: str) -> bool:
        r"""Determine if the given instruction is a classification task.

        Args:
            instruction (str): The instruction to classify.

        Returns:
            bool: True if the instruction is a classification task,
                otherwise False. Also False when the agent's reply cannot
                be parsed into the expected structured format.
        """
        clf_prompt = (
            SelfInstructTemplates.clf_template
            + f"Task: {instruction}\nIs it classification?"
            + "\nRespond in the following structured format:"
            "\n{\n \"answer\": true\n}\n"
            "or\n"
            "{\n \"answer\": false\n}\n"
        )
        response = self.agent.step(clf_prompt)
        self.agent.reset()
        try:
            structured_response = AgentResponse.parse_raw(
                response.msgs[0].content.strip()
            )
            return structured_response.answer
        except ValueError as e:
            # Unparseable replies are logged and treated as "not a
            # classification task".
            logger.error(f"Error parsing agent response: {e}")
            return False

    def generate_machine_instances(self):
        r"""Generate instances for each machine task based on its
        classification status.

        The generated instances are stored under the ``'instances'`` key of
        each task dictionary in :obj:`self.machine_tasks`.
        """
        total = len(self.machine_tasks)
        logger.info(
            f"Starting output generation: target {total} "
            f"instructions"
        )
        for attempt_count, instruction in enumerate(self.machine_tasks, 1):
            instruction['instances'] = self.generate_machine_instance(
                instruction['instruction'], instruction['is_classification']
            )
            logger.info(
                f"Attempt[Output]: Progress {attempt_count}/"
                f"{total} instructions"
            )

    def generate_machine_instance(
        self, instruction: str, classification: bool
    ) -> List[Dict[str, str]]:
        r"""Generate instances for a given instruction.

        Args:
            instruction (str): The instruction to create instances for.
            classification (bool): Whether the instruction is a
                classification task.

        Returns:
            List[Dict[str, str]]: A list of generated instances in
                input-output format.
        """
        if classification:
            prompt = (
                SelfInstructTemplates.output_first_template_for_clf.format(
                    instruction=instruction
                )
            )
        else:
            prompt = SelfInstructTemplates.input_first_template_for_gen.format(
                instruction=instruction
            )

        response = self.agent.step(prompt)
        self.agent.reset()
        generated_text = response.msgs[0].content.strip()

        if classification:
            return self.parse_classification_output(generated_text)
        return self.parse_non_classification_output(generated_text)

    def parse_classification_output(
        self, generated_text: str
    ) -> List[Dict[str, str]]:
        r"""Parse the generated text for classification tasks into
        input-output pairs.

        A line starting with ``"Class label:"`` opens a new instance whose
        output is the label; the following non-blank lines form its input.
        Text before the first label, and labels without any input, are
        dropped.

        Args:
            generated_text (str): The raw text generated by the agent for
                classification tasks.

        Returns:
            List[Dict[str, str]]: A list of dictionaries with 'input' and
                'output' keys.
        """
        label_prefix = "Class label:"
        instances: List[Dict[str, str]] = []
        current_label: Optional[str] = None
        current_input: Optional[str] = None

        def flush() -> None:
            # Emit the pending instance only if it has both a label and
            # some input text.
            if current_label and current_input:
                instances.append(
                    {
                        "input": current_input.strip(),
                        "output": current_label.strip(),
                    }
                )

        for line in generated_text.split("\n"):
            line = line.strip()
            if not line:
                continue

            if line.startswith(label_prefix):
                flush()
                current_label = line[len(label_prefix) :].strip()
                current_input = None
            elif current_input is None:
                current_input = line
            else:
                current_input += f"\n{line}"

        flush()
        return instances

    def parse_non_classification_output(
        self, generated_text: str
    ) -> List[Dict[str, str]]:
        r"""Parse the generated text for non-classification tasks into
        input-output pairs.

        Lines starting with ``"Example "`` delimit instances. Within an
        instance, everything before the ``"Output:"`` line is the input
        (with a leading ``"Input:"`` label removed) and everything from the
        ``"Output:"`` line up to the next ``"Example "`` line is the output.

        Args:
            generated_text (str): The raw text generated by the agent for
                non-classification tasks.

        Returns:
            List[Dict[str, str]]: A list of dictionaries with 'input' and
                'output' keys. If nothing could be parsed, a single
                placeholder instance is returned.
        """
        input_prefix = "Input:"
        output_prefix = "Output:"
        example_prefix = "Example "

        instances: List[Dict[str, str]] = []
        lines = generated_text.split("\n")
        prev = 0  # index of the first line of the pending input
        i = 0

        while i < len(lines):
            line = lines[i].strip()

            if line.startswith(example_prefix):
                prev = i + 1
            elif line.startswith(output_prefix):
                instance_input = '\n'.join(lines[prev:i]).strip()
                # Drop the label whether or not a space follows it, so an
                # "Input:" on its own line does not leak into the data.
                if instance_input.startswith(input_prefix):
                    instance_input = instance_input[
                        len(input_prefix) :
                    ].strip()

                instance_output = line[len(output_prefix) :].strip()

                # Consume continuation lines until the next example header.
                i += 1
                while i < len(lines) and not lines[i].strip().startswith(
                    example_prefix
                ):
                    instance_output += '\n' + lines[i].strip()
                    i += 1
                # Step back so the shared increment below lands on the
                # example header (or the end of the text).
                i -= 1

                instances.append(
                    {
                        "input": instance_input,
                        "output": instance_output.strip(),
                    }
                )
                prev = i + 1

            i += 1

        if not instances:
            instances.append({"input": "", "output": "No valid output found."})

        return instances

    def construct_data(self):
        r"""Save the machine-generated tasks to the specified output path in
        JSON format.

        Nothing is written when :obj:`self.data_output_path` is
        :obj:`None`.
        """
        if self.data_output_path is None:
            logger.warning(
                "No data output path configured. Skipping saving the "
                "generated data."
            )
            return

        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # must be opened as UTF-8 rather than with the locale encoding.
        with open(self.data_output_path, 'w', encoding='utf-8') as f:
            json.dump(self.machine_tasks, f, indent=4, ensure_ascii=False)

    def generate(self, timeout_minutes: int = 600) -> None:
        r"""Execute the entire pipeline to generate machine instructions
        and instances.

        Instructions are generated until the target count is reached or the
        timeout expires; instances are then generated for every accepted
        instruction and the result is saved.

        Args:
            timeout_minutes (int): Maximum time in minutes to run the
                generation process before timing out.
                (default: :obj:`600`)
        """
        start_time = time.time()
        timeout_seconds = timeout_minutes * 60
        logger.info(
            f"Starting instruction generation: target "
            f"{self.num_machine_instructions} instructions"
        )

        while len(self.machine_tasks) < self.num_machine_instructions:
            # Check for timeout
            elapsed = time.time() - start_time
            if elapsed > timeout_seconds:
                logger.info(
                    f"Generation timed out after {elapsed / 60:.1f} minutes. "
                    f"Generated {len(self.machine_tasks)}/"
                    f"{self.num_machine_instructions} instructions."
                )
                break

            prompt, instruction = self.generate_machine_instruction()
            if not instruction:
                # The agent produced no usable text; never store an empty
                # instruction, whatever filters are configured.
                logger.warning(
                    "Agent returned an empty instruction. Skipping."
                )
                continue

            # Similarity filters compare against everything known so far.
            existing_instructions = [
                t["instruction"] for t in self.human_tasks
            ] + [t["instruction"] for t in self.machine_tasks]
            for f in self.instruction_filter.filters:
                if isinstance(f, RougeSimilarityFilter):
                    f.existing_instructions = existing_instructions

            if self.instruction_filter.filter(prompt, instruction):
                instruction_dict = {
                    "id": f"machine_task_{len(self.machine_tasks) + 1}",
                    "instruction": instruction,
                    "is_classification": self.identify_instruction(
                        instruction
                    ),
                }
                self.machine_tasks.append(instruction_dict)
                logger.info(
                    f"Attempt[Instruction]: Progress "
                    f"{len(self.machine_tasks)}/"
                    f"{self.num_machine_instructions} "
                    f"instructions"
                )
            else:
                logger.warning(
                    f"Instruction failed filters. Skipping instruction: "
                    f"{instruction}"
                )

        self.generate_machine_instances()
        self.construct_data()
class AgentResponse(BaseModel):
    r"""Structured reply expected from the agent when it is asked whether a
    task is a classification task.

    Attributes:
        answer (bool): Whether the task is a classification task.
    """

    answer: bool = Field(
        ...,
        description="Indicates whether the task is "
        "classification (True/False).",
    )