# Source code for camel.messages.conversion.alpaca

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import re

from pydantic import BaseModel, Field, field_validator


class AlpacaItem(BaseModel):
    r"""Represents an instruction-response item in the Alpaca format.

    Appropriate for both cases where input field is empty, or populated.
    Provides parsing from string format using the class method
    from_string(), and serialization back to text with to_string().

    Args:
        instruction (str): The instruction/question/prompt
        input (str): Input context or examples (put empty string if none)
        output (str): The response/answer to the instruction
    """

    instruction: str = Field(description="The instruction/question/prompt")
    input: str = Field(
        description="Optional context or input for the task."
        " For example, when the instruction is \"Summarize the "
        "following article\", the input is the article."
    )
    output: str = Field(description="The response/answer to the instruction")

    @field_validator('instruction', 'output')
    @classmethod
    def no_section_markers(cls, value: str) -> str:
        r"""Ensures fields don't contain section markers like
        '### Response:'.

        Applied to ``instruction`` and ``output`` only; ``input`` is not
        checked by this validator.

        Args:
            value (str): The raw field value.

        Returns:
            str: The value with leading/trailing whitespace removed.

        Raises:
            ValueError: If the value contains '### Response',
                '### Instruction' or '### Input'.
        """
        markers = ('### Response', '### Instruction', '### Input')
        if any(marker in value for marker in markers):
            raise ValueError("Field cannot contain section markers")
        return value.strip()

    @classmethod
    def from_string(cls, text: str) -> "AlpacaItem":
        r"""Creates an AlpacaItem from a formatted string.

        Args:
            text: String in either of these formats:

                With input:
                ### Instruction:
                {instruction}
                ### Input:
                {input}
                ### Response:
                {response}

                Without input:
                ### Instruction:
                {instruction}
                ### Response:
                {response}

                An ``### Input:`` section that is present but blank (as
                produced by to_string() for an empty input) is parsed as
                an empty input.

        Returns:
            AlpacaItem: Parsed instance

        Raises:
            ValueError: text doesn't match expected format, or the
                instruction/response sections are missing or empty
        """
        # Strip and standardize newlines
        text = text.strip().replace('\r\n', '\n')

        def _section(name: str):
            # Return the stripped body of section ``name``, or None when
            # the header is absent.
            #
            # Only horizontal whitespace may follow the header before its
            # newline, and the body may be empty. Matching `\s*\n(.+?)`
            # here would let the blank lines of an empty section be
            # swallowed, so the capture would start at the *next* '###'
            # header and return that section's text instead.
            match = re.search(
                rf'###\s*{name}:[^\S\n]*\n(.*?)(?=\n###|\Z)',
                text,
                re.DOTALL,
            )
            return None if match is None else match.group(1).strip()

        instruction = _section('Instruction')
        input_text = _section('Input')
        response = _section('Response')

        if instruction is None or response is None:
            raise ValueError(
                "Text must contain '### Instruction:'"
                " and '### Response:' sections"
            )
        if not instruction or not response:
            # An empty mandatory section is treated like a missing one.
            raise ValueError(
                "'### Instruction:' and '### Response:' sections"
                " must not be empty"
            )

        return cls(
            instruction=instruction,
            # The input section is optional; absent or blank means "".
            input=input_text or "",
            output=response,
        )

    def to_string(self) -> str:
        r"""Converts the AlpacaItem to its string representation.

        The '### Input:' section is always emitted, even when ``input``
        is an empty string.

        Returns:
            str: Formatted string representation with sections markers
        """
        return "\n".join(
            [
                "### Instruction:",
                self.instruction,
                "",
                "### Input:",
                self.input,
                "",
                "### Response:",
                self.output,
            ]
        )