Source code for camel.messages.conversion.alpaca
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from pydantic import BaseModel, Field, field_validator
[docs]
class AlpacaItem(BaseModel):
r"""Represents an instruction-response item in the Alpaca format.
Appropripate for both cases where input field is empty, or populated.
Provides parsing from string format using the class method from_string().
Args:
instruction (str): The instruction/question/prompt
input (str): Input context or examples (put empty string if none)
output (str): The response/answer to the instruction
"""
instruction: str = Field(description="The instruction/question/prompt")
input: str = Field(
description="Optional context or input for the task."
" For example, when the instruction is \"Summarize the "
"following article\", the input is the article."
)
output: str = Field(description="The response/answer to the instruction")
[docs]
@field_validator('instruction', 'output')
def no_section_markers(cls, value: str) -> str:
r"""Ensures fields don't contain section markers like '###
Response:'
"""
if (
'### Response' in value
or '### Instruction' in value
or '### Input' in value
):
raise ValueError("Field cannot contain section markers")
return value.strip()
[docs]
@classmethod
def from_string(cls, text: str) -> "AlpacaItem":
r"""Creates an AlpacaItem from a formatted string.
Args:
text: String in either of these formats:
With input:
### Instruction:
{instruction}
### Input:
{input}
### Response:
{response}
Without input:
### Instruction:
{instruction}
### Response:
{response}
Returns:
AlpacaItem: Parsed instance
Raises:
ValueError: text doesn't match expected format or sections missing
"""
# Strip and standardize newlines
text = text.strip().replace('\r\n', '\n')
# Try to extract sections using regex
instruction_match = re.search(
r'###\s*Instruction:\s*\n(.+?)(?=\n###|\Z)', text, re.DOTALL
)
input_match = re.search(
r'###\s*Input:\s*\n(.+?)(?=\n###|\Z)', text, re.DOTALL
)
response_match = re.search(
r'###\s*Response:\s*\n(.+?)(?=\n###|\Z)', text, re.DOTALL
)
if not instruction_match or not response_match:
raise ValueError(
"Text must contain '### Instruction:'"
" and '### Response:' sections"
)
return cls(
instruction=instruction_match.group(1).strip(),
input=input_match.group(1).strip() if input_match else "",
output=response_match.group(1).strip(),
)
[docs]
def to_string(self) -> str:
r"""Converts the AlpacaItem to its string representation.
Returns:
str: Formatted string representation with sections markers
"""
return "\n".join(
[
"### Instruction:",
self.instruction,
"",
"### Input:",
self.input,
"",
"### Response:",
self.output,
]
)