Source code for camel.prompts.generate_text_embedding_data

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any

from camel.prompts import TextPrompt, TextPromptDict
from camel.types import RoleType


# flake8: noqa :E501
class GenerateTextEmbeddingDataPromptTemplateDict(TextPromptDict):
    r"""Prompt templates for synthesising text-embedding training data.

    The templates follow the `"Improving Text Embeddings with Large
    Language Models" <https://arxiv.org/abs/2401.00368>`_ paper: one
    brainstorms retrieval tasks, the other asks for a query together with
    a positive and a hard negative document for a single task.

    On construction the instance maps the key ``"generate_tasks"`` to
    :obj:`GENERATE_TASKS` and the key :obj:`RoleType.ASSISTANT` to
    :obj:`ASSISTANT_PROMPT`.

    Attributes:
        GENERATE_TASKS (TextPrompt): Template asking for a list of
            :obj:`num_tasks` text retrieval tasks. Its only placeholder
            is :obj:`num_tasks`.
        ASSISTANT_PROMPT (TextPrompt): Template asking for one JSON
            object with the keys :obj:`user_query`,
            :obj:`positive_document` and :obj:`hard_negative_document`
            for a given :obj:`task`. Its placeholders are :obj:`task`,
            :obj:`query_type`, :obj:`query_length`, :obj:`clarity`,
            :obj:`num_words`, :obj:`language` and :obj:`difficulty`.
    """

    # NOTE(review): the prompt texts are reproduced character for
    # character as they appear in this file. They are assembled from
    # adjacent string literals purely to keep source lines short; the
    # resulting values are unchanged. If the upstream templates were laid
    # out one bullet per line, that layout is not recoverable from here
    # and has deliberately not been guessed - confirm against upstream.
    GENERATE_TASKS = TextPrompt(
        "You are an expert to brainstorm a list of {num_tasks} potentially "
        "useful text retrieval tasks "
        "Here are a few examples for your reference: "
        "- Provided a scientific claim as query, retrieve documents that "
        "help verify or refute the claim. "
        "- Search for documents that answers a FAQ-style query on "
        "children's nutrition. "
        "Please adhere to the following guidelines: "
        "- Specify what the query is, and what the desired documents are. "
        "- Each retrieval task should cover a wide range of queries, and "
        "should not be too specific. "
        "Your output should always be a python list of strings starting "
        "with `1.`, `2.` etc. "
        "And each element corresponds to a distinct retrieval task in one "
        "sentence. "
        "Do not explain yourself or output anything else. Be creative!"
    )

    ASSISTANT_PROMPT = TextPrompt(
        "You have been assigned a retrieval task: {task} "
        "Your mission is to write one text retrieval example for this task "
        "in JSON format. The JSON object must contain the following keys: "
        '- "user_query": a string, a random user search query specified by '
        "the retrieval task. "
        '- "positive_document": a string, a relevant document for the user '
        # The explicit "\n" mirrors the single line break present inside
        # the original literal at this position.
        "query. \n"
        '- "hard_negative_document": a string, a hard negative document '
        "that only appears relevant to the query. "
        "Please adhere to the following guidelines: "
        '- The "user_query" should be {query_type}, {query_length}, '
        "{clarity}, and diverse in topic. "
        "- All documents must be created independent of the query. Avoid "
        "copying the query verbatim. It's acceptable if some parts of the "
        '"positive_document" are not topically related to the query. '
        "- All documents should be at least {num_words} words long. "
        '- The "hard_negative_document" contains some useful information, '
        "but it should be less useful or comprehensive compared to the "
        '"positive_document". '
        "- Both the query and documents should be in {language}. "
        "- Do not provide any explanation in any document on why it is "
        "relevant or not relevant to the query. "
        "- Both the query and documents require {difficulty} level "
        "education to understand. "
        "Your output must always be a JSON object only (starting and ending "
        "with curly brackets), do not explain yourself or output anything "
        "else. Be creative!"
    )

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        r"""Initialise the dictionary and register both templates.

        Args:
            *args (Any): Positional arguments passed through unchanged to
                the parent initialiser.
            **kwargs (Any): Keyword arguments passed through unchanged to
                the parent initialiser.
        """
        super().__init__(*args, **kwargs)
        # Registration happens after the parent initialiser has run, so
        # these two entries replace any same-keyed entries it added.
        registered_prompts = {
            "generate_tasks": self.GENERATE_TASKS,
            RoleType.ASSISTANT: self.ASSISTANT_PROMPT,
        }
        self.update(registered_prompts)