# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Dict, List, Literal, Optional

from pydantic import BaseModel

from camel.embeddings.base import BaseEmbedding


class DeduplicationResult(BaseModel):
    r"""The result of deduplication.

    Attributes:
        original_texts (List[str]): The original texts.
        unique_ids (List[int]): A list of ids that are unique (not
            duplicates).
        unique_embeddings_dict (Dict[int, List[float]]): A mapping from the
            index of each unique text to its embedding.
        duplicate_to_target_map (Dict[int, int]): A mapping from the index of
            the duplicate text to the index of the text it is considered a
            duplicate of.
    """

    original_texts: List[str]
    unique_ids: List[int]
    unique_embeddings_dict: Dict[int, List[float]]
    duplicate_to_target_map: Dict[int, int]
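
# Illustrative only (hypothetical values): deduplicating three texts where
# the second repeats the first might yield a result shaped like this:
#
#     DeduplicationResult(
#         original_texts=["What is AI?", "What is AI?", "Bananas are yellow."],
#         unique_ids=[0, 2],
#         unique_embeddings_dict={0: [...], 2: [...]},
#         duplicate_to_target_map={1: 0},
#     )
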
def deduplicate_internally(
    texts: List[str],
    threshold: float = 0.65,
    embedding_instance: Optional[BaseEmbedding[str]] = None,
    embeddings: Optional[List[List[float]]] = None,
    strategy: Literal["top1", "llm-supervise"] = "top1",
    batch_size: int = 1000,
) -> DeduplicationResult:
r"""Deduplicate a list of strings based on their cosine similarity.
You can either:
1) Provide a CAMEL `BaseEmbedding` instance via `embedding_instance` to let
this function handle the embedding internally, OR
2) Directly pass a list of pre-computed embeddings to `embeddings`.
If both `embedding_instance` and `embeddings` are provided, the function
will raise a ValueError to avoid ambiguous usage.
strategy is used to specify different strategies, where 'top1' selects the
one with highest similarity, and 'llm-supervise' uses LLM to determine if
texts are duplicates (not yet implemented).
    Args:
        texts (List[str]): The list of texts to be deduplicated.
        threshold (float, optional): The similarity threshold for considering
            two texts as duplicates. (default: :obj:`0.65`)
        embedding_instance (Optional[BaseEmbedding[str]], optional):
            A CAMEL embedding instance for automatic embedding. (default:
            :obj:`None`)
        embeddings (Optional[List[List[float]]], optional):
            Pre-computed embeddings of `texts`. Each element in the list
            corresponds to the embedding of the text at the same index of
            `texts`. (default: :obj:`None`)
        strategy (Literal["top1", "llm-supervise"], optional):
            The strategy to use for deduplication. (default: :obj:`"top1"`)
        batch_size (int, optional): The size of the batch to use for
            calculating cosine similarities. (default: :obj:`1000`)

    Returns:
        DeduplicationResult: An object that contains:
            - `original_texts`: The original texts.
            - `unique_ids`: The unique ids after deduplication.
            - `unique_embeddings_dict`: A dict mapping from (unique) text id
              to its embedding.
            - `duplicate_to_target_map`: A dict mapping from the id of a
              duplicate text to the id of the text it is considered a
              duplicate of.

    Raises:
        NotImplementedError: If the strategy is "llm-supervise", which is
            not yet implemented.
        ValueError: If neither `embeddings` nor `embedding_instance` is
            provided, or if both are provided at the same time.
        ValueError: If the length of `embeddings` does not match the length
            of `texts`.
    Example:
        >>> from camel.embeddings.openai_embedding import OpenAIEmbedding
        >>> # Suppose we have 5 texts, some of which may be duplicates
        >>> texts = [
        ...     "What is AI?",
        ...     "Artificial Intelligence is about machines",
        ...     "What is AI?",
        ...     "Deep Learning is a subset of AI",
        ...     "What is artificial intelligence?",
        ... ]
        >>> # or any other BaseEmbedding instance
        >>> embedding_model = OpenAIEmbedding()
        >>> result = deduplicate_internally(
        ...     texts=texts,
        ...     threshold=0.7,
        ...     embedding_instance=embedding_model,
        ... )
        >>> print("Unique ids:")
        >>> for uid in result.unique_ids:
        ...     print(texts[uid])
        Unique ids:
        What is AI?
        Artificial Intelligence is about machines
        Deep Learning is a subset of AI
        What is artificial intelligence?

        >>> print("Duplicate map:")
        >>> print(result.duplicate_to_target_map)
        {2: 0}
        # This indicates the text at index 2 is considered
        # a duplicate of the text at index 0.
    """
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    if len(texts) == 0:
        return DeduplicationResult(
            original_texts=[],
            unique_ids=[],
            unique_embeddings_dict={},
            duplicate_to_target_map={},
        )

    if len(texts) == 1:
        return DeduplicationResult(
            original_texts=texts,
            unique_ids=[0],
            unique_embeddings_dict={
                0: embeddings[0]
                if embeddings
                else embedding_instance.embed_list(texts)[0]  # type: ignore[union-attr]
            },
            duplicate_to_target_map={},
        )
    if strategy == "llm-supervise":
        # TODO: Implement LLM-supervised deduplication.
        raise NotImplementedError(
            "LLM-supervised deduplication is not yet implemented."
        )

    # Check if the parameters are valid.
    if not 0 <= threshold <= 1:
        raise ValueError("Threshold must be between 0 and 1")

    if embedding_instance is None and embeddings is None:
        raise ValueError(
            "Either 'embedding_instance' or 'embeddings' must be provided."
        )

    if embedding_instance is not None and embeddings is not None:
        raise ValueError(
            "Cannot provide both 'embedding_instance' and 'embeddings'. "
            "Please choose only one way to supply embeddings."
        )

    if embedding_instance is not None:
        # Use CAMEL's embedding_instance to vectorize.
        embeddings = embedding_instance.embed_list(texts)
    else:
        # Use pre-supplied embeddings. An explicit None check (rather than
        # truthiness) ensures an empty list is also caught here.
        if embeddings is not None and len(embeddings) != len(texts):
            raise ValueError(
                "The length of 'embeddings' does not match the length "
                "of 'texts'."
            )
    # Convert embeddings to a numpy array for efficient computation.
    embeddings_array = np.array(embeddings)
    n = len(texts)
    duplicate_to_target_map: Dict[int, int] = {}

    # Process in batches to reduce memory usage: each batch of rows is only
    # compared against the texts that precede the end of the batch.
    for i in range(0, n, batch_size):
        batch_end = min(i + batch_size, n)

        # Cosine similarity between the current batch (rows i..batch_end-1)
        # and all texts up to batch_end (columns 0..batch_end-1).
        batch_similarities = cosine_similarity(
            embeddings_array[i:batch_end], embeddings_array[:batch_end]
        )

        # Keep only the strictly lower triangle in *global* indices, so each
        # text is compared just with earlier texts (no self-comparison, no
        # redundant checks). Row j corresponds to global index i + j, hence
        # the offset k=i-1; a fixed k=-1 would drop all comparisons against
        # earlier batches for every batch after the first.
        tril_mask = np.tril(np.ones_like(batch_similarities), k=i - 1)
        batch_similarities = batch_similarities * tril_mask

        # For each row, find the earlier text with the highest similarity
        # above the threshold, if any.
        masked_similarities = np.where(
            batch_similarities > threshold, batch_similarities, -1
        )
        max_indices = masked_similarities.argmax(axis=1)
        above_threshold = (
            batch_similarities[np.arange(batch_end - i), max_indices]
            > threshold
        )

        # Record each duplicate and the text it duplicates.
        for j, is_duplicate in enumerate(above_threshold):
            if is_duplicate:
                duplicate_to_target_map[i + j] = int(max_indices[j])
    # Collect the ids and embeddings of the texts that survived.
    unique_ids = []
    unique_embeddings_dict = {}

    assert embeddings is not None, "embeddings must be valid"
    for i, emb in enumerate(embeddings):
        if i not in duplicate_to_target_map:
            unique_ids.append(i)
            unique_embeddings_dict[i] = emb

    return DeduplicationResult(
        original_texts=texts,
        unique_ids=unique_ids,
        unique_embeddings_dict=unique_embeddings_dict,
        duplicate_to_target_map=duplicate_to_target_map,
    )
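

if __name__ == "__main__":
    # Minimal sketch of the pre-computed-embeddings path (no embedding model
    # or API key required). The vectors below are hypothetical 3-dimensional
    # embeddings invented purely for illustration; real embeddings would come
    # from a `BaseEmbedding` such as `OpenAIEmbedding`.
    demo_texts = ["What is AI?", "what is ai?", "Bananas are yellow."]
    demo_embeddings = [
        [1.0, 0.0, 0.0],  # "What is AI?"
        [0.99, 0.1, 0.0],  # near-parallel to the first vector
        [0.0, 0.0, 1.0],  # orthogonal, so clearly not a duplicate
    ]
    demo_result = deduplicate_internally(
        texts=demo_texts,
        threshold=0.9,
        embeddings=demo_embeddings,
    )
    print(demo_result.unique_ids)  # [0, 2]
    print(demo_result.duplicate_to_target_map)  # {1: 0}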