# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import json
import os
import tempfile
from typing import Any, List, Optional
from camel.datahubs.base import BaseDatasetManager
from camel.datahubs.models import Record
from camel.logger import get_logger
from camel.types import HuggingFaceRepoType
from camel.utils import api_keys_required, dependencies_required
# Module-level logger, obtained from CAMEL's logging helper and named after
# this module.
logger = get_logger(__name__)
class HuggingFaceDatasetManager(BaseDatasetManager):
    r"""A dataset manager for Hugging Face datasets. This class provides
    methods to create, add, update, delete, and list records in a dataset on
    the Hugging Face Hub.

    Args:
        token (str): The Hugging Face API token. If not provided, the token
            will be read from the environment variable `HF_TOKEN`.
    """

    @api_keys_required(
        [
            ("token", "HF_TOKEN"),
        ]
    )
    @dependencies_required('huggingface_hub')
    def __init__(self, token: Optional[str] = None):
        # Imported lazily so the module can be imported without the optional
        # `huggingface_hub` dependency installed.
        from huggingface_hub import HfApi

        self._api_key = token or os.getenv("HF_TOKEN")
        self.api = HfApi(token=self._api_key)

    def create_dataset_card(
        self,
        dataset_name: str,
        description: str,
        license: Optional[str] = None,
        version: Optional[str] = None,
        tags: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        size_category: Optional[List[str]] = None,
        language: Optional[List[str]] = None,
        task_categories: Optional[List[str]] = None,
        content: Optional[str] = None,
    ) -> None:
        r"""Creates and uploads a dataset card to the Hugging Face Hub in YAML
        format. The card is uploaded as `README.md` in the dataset repo.

        Args:
            dataset_name (str): The name of the dataset.
            description (str): A description of the dataset.
            license (str): The license of the dataset. (default: :obj:`None`)
            version (str): The version of the dataset. (default: :obj:`None`)
            tags (list): A list of tags for the dataset.(default: :obj:`None`)
            authors (list): A list of authors of the dataset. (default:
                :obj:`None`)
            size_category (list): A size category for the dataset. (default:
                :obj:`None`)
            language (list): A list of languages the dataset is in. (default:
                :obj:`None`)
            task_categories (list): A list of task categories. (default:
                :obj:`None`)
            content (str): Custom markdown content that the user wants to add
                to the dataset card. (default: :obj:`None`)
        """
        import yaml

        metadata = {
            "license": license,
            "authors": authors,
            "task_categories": task_categories,
            "language": language,
            "tags": tags,
            "pretty_name": dataset_name,
            "size_categories": size_category,
            "version": version,
            "description": description,
        }
        # Drop every unset or empty field (None, "", []), not only None, so
        # the YAML front matter contains populated keys only.
        metadata = {k: v for k, v in metadata.items() if v}

        card_content = (
            "---\n"
            + yaml.dump(metadata, default_flow_style=False, allow_unicode=True)
            + "\n---"
        )
        if content:
            card_content += f"\n\n# Additional Information\n{content}\n"

        self._upload_file(
            file_content=card_content,
            dataset_name=dataset_name,
            filepath="README.md",
            file_type="md",
        )

    def create_dataset(
        self, name: str, private: bool = False, **kwargs: Any
    ) -> str:
        r"""Creates a new dataset on the Hugging Face Hub. If a dataset repo
        with this name already exists, nothing is created.

        Args:
            name (str): The name of the dataset.
            private (bool): Whether the dataset should be private. defaults to
                False.
            kwargs (Any): Additional keyword arguments. They are forwarded to
                the existence check (`repo_info`) only.

        Returns:
            str: The URL of the created dataset.
        """
        from huggingface_hub.errors import RepositoryNotFoundError

        try:
            # Probe for the repo first; only a "not found" answer triggers
            # creation, any other error propagates to the caller.
            self.api.repo_info(
                repo_id=name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
        except RepositoryNotFoundError:
            self.api.create_repo(
                repo_id=name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                private=private,
            )

        return f"https://huggingface.co/datasets/{name}"

    def list_datasets(
        self, username: str, limit: int = 100, **kwargs: Any
    ) -> List[str]:
        r"""Lists the datasets owned by the given user.

        Args:
            username (str): The username of the user whose datasets to list.
            limit (int): The maximum number of datasets to list.
                (default: :obj:`100`)
            kwargs (Any): Additional keyword arguments.

        Returns:
            List[str]: A list of dataset ids. An empty list is returned (and
                the error is logged) if the listing fails.
        """
        try:
            return [
                dataset.id
                for dataset in self.api.list_datasets(
                    author=username, limit=limit, **kwargs
                )
            ]
        except Exception as e:
            # Deliberately best-effort: report the failure and fall through
            # to an empty result instead of raising.
            logger.error(f"Error listing datasets: {e}")

        return []

    def delete_dataset(self, dataset_name: str, **kwargs: Any) -> None:
        r"""Deletes a dataset from the Hugging Face Hub.

        Args:
            dataset_name (str): The name of the dataset to delete.
            kwargs (Any): Additional keyword arguments.

        Raises:
            Exception: Any error raised by the deletion is logged and
                re-raised unchanged.
        """
        try:
            self.api.delete_repo(
                repo_id=dataset_name,
                repo_type=HuggingFaceRepoType.DATASET.value,
                **kwargs,
            )
            logger.info(f"Dataset '{dataset_name}' deleted successfully.")
        except Exception as e:
            logger.error(f"Error deleting dataset '{dataset_name}': {e}")
            raise

    def add_records(
        self,
        dataset_name: str,
        records: List[Record],
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Adds records to a dataset on the Hugging Face Hub.

        Args:
            dataset_name (str): The name of the dataset.
            records (List[Record]): A list of records to add to the dataset.
            filepath (str): The path to the file containing the records.
            kwargs (Any): Additional keyword arguments.

        Raises:
            ValueError: If the dataset already has a records file.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if existing_records:
            raise ValueError(
                f"Dataset '{filepath}' already exists. "
                f"Use `update_records` to modify."
            )

        self._upload_records(
            records=records,
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def update_records(
        self,
        dataset_name: str,
        records: List[Record],
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Updates records in a dataset on the Hugging Face Hub.

        Existing records are merged with the given ones by record id: a
        record whose id already exists is replaced, all others are appended.
        If the dataset has no existing records, a warning is logged and the
        given records are uploaded as-is.

        Args:
            dataset_name (str): The name of the dataset.
            records (List[Record]): A list of records to update in the dataset.
            filepath (str): The path to the file containing the records.
            kwargs (Any): Additional keyword arguments.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if not existing_records:
            logger.warning(
                f"Dataset '{dataset_name}' does not have existing "
                "records. Adding new records."
            )
            self._upload_records(
                records=records,
                dataset_name=dataset_name,
                filepath=filepath,
                **kwargs,
            )
            return

        # Later entries win, so incoming records override stored ones that
        # share the same id while the stored ordering is preserved.
        merged_dict = {record.id: record for record in existing_records}
        merged_dict.update({record.id: record for record in records})

        self._upload_records(
            records=list(merged_dict.values()),
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def delete_record(
        self,
        dataset_name: str,
        record_id: str,
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> None:
        r"""Deletes a record from the dataset.

        Args:
            dataset_name (str): The name of the dataset.
            record_id (str): The ID of the record to delete.
            filepath (str): The path to the file containing the records.
            kwargs (Any): Additional keyword arguments.

        Raises:
            ValueError: If the dataset does not have an existing file to delete
                records from.
        """
        existing_records = self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

        if not existing_records:
            raise ValueError(
                f"Dataset '{dataset_name}' does not have an existing file to "
                f"delete records from."
            )

        filtered_records = [
            record for record in existing_records if record.id != record_id
        ]

        self._upload_records(
            records=filtered_records,
            dataset_name=dataset_name,
            filepath=filepath,
            **kwargs,
        )

    def list_records(
        self,
        dataset_name: str,
        filepath: str = "records/records.json",
        **kwargs: Any,
    ) -> List[Record]:
        r"""Lists all records in a dataset.

        Args:
            dataset_name (str): The name of the dataset.
            filepath (str): The path to the file containing the records.
            kwargs (Any): Additional keyword arguments.

        Returns:
            List[Record]: A list of records in the dataset.
        """
        return self._download_records(
            dataset_name=dataset_name, filepath=filepath, **kwargs
        )

    def _download_records(
        self, dataset_name: str, filepath: str, **kwargs: Any
    ) -> List[Record]:
        r"""Downloads the records file of a dataset and parses it.

        Args:
            dataset_name (str): The name of the dataset.
            filepath (str): The path of the records file inside the repo.
            kwargs (Any): Additional keyword arguments forwarded to
                `hf_hub_download`.

        Returns:
            List[Record]: The parsed records, or an empty list if the file
                does not exist in the dataset repo.

        Raises:
            Exception: Any other download or parsing error is logged and
                re-raised unchanged.
        """
        from huggingface_hub import hf_hub_download
        from huggingface_hub.errors import EntryNotFoundError

        try:
            downloaded_file_path = hf_hub_download(
                repo_id=dataset_name,
                filename=filepath,
                repo_type=HuggingFaceRepoType.DATASET.value,
                token=self._api_key,
                **kwargs,
            )

            # The records file is written as UTF-8 with non-ASCII characters
            # kept verbatim, so it must not be decoded with the locale's
            # default encoding.
            with open(downloaded_file_path, "r", encoding="utf-8") as f:
                records_data = json.load(f)

            return [Record(**record) for record in records_data]
        except EntryNotFoundError:
            logger.info(f"No records found for dataset '{dataset_name}'.")
            return []
        except Exception as e:
            logger.error(f"Error downloading or processing records: {e}")
            raise

    def _upload_records(
        self,
        records: List[Record],
        dataset_name: str,
        filepath: str,
        **kwargs: Any,
    ) -> None:
        r"""Serialises records to JSON and uploads them to the dataset repo.

        Args:
            records (List[Record]): The records to upload.
            dataset_name (str): The name of the dataset.
            filepath (str): The destination path inside the repo.
            kwargs (Any): Additional keyword arguments forwarded to
                `upload_file`.

        Raises:
            Exception: Any upload error is logged and re-raised unchanged.
        """
        # Serialise up front so a record that cannot be dumped fails before
        # any temporary file has been created.
        payload = json.dumps(
            [record.model_dump(exclude_defaults=True) for record in records],
            ensure_ascii=False,
        )

        temp_file_path: Optional[str] = None
        try:
            # The file is created with delete=False and closed before the
            # upload so it can be passed to the API by path.
            with tempfile.NamedTemporaryFile(
                delete=False, mode="w", newline="", encoding="utf-8"
            ) as f:
                temp_file_path = f.name
                f.write(payload)

            try:
                self.api.upload_file(
                    path_or_fileobj=temp_file_path,
                    path_in_repo=filepath,
                    repo_id=dataset_name,
                    repo_type=HuggingFaceRepoType.DATASET.value,
                    **kwargs,
                )
            except Exception as e:
                logger.error(f"Error uploading records file: {e}")
                raise
        finally:
            # Always remove the temporary file, whether the write or the
            # upload succeeded or not.
            if temp_file_path is not None and os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    def _upload_file(
        self,
        file_content: str,
        dataset_name: str,
        filepath: str,
        file_type: str = "json",
        **kwargs: Any,
    ) -> None:
        r"""Uploads text content as a file to the dataset repo.

        Args:
            file_content (str): The content to upload. For `file_type`
                `"json"` this is either a JSON string or a JSON-serializable
                object; for `"md"` and `"txt"` it is written verbatim.
            dataset_name (str): The name of the dataset.
            filepath (str): The destination path inside the repo.
            file_type (str): One of `"json"`, `"md"` or `"txt"`.
                (default: :obj:`"json"`)
            kwargs (Any): Additional keyword arguments forwarded to
                `upload_file`.

        Raises:
            ValueError: If `file_type` is unsupported, or if `file_content`
                is not valid JSON / not JSON serializable for `"json"`.
            Exception: Any upload error is logged and re-raised unchanged.
        """
        # Validate and serialise before touching the filesystem so invalid
        # input can never leave a stray temporary file behind.
        if file_type == "json":
            if isinstance(file_content, str):
                try:
                    json_content = json.loads(file_content)
                except json.JSONDecodeError as e:
                    raise ValueError(
                        "Invalid JSON string provided for file_content."
                    ) from e
                payload = json.dumps(json_content, ensure_ascii=False)
            else:
                try:
                    payload = json.dumps(file_content, ensure_ascii=False)
                except (TypeError, ValueError) as e:
                    raise ValueError(
                        "file_content is not JSON serializable."
                    ) from e
        elif file_type == "md" or file_type == "txt":
            payload = file_content
        else:
            raise ValueError(f"Unsupported file type: {file_type}")

        temp_file_path: Optional[str] = None
        try:
            # Content may contain non-ASCII text (unicode YAML front matter,
            # ensure_ascii=False JSON), so the encoding is pinned to UTF-8
            # rather than left to the platform default.
            with tempfile.NamedTemporaryFile(
                mode="w",
                delete=False,
                suffix=f".{file_type}",
                encoding="utf-8",
            ) as f:
                temp_file_path = f.name
                f.write(payload)

            try:
                self.api.upload_file(
                    path_or_fileobj=temp_file_path,
                    path_in_repo=filepath,
                    repo_id=dataset_name,
                    repo_type=HuggingFaceRepoType.DATASET.value,
                    **kwargs,
                )
                logger.info(f"File uploaded successfully: {filepath}")
            except Exception as e:
                logger.error(f"Error uploading file: {e}")
                raise
        finally:
            # Always remove the temporary file, including when the upload
            # fails and the error is re-raised.
            if temp_file_path is not None and os.path.exists(temp_file_path):
                os.remove(temp_file_path)