Source code for camel.datahubs.huggingface

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import json
import os
import tempfile
from typing import Any, List, Optional

from camel.datahubs.base import BaseDatasetManager
from camel.datahubs.models import Record
from camel.logger import get_logger
from camel.types import HuggingFaceRepoType
from camel.utils import api_keys_required, dependencies_required

logger = get_logger(__name__)


[docs] class HuggingFaceDatasetManager(BaseDatasetManager): r"""A dataset manager for Hugging Face datasets. This class provides methods to create, add, update, delete, and list records in a dataset on the Hugging Face Hub. Args: token (str): The Hugging Face API token. If not provided, the token will be read from the environment variable `HF_TOKEN`. """ @api_keys_required( [ ("token", "HF_TOKEN"), ] ) @dependencies_required('huggingface_hub') def __init__(self, token: Optional[str] = None): from huggingface_hub import HfApi self._api_key = token or os.getenv("HF_TOKEN") self.api = HfApi(token=self._api_key)
[docs] def create_dataset_card( self, dataset_name: str, description: str, license: Optional[str] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, authors: Optional[List[str]] = None, size_category: Optional[List[str]] = None, language: Optional[List[str]] = None, task_categories: Optional[List[str]] = None, content: Optional[str] = None, ) -> None: r"""Creates and uploads a dataset card to the Hugging Face Hub in YAML format. Args: dataset_name (str): The name of the dataset. description (str): A description of the dataset. license (str): The license of the dataset. (default: :obj:`None`) version (str): The version of the dataset. (default: :obj:`None`) tags (list): A list of tags for the dataset.(default: :obj:`None`) authors (list): A list of authors of the dataset. (default: :obj:`None`) size_category (list): A size category for the dataset. (default: :obj:`None`) language (list): A list of languages the dataset is in. (default: :obj:`None`) task_categories (list): A list of task categories. (default: :obj:`None`) content (str): Custom markdown content that the user wants to add to the dataset card. (default: :obj:`None`) """ import yaml metadata = { "license": license, "authors": authors, "task_categories": task_categories, "language": language, "tags": tags, "pretty_name": dataset_name, "size_categories": size_category, "version": version, "description": description, } # Remove keys with None values metadata = {k: v for k, v in metadata.items() if v} card_content = ( "---\n" + yaml.dump(metadata, default_flow_style=False, allow_unicode=True) + "\n---" ) if content: card_content += f"\n\n# Additional Information\n{content}\n" self._upload_file( file_content=card_content, dataset_name=dataset_name, filepath="README.md", file_type="md", )
[docs] def create_dataset( self, name: str, private: bool = False, **kwargs: Any ) -> str: r"""Creates a new dataset on the Hugging Face Hub. Args: name (str): The name of the dataset. private (bool): Whether the dataset should be private. defaults to False. kwargs (Any): Additional keyword arguments. Returns: str: The URL of the created dataset. """ from huggingface_hub.errors import RepositoryNotFoundError try: self.api.repo_info( repo_id=name, repo_type=HuggingFaceRepoType.DATASET.value, **kwargs, ) except RepositoryNotFoundError: self.api.create_repo( repo_id=name, repo_type=HuggingFaceRepoType.DATASET.value, private=private, ) return f"https://huggingface.co/datasets/{name}"
[docs] def list_datasets( self, username: str, limit: int = 100, **kwargs: Any ) -> List[str]: r"""Lists all datasets for the current user. Args: username (str): The username of the user whose datasets to list. limit (int): The maximum number of datasets to list. (default: :obj:`100`) kwargs (Any): Additional keyword arguments. Returns: List[str]: A list of dataset ids. """ try: return [ dataset.id for dataset in self.api.list_datasets( author=username, limit=limit, **kwargs ) ] except Exception as e: logger.error(f"Error listing datasets: {e}") return []
[docs] def delete_dataset(self, dataset_name: str, **kwargs: Any) -> None: r"""Deletes a dataset from the Hugging Face Hub. Args: dataset_name (str): The name of the dataset to delete. kwargs (Any): Additional keyword arguments. """ try: self.api.delete_repo( repo_id=dataset_name, repo_type=HuggingFaceRepoType.DATASET.value, **kwargs, ) logger.info(f"Dataset '{dataset_name}' deleted successfully.") except Exception as e: logger.error(f"Error deleting dataset '{dataset_name}': {e}") raise
[docs] def add_records( self, dataset_name: str, records: List[Record], filepath: str = "records/records.json", **kwargs: Any, ) -> None: r"""Adds records to a dataset on the Hugging Face Hub. Args: dataset_name (str): The name of the dataset. records (List[Record]): A list of records to add to the dataset. filepath (str): The path to the file containing the records. kwargs (Any): Additional keyword arguments. Raises: ValueError: If the dataset already has a records file. """ existing_records = self._download_records( dataset_name=dataset_name, filepath=filepath, **kwargs ) if existing_records: raise ValueError( f"Dataset '{filepath}' already exists. " f"Use `update_records` to modify." ) self._upload_records( records=records, dataset_name=dataset_name, filepath=filepath, **kwargs, )
[docs] def update_records( self, dataset_name: str, records: List[Record], filepath: str = "records/records.json", **kwargs: Any, ) -> None: r"""Updates records in a dataset on the Hugging Face Hub. Args: dataset_name (str): The name of the dataset. records (List[Record]): A list of records to update in the dataset. filepath (str): The path to the file containing the records. kwargs (Any): Additional keyword arguments. Raises: ValueError: If the dataset does not have an existing file to update records in. """ existing_records = self._download_records( dataset_name=dataset_name, filepath=filepath, **kwargs ) if not existing_records: logger.warning( f"Dataset '{dataset_name}' does not have existing " "records. Adding new records." ) self._upload_records( records=records, dataset_name=dataset_name, filepath=filepath, **kwargs, ) return old_dict = {record.id: record for record in existing_records} new_dict = {record.id: record for record in records} merged_dict = old_dict.copy() merged_dict.update(new_dict) self._upload_records( records=list(merged_dict.values()), dataset_name=dataset_name, filepath=filepath, **kwargs, )
[docs] def delete_record( self, dataset_name: str, record_id: str, filepath: str = "records/records.json", **kwargs: Any, ) -> None: r"""Deletes a record from the dataset. Args: dataset_name (str): The name of the dataset. record_id (str): The ID of the record to delete. filepath (str): The path to the file containing the records. kwargs (Any): Additional keyword arguments. Raises: ValueError: If the dataset does not have an existing file to delete records from. """ existing_records = self._download_records( dataset_name=dataset_name, filepath=filepath, **kwargs ) if not existing_records: raise ValueError( f"Dataset '{dataset_name}' does not have an existing file to " f"delete records from." ) filtered_records = [ record for record in existing_records if record.id != record_id ] self._upload_records( records=filtered_records, dataset_name=dataset_name, filepath=filepath, **kwargs, )
[docs] def list_records( self, dataset_name: str, filepath: str = "records/records.json", **kwargs: Any, ) -> List[Record]: r"""Lists all records in a dataset. Args: dataset_name (str): The name of the dataset. filepath (str): The path to the file containing the records. kwargs (Any): Additional keyword arguments. Returns: List[Record]: A list of records in the dataset. """ return self._download_records( dataset_name=dataset_name, filepath=filepath, **kwargs )
def _download_records( self, dataset_name: str, filepath: str, **kwargs: Any ) -> List[Record]: from huggingface_hub import hf_hub_download from huggingface_hub.errors import EntryNotFoundError try: downloaded_file_path = hf_hub_download( repo_id=dataset_name, filename=filepath, repo_type=HuggingFaceRepoType.DATASET.value, token=self._api_key, **kwargs, ) with open(downloaded_file_path, "r") as f: records_data = json.load(f) return [Record(**record) for record in records_data] except EntryNotFoundError: logger.info(f"No records found for dataset '{dataset_name}'.") return [] except Exception as e: logger.error(f"Error downloading or processing records: {e}") raise e def _upload_records( self, records: List[Record], dataset_name: str, filepath: str, **kwargs: Any, ): with tempfile.NamedTemporaryFile( delete=False, mode="w", newline="", encoding="utf-8" ) as f: json.dump( [ record.model_dump(exclude_defaults=True) for record in records ], f, ensure_ascii=False, ) temp_file_path = f.name try: self.api.upload_file( path_or_fileobj=temp_file_path, path_in_repo=filepath, repo_id=dataset_name, repo_type=HuggingFaceRepoType.DATASET.value, **kwargs, ) except Exception as e: logger.error(f"Error uploading records file: {e}") raise finally: if os.path.exists(temp_file_path): os.remove(temp_file_path) def _upload_file( self, file_content: str, dataset_name: str, filepath: str, file_type: str = "json", **kwargs: Any, ): with tempfile.NamedTemporaryFile( mode="w", delete=False, suffix=f".{file_type}" ) as f: if file_type == "json": if isinstance(file_content, str): try: json_content = json.loads(file_content) except json.JSONDecodeError: raise ValueError( "Invalid JSON string provided for file_content." ) else: try: json.dumps(file_content, ensure_ascii=False) json_content = file_content except (TypeError, ValueError): raise ValueError( "file_content is not JSON serializable." ) json.dump(json_content, f, ensure_ascii=False) elif file_type == "md" or file_type == "txt": f.write(file_content) else: raise ValueError(f"Unsupported file type: {file_type}") temp_file_path = f.name try: self.api.upload_file( path_or_fileobj=temp_file_path, path_in_repo=filepath, repo_id=dataset_name, repo_type=HuggingFaceRepoType.DATASET.value, **kwargs, ) logger.info(f"File uploaded successfully: {filepath}") except Exception as e: logger.error(f"Error uploading file: {e}") raise if os.path.exists(temp_file_path): os.remove(temp_file_path)