HuggingFaceDatasetManager

class HuggingFaceDatasetManager(BaseDatasetManager):

A dataset manager for Hugging Face datasets. This class provides methods to create, add, update, delete, and list records in a dataset on the Hugging Face Hub.

Parameters:

  • token (str): The Hugging Face API token. If not provided, the token will be read from the environment variable HF_TOKEN. (default: :obj:None)

__init__

def __init__(self, token: Optional[str] = None):

create_dataset_card

def create_dataset_card(
    self,
    dataset_name: str,
    description: str,
    license: Optional[str] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    authors: Optional[List[str]] = None,
    size_category: Optional[List[str]] = None,
    language: Optional[List[str]] = None,
    task_categories: Optional[List[str]] = None,
    content: Optional[str] = None
):

Creates and uploads a dataset card to the Hugging Face Hub in YAML format.

Parameters:

  • dataset_name (str): The name of the dataset.
  • description (str): A description of the dataset.
  • license (str): The license of the dataset. (default: :obj:None)
  • version (str): The version of the dataset. (default: :obj:None)
  • tags (list): A list of tags for the dataset. (default: :obj:None)
  • authors (list): A list of authors of the dataset. (default: :obj:None)
  • size_category (list): A list of size categories for the dataset. (default: :obj:None)
  • language (list): A list of languages the dataset is in. (default: :obj:None)
  • task_categories (list): A list of task categories. (default: :obj:None)
  • content (str): Custom markdown content that the user wants to add to the dataset card. (default: :obj:None)

create_dataset

def create_dataset(
    self,
    name: str,
    private: bool = False,
    **kwargs: Any
):

Creates a new dataset on the Hugging Face Hub.

Parameters:

  • name (str): The name of the dataset.
  • private (bool): Whether the dataset should be private. (default: :obj:False)
  • kwargs (Any): Additional keyword arguments.

Returns:

str: The URL of the created dataset.

list_datasets

def list_datasets(
    self,
    username: str,
    limit: int = 100,
    **kwargs: Any
):

Lists the datasets belonging to the specified user, returning at most limit datasets.

Parameters:

  • username (str): The username of the user whose datasets to list.
  • limit (int): The maximum number of datasets to list. (default: :obj:100)
  • kwargs (Any): Additional keyword arguments.

Returns:

List[str]: A list of dataset ids.

delete_dataset

def delete_dataset(self, dataset_name: str, **kwargs: Any):

Deletes a dataset from the Hugging Face Hub.

Parameters:

  • dataset_name (str): The name of the dataset to delete.
  • kwargs (Any): Additional keyword arguments.

add_records

def add_records(
    self,
    dataset_name: str,
    records: List[Record],
    filepath: str = 'records/records.json',
    **kwargs: Any
):

Adds records to a dataset on the Hugging Face Hub.

Parameters:

  • dataset_name (str): The name of the dataset.
  • records (List[Record]): A list of records to add to the dataset.
  • filepath (str): The path to the file containing the records. (default: :obj:'records/records.json')
  • kwargs (Any): Additional keyword arguments.

update_records

def update_records(
    self,
    dataset_name: str,
    records: List[Record],
    filepath: str = 'records/records.json',
    **kwargs: Any
):

Updates records in a dataset on the Hugging Face Hub.

Parameters:

  • dataset_name (str): The name of the dataset.
  • records (List[Record]): A list of records to update in the dataset.
  • filepath (str): The path to the file containing the records. (default: :obj:'records/records.json')
  • kwargs (Any): Additional keyword arguments.

delete_record

def delete_record(
    self,
    dataset_name: str,
    record_id: str,
    filepath: str = 'records/records.json',
    **kwargs: Any
):

Deletes a record from the dataset.

Parameters:

  • dataset_name (str): The name of the dataset.
  • record_id (str): The ID of the record to delete.
  • filepath (str): The path to the file containing the records. (default: :obj:'records/records.json')
  • kwargs (Any): Additional keyword arguments.

list_records

def list_records(
    self,
    dataset_name: str,
    filepath: str = 'records/records.json',
    **kwargs: Any
):

Lists all records in a dataset.

Parameters:

  • dataset_name (str): The name of the dataset.
  • filepath (str): The path to the file containing the records. (default: :obj:'records/records.json')
  • kwargs (Any): Additional keyword arguments.

Returns:

List[Record]: A list of records in the dataset.

_download_records

def _download_records(
    self,
    dataset_name: str,
    filepath: str,
    **kwargs: Any
):

_upload_records

def _upload_records(
    self,
    records: List[Record],
    dataset_name: str,
    filepath: str,
    **kwargs: Any
):

_upload_file

def _upload_file(
    self,
    file_content: str,
    dataset_name: str,
    filepath: str,
    file_type: str = 'json',
    **kwargs: Any
):