Source code for camel.models.fish_audio_model

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from typing import Any, Optional


class FishAudioModel:
    r"""Provides access to FishAudio's Text-to-Speech (TTS) and
    Speech-to-Text (STT) models.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ) -> None:
        r"""Initialize an instance of FishAudioModel.

        Args:
            api_key (Optional[str]): API key for FishAudio service. If not
                provided, the environment variable `FISHAUDIO_API_KEY` will
                be used. (default: :obj:`None`)
            url (Optional[str]): Base URL for FishAudio API. If not
                provided, the environment variable
                `FISHAUDIO_API_BASE_URL` will be used, falling back to
                `https://api.fish.audio` when that is unset as well.
                (default: :obj:`None`)
        """
        # Imported lazily so the SDK is only needed once a model is built.
        from fish_audio_sdk import Session

        self._api_key = api_key or os.environ.get("FISHAUDIO_API_KEY")
        self._url = url or os.environ.get(
            "FISHAUDIO_API_BASE_URL", "https://api.fish.audio"
        )
        self.session = Session(apikey=self._api_key, base_url=self._url)

    def text_to_speech(
        self,
        input: str,
        storage_path: str,
        reference_id: Optional[str] = None,
        reference_audio: Optional[str] = None,
        reference_audio_text: Optional[str] = None,
        **kwargs: Any,
    ) -> Any:
        r"""Convert text to speech and save the output to a file.

        Args:
            input (str): The text to convert to speech.
            storage_path (str): The file path where the resulting speech
                will be saved. Missing parent directories are created.
            reference_id (Optional[str]): An optional reference ID to
                associate with the request. It is only sent when
                `reference_audio` is not given. (default: :obj:`None`)
            reference_audio (Optional[str]): Path to an audio file for
                reference speech. (default: :obj:`None`)
            reference_audio_text (Optional[str]): Text for the reference
                audio. Required when `reference_audio` is given.
                (default: :obj:`None`)
            **kwargs (Any): Additional parameters to pass to the TTS
                request.

        Returns:
            None: The audio is written to `storage_path`.

        Raises:
            FileNotFoundError: If the reference audio file cannot be found.
            ValueError: If `reference_audio` is given without
                `reference_audio_text`.
        """
        # Validate the arguments first so a rejected call neither creates
        # the output directory nor truncates an existing output file.
        if reference_audio:
            if not os.path.exists(reference_audio):
                raise FileNotFoundError(
                    f"Reference audio file not found: {reference_audio}"
                )
            if not reference_audio_text:
                raise ValueError("reference_audio_text should be provided")

        from fish_audio_sdk import ReferenceAudio, TTSRequest

        if reference_audio:
            with open(reference_audio, "rb") as audio_file:
                reference_bytes = audio_file.read()
            request = TTSRequest(
                text=input,
                references=[
                    ReferenceAudio(
                        audio=reference_bytes,
                        text=reference_audio_text,
                    )
                ],
                **kwargs,
            )
        else:
            request = TTSRequest(
                reference_id=reference_id, text=input, **kwargs
            )

        directory = os.path.dirname(storage_path)
        if directory:
            # exist_ok avoids the check-then-create race on the directory.
            os.makedirs(directory, exist_ok=True)

        with open(storage_path, "wb") as f:
            for chunk in self.session.tts(request):
                f.write(chunk)

    def speech_to_text(
        self,
        audio_file_path: str,
        language: Optional[str] = None,
        ignore_timestamps: Optional[bool] = None,
        **kwargs: Any,
    ) -> str:
        r"""Convert speech to text from an audio file.

        Args:
            audio_file_path (str): The path to the audio file to transcribe.
            language (Optional[str]): The language of the audio.
                (default: :obj:`None`)
            ignore_timestamps (Optional[bool]): Whether to ignore
                timestamps. (default: :obj:`None`)
            **kwargs (Any): Additional parameters to pass to the STT
                request.

        Returns:
            str: The transcribed text from the audio.

        Raises:
            FileNotFoundError: If the audio file cannot be found.
        """
        # Check the path before importing the SDK so that input errors are
        # reported first.
        if not os.path.exists(audio_file_path):
            raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

        from fish_audio_sdk import ASRRequest

        with open(audio_file_path, "rb") as audio_file:
            audio_data = audio_file.read()

        response = self.session.asr(
            ASRRequest(
                audio=audio_data,
                language=language,
                ignore_timestamps=ignore_timestamps,
                **kwargs,
            )
        )
        return response.text