# Source code for camel.models.fish_audio_model
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
from typing import Any, Optional
[docs]
class FishAudioModel:
    r"""Provides access to FishAudio's Text-to-Speech (TTS) and Speech-to-Text
    (STT) models.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ) -> None:
        r"""Initialize an instance of FishAudioModel.

        Args:
            api_key (Optional[str]): API key for FishAudio service. If not
                provided, the environment variable `FISHAUDIO_API_KEY` will be
                used. (default: :obj:`None`)
            url (Optional[str]): Base URL for FishAudio API. If not provided,
                the environment variable `FISHAUDIO_API_BASE_URL` will be
                used, falling back to `https://api.fish.audio`.
                (default: :obj:`None`)
        """
        # Imported lazily so the SDK is only needed once a model is created.
        from fish_audio_sdk import Session

        self._api_key = api_key or os.environ.get("FISHAUDIO_API_KEY")
        self._url = url or os.environ.get(
            "FISHAUDIO_API_BASE_URL", "https://api.fish.audio"
        )
        self.session = Session(apikey=self._api_key, base_url=self._url)

    def text_to_speech(
        self,
        input: str,
        storage_path: str,
        reference_id: Optional[str] = None,
        reference_audio: Optional[str] = None,
        reference_audio_text: Optional[str] = None,
        **kwargs: Any,
    ) -> Any:
        r"""Convert text to speech and save the output to a file.

        The audio chunks produced by the session are written to
        `storage_path` in order; nothing is returned.

        Args:
            input (str): The text to convert to speech.
            storage_path (str): The file path where the resulting speech will
                be saved. Missing parent directories are created.
            reference_id (Optional[str]): An optional reference ID to
                associate with the request. Only sent when no
                `reference_audio` is given. (default: :obj:`None`)
            reference_audio (Optional[str]): Path to an audio file for
                reference speech. (default: :obj:`None`)
            reference_audio_text (Optional[str]): Text for the reference audio.
                Required when `reference_audio` is given.
                (default: :obj:`None`)
            **kwargs (Any): Additional parameters to pass to the TTS request.

        Raises:
            FileNotFoundError: If the reference audio file cannot be found.
            ValueError: If `reference_audio` is given without
                `reference_audio_text`.
        """
        # Validate the arguments before touching the filesystem so that a
        # rejected call does not leave an empty output directory behind.
        if reference_audio:
            if not os.path.exists(reference_audio):
                raise FileNotFoundError(
                    f"Reference audio file not found: {reference_audio}"
                )
            if not reference_audio_text:
                raise ValueError("reference_audio_text should be provided")

        # Imported after validation so argument errors surface first.
        from fish_audio_sdk import ReferenceAudio, TTSRequest

        if reference_audio:
            # Read the sample up front so its handle is closed before the
            # request starts streaming.
            with open(reference_audio, "rb") as audio_file:
                reference_bytes = audio_file.read()
            request = TTSRequest(
                text=input,
                references=[
                    ReferenceAudio(
                        audio=reference_bytes,
                        text=reference_audio_text,
                    )
                ],
                **kwargs,
            )
        else:
            request = TTSRequest(
                reference_id=reference_id, text=input, **kwargs
            )

        directory = os.path.dirname(storage_path)
        if directory:
            # exist_ok avoids a race between an existence check and creation.
            os.makedirs(directory, exist_ok=True)

        with open(storage_path, "wb") as f:
            for chunk in self.session.tts(request):
                f.write(chunk)

    def speech_to_text(
        self,
        audio_file_path: str,
        language: Optional[str] = None,
        ignore_timestamps: Optional[bool] = None,
        **kwargs: Any,
    ) -> str:
        r"""Convert speech to text from an audio file.

        Args:
            audio_file_path (str): The path to the audio file to transcribe.
            language (Optional[str]): The language of the audio. (default:
                :obj:`None`)
            ignore_timestamps (Optional[bool]): Whether to ignore timestamps.
                (default: :obj:`None`)
            **kwargs (Any): Additional parameters to pass to the STT request.

        Returns:
            str: The transcribed text from the audio.

        Raises:
            FileNotFoundError: If the audio file cannot be found.
        """
        if not os.path.exists(audio_file_path):
            raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

        # Imported after validation so a bad path is reported first.
        from fish_audio_sdk import ASRRequest

        with open(audio_file_path, "rb") as audio_file:
            audio_data = audio_file.read()

        response = self.session.asr(
            ASRRequest(
                audio=audio_data,
                language=language,
                ignore_timestamps=ignore_timestamps,
                **kwargs,
            )
        )
        return response.text