# Source code for camel.toolkits.arxiv_toolkit
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
import re
from typing import Dict, Generator, List, Optional

from camel.logger import get_logger
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
from camel.utils import dependencies_required
# Module-level logger; used below to report PDF text-extraction failures.
logger = get_logger(__name__)
# [docs]
class ArxivToolkit(BaseToolkit):
    r"""A toolkit for interacting with the arXiv API to search and download
    academic papers.
    """

    # Path separators, characters that are illegal in file names on common
    # filesystems, and ASCII control characters.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # Cap on the title part of a generated file name so that the name plus
    # the ".pdf" suffix stays below typical 255-character limits.
    _MAX_FILENAME_STEM = 200

    @dependencies_required('arxiv')
    def __init__(self) -> None:
        r"""Initializes the ArxivToolkit and sets up the arXiv client."""
        import arxiv

        self.client = arxiv.Client()

    def _get_search_results(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
    ) -> Generator:
        r"""Retrieves search results from the arXiv API based on the provided
        query and optional paper IDs.

        Args:
            query (str): The search query string used to search for papers on
                arXiv.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to search for. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to retrieve. (default: :obj: `5`)

        Returns:
            Generator: A generator that yields results from the arXiv search
                query, which includes metadata about each paper matching the
                query.
        """
        import arxiv

        # `None` means "no ID filter"; the search is given an empty list.
        paper_ids = paper_ids or []
        search_query = arxiv.Search(
            query=query,
            id_list=paper_ids,
            max_results=max_results,
        )
        return self.client.results(search_query)

    def _safe_pdf_filename(self, title: str) -> str:
        r"""Builds a file name for a paper PDF from the paper title.

        Whitespace runs are collapsed, characters that are path separators
        or invalid in file names are replaced with underscores, and the
        result is truncated, so that the name can be joined onto a directory
        without leaving it.

        Args:
            title (str): The paper title to derive the file name from.

        Returns:
            str: A file name ending in ``.pdf``. Falls back to
                ``untitled.pdf`` when nothing usable is left of the title.
        """
        # Collapse runs of whitespace (including line breaks) into single
        # spaces before replacing the remaining unsafe characters.
        stem = " ".join((title or "").split())
        stem = self._UNSAFE_FILENAME_CHARS.sub("_", stem)
        # Drop leading/trailing dots and spaces so the name can never be
        # "." or ".." and has no trailing characters some filesystems reject.
        stem = stem[: self._MAX_FILENAME_STEM].strip(" .")
        return f"{stem or 'untitled'}.pdf"

    def search_papers(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
    ) -> List[Dict[str, str]]:
        r"""Searches for academic papers on arXiv using a query string and
        optional paper IDs.

        Args:
            query (str): The search query string.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to search for. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to return. (default: :obj: `5`)

        Returns:
            List[Dict[str, str]]: A list of dictionaries, each containing
                information about a paper, including title, published date,
                authors (a list of names), entry ID, summary, PDF URL, and
                extracted text from the paper. The extracted text is an
                empty string when text extraction fails.
        """
        from arxiv2text import arxiv_to_text

        search_results = self._get_search_results(
            query, paper_ids, max_results
        )
        papers_data = []

        for paper in search_results:
            paper_info = {
                "title": paper.title,
                # NOTE(review): the value is taken from `paper.updated`
                # although the key is "published_date" -- confirm which
                # date is intended before changing it.
                "published_date": paper.updated.date().isoformat(),
                "authors": [author.name for author in paper.authors],
                "entry_id": paper.entry_id,
                "summary": paper.summary,
                "pdf_url": paper.pdf_url,
            }

            # Extract text from the paper. This is best-effort: a failure
            # is logged and the paper is still returned with empty text.
            try:
                # TODO: Use chunkr instead of arxiv_to_text for better
                # performance and reliability
                text = arxiv_to_text(paper_info["pdf_url"])
            except Exception as e:
                logger.error(
                    "Failed to extract text content from the PDF at "
                    "the specified URL. "
                    f"URL: {paper_info.get('pdf_url', 'Unknown')} | Error: {e}"
                )
                text = ""

            paper_info['paper_text'] = text
            papers_data.append(paper_info)

        return papers_data

    def download_papers(
        self,
        query: str,
        paper_ids: Optional[List[str]] = None,
        max_results: Optional[int] = 5,
        output_dir: Optional[str] = "./",
    ) -> str:
        r"""Downloads PDFs of academic papers from arXiv based on the provided
        query.

        Each PDF is saved under a file name derived from the paper title,
        with characters that are unsafe in file names replaced.

        Args:
            query (str): The search query string.
            paper_ids (List[str], optional): A list of specific arXiv paper
                IDs to download. (default: :obj: `None`)
            max_results (int, optional): The maximum number of search results
                to download. (default: :obj: `5`)
            output_dir (str, optional): The directory to save the downloaded
                PDFs. It is created if it does not exist. `None` or an empty
                string means the current directory. (default: :obj: `"./"`)

        Returns:
            str: Status message indicating success or failure.
        """
        # Treat an explicit `None` (allowed by the annotation) or an empty
        # string like the documented default.
        target_dir = output_dir or "./"

        try:
            # Create the target directory up front so that a missing
            # directory is not the reason a download fails.
            os.makedirs(target_dir, exist_ok=True)

            search_results = self._get_search_results(
                query, paper_ids, max_results
            )
            for paper in search_results:
                paper.download_pdf(
                    dirpath=target_dir,
                    filename=self._safe_pdf_filename(paper.title),
                )
            return "papers downloaded successfully"
        except Exception as e:
            # Errors are reported as a status string rather than raised.
            return f"An error occurred: {e}"

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.search_papers),
            FunctionTool(self.download_papers),
        ]