# Source code for camel.toolkits.pubmed_toolkit
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any, Dict, List, Optional, Union, cast
import requests
from camel.logger import get_logger
from camel.toolkits import BaseToolkit, FunctionTool
logger = get_logger(__name__)
[docs]
class PubMedToolkit(BaseToolkit):
r"""A toolkit for interacting with PubMed's E-utilities API to access
MEDLINE data.
This toolkit provides functionality to search and retrieve papers from the
PubMed database, including abstracts, citations, and other metadata.
Args:
timeout (Optional[float]): The timeout for API requests in seconds.
(default: :obj:`None`)
"""
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def __init__(self, timeout: Optional[float] = None) -> None:
    r"""Creates a PubMed toolkit instance.

    The only work done here is delegating to the base toolkit
    constructor, passing ``timeout`` through unchanged.

    Args:
        timeout (Optional[float]): The timeout for API requests in
            seconds, forwarded to the parent constructor.
            (default: :obj:`None`)
    """
    # NOTE(review): other methods read `self.timeout`; presumably the
    # base class stores the value under that name - confirm.
    super().__init__(timeout=timeout)
def _make_request(
    self,
    endpoint: str,
    params: Dict[str, Union[str, int]],
    retries: int = 3,
) -> Optional[Dict[str, Any]]:
    r"""Makes a GET request to a PubMed/MEDLINE E-utilities endpoint and
    decodes the JSON body, retrying when the HTTP exchange fails.

    Only failures of the request itself (any
    ``requests.RequestException`` raised by ``requests.get`` or
    ``raise_for_status``) are retried. An empty body, or a body that is
    not valid JSON, is logged once and ends the call without a retry.

    Args:
        endpoint (str): The API endpoint to call, appended to
            ``BASE_URL``.
        params (Dict[str, Union[str, int]]): Query parameters.
        retries (int, optional): Maximum number of attempts. With a
            non-positive value no request is sent and None is returned.
            (default: :obj:`3`)

    Returns:
        Optional[Dict[str, Any]]: JSON response if successful, else None.
    """
    url = f"{self.BASE_URL}/{endpoint}"

    for attempt in range(retries):
        try:
            response = requests.get(
                url, params=params, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            if attempt == retries - 1:
                logger.error(f"Failed to fetch data from PubMed: {e!s}")
                return None
            logger.warning(f"Request attempt {attempt + 1} failed: {e!s}")
            continue

        if not response.text:
            logger.warning(f"Empty response from PubMed API: {endpoint}")
            return None

        # JSON decoding is deliberately kept out of the retry `try`:
        # recent `requests` releases raise a JSONDecodeError that is
        # *also* a RequestException, so handling both in one `try` would
        # send parse errors down the retry path and report them as
        # fetch failures.
        try:
            return response.json()
        except ValueError as e:
            logger.error(f"Failed to parse JSON response: {e!s}")
            return None

    return None
[docs]
def search_papers(
    self,
    query: str,
    max_results: int = 10,
    sort: str = "relevance",
    date_range: Optional[Dict[str, str]] = None,
    publication_type: Optional[List[str]] = None,
) -> List[Dict[str, str]]:
    r"""Search for biomedical papers in MEDLINE via PubMed with advanced
    filtering options.

    The query is first sent to ``esearch.fcgi`` to obtain matching PubMed
    IDs; each ID is then resolved through :meth:`get_paper_details`.
    Papers whose details cannot be retrieved are left out of the result.

    Args:
        query (str): The search query string.
        max_results (int, optional): Maximum number of results to return.
            (default: :obj:`10`)
        sort (str, optional): Sort order - 'relevance' or 'date'. Any
            value other than 'relevance' is treated as 'date'.
            (default: :obj:`"relevance"`)
        date_range (Optional[Dict[str, str]], optional): Date range filter
            with 'from' and 'to' dates in YYYY/MM/DD format. A missing
            key is substituted with an empty string.
            (default: :obj:`None`)
        publication_type (Optional[List[str]], optional): Filter by
            publication types (e.g., ["Journal Article", "Review"]).
            Multiple types are combined with OR.
            (default: :obj:`None`)

    Returns:
        List[Dict[str, str]]: List of papers with their metadata, or an
            empty list if the search fails or matches nothing.
    """
    # Build query with filters
    filtered_query = query
    if publication_type:
        type_filter = " OR ".join(
            f'"{pt}"[Publication Type]' for pt in publication_type
        )
        filtered_query = f"({query}) AND ({type_filter})"
    if date_range:
        # NOTE(review): both 'from' and 'to' are expected; if one is
        # missing the range is sent with an empty bound - confirm whether
        # PubMed accepts that form.
        date_filter = (
            f"{date_range.get('from', '')}:"
            f"{date_range.get('to', '')}[Date - Publication]"
        )
        filtered_query = f"({filtered_query}) AND ({date_filter})"

    # Search for paper IDs.
    # The date sort key must be "pub_date": `requests` percent-encodes
    # parameter values, so the URL-style spelling "pub+date" would reach
    # the server as a literal plus sign rather than a recognised key.
    search_params: Dict[str, Union[str, int]] = {
        "db": "pubmed",
        "term": filtered_query,
        "retmax": max_results,
        "sort": "relevance" if sort == "relevance" else "pub_date",
        "retmode": "json",
    }
    search_data = self._make_request("esearch.fcgi", search_params)
    if not search_data or "esearchresult" not in search_data:
        logger.error("Failed to retrieve search results")
        return []

    paper_ids = search_data["esearchresult"].get("idlist", [])
    if not paper_ids:
        return []

    # Fetch details for papers, skipping any that could not be resolved
    results = []
    for paper_id in paper_ids:
        paper_details = self.get_paper_details(paper_id)
        if paper_details:
            results.append(paper_details)
    return results
[docs]
def get_paper_details(
    self,
    paper_id: Union[str, int],
    include_references: bool = False,
) -> Optional[Dict[str, Any]]:
    r"""Get detailed information about a specific biomedical paper from
    MEDLINE/PubMed.

    Combines the ``esummary.fcgi`` record for the paper with its abstract
    (via :meth:`get_abstract`) and, optionally, the IDs of the papers it
    references (via ``elink.fcgi``).

    Args:
        paper_id (Union[str, int]): PubMed ID of the paper.
        include_references (bool, optional): Whether to include referenced
            papers. (default: :obj:`False`)

    Returns:
        Optional[Dict[str, Any]]: Paper details including title, authors,
            abstract, etc., or None if retrieval fails. The
            ``references`` entry is a list when ``include_references`` is
            True (empty if none were found) and None otherwise.
    """
    pmid = str(paper_id)

    # Fetch summary
    summary_params: Dict[str, Union[str, int]] = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "json",
    }
    summary_data = self._make_request("esummary.fcgi", summary_params)
    if not summary_data or "result" not in summary_data:
        logger.error(
            f"Failed to retrieve paper details for ID: {paper_id}"
        )
        return None

    # The summary for a paper is keyed by its ID inside "result". Treat a
    # missing or malformed entry as a failed retrieval instead of letting
    # a KeyError escape to the caller.
    result_section = summary_data["result"]
    paper_data = (
        result_section.get(pmid)
        if isinstance(result_section, dict)
        else None
    )
    if not isinstance(paper_data, dict):
        logger.error(
            f"Failed to retrieve paper details for ID: {paper_id}"
        )
        return None

    # Handle authors - they come as a list of dicts with 'name' key;
    # plain strings are accepted as well, anything else is skipped
    author_names = []
    for author in paper_data.get("authors", []):
        if isinstance(author, dict) and "name" in author:
            author_names.append(author["name"])
        elif isinstance(author, str):
            author_names.append(author)

    # Get abstract
    abstract = self.get_abstract(paper_id)

    # Get references if requested
    references: Optional[List[Any]] = None
    if include_references:
        references = []
        ref_params: Dict[str, Union[str, int]] = {
            "db": "pubmed",
            "id": pmid,
            "linkname": "pubmed_pubmed_refs",
            "retmode": "json",
        }
        ref_data = self._make_request("elink.fcgi", ref_params)
        if ref_data and "linksets" in ref_data:
            try:
                references = ref_data["linksets"][0]["linksetdbs"][0][
                    "links"
                ]
            except (KeyError, IndexError):
                logger.warning(
                    f"No references found for paper ID: {paper_id}"
                )

    return {
        "id": pmid,
        "title": paper_data.get("title", ""),
        "authors": ", ".join(author_names),
        "journal": paper_data.get("source", ""),
        "pub_date": paper_data.get("pubdate", ""),
        "abstract": abstract,
        # NOTE(review): filled from the summary's "elocationid" field as
        # returned - confirm it always carries a DOI.
        "doi": paper_data.get("elocationid", ""),
        "keywords": paper_data.get("keywords", []),
        "mesh_terms": paper_data.get("mesh", []),
        "publication_types": paper_data.get("pubtype", []),
        "references": references,
    }
[docs]
def get_abstract(self, paper_id: Union[str, int]) -> str:
    r"""Get the abstract of a specific biomedical paper from MEDLINE/
    PubMed.

    Calls ``efetch.fcgi`` with ``rettype=abstract`` and ``retmode=text``
    and returns the response body with surrounding whitespace removed.
    A single attempt is made; there are no retries.

    Args:
        paper_id (Union[str, int]): PubMed ID of the paper.

    Returns:
        str: The abstract text, or an empty string if the request fails.
    """
    params: Dict[str, Union[str, int]] = {
        "db": "pubmed",
        "id": str(paper_id),
        "rettype": "abstract",
        "retmode": "text",
    }
    try:
        # Pass the toolkit's timeout, as `_make_request` does, so this
        # call cannot block indefinitely on an unresponsive server.
        response = requests.get(
            f"{self.BASE_URL}/efetch.fcgi",
            params=params,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.text.strip()
    except requests.exceptions.RequestException as e:
        logger.error(
            f"Failed to retrieve abstract for ID {paper_id}: {e!s}"
        )
        return ""
[docs]
def get_citation_count(self, paper_id: Union[str, int]) -> int:
    r"""Count the papers that cite a given biomedical paper in MEDLINE/
    PubMed.

    The count is the length of the link list returned by ``elink.fcgi``
    for the ``pubmed_pubmed_citedin`` link name.

    Args:
        paper_id (Union[str, int]): PubMed ID of the paper.

    Returns:
        int: Number of citations, or 0 if the request fails or the
            response does not contain the expected link list.
    """
    link_query: Dict[str, Union[str, int]] = {
        "db": "pubmed",
        "id": str(paper_id),
        "linkname": "pubmed_pubmed_citedin",
        "retmode": "json",
    }
    link_data = self._make_request("elink.fcgi", link_query)

    if link_data and "linksets" in link_data:
        try:
            # First link set / first link database holds the citing IDs
            return len(link_data["linksets"][0]["linksetdbs"][0]["links"])
        except (KeyError, IndexError):
            # Expected nesting is absent: treat as "no citations"
            pass
    return 0
[docs]
def get_tools(self) -> List[FunctionTool]:
    r"""Wraps this toolkit's public methods as function tools.

    Returns:
        List[FunctionTool]: One tool per exposed method, in this order:
            ``search_papers``, ``get_paper_details``, ``get_abstract``,
            ``get_citation_count``, ``get_related_papers``.
    """
    tools: List[FunctionTool] = []
    tools.append(FunctionTool(self.search_papers))
    tools.append(FunctionTool(self.get_paper_details))
    tools.append(FunctionTool(self.get_abstract))
    tools.append(FunctionTool(self.get_citation_count))
    # NOTE(review): `get_related_papers` is not defined in the part of
    # the class visible here - confirm it exists, otherwise this line
    # raises AttributeError.
    tools.append(FunctionTool(self.get_related_papers))
    return tools