Source code for camel.toolkits.google_scholar_toolkit

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import Any, Dict, List, Optional

from camel.toolkits import FunctionTool
from camel.toolkits.base import BaseToolkit


class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name
            of the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a
            name. (default: :obj:`False`)
        scholarly: The ``scholarly`` object imported from the
            ``scholarly`` package, used for querying Google Scholar.
        author (Optional[Dict[str, Any]]): Cached author details, allowing
            manual assignment if desired.
    """

    def __init__(
        self,
        author_identifier: str,
        is_author_name: bool = False,
        use_free_proxies: bool = False,
        proxy_http: Optional[str] = None,
        proxy_https: Optional[str] = None,
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's
        identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or
                name of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
            use_free_proxies (bool): Whether to use Free Proxies.
                (default: :obj:`False`)
            proxy_http (Optional[str]): Proxy http address passed to
                ``pg.SingleProxy``. (default: :obj:`None`)
            proxy_https (Optional[str]): Proxy https address passed to
                ``pg.SingleProxy``. (default: :obj:`None`)
        """
        # Imported lazily so the dependency is only needed when the
        # toolkit is actually instantiated.
        from scholarly import ProxyGenerator, scholarly

        # Set up free proxies if requested.
        if use_free_proxies:
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        # Set up a single proxy if an HTTP or HTTPS address is provided.
        # When free proxies were also requested, ``use_proxy`` is called a
        # second time here with the single-proxy generator.
        if proxy_http or proxy_https:
            pg = ProxyGenerator()
            pg.SingleProxy(http=proxy_http, https=proxy_https)
            scholarly.use_proxy(pg)

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not
        cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If no
                data is available, returns an empty dictionary.
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing
                author details to cache or `None` to clear the cached
                data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        # The character class includes ``_`` as well as ``-`` so that IDs
        # containing an underscore are captured whole instead of being
        # truncated or missed.
        match = re.search(r'user=([A-Za-z0-9_-]+)', self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author.

        The result is also stored as the cached author details.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If the identifier is a name and the search yields
                no result, or if the identifier is a URL from which no
                author ID can be extracted.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Retrieve the first result from the iterator; the default
            # keeps an empty result set from leaking a bare StopIteration.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    "No Google Scholar author found for name: "
                    f"{self.author_identifier!r}"
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                raise ValueError(
                    "Could not extract an author ID from identifier: "
                    f"{self.author_identifier!r}"
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)

        self._author = self.scholarly.fill(first_author_result)
        return self._author  # type: ignore[return-value]

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the
                author. Empty if the author details contain no
                publications.
        """
        # ``author`` may be an empty or manually assigned dictionary, so
        # the 'publications' key is not guaranteed to exist.
        publications = self.author.get('publications', [])
        return [pub['bib']['title'] for pub in publications]

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by
        its title. Note that this method cannot retrieve the full content
        of the paper.

        Args:
            publication_title (str): The title of the publication to
                search for.

        Returns:
            Optional[dict]: A dictionary containing detailed information
                about the publication if found; otherwise, `None`.
        """
        # ``author`` may be an empty or manually assigned dictionary, so
        # the 'publications' key is not guaranteed to exist.
        for publication in self.author.get('publications', []):
            # Titles are compared exactly (case- and whitespace-sensitive).
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None  # Return None if not found

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        tools = [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
        ]
        # NOTE(review): `get_full_paper_content_by_link` is not defined
        # alongside the other tool methods here. Register it only when the
        # attribute exists so that `get_tools` does not fail with
        # AttributeError.
        full_paper_getter = getattr(
            self, 'get_full_paper_content_by_link', None
        )
        if callable(full_paper_getter):
            tools.append(FunctionTool(full_paper_getter))
        return tools