Source code for camel.toolkits.google_scholar_toolkit
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import Any, Dict, List, Optional
from camel.toolkits import FunctionTool
from camel.toolkits.base import BaseToolkit
[docs]
class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name of
            the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a name.
            (default: :obj:`False`)
        scholarly (module): The scholarly module for querying Google Scholar.
        author (Optional[Dict[str, Any]]): Cached author details, allowing
            manual assignment if desired.
    """

    # Matches the ``user=<id>`` query parameter of a profile URL. The
    # character class includes ``_`` and ``-`` so that IDs containing either
    # are captured whole instead of being rejected or truncated.
    _AUTHOR_ID_PATTERN = re.compile(r'user=([A-Za-z0-9_-]+)')

    def __init__(
        self,
        author_identifier: str,
        is_author_name: bool = False,
        use_free_proxies: bool = False,
        proxy_http: Optional[str] = None,
        proxy_https: Optional[str] = None,
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or name
                of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
            use_free_proxies (bool): Whether to use Free Proxies.
                (default: :obj:`False`)
            proxy_http (Optional[str]): Proxy http address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
            proxy_https (Optional[str]): Proxy https address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
        """
        # Imported lazily so the dependency is only needed when the toolkit
        # is actually instantiated.
        from scholarly import ProxyGenerator, scholarly

        # Set free proxies if needed.
        if use_free_proxies:
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        # Set a single proxy if an HTTP or HTTPS address is provided. This
        # is applied after the free-proxy setup, so when both are requested
        # this is the last `use_proxy` call made.
        if proxy_http or proxy_https:
            pg = ProxyGenerator()
            pg.SingleProxy(http=proxy_http, https=proxy_https)
            scholarly.use_proxy(pg)

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If the
                fetch yields no data, returns an empty dictionary.

        Raises:
            ValueError: If the author has to be fetched and cannot be
                resolved (see :meth:`get_author_detailed_info`).
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing author
                details to cache or `None` to clear the cached data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        match = self._AUTHOR_ID_PATTERN.search(self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author and caches it.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If the name search returns no author, or if no
                author ID can be extracted from the identifier.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Take the first result; an exhausted iterator means the search
            # found nothing, which is reported explicitly instead of leaking
            # a bare StopIteration to the caller.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    "No Google Scholar author found for name: "
                    f"{self.author_identifier!r}"
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                # Fail before issuing a request with a missing ID.
                raise ValueError(
                    "Could not extract an author ID from: "
                    f"{self.author_identifier!r}"
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)

        self._author = self.scholarly.fill(first_author_result)
        return self._author  # type: ignore[return-value]

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the author.
                Empty if the author data has no publications entry.
        """
        # `.get` keeps this safe when the author property falls back to {}.
        publications = self.author.get('publications', [])
        return [pub['bib']['title'] for pub in publications]

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by its
        title. Note that this method cannot retrieve the full content of the
        paper.

        Args:
            publication_title (str): The title of the publication to search
                for. The comparison is an exact string match.

        Returns:
            Optional[dict]: A dictionary containing detailed information about
                the publication if found; otherwise, `None`.
        """
        # `.get` keeps this safe when the author property falls back to {}.
        for publication in self.author.get('publications', []):
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None  # Return None if not found

    def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
        r"""Retrieves the full paper content from a given PDF URL using the
        arxiv2text tool.

        Args:
            pdf_url (str): The URL of the PDF file.

        Returns:
            Optional[str]: The full text extracted from the PDF, or `None` if
                an error occurs.
        """
        from arxiv2text import arxiv_to_text

        # Deliberately best-effort: any extraction failure is reported to the
        # caller as `None` rather than raised.
        try:
            return arxiv_to_text(pdf_url)
        except Exception:
            return None  # Return None in case of any error

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
            FunctionTool(self.get_full_paper_content_by_link),
        ]