# Source code for camel.loaders.jina_url_reader
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
from typing import Any, Optional
from warnings import warn
from camel.types.enums import JinaReturnFormat
JINA_ENDPOINT = "https://r.jina.ai/"
# [docs]
class JinaURLReader:
    r"""URL Reader provided by Jina AI. The output is cleaner and more
    LLM-friendly than the URL Reader of UnstructuredIO. Can be configured to
    replace the UnstructuredIO URL Reader in the pipeline.

    Args:
        api_key (Optional[str], optional): The API key for Jina AI. If not
            provided, the `JINA_API_KEY` environment variable is used.
            Without any key the reader will have a lower rate limit.
            Defaults to None.
        return_format (JinaReturnFormat, optional): The level of detail
            of the returned content, which is optimized for LLMs. For
            now screenshots are not supported. Defaults to
            JinaReturnFormat.DEFAULT.
        json_response (bool, optional): Whether to return the response
            in JSON format. Defaults to False.
        timeout (int, optional): The maximum time in seconds to wait for
            the page to be rendered. It is sent to the service as the
            `X-Timeout` header and also bounds the HTTP request made by
            this client (with a small extra margin). Defaults to 30.
        **kwargs (Any): Additional keyword arguments, including proxies,
            cookies, etc. It should align with the HTTP Header field and
            value pairs listed in the reference. Entries given here
            override the headers built from the arguments above.

    References:
        https://jina.ai/reader
    """

    # Extra seconds the HTTP client waits beyond the server-side render
    # timeout, so the service has a chance to answer (or report its own
    # timeout) before the client gives up.
    _CLIENT_TIMEOUT_MARGIN: float = 10.0

    # Client-side timeout handed to `requests`; None means "wait forever".
    # Declared on the class so instances created without `__init__` still
    # have a value.
    _request_timeout: Optional[float] = None

    def __init__(
        self,
        api_key: Optional[str] = None,
        return_format: JinaReturnFormat = JinaReturnFormat.DEFAULT,
        json_response: bool = False,
        timeout: int = 30,
        **kwargs: Any,
    ) -> None:
        api_key = api_key or os.getenv('JINA_API_KEY')
        if not api_key:
            # stacklevel=2 attributes the warning to the code constructing
            # the reader rather than to this module.
            warn(
                "JINA_API_KEY not set. This will result in a low rate limit "
                "of Jina URL Reader. Get API key here: https://jina.ai/reader.",
                stacklevel=2,
            )

        # Optional fields are None when not requested and are filtered out
        # below; `kwargs` comes last so callers can override any header.
        raw_headers = {
            "Authorization": f"Bearer {api_key}" if api_key else None,
            "X-Return-Format": return_format.value,
            "Accept": "application/json" if json_response else None,
            "X-Timeout": str(timeout),
            **kwargs,
        }

        # Drop unset (None) and otherwise empty values so they are not sent.
        self._headers = {k: v for k, v in raw_headers.items() if v}

        # Without a client-side timeout a stalled connection would block the
        # caller indefinitely, regardless of the server-side `X-Timeout`.
        try:
            self._request_timeout = (
                max(float(timeout), 0.0) + self._CLIENT_TIMEOUT_MARGIN
            )
        except (TypeError, ValueError):
            # Non-numeric timeout: fall back to no client timeout instead of
            # failing construction.
            self._request_timeout = None

    def read_content(self, url: str) -> str:
        r"""Reads the content of a URL and returns it as a string with
        given form.

        Args:
            url (str): The URL to read.

        Returns:
            str: The content of the URL.

        Raises:
            ValueError: If the request fails for any reason, including a
                client-side timeout or a non-success HTTP status. The
                original exception is attached as the cause.
        """
        # Imported lazily so the module can be imported without `requests`.
        import requests

        full_url = f"{JINA_ENDPOINT}{url}"
        try:
            resp = requests.get(
                full_url,
                headers=self._headers,
                timeout=self._request_timeout,
            )
            resp.raise_for_status()
        except Exception as e:
            # Every failure is surfaced as ValueError so callers only need
            # to handle a single exception type.
            raise ValueError(f"Failed to read content from {url}: {e}") from e

        return resp.text