# Source code for camel.loaders.firecrawl_reader

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import os
from typing import Any, Dict, Optional

from pydantic import BaseModel


class Firecrawl:
    r"""Firecrawl allows you to turn entire websites into LLM-ready markdown.

    Every public method forwards to the underlying `FirecrawlApp` client
    and re-raises any failure as a `RuntimeError` that is explicitly
    chained to the original exception.

    Args:
        api_key (Optional[str]): API key for authenticating with the
            Firecrawl API. When not given (or empty), the value of the
            `FIRECRAWL_API_KEY` environment variable is used instead.
            (default: :obj:`None`)
        api_url (Optional[str]): Base URL for the Firecrawl API. When not
            given (or empty), the value of the `FIRECRAWL_API_URL`
            environment variable is used instead. (default: :obj:`None`)

    References:
        https://docs.firecrawl.dev/introduction
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
    ) -> None:
        # Imported lazily so the `firecrawl` package is only required when
        # this loader is actually instantiated.
        from firecrawl import FirecrawlApp

        # Explicit arguments win; otherwise fall back to the environment.
        self._api_key = api_key or os.environ.get("FIRECRAWL_API_KEY")
        self._api_url = api_url or os.environ.get("FIRECRAWL_API_URL")

        self.app = FirecrawlApp(api_key=self._api_key, api_url=self._api_url)

    def crawl(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Any:
        r"""Crawl a URL and all accessible subpages. Customize the crawl by
        setting different parameters, and receive the full response or a job
        ID based on the specified options.

        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                crawl request. (default: :obj:`None`)
            **kwargs (Any): Additional keyword arguments forwarded to
                `FirecrawlApp.crawl_url`, such as `poll_interval`,
                `idempotency_key`.

        Returns:
            Any: The crawl job ID or the crawl results if waiting until
                completion.

        Raises:
            RuntimeError: If the crawling process fails. The original
                exception is attached as `__cause__`.
        """
        try:
            return self.app.crawl_url(url=url, params=params, **kwargs)
        except Exception as e:
            # Chain explicitly so the underlying failure stays attached.
            raise RuntimeError(f"Failed to crawl the URL: {e}") from e

    def check_crawl_job(self, job_id: str) -> Dict:
        r"""Check the status of a crawl job.

        Args:
            job_id (str): The ID of the crawl job.

        Returns:
            Dict: The response including status of the crawl job.

        Raises:
            RuntimeError: If the check process fails. The original
                exception is attached as `__cause__`.
        """
        try:
            return self.app.check_crawl_status(job_id)
        except Exception as e:
            raise RuntimeError(
                f"Failed to check the crawl job status: {e}"
            ) from e

    def scrape(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
    ) -> Dict:
        r"""To scrape a single URL. This function supports advanced scraping
        by setting different parameters and returns the full scraped data as
        a dictionary.

        Reference: https://docs.firecrawl.dev/advanced-scraping-guide

        Args:
            url (str): The URL to read.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                scrape request. (default: :obj:`None`)

        Returns:
            Dict: The scraped data.

        Raises:
            RuntimeError: If the scrape process fails. The original
                exception is attached as `__cause__`.
        """
        try:
            return self.app.scrape_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to scrape the URL: {e}") from e

    def structured_scrape(
        self,
        url: str,
        response_format: BaseModel,
    ) -> Dict:
        r"""Use LLM to extract structured data from given URL.

        Args:
            url (str): The URL to read.
            response_format (BaseModel): A pydantic model that includes
                value types and field descriptions used to generate a
                structured response by LLM. Its `model_json_schema()`
                output is sent as the extraction schema.

        Returns:
            Dict: The value stored under the `"extract"` key of the scrape
                response, or an empty dict when that key is absent.

        Raises:
            RuntimeError: If the scrape process fails. The original
                exception is attached as `__cause__`.
        """
        try:
            data = self.app.scrape_url(
                url,
                {
                    'formats': ['extract'],
                    'extract': {
                        'schema': response_format.model_json_schema()
                    },
                },
            )
            return data.get("extract", {})
        except Exception as e:
            raise RuntimeError(
                f"Failed to perform structured scrape: {e}"
            ) from e

    def map_site(
        self, url: str, params: Optional[Dict[str, Any]] = None
    ) -> list:
        r"""Map a website to retrieve all accessible URLs.

        Args:
            url (str): The URL of the site to map.
            params (Optional[Dict[str, Any]]): Additional parameters for the
                map request. (default: :obj:`None`)

        Returns:
            list: A list containing the URLs found on the site.

        Raises:
            RuntimeError: If the mapping process fails. The original
                exception is attached as `__cause__`.
        """
        try:
            return self.app.map_url(url=url, params=params)
        except Exception as e:
            raise RuntimeError(f"Failed to map the site: {e}") from e