# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import ClassVar, Dict, List, Optional
from camel.logger import get_logger
logger = get_logger(__name__)
[docs]
class MarkItDownLoader:
r"""MarkitDown convert various file types into Markdown format.
Supported Input Formats:
- PDF
- Microsoft Office documents:
- Word (.doc, .docx)
- Excel (.xls, .xlsx)
- PowerPoint (.ppt, .pptx)
- EPUB
- HTML
- Images (with EXIF metadata and OCR support)
- Audio files (with EXIF metadata and speech transcription)
- Text-based formats:
- CSV
- JSON
- XML
- ZIP archives (iterates over contents)
- YouTube URLs (via transcript extraction)
"""
SUPPORTED_FORMATS: ClassVar[List[str]] = [
".pdf",
".doc",
".docx",
".xls",
".xlsx",
".ppt",
".pptx",
".epub",
".html",
".htm",
".jpg",
".jpeg",
".png",
".mp3",
".wav",
".csv",
".json",
".xml",
".zip",
".txt",
]
def __init__(
self,
llm_client: Optional[object] = None,
llm_model: Optional[str] = None,
):
r"""Initializes the Converter.
Args:
llm_client (Optional[object]): Optional client for LLM integration.
(default: :obj:`None`)
llm_model (Optional[str]): Optional model name for the LLM.
(default: :obj:`None`)
"""
from markitdown import MarkItDown
try:
self.converter = MarkItDown(
llm_client=llm_client, llm_model=llm_model
)
logger.info("MarkItDownLoader initialized successfully.")
except Exception as e:
logger.error(f"Failed to initialize MarkItDown Converter: {e}")
raise Exception(f"Failed to initialize MarkItDown Converter: {e}")
def _validate_format(self, file_path: str) -> bool:
r"""Validates if the file format is supported.
Args:
file_path (str): Path to the input file.
Returns:
bool: True if the format is supported, False otherwise.
"""
_, ext = os.path.splitext(file_path)
return ext.lower() in self.SUPPORTED_FORMATS
[docs]
def convert_file(self, file_path: str) -> str:
r"""Converts the given file to Markdown format.
Args:
file_path (str): Path to the input file.
Returns:
str: Converted Markdown text.
Raises:
FileNotFoundError: If the specified file does not exist.
ValueError: If the file format is not supported.
Exception: For other errors during conversion.
"""
if not os.path.isfile(file_path):
logger.error(f"File not found: {file_path}")
raise FileNotFoundError(f"File not found: {file_path}")
if not self._validate_format(file_path):
logger.error(
f"Unsupported file format: {file_path}."
f"Supported formats are "
f"{MarkItDownLoader.SUPPORTED_FORMATS}"
)
raise ValueError(f"Unsupported file format: {file_path}")
try:
logger.info(f"Converting file: {file_path}")
result = self.converter.convert(file_path)
logger.info(f"File converted successfully: {file_path}")
return result.text_content
except Exception as e:
logger.error(f"Error converting file '{file_path}': {e}")
raise Exception(f"Error converting file '{file_path}': {e}")
[docs]
def convert_files(
self,
file_paths: List[str],
parallel: bool = False,
skip_failed: bool = False,
) -> Dict[str, str]:
r"""Converts multiple files to Markdown format.
Args:
file_paths (List[str]): List of file paths to convert.
parallel (bool): Whether to process files in parallel.
(default: :obj:`False`)
skip_failed (bool): Whether to skip failed files instead
of including error messages.
(default: :obj:`False`)
Returns:
Dict[str, str]: Dictionary mapping file paths to their
converted Markdown text.
Raises:
Exception: For errors during conversion of any file if
skip_failed is False.
"""
from tqdm.auto import tqdm
converted_files = {}
if parallel:
with ThreadPoolExecutor() as executor:
future_to_path = {
executor.submit(self.convert_file, path): path
for path in file_paths
}
for future in tqdm(
as_completed(future_to_path),
total=len(file_paths),
desc="Converting files (parallel)",
):
path = future_to_path[future]
try:
converted_files[path] = future.result()
except Exception as e:
if skip_failed:
logger.warning(
f"Skipping file '{path}' due to error: {e}"
)
else:
logger.error(
f"Error processing file '{path}': {e}"
)
converted_files[path] = f"Error: {e}"
else:
for path in tqdm(file_paths, desc="Converting files (sequential)"):
try:
logger.info(f"Processing file: {path}")
converted_files[path] = self.convert_file(path)
except Exception as e:
if skip_failed:
logger.warning(
f"Skipping file '{path}' due to error: {e}"
)
else:
logger.error(f"Error processing file '{path}': {e}")
converted_files[path] = f"Error: {e}"
return converted_files