Source code for camel.toolkits.excel_toolkit
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import List
import pandas as pd
from camel.logger import get_logger
from camel.toolkits.base import BaseToolkit
from camel.toolkits.function_tool import FunctionTool
logger = get_logger(__name__)
[docs]
class ExcelToolkit(BaseToolkit):
r"""A class representing a toolkit for extract detailed cell information
from an Excel file.
This class provides method for processing docx, pdf, pptx, etc. It cannot
process excel files.
"""
def _convert_to_markdown(self, df: pd.DataFrame) -> str:
r"""Convert DataFrame to Markdown format table.
Args:
df (pd.DataFrame): DataFrame containing the Excel data.
Returns:
str: Markdown formatted table.
"""
from tabulate import tabulate
md_table = tabulate(df, headers='keys', tablefmt='pipe')
return str(md_table)
[docs]
def extract_excel_content(self, document_path: str) -> str:
r"""Extract detailed cell information from an Excel file, including
multiple sheets.
Args:
document_path (str): The path of the Excel file.
Returns:
str: Extracted excel information, including details of each sheet.
"""
from openpyxl import load_workbook
from xls2xlsx import XLS2XLSX
logger.debug(
f"Calling extract_excel_content with document_path"
f": {document_path}"
)
if not (
document_path.endswith("xls")
or document_path.endswith("xlsx")
or document_path.endswith("csv")
):
logger.error("Only xls, xlsx, csv files are supported.")
return (
f"Failed to process file {document_path}: "
f"It is not excel format. Please try other ways."
)
if document_path.endswith("csv"):
try:
df = pd.read_csv(document_path)
md_table = self._convert_to_markdown(df)
return f"CSV File Processed:\n{md_table}"
except Exception as e:
logger.error(f"Failed to process file {document_path}: {e}")
return f"Failed to process file {document_path}: {e}"
if document_path.endswith("xls"):
output_path = document_path.replace(".xls", ".xlsx")
x2x = XLS2XLSX(document_path)
x2x.to_xlsx(output_path)
document_path = output_path
# Load the Excel workbook
wb = load_workbook(document_path, data_only=True)
sheet_info_list = []
# Iterate through all sheets
for sheet in wb.sheetnames:
ws = wb[sheet]
cell_info_list = []
for row in ws.iter_rows():
for cell in row:
row_num = cell.row
col_letter = cell.column_letter
cell_value = cell.value
font_color = None
if (
cell.font
and cell.font.color
and "rgb=None" not in str(cell.font.color)
): # Handle font color
font_color = cell.font.color.rgb
fill_color = None
if (
cell.fill
and cell.fill.fgColor
and "rgb=None" not in str(cell.fill.fgColor)
): # Handle fill color
fill_color = cell.fill.fgColor.rgb
cell_info_list.append(
{
"index": f"{row_num}{col_letter}",
"value": cell_value,
"font_color": font_color,
"fill_color": fill_color,
}
)
# Convert the sheet to a DataFrame and then to markdown
sheet_df = pd.read_excel(
document_path, sheet_name=sheet, engine='openpyxl'
)
markdown_content = self._convert_to_markdown(sheet_df)
# Collect all information for the sheet
sheet_info = {
"sheet_name": sheet,
"cell_info_list": cell_info_list,
"markdown_content": markdown_content,
}
sheet_info_list.append(sheet_info)
result_str = ""
for sheet_info in sheet_info_list:
result_str += f"""
Sheet Name: {sheet_info['sheet_name']}
Cell information list:
{sheet_info['cell_info_list']}
Markdown View of the content:
{sheet_info['markdown_content']}
{'-'*40}
"""
return result_str
[docs]
def get_tools(self) -> List[FunctionTool]:
r"""Returns a list of FunctionTool objects representing the functions
in the toolkit.
Returns:
List[FunctionTool]: A list of FunctionTool objects representing
the functions in the toolkit.
"""
return [
FunctionTool(self.extract_excel_content),
]