Source code for camel.utils.filename
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import platform
import re
import unicodedata
MAX_FILENAME_LENGTH = 255
WINDOWS_RESERVED = {
'CON',
'PRN',
'AUX',
'NUL',
'COM1',
'COM2',
'COM3',
'COM4',
'LPT1',
'LPT2',
'LPT3',
}
[docs]
def sanitize_filename(
url_name: str,
default: str = "index",
max_length: int = MAX_FILENAME_LENGTH,
) -> str:
r"""Sanitize a URL path into a safe filename that is safe for
most platforms.
Args:
url_name (str): The URL path to sanitize.
default (str): Default name if sanitization results in empty string.
(default: :obj:`"index"`)
max_length (int): Maximum length of the filename.
(default: :obj:`MAX_FILENAME_LENGTH`)
Returns:
str: A sanitized filename safe for most platforms.
"""
if max_length < 1:
raise ValueError(
f"`max_length` must be greater than " f"0, got {max_length}"
)
if not url_name:
return default
# Normalize Unicode characters by removing characters
# such as accents and special characters:
# café☕.txt -> cafe.txt
url_name = unicodedata.normalize('NFKD', url_name)
url_name = url_name.encode('ASCII', 'ignore').decode('ASCII')
# Replace special characters such as:
# Separators: my/file:name*.txt -> my_file_name.txt etc.
url_name = re.sub(r'[\\/:*?"<>|.]', '_', url_name)
url_name = re.sub(r'_+', '_', url_name) # Collapse multiple underscores
url_name = url_name.strip('_') # Remove leading/trailing underscores
# Handle empty result if all characters are invalid:
if not url_name:
return default
# Handle Windows reserved names
if platform.system() == "Windows" and url_name.upper() in WINDOWS_RESERVED:
url_name = f"_{url_name}"
return url_name[:max_length]