# Source code for camel.verifiers.python_verifier

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import ast
import asyncio
import os
import shutil
import subprocess
import sys
import tempfile
import venv
from typing import List, Optional, Tuple

from camel.extractors.base import BaseExtractor
from camel.logger import get_logger
from camel.verifiers import BaseVerifier

from .models import VerificationOutcome, VerificationResult

logger = get_logger(__name__)


class PythonVerifier(BaseVerifier):
    r"""The PythonVerifier class verifies Python-based implementations by
    executing them in an isolated virtual environment.

    Features:
    - Creates a temporary virtual environment (via :mod:`venv` or ``uv``).
    - Installs required packages before executing the provided script.
    - Executes the script and compares the output against a ground truth,
      if supplied.
    - Removes the virtual environment on cleanup or when setup fails.

    The verification process ensures that the code runs in a controlled
    environment, minimizing external dependencies and conflicts.
    """

    def __init__(
        self,
        extractor: Optional[BaseExtractor] = None,
        timeout: Optional[float] = 30.0,
        required_packages: Optional[List[str]] = None,
        **kwargs,
    ):
        r"""Initializes the PythonVerifier.

        Args:
            extractor (Optional[BaseExtractor], optional): The extractor to
                use for extracting code from the solution.
                (default: :obj:`None`)
            timeout (Optional[float], optional): The execution timeout in
                seconds. (default: :obj:`30.0`)
            required_packages (Optional[List[str]], optional): A list of
                packages to install in the virtual environment.
                (default: :obj:`None`)
        """
        # TODO: Use CAMEL's Interpreter to execute the code
        super().__init__(extractor=extractor, timeout=timeout, **kwargs)
        self.venv_path: Optional[str] = None
        self.required_packages = required_packages or []

        # Directory inside a virtual environment that holds executables.
        if os.name == 'nt':  # Windows
            self.bin_dir = 'Scripts'
        else:  # Unix-like systems
            self.bin_dir = 'bin'

    async def _setup(self, **kwargs) -> None:
        r"""Set up a virtual environment and install required packages.

        Args:
            **kwargs: If ``uv`` is truthy, or a uv-managed runtime is
                detected, the environment is created with ``uv`` instead of
                the standard :mod:`venv` module.

        Raises:
            Exception: Whatever environment creation or package installation
                raised. The temporary directory is removed and
                ``venv_path`` reset to ``None`` before re-raising.
        """
        # Check if we're in a uv environment and use uv if available
        if kwargs.get("uv", False) or self._is_uv_environment():
            logger.info("[UV] Detected uv environment. Using uv for setup.")
            self._setup_with_uv()
            return

        self.venv_path = tempfile.mkdtemp()
        try:
            # system_site_packages=True lets the virtual environment see the
            # packages already installed for the system interpreter.
            venv.create(
                self.venv_path, with_pip=True, system_site_packages=True
            )
            logger.info(f"Virtual environment created at {self.venv_path}")
        except Exception as e:
            logger.error(f"Failed to create virtual environment: {e}")
            # Clean up resources before re-raising
            self._remove_venv()
            raise

        if self.required_packages:
            venv_pip = os.path.join(self.venv_path, self.bin_dir, "pip")
            self._run_setup_command(
                [venv_pip, "install", *self.required_packages],
                failure_prefix="Failed to install required packages: ",
                timeout_message=(
                    f"Package installation timed out "
                    f"after {self._timeout} seconds"
                ),
            )
            logger.info(
                "Installed required packages: "
                f"{', '.join(self.required_packages)}"
            )

    def _is_uv_environment(self) -> bool:
        r"""Detect whether the current Python runtime is managed by uv.

        Returns:
            bool: True if ``UV_CACHE_DIR`` is set in the environment or the
                interpreter path contains the substring ``uv``.
        """
        # NOTE(review): the substring test also matches unrelated paths that
        # merely contain "uv"; kept as-is to preserve detection behavior.
        return "UV_CACHE_DIR" in os.environ or "uv" in sys.executable

    def _setup_with_uv(self) -> None:
        r"""Create virtual environment and install packages using uv.

        Raises:
            Exception: Whatever the ``uv`` invocation raised (non-zero exit,
                timeout, or a missing ``uv`` executable). The temporary
                directory is removed before re-raising.
        """
        self.venv_path = tempfile.mkdtemp()
        self._run_setup_command(
            ["uv", "venv", self.venv_path],
            failure_prefix="[UV] Failed to create virtual environment:\n",
            timeout_message=(
                f"[UV] Virtual environment creation timed "
                f"out after {self._timeout} seconds"
            ),
        )
        logger.info(f"[UV] Virtual environment created at {self.venv_path}")

        if self.required_packages:
            self._run_setup_command(
                [
                    "uv",
                    "pip",
                    "install",
                    "--python",
                    self._venv_python(),
                    *self.required_packages,
                ],
                failure_prefix=(
                    "[UV] Failed to install required packages via uv:\n"
                ),
                timeout_message=(
                    f"[UV] Package installation timed "
                    f"out after {self._timeout} seconds"
                ),
            )
            logger.info(
                "[UV] Installed required packages via uv: "
                f"{', '.join(self.required_packages)}"
            )

    def _run_setup_command(
        self,
        command: List[str],
        failure_prefix: str,
        timeout_message: str,
    ) -> None:
        r"""Run one setup command, removing the venv if it fails.

        Args:
            command (List[str]): The argument vector to execute.
            failure_prefix (str): Text logged in front of the error details
                when the command fails.
            timeout_message (str): Text logged when the command times out.

        Raises:
            subprocess.CalledProcessError: If the command exits non-zero.
            subprocess.TimeoutExpired: If the command exceeds the timeout.
            Exception: Any other failure to run the command (for example a
                missing executable).
        """
        try:
            subprocess.run(
                command,
                check=True,
                capture_output=True,
                timeout=self._timeout,
            )
        except subprocess.CalledProcessError as e:
            details = (e.stderr or b"").decode(errors="replace").strip()
            logger.error(f"{failure_prefix}{details}")
            self._remove_venv()
            raise
        except subprocess.TimeoutExpired:
            logger.error(timeout_message)
            self._remove_venv()
            raise
        except Exception as e:
            # E.g. FileNotFoundError when the executable is missing; without
            # this branch the temporary directory would be leaked.
            logger.error(f"{failure_prefix}{e}")
            self._remove_venv()
            raise

    def _remove_venv(self) -> None:
        r"""Delete the virtual environment directory and forget its path."""
        if self.venv_path and os.path.exists(self.venv_path):
            shutil.rmtree(self.venv_path)
        self.venv_path = None

    def _venv_python(self) -> str:
        r"""Return the path of the virtual environment's interpreter.

        Callers must ensure ``venv_path`` is set.
        """
        return os.path.join(
            self.venv_path,  # type: ignore[arg-type]
            self.bin_dir,
            "python.exe" if os.name == 'nt' else "python",
        )

    async def _cleanup(self) -> None:
        r"""Clean up the virtual environment."""
        if self.venv_path:
            removed_path = self.venv_path
            # Tolerates a directory that has already disappeared and always
            # resets ``venv_path``.
            self._remove_venv()
            logger.info(f"Virtual environment at {removed_path} removed")

    async def _verify_implementation(
        self, solution: str, ground_truth: Optional[str]
    ) -> VerificationResult:
        r"""Executes the provided Python solution in an isolated environment
        and verifies its output against an expected ground truth expression.

        A solution that is a single non-call expression is evaluated with
        :func:`ast.literal_eval` in the verifier process. Any other solution
        is run as a script in a subprocess inside the virtual environment.
        The ground truth is assumed to be a Python literal expression and is
        evaluated with :func:`ast.literal_eval` as well. If the script output
        cannot be evaluated, string comparison is used as a fallback.

        Args:
            solution (str): The Python code or expression to execute and
                verify.
            ground_truth (Optional[str]): The expected value as a Python
                expression. If None, only execution success is verified.

        Returns:
            VerificationResult: Result of the verification process.
        """
        # Check for virtual environment setup
        if not self.venv_path:
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message="Virtual environment is not set up.",
            )

        # If the solution is an expression, evaluate it directly
        if self._is_expression(solution):
            try:
                sol_val = ast.literal_eval(solution)
            except Exception as e:
                return VerificationResult(
                    status=VerificationOutcome.ERROR,
                    result="",
                    error_message=f"Expression evaluation error: {e}",
                )
            if ground_truth is None:
                return VerificationResult(
                    status=VerificationOutcome.SUCCESS,
                    result=str(sol_val),
                )
            return self._compare_to_ground_truth(
                sol_val, ground_truth, str(sol_val)
            )

        # Otherwise, run the code block,
        # which should already include a print(...) in the end
        venv_python = self._venv_python()
        if not os.path.exists(venv_python):
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message="Python binary not found in virtual environment",
            )

        try:
            sol_out, sol_err, sol_code = await self._run_code_block(
                solution, venv_python
            )
            if sol_code != 0:
                return VerificationResult(
                    status=VerificationOutcome.ERROR,
                    result=sol_out,
                    error_message=f"Solution code error:\n{sol_err}",
                )

            if ground_truth is None:
                return VerificationResult(
                    status=VerificationOutcome.SUCCESS,
                    result=sol_out,
                )

            try:
                # First, try to evaluate the output as-is.
                sol_val = ast.literal_eval(sol_out)
            except Exception as e:
                logger.warning(f"Direct eval failed: {e}.")
                sol_val = None

            # A literal ``None`` output is indistinguishable from a failed
            # evaluation here and therefore also takes the string fallback.
            if sol_val is not None:
                return self._compare_to_ground_truth(
                    sol_val, ground_truth, sol_out
                )

            # Fallback: string comparison
            if sol_out.strip() == ground_truth.strip():
                return VerificationResult(
                    status=VerificationOutcome.SUCCESS,
                    result=sol_out,
                )
            return VerificationResult(
                status=VerificationOutcome.FAILURE,
                result=sol_out,
                error_message="Fallback string mismatch: "
                f"'{sol_out}' != '{ground_truth}'",
            )
        except asyncio.TimeoutError:
            return VerificationResult(
                status=VerificationOutcome.TIMEOUT,
                result="",
                error_message="Execution timed out.",
            )
        except Exception as e:
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message=f"Unexpected error: {e}",
            )

    def _compare_to_ground_truth(
        self, sol_val: object, ground_truth: str, result: str
    ) -> VerificationResult:
        r"""Compare an evaluated solution value with the ground truth.

        Args:
            sol_val (object): The already-evaluated solution value.
            ground_truth (str): The expected value as a Python literal.
            result (str): The text to report as the result on success or
                mismatch.

        Returns:
            VerificationResult: SUCCESS when the values are equal, FAILURE
                when they differ, ERROR when the ground truth cannot be
                evaluated.
        """
        try:
            gt_val = ast.literal_eval(ground_truth)
        except Exception as e:
            return VerificationResult(
                status=VerificationOutcome.ERROR,
                result="",
                error_message=f"Ground truth evaluation error: {e}",
            )

        if sol_val == gt_val:
            return VerificationResult(
                status=VerificationOutcome.SUCCESS,
                result=result,
            )
        return VerificationResult(
            status=VerificationOutcome.FAILURE,
            result=result,
            error_message=f"Output mismatch: {sol_val} != {gt_val}",
        )

    async def _run_code_block(
        self, code: str, venv_path: str
    ) -> Tuple[str, str, int]:
        r"""Executes a block of Python code in the virtual environment.

        The code is written to a temporary file, executed using the Python
        interpreter from the specified virtual environment, and its output
        and error streams are captured. The temporary file is always removed,
        and a child process that is still running when this method exits
        (for example after a timeout) is killed.

        Args:
            code (str): The Python code to execute.
            venv_path (str): The path to the virtual environment's Python
                binary.

        Returns:
            Tuple[str, str, int]: A tuple containing the stdout output,
                stderr output, and return code from the executed script.

        Raises:
            asyncio.TimeoutError: If the script does not finish within the
                configured timeout.
        """
        # No longer checking for expressions since they're handled separately
        # Python reads source files as UTF-8 by default, so write the script
        # in that encoding regardless of the locale.
        tmp = tempfile.NamedTemporaryFile(
            "w+", suffix=".py", delete=False, encoding="utf-8"
        )
        tmp_path = tmp.name
        proc = None
        try:
            with tmp:
                tmp.write(code)

            proc = await asyncio.create_subprocess_exec(
                venv_path,
                tmp_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(), timeout=self._timeout
            )
            return (
                stdout.decode().strip(),
                stderr.decode().strip(),
                proc.returncode if proc.returncode is not None else -1,
            )
        finally:
            # Stop a child that outlived the timeout before deleting its
            # script, so neither the process nor the file is leaked.
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass
                await proc.wait()
            try:
                os.remove(tmp_path)
            except OSError as e:
                logger.warning(
                    f"Could not remove temporary file {tmp_path}: {e}"
                )

    def _is_expression(self, code: str) -> bool:
        r"""Determines whether a given string of code is a single expression.

        This utility uses Python's AST module to parse the code and checks if
        it consists of a single expression node. A top-level function call
        (such as ``print(...)``) is deliberately not treated as an
        expression.

        Args:
            code (str): The Python code to analyze.

        Returns:
            bool: True if the code is a single expression, False otherwise.
        """
        # Skip empty or whitespace-only strings
        if not code or code.isspace():
            return False

        try:
            # First try parsing as an expression - this is more reliable than
            # starting with literal_eval
            tree = ast.parse(code.strip(), mode='eval')
        except SyntaxError:
            # If parsing as expression fails, it's not a valid expression
            return False
        except Exception:
            # For any other parsing errors, try literal_eval as fallback for
            # simple literals
            try:
                ast.literal_eval(code)
                return True
            except Exception:
                return False

        # Function calls (like print()) should not be treated as
        # expressions; anything else that parsed in 'eval' mode is one.
        return not isinstance(tree.body, ast.Call)