From fc7768ebb71970d3e47596e46c37e38a31d3b7dd Mon Sep 17 00:00:00 2001 From: hobostay Date: Wed, 11 Mar 2026 14:58:28 +0800 Subject: [PATCH] Fix multiple bugs and code quality issues ## Summary This PR addresses several bugs and improves code quality across multiple files. ## Changes ### 1. Fix pass@k calculation edge case (`pass_at_k_statistic.py`) - **Bug**: When `c=0` (no successes) and `n < k`, the `(n - c) < k` shortcut returned `1.0` incorrectly (for `n >= k` the product formula already yielded `0.0`) - **Fix**: Added an explicit check to return `0.0` when there are no successful attempts - **Impact**: Prevents incorrect scoring when all test attempts fail and fewer than `k` samples were drawn ### 2. Fix dependency configuration (`pyproject.toml`) - **Bug**: `numpy` and `pandas` were in dev dependencies but used in production code - **Fix**: Moved them to main dependencies, relaxing the exact pins (`==2.3.2`, `==2.3.1`) to minimum-version constraints (`>=2.0.0`) - **Impact**: Fixes runtime errors for users who only install main dependencies ### 3. Improve exception handling and logging - `llm_manager.py`: Log a warning when closing an LLM instance fails - `testcase.py`: Add debug logging for XML parsing failures - `logger_utils.py`: Add stderr output for handler exceptions - **Impact**: Better debugging capabilities when errors occur ### 4. 
Code quality improvements - `rate_limiter.py`: Use explicit `return` instead of `pass` in `__aexit__` - `openai.py` & `openai_responses.py`: Replace "TODO" with professional default system prompt - **Impact**: More maintainable and professional code Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 6 +++--- sec_code_bench/llm/llm_manager.py | 5 ++++- sec_code_bench/llm/openai.py | 2 +- sec_code_bench/llm/openai_responses.py | 4 ++-- sec_code_bench/statistic/pass_at_k_statistic.py | 2 ++ sec_code_bench/utils/logger_utils.py | 4 +++- sec_code_bench/utils/rate_limiter.py | 2 +- sec_code_bench/utils/testcase.py | 3 ++- 8 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfb7b779..0d4cf71d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,9 @@ dependencies = [ "uvicorn[standard]==0.24.0", "websocket-client==1.8.0", "tqdm==4.67.1", - "psutil==7.0.0" + "psutil==7.0.0", + "numpy>=2.0.0", + "pandas>=2.0.0", ] [tool.pytest.ini_options] @@ -27,8 +29,6 @@ dev = [ "pytest>=8.4.1", "ruff>=0.12.2", "aiofiles==24.1.0", - "numpy==2.3.2", - "pandas==2.3.1" ] [tool.ruff] diff --git a/sec_code_bench/llm/llm_manager.py b/sec_code_bench/llm/llm_manager.py index 266d6d54..14272643 100644 --- a/sec_code_bench/llm/llm_manager.py +++ b/sec_code_bench/llm/llm_manager.py @@ -121,4 +121,7 @@ def _safe_close_instance(self, name: str, instance: LLMBase) -> None: name: Name of the instance instance: Instance to close """ - return instance.sync_close() + try: + instance.sync_close() + except Exception as e: + LOG.warning(f"Error closing LLM instance '{name}': {e}") diff --git a/sec_code_bench/llm/openai.py b/sec_code_bench/llm/openai.py index 2690ca1b..fda61b74 100644 --- a/sec_code_bench/llm/openai.py +++ b/sec_code_bench/llm/openai.py @@ -129,7 +129,7 @@ async def _aquery_implementation( "messages": [ { "role": "system", - "content": sys_prompt if sys_prompt else "TODO", + "content": sys_prompt if sys_prompt else "You are a helpful 
programming assistant.", }, { "role": "user", diff --git a/sec_code_bench/llm/openai_responses.py b/sec_code_bench/llm/openai_responses.py index 8753ce4d..9afd12e3 100644 --- a/sec_code_bench/llm/openai_responses.py +++ b/sec_code_bench/llm/openai_responses.py @@ -35,8 +35,8 @@ LOG = Logger.get_logger(__name__) -# System instruction placeholder (aligned with openai.py chat completion) -DEFAULT_INSTRUCTIONS = "TODO" +# Default system instructions for Responses API (aligned with openai.py chat completion) +DEFAULT_INSTRUCTIONS = "You are a helpful programming assistant." def _response_output_text(response: Response) -> str: diff --git a/sec_code_bench/statistic/pass_at_k_statistic.py b/sec_code_bench/statistic/pass_at_k_statistic.py index 12fe21bc..ef93d055 100644 --- a/sec_code_bench/statistic/pass_at_k_statistic.py +++ b/sec_code_bench/statistic/pass_at_k_statistic.py @@ -82,6 +82,8 @@ def pass_at_k(n: int, c: int, k: int) -> float: Returns: Calculated pass@k score """ + if c == 0: + return 0.0 if (n - c) < k: return 1.0 return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) diff --git a/sec_code_bench/utils/logger_utils.py b/sec_code_bench/utils/logger_utils.py index 0e1bbb24..d48d57e6 100644 --- a/sec_code_bench/utils/logger_utils.py +++ b/sec_code_bench/utils/logger_utils.py @@ -46,7 +46,9 @@ def emit(self, record): if self._tqdm_instance: # Refresh tqdm after printing log self._tqdm_instance.refresh() - except Exception: + except Exception as e: + # Log the exception to help debug issues with tqdm compatibility + sys.stderr.write(f"Error in TqdmCompatibleHandler.emit: {e}\n") self.handleError(record) diff --git a/sec_code_bench/utils/rate_limiter.py b/sec_code_bench/utils/rate_limiter.py index 747acc95..788d25cf 100644 --- a/sec_code_bench/utils/rate_limiter.py +++ b/sec_code_bench/utils/rate_limiter.py @@ -124,4 +124,4 @@ async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: exc_val: Exception value if an exception was raised in the 
context. exc_tb: Exception traceback if an exception was raised in the context. """ - pass + return diff --git a/sec_code_bench/utils/testcase.py b/sec_code_bench/utils/testcase.py index 9f1267d7..eee23a74 100644 --- a/sec_code_bench/utils/testcase.py +++ b/sec_code_bench/utils/testcase.py @@ -148,7 +148,8 @@ def set_generated_code(self, cycle: int, scenario: TestScenario, code: str) -> N extracted_code = "\n".join(lines) break - except Exception: + except Exception as e: + LOG.debug(f"Failed to extract code from XML format, using raw response: {e}") extracted_code = code if cycle not in self.generated_code: