alibaba · hobostay · Mar 11, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,9 @@ dependencies = [
     "uvicorn[standard]==0.24.0",
     "websocket-client==1.8.0",
     "tqdm==4.67.1",
-    "psutil==7.0.0"
+    "psutil==7.0.0",
+    "numpy>=2.0.0",
+    "pandas>=2.0.0",
 ]
 
 [tool.pytest.ini_options]
@@ -27,8 +29,6 @@ dev = [
     "pytest>=8.4.1",
     "ruff>=0.12.2",
     "aiofiles==24.1.0",
-    "numpy==2.3.2",
-    "pandas==2.3.1"
 ]
 
 [tool.ruff]

diff --git a/sec_code_bench/llm/llm_manager.py b/sec_code_bench/llm/llm_manager.py
@@ -121,4 +121,7 @@ def _safe_close_instance(self, name: str, instance: LLMBase) -> None:
             name: Name of the instance
             instance: Instance to close
         """
-        return instance.sync_close()
+        try:
+            instance.sync_close()
+        except Exception as e:
+            LOG.warning(f"Error closing LLM instance '{name}': {e}")
diff --git a/sec_code_bench/llm/openai.py b/sec_code_bench/llm/openai.py
@@ -129,7 +129,7 @@ async def _aquery_implementation(
                 "messages": [
                     {
                         "role": "system",
-                        "content": sys_prompt if sys_prompt else "TODO",
+                        "content": sys_prompt if sys_prompt else "You are a helpful programming assistant.",
                     },
                     {
                         "role": "user",

diff --git a/sec_code_bench/llm/openai_responses.py b/sec_code_bench/llm/openai_responses.py
@@ -35,8 +35,8 @@
 
 LOG = Logger.get_logger(__name__)
 
-# System instruction placeholder (aligned with openai.py chat completion)
-DEFAULT_INSTRUCTIONS = "TODO"
+# Default system instructions for Responses API (aligned with openai.py chat completion)
+DEFAULT_INSTRUCTIONS = "You are a helpful programming assistant."
 
 
 def _response_output_text(response: Response) -> str:

diff --git a/sec_code_bench/statistic/pass_at_k_statistic.py b/sec_code_bench/statistic/pass_at_k_statistic.py
@@ -82,6 +82,8 @@ def pass_at_k(n: int, c: int, k: int) -> float:
     Returns:
         Calculated pass@k score
     """
+    if c == 0:
+        return 0.0
     if (n - c) < k:
         return 1.0
     return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

diff --git a/sec_code_bench/utils/logger_utils.py b/sec_code_bench/utils/logger_utils.py
@@ -46,7 +46,9 @@ def emit(self, record):
             if self._tqdm_instance:
                 # Refresh tqdm after printing log
                 self._tqdm_instance.refresh()
-        except Exception:
+        except Exception as e:
+            # Log the exception to help debug issues with tqdm compatibility
+            sys.stderr.write(f"Error in TqdmCompatibleHandler.emit: {e}\n")
             self.handleError(record)
 
 

diff --git a/sec_code_bench/utils/rate_limiter.py b/sec_code_bench/utils/rate_limiter.py
@@ -124,4 +124,4 @@ async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
             exc_val: Exception value if an exception was raised in the context.
             exc_tb: Exception traceback if an exception was raised in the context.
         """
-        pass
+        return
diff --git a/sec_code_bench/utils/testcase.py b/sec_code_bench/utils/testcase.py
@@ -148,7 +148,8 @@ def set_generated_code(self, cycle: int, scenario: TestScenario, code: str) -> N
 
                     extracted_code = "\n".join(lines)
                     break
-        except Exception:
+        except Exception as e:
+            LOG.debug(f"Failed to extract code from XML format, using raw response: {e}")
             extracted_code = code
 
         if cycle not in self.generated_code:
-Original file line number
+Diff line change
@@ Expand Up @@
                 exc_val: Exception value if an exception was raised in the context.
                 exc_tb: Exception traceback if an exception was raised in the context.
             """
-            pass
+            return