Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ dependencies = [
"uvicorn[standard]==0.24.0",
"websocket-client==1.8.0",
"tqdm==4.67.1",
"psutil==7.0.0"
"psutil==7.0.0",
"numpy>=2.0.0",
"pandas>=2.0.0",
]

[tool.pytest.ini_options]
Expand All @@ -27,8 +29,6 @@ dev = [
"pytest>=8.4.1",
"ruff>=0.12.2",
"aiofiles==24.1.0",
"numpy==2.3.2",
"pandas==2.3.1"
]

[tool.ruff]
Expand Down
5 changes: 4 additions & 1 deletion sec_code_bench/llm/llm_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,7 @@ def _safe_close_instance(self, name: str, instance: LLMBase) -> None:
name: Name of the instance
instance: Instance to close
"""
return instance.sync_close()
try:
instance.sync_close()
except Exception as e:
LOG.warning(f"Error closing LLM instance '{name}': {e}")
2 changes: 1 addition & 1 deletion sec_code_bench/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ async def _aquery_implementation(
"messages": [
{
"role": "system",
"content": sys_prompt if sys_prompt else "TODO",
"content": sys_prompt if sys_prompt else "You are a helpful programming assistant.",
},
{
"role": "user",
Expand Down
4 changes: 2 additions & 2 deletions sec_code_bench/llm/openai_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@

LOG = Logger.get_logger(__name__)

# System instruction placeholder (aligned with openai.py chat completion)
DEFAULT_INSTRUCTIONS = "TODO"
# Default system instructions for Responses API (aligned with openai.py chat completion)
DEFAULT_INSTRUCTIONS = "You are a helpful programming assistant."


def _response_output_text(response: Response) -> str:
Expand Down
2 changes: 2 additions & 0 deletions sec_code_bench/statistic/pass_at_k_statistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def pass_at_k(n: int, c: int, k: int) -> float:
Returns:
Calculated pass@k score
"""
if c == 0:
return 0.0
if (n - c) < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
Expand Down
4 changes: 3 additions & 1 deletion sec_code_bench/utils/logger_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def emit(self, record):
if self._tqdm_instance:
# Refresh tqdm after printing log
self._tqdm_instance.refresh()
except Exception:
except Exception as e:
# Log the exception to help debug issues with tqdm compatibility
sys.stderr.write(f"Error in TqdmCompatibleHandler.emit: {e}\n")
self.handleError(record)


Expand Down
2 changes: 1 addition & 1 deletion sec_code_bench/utils/rate_limiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
exc_val: Exception value if an exception was raised in the context.
exc_tb: Exception traceback if an exception was raised in the context.
"""
pass
return
3 changes: 2 additions & 1 deletion sec_code_bench/utils/testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ def set_generated_code(self, cycle: int, scenario: TestScenario, code: str) -> N

extracted_code = "\n".join(lines)
break
except Exception:
except Exception as e:
LOG.debug(f"Failed to extract code from XML format, using raw response: {e}")
extracted_code = code

if cycle not in self.generated_code:
Expand Down