Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
65053c9
Update Python and mlflow dependencies in pyproject.toml
chakravarthik27 Mar 21, 2026
08851d3
fixed the lint issues.
chakravarthik27 Mar 21, 2026
f2cfc01
Update pyproject-flake8 dependency to version 7.0.0
chakravarthik27 Mar 21, 2026
50790c7
fix: add verbose flag to poetry install command for better logging
chakravarthik27 Mar 21, 2026
efd6362
updated the poetry.lock file
chakravarthik27 Mar 21, 2026
91fd6b8
fix: update scipy version to 1.17.1 for improved compatibility and se…
chakravarthik27 Mar 21, 2026
d2312a0
fix: update TestConfig structure to support in python 3.13 above vers…
chakravarthik27 Mar 21, 2026
97d58bd
fix: refactor TestConfig structure in fairness and robustness modules
chakravarthik27 Mar 21, 2026
d513d93
fix: refactor TypedDict definitions in representation classes
chakravarthik27 Mar 21, 2026
d3453bd
fix: update TypedDict definitions for parameters and TestConfig in ro…
chakravarthik27 Mar 21, 2026
f1e558e
fix: update TestConfig structure
chakravarthik27 Mar 21, 2026
b2fc2c5
fix: update Python version matrix in build workflow
chakravarthik27 Mar 21, 2026
e16abdf
fix: update type hints for examples and singleton instance in PromptC…
chakravarthik27 Mar 21, 2026
f8bb784
fix: update Python version matrix and refactor model validators in pr…
chakravarthik27 Mar 21, 2026
b4e0df6
updated: spacy version to 3.8.11 in pyproject.toml
chakravarthik27 Mar 21, 2026
4ffbd84
chore(dependencies): update Python and package versions in pyproject.…
chakravarthik27 Mar 21, 2026
a9df98e
fix: remove unnecessary blank lines in modelhandler initialization
chakravarthik27 Mar 21, 2026
ee79441
fix: configure poetry to use only binary installers and clean up pysp…
chakravarthik27 Mar 21, 2026
7cea6a9
fix: update markers for spacy dependencies in poetry.lock
chakravarthik27 Mar 21, 2026
bff938f
updated: poetry.lock file
chakravarthik27 Mar 21, 2026
f7204fe
fix: remove local installer binary configuration from poetry install …
chakravarthik27 Mar 21, 2026
44e8ce7
Update dependencies in pyproject.toml
chakravarthik27 Mar 22, 2026
8cd8f00
fix: update task types for model compatibility in transformers and gr…
chakravarthik27 Mar 22, 2026
8dc0d80
fix: update task type from text2text-generation to text-generation in…
chakravarthik27 Mar 22, 2026
0e27db0
fix: update Python version matrix and adjust model loading for text g…
chakravarthik27 Mar 22, 2026
ccff538
fix: refactor architecture check for task compatibility in HuggingFac…
chakravarthik27 Mar 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,14 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.9","3.10", "3.11" ]
python-version: ["3.12", "3.13"]

steps:
- name: Free up disk space at start
run: |
sudo apt clean
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h

- uses: actions/checkout@v3
Expand Down Expand Up @@ -53,7 +52,7 @@ jobs:
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: |
poetry cache clear pypi --all -n > /dev/null
poetry install --with dev --all-extras --no-cache --quiet --no-interaction
poetry install --with dev --all-extras --no-cache --no-interaction
source ./.venv/bin/activate && pip uninstall -y pyspark && rm -rf ./.venv/lib/python${{ matrix.python-version }}/site-packages/pyspark*/
pip install pyspark==3.5.6

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/llm_tests_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [ "3.8", "3.9", "3.10" ]
python-version: [ "3.12", "3.13" ]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.9]
python-version: [3.12]
poetry-version: [2.1.3]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
Expand Down
2 changes: 1 addition & 1 deletion langtest/datahandler/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ def load_raw_data(self, standardize_columns: bool = False) -> List[Dict]:
parsed CSV file into list of dicts
"""

if type(self._file_path) == dict:
if isinstance(self._file_path, dict):
df = pd.read_csv(self._file_path["data_source"])

if self.task == "text-classification":
Expand Down
2 changes: 1 addition & 1 deletion langtest/langtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def configure(self, config: Union[HarnessConfig, dict, str]) -> HarnessConfig:
Returns:
dict: Loaded configuration.
"""
if type(config) == dict:
if isinstance(config, dict):
self._config = config
else:
with open(config, "r", encoding="utf-8") as yml:
Expand Down
35 changes: 16 additions & 19 deletions langtest/metrics/llm_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,25 +66,22 @@ def build_prompt(
f"""\n\nScore the student answer based on the following criteria:\n{eval_criteria}"""
)

prompt += dedent(
f"""
Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: {grade_list} here

{
("Grade the student answers based ONLY on their factual accuracy. Ignore differences"
" in punctuation and phrasing between the student answer and true answer. It is OK "
"if the student answer contains more or relevant information than the true answer, as"
" long as it does not contain any conflicting statements. Begin!")
}

QUESTION: {{query}}
STUDENT ANSWER: {{result}}
TRUE ANSWER: {{answer}}
GRADE:"""
prompt += (
"Example Format:\n"
"QUESTION: question here\n"
"STUDENT ANSWER: student's answer here\n"
"TRUE ANSWER: true answer here\n"
f"GRADE: {grade_list} here"
"\n\n"
"Grade the student answers based ONLY on their factual accuracy. Ignore differences"
" in punctuation and phrasing between the student answer and true answer. It is OK "
"if the student answer contains more or relevant information than the true answer, as"
" long as it does not contain any conflicting statements. Begin!"
"\n\n"
"QUESTION: {{query}}\n"
"STUDENT ANSWER: {{result}}\n"
"TRUE ANSWER: {{answer}}\n"
"GRADE:\n"
)
return prompt

Expand Down
4 changes: 2 additions & 2 deletions langtest/modelhandler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@


if "langchain" in INSTALLED_HUBS:
import langchain
from langtest.modelhandler.utils import MODEL_CLASSES

LANGCHAIN_HUBS = {
(
RENAME_HUBS.get(hub.lower(), hub.lower())
if hub.lower() in RENAME_HUBS
else hub.lower()
): hub
for hub in langchain.llms.__all__
for hub in list(MODEL_CLASSES.keys())
}
LANGCHAIN_HUBS["openrouter"] = "openrouter"

Expand Down
4 changes: 2 additions & 2 deletions langtest/modelhandler/llm_modelhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import os
from typing import Any, List, Type, Union, TypeVar
import langchain.llms as lc
import langchain_classic.llms as lc
import langchain.chat_models as chat_models
from langchain.chains.llm import LLMChain
from langchain_classic.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_core.language_models.base import BaseLanguageModel
from langchain_core.exceptions import OutputParserException
Expand Down
6 changes: 3 additions & 3 deletions langtest/modelhandler/modelhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
}

if try_import_lib("langchain"):
import langchain
import langchain.llms

from langtest.modelhandler.utils import MODEL_CLASSES

LANGCHAIN_HUBS = {
(
RENAME_HUBS.get(hub.lower(), hub.lower())
if hub.lower() in RENAME_HUBS
else hub.lower()
): hub
for hub in langchain.llms.__all__
for hub in list(MODEL_CLASSES.keys())
}
LANGCHAIN_HUBS["openrouter"] = "openrouter"
else:
Expand Down
3 changes: 1 addition & 2 deletions langtest/modelhandler/transformers_modelhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def load_model(cls, path: str, *args, **kwargs) -> "Pipeline":
tgt_lang = config.get("target_language") or kwargs.get("target_language")

if "t5" in path:
return cls(pipeline(f"translation_en_to_{tgt_lang}", model=path))
return cls(pipeline("text-generation", model=path))
else:
return cls(pipeline(model=path, src_lang="en", tgt_lang=tgt_lang))

Expand Down Expand Up @@ -746,7 +746,6 @@ def load_model(cls, path: str, **kwargs):
task = filtered_kwargs.pop("task", None)
tasks = [
"text-generation",
"text2text-generation",
"summarization",
] # Add more tasks if needed

Expand Down
1 change: 0 additions & 1 deletion langtest/modelhandler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class Info(TypedDict):
"gigachat": "GigaChat",
"google_palm": "ChatGooglePalm",
"gpt_router": "GPTRouter",
"huggingface": "ChatHuggingFace",
"human": "HumanInputChatModel",
"hunyuan": "ChatHunyuan",
"javelin_ai_gateway": "ChatJavelinAIGateway",
Expand Down
47 changes: 25 additions & 22 deletions langtest/prompts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from collections import defaultdict
from typing import Dict, List, Union
from typing import Any, Dict, List, Optional, Union, ClassVar

from pydantic.v1 import BaseModel, validator, Extra, ConfigDict
from pydantic import BaseModel, ConfigDict, model_validator


class MessageType(BaseModel):
__field_order: List[str] = [
__field_order: ClassVar[List[str]] = [
"content",
"context",
"question",
Expand All @@ -15,14 +15,15 @@ class MessageType(BaseModel):
"answer",
]

model_config = ConfigDict(extra=Extra.allow)
model_config = ConfigDict(extra="allow")

@validator("*", pre=True, allow_reuse=True)
def add_field(cls, v, values, field, **kwargs):
if "fields" not in values:
values["fields"] = []
values["fields"].append(field)
return v
@model_validator(mode="before")
@classmethod
def add_field(cls, data: Any):
    """Record the names of the supplied keys under ``fields`` before validation.

    Args:
        data: Raw input handed to the model. Only ``dict`` input is
            rewritten; any other value is returned unchanged.

    Returns:
        A shallow copy of ``data`` whose ``"fields"`` entry lists every
        other key in insertion order, or ``data`` itself when it is not
        a dict.
    """
    if not isinstance(data, dict):
        return data
    # Work on a copy so the caller's dict is never mutated.
    data = dict(data)
    # Any incoming "fields" value is overwritten; it is never listed itself.
    data["fields"] = [key for key in data if key != "fields"]
    return data

@property
def get_template(self):
Expand Down Expand Up @@ -70,14 +71,16 @@ class Conversion(BaseModel):
user: MessageType
ai: MessageType

model_config = ConfigDict(extra=Extra.allow)
model_config = ConfigDict(extra="allow")

@validator("*", pre=True, allow_reuse=True)
def add_field(cls, v, values, field, **kwargs):
if "fields" not in values:
values["fields"] = []
values["fields"].append(field)
return v
@model_validator(mode="before")
@classmethod
def add_field(cls, data: Any):
    """Attach a ``fields`` list naming every supplied key to dict input.

    Non-dict input is handed back untouched. For dict input a new dict is
    returned (the original is not mutated) in which ``"fields"`` holds the
    names of all other keys, in their original order.
    """
    if not isinstance(data, dict):
        return data
    supplied = [name for name in data if name != "fields"]
    # Build a fresh dict; an existing "fields" entry is replaced in place.
    return {**data, "fields": supplied}

@property
def get_examples(self):
Expand All @@ -93,7 +96,7 @@ def get_suffix_user(self):
class PromptConfig(BaseModel):
instructions: str
prompt_type: str
examples: Union[Conversion, List[Conversion]] = None
examples: Optional[Union[Conversion, List[Conversion]]] = None

@property
def get_examples(self) -> List[dict]:
Expand Down Expand Up @@ -126,7 +129,7 @@ def get_input_variables(self):
def prompt_style(self):
"""Generate a prompt based on the prompt type."""
if self.prompt_type in ["chat", "instruct"]:
from langchain.prompts import (
from langchain_core.prompts import (
ChatPromptTemplate,
FewShotChatMessagePromptTemplate,
)
Expand All @@ -149,7 +152,7 @@ def prompt_style(self):
return final_prompt

elif self.prompt_type == "completion":
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

template = "".join(v for _, v in self.get_template)
template = f"{template.replace('Answer:', '')}"
Expand Down Expand Up @@ -208,9 +211,9 @@ def lm_studio_prompt(self):


class PromptManager:
_instance = None
_instance: "PromptManager" = None
prompt_configs: Dict[str, PromptConfig] = defaultdict(PromptConfig)
_default_state = None
_default_state: str = None

def __new__(cls, *args, **kwargs):
if cls._instance is None:
Expand Down
15 changes: 11 additions & 4 deletions langtest/transform/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,8 @@ class BaseAccuracy(ABC):

TestConfig = TypedDict(
"TestConfig",
min_score=Union[Dict[str, float], float],
# Fields are passed as a dict: the keyword-argument form of TypedDict was removed in Python 3.13.
{"min_score": Union[Dict[str, float], float]},
)

@classmethod
Expand Down Expand Up @@ -1029,9 +1030,15 @@ class LLMEval(BaseAccuracy):

TestConfig = TypedDict(
"TestConfig",
model=str,
hub=str,
min_score=float,
# Fields are passed as a dict: the keyword-argument form of TypedDict was removed in Python 3.13.
{
"model": str,
"hub": str,
"model_parameters": dict,
"min_score": float,
},
)

@classmethod
Expand Down
4 changes: 2 additions & 2 deletions langtest/transform/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ async def async_run(
category_output = all_categories[each].run(
values, model_handler, progress_bar=tests, **kwargs
)
if type(category_output) == list:
if isinstance(category_output, list):
all_results.extend(category_output)
else:
all_results.append(category_output)
Expand Down Expand Up @@ -264,7 +264,7 @@ def run(
if len(test_name.split("-")) > 1:
test_name = "multiple_perturbations"
test_output = supported_tests[test_name].async_run(samples, model, **kwargs)
if type(test_output) == list:
if isinstance(test_output, list):
tasks.extend(test_output)
else:
tasks.append(test_output)
Expand Down
2 changes: 1 addition & 1 deletion langtest/transform/bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ class BaseBias(ABC):
]

# Config Hint for the bias tests
TestConfig = TypedDict("TestConfig", min_pass_rate=float)
TestConfig = TypedDict("TestConfig", {"min_pass_rate": float})

@abstractmethod
def transform(self, sample_list: List[Sample], *args, **kwargs) -> List[Sample]:
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ class BaseClinical(ABC):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# Fields are passed as a dict: the keyword-argument form of TypedDict was removed in Python 3.13.
{"min_pass_rate": float},
)

@staticmethod
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/disinformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ class DisinformationTestFactory(ITests):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# Fields are passed as a dict: the keyword-argument form of TypedDict was removed in Python 3.13.
{"min_pass_rate": float},
)

def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:
Expand Down
3 changes: 2 additions & 1 deletion langtest/transform/factuality.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class FactualityTestFactory(ITests):
# TestConfig
TestConfig = TypedDict(
"TestConfig",
min_pass_rate=float,
# Fields are passed as a dict: the keyword-argument form of TypedDict was removed in Python 3.13.
{"min_pass_rate": float},
)

def __init__(self, data_handler: List[Sample], tests: Dict = None, **kwargs) -> None:
Expand Down
Loading
Loading