diff --git a/docs/api/linear.rst b/docs/api/linear.rst index 888db896..1b087948 100644 --- a/docs/api/linear.rst +++ b/docs/api/linear.rst @@ -101,3 +101,18 @@ Grid Search with Sklearn Estimators :members: .. automethod:: __init__ + +Grid Search with Tree-Based Linear Method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: GridParameter + :members: + + .. automethod:: __init__ + +.. autoclass:: GridSearch + :members: + + .. automethod:: __init__ + + .. automethod:: __call__ \ No newline at end of file diff --git a/docs/examples/plot_linear_gridsearch_tutorial.py b/docs/examples/plot_linear_gridsearch_tutorial.py index d1a239e7..01c1e069 100644 --- a/docs/examples/plot_linear_gridsearch_tutorial.py +++ b/docs/examples/plot_linear_gridsearch_tutorial.py @@ -1,7 +1,14 @@ """ -Hyperparameter Search for Linear Methods +Hyperparameter Search for One-vs-rest Linear Methods ============================================================= +.. warning:: + + If you are using the tree-based linear method, + please check `Hyperparameter Search for Tree-Based Linear Method <../auto_examples/plot_tree_gridsearch_tutorial.html>`_. + This guide helps users to tune the hyperparameters of the feature generation step and the linear model. +In this guide, the following methods are available: +``1vsrest``, ``thresholding``, ``cost_sensitive``, ``cost_sensitive_micro``, and ``binary_and_multiclass``. Here we show an example of tuning a linear text classifier with the `rcv1 dataset `_. Starting with loading and preprocessing of the data without using ``Preprocessor``: diff --git a/docs/examples/plot_tree_gridsearch_tutorial.py b/docs/examples/plot_tree_gridsearch_tutorial.py new file mode 100644 index 00000000..2875db52 --- /dev/null +++ b/docs/examples/plot_tree_gridsearch_tutorial.py @@ -0,0 +1,131 @@ +""" +Hyperparameter Search for Tree-Based Linear Method +============================================================= +.. 
warning:: + + If you are using the one-vs-rest linear methods, + please check `Hyperparameter Search for One-vs-rest Linear Methods <../auto_examples/plot_linear_gridsearch_tutorial.html>`_. + +To apply tree-based linear methods, +we first convert raw text into numerical BoW features. +During training, the method builds a label tree and trains classifiers. +At inference, the model traverses the tree to make predictions. +Each stage involves multiple hyperparameters that can be tuned to improve model performance. + +In this guide, we help users tune the hyperparameters of the tree-based linear method. + +.. seealso:: + + `Implementation Document `_: + For more details about the implementation of tree-based linear methods. + +Here we show an example of tuning a tree-based linear text classifier with the `rcv1 dataset `_. +Starting with loading the data: +""" + +import logging + +from libmultilabel import linear + +logging.basicConfig(level=logging.INFO) + +datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt") +L = len(datasets["train"]["y"]) + +###################################################################### +# Next, we set up the search space. + +import numpy as np + +dmax = 10 +K_factors = [-2, 5] +search_space_dict = { + "ngram_range": [(1, 1), (1, 2), (1, 3)], + "stop_words": ["english"], + "dmax": [dmax], + "K": [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors], + "s": [1], + "c": [0.5, 1, 2], + "B": [1], + "beam_width": [10], +} + +###################################################################### +# Following the suggestions in this `paper `__, +# we define 18 configurations to build a simple yet strong baseline. +# +# The search space covers several key parts of the pipeline: +# +# - Text feature extraction: (``ngram_range``, ``stop_words``) +# +# - We use the vectorizer ``TfidfVectorizer`` from ``sklearn`` to generate features from raw text. 
+# +# - Label tree structure: (``dmax``, ``K``) +# +# - Note that ``K`` is the number of clusters and is calculated using the formula from the paper. +# +# - Linear classifier: (``s``, ``c``, ``B``) +# +# - We combined them into a LIBLINEAR option string. (see *train Usage* in `liblinear `__ README) +# +# - Prediction: (``beam_width``) +# +# .. tip:: +# +# Available hyperparameters (and their defaults) are defined in the class variables of :py:class:`~libmultilabel.linear.GridParameter`. +# +# We implement the entire search process in linear.GridSearch. +# Initialize it with the dataset, the number of cross-validation folds, +# and the evaluation metrics to monitor. + +n_folds = 3 +monitor_metrics = ["P@1", "P@3", "P@5"] +search = linear.GridSearch(datasets, n_folds, monitor_metrics) +cv_scores = search(search_space_dict) + +###################################################################### +# The returned scores are a ``dict`` whose keys are ``linear.GridParameter`` instances from the search space, +# and whose values are the scores for ``monitor_metrics``. +# +# Here we sort the results in descending order by the first metric in ``monitor_metrics``. +# You can retrieve the best parameters after the grid search with the following code: + +sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True) +print(sorted_cv_scores) + +best_params, best_cv_scores = list(sorted_cv_scores)[0] +print(best_params, best_cv_scores) + +###################################################################### +# The best parameters are:: +# +# {'s': 1, 'c': 0.5, 'ngram_range': (1, 2), 'stop_words': 'english', 'dmax': 10, 'K': 88, 'beam_width': 10} +# +# We can then retrain using the best parameters, +# and use ``linear.GridSearch.compute_scores`` and ``linear.get_metrics`` to compute test performance. 
+ +from dataclasses import asdict + +preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf)) +transformed_dataset = preprocessor.fit_transform(datasets) + +model = linear.train_tree( + transformed_dataset["train"]["y"], + transformed_dataset["train"]["x"], + best_params.linear_options, + **asdict(best_params.tree), +) + +metrics = linear.GridSearch.compute_scores( + transformed_dataset["test"]["y"], + transformed_dataset["test"]["x"], + model, + best_params, + {best_params: linear.get_metrics(monitor_metrics, num_classes=-1)}, +) +print(metrics[best_params].compute()) + +###################################################################### +# The result of the best parameters will look similar to:: +# +# {'P@1': 0.8100209275981901, 'P@3': 0.7310622302718446, 'P@5': 0.5290965293466371} diff --git a/docs/search_retrain.rst b/docs/search_retrain.rst index 26acfccb..9242b8ce 100644 --- a/docs/search_retrain.rst +++ b/docs/search_retrain.rst @@ -7,4 +7,5 @@ Hyperparameter Search ../auto_examples/plot_linear_gridsearch_tutorial + ../auto_examples/plot_tree_gridsearch_tutorial tutorials/Parameter_Selection_for_Neural_Networks diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py index 7f1ce851..f8411b68 100644 --- a/libmultilabel/linear/tree.py +++ b/libmultilabel/linear/tree.py @@ -4,6 +4,7 @@ import numpy as np import scipy.sparse as sparse +from scipy.special import log_expit from sparsekmeans import LloydKmeans, ElkanKmeans import sklearn.preprocessing from tqdm import tqdm @@ -58,16 +59,35 @@ def __init__( self.multiclass = False self._model_separated = False # Indicates whether the model has been separated for pruning tree. + def sigmoid_A(self, x: np.ndarray, prob_A: int) -> np.ndarray: + """ + Calculate log(sigmoid(prob_A * x)), which represents the log-probability of the positive class in binary classification. + + Args: + x (np.ndarray): The decision value matrix with dimension number of instances * number of classes. 
+ prob_A (int): + The hyperparameter used in the probability estimation function for + binary classification: sigmoid(prob_A * x). + + Returns: + np.ndarray: A matrix with dimension number of instances * number of classes. + """ + return log_expit(prob_A * x) + def predict_values( self, x: sparse.csr_matrix, beam_width: int = 10, + prob_A: int = 3, ) -> np.ndarray: """Calculate the probability estimates associated with x. Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. - beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10. + beam_width (int, optional): Number of candidates considered during beam search. + prob_A (int, optional): + The hyperparameter used in the probability estimation function for + binary classification: sigmoid(prob_A * decision_value_matrix). Returns: np.ndarray: A matrix with dimension number of instances * number of classes. @@ -81,8 +101,8 @@ def predict_values( if not self._model_separated: self._separate_model_for_pruning_tree() self._model_separated = True - all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels) - return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])]) + all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A) # number of instances * (number of labels + total number of metalabels) + return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])]) def _separate_model_for_pruning_tree(self): """ @@ -113,7 +133,7 @@ def _separate_model_for_pruning_tree(self): ) self.subtree_models.append(subtree_flatmodel) - def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray: + def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray: """Calculates the selective decision values associated 
with instances x by evaluating only the most relevant subtrees. Only subtrees corresponding to the top beam_width candidates from the root are evaluated, @@ -122,6 +142,9 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) Args: x (sparse.csr_matrix): A matrix with dimension number of instances * number of features. beam_width (int): Number of top candidate branches considered for prediction. + prob_A (int): + The hyperparameter used in the probability estimation function for + binary classification: sigmoid(prob_A * decision_value_matrix). Returns: np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels). @@ -132,7 +155,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) # Calculate root decision values and scores root_preds = linear.predict_values(self.root_model, x) - children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds)) + children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A) slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]] all_preds[slice] = root_preds @@ -159,12 +182,15 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) return all_preds - def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray: + def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray: """Predict with beam search using cached probability estimates for a single instance. Args: instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels. beam_width (int): Number of candidates considered. + prob_A (int): + The hyperparameter used in the probability estimation function for + binary classification: sigmoid(prob_A * decision_value_matrix). Returns: np.ndarray: A vector with dimension number of classes. 
@@ -182,7 +208,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra continue slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - children_score = score - np.square(np.maximum(0, 1 - pred)) + children_score = score + self.sigmoid_A(pred, prob_A) next_level.extend(zip(node.children, children_score.tolist())) cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width] @@ -193,7 +219,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra for node, score in cur_level: slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]] pred = instance_preds[slice] - scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred))) + scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A)) return scores @@ -204,6 +230,7 @@ def train_tree( K=DEFAULT_K, dmax=DEFAULT_DMAX, verbose: bool = True, + root: Node = None, ) -> TreeModel: """Train a linear model for multi-label data using a divide-and-conquer strategy. The algorithm used is based on https://github.com/xmc-aalto/bonsai. @@ -215,14 +242,16 @@ def train_tree( K (int, optional): Maximum degree of nodes in the tree. Defaults to 100. dmax (int, optional): Maximum depth of the tree. Defaults to 10. verbose (bool, optional): Output extra progress information. Defaults to True. + root (Node, optional): Pre-built tree root. Defaults to None. Returns: TreeModel: A model which can be used in predict_values. 
""" - label_representation = (y.T * x).tocsr() - label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) - root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) - root.is_root = True + if root is None: + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax) + root.is_root = True num_nodes = 0 # Both type(x) and type(y) are sparse.csr_matrix diff --git a/libmultilabel/linear/utils.py b/libmultilabel/linear/utils.py index 3324a896..7f1f3612 100644 --- a/libmultilabel/linear/utils.py +++ b/libmultilabel/linear/utils.py @@ -1,10 +1,15 @@ from __future__ import annotations import os +import sys +import math +import itertools +import logging import pathlib import pickle import re -from typing import Any +from typing import Any, Callable +from dataclasses import make_dataclass, field, fields, asdict import numpy as np import scipy.sparse as sparse @@ -12,12 +17,14 @@ import sklearn.model_selection import sklearn.pipeline import sklearn.utils +import sklearn.preprocessing import libmultilabel.linear as linear from .preprocessor import Preprocessor +from .tree import _build_tree -__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV"] +__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV", "GridParameter", "GridSearch"] LINEAR_TECHNIQUES = { @@ -143,3 +150,327 @@ def _set_singlecore_options(self, estimator, param_grid: dict): key = f"{name}__options" param_grid[key] = [f"{re.sub(regex, '', v)} -m 1" for v in param_grid[key]] return param_grid + + +# suppress inevitable outputs from sparsekmeans and sklearn preprocessors +class __silent__: + def __init__(self): + self.stderr = os.dup(2) + self.devnull = os.open(os.devnull, os.O_WRONLY) + + def __enter__(self): + os.dup2(self.devnull, 2) + 
self.stdout = sys.stdout + sys.stdout = open(os.devnull, "w") + + def __exit__(self, type, value, traceback): + os.dup2(self.stderr, 2) + os.close(self.devnull) + os.close(self.stderr) + sys.stdout.close() + sys.stdout = self.stdout + + +class GridParameter: + """A tree-based linear method hyperparameter class for GridSearch. + Transform the parameter dict into dataclass instances and handle the defaults. + + Args: + params (dict, optional): The keys are the parameter names, and the values are the parameter values. + """ + + _tfidf_fields = [ + ("ngram_range", tuple[int, int], field(default=(1, 1))), + ("max_features", int, field(default=None)), + ("min_df", float | int, field(default=1)), + ("stop_words", str | list, field(default=None)), + ("strip_accents", str | Callable, field(default=None)), + ("tokenizer", Callable, field(default=None)), + ] + _tree_fields = [ + ("dmax", int, field(default=10)), + ("K", int, field(default=8)), + ] + _linear_fields = [ + ("s", int, field(default=1)), + ("c", float, field(default=1)), + ("B", int, field(default=-1)), + ] + _predict_fields = [ + ("beam_width", int, field(default=10)), + ("prob_A", int, field(default=3)), + ] + + # set frozen=True to make instances hashable. + # set order=True to enable comparison operations. 
+ param_types = { + "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True), + "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True), + "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True), + "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True), + } + _param_field_names = { + param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items() + } + + def __init__(self, params: dict | None = None): + self.params = params or {} + + params_set = set(self.params) + for param_type, class_name in self.param_types.items(): + field_names = self._param_field_names[param_type] + filtered_keys = params_set & field_names + params_set -= field_names + + filtered_params = {k: self.params[k] for k in filtered_keys} + setattr(self, param_type, class_name(**filtered_params)) + + @property + def linear_options(self): + options = "" + for field_name in self._param_field_names["linear"]: + options += f" -{field_name} {getattr(self.linear, field_name)}" + return options.strip() + + def __repr__(self): # provide a readable string representation of the object + return str(self.params) + + def __eq__(self, other): # compare instance attributes to define equality. + return all(getattr(self, t) == getattr(other, t) for t in self.param_types) + + def __lt__(self, other): # define ordering for sorting. + # "<" for tuple is automatically lexicographic ordering + my_values = tuple(getattr(self, t) for t in self.param_types) + other_values = tuple(getattr(other, t) for t in self.param_types) + return my_values < other_values + + def __hash__(self): # make instances hashable for use as dict keys + return hash(tuple(getattr(self, t) for t in self.param_types)) + + +class GridSearch: + """Grid search the search space and find the best parameters for tree-based linear method. 
+ + Args: + datasets (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively. + The data has keys 'x' for input features and 'y' for labels. + n_folds (int, optional): The number of cross-validation folds. + monitor_metrics (list[str], optional): The evaluation metrics to monitor. + """ + + def __init__( + self, + datasets: dict[str, dict[str, list[str]]], + n_folds: int = 3, + monitor_metrics: list[str] = ["P@1", "P@3", "P@5"], + ): + self.datasets = datasets + self.n_folds = n_folds + self.monitor_metrics = monitor_metrics + + self._cached_params = GridParameter() + for param_type in self._cached_params.param_types: + setattr(self._cached_params, param_type, None) + self._cached_transformed_dataset = None + self._cached_tree_root = None + self._cached_fold_data = None + self._cached_model = None + self.no_cache = True + + self.num_instances = len(self.datasets["train"]["y"]) + + def get_fold_dataset(self, train_idx, valid_idx): + def take(data, idx): + if isinstance(data, list): + return [data[i] for i in idx] + else: + return data[idx] + + return { + "data_format": self.datasets["data_format"], + "train": { + "y": take(self.datasets["train"]["y"], train_idx), + "x": take(self.datasets["train"]["x"], train_idx), + }, + "test": { + "y": take(self.datasets["train"]["y"], valid_idx), + "x": take(self.datasets["train"]["x"], valid_idx), + }, + } + + def get_transformed_dataset( + self, dataset: dict[str, dict[str, list[str]]], params: GridParameter + ) -> dict[str, dict[str, sparse.csr_matrix]]: + """ + Get and cache the dataset for the given TF-IDF params. + If we have processed the coming params, return the cached dataset directly without computation. + + Args: + dataset (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively. + The data has keys 'x' for input features and 'y' for labels. + params (GridParameter): The params to build the dataset. 
+ + Returns: + dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset. + """ + tfidf_params = params.tfidf + self.no_cache = tfidf_params != self._cached_params.tfidf + if self.no_cache: + logging.info(f"TFIDF - Preprocessing: {tfidf_params}") + if self.datasets["data_format"] not in {"txt", "dataframe"}: + logging.info( + "Please make sure the data format is 'txt' or 'dataframe'. Otherwise, the TF-IDF parameters have no effect on the dataset." + ) + with __silent__(): + preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params)) + self._cached_params.tfidf = tfidf_params + self._cached_transformed_dataset = preprocessor.fit_transform(dataset) + else: + logging.info(f"TFIDF - Using cached data: {tfidf_params}") + + return self._cached_transformed_dataset + + def get_tree(self, y, x, params): + tree_params = params.tree + self.no_cache |= tree_params != self._cached_params.tree + if self.no_cache: + logging.info(f"Tree - Preprocessing: {tree_params}") + with __silent__(): + label_representation = (y.T * x).tocsr() + label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1) + self._cached_params.tree = tree_params + self._cached_tree_root = _build_tree( + label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params) + ) + self._cached_tree_root.is_root = True + else: + logging.info(f"Tree - Using cached data: {tree_params}") + + return self._cached_tree_root + + def get_model(self, y: sparse.csr_matrix, x: sparse.csr_matrix, params: GridParameter) -> linear.TreeModel: + """ + Get and cache the model for the given params. + If we have processed the coming params, return the cached model directly without computation. + + Args: + y (sparse.csr_matrix): The labels of the training data. + x (sparse.csr_matrix): The features of the training data. + params (GridParameter): The params to build the model. + + Returns: + linear.TreeModel: The model for the given params. 
+ """ + root = self.get_tree(y, x, params) + + linear_params = params.linear + + if self.no_cache or (linear_params != self._cached_params.linear): + logging.info(f"Model - Training: {linear_params}") + with __silent__(): + self._cached_params.linear = linear_params + self._cached_model = linear.train_tree( + y, + x, + root=root, + options=params.linear_options, + ) + else: + logging.info(f"Model - Using cached data: {linear_params}") + + return self._cached_model + + @staticmethod + def compute_scores( + y: sparse.csr_matrix, + x: sparse.csr_matrix, + model: linear.TreeModel, + params: GridParameter, + param_metrics: dict[str, linear.MetricCollection], + ) -> dict[str, linear.MetricCollection]: + """ + Update the metric values in param_metrics with y, x, and model. + + Args: + y (sparse.csr_matrix): The labels of the test data. + x (sparse.csr_matrix): The features of the test data. + model (linear.TreeModel): The trained model. + params (GridParameter): The params used to compute the scores. + param_metrics (dict[str, linear.MetricCollection]): The metric values for each GridParameter. + + Returns: + dict[str, linear.MetricCollection]: The updated metric values. + """ + logging.info(f"Metric - Scoring: {params.predict}\n") + + batch_size = 256 + num_instances = x.shape[0] + num_batches = math.ceil(num_instances / batch_size) + + for i in range(num_batches): + preds = model.predict_values(x[i * batch_size : (i + 1) * batch_size], **asdict(params.predict)) + target = y[i * batch_size : (i + 1) * batch_size].toarray() + param_metrics[params].update(preds, target) + + return param_metrics + + def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, dict[str, float]]: + """ + Run the grid search on the search space. + + Args: + search_space_dict (dict[str, list]): The search space for the grid search. + + Returns: + dict[GridParameter, dict[str, float]]: The cross-validation scores for each GridParameter in the search space. 
+ """ + param_names = search_space_dict.keys() + + # To avoid redundant computation (e.g., building the same tree multiple times across different params), + # we group identical settings in fields and process them continuously. + # This is implemented by sorting the params in the order of the four fields: + # TF-IDF, tree, linear, and predict. Finally, cache and reuse the most recent result of each field. + self.search_space = sorted( + [ + GridParameter(dict(zip(param_names, param_values))) + for param_values in itertools.product(*search_space_dict.values()) + ], + reverse=True, + ) + + # When the number of labels is large, evaluation often focuses on top-ranked + # metrics (e.g., Precision@K), which do not depend on num_classes. + # We therefore use -1 as a placeholder. + self.param_metrics = { + params: linear.get_metrics(self.monitor_metrics, num_classes=-1) for params in self.search_space + } + + permutation = np.random.permutation(self.num_instances) + index_per_fold = [] + for fold in range(self.n_folds): + index = permutation[ + int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds) + ] + index_per_fold.append(index) + + for fold in range(self.n_folds): + train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :]) + valid_idx = index_per_fold[fold] + fold_dataset = self.get_fold_dataset(train_idx, valid_idx) + + self._cached_params.tfidf = None + for params in self.search_space: + logging.info(f"Status - Running fold {fold}, params: {params}") + + transformed_dataset = self.get_transformed_dataset(fold_dataset, params) + model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params) + + self.param_metrics = self.compute_scores( + transformed_dataset["test"]["y"], + transformed_dataset["test"]["x"], + model, + params, + self.param_metrics, + ) + + return {params: metrics.compute() for params, metrics in self.param_metrics.items()} diff --git 
a/linear_trainer.py b/linear_trainer.py index b9133857..f5f374fa 100644 --- a/linear_trainer.py +++ b/linear_trainer.py @@ -24,6 +24,7 @@ def linear_test(config, model, datasets, label_mapping): predict_kwargs = {} if isinstance(model, (TreeModel, EnsembleTreeModel)): predict_kwargs["beam_width"] = config.beam_width + predict_kwargs["prob_A"] = config.prob_A for i in tqdm(range(ceil(num_instance / config.eval_batch_size))): slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size] diff --git a/main.py b/main.py index 7a523f1f..59e3e379 100644 --- a/main.py +++ b/main.py @@ -252,6 +252,12 @@ def add_all_arguments(parser): default=10, help="The width of the beam search (default: %(default)s)", ) + parser.add_argument( + "--prob_A", + type=int, + default=3, + help="The hyperparameter used in the probability estimation function for binary classification: sigmoid(prob_A * decision_value_matrix). (default: %(default)s)", + ) # AttentionXML parser.add_argument( "--cluster_size",