Commits
37 commits
575eaf4
update mvp
chcwww Oct 28, 2025
95da008
almost finish GridParameter
chcwww Oct 28, 2025
47063e0
first version implemented
chcwww Oct 28, 2025
214b28c
update for demo
chcwww Nov 6, 2025
6e033e8
reset linear_trainer.py to master
chcwww Jan 8, 2026
11b6a83
merge hyperparametersearch into gridsearch
chcwww Jan 8, 2026
51f69ca
update examples with pruning and prob estimate
chcwww Jan 8, 2026
b15878a
update cache logic for search_space
chcwww Jan 12, 2026
7300a5f
update better sorting logic
chcwww Jan 19, 2026
9412d8c
update the parameter A
chcwww Jan 19, 2026
ba3d1a3
fix bug for the parameter A
chcwww Jan 19, 2026
7c3abb8
use self.no_cache to control the cache
chcwww Jan 22, 2026
f4a44d8
rewrite fold and tfidf for correctness
chcwww Jan 22, 2026
4942b80
update retrain examplef
chcwww Jan 23, 2026
5815e3e
make the logging info prettier
chcwww Jan 23, 2026
1e284ce
update weights pruning
chcwww Jan 27, 2026
487e20e
update prob estimation
chcwww Jan 27, 2026
d525c85
update grid search for pruning_alpha
chcwww Jan 27, 2026
6cd10e4
fix the column dimension of the weights when pruning
chcwww Jan 28, 2026
9f496a7
remove weights pruning code
chcwww Feb 10, 2026
464aa59
move example code into main() in grid.py
chcwww Feb 10, 2026
d7ffd29
apply black formatter
chcwww Feb 10, 2026
d972f54
Merge branch 'ntumlgroup:master' into hyperparameter
chcwww Feb 10, 2026
70fdfcb
update search_space_dict in the grid search example
chcwww Feb 10, 2026
9370bfe
Merge remote-tracking branch 'origin/master' into hyperparameter
chcwww Feb 24, 2026
7a311b2
init the metrics calculator earlier and update the example search space
chcwww Feb 28, 2026
50a24af
improve inline comments and doc strings
chcwww Feb 28, 2026
173be0a
make compute_scores() reusable and handle the edge case of K
chcwww Mar 2, 2026
658b7e0
update the tutorial for the tree-based linear method hyperparameter s…
chcwww Mar 5, 2026
99be905
apply black formatter
chcwww Mar 5, 2026
90e2b4c
improve doc strings and typing
chcwww Mar 5, 2026
5fcb259
improve docstrings for cache and prob_A
chcwww Mar 14, 2026
55a8882
improve docstrings for caching and probability
chcwww Mar 19, 2026
e8f4aff
update ovr/tree grid search examples in the docs
chcwww Mar 19, 2026
f9a52db
move grid search to the libmultilabel module
chcwww Mar 19, 2026
b543deb
add GridParameter and GridSearch to the linear api
chcwww Apr 3, 2026
7aaa2bb
refine the tutorial to make it shorter and more readable
chcwww Apr 3, 2026
15 changes: 15 additions & 0 deletions docs/api/linear.rst
@@ -101,3 +101,18 @@ Grid Search with Sklearn Estimators
:members:

.. automethod:: __init__

Grid Search with Tree-Based Linear Method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: GridParameter
:members:

.. automethod:: __init__

.. autoclass:: GridSearch
:members:

.. automethod:: __init__

.. automethod:: __call__
9 changes: 8 additions & 1 deletion docs/examples/plot_linear_gridsearch_tutorial.py
@@ -1,7 +1,14 @@
"""
Hyperparameter Search for Linear Methods
Hyperparameter Search for One-vs-rest Linear Methods
=============================================================
.. warning::

If you are using the tree-based linear method,
please check `Hyperparameter Search for Tree-Based Linear Method <../auto_examples/plot_tree_gridsearch_tutorial.html>`_.

This guide helps users to tune the hyperparameters of the feature generation step and the linear model.
In this guide, the following methods are available:
``1vsrest``, ``thresholding``, ``cost_sensitive``, ``cost_sensitive_micro``, and ``binary_and_multiclass``.

Here we show an example of tuning a linear text classifier with the `rcv1 dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_.
Starting with loading and preprocessing of the data without using ``Preprocessor``:
131 changes: 131 additions & 0 deletions docs/examples/plot_tree_gridsearch_tutorial.py
@@ -0,0 +1,131 @@
"""
Hyperparameter Search for Tree-Based Linear Method
=============================================================
.. warning::

If you are using the one-vs-rest linear methods,
please check `Hyperparameter Search for One-vs-rest Linear Methods <../auto_examples/plot_linear_gridsearch_tutorial.html>`_.

To apply tree-based linear methods,
we first convert raw text into numerical bag-of-words (BoW) features.
During training, the method builds a label tree and trains linear classifiers.
At inference, the model traverses the tree to make predictions.
Each stage involves multiple hyperparameters that can be tuned to improve model performance.

In this guide, we help users tune the hyperparameters of the tree-based linear method.

.. seealso::

`Implementation Document <https://www.csie.ntu.edu.tw/~cjlin/papers/libmultilabel/libmultilabel_implementation.pdf>`_:
For more details about the implementation of tree-based linear methods.

Here we show an example of tuning a tree-based linear text classifier with the `rcv1 dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_.
Starting with loading the data:
"""

import logging

from libmultilabel import linear

logging.basicConfig(level=logging.INFO)

datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt")
L = len(datasets["train"]["y"])  # number of training instances

######################################################################
# Next, we set up the search space.

import numpy as np

dmax = 10
K_factors = [-2, 5]
search_space_dict = {
"ngram_range": [(1, 1), (1, 2), (1, 3)],
"stop_words": ["english"],
"dmax": [dmax],
"K": [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors],
"s": [1],
"c": [0.5, 1, 2],
"B": [1],
"beam_width": [10],
}
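The ``K`` entries above come from the formula in the paper referenced below; a standalone sketch, assuming ``L = 23149`` (the rcv1 training set size):

```python
import numpy as np

# Sketch of the K formula: base K = L^(1/dmax), scaled up or down by
# powers of 2 via alpha, and floored at 2 clusters.
L = 23149  # assumed rcv1 training set size
dmax = 10
K_factors = [-2, 5]
Ks = [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors]
print(Ks)  # the two candidate cluster counts
```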

######################################################################
# Following the suggestions in this `paper <https://drive.google.com/file/d/1kxqNJwg4E_EKjVG-umoG876XKxz3mfm9/view>`__,
# we define 18 configurations to build a simple yet strong baseline.
#
# The search space covers several key parts of the pipeline:
#
# - Text feature extraction: (``ngram_range``, ``stop_words``)
#
# - We use the vectorizer ``TfidfVectorizer`` from ``sklearn`` to generate features from raw text.
#
# - Label tree structure: (``dmax``, ``K``)
#
# - Note that ``K`` is the number of clusters and is calculated using the formula from the paper.
#
# - Linear classifier: (``s``, ``c``, ``B``)
#
# - We combine them into a LIBLINEAR option string (see *train Usage* in the `liblinear <https://github.com/cjlin1/liblinear>`__ README).
#
# - Prediction: (``beam_width``)
#
# .. tip::
#
# Available hyperparameters (and their defaults) are defined in the class variables of :py:class:`~libmultilabel.linear.GridParameter`.
#
# We implement the entire search process in ``linear.GridSearch``.
# Initialize it with the dataset, the number of cross-validation folds,
# and the evaluation metrics to monitor.

n_folds = 3
monitor_metrics = ["P@1", "P@3", "P@5"]
search = linear.GridSearch(datasets, n_folds, monitor_metrics)
cv_scores = search(search_space_dict)

######################################################################
# The returned scores are a ``dict`` whose keys are ``linear.GridParameter`` instances from the search space,
# and whose values are the scores for ``monitor_metrics``.
#
# Here we sort the results in descending order by the first metric in ``monitor_metrics``.
# You can retrieve the best parameters after the grid search with the following code:

sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
print(sorted_cv_scores)

best_params, best_cv_scores = sorted_cv_scores[0]
print(best_params, best_cv_scores)
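The sorting pattern above can be checked on its own with plain dictionaries standing in for ``GridParameter`` keys (hypothetical scores):

```python
# Hypothetical stand-in for cv_scores: parameter descriptions -> metric dicts.
cv_scores = {
    "c=0.5, ngram=(1,2)": {"P@1": 0.81, "P@3": 0.73, "P@5": 0.53},
    "c=2, ngram=(1,1)": {"P@1": 0.78, "P@3": 0.71, "P@5": 0.51},
}
monitor_metrics = ["P@1", "P@3", "P@5"]

# Sort descending by the first monitored metric, then take the top entry.
sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
best_params, best_cv_scores = sorted_cv_scores[0]
print(best_params)  # the configuration with the highest P@1
```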

######################################################################
# The best parameters are::
#
# {'s': 1, 'c': 0.5, 'ngram_range': (1, 2), 'stop_words': 'english', 'dmax': 10, 'K': 88, 'beam_width': 10}
#
# We can then retrain using the best parameters,
# and use ``linear.GridSearch.compute_scores`` and ``linear.get_metrics`` to compute test performance.

from dataclasses import asdict

preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
transformed_dataset = preprocessor.fit_transform(datasets)

model = linear.train_tree(
transformed_dataset["train"]["y"],
transformed_dataset["train"]["x"],
best_params.linear_options,
**asdict(best_params.tree),
)

metrics = linear.GridSearch.compute_scores(
transformed_dataset["test"]["y"],
transformed_dataset["test"]["x"],
model,
best_params,
{best_params: linear.get_metrics(monitor_metrics, num_classes=-1)},
)
print(metrics[best_params].compute())
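The ``asdict`` calls above unpack parameter dataclasses into keyword arguments; a minimal sketch with a hypothetical dataclass (the real field sets live on ``GridParameter``):

```python
from dataclasses import dataclass, asdict

# Hypothetical parameter dataclass illustrating the asdict pattern;
# the actual tfidf/tree field groups are defined by GridParameter.
@dataclass(frozen=True)
class TfidfParams:
    ngram_range: tuple = (1, 2)
    stop_words: str = "english"

params = TfidfParams()
kwargs = asdict(params)  # ready to splat into a constructor via **kwargs
print(kwargs)
```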

######################################################################
# The result of the best parameters will look similar to::
#
# {'P@1': 0.8100209275981901, 'P@3': 0.7310622302718446, 'P@5': 0.5290965293466371}
1 change: 1 addition & 0 deletions docs/search_retrain.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ Hyperparameter Search


../auto_examples/plot_linear_gridsearch_tutorial
../auto_examples/plot_tree_gridsearch_tutorial
tutorials/Parameter_Selection_for_Neural_Networks
53 changes: 41 additions & 12 deletions libmultilabel/linear/tree.py
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import scipy.sparse as sparse
from scipy.special import log_expit
from sparsekmeans import LloydKmeans, ElkanKmeans
import sklearn.preprocessing
from tqdm import tqdm
@@ -58,16 +59,35 @@ def __init__(
self.multiclass = False
self._model_separated = False # Indicates whether the model has been separated for pruning tree.

def sigmoid_A(self, x: np.ndarray, prob_A: int) -> np.ndarray:
"""
Calculate log(sigmoid(prob_A * x)), the log-probability of the positive class in binary classification.

Args:
x (np.ndarray): The decision value matrix with dimension number of instances * number of classes.
prob_A (int):
The hyperparameter used in the probability estimation function for
binary classification: sigmoid(prob_A * x).

Returns:
np.ndarray: A matrix with dimension number of instances * number of classes.
"""
return log_expit(prob_A * x)
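``scipy.special.log_expit`` is used here because it stays numerically stable; a quick standalone check against the naive log-sigmoid (illustrative values):

```python
import numpy as np
from scipy.special import log_expit

# log_expit(z) = log(1 / (1 + exp(-z))), computed without overflowing
# exp(-z); a naive implementation loses precision for large |z|.
prob_A = 3
x = np.array([-2.0, 0.0, 2.0])  # illustrative decision values
stable = log_expit(prob_A * x)
naive = np.log(1.0 / (1.0 + np.exp(-prob_A * x)))
print(stable)
```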

def predict_values(
self,
x: sparse.csr_matrix,
beam_width: int = 10,
prob_A: int = 3,
) -> np.ndarray:
"""Calculate the probability estimates associated with x.

Args:
x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
beam_width (int, optional): Number of candidates considered during beam search.
prob_A (int, optional):
The hyperparameter used in the probability estimation function for
binary classification: sigmoid(prob_A * decision_value_matrix).

Returns:
np.ndarray: A matrix with dimension number of instances * number of classes.
@@ -81,8 +101,8 @@ def predict_values(
if not self._model_separated:
self._separate_model_for_pruning_tree()
self._model_separated = True
all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels)
return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A) # number of instances * (number of labels + total number of metalabels)
return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])])

def _separate_model_for_pruning_tree(self):
"""
@@ -113,7 +133,7 @@ def _separate_model_for_pruning_tree(self):
)
self.subtree_models.append(subtree_flatmodel)

def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray:
def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray:
"""Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees.

Only subtrees corresponding to the top beam_width candidates from the root are evaluated,
@@ -122,6 +142,9 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
Args:
x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
beam_width (int): Number of top candidate branches considered for prediction.
prob_A (int):
The hyperparameter used in the probability estimation function for
binary classification: sigmoid(prob_A * decision_value_matrix).

Returns:
np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels).
@@ -132,7 +155,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)

# Calculate root decision values and scores
root_preds = linear.predict_values(self.root_model, x)
children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds))
children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A)

slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]]
all_preds[slice] = root_preds
@@ -159,12 +182,15 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)

return all_preds

def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray:
"""Predict with beam search using cached probability estimates for a single instance.

Args:
instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
beam_width (int): Number of candidates considered.
prob_A (int, optional):
The hyperparameter used in the probability estimation function for
binary classification: sigmoid(prob_A * decision_value_matrix).

Returns:
np.ndarray: A vector with dimension number of classes.
@@ -182,7 +208,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
continue
slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
pred = instance_preds[slice]
children_score = score - np.square(np.maximum(0, 1 - pred))
children_score = score + self.sigmoid_A(pred, prob_A)
next_level.extend(zip(node.children, children_score.tolist()))

cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width]
@@ -193,7 +219,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
for node, score in cur_level:
slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
pred = instance_preds[slice]
scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred)))
scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A))
return scores
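Because ``_beam_search`` sums ``sigmoid_A`` outputs along a path, ``exp`` of the accumulated score is the product of per-node sigmoid probabilities; a standalone sketch with illustrative decision values:

```python
import numpy as np
from scipy.special import log_expit

# Summing log_expit scores along a root-to-leaf path is the log of the
# product of per-node sigmoid probabilities, so exp(score) recovers the
# path probability. Decision values here are illustrative only.
prob_A = 3
path_decision_values = np.array([1.2, 0.4, -0.3])
log_score = np.sum(log_expit(prob_A * path_decision_values))
path_prob = np.exp(log_score)
per_node_probs = 1.0 / (1.0 + np.exp(-prob_A * path_decision_values))
print(path_prob)
```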


@@ -204,6 +230,7 @@ def train_tree(
K=DEFAULT_K,
dmax=DEFAULT_DMAX,
verbose: bool = True,
root: Node = None,
) -> TreeModel:
"""Train a linear model for multi-label data using a divide-and-conquer strategy.
The algorithm used is based on https://github.com/xmc-aalto/bonsai.
@@ -215,14 +242,16 @@
K (int, optional): Maximum degree of nodes in the tree. Defaults to 100.
dmax (int, optional): Maximum depth of the tree. Defaults to 10.
verbose (bool, optional): Output extra progress information. Defaults to True.
root (Node, optional): A pre-built label tree root. If given, tree construction is skipped and the provided tree is used. Defaults to None.

Returns:
TreeModel: A model which can be used in predict_values.
"""
label_representation = (y.T * x).tocsr()
label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
root.is_root = True
if root is None:
label_representation = (y.T * x).tocsr()
label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
root.is_root = True

num_nodes = 0
# Both type(x) and type(y) are sparse.csr_matrix