diff --git a/docs/api/linear.rst b/docs/api/linear.rst
index 888db896..1b087948 100644
--- a/docs/api/linear.rst
+++ b/docs/api/linear.rst
@@ -101,3 +101,18 @@ Grid Search with Sklearn Estimators
    :members:
 
    .. automethod:: __init__
+
+Grid Search with Tree-Based Linear Method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: GridParameter
+   :members:
+
+   .. automethod:: __init__
+
+.. autoclass:: GridSearch
+   :members:
+
+   .. automethod:: __init__
+
+   .. automethod:: __call__
\ No newline at end of file
diff --git a/docs/examples/plot_linear_gridsearch_tutorial.py b/docs/examples/plot_linear_gridsearch_tutorial.py
index d1a239e7..01c1e069 100644
--- a/docs/examples/plot_linear_gridsearch_tutorial.py
+++ b/docs/examples/plot_linear_gridsearch_tutorial.py
@@ -1,7 +1,14 @@
 """
-Hyperparameter Search for Linear Methods
+Hyperparameter Search for One-vs-rest Linear Methods
 =============================================================
+.. warning::
+
+    If you are using the tree-based linear method,
+    please check `Hyperparameter Search for Tree-Based Linear Method  <../auto_examples/plot_tree_gridsearch_tutorial.html>`_.
+
 This guide helps users to tune the hyperparameters of the feature generation step and the linear model.
+In this guide, the following methods are available:
+``1vsrest``, ``thresholding``, ``cost_sensitive``, ``cost_sensitive_micro``, and ``binary_and_multiclass``.
 
 Here we show an example of tuning a linear text classifier with the `rcv1 dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_.
 Starting with loading and preprocessing of the data without using ``Preprocessor``:
diff --git a/docs/examples/plot_tree_gridsearch_tutorial.py b/docs/examples/plot_tree_gridsearch_tutorial.py
new file mode 100644
index 00000000..2875db52
--- /dev/null
+++ b/docs/examples/plot_tree_gridsearch_tutorial.py
@@ -0,0 +1,131 @@
+"""
+Hyperparameter Search for Tree-Based Linear Method
+=============================================================
+.. warning::
+
+    If you are using the one-vs-rest linear methods,
+    please check `Hyperparameter Search for One-vs-rest Linear Methods  <../auto_examples/plot_linear_gridsearch_tutorial.html>`_.
+
+To apply tree-based linear methods,
+we first convert raw text into numerical BoW features.
+During training, the method builds a label tree and trains classifiers.
+At inference, the model traverses the tree to make predictions.
+Each stage involves multiple hyperparameters that can be tuned to improve model performance.
+
+In this guide, we help users tune the hyperparameters of the tree-based linear method.
+
+.. seealso::
+
+    `Implementation Document <https://www.csie.ntu.edu.tw/~cjlin/papers/libmultilabel/libmultilabel_implementation.pdf>`_:
+        For more details about the implementation of tree-based linear methods.
+
+Here we show an example of tuning a tree-based linear text classifier with the `rcv1 dataset <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html#rcv1v2%20(topics;%20full%20sets)>`_.
+Starting with loading the data:
+"""
+
+import logging
+
+from libmultilabel import linear
+
+logging.basicConfig(level=logging.INFO)
+
+datasets = linear.load_dataset("txt", "data/rcv1/train.txt", "data/rcv1/test.txt")
+L = len(datasets["train"]["y"])
+
+######################################################################
+# Next, we set up the search space.
+
+import numpy as np
+
+dmax = 10
+K_factors = [-2, 5]
+search_space_dict = {
+    "ngram_range": [(1, 1), (1, 2), (1, 3)],
+    "stop_words": ["english"],
+    "dmax": [dmax],
+    "K": [max(2, int(np.round(np.power(L, 1 / dmax) * np.power(2.0, alpha) + 0.5))) for alpha in K_factors],
+    "s": [1],
+    "c": [0.5, 1, 2],
+    "B": [1],
+    "beam_width": [10],
+}
+
+######################################################################
+# Following the suggestions in this `paper <https://drive.google.com/file/d/1kxqNJwg4E_EKjVG-umoG876XKxz3mfm9/view>`__,
+# we define 18 configurations to build a simple yet strong baseline.
+#
+# The search space covers several key parts of the pipeline:
+#
+# - Text feature extraction: (``ngram_range``, ``stop_words``)
+#
+#       - We use the vectorizer ``TfidfVectorizer`` from ``sklearn`` to generate features from raw text.
+#
+# - Label tree structure: (``dmax``, ``K``)
+#
+#      - Note that ``K`` is the number of clusters and is calculated using the formula from the paper.
+#
+# - Linear classifier: (``s``, ``c``, ``B``)
+#
+#       - We combined them into a LIBLINEAR option string. (see *train Usage* in `liblinear <https://github.com/cjlin1/liblinear>`__ README)
+#
+# - Prediction: (``beam_width``)
+#
+# .. tip::
+#
+#     Available hyperparameters (and their defaults) are defined in the class variables of :py:class:`~libmultilabel.linear.GridParameter`.
+#
+# We implement the entire search process in ``linear.GridSearch``.
+# Initialize it with the dataset, the number of cross-validation folds,
+# and the evaluation metrics to monitor, then call it with the search space to run the search.
+
+n_folds = 3
+monitor_metrics = ["P@1", "P@3", "P@5"]
+search = linear.GridSearch(datasets, n_folds, monitor_metrics)
+cv_scores = search(search_space_dict)
+
+######################################################################
+# The returned scores are a ``dict`` whose keys are ``linear.GridParameter`` instances from the search space,
+# and whose values are the scores for ``monitor_metrics``.
+#
+# Here we sort the results in descending order by the first metric in ``monitor_metrics``.
+# You can retrieve the best parameters after the grid search with the following code:
+
+sorted_cv_scores = sorted(cv_scores.items(), key=lambda x: x[1][monitor_metrics[0]], reverse=True)
+print(sorted_cv_scores)
+
+best_params, best_cv_scores = list(sorted_cv_scores)[0]
+print(best_params, best_cv_scores)
+
+######################################################################
+# The best parameters are::
+#
+#   {'s': 1, 'c': 0.5, 'ngram_range': (1, 2), 'stop_words': 'english', 'dmax': 10, 'K': 88, 'beam_width': 10}
+#
+# We can then retrain using the best parameters,
+# and use ``linear.GridSearch.compute_scores`` and ``linear.get_metrics`` to compute test performance.
+
+from dataclasses import asdict
+
+preprocessor = linear.Preprocessor(tfidf_params=asdict(best_params.tfidf))
+transformed_dataset = preprocessor.fit_transform(datasets)
+
+model = linear.train_tree(
+    transformed_dataset["train"]["y"],
+    transformed_dataset["train"]["x"],
+    best_params.linear_options,
+    **asdict(best_params.tree),
+)
+
+metrics = linear.GridSearch.compute_scores(
+    transformed_dataset["test"]["y"],
+    transformed_dataset["test"]["x"],
+    model,
+    best_params,
+    {best_params: linear.get_metrics(monitor_metrics, num_classes=-1)},
+)
+print(metrics[best_params].compute())
+
+######################################################################
+# The result of the best parameters will look similar to::
+#
+#   {'P@1': 0.8100209275981901, 'P@3': 0.7310622302718446, 'P@5': 0.5290965293466371}
diff --git a/docs/search_retrain.rst b/docs/search_retrain.rst
index 26acfccb..9242b8ce 100644
--- a/docs/search_retrain.rst
+++ b/docs/search_retrain.rst
@@ -7,4 +7,5 @@ Hyperparameter Search
 
 
     ../auto_examples/plot_linear_gridsearch_tutorial
+    ../auto_examples/plot_tree_gridsearch_tutorial
     tutorials/Parameter_Selection_for_Neural_Networks
diff --git a/libmultilabel/linear/tree.py b/libmultilabel/linear/tree.py
index 7f1ce851..f8411b68 100644
--- a/libmultilabel/linear/tree.py
+++ b/libmultilabel/linear/tree.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import scipy.sparse as sparse
+from scipy.special import log_expit
 from sparsekmeans import LloydKmeans, ElkanKmeans
 import sklearn.preprocessing
 from tqdm import tqdm
@@ -58,16 +59,35 @@ def __init__(
         self.multiclass = False
         self._model_separated = False # Indicates whether the model has been separated for pruning tree.
 
+    def sigmoid_A(self, x: np.ndarray, prob_A: int) -> np.ndarray:
+        """
+        Calculate log(sigmoid(prob_A * x)), i.e. the log-probability of the positive class in binary classification.
+
+        Args:
+            x (np.ndarray): The decision values, e.g. a matrix with dimension number of instances * number of classes.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * x).
+
+        Returns:
+            np.ndarray: The element-wise log-probabilities, with the same shape as x.
+        """
+        return log_expit(prob_A * x)
+
     def predict_values(
         self,
         x: sparse.csr_matrix,
         beam_width: int = 10,
+        prob_A: int = 3,
     ) -> np.ndarray:
         """Calculate the probability estimates associated with x.
 
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
-            beam_width (int, optional): Number of candidates considered during beam search. Defaults to 10.
+            beam_width (int, optional): Number of candidates considered during beam search.
+            prob_A (int, optional):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * number of classes.
@@ -81,8 +101,8 @@ def predict_values(
             if not self._model_separated:
                 self._separate_model_for_pruning_tree()
                 self._model_separated = True
-            all_preds = self._prune_tree_and_predict_values(x, beam_width) # number of instances * (number of labels + total number of metalabels)
-        return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
+            all_preds = self._prune_tree_and_predict_values(x, beam_width, prob_A) # number of instances * (number of labels + total number of metalabels)
+        return np.vstack([self._beam_search(all_preds[i], beam_width, prob_A) for i in range(all_preds.shape[0])])
 
     def _separate_model_for_pruning_tree(self):
         """
@@ -113,7 +133,7 @@ def _separate_model_for_pruning_tree(self):
             )
             self.subtree_models.append(subtree_flatmodel)
         
-    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int) -> np.ndarray:
+    def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int, prob_A: int) -> np.ndarray:
         """Calculates the selective decision values associated with instances x by evaluating only the most relevant subtrees.
 
         Only subtrees corresponding to the top beam_width candidates from the root are evaluated,
@@ -122,6 +142,9 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
             beam_width (int): Number of top candidate branches considered for prediction.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A matrix with dimension number of instances * (number of labels + total number of metalabels).
@@ -132,7 +155,7 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         # Calculate root decision values and scores
         root_preds = linear.predict_values(self.root_model, x)
-        children_scores = 0.0 - np.square(np.maximum(0, 1 - root_preds))
+        children_scores = 0.0 + self.sigmoid_A(root_preds, prob_A)
 
         slice = np.s_[:, self.node_ptr[self.root.index] : self.node_ptr[self.root.index + 1]]
         all_preds[slice] = root_preds
@@ -159,12 +182,15 @@ def _prune_tree_and_predict_values(self, x: sparse.csr_matrix, beam_width: int)
 
         return all_preds
 
-    def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
+    def _beam_search(self, instance_preds: np.ndarray, beam_width: int, prob_A: int) -> np.ndarray:
         """Predict with beam search using cached probability estimates for a single instance.
 
         Args:
             instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
             beam_width (int): Number of candidates considered.
+            prob_A (int):
+                The hyperparameter used in the probability estimation function for
+                binary classification: sigmoid(prob_A * decision_value_matrix).
 
         Returns:
             np.ndarray: A vector with dimension number of classes.
@@ -182,7 +208,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
                     continue
                 slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
                 pred = instance_preds[slice]
-                children_score = score - np.square(np.maximum(0, 1 - pred))
+                children_score = score + self.sigmoid_A(pred, prob_A)
                 next_level.extend(zip(node.children, children_score.tolist()))
 
             cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width]
@@ -193,7 +219,7 @@ def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarra
         for node, score in cur_level:
             slice = np.s_[self.node_ptr[node.index] : self.node_ptr[node.index + 1]]
             pred = instance_preds[slice]
-            scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred)))
+            scores[node.label_map] = np.exp(score + self.sigmoid_A(pred, prob_A))
         return scores
 
 
@@ -204,6 +230,7 @@ def train_tree(
     K=DEFAULT_K,
     dmax=DEFAULT_DMAX,
     verbose: bool = True,
+    root: Node = None,
 ) -> TreeModel:
     """Train a linear model for multi-label data using a divide-and-conquer strategy.
     The algorithm used is based on https://github.com/xmc-aalto/bonsai.
@@ -215,14 +242,16 @@ def train_tree(
         K (int, optional): Maximum degree of nodes in the tree. Defaults to 100.
         dmax (int, optional): Maximum depth of the tree. Defaults to 10.
         verbose (bool, optional): Output extra progress information. Defaults to True.
+        root (Node, optional): Pre-built tree root. Defaults to None.
 
     Returns:
         TreeModel: A model which can be used in predict_values.
     """
-    label_representation = (y.T * x).tocsr()
-    label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
-    root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
-    root.is_root = True
+    if root is None:
+        label_representation = (y.T * x).tocsr()
+        label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
+        root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
+        root.is_root = True
 
     num_nodes = 0
     # Both type(x) and type(y) are sparse.csr_matrix
diff --git a/libmultilabel/linear/utils.py b/libmultilabel/linear/utils.py
index 3324a896..7f1f3612 100644
--- a/libmultilabel/linear/utils.py
+++ b/libmultilabel/linear/utils.py
@@ -1,10 +1,15 @@
 from __future__ import annotations
 
 import os
+import sys
+import math
+import itertools
+import logging
 import pathlib
 import pickle
 import re
-from typing import Any
+from typing import Any, Callable
+from dataclasses import make_dataclass, field, fields, asdict
 
 import numpy as np
 import scipy.sparse as sparse
@@ -12,12 +17,14 @@
 import sklearn.model_selection
 import sklearn.pipeline
 import sklearn.utils
+import sklearn.preprocessing
 
 import libmultilabel.linear as linear
 
 from .preprocessor import Preprocessor
+from .tree import _build_tree
 
-__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV"]
+__all__ = ["save_pipeline", "load_pipeline", "MultiLabelEstimator", "GridSearchCV", "GridParameter", "GridSearch"]
 
 
 LINEAR_TECHNIQUES = {
@@ -143,3 +150,327 @@ def _set_singlecore_options(self, estimator, param_grid: dict):
                 key = f"{name}__options"
                 param_grid[key] = [f"{re.sub(regex, '', v)} -m 1" for v in param_grid[key]]
         return param_grid
+
+
+# Context manager that suppresses inevitable outputs from sparsekmeans and sklearn preprocessors
+class __silent__:
+    def __init__(self):
+        self.stderr = os.dup(2)  # duplicate of the real stderr, used by __exit__ to restore fd 2
+        self.devnull = os.open(os.devnull, os.O_WRONLY)
+
+    def __enter__(self):
+        os.dup2(self.devnull, 2)  # fd-level redirect: also mutes writers that bypass sys.stderr
+        self.stdout = sys.stdout  # remember the current stdout object so it can be put back
+        sys.stdout = open(os.devnull, "w")
+
+    def __exit__(self, type, value, traceback):
+        os.dup2(self.stderr, 2)  # restore the original stderr
+        os.close(self.devnull)  # both fds are closed here, so an instance is single-use
+        os.close(self.stderr)
+        sys.stdout.close()  # closes the devnull file opened in __enter__
+        sys.stdout = self.stdout
+
+
+class GridParameter:
+    """A tree-based linear method hyperparameter class for GridSearch.
+    Transform the parameter dict into dataclass instances and handle the defaults; unknown names are ignored with a warning.
+
+    Args:
+        params (dict, optional): The keys are the parameter names, and the values are the parameter values.
+    """
+
+    _tfidf_fields = [
+        ("ngram_range", tuple[int, int], field(default=(1, 1))),
+        ("max_features", int, field(default=None)),
+        ("min_df", float | int, field(default=1)),
+        ("stop_words", str | list, field(default=None)),
+        ("strip_accents", str | Callable, field(default=None)),
+        ("tokenizer", Callable, field(default=None)),
+    ]
+    _tree_fields = [
+        ("dmax", int, field(default=10)),
+        ("K", int, field(default=8)),
+    ]
+    _linear_fields = [
+        ("s", int, field(default=1)),
+        ("c", float, field(default=1)),
+        ("B", int, field(default=-1)),
+    ]
+    _predict_fields = [
+        ("beam_width", int, field(default=10)),
+        ("prob_A", int, field(default=3)),
+    ]
+
+    # set frozen=True to make instances hashable.
+    # set order=True to enable comparison operations.
+    param_types = {
+        "tfidf": make_dataclass("TfidfParams", _tfidf_fields, frozen=True, order=True),
+        "tree": make_dataclass("TreeParams", _tree_fields, frozen=True, order=True),
+        "linear": make_dataclass("LinearParams", _linear_fields, frozen=True, order=True),
+        "predict": make_dataclass("PredictParams", _predict_fields, frozen=True, order=True),
+    }
+    _param_field_names = {
+        param_type: {f.name for f in fields(class_name)} for param_type, class_name in param_types.items()
+    }
+
+    def __init__(self, params: dict | None = None):
+        self.params = params or {}
+
+        unknown = set(self.params)  # names not claimed by any parameter group so far
+        for param_type, class_name in self.param_types.items():
+            field_names = self._param_field_names[param_type]
+            filtered_params = {k: v for k, v in self.params.items() if k in field_names}
+            unknown -= field_names
+            setattr(self, param_type, class_name(**filtered_params))
+        if unknown:  # a misspelled name would otherwise silently fall back to its default
+            logging.warning(f"Ignoring unknown hyperparameter(s): {sorted(unknown)}")
+
+    @property
+    def linear_options(self) -> str:
+        """LIBLINEAR option string built from the linear parameters, e.g. ``-s 1 -c 1 -B -1``."""
+        # Iterate the dataclass fields (declaration order) rather than the name set,
+        # whose iteration order changes with string hash randomization between runs.
+        return " ".join(f"-{f.name} {getattr(self.linear, f.name)}" for f in fields(self.linear))
+
+    def __repr__(self):  # provide a readable string representation of the object
+        return str(self.params)
+
+    def _key(self) -> tuple:  # the parameter groups in param_types order: tfidf, tree, linear, predict
+        return tuple(getattr(self, t) for t in self.param_types)
+
+    def __eq__(self, other):  # equal when every parameter group matches; foreign types are left to Python
+        return self._key() == other._key() if isinstance(other, GridParameter) else NotImplemented
+
+    def __lt__(self, other):  # "<" on tuples compares lexicographically, group by group
+        return self._key() < other._key() if isinstance(other, GridParameter) else NotImplemented
+
+    def __hash__(self):  # make instances hashable for use as dict keys
+        return hash(self._key())
+
+
+class GridSearch:
+    """Grid search the search space and find the best parameters for tree-based linear method.
+
+    Args:
+        datasets (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively.
+                The data has keys 'x' for input features and 'y' for labels.
+        n_folds (int, optional): The number of cross-validation folds. Defaults to 3.
+        monitor_metrics (list[str], optional): The evaluation metrics to monitor. Defaults to P@1, P@3, and P@5.
+    """
+
+    def __init__(
+        self,
+        datasets: dict[str, dict[str, list[str]]],
+        n_folds: int = 3,
+        monitor_metrics: list[str] | None = None,
+    ):
+        self.datasets = datasets
+        self.n_folds = n_folds
+        self.monitor_metrics = ["P@1", "P@3", "P@5"] if monitor_metrics is None else monitor_metrics
+
+        self._cached_params = GridParameter()
+        for param_type in self._cached_params.param_types:
+            setattr(self._cached_params, param_type, None)
+        self._cached_transformed_dataset = None
+        self._cached_tree_root = None
+        self._cached_fold_data = None
+        self._cached_model = None
+        self.no_cache = True
+
+        self.num_instances = len(self.datasets["train"]["y"])
+
+    def get_fold_dataset(self, train_idx, valid_idx):
+        def take(data, idx):
+            if isinstance(data, list):
+                return [data[i] for i in idx]
+            else:
+                return data[idx]
+
+        return {
+            "data_format": self.datasets["data_format"],
+            "train": {
+                "y": take(self.datasets["train"]["y"], train_idx),
+                "x": take(self.datasets["train"]["x"], train_idx),
+            },
+            "test": {  # the validation split, also taken from the training data
+                "y": take(self.datasets["train"]["y"], valid_idx),
+                "x": take(self.datasets["train"]["x"], valid_idx),
+            },
+        }
+
+    def get_transformed_dataset(
+        self, dataset: dict[str, dict[str, list[str]]], params: GridParameter
+    ) -> dict[str, dict[str, sparse.csr_matrix]]:
+        """
+        Get and cache the dataset for the given TF-IDF params.
+        If we have processed the coming params, return the cached dataset directly without computation.
+
+        Args:
+            dataset (dict[str, dict[str, list[str]]]): The training and/or test data, with keys 'train' and 'test' respectively.
+                The data has keys 'x' for input features and 'y' for labels.
+            params (GridParameter): The params to build the dataset.
+
+        Returns:
+            dict[str, dict[str, sparse.csr_matrix]]: The transformed dataset.
+        """
+        tfidf_params = params.tfidf
+        self.no_cache = tfidf_params != self._cached_params.tfidf  # a TF-IDF change also invalidates the tree and model
+        if self.no_cache:
+            logging.info(f"TFIDF  - Preprocessing: {tfidf_params}")
+            if self.datasets["data_format"] not in {"txt", "dataframe"}:
+                logging.info(
+                    "Please make sure the data format is 'txt' or 'dataframe'. Otherwise, the TF-IDF parameters have no effect on the dataset."
+                )
+            with __silent__():
+                preprocessor = linear.Preprocessor(tfidf_params=asdict(tfidf_params))
+                self._cached_params.tfidf = tfidf_params
+                self._cached_transformed_dataset = preprocessor.fit_transform(dataset)
+        else:
+            logging.info(f"TFIDF  - Using cached data: {tfidf_params}")
+
+        return self._cached_transformed_dataset
+
+    def get_tree(self, y, x, params):
+        tree_params = params.tree
+        self.no_cache |= tree_params != self._cached_params.tree  # stays True if the dataset was just rebuilt
+        if self.no_cache:
+            logging.info(f"Tree   - Preprocessing: {tree_params}")
+            with __silent__():
+                label_representation = (y.T * x).tocsr()
+                label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
+                self._cached_params.tree = tree_params
+                self._cached_tree_root = _build_tree(
+                    label_representation, np.arange(y.shape[1]), 0, **asdict(tree_params)
+                )
+                self._cached_tree_root.is_root = True
+        else:
+            logging.info(f"Tree   - Using cached data: {tree_params}")
+
+        return self._cached_tree_root
+
+    def get_model(self, y: sparse.csr_matrix, x: sparse.csr_matrix, params: GridParameter) -> linear.TreeModel:
+        """
+        Get and cache the model for the given params.
+        If we have processed the coming params, return the cached model directly without computation.
+
+        Args:
+            y (sparse.csr_matrix): The labels of the training data.
+            x (sparse.csr_matrix): The features of the training data.
+            params (GridParameter): The params to build the model.
+
+        Returns:
+            linear.TreeModel: The model for the given params.
+        """
+        root = self.get_tree(y, x, params)
+
+        linear_params = params.linear
+
+        if self.no_cache or (linear_params != self._cached_params.linear):
+            logging.info(f"Model  - Training: {linear_params}")
+            with __silent__():
+                self._cached_params.linear = linear_params
+                self._cached_model = linear.train_tree(
+                    y,
+                    x,
+                    root=root,
+                    options=params.linear_options,
+                )
+        else:
+            logging.info(f"Model  - Using cached data: {linear_params}")
+
+        return self._cached_model
+
+    @staticmethod
+    def compute_scores(
+        y: sparse.csr_matrix,
+        x: sparse.csr_matrix,
+        model: linear.TreeModel,
+        params: GridParameter,
+        param_metrics: dict[GridParameter, linear.MetricCollection],
+    ) -> dict[GridParameter, linear.MetricCollection]:
+        """
+        Update the metric values in param_metrics with y, x, and model.
+
+        Args:
+            y (sparse.csr_matrix): The labels of the test data.
+            x (sparse.csr_matrix): The features of the test data.
+            model (linear.TreeModel): The trained model.
+            params (GridParameter): The params used to compute the scores.
+            param_metrics (dict[GridParameter, linear.MetricCollection]): The metric values for each GridParameter.
+
+        Returns:
+            dict[GridParameter, linear.MetricCollection]: The updated metric values.
+        """
+        logging.info(f"Metric - Scoring: {params.predict}\n")
+
+        batch_size = 256
+        num_instances = x.shape[0]
+        num_batches = math.ceil(num_instances / batch_size)
+
+        for i in range(num_batches):
+            preds = model.predict_values(x[i * batch_size : (i + 1) * batch_size], **asdict(params.predict))
+            target = y[i * batch_size : (i + 1) * batch_size].toarray()
+            param_metrics[params].update(preds, target)
+
+        return param_metrics
+
+    def __call__(self, search_space_dict: dict[str, list]) -> dict[GridParameter, dict[str, float]]:
+        """
+        Run the grid search on the search space.
+
+        Args:
+            search_space_dict (dict[str, list]): The search space for the grid search.
+
+        Returns:
+            dict[GridParameter, dict[str, float]]: The cross-validation scores for each GridParameter in the search space.
+        """
+        param_names = search_space_dict.keys()
+
+        # To avoid redundant computation (e.g., building the same tree multiple times across different params),
+        # we group identical settings in fields and process them continuously.
+        # This is implemented by sorting the params in the order of the four fields:
+        # TF-IDF, tree, linear, and predict. Finally, cache and reuse the most recent result of each field.
+        self.search_space = sorted(
+            [
+                GridParameter(dict(zip(param_names, param_values)))
+                for param_values in itertools.product(*search_space_dict.values())
+            ],
+            reverse=True,
+        )
+
+        # When the number of labels is large, evaluation often focuses on top-ranked
+        # metrics (e.g., Precision@K), which do not depend on num_classes.
+        # We therefore use -1 as a placeholder.
+        self.param_metrics = {
+            params: linear.get_metrics(self.monitor_metrics, num_classes=-1) for params in self.search_space
+        }
+
+        permutation = np.random.permutation(self.num_instances)
+        index_per_fold = []
+        for fold in range(self.n_folds):
+            index = permutation[
+                int(fold * self.num_instances / self.n_folds) : int((fold + 1) * self.num_instances / self.n_folds)
+            ]
+            index_per_fold.append(index)
+
+        for fold in range(self.n_folds):
+            train_idx = np.concatenate(index_per_fold[:fold] + index_per_fold[fold + 1 :])
+            valid_idx = index_per_fold[fold]
+            fold_dataset = self.get_fold_dataset(train_idx, valid_idx)
+
+            self._cached_params.tfidf = None  # new fold, new data: force the dataset, tree and model to be rebuilt
+            for params in self.search_space:
+                logging.info(f"Status - Running fold {fold}, params: {params}")
+
+                transformed_dataset = self.get_transformed_dataset(fold_dataset, params)
+                model = self.get_model(transformed_dataset["train"]["y"], transformed_dataset["train"]["x"], params)
+
+                self.param_metrics = self.compute_scores(
+                    transformed_dataset["test"]["y"],
+                    transformed_dataset["test"]["x"],
+                    model,
+                    params,
+                    self.param_metrics,
+                )
+
+        return {params: metrics.compute() for params, metrics in self.param_metrics.items()}
diff --git a/linear_trainer.py b/linear_trainer.py
index b9133857..f5f374fa 100644
--- a/linear_trainer.py
+++ b/linear_trainer.py
@@ -24,6 +24,7 @@ def linear_test(config, model, datasets, label_mapping):
     predict_kwargs = {}
     if isinstance(model, (TreeModel, EnsembleTreeModel)):
         predict_kwargs["beam_width"] = config.beam_width
+        predict_kwargs["prob_A"] = config.prob_A
 
     for i in tqdm(range(ceil(num_instance / config.eval_batch_size))):
         slice = np.s_[i * config.eval_batch_size : (i + 1) * config.eval_batch_size]
diff --git a/main.py b/main.py
index 7a523f1f..59e3e379 100644
--- a/main.py
+++ b/main.py
@@ -252,6 +252,12 @@ def add_all_arguments(parser):
         default=10,
         help="The width of the beam search (default: %(default)s)",
     )
+    parser.add_argument(
+        "--prob_A",
+        type=int,
+        default=3,
+        help="The hyperparameter used in the probability estimation function for binary classification: sigmoid(prob_A * decision_value_matrix). (default: %(default)s)",
+    )
     # AttentionXML
     parser.add_argument(
         "--cluster_size",