diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 97db370..b71dad7 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -17,7 +17,7 @@ jobs: remove_haskell: true remove_tool_cache: true - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 985e06f..01fe9de 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -9,11 +9,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.12' + cache: 'pip' - name: Install dependencies run: pip install '.[dev]' diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 146c7c2..ca11d62 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 @@ -48,10 +48,10 @@ jobs: url: https://pypi.org/project/qmb/ permissions: id-token: write - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && github.event.repository.visibility == 'public' + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v5 with: pattern: build-* path: dist diff --git a/Dockerfile b/Dockerfile index caaa86b..dd22ca7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Use the specified CUDA base image with Rocky Linux 9 -FROM nvidia/cuda:12.9.1-cudnn-devel-rockylinux9 +FROM nvidia/cuda:13.0.1-cudnn-devel-rockylinux9 # Install dependencies WORKDIR /app diff --git a/README.md b/README.md index 
0fa83ed..3a5eaa3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# An efficient Neural-Network Quantum States Architecture for Strongly Correlated Systems +# Hamiltonian-Guided Autoregressive Selected-Configuration Interaction (HAAR-SCI) Achieves Chemical Accuracy in Strongly Correlated Systems -The Quantum-Many-Body (`qmb`) is a powerful tool designed to solve quantum-many-body problems. +The current project, temporarily named Quantum-Many-Body (`qmb`), is a powerful tool designed to solve quantum-many-body problems, especially for strongly correlated systems. ## About The Project diff --git a/pyproject.toml b/pyproject.toml index 631bfa5..4b8993a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,21 @@ [build-system] -requires = ["setuptools>=78.0,<80.10", "setuptools_scm~=8.2.0"] +requires = ["setuptools>=78.0,<80.10", "setuptools_scm>=8.2,<9.3"] build-backend = "setuptools.build_meta" [project] name = "qmb" dynamic = ["version"] dependencies = [ - "platformdirs~=4.3.7", - "numpy~=1.26.4", - "scipy~=1.15.2", - "torch>=2.6,<2.8", - "pybind11~=2.13.6", - "ninja~=1.11.1.4", + "platformdirs>=4.3.7,<4.5.0", + "numpy>=1.26.4,<2.4.0", + "scipy>=1.15.2,<1.17.0", + "torch>=2.6,<2.9", + "pybind11>=2.13.6,<3.1.0", + "ninja>=1.11.1.4,<1.13.1.0", "tyro~=0.9.18", "pyyaml~=6.0.2", "openfermion~=1.7.0", - "tensorboard~=2.19.0", + "tensorboard>=2.19,<2.21", "standard-imghdr~=3.13.0 ; python_version>='3.13'", ] requires-python = ">=3.12" @@ -40,9 +40,9 @@ classifiers = [ dev = [ "yapf~=0.43.0", "pylint~=3.3.6", - "mypy>=1.15,<1.18", + "mypy>=1.15,<1.19", "pytest>=8.3.5,<8.5.0", - "pytest-cov~=6.0.0", + "pytest-cov>=6.0,<7.1", "types-pyyaml~=6.0.12.20250326", ] diff --git a/qmb/__main__.py b/qmb/__main__.py index 094a028..4e3b892 100644 --- a/qmb/__main__.py +++ b/qmb/__main__.py @@ -8,13 +8,16 @@ from . import openfermion as _ # type: ignore[no-redef] from . import fcidump as _ # type: ignore[no-redef] from . 
import hubbard as _ # type: ignore[no-redef] +from . import free_fermion as _ # type: ignore[no-redef] from . import ising as _ # type: ignore[no-redef] from . import vmc as _ # type: ignore[no-redef] +from . import markov as _ # type: ignore[no-redef] from . import haar as _ # type: ignore[no-redef] from . import rldiag as _ # type: ignore[no-redef] from . import precompile as _ # type: ignore[no-redef] from . import list_loss as _ # type: ignore[no-redef] from . import chop_imag as _ # type: ignore[no-redef] +from . import pert as _ # type: ignore[no-redef] from . import run as _ # type: ignore[no-redef] from .subcommand_dict import subcommand_dict diff --git a/qmb/_hamiltonian.cpp b/qmb/_hamiltonian.cpp index 634fd3b..1968002 100644 --- a/qmb/_hamiltonian.cpp +++ b/qmb/_hamiltonian.cpp @@ -84,6 +84,7 @@ TORCH_LIBRARY_FRAGMENT(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), m) { m.def("apply_within(Tensor configs_i, Tensor psi_i, Tensor configs_j, Tensor site, Tensor kind, Tensor coef) -> Tensor"); m.def("find_relative(Tensor configs_i, Tensor psi_i, int count_selected, Tensor site, Tensor kind, Tensor coef, Tensor configs_exclude) -> Tensor" ); + m.def("diagonal_term(Tensor configs, Tensor site, Tensor kind, Tensor coef) -> Tensor"); m.def("single_relative(Tensor configs, Tensor site, Tensor kind, Tensor coef) -> Tensor"); } #undef QMB_LIBRARY diff --git a/qmb/_hamiltonian_cuda.cu b/qmb/_hamiltonian_cuda.cu index c561e7e..74c7e55 100644 --- a/qmb/_hamiltonian_cuda.cu +++ b/qmb/_hamiltonian_cuda.cu @@ -674,6 +674,134 @@ auto find_relative_interface( return unique_nonzero_result_config; } +template +__device__ void diagonal_term_kernel( + std::int64_t term_index, + std::int64_t batch_index, + std::int64_t term_number, + std::int64_t batch_size, + const std::array* site, // term_number + const std::array* kind, // term_number + const std::array* coef, // term_number + const std::array* configs, // batch_size + std::array* result_psi +) { + std::array current_configs = 
configs[batch_index]; + auto [success, parity] = hamiltonian_apply_kernel( + /*current_configs=*/current_configs, + /*term_index=*/term_index, + /*batch_index=*/batch_index, + /*site=*/site, + /*kind=*/kind + ); + + if (!success) { + return; + } + auto less = array_less(); + if (less(current_configs, configs[batch_index]) || less(configs[batch_index], current_configs)) { + return; // The term does not apply to the current configuration + } + std::int8_t sign = parity ? -1 : +1; + atomicAdd(&result_psi[batch_index][0], sign * coef[term_index][0]); + atomicAdd(&result_psi[batch_index][1], sign * coef[term_index][1]); +} + +template +__global__ void diagonal_term_kernel_interface( + std::int64_t term_number, + std::int64_t batch_size, + const std::array* site, // term_number + const std::array* kind, // term_number + const std::array* coef, // term_number + const std::array* configs, // batch_size + std::array* result_psi +) { + std::int64_t term_index = blockIdx.x * blockDim.x + threadIdx.x; + std::int64_t batch_index = blockIdx.y * blockDim.y + threadIdx.y; + + if (term_index < term_number && batch_index < batch_size) { + diagonal_term_kernel( + /*term_index=*/term_index, + /*batch_index=*/batch_index, + /*term_number=*/term_number, + /*batch_size=*/batch_size, + /*site=*/site, + /*kind=*/kind, + /*coef=*/coef, + /*configs=*/configs, + /*result_psi=*/result_psi + ); + } +} + +template +auto diagonal_term_interface(const torch::Tensor& configs, const torch::Tensor& site, const torch::Tensor& kind, const torch::Tensor& coef) + -> torch::Tensor { + std::int64_t device_id = configs.device().index(); + std::int64_t batch_size = configs.size(0); + std::int64_t term_number = site.size(0); + at::cuda::CUDAGuard cuda_device_guard(device_id); + + TORCH_CHECK(configs.device().type() == torch::kCUDA, "configs must be on CUDA.") + TORCH_CHECK(configs.device().index() == device_id, "configs must be on the same device as others."); + TORCH_CHECK(configs.is_contiguous(), "configs 
must be contiguous.") + TORCH_CHECK(configs.dtype() == torch::kUInt8, "configs must be uint8.") + TORCH_CHECK(configs.dim() == 2, "configs must be 2D.") + TORCH_CHECK(configs.size(0) == batch_size, "configs batch size must match the provided batch_size."); + TORCH_CHECK(configs.size(1) == n_qubytes, "configs must have the same number of qubits as the provided n_qubytes."); + + TORCH_CHECK(site.device().type() == torch::kCUDA, "site must be on CUDA.") + TORCH_CHECK(site.device().index() == device_id, "site must be on the same device as others."); + TORCH_CHECK(site.is_contiguous(), "site must be contiguous.") + TORCH_CHECK(site.dtype() == torch::kInt16, "site must be int16.") + TORCH_CHECK(site.dim() == 2, "site must be 2D.") + TORCH_CHECK(site.size(0) == term_number, "site size must match the provided term_number."); + TORCH_CHECK(site.size(1) == max_op_number, "site must match the provided max_op_number."); + + TORCH_CHECK(kind.device().type() == torch::kCUDA, "kind must be on CUDA.") + TORCH_CHECK(kind.device().index() == device_id, "kind must be on the same device as others."); + TORCH_CHECK(kind.is_contiguous(), "kind must be contiguous.") + TORCH_CHECK(kind.dtype() == torch::kUInt8, "kind must be uint8.") + TORCH_CHECK(kind.dim() == 2, "kind must be 2D.") + TORCH_CHECK(kind.size(0) == term_number, "kind size must match the provided term_number."); + TORCH_CHECK(kind.size(1) == max_op_number, "kind must match the provided max_op_number."); + + TORCH_CHECK(coef.device().type() == torch::kCUDA, "coef must be on CUDA.") + TORCH_CHECK(coef.device().index() == device_id, "coef must be on the same device as others."); + TORCH_CHECK(coef.is_contiguous(), "coef must be contiguous.") + TORCH_CHECK(coef.dtype() == torch::kFloat64, "coef must be float64.") + TORCH_CHECK(coef.dim() == 2, "coef must be 2D.") + TORCH_CHECK(coef.size(0) == term_number, "coef size must match the provided term_number."); + TORCH_CHECK(coef.size(1) == 2, "coef must contain 2 elements for each 
term."); + + auto stream = at::cuda::getCurrentCUDAStream(device_id); + auto policy = thrust::device.on(stream); + + cudaDeviceProp prop; + AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id)); + std::int64_t max_threads_per_block = prop.maxThreadsPerBlock; + + auto result_psi = torch::zeros({batch_size, 2}, torch::TensorOptions().dtype(torch::kFloat64).device(torch::kCUDA, device_id)); + + auto threads_per_block = dim3{1, max_threads_per_block >> 1}; // I don't know why, but need to divide by 2 to avoid errors + auto num_blocks = + dim3{(term_number + threads_per_block.x - 1) / threads_per_block.x, (batch_size + threads_per_block.y - 1) / threads_per_block.y}; + + diagonal_term_kernel_interface<<>>( + /*term_number=*/term_number, + /*batch_size=*/batch_size, + /*site=*/reinterpret_cast*>(site.data_ptr()), + /*kind=*/reinterpret_cast*>(kind.data_ptr()), + /*coef=*/reinterpret_cast*>(coef.data_ptr()), + /*configs=*/reinterpret_cast*>(configs.data_ptr()), + /*result_psi=*/reinterpret_cast*>(result_psi.data_ptr()) + ); + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + + return result_psi; +} + template __device__ void single_relative_kernel( std::int64_t term_index, @@ -880,6 +1008,7 @@ auto single_relative_interface(const torch::Tensor& configs, const torch::Tensor TORCH_LIBRARY_IMPL(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), CUDA, m) { m.impl("apply_within", apply_within_interface); m.impl("find_relative", find_relative_interface); + m.impl("diagonal_term", diagonal_term_interface); m.impl("single_relative", single_relative_interface); } #undef QMB_LIBRARY diff --git a/qmb/fcidump.py b/qmb/fcidump.py index 5ffd75f..4a738e7 100644 --- a/qmb/fcidump.py +++ b/qmb/fcidump.py @@ -192,6 +192,9 @@ def apply_within(self, configs_i: torch.Tensor, psi_i: torch.Tensor, configs_j: def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_selected: int, configs_exclude: torch.Tensor | None = None) -> torch.Tensor: return self.hamiltonian.find_relative(configs_i, 
psi_i, count_selected, configs_exclude) + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.diagonal_term(configs) + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: return self.hamiltonian.single_relative(configs) diff --git a/qmb/free_fermion.py b/qmb/free_fermion.py new file mode 100644 index 0000000..562c3eb --- /dev/null +++ b/qmb/free_fermion.py @@ -0,0 +1,116 @@ +""" +This file offers an interface for defining free fermion models on a two-dimensional lattice. +""" + +import typing +import logging +import dataclasses +import torch +import tyro +from .hamiltonian import Hamiltonian +from .model_dict import model_dict, ModelProto, NetworkConfigProto + + +@dataclasses.dataclass +class ModelConfig: + """ + The configuration for the free fermion model. + """ + + # The width of the free fermion lattice + m: typing.Annotated[int, tyro.conf.Positional] + # The height of the free fermion lattice + n: typing.Annotated[int, tyro.conf.Positional] + + # The electron number + electron_number: typing.Annotated[int, tyro.conf.arg(aliases=["-e"])] + + def __post_init__(self) -> None: + if self.m <= 0 or self.n <= 0: + raise ValueError("The dimensions of the free fermion model must be positive integers.") + + if self.electron_number < 0 or self.electron_number > self.m * self.n: + raise ValueError(f"The electron number {self.electron_number} is out of bounds for a {self.m}x{self.n} lattice. Each site can host up to one electron.") + + +class Model(ModelProto[ModelConfig]): + """ + This class handles the free fermion model. 
+ """ + + network_dict: dict[str, type[NetworkConfigProto["Model"]]] = {} + + config_t = ModelConfig + + @classmethod + def default_group_name(cls, config: ModelConfig) -> str: + return f"FreeFermion_{config.m}x{config.n}_e{config.electron_number}" + + @classmethod + def _prepare_hamiltonian(cls, args: ModelConfig) -> dict[tuple[tuple[int, int], ...], complex]: + + def _index(i: int, j: int) -> int: + return i + j * args.m + + hamiltonian_dict: dict[tuple[tuple[int, int], ...], complex] = {} + for i in range(args.m): + for j in range(args.n): + # Nearest neighbor hopping + if i != 0: + hamiltonian_dict[(_index(i, j), 1), (_index(i - 1, j), 0)] = 1 + hamiltonian_dict[(_index(i - 1, j), 1), (_index(i, j), 0)] = 1 + if j != 0: + hamiltonian_dict[(_index(i, j), 1), (_index(i, j - 1), 0)] = 1 + hamiltonian_dict[(_index(i, j - 1), 1), (_index(i, j), 0)] = 1 + + return hamiltonian_dict + + def _calculate_ref_energy(self, hamiltonian_dict: dict[tuple[tuple[int, int], ...], complex]) -> float: + sites = self.m * self.n + hamiltonian = torch.zeros((sites, sites), dtype=torch.complex128) + for ((site_1, _), (site_2, _)), value in hamiltonian_dict.items(): + hamiltonian[site_1, site_2] = torch.tensor(value, dtype=torch.complex128) + return torch.linalg.eigh(hamiltonian).eigenvalues[:self.electron_number].sum().item() # pylint: disable=not-callable + + def __init__( + self, + args: ModelConfig, + ): + logging.info("Input arguments successfully parsed") + + assert args.electron_number is not None + self.m: int = args.m + self.n: int = args.n + self.electron_number: int = args.electron_number + logging.info("Constructing a free fermion model with dimensions: width = %d, height = %d", self.m, self.n) + logging.info("The parameters of the model are: N = %d", args.electron_number) + + logging.info("Initializing Hamiltonian for the lattice") + hamiltonian_dict: dict[tuple[tuple[int, int], ...], complex] = self._prepare_hamiltonian(args) + logging.info("Hamiltonian initialization 
complete") + + self.ref_energy: float = self._calculate_ref_energy(hamiltonian_dict) + logging.info("The ref energy is %.10f", self.ref_energy) + + logging.info("Converting the Hamiltonian to internal Hamiltonian representation") + self.hamiltonian: Hamiltonian = Hamiltonian(hamiltonian_dict, kind="fermi") + logging.info("Internal Hamiltonian representation for model has been successfully created") + + def apply_within(self, configs_i: torch.Tensor, psi_i: torch.Tensor, configs_j: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.apply_within(configs_i, psi_i, configs_j) + + def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_selected: int, configs_exclude: torch.Tensor | None = None) -> torch.Tensor: + return self.hamiltonian.find_relative(configs_i, psi_i, count_selected, configs_exclude) + + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.diagonal_term(configs) + + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.single_relative(configs) + + def show_config(self, config: torch.Tensor) -> str: + string = "".join(f"{i:08b}"[::-1] for i in config.cpu().numpy()) + return "[" + ".".join("".join("-" if string[i + j * self.m] == "0" else "x" for i in range(self.m)) for j in range(self.n)) + "]" + + +model_dict["free_fermion"] = Model diff --git a/qmb/haar.py b/qmb/haar.py index 4e517ef..00e5f80 100644 --- a/qmb/haar.py +++ b/qmb/haar.py @@ -316,8 +316,8 @@ def main(self, *, model_param: typing.Any = None, network_param: typing.Any = No "Krylov Extend Count: %d, " "Krylov Extend First: %s, " "Krylov Single Extend: %s, " - "krylov Iteration: %d, " - "krylov Threshold: %.10f, " + "Krylov Iteration: %d, " + "Krylov Threshold: %.10f, " "Loss Function: %s, " "Global Optimizer: %s, " "Use LBFGS: %s, " @@ -523,18 +523,4 @@ def closure() -> torch.Tensor: subcommand_dict["haar"] = HaarConfig - - -class ImagConfig(HaarConfig): - """ - Deprecated, use "haar" instead. 
- """ - - # pylint: disable=too-few-public-methods - - def __post_init__(self) -> None: - logging.warning("The 'imag' subcommand is deprecated, please use 'haar' instead.") - super().__post_init__() - - -subcommand_dict["imag"] = ImagConfig +subcommand_dict["imag"] = HaarConfig diff --git a/qmb/hamiltonian.py b/qmb/hamiltonian.py index 5dca2a6..af1e830 100644 --- a/qmb/hamiltonian.py +++ b/qmb/hamiltonian.py @@ -165,6 +165,26 @@ def find_relative( configs_j = _find_relative(configs_i, torch.view_as_real(psi_i), count_selected, self.site, self.kind, self.coef, configs_exclude) return configs_j + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + """ + Get the diagonal term of the Hamiltonian for the given configurations. + + Parameters + ---------- + configs : torch.Tensor + A uint8 tensor of shape [batch_size, n_qubytes] representing the input configurations. + + Returns + ------- + torch.Tensor + A complex64 tensor of shape [batch_size] representing the diagonal term of the Hamiltonian for the given configurations. + """ + self._prepare_data(configs.device) + _diagonal_term: typing.Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor] + _diagonal_term = getattr(self._load_module(configs.device.type, configs.size(1), self.particle_cut), "diagonal_term") + psi_result = torch.view_as_complex(_diagonal_term(configs, self.site, self.kind, self.coef)) + return psi_result + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: """ Find a single relative configuration for each configurations. 
diff --git a/qmb/hubbard.py b/qmb/hubbard.py index 6100e05..a65ec5e 100644 --- a/qmb/hubbard.py +++ b/qmb/hubbard.py @@ -112,6 +112,9 @@ def apply_within(self, configs_i: torch.Tensor, psi_i: torch.Tensor, configs_j: def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_selected: int, configs_exclude: torch.Tensor | None = None) -> torch.Tensor: return self.hamiltonian.find_relative(configs_i, psi_i, count_selected, configs_exclude) + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.diagonal_term(configs) + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: return self.hamiltonian.single_relative(configs) diff --git a/qmb/ising.py b/qmb/ising.py index 574d3bd..8d432ef 100644 --- a/qmb/ising.py +++ b/qmb/ising.py @@ -222,6 +222,9 @@ def apply_within(self, configs_i: torch.Tensor, psi_i: torch.Tensor, configs_j: def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_selected: int, configs_exclude: torch.Tensor | None = None) -> torch.Tensor: return self.hamiltonian.find_relative(configs_i, psi_i, count_selected, configs_exclude) + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.diagonal_term(configs) + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: return self.hamiltonian.single_relative(configs) @@ -335,8 +338,6 @@ class PepsConfig: D: typing.Annotated[int, tyro.conf.arg(aliases=["-d"])] = 4 # pylint: disable=invalid-name # The cut-off bond dimension of the network Dc: typing.Annotated[int, tyro.conf.arg(aliases=["-c"])] = 16 # pylint: disable=invalid-name - # Use complex tensors - use_complex: typing.Annotated[bool, tyro.conf.arg(aliases=["-z"])] = False def create(self, model: Model) -> NetworkProto: """ @@ -345,11 +346,9 @@ def create(self, model: Model) -> NetworkProto: logging.info( "PEPS network configuration: " "bond dimension: %d, " - "cut-off bond dimension: %d, " - "use complex: %s", + "cut-off bond 
dimension: %d", self.D, self.Dc, - self.use_complex, ) network = PepsFunction( @@ -358,7 +357,7 @@ def create(self, model: Model) -> NetworkProto: d=2, D=self.D, Dc=self.Dc, - use_complex=self.use_complex, + use_complex=True, ) return network diff --git a/qmb/markov.py b/qmb/markov.py new file mode 100644 index 0000000..0b6a568 --- /dev/null +++ b/qmb/markov.py @@ -0,0 +1,186 @@ +""" +This file implements a VMC method based on the Markov chain for solving quantum many-body problems. +""" + +import logging +import typing +import dataclasses +import torch +import torch.utils.tensorboard +import tyro +from .common import CommonConfig +from .subcommand_dict import subcommand_dict +from .optimizer import initialize_optimizer +from .bitspack import pack_int, unpack_int + + +@dataclasses.dataclass +class MarkovConfig: + """ + The VMC optimization based on the Markov chain for solving quantum many-body problems. + """ + + # pylint: disable=too-many-instance-attributes + + common: typing.Annotated[CommonConfig, tyro.conf.OmitArgPrefixes] + + # The sampling count + sampling_count: typing.Annotated[int, tyro.conf.arg(aliases=["-n"])] = 4000 + # The number of relative configurations to be used in energy calculation + relative_count: typing.Annotated[int, tyro.conf.arg(aliases=["-c"])] = 40000 + # Whether to use the global optimizer + global_opt: typing.Annotated[bool, tyro.conf.arg(aliases=["-g"])] = False + # Whether to use LBFGS instead of Adam + use_lbfgs: typing.Annotated[bool, tyro.conf.arg(aliases=["-2"])] = False + # The learning rate for the local optimizer + learning_rate: typing.Annotated[float, tyro.conf.arg(aliases=["-r"], help_behavior_hint="(default: 1e-3 for Adam, 1 for LBFGS)")] = -1 + # The number of steps for the local optimizer + local_step: typing.Annotated[int, tyro.conf.arg(aliases=["-s"])] = 1000 + # The initial configurations for the first step + initial_config: typing.Annotated[str, tyro.conf.arg(aliases=["-i"])] = "" + + def __post_init__(self) -> 
None: + if self.learning_rate == -1: + self.learning_rate = 1 if self.use_lbfgs else 1e-3 + + def main(self) -> None: + """ + The main function for the VMC optimization based on the Markov chain. + """ + # pylint: disable=too-many-statements + # pylint: disable=too-many-locals + + model, network, data = self.common.main() + + logging.info( + "Arguments Summary: " + "Sampling Count: %d, " + "Relative Count: %d, " + "Global Optimizer: %s, " + "Use LBFGS: %s, " + "Learning Rate: %.10f, " + "Local Steps: %d, ", + self.sampling_count, + self.relative_count, + "Yes" if self.global_opt else "No", + "Yes" if self.use_lbfgs else "No", + self.learning_rate, + self.local_step, + ) + + optimizer = initialize_optimizer( + network.parameters(), + use_lbfgs=self.use_lbfgs, + learning_rate=self.learning_rate, + state_dict=data.get("optimizer"), + ) + + if "markov" not in data: + data["markov"] = {"global": 0, "local": 0, "pool": None} + + # TODO: how should the pool size be determined? + configs = pack_int( + torch.tensor([[int(i) for i in self.initial_config]], dtype=torch.uint8, device=self.common.device), + size=1, + ) + if data["markov"]["pool"] is None: + data["markov"]["pool"] = configs + logging.info("The initial configuration is imported successfully.") + else: + logging.info("The initial configuration is provided, but the pool from the last iteration is not empty, so the initial configuration will be ignored.") + + writer = torch.utils.tensorboard.SummaryWriter(log_dir=self.common.folder()) # type: ignore[no-untyped-call] + + while True: + logging.info("Starting a new optimization cycle") + + logging.info("Checking the configuration pool") + config = data["markov"]["pool"] + old_config = config.repeat(self.sampling_count // config.shape[0] + 1, 1)[:self.sampling_count] + + logging.info("Hopping configurations") + def hop(config): + # TODO: use hamiltonian + x = unpack_int(config, size=1, last_dim=model.m * model.n) + x = x.view(-1, model.m, model.n) + batch, L1, L2 = x.shape + swap_dim = 
torch.randint(0, 2, (batch,)) + + out = x.clone() + for b in range(batch): + if swap_dim[b] == 0: # swap along the L1 direction + i = torch.randint(0, L1 - 1, (1,)).item() + j = torch.randint(0, L2, (1,)).item() + out[b, i, j], out[b, i+1, j] = x[b, i+1, j], x[b, i, j] + else: # swap along the L2 direction + i = torch.randint(0, L1, (1,)).item() + j = torch.randint(0, L2 - 1, (1,)).item() + out[b, i, j], out[b, i, j+1] = x[b, i, j+1], x[b, i, j] + x = out.view(-1, model.m * model.n) + return pack_int(x, size=1) + new_config = hop(old_config) + old_weight = network(old_config) + new_weight = network(new_config) + accept_prob = (new_weight / old_weight).abs().clamp(max=1)**2 + accept = torch.rand_like(accept_prob) < accept_prob + configs = torch.where(accept.unsqueeze(-1), new_config, old_config) + configs_i = torch.unique(configs, dim=0) + psi_i = network(configs_i) + data["markov"]["pool"] = configs_i + logging.info("Sampling completed, configurations count: %d", len(configs_i)) + + logging.info("Calculating relative configurations") + if self.relative_count <= len(configs_i): + configs_src = configs_i + configs_dst = configs_i + else: + configs_src = configs_i + configs_dst = torch.cat([configs_i, model.find_relative(configs_i, psi_i, self.relative_count - len(configs_i))]) + logging.info("Relative configurations calculated, count: %d", len(configs_dst)) + + optimizer = initialize_optimizer( + network.parameters(), + use_lbfgs=self.use_lbfgs, + learning_rate=self.learning_rate, + new_opt=not self.global_opt, + optimizer=optimizer, + ) + + def closure() -> torch.Tensor: + # Optimizing energy + optimizer.zero_grad() + psi_src = network(configs_src) + with torch.no_grad(): + psi_dst = network(configs_dst) + hamiltonian_psi_dst = model.apply_within(configs_dst, psi_dst, configs_src) + num = psi_src.conj() @ hamiltonian_psi_dst + den = psi_src.conj() @ psi_src.detach() + energy = num / den + energy = energy.real + energy.backward() # type: ignore[no-untyped-call] + return energy + + 
logging.info("Starting local optimization process") + + for i in range(self.local_step): + energy: torch.Tensor = optimizer.step(closure) # type: ignore[assignment,arg-type] + logging.info("Local optimization in progress, step: %d, energy: %.10f, ref energy: %.10f", i, energy.item(), model.ref_energy) + writer.add_scalar("markov/energy", energy, data["markov"]["local"]) # type: ignore[no-untyped-call] + writer.add_scalar("markov/error", energy - model.ref_energy, data["markov"]["local"]) # type: ignore[no-untyped-call] + data["markov"]["local"] += 1 + + logging.info("Local optimization process completed") + + writer.flush() # type: ignore[no-untyped-call] + + logging.info("Saving model checkpoint") + data["markov"]["global"] += 1 + data["network"] = network.state_dict() + data["optimizer"] = optimizer.state_dict() + self.common.save(data, data["markov"]["global"]) + logging.info("Checkpoint successfully saved") + + logging.info("Current optimization cycle completed") + + +subcommand_dict["markov"] = MarkovConfig diff --git a/qmb/model_dict.py b/qmb/model_dict.py index 2972c23..6d1a85e 100644 --- a/qmb/model_dict.py +++ b/qmb/model_dict.py @@ -173,6 +173,21 @@ def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_sele The relative configurations. """ + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + """ + Calculate the diagonal term for the given configurations. + + Parameters + ---------- + configs : torch.Tensor + The configurations to calculate the diagonal term for. + + Returns + ------- + torch.Tensor + The diagonal term of the configurations. + """ + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: """ Find a single relative configuration for each configurations. 
diff --git a/qmb/openfermion.py b/qmb/openfermion.py index 088e40e..7abe916 100644 --- a/qmb/openfermion.py +++ b/qmb/openfermion.py @@ -86,6 +86,9 @@ def apply_within(self, configs_i: torch.Tensor, psi_i: torch.Tensor, configs_j: def find_relative(self, configs_i: torch.Tensor, psi_i: torch.Tensor, count_selected: int, configs_exclude: torch.Tensor | None = None) -> torch.Tensor: return self.hamiltonian.find_relative(configs_i, psi_i, count_selected, configs_exclude) + def diagonal_term(self, configs: torch.Tensor) -> torch.Tensor: + return self.hamiltonian.diagonal_term(configs) + def single_relative(self, configs: torch.Tensor) -> torch.Tensor: return self.hamiltonian.single_relative(configs) diff --git a/qmb/peps.py b/qmb/peps.py index 1a38967..19c5b7a 100644 --- a/qmb/peps.py +++ b/qmb/peps.py @@ -30,7 +30,7 @@ def _tensor(self, l1: int, l2: int, config: torch.Tensor) -> torch.Tensor: """ # pylint: disable=unsubscriptable-object # Order: L, U, D, R - tensor: torch.Tensor = self.tensors[l1, l2, config] + tensor: torch.Tensor = self.tensors[l1, l2, config.to(torch.int64)] if l2 == 0: tensor = tensor[:, :1, :, :, :] if l1 == 0: diff --git a/qmb/pert.py b/qmb/pert.py new file mode 100644 index 0000000..84c3f40 --- /dev/null +++ b/qmb/pert.py @@ -0,0 +1,66 @@ +""" +This file implements a perturbation estimator from haar. +""" + +import logging +import typing +import dataclasses +import tyro +from .common import CommonConfig +from .subcommand_dict import subcommand_dict + + +@dataclasses.dataclass +class PerturbationConfig: + """ + The perturbation estimator from haar. + """ + + common: typing.Annotated[CommonConfig, tyro.conf.OmitArgPrefixes] + + def main(self, *, model_param: typing.Any = None, network_param: typing.Any = None) -> None: + """ + The main function of the perturbation estimator based on haar. 
+ """ + # pylint: disable=too-many-locals + # pylint: disable=too-many-statements + # pylint: disable=too-many-branches + + model, _, data = self.common.main(model_param=model_param, network_param=network_param) + + if "haar" not in data and "imag" in data: + data["haar"] = data.pop("imag") + configs, psi = data["haar"]["pool"] + configs = configs.to(self.common.device) + psi = psi.to(self.common.device) + + energy0_num = psi.conj() @ model.apply_within(configs, psi, configs) + energy0_den = psi.conj() @ psi + energy0 = (energy0_num / energy0_den).real.item() + logging.info("Current energy is %.8f", energy0) + logging.info("Reference energy is %.8f", model.ref_energy) + + number = configs.size(0) + last_result_number = 0 + current_target_number = number + logging.info("Starting finding relative configurations with %d.", number) + while True: + other_configs = model.find_relative(configs, psi, current_target_number, configs) + current_result_number = other_configs.size(0) + logging.info("Found %d relative configurations.", current_result_number) + if current_result_number == last_result_number: + logging.info("No new configurations found, stopping at %d.", current_result_number) + break + current_target_number = current_target_number * 2 + logging.info("Doubling target number to %d.", current_target_number) + break + + hamiltonian_psi = model.apply_within(configs, psi, other_configs) + energy2_num = (hamiltonian_psi.conj() @ hamiltonian_psi).real / (psi.conj() @ psi).real + energy2_den = energy0 - model.diagonal_term(other_configs).real + energy2 = (energy2_num / energy2_den).sum().item() + logging.info("Correct energy is %.8f", energy2) + logging.info("Error is reduced from %.8f to %.8f", energy0 - model.ref_energy, energy2 - model.ref_energy) + + +subcommand_dict["pert"] = PerturbationConfig