From 85f25693e1b4118d8f719824e846578bf18d52fa Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Thu, 19 Mar 2026 02:40:16 +0000 Subject: [PATCH 1/3] issue/1031 merge T1-1-32 --- include/infinicore/ops/cat.hpp | 9 + include/infinicore/ops/inner.hpp | 17 ++ include/infinicore/ops/masked_select.hpp | 16 ++ include/infinicore/ops/tan.hpp | 17 ++ include/infinicore/ops/tanhshrink.hpp | 16 ++ include/infiniop.h | 4 + include/infiniop/ops/inner.h | 28 +++ include/infiniop/ops/masked_select.h | 28 +++ include/infiniop/ops/tan.h | 26 +++ include/infiniop/ops/tanhshrink.h | 24 +++ python/infinicore/__init__.py | 8 + python/infinicore/nn/functional/__init__.py | 2 + python/infinicore/nn/functional/tanhshrink.py | 11 ++ python/infinicore/ops/cat.py | 12 ++ python/infinicore/ops/inner.py | 11 ++ python/infinicore/ops/masked_select.py | 5 + python/infinicore/ops/tan.py | 11 ++ src/infinicore/ops/cat/cat.cc | 165 ++++++++++++++++++ src/infinicore/ops/inner/inner.cc | 46 +++++ src/infinicore/ops/inner/inner_infiniop.cc | 59 +++++++ .../ops/masked_select/masked_select.cc | 33 ++++ .../masked_select/masked_select_infiniop.cc | 59 +++++++ src/infinicore/ops/tan/tan.cc | 32 ++++ src/infinicore/ops/tan/tan_infiniop.cc | 58 ++++++ src/infinicore/ops/tanhshrink/tanhshrink.cc | 32 ++++ .../ops/tanhshrink/tanhshrink_infiniop.cc | 58 ++++++ src/infinicore/pybind11/ops.hpp | 12 ++ src/infinicore/pybind11/ops/cat.hpp | 26 +++ src/infinicore/pybind11/ops/inner.hpp | 26 +++ src/infinicore/pybind11/ops/masked_select.hpp | 20 +++ src/infinicore/pybind11/ops/tan.hpp | 24 +++ src/infinicore/pybind11/ops/tanhshrink.hpp | 24 +++ src/infiniop/ops/inner/cpu/inner_cpu.cc | 88 ++++++++++ src/infiniop/ops/inner/cpu/inner_cpu.h | 7 + src/infiniop/ops/inner/cuda/kernel.cuh | 41 +++++ src/infiniop/ops/inner/info.h | 88 ++++++++++ src/infiniop/ops/inner/inner.h | 49 ++++++ src/infiniop/ops/inner/metax/inner_metax.h | 7 + src/infiniop/ops/inner/metax/inner_metax.maca | 123 +++++++++++++ 
.../ops/inner/metax/inner_metax_kernel.h | 59 +++++++ src/infiniop/ops/inner/moore/inner_moore.h | 7 + src/infiniop/ops/inner/moore/inner_moore.mu | 123 +++++++++++++ .../ops/inner/moore/inner_moore_kernel.h | 45 +++++ src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 123 +++++++++++++ .../ops/inner/nvidia/inner_nvidia.cuh | 7 + src/infiniop/ops/inner/operator.cc | 139 +++++++++++++++ .../masked_select/cpu/masked_select_cpu.cc | 58 ++++++ .../ops/masked_select/cpu/masked_select_cpu.h | 7 + .../ops/masked_select/cuda/kernel.cuh | 101 +++++++++++ src/infiniop/ops/masked_select/info.h | 53 ++++++ .../ops/masked_select/masked_select.h | 48 +++++ .../masked_select/metax/masked_select_metax.h | 7 + .../metax/masked_select_metax.maca | 154 ++++++++++++++++ .../metax/masked_select_metax_kernel.h | 101 +++++++++++ .../masked_select/moore/masked_select_moore.h | 7 + .../moore/masked_select_moore.mu | 156 +++++++++++++++++ .../moore/masked_select_moore_kernel.h | 101 +++++++++++ .../nvidia/masked_select_nvidia.cu | 149 ++++++++++++++++ .../nvidia/masked_select_nvidia.cuh | 8 + src/infiniop/ops/masked_select/operator.cc | 138 +++++++++++++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 50 ++++++ src/infiniop/ops/tan/cpu/tan_cpu.h | 23 +++ src/infiniop/ops/tan/cuda/kernel.cuh | 30 ++++ src/infiniop/ops/tan/metax/tan_metax.h | 8 + src/infiniop/ops/tan/metax/tan_metax.maca | 56 ++++++ src/infiniop/ops/tan/moore/tan_moore.h | 8 + src/infiniop/ops/tan/moore/tan_moore.mu | 56 ++++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 57 ++++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 157 +++++++++++++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.cc | 50 ++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.h | 23 +++ src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 30 ++++ .../ops/tanhshrink/metax/tanhshrink_metax.h | 8 + .../tanhshrink/metax/tanhshrink_metax.maca | 56 ++++++ .../ops/tanhshrink/moore/tanhshrink_moore.h | 8 + .../ops/tanhshrink/moore/tanhshrink_moore.mu | 
56 ++++++ .../moore/tanhshrink_moore_kernel.h | 28 +++ .../tanhshrink/nvidia/tanhshrink_nvidia.cu | 59 +++++++ .../tanhshrink/nvidia/tanhshrink_nvidia.cuh | 8 + src/infiniop/ops/tanhshrink/operator.cc | 157 +++++++++++++++++ test/infinicore/ops/cat.py | 6 +- test/infinicore/ops/inner.py | 6 +- test/infinicore/ops/masked_select.py | 6 +- test/infinicore/ops/tan.py | 6 +- test/infinicore/ops/tanhshrink.py | 6 +- 86 files changed, 3864 insertions(+), 15 deletions(-) create mode 100644 include/infinicore/ops/cat.hpp create mode 100644 include/infinicore/ops/inner.hpp create mode 100644 include/infinicore/ops/masked_select.hpp create mode 100644 include/infinicore/ops/tan.hpp create mode 100644 include/infinicore/ops/tanhshrink.hpp create mode 100644 include/infiniop/ops/inner.h create mode 100644 include/infiniop/ops/masked_select.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 include/infiniop/ops/tanhshrink.h create mode 100644 python/infinicore/nn/functional/tanhshrink.py create mode 100644 python/infinicore/ops/cat.py create mode 100644 python/infinicore/ops/inner.py create mode 100644 python/infinicore/ops/masked_select.py create mode 100644 python/infinicore/ops/tan.py create mode 100644 src/infinicore/ops/cat/cat.cc create mode 100644 src/infinicore/ops/inner/inner.cc create mode 100644 src/infinicore/ops/inner/inner_infiniop.cc create mode 100644 src/infinicore/ops/masked_select/masked_select.cc create mode 100644 src/infinicore/ops/masked_select/masked_select_infiniop.cc create mode 100644 src/infinicore/ops/tan/tan.cc create mode 100644 src/infinicore/ops/tan/tan_infiniop.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/cat.hpp create mode 100644 src/infinicore/pybind11/ops/inner.hpp create mode 100644 src/infinicore/pybind11/ops/masked_select.hpp create mode 100644 src/infinicore/pybind11/ops/tan.hpp 
create mode 100644 src/infinicore/pybind11/ops/tanhshrink.hpp create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.cc create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.h create mode 100644 src/infiniop/ops/inner/cuda/kernel.cuh create mode 100644 src/infiniop/ops/inner/info.h create mode 100644 src/infiniop/ops/inner/inner.h create mode 100644 src/infiniop/ops/inner/metax/inner_metax.h create mode 100644 src/infiniop/ops/inner/metax/inner_metax.maca create mode 100644 src/infiniop/ops/inner/metax/inner_metax_kernel.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.mu create mode 100644 src/infiniop/ops/inner/moore/inner_moore_kernel.h create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cu create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cuh create mode 100644 src/infiniop/ops/inner/operator.cc create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.h create mode 100644 src/infiniop/ops/masked_select/cuda/kernel.cuh create mode 100644 src/infiniop/ops/masked_select/info.h create mode 100644 src/infiniop/ops/masked_select/masked_select.h create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.h create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.maca create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.mu create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh create mode 100644 src/infiniop/ops/masked_select/operator.cc create mode 100644 
src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/metax/tan_metax.h create mode 100644 src/infiniop/ops/tan/metax/tan_metax.maca create mode 100644 src/infiniop/ops/tan/moore/tan_moore.h create mode 100644 src/infiniop/ops/tan/moore/tan_moore.mu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h create mode 100644 src/infiniop/ops/tanhshrink/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh create mode 100644 src/infiniop/ops/tanhshrink/operator.cc diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp new file mode 100644 index 000000000..5fdfdcce8 --- /dev/null +++ b/include/infinicore/ops/cat.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "common/op.hpp" + +namespace infinicore::op { + +Tensor cat(std::vector tensors, int dim); +void cat_(Tensor out, std::vector tensors, int dim); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp new file mode 100644 index 000000000..017ddc604 --- /dev/null +++ b/include/infinicore/ops/inner.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include 
"../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Inner { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor out, Tensor input, Tensor other); + static common::OpDispatcher &dispatcher(); +}; + +Tensor inner(Tensor input, Tensor other); +void inner_(Tensor out, Tensor input, Tensor other); + +} \ No newline at end of file diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp new file mode 100644 index 000000000..89bbfa49d --- /dev/null +++ b/include/infinicore/ops/masked_select.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class MaskedSelect { +public: + using schema = void (*)(Tensor, Tensor, void **, size_t *); + static void execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr); + static common::OpDispatcher &dispatcher(); +}; + +Tensor masked_select(Tensor input, Tensor mask); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp new file mode 100644 index 000000000..9c2d7a860 --- /dev/null +++ b/include/infinicore/ops/tan.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tan { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tan(Tensor input); +void tan_(Tensor output, Tensor input); + +} \ No newline at end of file diff --git a/include/infinicore/ops/tanhshrink.hpp b/include/infinicore/ops/tanhshrink.hpp new file mode 100644 index 000000000..83673b22c --- /dev/null +++ b/include/infinicore/ops/tanhshrink.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tanhshrink { +public: + using schema = void (*)(Tensor, Tensor); + 
static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tanhshrink(Tensor input); +void tanhshrink_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 2b30ff4ae..b5aa2d966 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -30,9 +30,11 @@ #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/masked_select.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" @@ -54,8 +56,10 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/sum.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topk.h" +#include "infiniop/ops/tanhshrink.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/var.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h new file mode 100644 index 000000000..f484cde9c --- /dev/null +++ b/include/infiniop/ops/inner.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_INNER_API_H__ +#define __INFINIOP_INNER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc); + +__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void 
*other, + void *stream); + +__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h new file mode 100644 index 000000000..7adf65415 --- /dev/null +++ b/include/infiniop/ops/masked_select.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_MASKED_SELECT_API_H__ +#define __INFINIOP_MASKED_SELECT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc); + +__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h new file mode 100644 index 000000000..2e20c61da --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan( + 
infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h new file mode 100644 index 000000000..e136f6523 --- /dev/null +++ b/include/infiniop/ops/tanhshrink.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANHSHRINK_API_H__ +#define __INFINIOP_TANHSHRINK_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index a895b0426..f5e9a72b0 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -64,6 +64,9 @@ from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching +from infinicore.ops.cat import cat +from infinicore.ops.inner import inner +from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mha_kvcache import mha_kvcache from infinicore.ops.mha_varlen import mha_varlen @@ -80,6 +83,7 @@ from infinicore.ops.unsqueeze import unsqueeze from infinicore.ops.var import var from 
infinicore.ops.var_mean import var_mean +from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, @@ -151,6 +155,9 @@ "baddbmm", "bilinear", "fmod", + "cat", + "inner", + "masked_select", "matmul", "equal", "mul", @@ -159,6 +166,7 @@ "unsqueeze", "rearrange", "cross_entropy", + "tan", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 815eb8a2b..91e101f9d 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -14,6 +14,7 @@ from .silu import silu from .silu_and_mul import silu_and_mul from .swiglu import swiglu +from .tanhshrink import tanhshrink __all__ = [ "adaptive_max_pool1d", @@ -33,4 +34,5 @@ "swiglu", "linear_w8a8i8", "silu_and_mul", + "tanhshrink" ] diff --git a/python/infinicore/nn/functional/tanhshrink.py b/python/infinicore/nn/functional/tanhshrink.py new file mode 100644 index 000000000..10d17077d --- /dev/null +++ b/python/infinicore/nn/functional/tanhshrink.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tanhshrink(input, *, out=None): + if out is None: + return Tensor(_infinicore.tanhshrink(input._underlying)) + + _infinicore.tanhshrink_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py new file mode 100644 index 000000000..3a3e1470f --- /dev/null +++ b/python/infinicore/ops/cat.py @@ -0,0 +1,12 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def cat(tensors, dim:int=0, *,out=None): + if out is None: + return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) + + _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) + + # raise RuntimeError("breakpointer!!!") + return out diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py new file mode 
100644 index 000000000..df87c5b34 --- /dev/null +++ b/python/infinicore/ops/inner.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def inner(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.inner(input._underlying, other._underlying)) + + _infinicore.inner_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py new file mode 100644 index 000000000..d1846edfc --- /dev/null +++ b/python/infinicore/ops/masked_select.py @@ -0,0 +1,5 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def masked_select(input, mask): + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file diff --git a/python/infinicore/ops/tan.py b/python/infinicore/ops/tan.py new file mode 100644 index 000000000..da438e6d8 --- /dev/null +++ b/python/infinicore/ops/tan.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tan(input, *, out=None): + if out is None: + return Tensor(_infinicore.tan(input._underlying)) + + _infinicore.tan_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc new file mode 100644 index 000000000..908f5f457 --- /dev/null +++ b/src/infinicore/ops/cat/cat.cc @@ -0,0 +1,165 @@ +#include "infinicore/ops/cat.hpp" +#include "infinicore/context/context.hpp" +#include +#include +namespace infinicore::op { + +namespace { + +class CatInfo { + + CatInfo() = default; + +public: + + int dim; + int ndim; + size_t tensors_size; + + std::vector contiguous_dim; + std::vector copy_size; + + static CatInfo create(Tensor& out, std::vector& tensors, int dim) { + + int ndim = out->ndim(); + size_t tensors_size = tensors.size(); + + std::vector contiguous_dim(tensors_size, ndim); + std::vector 
copy_size(tensors_size, dsize(out->dtype())); + + for (int i = 0; i < tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + if (tensors[i]->stride(ndim - 1) == 1) { + contiguous_dim[i] = ndim - 1; + for (int j = ndim - 2; j >= dim; j --) { + if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { + contiguous_dim[i] = j; + } + } + } + for (int j = contiguous_dim[i]; j < ndim; j ++) { + copy_size[i] *= tensors[i]->shape()[j]; + } + } + + return CatInfo{dim, ndim, tensors_size, contiguous_dim, copy_size}; + } +}; + + +void low_dim_copy( + CatInfo &info, Tensor& tensor, Tensor& out, + std::byte *tensor_ptr, std::byte *out_ptr, + int depth, int tensor_pos) { + + if (depth != info.contiguous_dim[tensor_pos]) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; i ++) { + + low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); + + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + if (out->device().getType() == Device::Type::CPU) { + + std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } else { + + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } + } +} + +void high_dim_split( + CatInfo &info, std::vector& tensors, Tensor& out, + std::vector tensors_ptr, std::byte* out_ptr, + int depth) { + + if (depth != info.dim) { + std::vector now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < out->shape()[depth]; i ++) { + + high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); + } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = 
out_ptr; + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + + low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); + + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + } + } +} + +} // namespace + +Tensor cat(std::vector tensors, int dim) { + assert(tensors.size() >= 2); + int ndim = tensors[0]->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + Shape shape = tensors[0]->shape(); + for (int i = 1; i < tensors.size(); i ++) { + assert(tensors[i]->ndim() == dim || tensors[i]->ndim() == 1); + if (tensors[i]->ndim() != ndim) continue; + shape[dim] += tensors[i]->shape()[dim]; + } + + auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); + cat_(out, tensors, dim); + return out; +} + +void cat_(Tensor out, std::vector tensors, int dim) { + // assert the parameter properties are correct. + assert(tensors.size() >= 2); + int ndim = out->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + size_t dim_shape = 0; + for (auto& tensor : tensors) { + assert(tensor->ndim() == ndim || tensors[i]->ndim() == 1); + if (tensor->ndim() == 1) { + assert(tensor->shape()[0] == 0); + continue; + } + for (int i = 0; i < ndim; i ++) { + if (i != dim) { + assert(tensor->shape()[i] == out->shape()[i]); + } else { + dim_shape += tensor->shape()[i]; + } + } + } + assert(dim_shape == out->shape()[dim]); + + // Get info + CatInfo info = CatInfo::create(out, tensors, dim); + std::vector tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i ++) { + tensors_ptr[i] = tensors[i]->data(); + } + std::byte* out_ptr = out->data(); + + high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); + +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc new file mode 100644 index 000000000..271ad101c --- /dev/null +++ 
b/src/infinicore/ops/inner/inner.cc @@ -0,0 +1,46 @@ +#include +#include + +namespace infinicore::op { + + +common::OpDispatcher &Inner::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Inner::execute(Tensor out, Tensor input, Tensor other) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(out, input, other); +} + +Tensor inner(Tensor input, Tensor other) { + size_t input_ndim = input->ndim(); + size_t other_ndim = other->ndim(); + + assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); + + Shape out_shape; + for (int i = 0; i < input_ndim - 1; i ++) + out_shape.push_back(input->shape()[i]); + for (int i = 0; i < other_ndim - 1; i ++) + out_shape.push_back(other->shape()[i]); + auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); + + inner_(out, input, other); + return out; +} + +void inner_(Tensor out, Tensor input, Tensor other) { + + Inner::execute(out, input, other); + +} + +} \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc new file mode 100644 index 000000000..fe6b8de9a --- /dev/null +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/inner.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::inner_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopInnerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyInnerDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor other) { + size_t seed = hash_combine(out, input, other); + + auto device_type = 
context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopInnerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( + context::getInfiniopHandle(input->device()), &desc, + out->desc(), input->desc(), other->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopInner( + desc, workspace->data(), workspace_size, + out->data(), input->data(), other->data(), context::getStream())); +} + +static bool registered = []() { + Inner::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} \ No newline at end of file diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc new file mode 100644 index 000000000..13ac6dfe4 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select.cc @@ -0,0 +1,33 @@ +#include +#include + +namespace infinicore::op { + +common::OpDispatcher &MaskedSelect::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, mask, data_ptr, dlen_ptr); +} + +Tensor masked_select(Tensor input, Tensor mask) { + + std::byte *data; + size_t dlen; + 
MaskedSelect::execute(input, mask, (void **)&data, &dlen); + + auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); + + return out; +} + +} diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc new file mode 100644 index 000000000..0144ce425 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/masked_select.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::masked_select_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopMaskedSelectDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaskedSelectDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + size_t seed = hash_combine(input, mask, (std::uintptr_t)data_ptr, (std::uintptr_t)dlen_ptr); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaskedSelectDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaskedSelectDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), mask->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopMaskedSelect( + desc, workspace->data(), workspace_size, + input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); +} + +static bool registered = []() { + 
MaskedSelect::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} diff --git a/src/infinicore/ops/tan/tan.cc b/src/infinicore/ops/tan/tan.cc new file mode 100644 index 000000000..20bcab4b5 --- /dev/null +++ b/src/infinicore/ops/tan/tan.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tan::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tan::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tan implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tan(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tan_(output, input); + return output; +} + +void tan_(Tensor output, Tensor input) { + Tan::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc new file mode 100644 index 000000000..1d7c9c874 --- /dev/null +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op::tan_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto 
device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTan( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tan::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // namespace infinicore::op::tan_impl::infiniop diff --git a/src/infinicore/ops/tanhshrink/tanhshrink.cc b/src/infinicore/ops/tanhshrink/tanhshrink.cc new file mode 100644 index 000000000..ea96e46f4 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tanhshrink::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tanhshrink::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tanhshrink implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tanhshrink(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tanhshrink_(output, input); + return output; +} + 
+void tanhshrink_(Tensor output, Tensor input) { + Tanhshrink::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc new file mode 100644 index 000000000..ca440f875 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op::tanhshrink_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanhshrinkDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanhshrinkDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanhshrinkDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanhshrinkDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanhshrinkWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTanhshrink( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tanhshrink::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // 
namespace infinicore::op::tanhshrink_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index a468defd8..ffc2e010c 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -14,6 +14,7 @@ #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/binary_cross_entropy_with_logits.hpp" +#include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/cdist.hpp" #include "ops/cross_entropy.hpp" @@ -26,6 +27,9 @@ #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" +#include "ops/inner.hpp" +#include "ops/linear.hpp" +#include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mha_kvcache.hpp" #include "ops/mha_varlen.hpp" @@ -45,6 +49,8 @@ #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" namespace py = pybind11; @@ -62,8 +68,12 @@ inline void bind(py::module &m) { bind_flash_attention(m); bind_kv_caching(m); bind_fmod(m); + bind_cat(m); + bind_causal_softmax(m); + bind_inner(m); bind_random_sample(m); bind_linear(m); + bind_masked_select(m); bind_matmul(m); bind_mul(m); bind_mha_kvcache(m); @@ -80,6 +90,8 @@ inline void bind(py::module &m) { bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); + bind_tan(m); + bind_tanhshrink(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp new file mode 100644 index 000000000..2ef3392b0 --- /dev/null +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/cat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cat(py::module &m) { + + m.def("cat", + &op::cat, + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, out-of-place mode)doc"); + + m.def("cat_", + &op::cat_, + py::arg("out"), + py::arg("tensors"), + py::arg("dim") = 
0, + R"doc(opertor: torch.cat, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp new file mode 100644 index 000000000..16ee91932 --- /dev/null +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/inner.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_inner(py::module &m) { + + m.def("inner", + &op::inner, + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, out-of-place mode)doc"); + + m.def("inner_", + &op::inner_, + py::arg("out"), + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp new file mode 100644 index 000000000..2734590e6 --- /dev/null +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "infinicore/ops/masked_select.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_masked_select(py::module &m) { + + m.def("masked_select", + &op::masked_select, + py::arg("input"), + py::arg("mask"), + R"doc(opertor: torch.masked_select, out-of-place mode)doc"); + +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/tan.hpp b/src/infinicore/pybind11/ops/tan.hpp new file mode 100644 index 000000000..acd62e048 --- /dev/null +++ b/src/infinicore/pybind11/ops/tan.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tan.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tan(py::module &m) { + m.def("tan", + &op::tan, + py::arg("input"), + R"doc(opertor: torch.tan, out-of-place mode)doc"); + + m.def("tan_", + &op::tan_, + py::arg("output"), + 
py::arg("input"), + R"doc(opertor: torch.tan, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/tanhshrink.hpp b/src/infinicore/pybind11/ops/tanhshrink.hpp new file mode 100644 index 000000000..3780555f4 --- /dev/null +++ b/src/infinicore/pybind11/ops/tanhshrink.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tanhshrink.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tanhshrink(py::module &m) { + m.def("tanhshrink", + &op::tanhshrink, + py::arg("input"), + R"doc(opertor: torch.tanhshrink, out-of-place mode)doc"); + + m.def("tanhshrink_", + &op::tanhshrink_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tanhshrink, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc new file mode 100644 index 000000000..5915cb3fc --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -0,0 +1,88 @@ +#include "inner_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::inner::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { +#pragma omp parallel for + for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { + + size_t out_dim_pos = info->out_ndim - 1; + size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); + size_t 
index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < info->oper_len; i ++) { + if constexpr (std::is_same::value || std::is_same::value) { + tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); + } else { + tmp += input[input_offset] * other[other_offset]; + } + input_offset += info->input_strides[info->input_ndim - 1]; + other_offset += info->other_strides[info->other_ndim - 1]; + } + + if constexpr (std::is_same::value || std::is_same::value) { + out[out_offset] = utils::cast(tmp); + } else { + out[out_offset] = tmp; + } + } + + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void* workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F32){ + CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h new file mode 100644 index 000000000..5d4774dfa --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -0,0 +1,7 @@ +#ifndef __INNER_CPU_H__ 
+#define __INNER_CPU_H__ +#include "../inner.h" + +DESCRIPTOR(cpu); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh new file mode 100644 index 000000000..1616a4281 --- /dev/null +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __INNER_KERNEL_CUH__ +#define __INNER_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + T tmp = 0; + for (size_t i = 0; i < oper_len; i ++) { + tmp += input[input_offset] * other[other_offset]; + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h new file mode 100644 index 000000000..2484277eb --- /dev/null +++ b/src/infiniop/ops/inner/info.h @@ -0,0 +1,88 @@ +#ifndef __INNER_INFO_H__ +#define __INNER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::inner { + +class InnerInfo { + 
InnerInfo() = default; + +public: + infiniDtype_t dtype; + + size_t input_ndim; + size_t other_ndim; + size_t out_ndim; + + size_t oper_len; + size_t total_elements; + + std::vector out_shape; + + std::vector input_strides; + std::vector other_strides; + std::vector out_strides; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto dtype = out_desc->dtype(); + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + return INFINI_STATUS_BAD_TENSOR_DTYPE; + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + auto input_ndim = input_desc->ndim(); + auto other_ndim = other_desc->ndim(); + auto out_ndim = out_desc->ndim(); + if (out_ndim + 2 != input_ndim + other_ndim) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + + auto input_shape = input_desc->shape(); + auto other_shape = other_desc->shape(); + auto out_shape = out_desc->shape(); + if (input_ndim && other_ndim) { + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t out_dim_pos = 0; + if (input_ndim) { + for (size_t i = 0; i < input_ndim - 1; i ++) + if (input_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (other_ndim) { + for (size_t i = 0; i < other_ndim - 1; i ++) + if (other_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t oper_len = 1; + if (input_ndim && other_ndim) { + oper_len = input_shape[input_ndim - 1]; + } + + size_t total_elements = 1; + for (size_t i = 0; i < out_ndim; i ++) + total_elements *= out_shape[i]; + + auto input_strides = input_desc->strides(); + auto other_strides = other_desc->strides(); + auto out_strides = out_desc->strides(); + + return utils::Result(InnerInfo{ + dtype, + input_ndim, other_ndim, out_ndim, + oper_len, total_elements, out_shape, + input_strides, other_strides, out_strides + 
}); + } +}; + +} // namespace op::inner + +#endif // __INNER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h new file mode 100644 index 000000000..5c44262a7 --- /dev/null +++ b/src/infiniop/ops/inner/inner.h @@ -0,0 +1,49 @@ +#ifndef Inner_H +#define Inner_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::inner::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + InnerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + InnerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t other_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *other, \ + void *stream) const; \ + }; \ + } + + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h new file mode 100644 index 000000000..04a47d445 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -0,0 +1,7 @@ +#ifndef __INNER_METAX_H__ +#define __INNER_METAX_H__ +#include "../inner.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca new file mode 100644 index 000000000..683f4a1e1 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -0,0 +1,123 
@@ +#include "../../../devices/metax/metax_common.h" +#include "inner_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "inner_metax_kernel.h" + +namespace op::inner::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_hc = input_strides_hc + input_ndim; + ptrdiff_t *out_strides_hc = other_strides_hc + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == 
workspace_size); + + CHECK_METAX(hcMemcpyAsync(out_shape_hc, info.out_shape.data(), out_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(other_strides_hc, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(out_strides_hc, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_hc, + input_strides_hc, other_strides_hc, out_strides_hc, + input_ndim, other_ndim, out_ndim); + + // CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) + } else { + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h new file mode 100644 index 000000000..52f37e1ef --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -0,0 +1,59 @@ +#ifndef __INNER_METAX_KERNEL_H__ +#define __INNER_METAX_KERNEL_H__ + +template +INFINIOP_METAX_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + // T tmp = input[input_offset] * other[other_offset]; + // input_offset += input_strides[input_ndim - 1]; + // other_offset += other_strides[other_ndim - 1]; + T tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + if constexpr (std::is_same_v || std::is_same_v) { + tmp += input[input_offset] * other[other_offset]; + } else { + // if (out_index == 0) { + // printf("input: %.10f\tother:%.10f\n", input[input_offset], other[other_offset]); + // } + T mul_a = __fmul_rn(input[input_offset], other[other_offset]); + // if (out_index == 0) { + // printf("mul answer: %.10f\n", mul_a); + // printf("tmp 
val: %.10f\n", tmp); + // } + tmp = __fadd_rd(tmp, mul_a); + // if (out_index == 0) + // printf("add answer: %.10f\n", tmp); + } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + // out[out_offset] = static_cast(tmp); + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h new file mode 100644 index 000000000..a508a6718 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -0,0 +1,7 @@ +#ifndef __INNER_MOORE_H__ +#define __INNER_MOORE_H__ +#include "../inner.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu new file mode 100644 index 000000000..61a430d2c --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "inner_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "inner_moore_kernel.h" + +namespace op::inner::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return 
INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_musa = input_strides_musa + input_ndim; + ptrdiff_t *out_strides_musa = other_strides_musa + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(out_shape_musa, info.out_shape.data(), out_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(other_strides_musa, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(out_strides_musa, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_musa, + input_strides_musa, other_strides_musa, out_strides_musa, + input_ndim, other_ndim, out_ndim); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void 
*input, + const void *other, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h new file mode 100644 index 000000000..de82dc8a9 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -0,0 +1,45 @@ +#ifndef __INNER_MOORE_KERNEL_H__ +#define __INNER_MOORE_KERNEL_H__ + +template +INFINIOP_MOORE_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, 
out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + // if constexpr (std::is_same_v || std::is_same_v) { + tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); + // } else { + // tmp += input[input_offset] * other[other_offset]; + // } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = static_cast(tmp); +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu new file mode 100644 index 000000000..e2b3ed0f9 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -0,0 +1,123 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "inner_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::inner::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + 
workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_cuda = input_strides_cuda + input_ndim; + ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(out_strides_cuda, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_cuda, + input_strides_cuda, other_strides_cuda, out_strides_cuda, + 
input_ndim, other_ndim, out_ndim); + + // CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh new file mode 100644 index 000000000..11033c2c0 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INNER_NVIDIA_H__ +#define __INNER_NVIDIA_H__ +#include "../inner.h" + +DESCRIPTOR(nvidia); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc new file mode 100644 index 000000000..c68762d0e --- /dev/null +++ 
b/src/infiniop/ops/inner/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/inner.h" + +#ifdef ENABLE_CPU_API +#include "cpu/inner_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/inner_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/inner_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/inner_moore.h" +#endif + +__C infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ + other_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, input, other, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc new file mode 100644 index 000000000..ae74587d8 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -0,0 +1,58 @@ +#include "masked_select_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::masked_select::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor 
**desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); + } + } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(maskedSelect(&_info, (const float *)input, mask, data_ptr, dlen_ptr)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h new file mode 100644 index 000000000..9fed9e2e6 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_CPU_H__ +#define __MASKED_SELECT_CPU_H__ +#include "../masked_select.h" + +DESCRIPTOR(cpu) + +#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh 
b/src/infiniop/ops/masked_select/cuda/kernel.cuh new file mode 100644 index 000000000..fc7485f31 --- /dev/null +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h new file mode 100644 index 000000000..889b32821 --- /dev/null +++ b/src/infiniop/ops/masked_select/info.h @@ -0,0 +1,53 @@ +#ifndef __MASKED_SELECT__INFO_H__ +#define __MASKED_SELECT__INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::masked_select { + +class MaskedSelectInfo { + MaskedSelectInfo() = 
default; + +public: + infiniDtype_t dtype; + size_t ndim; + size_t total_elements; + + std::vector shape; + + std::vector input_strides; + std::vector mask_strides; + + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); + + auto shape = input_desc->shape(); + CHECK_SAME_SHAPE(shape, mask_desc->shape()); + + auto ndim = input_desc->ndim(); + + size_t total_elements = 1; + for (auto& dim : shape) { + total_elements *= dim; + } + + auto input_strides = input_desc->strides(); + auto mask_strides = mask_desc->strides(); + + return utils::Result(MaskedSelectInfo{ + dtype, + ndim, + total_elements, + shape, + input_strides, + mask_strides}); + } +}; + +} // namespace op::masked_select + +#endif // __MASKED_SELECT_INFO_H__ diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h new file mode 100644 index 000000000..98148ebaf --- /dev/null +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -0,0 +1,48 @@ +#ifndef MASKED_SELECT_H +#define MASKED_SELECT_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::masked_select::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MaskedSelectInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MaskedSelectInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + 
infiniopTensorDescriptor_t mask_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const bool *mask, \ + void **data_ptr, \ + size_t *dlen_ptr, \ + void *stream) const; \ + }; \ + } + +#endif // MASKED_SELECT_H \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h new file mode 100644 index 000000000..47460777f --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_METAX_H__ +#define __MASKED_SELECT_METAX_H__ +#include "../masked_select.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca new file mode 100644 index 000000000..4a4181153 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -0,0 +1,154 @@ +#include "../../../devices/metax/metax_common.h" +#include "masked_select_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_metax_kernel.h" + +namespace op::masked_select::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += 
info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(shape_hc, info.shape.data(), ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(mask_strides_hc, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < 
total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_METAX(hcDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_METAX(hcDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); + + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // context::setDevice(METAX); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_hc, input_strides_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + 
} + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h new file mode 100644 index 000000000..e590de955 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ +#define __MASKED_SELECT_METAX_KERNEL_CUH__ + +template +INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h new file mode 100644 index 000000000..24548d8fc --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_MOORE_H__ +#define __MASKED_SELECT_MOORE_H__ +#include "../masked_select.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff 
--git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu new file mode 100644 index 000000000..3c218bc19 --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -0,0 +1,156 @@ +#include "../../../devices/moore/moore_common.h" +#include "masked_select_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_moore_kernel.h" + +namespace op::masked_select::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t 
*input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(shape_musa, info.shape.data(), ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(mask_strides_musa, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_MOORE(musaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_MOORE(musaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); + + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(MOORE); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_musa, input_strides_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h new file mode 100644 index 000000000..dcdd87c6d --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ +#define __MASKED_SELECT_MOORE_KERNEL_CUH__ + +template +INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, 
size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( + const T *input, const 
bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu new file mode 100644 index 000000000..eebba0fd3 --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -0,0 +1,149 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "masked_select_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::masked_select::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace 
{ + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, 
total_elements, stride); + CHECK_CUDA(cudaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_CUDA(cudaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + CHECK_CUDA(cudaMemcpy(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost)); + + CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh
b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh new file mode 100644 index 000000000..bc493edbe --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MASKED_SELECT_NVIDIA_H__ +#define __MASKED_SELECT_NVIDIA_H__ + +#include "../masked_select.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc new file mode 100644 index 000000000..055b67b12 --- /dev/null +++ b/src/infiniop/ops/masked_select/operator.cc @@ -0,0 +1,138 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/masked_select.h" + +#ifdef ENABLE_CPU_API +#include "cpu/masked_select_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/masked_select_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/masked_select_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/masked_select_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + input_desc, \ + mask_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_size, + size_t *dlen, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, mask, data_size, dlen, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..90939f9cb --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,50 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..04004dd19 --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +#include + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; + +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..85c9eca27 --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +namespace op::tan::cuda { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } else { + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.h b/src/infiniop/ops/tan/metax/tan_metax.h new file mode 100644 index 000000000..507fd9d5e --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.h @@ -0,0 +1,8 @@ +#ifndef __TAN_METAX_API_H__ +#define __TAN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, metax) + +#endif // __TAN_METAX_API_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.maca b/src/infiniop/ops/tan/metax/tan_metax.maca new file mode 100644 index 000000000..85768bec3 --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tan_metax.h" + +namespace op::tan::metax { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore.h b/src/infiniop/ops/tan/moore/tan_moore.h new file mode 100644 index 000000000..c261e4a69 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.h @@ -0,0 +1,8 @@ +#ifndef __TAN_MOORE_API_H__ +#define __TAN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, moore) + +#endif // __TAN_MOORE_API_H__ diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu new file mode 100644 index 000000000..4caec16ef 
--- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tan_moore.h" + +namespace op::tan::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::moore diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..d5415cb62 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" 
+#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..1915cc2b1 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_CUDA_API_H__ +#define __TAN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + +#endif // __TAN_CUDA_API_H__ diff --git 
a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..1f222a8d1 --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tan_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tan_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tan_moore.h" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc new file mode 100644 index 000000000..1abca6a33 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc @@ -0,0 +1,50 @@ +#include "tanhshrink_cpu.h" + +namespace op::tanhshrink::cpu { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::cpu diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h new file mode 100644 index 000000000..8c5f722ad --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TANHSHRINK_CPU_H__ +#define __TANHSHRINK_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, cpu) + +#include + +namespace op::tanhshrink::cpu { +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x - std::tanh(x); + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cpu + +#endif // __TANHSHRINK_CPU_H__
diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh new file mode 100644 index 000000000..dfb88731e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TANHSHRINK_CUDA_H__ +#define __TANHSHRINK_CUDA_H__ + +namespace op::tanhshrink::cuda { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } else { + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cuda + +#endif // __TANHSHRINK_CUDA_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h new file mode 100644 index 000000000..b79edcbf9 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_METAX_API_H__ +#define __TANHSHRINK_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, metax) + +#endif // __TANHSHRINK_METAX_API_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca new file mode 100644 index 000000000..c63390ab7 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_metax.h" + +namespace op::tanhshrink::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::metax diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h new file mode 100644 index 000000000..5a01a76ce --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_MOORE_API_H__ +#define __TANHSHRINK_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, moore) + +#endif // __TANHSHRINK_MOORE_API_H__ diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu new file mode 100644 index 000000000..4c4bdee8f --- 
/dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_moore.h" + +namespace op::tanhshrink::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::moore diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h new file mode 100644 index 000000000..9c75b916e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -0,0 +1,28 
@@ +#ifndef __TANHSHRINK_MOORE_KERNEL_H__ +#define __TANHSHRINK_MOORE_KERNEL_H__ + +namespace op::tanhshrink::moore { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else { + // FP32 and any other type + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::moore + +#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu new file mode 100644 index 000000000..baff908a2 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_nvidia.cuh" + +namespace op::tanhshrink::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, +
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhshrinkOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::nvidia diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh new file mode 100644 index 000000000..abe56f444 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_CUDA_API_H__ +#define __TANHSHRINK_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, nvidia) + +#endif // __TANHSHRINK_CUDA_API_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc new file mode 100644 index 000000000..66e62b850 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanhshrink.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanhshrink_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanhshrink_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanhshrink_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tanhshrink_moore.h" +#endif + +__C infiniStatus_t 
infiniopCreateTanhshrinkDescriptor( + infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanhshrink::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanhshrink( + infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef 
ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index e66f29fa4..787692343 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -126,9 +126,9 @@ def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" return torch.cat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore cat implementation""" - # return infinicore.cat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore cat implementation""" + return infinicore.cat(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 3766ca860..420123eec 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.inner(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.inner(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.inner(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index e7230b661..17ecc0fe9 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -53,9 +53,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.masked_select(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.masked_select(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.masked_select(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index acb1cac5a..9a02615b5 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.tan(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.tan(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.tan(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index 4e60eb264..36bcb8b95 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.tanhshrink(*args, 
**kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.tanhshrink(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.tanhshrink(*args, **kwargs) def main(): From 8d7e2f8c527e29ed0a84eef128c595ffea361249 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 19 Mar 2026 03:13:31 +0000 Subject: [PATCH 2/3] issue/1031 fix T1-1-32 --- include/infinicore/ops/cat.hpp | 2 +- include/infinicore/ops/inner.hpp | 2 +- include/infinicore/ops/masked_select.hpp | 2 +- include/infinicore/ops/tan.hpp | 2 +- include/infiniop.h | 6 +- include/infiniop/ops/inner.h | 10 +-- include/infiniop/ops/masked_select.h | 10 +-- include/infiniop/ops/tan.h | 10 +-- include/infiniop/ops/tanhshrink.h | 24 ++--- python/infinicore/__init__.py | 6 +- python/infinicore/nn/functional/__init__.py | 2 +- python/infinicore/ops/cat.py | 4 +- python/infinicore/ops/masked_select.py | 3 +- src/infinicore/ops/cat/cat.cc | 87 ++++++++++--------- src/infinicore/ops/inner/inner.cc | 12 +-- src/infinicore/ops/inner/inner_infiniop.cc | 18 ++-- .../ops/masked_select/masked_select.cc | 8 +- .../masked_select/masked_select_infiniop.cc | 18 ++-- src/infinicore/ops/tan/tan_infiniop.cc | 13 ++- .../ops/tanhshrink/tanhshrink_infiniop.cc | 13 ++- src/infinicore/pybind11/ops.hpp | 7 +- src/infinicore/pybind11/ops/cat.hpp | 4 +- src/infinicore/pybind11/ops/inner.hpp | 4 +- src/infinicore/pybind11/ops/masked_select.hpp | 3 +- src/infiniop/ops/inner/cpu/inner_cpu.cc | 31 ++++--- src/infiniop/ops/inner/cpu/inner_cpu.h | 2 +- src/infiniop/ops/inner/cuda/kernel.cuh | 22 ++--- src/infiniop/ops/inner/info.h | 33 ++++--- src/infiniop/ops/inner/inner.h | 6 +- src/infiniop/ops/inner/metax/inner_metax.h | 2 +- src/infiniop/ops/inner/metax/inner_metax.maca | 37 ++++---- .../ops/inner/metax/inner_metax_kernel.h | 22 
++--- src/infiniop/ops/inner/moore/inner_moore.h | 2 +- src/infiniop/ops/inner/moore/inner_moore.mu | 37 ++++---- .../ops/inner/moore/inner_moore_kernel.h | 22 ++--- src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 39 +++++---- .../ops/inner/nvidia/inner_nvidia.cuh | 2 +- src/infiniop/ops/inner/operator.cc | 36 ++++---- .../masked_select/cpu/masked_select_cpu.cc | 10 +-- .../ops/masked_select/cpu/masked_select_cpu.h | 2 +- .../ops/masked_select/cuda/kernel.cuh | 43 +++++---- src/infiniop/ops/masked_select/info.h | 4 +- .../ops/masked_select/masked_select.h | 5 +- .../masked_select/metax/masked_select_metax.h | 2 +- .../metax/masked_select_metax.maca | 51 +++++------ .../metax/masked_select_metax_kernel.h | 43 +++++---- .../masked_select/moore/masked_select_moore.h | 2 +- .../moore/masked_select_moore.mu | 51 +++++------ .../moore/masked_select_moore_kernel.h | 43 +++++---- .../nvidia/masked_select_nvidia.cu | 48 +++++----- .../nvidia/masked_select_nvidia.cuh | 2 +- src/infiniop/ops/masked_select/operator.cc | 32 +++---- src/infiniop/ops/tan/operator.cc | 28 +++--- .../moore/tanhshrink_moore_kernel.h | 2 +- src/infiniop/ops/tanhshrink/operator.cc | 28 +++--- test/infinicore/ops/inner.py | 2 +- 56 files changed, 494 insertions(+), 467 deletions(-) diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp index 5fdfdcce8..95be62103 100644 --- a/include/infinicore/ops/cat.hpp +++ b/include/infinicore/ops/cat.hpp @@ -6,4 +6,4 @@ namespace infinicore::op { Tensor cat(std::vector tensors, int dim); void cat_(Tensor out, std::vector tensors, int dim); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp index 017ddc604..ae372c415 100644 --- a/include/infinicore/ops/inner.hpp +++ b/include/infinicore/ops/inner.hpp @@ -14,4 +14,4 @@ class Inner { Tensor inner(Tensor input, Tensor other); void inner_(Tensor out, Tensor input, Tensor other); 
-} \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp index 89bbfa49d..d003e3bb8 100644 --- a/include/infinicore/ops/masked_select.hpp +++ b/include/infinicore/ops/masked_select.hpp @@ -13,4 +13,4 @@ class MaskedSelect { Tensor masked_select(Tensor input, Tensor mask); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp index 9c2d7a860..72410f866 100644 --- a/include/infinicore/ops/tan.hpp +++ b/include/infinicore/ops/tan.hpp @@ -14,4 +14,4 @@ class Tan { Tensor tan(Tensor input); void tan_(Tensor output, Tensor input); -} \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index b5aa2d966..753f7d4dd 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -19,18 +19,18 @@ #include "infiniop/ops/cross_entropy.h" #include "infiniop/ops/dequant/per_tensor_dequant_int8.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/fmod.h" -#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" -#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" @@ -58,8 +58,8 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" -#include "infiniop/ops/topk.h" #include "infiniop/ops/tanhshrink.h" +#include "infiniop/ops/topk.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" 
#include "infiniop/ops/var.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h index f484cde9c..99fc8ab13 100644 --- a/include/infiniop/ops/inner.h +++ b/include/infiniop/ops/inner.h @@ -5,16 +5,16 @@ typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; -__C __export infiniStatus_t infiniopCreateInnerDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc); -__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopInner( +__INFINI_C __export infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, @@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopInner( const void *other, void *stream); -__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h index 7adf65415..d24ed11f7 100644 --- a/include/infiniop/ops/masked_select.h +++ b/include/infiniop/ops/masked_select.h @@ -5,15 +5,15 @@ typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; -__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( infiniopHandle_t handle, infiniopMaskedSelectDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc); -__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, 
size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopMaskedSelect( +__INFINI_C __export infiniStatus_t infiniopMaskedSelect( infiniopMaskedSelectDescriptor_t desc, void *workspace, size_t workspace_size, @@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopMaskedSelect( size_t *dlen, void *stream); -__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 2e20c61da..1b7bef23e 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -5,15 +5,15 @@ typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; -__C __export infiniStatus_t infiniopCreateTanDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateTanDescriptor( infiniopHandle_t handle, infiniopTanDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t out_desc); -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTan( +__INFINI_C __export infiniStatus_t infiniopTan( infiniopTanDescriptor_t desc, void *workspace, size_t workspace_size, @@ -21,6 +21,6 @@ __C __export infiniStatus_t infiniopTan( const void *input, void *stream); -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h index 
e136f6523..b2895ccab 100644 --- a/include/infiniop/ops/tanhshrink.h +++ b/include/infiniop/ops/tanhshrink.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; -__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, - infiniopTanhshrinkDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t intput); +__INFINI_C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); -__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *intput, - void *stream); +__INFINI_C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); -__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); #endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index f5e9a72b0..16b1362a3 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -59,13 +59,13 @@ from infinicore.ops.binary_cross_entropy_with_logits import ( binary_cross_entropy_with_logits, ) +from infinicore.ops.cat import cat from infinicore.ops.cdist import cdist from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod -from infinicore.ops.kv_caching import 
kv_caching -from infinicore.ops.cat import cat from infinicore.ops.inner import inner +from infinicore.ops.kv_caching import kv_caching from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mha_kvcache import mha_kvcache @@ -79,11 +79,11 @@ from infinicore.ops.reciprocal import reciprocal from infinicore.ops.squeeze import squeeze from infinicore.ops.sum import sum +from infinicore.ops.tan import tan from infinicore.ops.topk import topk from infinicore.ops.unsqueeze import unsqueeze from infinicore.ops.var import var from infinicore.ops.var_mean import var_mean -from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 91e101f9d..55de7900b 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -34,5 +34,5 @@ "swiglu", "linear_w8a8i8", "silu_and_mul", - "tanhshrink" + "tanhshrink", ] diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py index 3a3e1470f..e45e9294a 100644 --- a/python/infinicore/ops/cat.py +++ b/python/infinicore/ops/cat.py @@ -2,11 +2,11 @@ from infinicore.tensor import Tensor -def cat(tensors, dim:int=0, *,out=None): +def cat(tensors, dim: int = 0, *, out=None): if out is None: return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) - + # raise RuntimeError("breakpointer!!!") return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py index d1846edfc..14a3a1ff0 100644 --- a/python/infinicore/ops/masked_select.py +++ b/python/infinicore/ops/masked_select.py @@ -1,5 +1,6 @@ from infinicore.lib import _infinicore from infinicore.tensor import Tensor + def masked_select(input, mask): - return 
Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc index 908f5f457..a4e0a8ae0 100644 --- a/src/infinicore/ops/cat/cat.cc +++ b/src/infinicore/ops/cat/cat.cc @@ -7,37 +7,38 @@ namespace infinicore::op { namespace { class CatInfo { - + CatInfo() = default; public: - int dim; int ndim; size_t tensors_size; - + std::vector contiguous_dim; std::vector copy_size; - static CatInfo create(Tensor& out, std::vector& tensors, int dim) { - + static CatInfo create(Tensor &out, std::vector &tensors, int dim) { + int ndim = out->ndim(); size_t tensors_size = tensors.size(); std::vector contiguous_dim(tensors_size, ndim); std::vector copy_size(tensors_size, dsize(out->dtype())); - for (int i = 0; i < tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; + for (int i = 0; i < tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } if (tensors[i]->stride(ndim - 1) == 1) { contiguous_dim[i] = ndim - 1; - for (int j = ndim - 2; j >= dim; j --) { + for (int j = ndim - 2; j >= dim; j--) { if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { contiguous_dim[i] = j; } } } - for (int j = contiguous_dim[i]; j < ndim; j ++) { + for (int j = contiguous_dim[i]; j < ndim; j++) { copy_size[i] *= tensors[i]->shape()[j]; } } @@ -46,20 +47,19 @@ class CatInfo { } }; - void low_dim_copy( - CatInfo &info, Tensor& tensor, Tensor& out, - std::byte *tensor_ptr, std::byte *out_ptr, + CatInfo &info, Tensor &tensor, Tensor &out, + std::byte *tensor_ptr, std::byte *out_ptr, int depth, int tensor_pos) { if (depth != info.contiguous_dim[tensor_pos]) { - std::byte* now_tensor_ptr = tensor_ptr; - std::byte* now_out_ptr = out_ptr; + std::byte *now_tensor_ptr = tensor_ptr; + std::byte *now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; 
i++) { - for (int i = 0; i < tensor->shape()[depth]; i ++) { - low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); - + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); now_out_ptr += out->stride(depth) * dsize(out->dtype()); } @@ -68,39 +68,43 @@ void low_dim_copy( std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); } else { - + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); } } } void high_dim_split( - CatInfo &info, std::vector& tensors, Tensor& out, - std::vector tensors_ptr, std::byte* out_ptr, + CatInfo &info, std::vector &tensors, Tensor &out, + std::vector tensors_ptr, std::byte *out_ptr, int depth) { - + if (depth != info.dim) { - std::vector now_tensors_ptr = tensors_ptr; - std::byte* now_out_ptr = out_ptr; + std::vector now_tensors_ptr = tensors_ptr; + std::byte *now_out_ptr = out_ptr; - for (int i = 0; i < out->shape()[depth]; i ++) { + for (int i = 0; i < out->shape()[depth]; i++) { high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); - - for (int i = 0; i < info.tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; + + for (int i = 0; i < info.tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); } now_out_ptr += out->stride(depth) * dsize(out->dtype()); } } else { - std::byte* now_out_ptr = out_ptr; + std::byte *now_out_ptr = out_ptr; + + for (int i = 0; i < info.tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } - for (int i = 0; i < info.tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; - low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); - + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); } } @@ -115,9 +119,11 @@ Tensor cat(std::vector tensors, int dim) { dim = (dim + ndim) % ndim; Shape shape = tensors[0]->shape(); - for (int i = 1; i < 
tensors.size(); i ++) { + for (int i = 1; i < tensors.size(); i++) { assert(tensors[i]->ndim() == dim || tensors[i]->ndim() == 1); - if (tensors[i]->ndim() != ndim) continue; + if (tensors[i]->ndim() != ndim) { + continue; + } shape[dim] += tensors[i]->shape()[dim]; } @@ -134,13 +140,13 @@ void cat_(Tensor out, std::vector tensors, int dim) { dim = (dim + ndim) % ndim; size_t dim_shape = 0; - for (auto& tensor : tensors) { + for (auto &tensor : tensors) { assert(tensor->ndim() == ndim || tensors[i]->ndim() == 1); if (tensor->ndim() == 1) { assert(tensor->shape()[0] == 0); continue; } - for (int i = 0; i < ndim; i ++) { + for (int i = 0; i < ndim; i++) { if (i != dim) { assert(tensor->shape()[i] == out->shape()[i]); } else { @@ -152,14 +158,13 @@ void cat_(Tensor out, std::vector tensors, int dim) { // Get info CatInfo info = CatInfo::create(out, tensors, dim); - std::vector tensors_ptr(tensors.size()); - for (int i = 0; i < tensors.size(); i ++) { + std::vector tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i++) { tensors_ptr[i] = tensors[i]->data(); } - std::byte* out_ptr = out->data(); + std::byte *out_ptr = out->data(); high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); - } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc index 271ad101c..c4a9dec8b 100644 --- a/src/infinicore/ops/inner/inner.cc +++ b/src/infinicore/ops/inner/inner.cc @@ -3,7 +3,6 @@ namespace infinicore::op { - common::OpDispatcher &Inner::dispatcher() { static common::OpDispatcher dispatcher_; return dispatcher_; @@ -27,10 +26,12 @@ Tensor inner(Tensor input, Tensor other) { assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); Shape out_shape; - for (int i = 0; i < input_ndim - 1; i ++) + for (int i = 0; i < input_ndim - 1; i++) { out_shape.push_back(input->shape()[i]); - for (int i = 0; i < other_ndim - 1; i ++) + } + for 
(int i = 0; i < other_ndim - 1; i++) { out_shape.push_back(other->shape()[i]); + } auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); inner_(out, input, other); @@ -38,9 +39,8 @@ Tensor inner(Tensor input, Tensor other) { } void inner_(Tensor out, Tensor input, Tensor other) { - - Inner::execute(out, input, other); + Inner::execute(out, input, other); } -} \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc index fe6b8de9a..fd51d2128 100644 --- a/src/infinicore/ops/inner/inner_infiniop.cc +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -1,7 +1,7 @@ #include "../../utils.hpp" #include "infinicore/common/hash.hpp" -#include "infinicore/ops/inner.hpp" #include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/inner.hpp" #include namespace infinicore::op::inner_impl::infiniop { @@ -39,21 +39,19 @@ void calculate(Tensor out, Tensor input, Tensor other) { INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); std::shared_ptr workspace = context::allocateMemory(workspace_size); - INFINICORE_CHECK_ERROR(infiniopInner( desc, workspace->data(), workspace_size, out->data(), input->data(), other->data(), context::getStream())); } static bool registered = []() { - Inner::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Inner::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); -} \ No newline at end of file +} // namespace infinicore::op::inner_impl::infiniop diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc index 13ac6dfe4..1ade25a09 100644 --- a/src/infinicore/ops/masked_select/masked_select.cc +++ 
b/src/infinicore/ops/masked_select/masked_select.cc @@ -11,13 +11,13 @@ common::OpDispatcher &MaskedSelect::dispatcher() { void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { auto device_type = context::getDevice().getType(); auto func = dispatcher().lookup(device_type); - + if (func == nullptr) { throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); } func(input, mask, data_ptr, dlen_ptr); -} +} Tensor masked_select(Tensor input, Tensor mask) { @@ -26,8 +26,8 @@ Tensor masked_select(Tensor input, Tensor mask) { MaskedSelect::execute(input, mask, (void **)&data, &dlen); auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); - + return out; } -} +} // namespace infinicore::op diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc index 0144ce425..31d08c09f 100644 --- a/src/infinicore/ops/masked_select/masked_select_infiniop.cc +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -1,7 +1,7 @@ #include "../../utils.hpp" #include "infinicore/common/hash.hpp" -#include "infinicore/ops/masked_select.hpp" #include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/masked_select.hpp" #include namespace infinicore::op::masked_select_impl::infiniop { @@ -39,21 +39,19 @@ void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); std::shared_ptr workspace = context::allocateMemory(workspace_size); - INFINICORE_CHECK_ERROR(infiniopMaskedSelect( desc, workspace->data(), workspace_size, input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); } static bool registered = []() { - MaskedSelect::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - 
Device::Type::ILUVATAR - }, &calculate, false); + MaskedSelect::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); -} +} // namespace infinicore::op::masked_select_impl::infiniop diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc index 1d7c9c874..ccb766490 100644 --- a/src/infinicore/ops/tan/tan_infiniop.cc +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -45,13 +45,12 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tan::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Tan::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc index ca440f875..1d7fa4d78 100644 --- a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -45,13 +45,12 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tanhshrink::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Tanhshrink::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index ffc2e010c..849d6c474 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -24,11 +24,10 @@ #include "ops/fmod.hpp" #include 
"ops/hardswish.hpp" #include "ops/hardtanh.hpp" +#include "ops/inner.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" -#include "ops/inner.hpp" -#include "ops/linear.hpp" #include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mha_kvcache.hpp" @@ -46,11 +45,11 @@ #include "ops/silu_and_mul.hpp" #include "ops/sum.hpp" #include "ops/swiglu.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" -#include "ops/tan.hpp" -#include "ops/tanhshrink.hpp" namespace py = pybind11; diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp index 2ef3392b0..093b15ff2 100644 --- a/src/infinicore/pybind11/ops/cat.hpp +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -14,7 +14,7 @@ inline void bind_cat(py::module &m) { py::arg("tensors"), py::arg("dim") = 0, R"doc(opertor: torch.cat, out-of-place mode)doc"); - + m.def("cat_", &op::cat_, py::arg("out"), @@ -23,4 +23,4 @@ inline void bind_cat(py::module &m) { R"doc(opertor: torch.cat, in-place mode)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp index 16ee91932..b54d6117b 100644 --- a/src/infinicore/pybind11/ops/inner.hpp +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -14,7 +14,7 @@ inline void bind_inner(py::module &m) { py::arg("input"), py::arg("other"), R"doc(opertor: torch.inner, out-of-place mode)doc"); - + m.def("inner_", &op::inner_, py::arg("out"), @@ -23,4 +23,4 @@ inline void bind_inner(py::module &m) { R"doc(opertor: torch.inner, in-place mode)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp index 2734590e6..949566b6b 100644 --- 
a/src/infinicore/pybind11/ops/masked_select.hpp +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -14,7 +14,6 @@ inline void bind_masked_select(py::module &m) { py::arg("input"), py::arg("mask"), R"doc(opertor: torch.masked_select, out-of-place mode)doc"); - } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc index 5915cb3fc..085e46478 100644 --- a/src/infiniop/ops/inner/cpu/inner_cpu.cc +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -2,7 +2,7 @@ #include "../../../devices/cpu/common_cpu.h" namespace op::inner::cpu { - + Descriptor::~Descriptor() {} infiniStatus_t Descriptor::create( @@ -11,7 +11,7 @@ infiniStatus_t Descriptor::create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc) { - + auto result = InnerInfo::create(out_desc, input_desc, other_desc); CHECK_RESULT(result); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); @@ -20,29 +20,29 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { #pragma omp parallel for - for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { - + for (ptrdiff_t out_index = 0; out_index < ptrdiff_t(info->total_elements); out_index++) { + size_t out_dim_pos = info->out_ndim - 1; size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - - for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + + for (int i = (int)info->other_ndim - 2; i >= 0; i--) { other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; - index /= info->out_shape[out_dim_pos --]; + index /= info->out_shape[out_dim_pos--]; 
} - for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + for (int i = (int)info->input_ndim - 2; i >= 0; i--) { input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; - index /= info->out_shape[out_dim_pos --]; + index /= info->out_shape[out_dim_pos--]; } float tmp = 0.; - for (size_t i = 0; i < info->oper_len; i ++) { + for (size_t i = 0; i < info->oper_len; i++) { if constexpr (std::is_same::value || std::is_same::value) { tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); } else { @@ -59,14 +59,13 @@ infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *o } } - return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( - void* workspace, size_t workspace_size, + void *workspace, size_t workspace_size, void *out, const void *input, const void *other, @@ -76,7 +75,7 @@ infiniStatus_t Descriptor::calculate( CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); } else if (_info.dtype == INFINI_DTYPE_F16) { CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); - } else if (_info.dtype == INFINI_DTYPE_F32){ + } else if (_info.dtype == INFINI_DTYPE_F32) { CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; @@ -85,4 +84,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::cpu diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h index 5d4774dfa..18b2e7b32 100644 --- a/src/infiniop/ops/inner/cpu/inner_cpu.h +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -4,4 +4,4 @@ DESCRIPTOR(cpu); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh index 1616a4281..928161ba8 100644 --- 
a/src/infiniop/ops/inner/cuda/kernel.cuh +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -1,35 +1,37 @@ #ifndef __INNER_KERNEL_CUH__ #define __INNER_KERNEL_CUH__ -template +template INFINIOP_CUDA_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if (out_index >= total_elements) { + return; + } size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } T tmp = 0; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { tmp += input[input_offset] * other[other_offset]; input_offset += input_strides[input_ndim - 1]; other_offset += other_strides[other_ndim - 1]; @@ -38,4 +40,4 @@ INFINIOP_CUDA_KERNEL innerKernel( out[out_offset] = tmp; } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h index 2484277eb..3262f9cf9 100644 --- a/src/infiniop/ops/inner/info.h +++ b/src/infiniop/ops/inner/info.h @@ -12,7 +12,7 @@ class InnerInfo { public: infiniDtype_t dtype; - + size_t input_ndim; 
size_t other_ndim; size_t out_ndim; @@ -32,33 +32,40 @@ class InnerInfo { infiniopTensorDescriptor_t other_desc) { auto dtype = out_desc->dtype(); - if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) { return INFINI_STATUS_BAD_TENSOR_DTYPE; + } CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); auto input_ndim = input_desc->ndim(); auto other_ndim = other_desc->ndim(); auto out_ndim = out_desc->ndim(); - if (out_ndim + 2 != input_ndim + other_ndim) + if (out_ndim + 2 != input_ndim + other_ndim) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } auto input_shape = input_desc->shape(); auto other_shape = other_desc->shape(); auto out_shape = out_desc->shape(); if (input_ndim && other_ndim) { - if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } } size_t out_dim_pos = 0; if (input_ndim) { - for (size_t i = 0; i < input_ndim - 1; i ++) - if (input_shape[i] != out_shape[out_dim_pos ++]) + for (size_t i = 0; i < input_ndim - 1; i++) { + if (input_shape[i] != out_shape[out_dim_pos++]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } } if (other_ndim) { - for (size_t i = 0; i < other_ndim - 1; i ++) - if (other_shape[i] != out_shape[out_dim_pos ++]) + for (size_t i = 0; i < other_ndim - 1; i++) { + if (other_shape[i] != out_shape[out_dim_pos++]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } } size_t oper_len = 1; @@ -67,9 +74,10 @@ class InnerInfo { } size_t total_elements = 1; - for (size_t i = 0; i < out_ndim; i ++) + for (size_t i = 0; i < out_ndim; i++) { total_elements *= out_shape[i]; - + } + auto input_strides = input_desc->strides(); auto other_strides = other_desc->strides(); auto out_strides = out_desc->strides(); @@ -78,11 +86,10 @@ class InnerInfo { dtype, input_ndim, other_ndim, out_ndim, oper_len, total_elements, out_shape, - 
input_strides, other_strides, out_strides - }); + input_strides, other_strides, out_strides}); } }; } // namespace op::inner -#endif // __INNER_INFO_H__ \ No newline at end of file +#endif // __INNER_INFO_H__ diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h index 5c44262a7..c57e82708 100644 --- a/src/infiniop/ops/inner/inner.h +++ b/src/infiniop/ops/inner/inner.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::inner::NAMESPACE { \ - class Descriptor final: public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ InnerInfo _info; \ @@ -24,7 +24,6 @@ _info(info), \ _workspace_size(workspace_size) {} \ \ - \ public: \ ~Descriptor(); \ size_t workspaceSize() const { return _workspace_size; } \ @@ -45,5 +44,4 @@ }; \ } - -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h index 04a47d445..72d989a3d 100644 --- a/src/infiniop/ops/inner/metax/inner_metax.h +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -4,4 +4,4 @@ DESCRIPTOR(metax); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca index 683f4a1e1..95c8dcad7 100644 --- a/src/infiniop/ops/inner/metax/inner_metax.maca +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T *other, T *out, @@ -82,7 +82,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -92,22 +92,21 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { hcStream_t stream = (hcStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T 
*)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { @@ -120,4 +119,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::metax diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h index 52f37e1ef..3e66a4590 100644 --- a/src/infiniop/ops/inner/metax/inner_metax_kernel.h +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -1,38 +1,40 @@ #ifndef __INNER_METAX_KERNEL_H__ #define __INNER_METAX_KERNEL_H__ -template +template INFINIOP_METAX_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if 
(out_index >= total_elements) { + return; + } size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } // T tmp = input[input_offset] * other[other_offset]; // input_offset += input_strides[input_ndim - 1]; // other_offset += other_strides[other_ndim - 1]; T tmp = 0.; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { if constexpr (std::is_same_v || std::is_same_v) { tmp += input[input_offset] * other[other_offset]; } else { @@ -56,4 +58,4 @@ INFINIOP_METAX_KERNEL innerKernel( out[out_offset] = tmp; } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h index a508a6718..e722f2adf 100644 --- a/src/infiniop/ops/inner/moore/inner_moore.h +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -4,4 +4,4 @@ DESCRIPTOR(moore); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu index 61a430d2c..82abda6b4 100644 --- a/src/infiniop/ops/inner/moore/inner_moore.mu +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T 
*other, T *out, @@ -80,7 +80,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -90,22 +90,21 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { musaStream_t stream = (musaStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { @@ -120,4 +119,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::moore diff --git a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h index de82dc8a9..fcb3f948f 100644 --- a/src/infiniop/ops/inner/moore/inner_moore_kernel.h +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -1,35 +1,37 @@ #ifndef __INNER_MOORE_KERNEL_H__ #define __INNER_MOORE_KERNEL_H__ 
-template +template INFINIOP_MOORE_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if (out_index >= total_elements) { + return; + } size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } float tmp = 0.; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { // if constexpr (std::is_same_v || std::is_same_v) { tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); // } else { @@ -42,4 +44,4 @@ INFINIOP_MOORE_KERNEL innerKernel( out[out_offset] = static_cast(tmp); } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu index e2b3ed0f9..a06e8955c 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T *other, T 
*out, @@ -80,7 +80,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -90,28 +90,29 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) } else { @@ -120,4 +121,4 @@ infiniStatus_t 
Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::nvidia diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh index 11033c2c0..4a3b8cb95 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -4,4 +4,4 @@ DESCRIPTOR(nvidia); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc index c68762d0e..89c6774d1 100644 --- a/src/infiniop/ops/inner/operator.cc +++ b/src/infiniop/ops/inner/operator.cc @@ -15,20 +15,20 @@ #include "moore/inner_moore.h" #endif -__C infiniStatus_t infiniopCreateInnerDescriptor( +__INFINI_C infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::inner::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - out_desc, \ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ other_desc); switch (handle->device) { @@ -51,11 +51,11 @@ __C infiniStatus_t infiniopCreateInnerDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { @@ -78,7 +78,7 @@ __C 
infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopInner( +__INFINI_C infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, void *out, @@ -86,8 +86,8 @@ __C infiniStatus_t infiniopInner( const void *other, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, out, input, other, stream); @@ -111,10 +111,10 @@ __C infiniStatus_t infiniopInner( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc index ae74587d8..1157340c0 100644 --- a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -10,7 +10,7 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { - + auto result = MaskedSelectInfo::create(input_desc, mask_desc); CHECK_RESULT(result); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); @@ -19,10 +19,10 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { std::vector res; - for (size_t index = 0; index < info->total_elements; index ++) { + for (size_t index = 0; index < 
info->total_elements; index++) { size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); if (mask[mask_offset]) { @@ -36,7 +36,7 @@ infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -55,4 +55,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::masked_select::cpu diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h index 9fed9e2e6..3fc0c1dbf 100644 --- a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -4,4 +4,4 @@ DESCRIPTOR(cpu) -#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file +#endif // __MASKED_SELECT_CPU_H__ diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh b/src/infiniop/ops/masked_select/cuda/kernel.cuh index fc7485f31..5b66b244a 100644 --- a/src/infiniop/ops/masked_select/cuda/kernel.cuh +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_KERNEL_CUH__ #define __MASKED_SELECT_KERNEL_CUH__ -template +template INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( const bool *mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] 
= smem[tid]; + } } -template +template INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git 
a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h index 889b32821..cd20f1d33 100644 --- a/src/infiniop/ops/masked_select/info.h +++ b/src/infiniop/ops/masked_select/info.h @@ -20,7 +20,7 @@ class MaskedSelectInfo { std::vector input_strides; std::vector mask_strides; - static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { auto dtype = input_desc->dtype(); CHECK_DTYPE(dtype, INFINI_DTYPE_F32); CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); @@ -31,7 +31,7 @@ class MaskedSelectInfo { auto ndim = input_desc->ndim(); size_t total_elements = 1; - for (auto& dim : shape) { + for (auto &dim : shape) { total_elements *= dim; } diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h index 98148ebaf..ae44a8738 100644 --- a/src/infiniop/ops/masked_select/masked_select.h +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::masked_select::NAMESPACE { \ - class Descriptor final: public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ MaskedSelectInfo _info; \ @@ -24,7 +24,6 @@ _info(info), \ _workspace_size(workspace_size) {} \ \ - \ public: \ ~Descriptor(); \ size_t workspaceSize() const { return _workspace_size; } \ @@ -45,4 +44,4 @@ }; \ } -#endif // MASKED_SELECT_H \ No newline at end of file +#endif // MASKED_SELECT_H diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h index 47460777f..8d438d242 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax.h +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -4,4 +4,4 @@ DESCRIPTOR(metax); -#endif \ No newline at end of file +#endif diff --git 
a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca index 4a4181153..a024e6de7 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -42,9 +42,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, hcStream_t stream, void *workspace, size_t workspace_size) { @@ -56,10 +56,10 @@ infiniStatus_t launchKernel( size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -77,16 +77,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_hc, mask_strides_hc, ndim); CHECK_METAX(hcDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_METAX(hcDeviceSynchronize()); @@ -98,15 +98,14 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - - + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); - + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + // context::setDevice(METAX); // 
context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_hc, input_strides_hc, mask_strides_hc, ndim); @@ -115,8 +114,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -127,18 +125,17 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { hcStream_t stream = (hcStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { @@ -151,4 +148,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::metax diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h index e590de955..9482e4dfe 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ #define __MASKED_SELECT_METAX_KERNEL_CUH__ -template +template INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( const bool 
*mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t 
*mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h index 24548d8fc..ca2e7b00d 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore.h +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -4,4 +4,4 @@ DESCRIPTOR(moore); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu index 3c218bc19..e9a195ee8 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -42,9 +42,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, musaStream_t stream, void *workspace, size_t workspace_size) { @@ -56,10 +56,10 @@ infiniStatus_t launchKernel( size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -77,16 +77,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 
1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_musa, mask_strides_musa, ndim); CHECK_MOORE(musaDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_MOORE(musaDeviceSynchronize()); @@ -98,15 +98,14 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - - + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); - + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + // context::setDevice(MOORE); // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_musa, input_strides_musa, mask_strides_musa, ndim); @@ -115,8 +114,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -127,18 +125,17 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { musaStream_t stream = (musaStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ 
+ return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { @@ -153,4 +150,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::moore diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h index dcdd87c6d..76cb62b50 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ #define __MASKED_SELECT_MOORE_KERNEL_CUH__ -template +template INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( const bool *mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t 
total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu index eebba0fd3..17016ebc8 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -41,9 +41,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, cudaStream_t stream, void *workspace, size_t workspace_size) { @@ -55,10 +55,10 @@ infiniStatus_t launchKernel( size_t *shape_cuda = 
reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -74,16 +74,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_cuda, mask_strides_cuda, ndim); CHECK_CUDA(cudaDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_CUDA(cudaDeviceSynchronize()); @@ -95,11 +95,11 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - + CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); @@ -108,8 +108,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -120,24 +119,25 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == 
INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); } else { @@ -146,4 +146,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::nvidia diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh index bc493edbe..c3145025b 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -5,4 +5,4 @@ DESCRIPTOR(nvidia) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc index 055b67b12..1a581bd6d 100644 --- a/src/infiniop/ops/masked_select/operator.cc +++ b/src/infiniop/ops/masked_select/operator.cc @@ -15,18 +15,18 @@ #include "moore/masked_select_moore.h" #endif -__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( +__INFINI_C infiniStatus_t 
infiniopCreateMaskedSelectDescriptor( infiniopHandle_t handle, infiniopMaskedSelectDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::masked_select::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr),\ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc, \ mask_desc); switch (handle->device) { @@ -49,10 +49,10 @@ __C infiniStatus_t infiniopCreateMaskedSelectDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -76,7 +76,7 @@ __C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDesc return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopMaskedSelect( +__INFINI_C infiniStatus_t infiniopMaskedSelect( infiniopMaskedSelectDescriptor_t desc, void *workspace, size_t workspace_size, const void *input, @@ -85,8 +85,8 @@ __C infiniStatus_t infiniopMaskedSelect( size_t *dlen, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, input, mask, data_size, dlen, stream); @@ -110,10 +110,10 @@ __C infiniStatus_t infiniopMaskedSelect( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { +__INFINI_C infiniStatus_t 
infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index 1f222a8d1..99081d8b9 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -15,18 +15,18 @@ #include "moore/tan_moore.h" #endif -__C infiniStatus_t infiniopCreateTanDescriptor( +__INFINI_C infiniStatus_t infiniopCreateTanDescriptor( infiniopHandle_t handle, infiniopTanDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::tan::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - output_desc, \ + output_desc, \ {input_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C infiniStatus_t infiniopCreateTanDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, siz return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopTan( +__INFINI_C infiniStatus_t infiniopTan( infiniopTanDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,8 +93,8 @@ __C infiniStatus_t infiniopTan( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ 
->calculate(workspace, workspace_size, output, {input}, stream) @@ -123,11 +123,11 @@ __C infiniStatus_t infiniopTan( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h index 9c75b916e..f4e97b27b 100644 --- a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -25,4 +25,4 @@ typedef struct TanhshrinkOp { } // namespace op::tanhshrink::moore -#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __TANHSHRINK_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc index 66e62b850..0cc5d8959 100644 --- a/src/infiniop/ops/tanhshrink/operator.cc +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -15,18 +15,18 @@ #include "moore/tanhshrink_moore.h" #endif -__C infiniStatus_t infiniopCreateTanhshrinkDescriptor( +__INFINI_C infiniStatus_t infiniopCreateTanhshrinkDescriptor( infiniopHandle_t handle, infiniopTanhshrinkDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::tanhshrink::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - output_desc, \ + output_desc, \ {input_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t 
infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescript return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopTanhshrink( +__INFINI_C infiniStatus_t infiniopTanhshrink( infiniopTanhshrinkDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,8 +93,8 @@ __C infiniStatus_t infiniopTanhshrink( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -123,11 +123,11 @@ __C infiniStatus_t infiniopTanhshrink( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 420123eec..8235515d8 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -21,7 +21,7 @@ _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-3}, infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, } From d68d0b889c698b0c75020279cfbae7e24edf1de6 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 19 Mar 2026 05:24:16 +0000 Subject: [PATCH 3/3] issue/1031 fix sum iluvatar --- src/infiniop/ops/sum/nvidia/sum_nvidia.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu 
b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu index af052be0a..9f165d271 100644 --- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu +++ b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu @@ -107,6 +107,8 @@ infiniStatus_t Descriptor::calculate( CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) } else {