From 85f25693e1b4118d8f719824e846578bf18d52fa Mon Sep 17 00:00:00 2001 From: JBFYS-XD Date: Thu, 19 Mar 2026 02:40:16 +0000 Subject: [PATCH 1/3] issue/1031 merge T1-1-32 --- include/infinicore/ops/cat.hpp | 9 + include/infinicore/ops/inner.hpp | 17 ++ include/infinicore/ops/masked_select.hpp | 16 ++ include/infinicore/ops/tan.hpp | 17 ++ include/infinicore/ops/tanhshrink.hpp | 16 ++ include/infiniop.h | 4 + include/infiniop/ops/inner.h | 28 +++ include/infiniop/ops/masked_select.h | 28 +++ include/infiniop/ops/tan.h | 26 +++ include/infiniop/ops/tanhshrink.h | 24 +++ python/infinicore/__init__.py | 8 + python/infinicore/nn/functional/__init__.py | 2 + python/infinicore/nn/functional/tanhshrink.py | 11 ++ python/infinicore/ops/cat.py | 12 ++ python/infinicore/ops/inner.py | 11 ++ python/infinicore/ops/masked_select.py | 5 + python/infinicore/ops/tan.py | 11 ++ src/infinicore/ops/cat/cat.cc | 165 ++++++++++++++++++ src/infinicore/ops/inner/inner.cc | 46 +++++ src/infinicore/ops/inner/inner_infiniop.cc | 59 +++++++ .../ops/masked_select/masked_select.cc | 33 ++++ .../masked_select/masked_select_infiniop.cc | 59 +++++++ src/infinicore/ops/tan/tan.cc | 32 ++++ src/infinicore/ops/tan/tan_infiniop.cc | 58 ++++++ src/infinicore/ops/tanhshrink/tanhshrink.cc | 32 ++++ .../ops/tanhshrink/tanhshrink_infiniop.cc | 58 ++++++ src/infinicore/pybind11/ops.hpp | 12 ++ src/infinicore/pybind11/ops/cat.hpp | 26 +++ src/infinicore/pybind11/ops/inner.hpp | 26 +++ src/infinicore/pybind11/ops/masked_select.hpp | 20 +++ src/infinicore/pybind11/ops/tan.hpp | 24 +++ src/infinicore/pybind11/ops/tanhshrink.hpp | 24 +++ src/infiniop/ops/inner/cpu/inner_cpu.cc | 88 ++++++++++ src/infiniop/ops/inner/cpu/inner_cpu.h | 7 + src/infiniop/ops/inner/cuda/kernel.cuh | 41 +++++ src/infiniop/ops/inner/info.h | 88 ++++++++++ src/infiniop/ops/inner/inner.h | 49 ++++++ src/infiniop/ops/inner/metax/inner_metax.h | 7 + src/infiniop/ops/inner/metax/inner_metax.maca | 123 +++++++++++++ 
.../ops/inner/metax/inner_metax_kernel.h | 59 +++++++ src/infiniop/ops/inner/moore/inner_moore.h | 7 + src/infiniop/ops/inner/moore/inner_moore.mu | 123 +++++++++++++ .../ops/inner/moore/inner_moore_kernel.h | 45 +++++ src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 123 +++++++++++++ .../ops/inner/nvidia/inner_nvidia.cuh | 7 + src/infiniop/ops/inner/operator.cc | 139 +++++++++++++++ .../masked_select/cpu/masked_select_cpu.cc | 58 ++++++ .../ops/masked_select/cpu/masked_select_cpu.h | 7 + .../ops/masked_select/cuda/kernel.cuh | 101 +++++++++++ src/infiniop/ops/masked_select/info.h | 53 ++++++ .../ops/masked_select/masked_select.h | 48 +++++ .../masked_select/metax/masked_select_metax.h | 7 + .../metax/masked_select_metax.maca | 154 ++++++++++++++++ .../metax/masked_select_metax_kernel.h | 101 +++++++++++ .../masked_select/moore/masked_select_moore.h | 7 + .../moore/masked_select_moore.mu | 156 +++++++++++++++++ .../moore/masked_select_moore_kernel.h | 101 +++++++++++ .../nvidia/masked_select_nvidia.cu | 149 ++++++++++++++++ .../nvidia/masked_select_nvidia.cuh | 8 + src/infiniop/ops/masked_select/operator.cc | 138 +++++++++++++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 50 ++++++ src/infiniop/ops/tan/cpu/tan_cpu.h | 23 +++ src/infiniop/ops/tan/cuda/kernel.cuh | 30 ++++ src/infiniop/ops/tan/metax/tan_metax.h | 8 + src/infiniop/ops/tan/metax/tan_metax.maca | 56 ++++++ src/infiniop/ops/tan/moore/tan_moore.h | 8 + src/infiniop/ops/tan/moore/tan_moore.mu | 56 ++++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 57 ++++++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 157 +++++++++++++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.cc | 50 ++++++ .../ops/tanhshrink/cpu/tanhshrink_cpu.h | 23 +++ src/infiniop/ops/tanhshrink/cuda/kernel.cuh | 30 ++++ .../ops/tanhshrink/metax/tanhshrink_metax.h | 8 + .../tanhshrink/metax/tanhshrink_metax.maca | 56 ++++++ .../ops/tanhshrink/moore/tanhshrink_moore.h | 8 + .../ops/tanhshrink/moore/tanhshrink_moore.mu | 
56 ++++++ .../moore/tanhshrink_moore_kernel.h | 28 +++ .../tanhshrink/nvidia/tanhshrink_nvidia.cu | 59 +++++++ .../tanhshrink/nvidia/tanhshrink_nvidia.cuh | 8 + src/infiniop/ops/tanhshrink/operator.cc | 157 +++++++++++++++++ test/infinicore/ops/cat.py | 6 +- test/infinicore/ops/inner.py | 6 +- test/infinicore/ops/masked_select.py | 6 +- test/infinicore/ops/tan.py | 6 +- test/infinicore/ops/tanhshrink.py | 6 +- 86 files changed, 3864 insertions(+), 15 deletions(-) create mode 100644 include/infinicore/ops/cat.hpp create mode 100644 include/infinicore/ops/inner.hpp create mode 100644 include/infinicore/ops/masked_select.hpp create mode 100644 include/infinicore/ops/tan.hpp create mode 100644 include/infinicore/ops/tanhshrink.hpp create mode 100644 include/infiniop/ops/inner.h create mode 100644 include/infiniop/ops/masked_select.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 include/infiniop/ops/tanhshrink.h create mode 100644 python/infinicore/nn/functional/tanhshrink.py create mode 100644 python/infinicore/ops/cat.py create mode 100644 python/infinicore/ops/inner.py create mode 100644 python/infinicore/ops/masked_select.py create mode 100644 python/infinicore/ops/tan.py create mode 100644 src/infinicore/ops/cat/cat.cc create mode 100644 src/infinicore/ops/inner/inner.cc create mode 100644 src/infinicore/ops/inner/inner_infiniop.cc create mode 100644 src/infinicore/ops/masked_select/masked_select.cc create mode 100644 src/infinicore/ops/masked_select/masked_select_infiniop.cc create mode 100644 src/infinicore/ops/tan/tan.cc create mode 100644 src/infinicore/ops/tan/tan_infiniop.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink.cc create mode 100644 src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/cat.hpp create mode 100644 src/infinicore/pybind11/ops/inner.hpp create mode 100644 src/infinicore/pybind11/ops/masked_select.hpp create mode 100644 src/infinicore/pybind11/ops/tan.hpp 
create mode 100644 src/infinicore/pybind11/ops/tanhshrink.hpp create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.cc create mode 100644 src/infiniop/ops/inner/cpu/inner_cpu.h create mode 100644 src/infiniop/ops/inner/cuda/kernel.cuh create mode 100644 src/infiniop/ops/inner/info.h create mode 100644 src/infiniop/ops/inner/inner.h create mode 100644 src/infiniop/ops/inner/metax/inner_metax.h create mode 100644 src/infiniop/ops/inner/metax/inner_metax.maca create mode 100644 src/infiniop/ops/inner/metax/inner_metax_kernel.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.h create mode 100644 src/infiniop/ops/inner/moore/inner_moore.mu create mode 100644 src/infiniop/ops/inner/moore/inner_moore_kernel.h create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cu create mode 100644 src/infiniop/ops/inner/nvidia/inner_nvidia.cuh create mode 100644 src/infiniop/ops/inner/operator.cc create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc create mode 100644 src/infiniop/ops/masked_select/cpu/masked_select_cpu.h create mode 100644 src/infiniop/ops/masked_select/cuda/kernel.cuh create mode 100644 src/infiniop/ops/masked_select/info.h create mode 100644 src/infiniop/ops/masked_select/masked_select.h create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.h create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax.maca create mode 100644 src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.h create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore.mu create mode 100644 src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu create mode 100644 src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh create mode 100644 src/infiniop/ops/masked_select/operator.cc create mode 100644 
src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/metax/tan_metax.h create mode 100644 src/infiniop/ops/tan/metax/tan_metax.maca create mode 100644 src/infiniop/ops/tan/moore/tan_moore.h create mode 100644 src/infiniop/ops/tan/moore/tan_moore.mu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc create mode 100644 src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h create mode 100644 src/infiniop/ops/tanhshrink/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h create mode 100644 src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu create mode 100644 src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu create mode 100644 src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh create mode 100644 src/infiniop/ops/tanhshrink/operator.cc diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp new file mode 100644 index 000000000..5fdfdcce8 --- /dev/null +++ b/include/infinicore/ops/cat.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "common/op.hpp" + +namespace infinicore::op { + +Tensor cat(std::vector tensors, int dim); +void cat_(Tensor out, std::vector tensors, int dim); +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp new file mode 100644 index 000000000..017ddc604 --- /dev/null +++ b/include/infinicore/ops/inner.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include 
"../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Inner { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor out, Tensor input, Tensor other); + static common::OpDispatcher &dispatcher(); +}; + +Tensor inner(Tensor input, Tensor other); +void inner_(Tensor out, Tensor input, Tensor other); + +} \ No newline at end of file diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp new file mode 100644 index 000000000..89bbfa49d --- /dev/null +++ b/include/infinicore/ops/masked_select.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class MaskedSelect { +public: + using schema = void (*)(Tensor, Tensor, void **, size_t *); + static void execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr); + static common::OpDispatcher &dispatcher(); +}; + +Tensor masked_select(Tensor input, Tensor mask); + +} // namespace infinicore::op \ No newline at end of file diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp new file mode 100644 index 000000000..9c2d7a860 --- /dev/null +++ b/include/infinicore/ops/tan.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tan { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tan(Tensor input); +void tan_(Tensor output, Tensor input); + +} \ No newline at end of file diff --git a/include/infinicore/ops/tanhshrink.hpp b/include/infinicore/ops/tanhshrink.hpp new file mode 100644 index 000000000..83673b22c --- /dev/null +++ b/include/infinicore/ops/tanhshrink.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Tanhshrink { +public: + using schema = void (*)(Tensor, Tensor); + 
static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor tanhshrink(Tensor input); +void tanhshrink_(Tensor output, Tensor input); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 2b30ff4ae..b5aa2d966 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -30,9 +30,11 @@ #include "infiniop/ops/hardtanh.h" #include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/masked_select.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" @@ -54,8 +56,10 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/sum.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topk.h" +#include "infiniop/ops/tanhshrink.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/var.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h new file mode 100644 index 000000000..f484cde9c --- /dev/null +++ b/include/infiniop/ops/inner.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_INNER_API_H__ +#define __INFINIOP_INNER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc); + +__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void 
*other, + void *stream); + +__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h new file mode 100644 index 000000000..7adf65415 --- /dev/null +++ b/include/infiniop/ops/masked_select.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_MASKED_SELECT_API_H__ +#define __INFINIOP_MASKED_SELECT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc); + +__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, + size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h new file mode 100644 index 000000000..2e20c61da --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t out_desc); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan( + 
infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h new file mode 100644 index 000000000..e136f6523 --- /dev/null +++ b/include/infiniop/ops/tanhshrink.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANHSHRINK_API_H__ +#define __INFINIOP_TANHSHRINK_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index a895b0426..f5e9a72b0 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -64,6 +64,9 @@ from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod from infinicore.ops.kv_caching import kv_caching +from infinicore.ops.cat import cat +from infinicore.ops.inner import inner +from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mha_kvcache import mha_kvcache from infinicore.ops.mha_varlen import mha_varlen @@ -80,6 +83,7 @@ from infinicore.ops.unsqueeze import unsqueeze from infinicore.ops.var import var from 
infinicore.ops.var_mean import var_mean +from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, @@ -151,6 +155,9 @@ "baddbmm", "bilinear", "fmod", + "cat", + "inner", + "masked_select", "matmul", "equal", "mul", @@ -159,6 +166,7 @@ "unsqueeze", "rearrange", "cross_entropy", + "tan", "empty", "empty_like", "from_blob", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 815eb8a2b..91e101f9d 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -14,6 +14,7 @@ from .silu import silu from .silu_and_mul import silu_and_mul from .swiglu import swiglu +from .tanhshrink import tanhshrink __all__ = [ "adaptive_max_pool1d", @@ -33,4 +34,5 @@ "swiglu", "linear_w8a8i8", "silu_and_mul", + "tanhshrink" ] diff --git a/python/infinicore/nn/functional/tanhshrink.py b/python/infinicore/nn/functional/tanhshrink.py new file mode 100644 index 000000000..10d17077d --- /dev/null +++ b/python/infinicore/nn/functional/tanhshrink.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tanhshrink(input, *, out=None): + if out is None: + return Tensor(_infinicore.tanhshrink(input._underlying)) + + _infinicore.tanhshrink_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py new file mode 100644 index 000000000..3a3e1470f --- /dev/null +++ b/python/infinicore/ops/cat.py @@ -0,0 +1,12 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def cat(tensors, dim:int=0, *,out=None): + if out is None: + return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) + + _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) + + # raise RuntimeError("breakpointer!!!") + return out diff --git a/python/infinicore/ops/inner.py b/python/infinicore/ops/inner.py new file mode 
100644 index 000000000..df87c5b34 --- /dev/null +++ b/python/infinicore/ops/inner.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def inner(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.inner(input._underlying, other._underlying)) + + _infinicore.inner_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py new file mode 100644 index 000000000..d1846edfc --- /dev/null +++ b/python/infinicore/ops/masked_select.py @@ -0,0 +1,5 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def masked_select(input, mask): + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file diff --git a/python/infinicore/ops/tan.py b/python/infinicore/ops/tan.py new file mode 100644 index 000000000..da438e6d8 --- /dev/null +++ b/python/infinicore/ops/tan.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def tan(input, *, out=None): + if out is None: + return Tensor(_infinicore.tan(input._underlying)) + + _infinicore.tan_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc new file mode 100644 index 000000000..908f5f457 --- /dev/null +++ b/src/infinicore/ops/cat/cat.cc @@ -0,0 +1,165 @@ +#include "infinicore/ops/cat.hpp" +#include "infinicore/context/context.hpp" +#include +#include +namespace infinicore::op { + +namespace { + +class CatInfo { + + CatInfo() = default; + +public: + + int dim; + int ndim; + size_t tensors_size; + + std::vector contiguous_dim; + std::vector copy_size; + + static CatInfo create(Tensor& out, std::vector& tensors, int dim) { + + int ndim = out->ndim(); + size_t tensors_size = tensors.size(); + + std::vector contiguous_dim(tensors_size, ndim); + std::vector 
copy_size(tensors_size, dsize(out->dtype())); + + for (int i = 0; i < tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + if (tensors[i]->stride(ndim - 1) == 1) { + contiguous_dim[i] = ndim - 1; + for (int j = ndim - 2; j >= dim; j --) { + if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { + contiguous_dim[i] = j; + } + } + } + for (int j = contiguous_dim[i]; j < ndim; j ++) { + copy_size[i] *= tensors[i]->shape()[j]; + } + } + + return CatInfo{dim, ndim, tensors_size, contiguous_dim, copy_size}; + } +}; + + +void low_dim_copy( + CatInfo &info, Tensor& tensor, Tensor& out, + std::byte *tensor_ptr, std::byte *out_ptr, + int depth, int tensor_pos) { + + if (depth != info.contiguous_dim[tensor_pos]) { + std::byte* now_tensor_ptr = tensor_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; i ++) { + + low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); + + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + if (out->device().getType() == Device::Type::CPU) { + + std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } else { + + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); + } + } +} + +void high_dim_split( + CatInfo &info, std::vector& tensors, Tensor& out, + std::vector tensors_ptr, std::byte* out_ptr, + int depth) { + + if (depth != info.dim) { + std::vector now_tensors_ptr = tensors_ptr; + std::byte* now_out_ptr = out_ptr; + + for (int i = 0; i < out->shape()[depth]; i ++) { + + high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); + } + now_out_ptr += out->stride(depth) * dsize(out->dtype()); + } + } else { + std::byte* now_out_ptr = 
out_ptr; + + for (int i = 0; i < info.tensors_size; i ++) { + if (tensors[i]->ndim() == 1) continue; + + low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); + + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); + } + } +} + +} // namespace + +Tensor cat(std::vector tensors, int dim) { + assert(tensors.size() >= 2); + int ndim = tensors[0]->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + Shape shape = tensors[0]->shape(); + for (int i = 1; i < tensors.size(); i ++) { + assert(tensors[i]->ndim() == dim || tensors[i]->ndim() == 1); + if (tensors[i]->ndim() != ndim) continue; + shape[dim] += tensors[i]->shape()[dim]; + } + + auto out = Tensor::empty(shape, tensors[0]->dtype(), tensors[0]->device()); + cat_(out, tensors, dim); + return out; +} + +void cat_(Tensor out, std::vector tensors, int dim) { + // assert the parameter properties are correct. + assert(tensors.size() >= 2); + int ndim = out->ndim(); + assert(-ndim <= dim && dim < ndim); + dim = (dim + ndim) % ndim; + + size_t dim_shape = 0; + for (auto& tensor : tensors) { + assert(tensor->ndim() == ndim || tensors[i]->ndim() == 1); + if (tensor->ndim() == 1) { + assert(tensor->shape()[0] == 0); + continue; + } + for (int i = 0; i < ndim; i ++) { + if (i != dim) { + assert(tensor->shape()[i] == out->shape()[i]); + } else { + dim_shape += tensor->shape()[i]; + } + } + } + assert(dim_shape == out->shape()[dim]); + + // Get info + CatInfo info = CatInfo::create(out, tensors, dim); + std::vector tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i ++) { + tensors_ptr[i] = tensors[i]->data(); + } + std::byte* out_ptr = out->data(); + + high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); + +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc new file mode 100644 index 000000000..271ad101c --- /dev/null +++ 
b/src/infinicore/ops/inner/inner.cc @@ -0,0 +1,46 @@ +#include +#include + +namespace infinicore::op { + + +common::OpDispatcher &Inner::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Inner::execute(Tensor out, Tensor input, Tensor other) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Inner implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(out, input, other); +} + +Tensor inner(Tensor input, Tensor other) { + size_t input_ndim = input->ndim(); + size_t other_ndim = other->ndim(); + + assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); + + Shape out_shape; + for (int i = 0; i < input_ndim - 1; i ++) + out_shape.push_back(input->shape()[i]); + for (int i = 0; i < other_ndim - 1; i ++) + out_shape.push_back(other->shape()[i]); + auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); + + inner_(out, input, other); + return out; +} + +void inner_(Tensor out, Tensor input, Tensor other) { + + Inner::execute(out, input, other); + +} + +} \ No newline at end of file diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc new file mode 100644 index 000000000..fe6b8de9a --- /dev/null +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/inner.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::inner_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopInnerDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyInnerDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor other) { + size_t seed = hash_combine(out, input, other); + + auto device_type = 
context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopInnerDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateInnerDescriptor( + context::getInfiniopHandle(input->device()), &desc, + out->desc(), input->desc(), other->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopInner( + desc, workspace->data(), workspace_size, + out->data(), input->data(), other->data(), context::getStream())); +} + +static bool registered = []() { + Inner::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} \ No newline at end of file diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc new file mode 100644 index 000000000..13ac6dfe4 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select.cc @@ -0,0 +1,33 @@ +#include +#include + +namespace infinicore::op { + +common::OpDispatcher &MaskedSelect::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(input, mask, data_ptr, dlen_ptr); +} + +Tensor masked_select(Tensor input, Tensor mask) { + + std::byte *data; + size_t dlen; + 
MaskedSelect::execute(input, mask, (void **)&data, &dlen); + + auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); + + return out; +} + +} diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc new file mode 100644 index 000000000..0144ce425 --- /dev/null +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -0,0 +1,59 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/masked_select.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::masked_select_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopMaskedSelectDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaskedSelectDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { + size_t seed = hash_combine(input, mask, (std::uintptr_t)data_ptr, (std::uintptr_t)dlen_ptr); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaskedSelectDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaskedSelectDescriptor( + context::getInfiniopHandle(input->device()), &desc, + input->desc(), mask->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + + INFINICORE_CHECK_ERROR(infiniopMaskedSelect( + desc, workspace->data(), workspace_size, + input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); +} + +static bool registered = []() { + 
MaskedSelect::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} diff --git a/src/infinicore/ops/tan/tan.cc b/src/infinicore/ops/tan/tan.cc new file mode 100644 index 000000000..20bcab4b5 --- /dev/null +++ b/src/infinicore/ops/tan/tan.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tan::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tan::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tan implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tan(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tan_(output, input); + return output; +} + +void tan_(Tensor output, Tensor input) { + Tan::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc new file mode 100644 index 000000000..1d7c9c874 --- /dev/null +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tan.hpp" +#include + +namespace infinicore::op::tan_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto 
device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTan( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tan::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // namespace infinicore::op::tan_impl::infiniop diff --git a/src/infinicore/ops/tanhshrink/tanhshrink.cc b/src/infinicore/ops/tanhshrink/tanhshrink.cc new file mode 100644 index 000000000..ea96e46f4 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Tanhshrink::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Tanhshrink::execute(Tensor output, Tensor input) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Tanhshrink implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor tanhshrink(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + tanhshrink_(output, input); + return output; +} + 
+void tanhshrink_(Tensor output, Tensor input) { + Tanhshrink::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc new file mode 100644 index 000000000..ca440f875 --- /dev/null +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/tanhshrink.hpp" +#include + +namespace infinicore::op::tanhshrink_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopTanhshrinkDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyTanhshrinkDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopTanhshrinkDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateTanhshrinkDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetTanhshrinkWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopTanhshrink( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Tanhshrink::dispatcher().registerDevice({ + Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR + }, &calculate, false); + return true; +}(); + +} // 
namespace infinicore::op::tanhshrink_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index a468defd8..ffc2e010c 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -14,6 +14,7 @@ #include "ops/baddbmm.hpp" #include "ops/bilinear.hpp" #include "ops/binary_cross_entropy_with_logits.hpp" +#include "ops/cat.hpp" #include "ops/causal_softmax.hpp" #include "ops/cdist.hpp" #include "ops/cross_entropy.hpp" @@ -26,6 +27,9 @@ #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" +#include "ops/inner.hpp" +#include "ops/linear.hpp" +#include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mha_kvcache.hpp" #include "ops/mha_varlen.hpp" @@ -45,6 +49,8 @@ #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" namespace py = pybind11; @@ -62,8 +68,12 @@ inline void bind(py::module &m) { bind_flash_attention(m); bind_kv_caching(m); bind_fmod(m); + bind_cat(m); + bind_causal_softmax(m); + bind_inner(m); bind_random_sample(m); bind_linear(m); + bind_masked_select(m); bind_matmul(m); bind_mul(m); bind_mha_kvcache(m); @@ -80,6 +90,8 @@ inline void bind(py::module &m) { bind_avg_pool1d(m); bind_silu(m); bind_swiglu(m); + bind_tan(m); + bind_tanhshrink(m); bind_rope(m); bind_embedding(m); bind_linear_w8a8i8(m); diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp new file mode 100644 index 000000000..2ef3392b0 --- /dev/null +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/cat.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_cat(py::module &m) { + + m.def("cat", + &op::cat, + py::arg("tensors"), + py::arg("dim") = 0, + R"doc(opertor: torch.cat, out-of-place mode)doc"); + + m.def("cat_", + &op::cat_, + py::arg("out"), + py::arg("tensors"), + py::arg("dim") = 
0, + R"doc(opertor: torch.cat, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp new file mode 100644 index 000000000..16ee91932 --- /dev/null +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "infinicore/ops/inner.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_inner(py::module &m) { + + m.def("inner", + &op::inner, + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, out-of-place mode)doc"); + + m.def("inner_", + &op::inner_, + py::arg("out"), + py::arg("input"), + py::arg("other"), + R"doc(opertor: torch.inner, in-place mode)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp new file mode 100644 index 000000000..2734590e6 --- /dev/null +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "infinicore/ops/masked_select.hpp" +#include + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_masked_select(py::module &m) { + + m.def("masked_select", + &op::masked_select, + py::arg("input"), + py::arg("mask"), + R"doc(opertor: torch.masked_select, out-of-place mode)doc"); + +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/tan.hpp b/src/infinicore/pybind11/ops/tan.hpp new file mode 100644 index 000000000..acd62e048 --- /dev/null +++ b/src/infinicore/pybind11/ops/tan.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tan.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tan(py::module &m) { + m.def("tan", + &op::tan, + py::arg("input"), + R"doc(opertor: torch.tan, out-of-place mode)doc"); + + m.def("tan_", + &op::tan_, + py::arg("output"), + 
py::arg("input"), + R"doc(opertor: torch.tan, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/tanhshrink.hpp b/src/infinicore/pybind11/ops/tanhshrink.hpp new file mode 100644 index 000000000..3780555f4 --- /dev/null +++ b/src/infinicore/pybind11/ops/tanhshrink.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/tanhshrink.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_tanhshrink(py::module &m) { + m.def("tanhshrink", + &op::tanhshrink, + py::arg("input"), + R"doc(opertor: torch.tanhshrink, out-of-place mode)doc"); + + m.def("tanhshrink_", + &op::tanhshrink_, + py::arg("output"), + py::arg("input"), + R"doc(opertor: torch.tanhshrink, in-place mode)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc new file mode 100644 index 000000000..5915cb3fc --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -0,0 +1,88 @@ +#include "inner_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::inner::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { +#pragma omp parallel for + for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { + + size_t out_dim_pos = info->out_ndim - 1; + size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); + size_t 
index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; + index /= info->out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < info->oper_len; i ++) { + if constexpr (std::is_same::value || std::is_same::value) { + tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); + } else { + tmp += input[input_offset] * other[other_offset]; + } + input_offset += info->input_strides[info->input_ndim - 1]; + other_offset += info->other_strides[info->other_ndim - 1]; + } + + if constexpr (std::is_same::value || std::is_same::value) { + out[out_offset] = utils::cast(tmp); + } else { + out[out_offset] = tmp; + } + } + + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void* workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); + } else if (_info.dtype == INFINI_DTYPE_F32){ + CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h new file mode 100644 index 000000000..5d4774dfa --- /dev/null +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -0,0 +1,7 @@ +#ifndef __INNER_CPU_H__ 
+#define __INNER_CPU_H__ +#include "../inner.h" + +DESCRIPTOR(cpu); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh new file mode 100644 index 000000000..1616a4281 --- /dev/null +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __INNER_KERNEL_CUH__ +#define __INNER_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + T tmp = 0; + for (size_t i = 0; i < oper_len; i ++) { + tmp += input[input_offset] * other[other_offset]; + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h new file mode 100644 index 000000000..2484277eb --- /dev/null +++ b/src/infiniop/ops/inner/info.h @@ -0,0 +1,88 @@ +#ifndef __INNER_INFO_H__ +#define __INNER_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::inner { + +class InnerInfo { + 
InnerInfo() = default; + +public: + infiniDtype_t dtype; + + size_t input_ndim; + size_t other_ndim; + size_t out_ndim; + + size_t oper_len; + size_t total_elements; + + std::vector out_shape; + + std::vector input_strides; + std::vector other_strides; + std::vector out_strides; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto dtype = out_desc->dtype(); + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + return INFINI_STATUS_BAD_TENSOR_DTYPE; + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + auto input_ndim = input_desc->ndim(); + auto other_ndim = other_desc->ndim(); + auto out_ndim = out_desc->ndim(); + if (out_ndim + 2 != input_ndim + other_ndim) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + + auto input_shape = input_desc->shape(); + auto other_shape = other_desc->shape(); + auto out_shape = out_desc->shape(); + if (input_ndim && other_ndim) { + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + size_t out_dim_pos = 0; + if (input_ndim) { + for (size_t i = 0; i < input_ndim - 1; i ++) + if (input_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (other_ndim) { + for (size_t i = 0; i < other_ndim - 1; i ++) + if (other_shape[i] != out_shape[out_dim_pos ++]) + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t oper_len = 1; + if (input_ndim && other_ndim) { + oper_len = input_shape[input_ndim - 1]; + } + + size_t total_elements = 1; + for (size_t i = 0; i < out_ndim; i ++) + total_elements *= out_shape[i]; + + auto input_strides = input_desc->strides(); + auto other_strides = other_desc->strides(); + auto out_strides = out_desc->strides(); + + return utils::Result(InnerInfo{ + dtype, + input_ndim, other_ndim, out_ndim, + oper_len, total_elements, out_shape, + input_strides, other_strides, out_strides + 
}); + } +}; + +} // namespace op::inner + +#endif // __INNER_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h new file mode 100644 index 000000000..5c44262a7 --- /dev/null +++ b/src/infiniop/ops/inner/inner.h @@ -0,0 +1,49 @@ +#ifndef Inner_H +#define Inner_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::inner::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + InnerInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + InnerInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t other_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *other, \ + void *stream) const; \ + }; \ + } + + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h new file mode 100644 index 000000000..04a47d445 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -0,0 +1,7 @@ +#ifndef __INNER_METAX_H__ +#define __INNER_METAX_H__ +#include "../inner.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca new file mode 100644 index 000000000..683f4a1e1 --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -0,0 +1,123 
@@ +#include "../../../devices/metax/metax_common.h" +#include "inner_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "inner_metax_kernel.h" + +namespace op::inner::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_hc = input_strides_hc + input_ndim; + ptrdiff_t *out_strides_hc = other_strides_hc + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == 
workspace_size); + + CHECK_METAX(hcMemcpyAsync(out_shape_hc, info.out_shape.data(), out_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(other_strides_hc, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(out_strides_hc, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_hc, + input_strides_hc, other_strides_hc, out_strides_hc, + input_ndim, other_ndim, out_ndim); + + // CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512) + } else { + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h new file mode 100644 index 000000000..52f37e1ef --- /dev/null +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -0,0 +1,59 @@ +#ifndef __INNER_METAX_KERNEL_H__ +#define __INNER_METAX_KERNEL_H__ + +template +INFINIOP_METAX_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + // T tmp = input[input_offset] * other[other_offset]; + // input_offset += input_strides[input_ndim - 1]; + // other_offset += other_strides[other_ndim - 1]; + T tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + if constexpr (std::is_same_v || std::is_same_v) { + tmp += input[input_offset] * other[other_offset]; + } else { + // if (out_index == 0) { + // printf("input: %.10f\tother:%.10f\n", input[input_offset], other[other_offset]); + // } + T mul_a = __fmul_rn(input[input_offset], other[other_offset]); + // if (out_index == 0) { + // printf("mul answer: %.10f\n", mul_a); + // printf("tmp 
val: %.10f\n", tmp); + // } + tmp = __fadd_rd(tmp, mul_a); + // if (out_index == 0) + // printf("add answer: %.10f\n", tmp); + } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + // out[out_offset] = static_cast(tmp); + out[out_offset] = tmp; +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h new file mode 100644 index 000000000..a508a6718 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -0,0 +1,7 @@ +#ifndef __INNER_MOORE_H__ +#define __INNER_MOORE_H__ +#include "../inner.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu new file mode 100644 index 000000000..61a430d2c --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "inner_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +#include "inner_moore_kernel.h" + +namespace op::inner::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return 
INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_musa = input_strides_musa + input_ndim; + ptrdiff_t *out_strides_musa = other_strides_musa + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(out_shape_musa, info.out_shape.data(), out_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(other_strides_musa, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(out_strides_musa, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_musa, + input_strides_musa, other_strides_musa, out_strides_musa, + input_ndim, other_ndim, out_ndim); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void 
*input, + const void *other, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h new file mode 100644 index 000000000..de82dc8a9 --- /dev/null +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -0,0 +1,45 @@ +#ifndef __INNER_MOORE_KERNEL_H__ +#define __INNER_MOORE_KERNEL_H__ + +template +INFINIOP_MOORE_KERNEL innerKernel( + const T *input, const T *other, T *out, + size_t total_elements, size_t oper_len, size_t *out_shape, + ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, + size_t input_ndim, size_t other_ndim, size_t out_ndim) { + + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; + if (out_index >= total_elements) return; + + size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, 
out_shape, out_strides); + + size_t out_dim_pos = out_ndim - 1; + size_t index = out_index; + + ptrdiff_t input_offset = 0; + ptrdiff_t other_offset = 0; + + for (int i = (int)other_ndim - 2; i >= 0; i --) { + other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; + index /= out_shape[out_dim_pos --]; + } + for (int i = (int)input_ndim - 2; i >= 0; i --) { + input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; + index /= out_shape[out_dim_pos --]; + } + + float tmp = 0.; + for (size_t i = 0; i < oper_len; i ++) { + // if constexpr (std::is_same_v || std::is_same_v) { + tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); + // } else { + // tmp += input[input_offset] * other[other_offset]; + // } + input_offset += input_strides[input_ndim - 1]; + other_offset += other_strides[other_ndim - 1]; + } + + out[out_offset] = static_cast(tmp); +} + +#endif // __INNER_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu new file mode 100644 index 000000000..e2b3ed0f9 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -0,0 +1,123 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "inner_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::inner::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + + auto result = InnerInfo::create(out_desc, input_desc, other_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // out_shape + workspace_size += info.out_ndim * sizeof(size_t); + // input, other, out strides + 
workspace_size += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const InnerInfo &info, + const T *input, const T *other, T *out, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t input_ndim = info.input_ndim; + size_t other_ndim = info.other_ndim; + size_t out_ndim = info.out_ndim; + size_t total_elements = info.total_elements; + size_t oper_len = info.oper_len; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *out_shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += out_ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + ptrdiff_t *other_strides_cuda = input_strides_cuda + input_ndim; + ptrdiff_t *out_strides_cuda = other_strides_cuda + other_ndim; + workspace_offset += (info.input_ndim + info.other_ndim + info.out_ndim) * sizeof(ptrdiff_t); + + CHECK_CUDA(cudaMemcpyAsync(out_shape_cuda, info.out_shape.data(), out_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(other_strides_cuda, info.other_strides.data(), other_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(out_strides_cuda, info.out_strides.data(), out_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + innerKernel<<>>( + input, other, out, + total_elements, oper_len, out_shape_cuda, + input_strides_cuda, other_strides_cuda, out_strides_cuda, + 
input_ndim, other_ndim, out_ndim); + + // CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if(_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if(_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh new file mode 100644 index 000000000..11033c2c0 --- /dev/null +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INNER_NVIDIA_H__ +#define __INNER_NVIDIA_H__ +#include "../inner.h" + +DESCRIPTOR(nvidia); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc new file mode 100644 index 000000000..c68762d0e --- /dev/null +++ 
b/src/infiniop/ops/inner/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/inner.h" + +#ifdef ENABLE_CPU_API +#include "cpu/inner_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/inner_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/inner_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/inner_moore.h" +#endif + +__C infiniStatus_t infiniopCreateInnerDescriptor( + infiniopHandle_t handle, + infiniopInnerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t other_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ + other_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopInner( + infiniopInnerDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *input, + const void *other, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, out, input, other, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc new file mode 100644 index 000000000..ae74587d8 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -0,0 +1,58 @@ +#include "masked_select_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::masked_select::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor 
**desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { + std::vector res; + for (size_t index = 0; index < info->total_elements; index ++) { + size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); + size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); + if (mask[mask_offset]) { + res.push_back(input[input_offset]); + } + } + *dlen_ptr = res.size(); + *data_ptr = new T[*dlen_ptr]; + std::memcpy(*data_ptr, res.data(), sizeof(T) * (*dlen_ptr)); + + return INFINI_STATUS_SUCCESS; +} + +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(maskedSelect(&_info, (const float *)input, mask, data_ptr, dlen_ptr)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h new file mode 100644 index 000000000..9fed9e2e6 --- /dev/null +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_CPU_H__ +#define __MASKED_SELECT_CPU_H__ +#include "../masked_select.h" + +DESCRIPTOR(cpu) + +#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh 
b/src/infiniop/ops/masked_select/cuda/kernel.cuh new file mode 100644 index 000000000..fc7485f31 --- /dev/null +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_KERNEL_CUH__ +#define __MASKED_SELECT_KERNEL_CUH__ + +template +INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h new file mode 100644 index 000000000..889b32821 --- /dev/null +++ b/src/infiniop/ops/masked_select/info.h @@ -0,0 +1,53 @@ +#ifndef __MASKED_SELECT__INFO_H__ +#define __MASKED_SELECT__INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::masked_select { + +class MaskedSelectInfo { + MaskedSelectInfo() = 
default; + +public: + infiniDtype_t dtype; + size_t ndim; + size_t total_elements; + + std::vector shape; + + std::vector input_strides; + std::vector mask_strides; + + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); + + auto shape = input_desc->shape(); + CHECK_SAME_SHAPE(shape, mask_desc->shape()); + + auto ndim = input_desc->ndim(); + + size_t total_elements = 1; + for (auto& dim : shape) { + total_elements *= dim; + } + + auto input_strides = input_desc->strides(); + auto mask_strides = mask_desc->strides(); + + return utils::Result(MaskedSelectInfo{ + dtype, + ndim, + total_elements, + shape, + input_strides, + mask_strides}); + } +}; + +} // namespace op::masked_select + +#endif // __MASKED_SELECT_INFO_H__ diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h new file mode 100644 index 000000000..98148ebaf --- /dev/null +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -0,0 +1,48 @@ +#ifndef MASKED_SELECT_H +#define MASKED_SELECT_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::masked_select::NAMESPACE { \ + class Descriptor final: public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MaskedSelectInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + MaskedSelectInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + 
infiniopTensorDescriptor_t mask_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + const void *input, \ + const bool *mask, \ + void **data_ptr, \ + size_t *dlen_ptr, \ + void *stream) const; \ + }; \ + } + +#endif // MASKED_SELECT_H \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h new file mode 100644 index 000000000..47460777f --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_METAX_H__ +#define __MASKED_SELECT_METAX_H__ +#include "../masked_select.h" + +DESCRIPTOR(metax); + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca new file mode 100644 index 000000000..4a4181153 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -0,0 +1,154 @@ +#include "../../../devices/metax/metax_common.h" +#include "masked_select_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_metax_kernel.h" + +namespace op::masked_select::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += 
info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + hcStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_METAX(hcMemcpyAsync(shape_hc, info.shape.data(), ndim * sizeof(size_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_hc, info.input_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(mask_strides_hc, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < 
total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_METAX(hcDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_METAX(hcDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); + + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // context::setDevice(METAX); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_hc, input_strides_hc, mask_strides_hc, ndim); + CHECK_METAX(hcDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + hcStream_t stream = (hcStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(METAX_BLOCK_SIZE_512); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + 
} + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h new file mode 100644 index 000000000..e590de955 --- /dev/null +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ +#define __MASKED_SELECT_METAX_KERNEL_CUH__ + +template +INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? 
mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( + const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h new file mode 100644 index 000000000..24548d8fc --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -0,0 +1,7 @@ +#ifndef __MASKED_SELECT_MOORE_H__ +#define __MASKED_SELECT_MOORE_H__ +#include "../masked_select.h" + +DESCRIPTOR(moore); + +#endif \ No newline at end of file diff 
--git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu new file mode 100644 index 000000000..3c218bc19 --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -0,0 +1,156 @@ +#include "../../../devices/moore/moore_common.h" +#include "masked_select_moore.h" + +#include "../../../devices/moore/moore_kernel_common.h" + +// #include "infinicore/context/context.hpp" +#include "masked_select_moore_kernel.h" + +namespace op::masked_select::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + musaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t 
*input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + assert(workspace_offset == workspace_size); + + CHECK_MOORE(musaMemcpyAsync(shape_musa, info.shape.data(), ndim * sizeof(size_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(input_strides_musa, info.input_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + CHECK_MOORE(musaMemcpyAsync(mask_strides_musa, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, total_elements, stride); + CHECK_MOORE(musaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_MOORE(musaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); + + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + // 
context::setDevice(MOORE); + // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_musa, input_strides_musa, mask_strides_musa, ndim); + CHECK_MOORE(musaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + musaStream_t stream = (musaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_2048); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h new file mode 100644 index 000000000..dcdd87c6d --- /dev/null +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -0,0 +1,101 @@ +#ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ +#define __MASKED_SELECT_MOORE_KERNEL_CUH__ + +template +INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( + const bool *mask, size_t *mark_scan, 
size_t total_elements, + size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < total_elements) { + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + smem[tid] = mask[mask_offset]; + } else { + smem[tid] = 0; + } + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + float temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( + size_t *mark_scan, size_t total_elements, size_t stride) { + + __shared__ __align__(128) size_t smem[BLOCK_SIZE]; + + size_t tid = threadIdx.x; + size_t index = (blockDim.x * blockIdx.x + threadIdx.x + 1) * stride - 1; + smem[tid] = index < total_elements ? mark_scan[index] : 0.; + + for (int s = 1; s < BLOCK_SIZE; s <<= 1) { + __syncthreads(); + + size_t temp; + if (tid >= s) + temp = smem[tid] + smem[tid - s]; + __syncthreads(); + + if (tid >= s) + smem[tid] = temp; + } + + if (index < total_elements) + mark_scan[index] = smem[tid]; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( + size_t *mark_scan, size_t *scan_result, size_t total_elements) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= total_elements) + return; + + size_t val = mark_scan[index]; + + for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { + size_t now_stride_block = (index + 1) / s; + size_t pre_stride_block = now_stride_block * s - 1; + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + continue; + val += mark_scan[pre_stride_block]; + } + + scan_result[index] = val; +} + +template +INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( + const T *input, const 
bool *mask, size_t *scan_result, T *data, size_t total_elements, + size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + if (index >= total_elements) + return; + + size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); + size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); + + if (mask[mask_offset]) { + data[scan_result[index] - 1] = input[input_offset]; + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu new file mode 100644 index 000000000..eebba0fd3 --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -0,0 +1,149 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "masked_select_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::masked_select::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + + auto result = MaskedSelectInfo::create(input_desc, mask_desc); + CHECK_RESULT(result); + auto info = result.take(); + size_t workspace_size = 0; + // shape size + workspace_size += info.ndim * sizeof(size_t); + // strides size * 2 + workspace_size += info.ndim * sizeof(ptrdiff_t) * 2; + // size_t mark_scan + workspace_size += info.total_elements * sizeof(size_t); + // size_t mark_result + workspace_size += info.total_elements * sizeof(size_t); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info, workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace 
{ + +template +infiniStatus_t launchKernel( + const MaskedSelectInfo &info, + const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, + cudaStream_t stream, void *workspace, size_t workspace_size) { + + size_t ndim = info.ndim; + size_t total_elements = info.total_elements; + + size_t workspace_offset = 0; + unsigned char *workspace_ptr = reinterpret_cast(workspace); + + size_t *shape_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(size_t); + + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += ndim * sizeof(ptrdiff_t); + + size_t *mark_scan = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + size_t *scan_result = reinterpret_cast(workspace_ptr + workspace_offset); + workspace_offset += info.total_elements * sizeof(size_t); + + CHECK_CUDA(cudaMemcpyAsync(shape_cuda, info.shape.data(), ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(mask_strides_cuda, info.mask_strides.data(), ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream)); + + size_t block_size = BLOCK_SIZE; + size_t grid_size = (total_elements + block_size - 1) / block_size; + + maskedSelectGetMarkScanOnceKernel<<>>( + mask, mark_scan, total_elements, + shape_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { + size_t stride_elements = (total_elements + stride - 1) / stride; + size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + + maskedSelectScanWithStrideKernel<<>>( + mark_scan, 
total_elements, stride); + CHECK_CUDA(cudaDeviceSynchronize()); + } + if (total_elements > BLOCK_SIZE) { + maskedSelectCountScanResultKernel<<>>( + mark_scan, scan_result, total_elements); + CHECK_CUDA(cudaDeviceSynchronize()); + } else { + scan_result = mark_scan; + } + + CHECK_CUDA(cudaMemcpy(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost)); + + CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); + + maskedSelectGetDataKernel<<>>( + input, mask, scan_result, (T *)*data_ptr, total_elements, + shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); + CHECK_CUDA(cudaDeviceSynchronize()); + + return INFINI_STATUS_SUCCESS; +} + +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_ptr, + size_t *dlen_ptr, + void *stream_) const { + + cudaStream_t stream = (cudaStream_t)stream_; +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size \ + ) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh
b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh new file mode 100644 index 000000000..bc493edbe --- /dev/null +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MASKED_SELECT_NVIDIA_H__ +#define __MASKED_SELECT_NVIDIA_H__ + +#include "../masked_select.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc new file mode 100644 index 000000000..055b67b12 --- /dev/null +++ b/src/infiniop/ops/masked_select/operator.cc @@ -0,0 +1,138 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/masked_select.h" + +#ifdef ENABLE_CPU_API +#include "cpu/masked_select_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/masked_select_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/masked_select_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/masked_select_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( + infiniopHandle_t handle, + infiniopMaskedSelectDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t mask_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr),\ + input_desc, \ + mask_desc); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMaskedSelect( + infiniopMaskedSelectDescriptor_t desc, + void *workspace, size_t workspace_size, + const void *input, + const bool *mask, + void **data_size, + size_t *dlen, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, input, mask, data_size, dlen, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore) +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..90939f9cb --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,50 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..04004dd19 --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include 
"../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +#include + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; + +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..85c9eca27 --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TAN_CUDA_H__ +#define __TAN_CUDA_H__ + +namespace op::tan::cuda { + +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(__tanf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return __tanf(x); + } else { + return __tanf(x); + } + } +} TanOp; + +} // namespace op::tan::cuda + +#endif // __TAN_CUDA_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.h b/src/infiniop/ops/tan/metax/tan_metax.h new file mode 100644 index 000000000..507fd9d5e --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.h @@ -0,0 +1,8 @@ +#ifndef __TAN_METAX_API_H__ +#define __TAN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, metax) + +#endif // __TAN_METAX_API_H__ diff --git a/src/infiniop/ops/tan/metax/tan_metax.maca b/src/infiniop/ops/tan/metax/tan_metax.maca new file mode 100644 index 000000000..85768bec3 --- /dev/null +++ b/src/infiniop/ops/tan/metax/tan_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tan_metax.h" + +namespace op::tan::metax { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::metax diff --git a/src/infiniop/ops/tan/moore/tan_moore.h b/src/infiniop/ops/tan/moore/tan_moore.h new file mode 100644 index 000000000..c261e4a69 --- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.h @@ -0,0 +1,8 @@ +#ifndef __TAN_MOORE_API_H__ +#define __TAN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tan, moore) + +#endif // __TAN_MOORE_API_H__ diff --git a/src/infiniop/ops/tan/moore/tan_moore.mu b/src/infiniop/ops/tan/moore/tan_moore.mu new file mode 100644 index 000000000..4caec16ef 
--- /dev/null +++ b/src/infiniop/ops/tan/moore/tan_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tan_moore.h" + +namespace op::tan::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::moore diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu new file mode 100644 index 000000000..d5415cb62 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" 
+#include "tan_nvidia.cuh" + +namespace op::tan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::nvidia diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh new file mode 100644 index 000000000..1915cc2b1 --- /dev/null +++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TAN_CUDA_API_H__ +#define __TAN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tan, nvidia) + +#endif // __TAN_CUDA_API_H__ diff --git 
a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..1f222a8d1 --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tan_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tan_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tan_moore.h" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc new file mode 100644 index 000000000..1abca6a33 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.cc @@ -0,0 +1,50 @@ +#include "tanhshrink_cpu.h" + +namespace op::tanhshrink::cpu { + 
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::cpu diff --git a/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h new file mode 100644 index 000000000..8c5f722ad --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cpu/tanhshrink_cpu.h @@ -0,0 +1,23 @@ +#ifndef __TANHSHRINK_CPU_H__ +#define __TANHSHRINK_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, cpu) + +#include + +namespace op::tanhshrink::cpu { +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x - std::tanh(x); + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cpu + +#endif // __TANHSHRINK_CPU_H__
diff --git a/src/infiniop/ops/tanhshrink/cuda/kernel.cuh b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh new file mode 100644 index 000000000..dfb88731e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __TANHSHRINK_CUDA_H__ +#define __TANHSHRINK_CUDA_H__ + +namespace op::tanhshrink::cuda { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP32 + return x - tanhf(x); + } else { + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::cuda + +#endif // __TANHSHRINK_CUDA_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h new file mode 100644 index 000000000..b79edcbf9 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_METAX_API_H__ +#define __TANHSHRINK_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, metax) + +#endif // __TANHSHRINK_METAX_API_H__ diff --git a/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca new file mode 100644 index 000000000..c63390ab7 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/metax/tanhshrink_metax.maca @@ -0,0 +1,56 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_metax.h" + +namespace op::tanhshrink::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, hpcc_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::metax diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h new file mode 100644 index 000000000..5a01a76ce --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.h @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_MOORE_API_H__ +#define __TANHSHRINK_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, moore) + +#endif // __TANHSHRINK_MOORE_API_H__ diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu new file mode 100644 index 000000000..4c4bdee8f --- 
/dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore.mu @@ -0,0 +1,56 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_moore.h" + +namespace op::tanhshrink::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, mt_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::moore diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h new file mode 100644 index 000000000..9c75b916e --- /dev/null +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -0,0 +1,28 
@@ +#ifndef __TANHSHRINK_MOORE_KERNEL_H__ +#define __TANHSHRINK_MOORE_KERNEL_H__ + +namespace op::tanhshrink::moore { + +typedef struct TanhshrinkOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f - tanhf(x_f)); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f - tanhf(x_f)); + } else { + // FP32 and any other type + return x - tanhf(x); + } + } +} TanhshrinkOp; + +} // namespace op::tanhshrink::moore + +#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu new file mode 100644 index 000000000..baff908a2 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanhshrink_nvidia.cuh" + +namespace op::tanhshrink::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, +
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhshrinkOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhshrinkOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhshrinkOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhshrinkOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanhshrink::nvidia diff --git a/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh new file mode 100644 index 000000000..abe56f444 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/nvidia/tanhshrink_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANHSHRINK_CUDA_API_H__ +#define __TANHSHRINK_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanhshrink, nvidia) + +#endif // __TANHSHRINK_CUDA_API_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc new file mode 100644 index 000000000..66e62b850 --- /dev/null +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanhshrink.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanhshrink_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanhshrink_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanhshrink_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/tanhshrink_moore.h" +#endif + +__C infiniStatus_t 
infiniopCreateTanhshrinkDescriptor( + infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanhshrink::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanhshrink( + infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef 
ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/cat.py b/test/infinicore/ops/cat.py index e66f29fa4..787692343 100644 --- a/test/infinicore/ops/cat.py +++ b/test/infinicore/ops/cat.py @@ -126,9 +126,9 @@ def torch_operator(self, *args, **kwargs): """PyTorch cat implementation""" return torch.cat(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore cat implementation""" - # return infinicore.cat(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore cat implementation""" + return infinicore.cat(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 3766ca860..420123eec 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return 
torch.inner(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.inner(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.inner(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/masked_select.py b/test/infinicore/ops/masked_select.py index e7230b661..17ecc0fe9 100644 --- a/test/infinicore/ops/masked_select.py +++ b/test/infinicore/ops/masked_select.py @@ -53,9 +53,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.masked_select(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.masked_select(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.masked_select(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tan.py b/test/infinicore/ops/tan.py index acb1cac5a..9a02615b5 100644 --- a/test/infinicore/ops/tan.py +++ b/test/infinicore/ops/tan.py @@ -97,9 +97,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.tan(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.tan(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.tan(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/tanhshrink.py b/test/infinicore/ops/tanhshrink.py index 4e60eb264..36bcb8b95 100644 --- a/test/infinicore/ops/tanhshrink.py +++ b/test/infinicore/ops/tanhshrink.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.tanhshrink(*args, 
**kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.tanhshrink(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.tanhshrink(*args, **kwargs) def main(): From 8d7e2f8c527e29ed0a84eef128c595ffea361249 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 19 Mar 2026 03:13:31 +0000 Subject: [PATCH 2/3] issue/1031 fix T1-1-32 --- include/infinicore/ops/cat.hpp | 2 +- include/infinicore/ops/inner.hpp | 2 +- include/infinicore/ops/masked_select.hpp | 2 +- include/infinicore/ops/tan.hpp | 2 +- include/infiniop.h | 6 +- include/infiniop/ops/inner.h | 10 +-- include/infiniop/ops/masked_select.h | 10 +-- include/infiniop/ops/tan.h | 10 +-- include/infiniop/ops/tanhshrink.h | 24 ++--- python/infinicore/__init__.py | 6 +- python/infinicore/nn/functional/__init__.py | 2 +- python/infinicore/ops/cat.py | 4 +- python/infinicore/ops/masked_select.py | 3 +- src/infinicore/ops/cat/cat.cc | 87 ++++++++++--------- src/infinicore/ops/inner/inner.cc | 12 +-- src/infinicore/ops/inner/inner_infiniop.cc | 18 ++-- .../ops/masked_select/masked_select.cc | 8 +- .../masked_select/masked_select_infiniop.cc | 18 ++-- src/infinicore/ops/tan/tan_infiniop.cc | 13 ++- .../ops/tanhshrink/tanhshrink_infiniop.cc | 13 ++- src/infinicore/pybind11/ops.hpp | 7 +- src/infinicore/pybind11/ops/cat.hpp | 4 +- src/infinicore/pybind11/ops/inner.hpp | 4 +- src/infinicore/pybind11/ops/masked_select.hpp | 3 +- src/infiniop/ops/inner/cpu/inner_cpu.cc | 31 ++++--- src/infiniop/ops/inner/cpu/inner_cpu.h | 2 +- src/infiniop/ops/inner/cuda/kernel.cuh | 22 ++--- src/infiniop/ops/inner/info.h | 33 ++++--- src/infiniop/ops/inner/inner.h | 6 +- src/infiniop/ops/inner/metax/inner_metax.h | 2 +- src/infiniop/ops/inner/metax/inner_metax.maca | 37 ++++---- .../ops/inner/metax/inner_metax_kernel.h | 22 
++--- src/infiniop/ops/inner/moore/inner_moore.h | 2 +- src/infiniop/ops/inner/moore/inner_moore.mu | 37 ++++---- .../ops/inner/moore/inner_moore_kernel.h | 22 ++--- src/infiniop/ops/inner/nvidia/inner_nvidia.cu | 39 +++++---- .../ops/inner/nvidia/inner_nvidia.cuh | 2 +- src/infiniop/ops/inner/operator.cc | 36 ++++---- .../masked_select/cpu/masked_select_cpu.cc | 10 +-- .../ops/masked_select/cpu/masked_select_cpu.h | 2 +- .../ops/masked_select/cuda/kernel.cuh | 43 +++++---- src/infiniop/ops/masked_select/info.h | 4 +- .../ops/masked_select/masked_select.h | 5 +- .../masked_select/metax/masked_select_metax.h | 2 +- .../metax/masked_select_metax.maca | 51 +++++------ .../metax/masked_select_metax_kernel.h | 43 +++++---- .../masked_select/moore/masked_select_moore.h | 2 +- .../moore/masked_select_moore.mu | 51 +++++------ .../moore/masked_select_moore_kernel.h | 43 +++++---- .../nvidia/masked_select_nvidia.cu | 48 +++++----- .../nvidia/masked_select_nvidia.cuh | 2 +- src/infiniop/ops/masked_select/operator.cc | 32 +++---- src/infiniop/ops/tan/operator.cc | 28 +++--- .../moore/tanhshrink_moore_kernel.h | 2 +- src/infiniop/ops/tanhshrink/operator.cc | 28 +++--- test/infinicore/ops/inner.py | 2 +- 56 files changed, 494 insertions(+), 467 deletions(-) diff --git a/include/infinicore/ops/cat.hpp b/include/infinicore/ops/cat.hpp index 5fdfdcce8..95be62103 100644 --- a/include/infinicore/ops/cat.hpp +++ b/include/infinicore/ops/cat.hpp @@ -6,4 +6,4 @@ namespace infinicore::op { Tensor cat(std::vector tensors, int dim); void cat_(Tensor out, std::vector tensors, int dim); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/inner.hpp b/include/infinicore/ops/inner.hpp index 017ddc604..ae372c415 100644 --- a/include/infinicore/ops/inner.hpp +++ b/include/infinicore/ops/inner.hpp @@ -14,4 +14,4 @@ class Inner { Tensor inner(Tensor input, Tensor other); void inner_(Tensor out, Tensor input, Tensor other); 
-} \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/masked_select.hpp b/include/infinicore/ops/masked_select.hpp index 89bbfa49d..d003e3bb8 100644 --- a/include/infinicore/ops/masked_select.hpp +++ b/include/infinicore/ops/masked_select.hpp @@ -13,4 +13,4 @@ class MaskedSelect { Tensor masked_select(Tensor input, Tensor mask); -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infinicore/ops/tan.hpp b/include/infinicore/ops/tan.hpp index 9c2d7a860..72410f866 100644 --- a/include/infinicore/ops/tan.hpp +++ b/include/infinicore/ops/tan.hpp @@ -14,4 +14,4 @@ class Tan { Tensor tan(Tensor input); void tan_(Tensor output, Tensor input); -} \ No newline at end of file +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index b5aa2d966..753f7d4dd 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -19,18 +19,18 @@ #include "infiniop/ops/cross_entropy.h" #include "infiniop/ops/dequant/per_tensor_dequant_int8.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/fmod.h" -#include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/hardswish.h" #include "infiniop/ops/hardtanh.h" +#include "infiniop/ops/inner.h" #include "infiniop/ops/int8_gemm.h" #include "infiniop/ops/kv_caching.h" -#include "infiniop/ops/inner.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" @@ -58,8 +58,8 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" -#include "infiniop/ops/topk.h" #include "infiniop/ops/tanhshrink.h" +#include "infiniop/ops/topk.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" 
#include "infiniop/ops/var.h" diff --git a/include/infiniop/ops/inner.h b/include/infiniop/ops/inner.h index f484cde9c..99fc8ab13 100644 --- a/include/infiniop/ops/inner.h +++ b/include/infiniop/ops/inner.h @@ -5,16 +5,16 @@ typedef struct InfiniopDescriptor *infiniopInnerDescriptor_t; -__C __export infiniStatus_t infiniopCreateInnerDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc); -__C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopInner( +__INFINI_C __export infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, @@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopInner( const void *other, void *stream); -__C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/masked_select.h b/include/infiniop/ops/masked_select.h index 7adf65415..d24ed11f7 100644 --- a/include/infiniop/ops/masked_select.h +++ b/include/infiniop/ops/masked_select.h @@ -5,15 +5,15 @@ typedef struct InfiniopDescriptor *infiniopMaskedSelectDescriptor_t; -__C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateMaskedSelectDescriptor( infiniopHandle_t handle, infiniopMaskedSelectDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc); -__C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, 
size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopMaskedSelect( +__INFINI_C __export infiniStatus_t infiniopMaskedSelect( infiniopMaskedSelectDescriptor_t desc, void *workspace, size_t workspace_size, @@ -23,6 +23,6 @@ __C __export infiniStatus_t infiniopMaskedSelect( size_t *dlen, void *stream); -__C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 2e20c61da..1b7bef23e 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -5,15 +5,15 @@ typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; -__C __export infiniStatus_t infiniopCreateTanDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateTanDescriptor( infiniopHandle_t handle, infiniopTanDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t out_desc); -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTan( +__INFINI_C __export infiniStatus_t infiniopTan( infiniopTanDescriptor_t desc, void *workspace, size_t workspace_size, @@ -21,6 +21,6 @@ __C __export infiniStatus_t infiniopTan( const void *input, void *stream); -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); -#endif \ No newline at end of file +#endif diff --git a/include/infiniop/ops/tanhshrink.h b/include/infiniop/ops/tanhshrink.h index 
e136f6523..b2895ccab 100644 --- a/include/infiniop/ops/tanhshrink.h +++ b/include/infiniop/ops/tanhshrink.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopTanhshrinkDescriptor_t; -__C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, - infiniopTanhshrinkDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t intput); +__INFINI_C __export infiniStatus_t infiniopCreateTanhshrinkDescriptor(infiniopHandle_t handle, + infiniopTanhshrinkDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); -__C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *intput, - void *stream); +__INFINI_C __export infiniStatus_t infiniopTanhshrink(infiniopTanhshrinkDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); -__C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc); #endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index f5e9a72b0..16b1362a3 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -59,13 +59,13 @@ from infinicore.ops.binary_cross_entropy_with_logits import ( binary_cross_entropy_with_logits, ) +from infinicore.ops.cat import cat from infinicore.ops.cdist import cdist from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.equal import equal from infinicore.ops.fmod import fmod -from infinicore.ops.kv_caching import 
kv_caching -from infinicore.ops.cat import cat from infinicore.ops.inner import inner +from infinicore.ops.kv_caching import kv_caching from infinicore.ops.masked_select import masked_select from infinicore.ops.matmul import matmul from infinicore.ops.mha_kvcache import mha_kvcache @@ -79,11 +79,11 @@ from infinicore.ops.reciprocal import reciprocal from infinicore.ops.squeeze import squeeze from infinicore.ops.sum import sum +from infinicore.ops.tan import tan from infinicore.ops.topk import topk from infinicore.ops.unsqueeze import unsqueeze from infinicore.ops.var import var from infinicore.ops.var_mean import var_mean -from infinicore.ops.tan import tan from infinicore.tensor import ( Tensor, empty, diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 91e101f9d..55de7900b 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -34,5 +34,5 @@ "swiglu", "linear_w8a8i8", "silu_and_mul", - "tanhshrink" + "tanhshrink", ] diff --git a/python/infinicore/ops/cat.py b/python/infinicore/ops/cat.py index 3a3e1470f..e45e9294a 100644 --- a/python/infinicore/ops/cat.py +++ b/python/infinicore/ops/cat.py @@ -2,11 +2,11 @@ from infinicore.tensor import Tensor -def cat(tensors, dim:int=0, *,out=None): +def cat(tensors, dim: int = 0, *, out=None): if out is None: return Tensor(_infinicore.cat([tensor._underlying for tensor in tensors], dim)) _infinicore.cat_(out._underlying, [tensor._underlying for tensor in tensors], dim) - + # raise RuntimeError("breakpointer!!!") return out diff --git a/python/infinicore/ops/masked_select.py b/python/infinicore/ops/masked_select.py index d1846edfc..14a3a1ff0 100644 --- a/python/infinicore/ops/masked_select.py +++ b/python/infinicore/ops/masked_select.py @@ -1,5 +1,6 @@ from infinicore.lib import _infinicore from infinicore.tensor import Tensor + def masked_select(input, mask): - return 
Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) \ No newline at end of file + return Tensor(_infinicore.masked_select(input._underlying, mask._underlying)) diff --git a/src/infinicore/ops/cat/cat.cc b/src/infinicore/ops/cat/cat.cc index 908f5f457..a4e0a8ae0 100644 --- a/src/infinicore/ops/cat/cat.cc +++ b/src/infinicore/ops/cat/cat.cc @@ -7,37 +7,38 @@ namespace infinicore::op { namespace { class CatInfo { - + CatInfo() = default; public: - int dim; int ndim; size_t tensors_size; - + std::vector contiguous_dim; std::vector copy_size; - static CatInfo create(Tensor& out, std::vector& tensors, int dim) { - + static CatInfo create(Tensor &out, std::vector &tensors, int dim) { + int ndim = out->ndim(); size_t tensors_size = tensors.size(); std::vector contiguous_dim(tensors_size, ndim); std::vector copy_size(tensors_size, dsize(out->dtype())); - for (int i = 0; i < tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; + for (int i = 0; i < tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } if (tensors[i]->stride(ndim - 1) == 1) { contiguous_dim[i] = ndim - 1; - for (int j = ndim - 2; j >= dim; j --) { + for (int j = ndim - 2; j >= dim; j--) { if (tensors[i]->stride(j) == tensors[i]->stride(j + 1) * tensors[i]->shape()[j + 1]) { contiguous_dim[i] = j; } } } - for (int j = contiguous_dim[i]; j < ndim; j ++) { + for (int j = contiguous_dim[i]; j < ndim; j++) { copy_size[i] *= tensors[i]->shape()[j]; } } @@ -46,20 +47,19 @@ class CatInfo { } }; - void low_dim_copy( - CatInfo &info, Tensor& tensor, Tensor& out, - std::byte *tensor_ptr, std::byte *out_ptr, + CatInfo &info, Tensor &tensor, Tensor &out, + std::byte *tensor_ptr, std::byte *out_ptr, int depth, int tensor_pos) { if (depth != info.contiguous_dim[tensor_pos]) { - std::byte* now_tensor_ptr = tensor_ptr; - std::byte* now_out_ptr = out_ptr; + std::byte *now_tensor_ptr = tensor_ptr; + std::byte *now_out_ptr = out_ptr; + + for (int i = 0; i < tensor->shape()[depth]; 
i++) { - for (int i = 0; i < tensor->shape()[depth]; i ++) { - low_dim_copy(info, tensor, out, now_tensor_ptr, now_out_ptr, depth + 1, tensor_pos); - + now_tensor_ptr += tensor->stride(depth) * dsize(tensor->dtype()); now_out_ptr += out->stride(depth) * dsize(out->dtype()); } @@ -68,39 +68,43 @@ void low_dim_copy( std::memcpy(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); } else { - + context::memcpyD2D(out_ptr, tensor_ptr, info.copy_size[tensor_pos]); } } } void high_dim_split( - CatInfo &info, std::vector& tensors, Tensor& out, - std::vector tensors_ptr, std::byte* out_ptr, + CatInfo &info, std::vector &tensors, Tensor &out, + std::vector tensors_ptr, std::byte *out_ptr, int depth) { - + if (depth != info.dim) { - std::vector now_tensors_ptr = tensors_ptr; - std::byte* now_out_ptr = out_ptr; + std::vector now_tensors_ptr = tensors_ptr; + std::byte *now_out_ptr = out_ptr; - for (int i = 0; i < out->shape()[depth]; i ++) { + for (int i = 0; i < out->shape()[depth]; i++) { high_dim_split(info, tensors, out, now_tensors_ptr, now_out_ptr, depth + 1); - - for (int i = 0; i < info.tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; + + for (int i = 0; i < info.tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } now_tensors_ptr[i] += tensors[i]->stride(depth) * dsize(tensors[i]->dtype()); } now_out_ptr += out->stride(depth) * dsize(out->dtype()); } } else { - std::byte* now_out_ptr = out_ptr; + std::byte *now_out_ptr = out_ptr; + + for (int i = 0; i < info.tensors_size; i++) { + if (tensors[i]->ndim() == 1) { + continue; + } - for (int i = 0; i < info.tensors_size; i ++) { - if (tensors[i]->ndim() == 1) continue; - low_dim_copy(info, tensors[i], out, tensors_ptr[i], now_out_ptr, depth, i); - + now_out_ptr += tensors[i]->shape()[depth] * out->stride(depth) * dsize(out->dtype()); } } @@ -115,9 +119,11 @@ Tensor cat(std::vector tensors, int dim) { dim = (dim + ndim) % ndim; Shape shape = tensors[0]->shape(); - for (int i = 1; i < 
tensors.size(); i ++) { + for (int i = 1; i < tensors.size(); i++) { assert(tensors[i]->ndim() == dim || tensors[i]->ndim() == 1); - if (tensors[i]->ndim() != ndim) continue; + if (tensors[i]->ndim() != ndim) { + continue; + } shape[dim] += tensors[i]->shape()[dim]; } @@ -134,13 +140,13 @@ void cat_(Tensor out, std::vector tensors, int dim) { dim = (dim + ndim) % ndim; size_t dim_shape = 0; - for (auto& tensor : tensors) { + for (auto &tensor : tensors) { assert(tensor->ndim() == ndim || tensors[i]->ndim() == 1); if (tensor->ndim() == 1) { assert(tensor->shape()[0] == 0); continue; } - for (int i = 0; i < ndim; i ++) { + for (int i = 0; i < ndim; i++) { if (i != dim) { assert(tensor->shape()[i] == out->shape()[i]); } else { @@ -152,14 +158,13 @@ void cat_(Tensor out, std::vector tensors, int dim) { // Get info CatInfo info = CatInfo::create(out, tensors, dim); - std::vector tensors_ptr(tensors.size()); - for (int i = 0; i < tensors.size(); i ++) { + std::vector tensors_ptr(tensors.size()); + for (int i = 0; i < tensors.size(); i++) { tensors_ptr[i] = tensors[i]->data(); } - std::byte* out_ptr = out->data(); + std::byte *out_ptr = out->data(); high_dim_split(info, tensors, out, tensors_ptr, out_ptr, 0); - } -} // namespace infinicore::op \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/inner/inner.cc b/src/infinicore/ops/inner/inner.cc index 271ad101c..c4a9dec8b 100644 --- a/src/infinicore/ops/inner/inner.cc +++ b/src/infinicore/ops/inner/inner.cc @@ -3,7 +3,6 @@ namespace infinicore::op { - common::OpDispatcher &Inner::dispatcher() { static common::OpDispatcher dispatcher_; return dispatcher_; @@ -27,10 +26,12 @@ Tensor inner(Tensor input, Tensor other) { assert(input->shape()[input_ndim - 1] == other->shape()[other_ndim - 1]); Shape out_shape; - for (int i = 0; i < input_ndim - 1; i ++) + for (int i = 0; i < input_ndim - 1; i++) { out_shape.push_back(input->shape()[i]); - for (int i = 0; i < other_ndim - 1; i ++) + } + for 
(int i = 0; i < other_ndim - 1; i++) { out_shape.push_back(other->shape()[i]); + } auto out = Tensor::zeros(out_shape, input->dtype(), input->device()); inner_(out, input, other); @@ -38,9 +39,8 @@ Tensor inner(Tensor input, Tensor other) { } void inner_(Tensor out, Tensor input, Tensor other) { - - Inner::execute(out, input, other); + Inner::execute(out, input, other); } -} \ No newline at end of file +} // namespace infinicore::op diff --git a/src/infinicore/ops/inner/inner_infiniop.cc b/src/infinicore/ops/inner/inner_infiniop.cc index fe6b8de9a..fd51d2128 100644 --- a/src/infinicore/ops/inner/inner_infiniop.cc +++ b/src/infinicore/ops/inner/inner_infiniop.cc @@ -1,7 +1,7 @@ #include "../../utils.hpp" #include "infinicore/common/hash.hpp" -#include "infinicore/ops/inner.hpp" #include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/inner.hpp" #include namespace infinicore::op::inner_impl::infiniop { @@ -39,21 +39,19 @@ void calculate(Tensor out, Tensor input, Tensor other) { INFINICORE_CHECK_ERROR(infiniopGetInnerWorkspaceSize(desc, &workspace_size)); std::shared_ptr workspace = context::allocateMemory(workspace_size); - INFINICORE_CHECK_ERROR(infiniopInner( desc, workspace->data(), workspace_size, out->data(), input->data(), other->data(), context::getStream())); } static bool registered = []() { - Inner::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Inner::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); -} \ No newline at end of file +} // namespace infinicore::op::inner_impl::infiniop diff --git a/src/infinicore/ops/masked_select/masked_select.cc b/src/infinicore/ops/masked_select/masked_select.cc index 13ac6dfe4..1ade25a09 100644 --- a/src/infinicore/ops/masked_select/masked_select.cc +++ 
b/src/infinicore/ops/masked_select/masked_select.cc @@ -11,13 +11,13 @@ common::OpDispatcher &MaskedSelect::dispatcher() { void MaskedSelect::execute(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { auto device_type = context::getDevice().getType(); auto func = dispatcher().lookup(device_type); - + if (func == nullptr) { throw std::runtime_error("No MaskedSelect implementation found for device type: " + std::to_string(static_cast(device_type))); } func(input, mask, data_ptr, dlen_ptr); -} +} Tensor masked_select(Tensor input, Tensor mask) { @@ -26,8 +26,8 @@ Tensor masked_select(Tensor input, Tensor mask) { MaskedSelect::execute(input, mask, (void **)&data, &dlen); auto out = Tensor::from_blob(data, {dlen}, input->dtype(), input->device()); - + return out; } -} +} // namespace infinicore::op diff --git a/src/infinicore/ops/masked_select/masked_select_infiniop.cc b/src/infinicore/ops/masked_select/masked_select_infiniop.cc index 0144ce425..31d08c09f 100644 --- a/src/infinicore/ops/masked_select/masked_select_infiniop.cc +++ b/src/infinicore/ops/masked_select/masked_select_infiniop.cc @@ -1,7 +1,7 @@ #include "../../utils.hpp" #include "infinicore/common/hash.hpp" -#include "infinicore/ops/masked_select.hpp" #include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/masked_select.hpp" #include namespace infinicore::op::masked_select_impl::infiniop { @@ -39,21 +39,19 @@ void calculate(Tensor input, Tensor mask, void **data_ptr, size_t *dlen_ptr) { INFINICORE_CHECK_ERROR(infiniopGetMaskedSelectWorkspaceSize(desc, &workspace_size)); std::shared_ptr workspace = context::allocateMemory(workspace_size); - INFINICORE_CHECK_ERROR(infiniopMaskedSelect( desc, workspace->data(), workspace_size, input->data(), (const bool *)mask->data(), data_ptr, dlen_ptr, context::getStream())); } static bool registered = []() { - MaskedSelect::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - 
Device::Type::ILUVATAR - }, &calculate, false); + MaskedSelect::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); -} +} // namespace infinicore::op::masked_select_impl::infiniop diff --git a/src/infinicore/ops/tan/tan_infiniop.cc b/src/infinicore/ops/tan/tan_infiniop.cc index 1d7c9c874..ccb766490 100644 --- a/src/infinicore/ops/tan/tan_infiniop.cc +++ b/src/infinicore/ops/tan/tan_infiniop.cc @@ -45,13 +45,12 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tan::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Tan::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); diff --git a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc index ca440f875..1d7fa4d78 100644 --- a/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc +++ b/src/infinicore/ops/tanhshrink/tanhshrink_infiniop.cc @@ -45,13 +45,12 @@ void calculate(Tensor output, Tensor input) { } static bool registered = []() { - Tanhshrink::dispatcher().registerDevice({ - Device::Type::CPU, - Device::Type::NVIDIA, - Device::Type::METAX, - Device::Type::MOORE, - Device::Type::ILUVATAR - }, &calculate, false); + Tanhshrink::dispatcher().registerDevice({Device::Type::CPU, + Device::Type::NVIDIA, + Device::Type::METAX, + Device::Type::MOORE, + Device::Type::ILUVATAR}, + &calculate, false); return true; }(); diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index ffc2e010c..849d6c474 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -24,11 +24,10 @@ #include "ops/fmod.hpp" #include 
"ops/hardswish.hpp" #include "ops/hardtanh.hpp" +#include "ops/inner.hpp" #include "ops/kv_caching.hpp" #include "ops/linear.hpp" #include "ops/linear_w8a8i8.hpp" -#include "ops/inner.hpp" -#include "ops/linear.hpp" #include "ops/masked_select.hpp" #include "ops/matmul.hpp" #include "ops/mha_kvcache.hpp" @@ -46,11 +45,11 @@ #include "ops/silu_and_mul.hpp" #include "ops/sum.hpp" #include "ops/swiglu.hpp" +#include "ops/tan.hpp" +#include "ops/tanhshrink.hpp" #include "ops/topk.hpp" #include "ops/var.hpp" #include "ops/var_mean.hpp" -#include "ops/tan.hpp" -#include "ops/tanhshrink.hpp" namespace py = pybind11; diff --git a/src/infinicore/pybind11/ops/cat.hpp b/src/infinicore/pybind11/ops/cat.hpp index 2ef3392b0..093b15ff2 100644 --- a/src/infinicore/pybind11/ops/cat.hpp +++ b/src/infinicore/pybind11/ops/cat.hpp @@ -14,7 +14,7 @@ inline void bind_cat(py::module &m) { py::arg("tensors"), py::arg("dim") = 0, R"doc(opertor: torch.cat, out-of-place mode)doc"); - + m.def("cat_", &op::cat_, py::arg("out"), @@ -23,4 +23,4 @@ inline void bind_cat(py::module &m) { R"doc(opertor: torch.cat, in-place mode)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/inner.hpp b/src/infinicore/pybind11/ops/inner.hpp index 16ee91932..b54d6117b 100644 --- a/src/infinicore/pybind11/ops/inner.hpp +++ b/src/infinicore/pybind11/ops/inner.hpp @@ -14,7 +14,7 @@ inline void bind_inner(py::module &m) { py::arg("input"), py::arg("other"), R"doc(opertor: torch.inner, out-of-place mode)doc"); - + m.def("inner_", &op::inner_, py::arg("out"), @@ -23,4 +23,4 @@ inline void bind_inner(py::module &m) { R"doc(opertor: torch.inner, in-place mode)doc"); } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/masked_select.hpp b/src/infinicore/pybind11/ops/masked_select.hpp index 2734590e6..949566b6b 100644 --- 
a/src/infinicore/pybind11/ops/masked_select.hpp +++ b/src/infinicore/pybind11/ops/masked_select.hpp @@ -14,7 +14,6 @@ inline void bind_masked_select(py::module &m) { py::arg("input"), py::arg("mask"), R"doc(opertor: torch.masked_select, out-of-place mode)doc"); - } -} // namespace infinicore::ops \ No newline at end of file +} // namespace infinicore::ops diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.cc b/src/infiniop/ops/inner/cpu/inner_cpu.cc index 5915cb3fc..085e46478 100644 --- a/src/infiniop/ops/inner/cpu/inner_cpu.cc +++ b/src/infiniop/ops/inner/cpu/inner_cpu.cc @@ -2,7 +2,7 @@ #include "../../../devices/cpu/common_cpu.h" namespace op::inner::cpu { - + Descriptor::~Descriptor() {} infiniStatus_t Descriptor::create( @@ -11,7 +11,7 @@ infiniStatus_t Descriptor::create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc) { - + auto result = InnerInfo::create(out_desc, input_desc, other_desc); CHECK_RESULT(result); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); @@ -20,29 +20,29 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *out) { #pragma omp parallel for - for (size_t out_index = 0; out_index < info->total_elements; out_index ++) { - + for (ptrdiff_t out_index = 0; out_index < ptrdiff_t(info->total_elements); out_index++) { + size_t out_dim_pos = info->out_ndim - 1; size_t out_offset = op::common_cpu::indexToOffset(out_index, info->out_ndim, info->out_shape.data(), info->out_strides.data()); size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - - for (int i = (int)info->other_ndim - 2; i >= 0; i --) { + + for (int i = (int)info->other_ndim - 2; i >= 0; i--) { other_offset += (index % info->out_shape[out_dim_pos]) * info->other_strides[i]; - index /= info->out_shape[out_dim_pos --]; + index /= info->out_shape[out_dim_pos--]; 
} - for (int i = (int)info->input_ndim - 2; i >= 0; i --) { + for (int i = (int)info->input_ndim - 2; i >= 0; i--) { input_offset += (index % info->out_shape[out_dim_pos]) * info->input_strides[i]; - index /= info->out_shape[out_dim_pos --]; + index /= info->out_shape[out_dim_pos--]; } float tmp = 0.; - for (size_t i = 0; i < info->oper_len; i ++) { + for (size_t i = 0; i < info->oper_len; i++) { if constexpr (std::is_same::value || std::is_same::value) { tmp += utils::cast(input[input_offset]) * utils::cast(other[other_offset]); } else { @@ -59,14 +59,13 @@ infiniStatus_t inner(const InnerInfo *info, const T *input, const T *other, T *o } } - return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( - void* workspace, size_t workspace_size, + void *workspace, size_t workspace_size, void *out, const void *input, const void *other, @@ -76,7 +75,7 @@ infiniStatus_t Descriptor::calculate( CHECK_STATUS(inner(&_info, (const bf16_t *)input, (const bf16_t *)other, (bf16_t *)out)); } else if (_info.dtype == INFINI_DTYPE_F16) { CHECK_STATUS(inner(&_info, (const fp16_t *)input, (const fp16_t *)other, (fp16_t *)out)); - } else if (_info.dtype == INFINI_DTYPE_F32){ + } else if (_info.dtype == INFINI_DTYPE_F32) { CHECK_STATUS(inner(&_info, (const float *)input, (const float *)other, (float *)out)); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; @@ -85,4 +84,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::cpu diff --git a/src/infiniop/ops/inner/cpu/inner_cpu.h b/src/infiniop/ops/inner/cpu/inner_cpu.h index 5d4774dfa..18b2e7b32 100644 --- a/src/infiniop/ops/inner/cpu/inner_cpu.h +++ b/src/infiniop/ops/inner/cpu/inner_cpu.h @@ -4,4 +4,4 @@ DESCRIPTOR(cpu); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/cuda/kernel.cuh b/src/infiniop/ops/inner/cuda/kernel.cuh index 1616a4281..928161ba8 100644 --- 
a/src/infiniop/ops/inner/cuda/kernel.cuh +++ b/src/infiniop/ops/inner/cuda/kernel.cuh @@ -1,35 +1,37 @@ #ifndef __INNER_KERNEL_CUH__ #define __INNER_KERNEL_CUH__ -template +template INFINIOP_CUDA_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if (out_index >= total_elements) { + return; + } size_t out_offset = device::nvidia::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } T tmp = 0; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { tmp += input[input_offset] * other[other_offset]; input_offset += input_strides[input_ndim - 1]; other_offset += other_strides[other_ndim - 1]; @@ -38,4 +40,4 @@ INFINIOP_CUDA_KERNEL innerKernel( out[out_offset] = tmp; } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/info.h b/src/infiniop/ops/inner/info.h index 2484277eb..3262f9cf9 100644 --- a/src/infiniop/ops/inner/info.h +++ b/src/infiniop/ops/inner/info.h @@ -12,7 +12,7 @@ class InnerInfo { public: infiniDtype_t dtype; - + size_t input_ndim; 
size_t other_ndim; size_t out_ndim; @@ -32,33 +32,40 @@ class InnerInfo { infiniopTensorDescriptor_t other_desc) { auto dtype = out_desc->dtype(); - if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) + if (dtype != input_desc->dtype() || dtype != other_desc->dtype()) { return INFINI_STATUS_BAD_TENSOR_DTYPE; + } CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32); auto input_ndim = input_desc->ndim(); auto other_ndim = other_desc->ndim(); auto out_ndim = out_desc->ndim(); - if (out_ndim + 2 != input_ndim + other_ndim) + if (out_ndim + 2 != input_ndim + other_ndim) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } auto input_shape = input_desc->shape(); auto other_shape = other_desc->shape(); auto out_shape = out_desc->shape(); if (input_ndim && other_ndim) { - if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) + if (input_shape[input_ndim - 1] != other_shape[other_ndim - 1]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } } size_t out_dim_pos = 0; if (input_ndim) { - for (size_t i = 0; i < input_ndim - 1; i ++) - if (input_shape[i] != out_shape[out_dim_pos ++]) + for (size_t i = 0; i < input_ndim - 1; i++) { + if (input_shape[i] != out_shape[out_dim_pos++]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } } if (other_ndim) { - for (size_t i = 0; i < other_ndim - 1; i ++) - if (other_shape[i] != out_shape[out_dim_pos ++]) + for (size_t i = 0; i < other_ndim - 1; i++) { + if (other_shape[i] != out_shape[out_dim_pos++]) { return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } } size_t oper_len = 1; @@ -67,9 +74,10 @@ class InnerInfo { } size_t total_elements = 1; - for (size_t i = 0; i < out_ndim; i ++) + for (size_t i = 0; i < out_ndim; i++) { total_elements *= out_shape[i]; - + } + auto input_strides = input_desc->strides(); auto other_strides = other_desc->strides(); auto out_strides = out_desc->strides(); @@ -78,11 +86,10 @@ class InnerInfo { dtype, input_ndim, other_ndim, out_ndim, oper_len, total_elements, out_shape, - 
input_strides, other_strides, out_strides - }); + input_strides, other_strides, out_strides}); } }; } // namespace op::inner -#endif // __INNER_INFO_H__ \ No newline at end of file +#endif // __INNER_INFO_H__ diff --git a/src/infiniop/ops/inner/inner.h b/src/infiniop/ops/inner/inner.h index 5c44262a7..c57e82708 100644 --- a/src/infiniop/ops/inner/inner.h +++ b/src/infiniop/ops/inner/inner.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::inner::NAMESPACE { \ - class Descriptor final: public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ InnerInfo _info; \ @@ -24,7 +24,6 @@ _info(info), \ _workspace_size(workspace_size) {} \ \ - \ public: \ ~Descriptor(); \ size_t workspaceSize() const { return _workspace_size; } \ @@ -45,5 +44,4 @@ }; \ } - -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/metax/inner_metax.h b/src/infiniop/ops/inner/metax/inner_metax.h index 04a47d445..72d989a3d 100644 --- a/src/infiniop/ops/inner/metax/inner_metax.h +++ b/src/infiniop/ops/inner/metax/inner_metax.h @@ -4,4 +4,4 @@ DESCRIPTOR(metax); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/metax/inner_metax.maca b/src/infiniop/ops/inner/metax/inner_metax.maca index 683f4a1e1..95c8dcad7 100644 --- a/src/infiniop/ops/inner/metax/inner_metax.maca +++ b/src/infiniop/ops/inner/metax/inner_metax.maca @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T *other, T *out, @@ -82,7 +82,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -92,22 +92,21 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { hcStream_t stream = (hcStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T 
*)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16);\ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __hpcc_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { @@ -120,4 +119,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::metax diff --git a/src/infiniop/ops/inner/metax/inner_metax_kernel.h b/src/infiniop/ops/inner/metax/inner_metax_kernel.h index 52f37e1ef..3e66a4590 100644 --- a/src/infiniop/ops/inner/metax/inner_metax_kernel.h +++ b/src/infiniop/ops/inner/metax/inner_metax_kernel.h @@ -1,38 +1,40 @@ #ifndef __INNER_METAX_KERNEL_H__ #define __INNER_METAX_KERNEL_H__ -template +template INFINIOP_METAX_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if 
(out_index >= total_elements) { + return; + } size_t out_offset = device::metax::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } // T tmp = input[input_offset] * other[other_offset]; // input_offset += input_strides[input_ndim - 1]; // other_offset += other_strides[other_ndim - 1]; T tmp = 0.; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { if constexpr (std::is_same_v || std::is_same_v) { tmp += input[input_offset] * other[other_offset]; } else { @@ -56,4 +58,4 @@ INFINIOP_METAX_KERNEL innerKernel( out[out_offset] = tmp; } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/moore/inner_moore.h b/src/infiniop/ops/inner/moore/inner_moore.h index a508a6718..e722f2adf 100644 --- a/src/infiniop/ops/inner/moore/inner_moore.h +++ b/src/infiniop/ops/inner/moore/inner_moore.h @@ -4,4 +4,4 @@ DESCRIPTOR(moore); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/moore/inner_moore.mu b/src/infiniop/ops/inner/moore/inner_moore.mu index 61a430d2c..82abda6b4 100644 --- a/src/infiniop/ops/inner/moore/inner_moore.mu +++ b/src/infiniop/ops/inner/moore/inner_moore.mu @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T 
*other, T *out, @@ -80,7 +80,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -90,22 +90,21 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { musaStream_t stream = (musaStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __mt_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { @@ -120,4 +119,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::moore diff --git a/src/infiniop/ops/inner/moore/inner_moore_kernel.h b/src/infiniop/ops/inner/moore/inner_moore_kernel.h index de82dc8a9..fcb3f948f 100644 --- a/src/infiniop/ops/inner/moore/inner_moore_kernel.h +++ b/src/infiniop/ops/inner/moore/inner_moore_kernel.h @@ -1,35 +1,37 @@ #ifndef __INNER_MOORE_KERNEL_H__ #define __INNER_MOORE_KERNEL_H__ 
-template +template INFINIOP_MOORE_KERNEL innerKernel( const T *input, const T *other, T *out, size_t total_elements, size_t oper_len, size_t *out_shape, ptrdiff_t *input_strides, ptrdiff_t *other_strides, ptrdiff_t *out_strides, size_t input_ndim, size_t other_ndim, size_t out_ndim) { - + size_t out_index = blockDim.x * blockIdx.x + threadIdx.x; - if (out_index >= total_elements) return; + if (out_index >= total_elements) { + return; + } size_t out_offset = device::moore::indexToOffset(out_index, out_ndim, out_shape, out_strides); - + size_t out_dim_pos = out_ndim - 1; size_t index = out_index; ptrdiff_t input_offset = 0; ptrdiff_t other_offset = 0; - for (int i = (int)other_ndim - 2; i >= 0; i --) { + for (int i = (int)other_ndim - 2; i >= 0; i--) { other_offset += (index % out_shape[out_dim_pos]) * other_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } - for (int i = (int)input_ndim - 2; i >= 0; i --) { + for (int i = (int)input_ndim - 2; i >= 0; i--) { input_offset += (index % out_shape[out_dim_pos]) * input_strides[i]; - index /= out_shape[out_dim_pos --]; + index /= out_shape[out_dim_pos--]; } float tmp = 0.; - for (size_t i = 0; i < oper_len; i ++) { + for (size_t i = 0; i < oper_len; i++) { // if constexpr (std::is_same_v || std::is_same_v) { tmp += static_cast(input[input_offset]) * static_cast(other[other_offset]); // } else { @@ -42,4 +44,4 @@ INFINIOP_MOORE_KERNEL innerKernel( out[out_offset] = static_cast(tmp); } -#endif // __INNER_KERNEL_CUH__ \ No newline at end of file +#endif // __INNER_KERNEL_CUH__ diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu index e2b3ed0f9..a06e8955c 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cu +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cu @@ -38,7 +38,7 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( const InnerInfo &info, const T *input, const T *other, T 
*out, @@ -80,7 +80,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -90,28 +90,29 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; -#define CALCULATE_INNER(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, (const T *)other, (T *)out, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_BF16) \ - return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ - else if(_info.dtype == INFINI_DTYPE_F16) \ - return CALCULATE_INNER(BLOCK_SIZE, half); \ - else if(_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_INNER(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_INNER(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, (const T *)other, (T *)out, \ + stream, workspace, workspace_size) +#define CALCULATE_INNER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_INNER(BLOCK_SIZE, __nv_bfloat16); \ + else if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_INNER(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_INNER(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_INNER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) } else { @@ -120,4 +121,4 @@ infiniStatus_t 
Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::inner::nvidia diff --git a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh index 11033c2c0..4a3b8cb95 100644 --- a/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh +++ b/src/infiniop/ops/inner/nvidia/inner_nvidia.cuh @@ -4,4 +4,4 @@ DESCRIPTOR(nvidia); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/inner/operator.cc b/src/infiniop/ops/inner/operator.cc index c68762d0e..89c6774d1 100644 --- a/src/infiniop/ops/inner/operator.cc +++ b/src/infiniop/ops/inner/operator.cc @@ -15,20 +15,20 @@ #include "moore/inner_moore.h" #endif -__C infiniStatus_t infiniopCreateInnerDescriptor( +__INFINI_C infiniStatus_t infiniopCreateInnerDescriptor( infiniopHandle_t handle, infiniopInnerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t other_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::inner::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - out_desc, \ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::inner::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + input_desc, \ other_desc); switch (handle->device) { @@ -51,11 +51,11 @@ __C infiniStatus_t infiniopCreateInnerDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { @@ -78,7 +78,7 @@ __C 
infiniStatus_t infiniopGetInnerWorkspaceSize(infiniopInnerDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopInner( +__INFINI_C infiniStatus_t infiniopInner( infiniopInnerDescriptor_t desc, void *workspace, size_t workspace_size, void *out, @@ -86,8 +86,8 @@ __C infiniStatus_t infiniopInner( const void *other, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, out, input, other, stream); @@ -111,10 +111,10 @@ __C infiniStatus_t infiniopInner( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { +__INFINI_C infiniStatus_t infiniopDestroyInnerDescriptor(infiniopInnerDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc index ae74587d8..1157340c0 100644 --- a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.cc @@ -10,7 +10,7 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { - + auto result = MaskedSelectInfo::create(input_desc, mask_desc); CHECK_RESULT(result); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); @@ -19,10 +19,10 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr) { std::vector res; - for (size_t index = 0; index < info->total_elements; index ++) { + for (size_t index = 0; index < 
info->total_elements; index++) { size_t input_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->input_strides.data()); size_t mask_offset = op::common_cpu::indexToOffset(index, info->ndim, info->shape.data(), info->mask_strides.data()); if (mask[mask_offset]) { @@ -36,7 +36,7 @@ infiniStatus_t maskedSelect(const MaskedSelectInfo *info, const T *input, const return INFINI_STATUS_SUCCESS; } -} +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -55,4 +55,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} \ No newline at end of file +} // namespace op::masked_select::cpu diff --git a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h index 9fed9e2e6..3fc0c1dbf 100644 --- a/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h +++ b/src/infiniop/ops/masked_select/cpu/masked_select_cpu.h @@ -4,4 +4,4 @@ DESCRIPTOR(cpu) -#endif // __MASKED_SELECT_CPU_H__ \ No newline at end of file +#endif // __MASKED_SELECT_CPU_H__ diff --git a/src/infiniop/ops/masked_select/cuda/kernel.cuh b/src/infiniop/ops/masked_select/cuda/kernel.cuh index fc7485f31..5b66b244a 100644 --- a/src/infiniop/ops/masked_select/cuda/kernel.cuh +++ b/src/infiniop/ops/masked_select/cuda/kernel.cuh @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_KERNEL_CUH__ #define __MASKED_SELECT_KERNEL_CUH__ -template +template INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( const bool *mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_CUDA_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] 
= smem[tid]; + } } -template +template INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_CUDA_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_CUDA_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::nvidia::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::nvidia::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_CUDA_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git 
a/src/infiniop/ops/masked_select/info.h b/src/infiniop/ops/masked_select/info.h index 889b32821..cd20f1d33 100644 --- a/src/infiniop/ops/masked_select/info.h +++ b/src/infiniop/ops/masked_select/info.h @@ -20,7 +20,7 @@ class MaskedSelectInfo { std::vector input_strides; std::vector mask_strides; - static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { + static utils::Result create(infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { auto dtype = input_desc->dtype(); CHECK_DTYPE(dtype, INFINI_DTYPE_F32); CHECK_DTYPE(mask_desc->dtype(), INFINI_DTYPE_BOOL); @@ -31,7 +31,7 @@ class MaskedSelectInfo { auto ndim = input_desc->ndim(); size_t total_elements = 1; - for (auto& dim : shape) { + for (auto &dim : shape) { total_elements *= dim; } diff --git a/src/infiniop/ops/masked_select/masked_select.h b/src/infiniop/ops/masked_select/masked_select.h index 98148ebaf..ae44a8738 100644 --- a/src/infiniop/ops/masked_select/masked_select.h +++ b/src/infiniop/ops/masked_select/masked_select.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::masked_select::NAMESPACE { \ - class Descriptor final: public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ MaskedSelectInfo _info; \ @@ -24,7 +24,6 @@ _info(info), \ _workspace_size(workspace_size) {} \ \ - \ public: \ ~Descriptor(); \ size_t workspaceSize() const { return _workspace_size; } \ @@ -45,4 +44,4 @@ }; \ } -#endif // MASKED_SELECT_H \ No newline at end of file +#endif // MASKED_SELECT_H diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax.h b/src/infiniop/ops/masked_select/metax/masked_select_metax.h index 47460777f..8d438d242 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax.h +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.h @@ -4,4 +4,4 @@ DESCRIPTOR(metax); -#endif \ No newline at end of file +#endif diff --git 
a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca index 4a4181153..a024e6de7 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax.maca +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax.maca @@ -42,9 +42,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, hcStream_t stream, void *workspace, size_t workspace_size) { @@ -56,10 +56,10 @@ infiniStatus_t launchKernel( size_t *shape_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_hc = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -77,16 +77,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_hc, mask_strides_hc, ndim); CHECK_METAX(hcDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_METAX(hcDeviceSynchronize()); @@ -98,15 +98,14 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - - + CHECK_METAX(hcMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), hcMemcpyDeviceToHost, stream)); - + CHECK_METAX(hcMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + // context::setDevice(METAX); // 
context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_hc, input_strides_hc, mask_strides_hc, ndim); @@ -115,8 +114,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -127,18 +125,17 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { hcStream_t stream = (hcStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { @@ -151,4 +148,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::metax diff --git a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h index e590de955..9482e4dfe 100644 --- a/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h +++ b/src/infiniop/ops/masked_select/metax/masked_select_metax_kernel.h @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_METAX_KERNEL_CUH__ #define __MASKED_SELECT_METAX_KERNEL_CUH__ -template +template INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( const bool 
*mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_METAX_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_METAX_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_METAX_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t 
*mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::metax::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::metax::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_METAX_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.h b/src/infiniop/ops/masked_select/moore/masked_select_moore.h index 24548d8fc..ca2e7b00d 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore.h +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.h @@ -4,4 +4,4 @@ DESCRIPTOR(moore); -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu index 3c218bc19..e9a195ee8 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore.mu +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore.mu @@ -42,9 +42,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, musaStream_t stream, void *workspace, size_t workspace_size) { @@ -56,10 +56,10 @@ infiniStatus_t launchKernel( size_t *shape_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_musa = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -77,16 +77,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 
1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_musa, mask_strides_musa, ndim); CHECK_MOORE(musaDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_MOORE(musaDeviceSynchronize()); @@ -98,15 +98,14 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - - + CHECK_MOORE(musaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), musaMemcpyDeviceToHost, stream)); - + CHECK_MOORE(musaMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + // context::setDevice(MOORE); // context::allocateMemory(data_ptr, *dlen_ptr * sizeof(T)); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_musa, input_strides_musa, mask_strides_musa, ndim); @@ -115,8 +114,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -127,18 +125,17 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { musaStream_t stream = (musaStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ 
+ return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { @@ -153,4 +150,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::moore diff --git a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h index dcdd87c6d..76cb62b50 100644 --- a/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h +++ b/src/infiniop/ops/masked_select/moore/masked_select_moore_kernel.h @@ -1,7 +1,7 @@ #ifndef __MASKED_SELECT_MOORE_KERNEL_CUH__ #define __MASKED_SELECT_MOORE_KERNEL_CUH__ -template +template INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( const bool *mask, size_t *mark_scan, size_t total_elements, size_t *shape, ptrdiff_t *mask_strides, size_t ndim) { @@ -19,21 +19,24 @@ INFINIOP_MOORE_KERNEL maskedSelectGetMarkScanOnceKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + float temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( size_t *mark_scan, size_t total_elements, size_t stride) { @@ -45,50 +48,56 @@ INFINIOP_MOORE_KERNEL maskedSelectScanWithStrideKernel( for (int s = 1; s < BLOCK_SIZE; s <<= 1) { __syncthreads(); - + size_t temp; - if (tid >= s) + if (tid >= s) { temp = smem[tid] + smem[tid - s]; + } __syncthreads(); - if (tid >= s) + if (tid >= s) { smem[tid] = temp; + } } - if (index < total_elements) + if (index < total_elements) { mark_scan[index] = smem[tid]; + } } -template +template INFINIOP_MOORE_KERNEL maskedSelectCountScanResultKernel( size_t *mark_scan, size_t *scan_result, size_t 
total_elements) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t val = mark_scan[index]; for (size_t s = BLOCK_SIZE; s < total_elements; s *= BLOCK_SIZE) { size_t now_stride_block = (index + 1) / s; size_t pre_stride_block = now_stride_block * s - 1; - if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) + if (now_stride_block == 0 || (index + 1) % s == 0 || (pre_stride_block + 1) % (s * BLOCK_SIZE) == 0) { continue; + } val += mark_scan[pre_stride_block]; } - + scan_result[index] = val; } -template +template INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( const T *input, const bool *mask, size_t *scan_result, T *data, size_t total_elements, size_t *shape, ptrdiff_t *input_strides, ptrdiff_t *mask_strides, size_t ndim) { size_t index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= total_elements) + if (index >= total_elements) { return; + } size_t input_offset = device::moore::indexToOffset(index, ndim, shape, input_strides); size_t mask_offset = device::moore::indexToOffset(index, ndim, shape, mask_strides); @@ -98,4 +107,4 @@ INFINIOP_MOORE_KERNEL maskedSelectGetDataKernel( } } -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu index eebba0fd3..17016ebc8 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cu @@ -41,9 +41,9 @@ infiniStatus_t Descriptor::create( namespace { -template +template infiniStatus_t launchKernel( - const MaskedSelectInfo &info, + const MaskedSelectInfo &info, const T *input, const bool *mask, void **data_ptr, size_t *dlen_ptr, cudaStream_t stream, void *workspace, size_t workspace_size) { @@ -55,10 +55,10 @@ infiniStatus_t launchKernel( size_t *shape_cuda = 
reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(size_t); - + ptrdiff_t *input_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); - + ptrdiff_t *mask_strides_cuda = reinterpret_cast(workspace_ptr + workspace_offset); workspace_offset += ndim * sizeof(ptrdiff_t); @@ -74,16 +74,16 @@ infiniStatus_t launchKernel( size_t block_size = BLOCK_SIZE; size_t grid_size = (total_elements + block_size - 1) / block_size; - + maskedSelectGetMarkScanOnceKernel<<>>( mask, mark_scan, total_elements, shape_cuda, mask_strides_cuda, ndim); CHECK_CUDA(cudaDeviceSynchronize()); - + for (size_t stride = BLOCK_SIZE; stride < total_elements; stride *= BLOCK_SIZE) { size_t stride_elements = (total_elements + stride - 1) / stride; size_t stride_grid_size = (stride_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - + maskedSelectScanWithStrideKernel<<>>( mark_scan, total_elements, stride); CHECK_CUDA(cudaDeviceSynchronize()); @@ -95,11 +95,11 @@ infiniStatus_t launchKernel( } else { scan_result = mark_scan; } - + CHECK_CUDA(cudaMemcpyAsync(dlen_ptr, scan_result + total_elements - 1, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(cudaMalloc(data_ptr, *dlen_ptr * sizeof(T))); - + maskedSelectGetDataKernel<<>>( input, mask, scan_result, (T *)*data_ptr, total_elements, shape_cuda, input_strides_cuda, mask_strides_cuda, ndim); @@ -108,8 +108,7 @@ infiniStatus_t launchKernel( return INFINI_STATUS_SUCCESS; } -} - +} // namespace infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, @@ -120,24 +119,25 @@ infiniStatus_t Descriptor::calculate( void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; -#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ - launchKernel( \ - _info, \ - (const T *)input, mask, data_ptr, dlen_ptr, \ - stream, workspace, workspace_size \ - ) -#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ - { \ - if (_info.dtype == 
INFINI_DTYPE_F32) \ - return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ - else \ - return INFINI_STATUS_BAD_TENSOR_DTYPE; \ +#define CALCULATE_MASKED_SELECT(BLOCK_SIZE, T) \ + launchKernel( \ + _info, \ + (const T *)input, mask, data_ptr, dlen_ptr, \ + stream, workspace, workspace_size) +#define CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_MASKED_SELECT(BLOCK_SIZE, float); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ } if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_MASKED_SELECT_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096); } else { @@ -146,4 +146,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} +} // namespace op::masked_select::nvidia diff --git a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh index bc493edbe..c3145025b 100644 --- a/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh +++ b/src/infiniop/ops/masked_select/nvidia/masked_select_nvidia.cuh @@ -5,4 +5,4 @@ DESCRIPTOR(nvidia) -#endif \ No newline at end of file +#endif diff --git a/src/infiniop/ops/masked_select/operator.cc b/src/infiniop/ops/masked_select/operator.cc index 055b67b12..1a581bd6d 100644 --- a/src/infiniop/ops/masked_select/operator.cc +++ b/src/infiniop/ops/masked_select/operator.cc @@ -15,18 +15,18 @@ #include "moore/masked_select_moore.h" #endif -__C infiniStatus_t infiniopCreateMaskedSelectDescriptor( +__INFINI_C infiniStatus_t 
infiniopCreateMaskedSelectDescriptor( infiniopHandle_t handle, infiniopMaskedSelectDescriptor_t *desc_ptr, infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t mask_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::masked_select::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr),\ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::masked_select::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc, \ mask_desc); switch (handle->device) { @@ -49,10 +49,10 @@ __C infiniStatus_t infiniopCreateMaskedSelectDescriptor( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -76,7 +76,7 @@ __C infiniStatus_t infiniopGetMaskedSelectWorkspaceSize(infiniopMaskedSelectDesc return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopMaskedSelect( +__INFINI_C infiniStatus_t infiniopMaskedSelect( infiniopMaskedSelectDescriptor_t desc, void *workspace, size_t workspace_size, const void *input, @@ -85,8 +85,8 @@ __C infiniStatus_t infiniopMaskedSelect( size_t *dlen, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, input, mask, data_size, dlen, stream); @@ -110,10 +110,10 @@ __C infiniStatus_t infiniopMaskedSelect( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { +__INFINI_C infiniStatus_t 
infiniopDestroyMaskedSelectDescriptor(infiniopMaskedSelectDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc index 1f222a8d1..99081d8b9 100644 --- a/src/infiniop/ops/tan/operator.cc +++ b/src/infiniop/ops/tan/operator.cc @@ -15,18 +15,18 @@ #include "moore/tan_moore.h" #endif -__C infiniStatus_t infiniopCreateTanDescriptor( +__INFINI_C infiniStatus_t infiniopCreateTanDescriptor( infiniopHandle_t handle, infiniopTanDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::tan::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - output_desc, \ + output_desc, \ {input_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C infiniStatus_t infiniopCreateTanDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, siz return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopTan( +__INFINI_C infiniStatus_t infiniopTan( infiniopTanDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,8 +93,8 @@ __C infiniStatus_t infiniopTan( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ 
->calculate(workspace, workspace_size, output, {input}, stream) @@ -123,11 +123,11 @@ __C infiniStatus_t infiniopTan( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h index 9c75b916e..f4e97b27b 100644 --- a/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h +++ b/src/infiniop/ops/tanhshrink/moore/tanhshrink_moore_kernel.h @@ -25,4 +25,4 @@ typedef struct TanhshrinkOp { } // namespace op::tanhshrink::moore -#endif // __TANHSHRINK_MOORE_KERNEL_H__ \ No newline at end of file +#endif // __TANHSHRINK_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/tanhshrink/operator.cc b/src/infiniop/ops/tanhshrink/operator.cc index 66e62b850..0cc5d8959 100644 --- a/src/infiniop/ops/tanhshrink/operator.cc +++ b/src/infiniop/ops/tanhshrink/operator.cc @@ -15,18 +15,18 @@ #include "moore/tanhshrink_moore.h" #endif -__C infiniStatus_t infiniopCreateTanhshrinkDescriptor( +__INFINI_C infiniStatus_t infiniopCreateTanhshrinkDescriptor( infiniopHandle_t handle, infiniopTanhshrinkDescriptor_t *desc_ptr, infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::tanhshrink::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - output_desc, \ + output_desc, \ {input_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C infiniStatus_t infiniopCreateTanhshrinkDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { +__INFINI_C infiniStatus_t 
infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -85,7 +85,7 @@ __C infiniStatus_t infiniopGetTanhshrinkWorkspaceSize(infiniopTanhshrinkDescript return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopTanhshrink( +__INFINI_C infiniStatus_t infiniopTanhshrink( infiniopTanhshrinkDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,8 +93,8 @@ __C infiniStatus_t infiniopTanhshrink( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -123,11 +123,11 @@ __C infiniStatus_t infiniopTanhshrink( #undef CALCULATE } -__C infiniStatus_t +__INFINI_C infiniStatus_t infiniopDestroyTanhshrinkDescriptor(infiniopTanhshrinkDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/test/infinicore/ops/inner.py b/test/infinicore/ops/inner.py index 420123eec..8235515d8 100644 --- a/test/infinicore/ops/inner.py +++ b/test/infinicore/ops/inner.py @@ -21,7 +21,7 @@ _TOLERANCE_MAP = { infinicore.float16: {"atol": 1e-3, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-3}, infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, } From d68d0b889c698b0c75020279cfbae7e24edf1de6 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 19 Mar 2026 05:24:16 +0000 Subject: [PATCH 3/3] issue/1031 fix sum iluvatar --- src/infiniop/ops/sum/nvidia/sum_nvidia.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu 
b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu index af052be0a..9f165d271 100644 --- a/src/infiniop/ops/sum/nvidia/sum_nvidia.cu +++ b/src/infiniop/ops/sum/nvidia/sum_nvidia.cu @@ -107,6 +107,8 @@ infiniStatus_t Descriptor::calculate( CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) { + CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CALCULATE_SUM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) } else {