diff --git a/Cargo.lock b/Cargo.lock index aac946db8ee27..204e5c4913bb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6023,6 +6023,7 @@ dependencies = [ "sha2", "socket2 0.5.9", "sqlx", + "strength_reduce", "sysinfo", "tantivy", "temp-env", diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index 5156d3196e221..30f123a259c99 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -73,9 +73,15 @@ //! [sbbf-paper]: https://arxiv.org/pdf/2101.01719 //! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf -use core::simd::Simd; -use core::simd::cmp::SimdPartialEq; +// Use NEON intrinsics on aarch64 for better performance +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; use std::mem::size_of; +// Use portable SIMD on other platforms +#[cfg(not(target_arch = "aarch64"))] +use std::simd::Simd; +#[cfg(not(target_arch = "aarch64"))] +use std::simd::cmp::SimdPartialEq; use std::sync::Arc; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; @@ -83,7 +89,11 @@ use std::sync::atomic::Ordering; use databend_common_base::runtime::Runtime; /// Salt values as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). -const SALT: [u32; 8] = [ +/// 32-byte aligned for optimal SIMD load performance. +#[repr(C, align(32))] +struct AlignedSalt([u32; 8]); + +static SALT: AlignedSalt = AlignedSalt([ 0x47b6137b_u32, 0x44974d91_u32, 0x8824ad5b_u32, @@ -92,7 +102,10 @@ const SALT: [u32; 8] = [ 0x2df1424b_u32, 0x9efc4947_u32, 0x5c6bfb31_u32, -]; +]); + +/// Shift amount for extracting bit index: (hash * salt) >> 27 gives 5 bits (0-31) +const SHIFT_NUM: i32 = 27; /// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. /// Each word is thought of as an array of bits; each bit is either "set" or "not set". 
@@ -100,6 +113,7 @@ const SALT: [u32; 8] = [ #[repr(transparent)] struct Block([u32; 8]); +#[cfg(not(target_arch = "aarch64"))] type U32x8 = Simd; impl Block { @@ -107,6 +121,33 @@ impl Block { /// takes as its argument a single unsigned 32-bit integer and returns a block in which each /// word has exactly one bit set. + #[cfg(target_arch = "aarch64")] + #[inline] + fn mask(x: u32) -> Self { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(x); + let mut result = [0u32; 8]; + vst1q_u32_x2(result.as_mut_ptr(), uint32x4x2_t(mask_lo, mask_hi)); + Self(result) + } + } + + #[cfg(target_arch = "aarch64")] + #[inline(always)] + unsafe fn mask_neon(x: u32) -> (uint32x4_t, uint32x4_t) { + unsafe { + let ones = vdupq_n_u32(1); + let hash_data = vdupq_n_u32(x); + let salt = vld1q_u32_x2(SALT.0.as_ptr()); + let bit_index_lo = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.0, hash_data))); + let bit_index_hi = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.1, hash_data))); + (vshlq_u32(ones, bit_index_lo), vshlq_u32(ones, bit_index_hi)) + } + } + + #[cfg(not(target_arch = "aarch64"))] fn mask(x: u32) -> Self { Self(Self::mask_simd(x).to_array()) } @@ -132,6 +173,18 @@ impl Block { } /// Setting every bit in the block that was also set in the result from mask + #[cfg(target_arch = "aarch64")] + #[inline] + fn insert(&mut self, hash: u32) { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + let result = uint32x4x2_t(vorrq_u32(data.0, mask_lo), vorrq_u32(data.1, mask_hi)); + vst1q_u32_x2(self.0.as_mut_ptr(), result); + } + } + + #[cfg(not(target_arch = "aarch64"))] fn insert(&mut self, hash: u32) { let mask = Self::mask(hash); for i in 0..8 { @@ -140,16 +193,30 @@ impl Block { } /// Returns true when every bit that is set in the result of mask is also set in the block. 
+ #[cfg(target_arch = "aarch64")] + #[inline] + fn check(&self, hash: u32) -> bool { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + // vbicq_u32(a, b) = a & !b: bits set in mask but not in data + let miss = vorrq_u32(vbicq_u32(mask_lo, data.0), vbicq_u32(mask_hi, data.1)); + vmaxvq_u32(miss) == 0 + } + } + + #[cfg(not(target_arch = "aarch64"))] fn check(&self, hash: u32) -> bool { let mask = Self::mask_simd(hash); let block_vec = U32x8::from_array(self.0); (block_vec & mask).simd_ne(U32x8::splat(0)).all() } + #[cfg(not(target_arch = "aarch64"))] #[inline(always)] fn mask_simd(x: u32) -> U32x8 { let hash_vec = U32x8::splat(x); - let salt_vec = U32x8::from_array(SALT); + let salt_vec = U32x8::from_array(SALT.0); let bit_index = (hash_vec * salt_vec) >> U32x8::splat(27); U32x8::splat(1) << bit_index } @@ -199,7 +266,7 @@ pub struct Sbbf(Vec); pub struct SbbfAtomic(Vec); pub(crate) const BITSET_MIN_LENGTH: usize = 32; -pub(crate) const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; +pub(crate) const BITSET_MAX_LENGTH: usize = 64 * 1024 * 1024; #[inline] fn hash_to_block_index_for_blocks(hash: u64, num_blocks: usize) -> usize { @@ -306,6 +373,28 @@ impl Sbbf { pub fn estimated_memory_size(&self) -> usize { self.0.capacity() * std::mem::size_of::() } + + /// Zero-copy serialize to Vec, consuming self. + pub fn into_u32s(self) -> Vec { + let mut blocks = std::mem::ManuallyDrop::new(self.0); + let ptr = blocks.as_mut_ptr() as *mut u32; + let len = blocks.len() * 8; + let cap = blocks.capacity() * 8; + unsafe { Vec::from_raw_parts(ptr, len, cap) } + } + + /// Zero-copy deserialize from Vec. + /// Returns None if length is not a multiple of 8 (one Block = 8 x u32). 
+ pub fn from_u32s(words: Vec) -> Option { + if words.is_empty() || !words.len().is_multiple_of(8) { + return None; + } + let mut words = std::mem::ManuallyDrop::new(words); + let len = words.len() / 8; + let cap = words.capacity() / 8; + let ptr = words.as_mut_ptr() as *mut Block; + Some(Self(unsafe { Vec::from_raw_parts(ptr, len, cap) })) + } } impl SbbfAtomic { @@ -497,7 +586,7 @@ mod tests { (33, 64), (99, 128), (1024, 1024), - (999_000_000, 128 * 1024 * 1024), + (999_000_000, 64 * 1024 * 1024), ] { assert_eq!(*expected, optimal_num_of_bytes(*input)); } @@ -529,4 +618,49 @@ mod tests { assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64); } } + + #[test] + fn test_sbbf_to_bytes_from_bytes_roundtrip() { + let mut filter = Sbbf::new_with_ndv_fpp(1000, 0.01).unwrap(); + let hashes: Vec = (0..500).collect(); + filter.insert_hash_batch(&hashes); + + let words = filter.into_u32s(); + let restored = Sbbf::from_u32s(words).unwrap(); + + for hash in &hashes { + assert!(restored.check_hash(*hash)); + } + } + + #[test] + fn test_sbbf_from_u32s_invalid() { + assert!(Sbbf::from_u32s(vec![]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 7]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 9]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 8]).is_some()); + assert!(Sbbf::from_u32s(vec![0; 16]).is_some()); + } + + #[test] + fn test_sbbf_union_after_serialization() { + let mut f1 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 0..50 { + f1.insert_hash(i); + } + let mut f2 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 50..100 { + f2.insert_hash(i); + } + + let words1 = f1.into_u32s(); + let words2 = f2.into_u32s(); + let mut restored1 = Sbbf::from_u32s(words1).unwrap(); + let restored2 = Sbbf::from_u32s(words2).unwrap(); + restored1.union(&restored2); + + for i in 0..100 { + assert!(restored1.check_hash(i)); + } + } } diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 
cfb29554f3620..9b68cf7635b0d 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -60,7 +60,7 @@ pub fn group_hash_entries(entries: ProjectedBlock, values: &mut [u64]) { } } -fn combine_group_hash_column(c: &Column, values: &mut [u64]) { +pub fn combine_group_hash_column(c: &Column, values: &mut [u64]) { HashVisitor:: { values } .visit_column(c.clone()) .unwrap() diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 547d06c4f83af..c1b10b4f8a1af 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -164,6 +164,7 @@ serde_urlencoded = { workspace = true } sha2 = { workspace = true } socket2 = { workspace = true } sqlx = { workspace = true } +strength_reduce = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } diff --git a/src/query/service/src/physical_plans/physical_add_stream_column.rs b/src/query/service/src/physical_plans/physical_add_stream_column.rs index cf11d49dad7b9..50ec67fa873f0 100644 --- a/src/query/service/src/physical_plans/physical_add_stream_column.rs +++ b/src/query/service/src/physical_plans/physical_add_stream_column.rs @@ -36,6 +36,7 @@ use databend_common_sql::StreamContext; use databend_common_sql::Symbol; use databend_common_sql::Visibility; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::BoundColumnRef; use databend_common_sql::plans::ConstantExpr; use databend_common_sql::plans::FunctionCall; @@ -69,6 +70,10 @@ impl IPhysicalPlan for AddStreamColumn { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_aggregate_expand.rs 
b/src/query/service/src/physical_plans/physical_aggregate_expand.rs index ea747aace91eb..0585b1ea85503 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_expand.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_expand.rs @@ -22,6 +22,7 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::GroupingSets; use crate::physical_plans::explain::PlanStatsInfo; @@ -58,6 +59,10 @@ impl IPhysicalPlan for AggregateExpand { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_aggregate_final.rs b/src/query/service/src/physical_plans/physical_aggregate_final.rs index 67008d6f96690..f37b41671fd70 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_final.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_final.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Aggregate; @@ -111,6 +112,13 @@ impl IPhysicalPlan for AggregateFinal { Ok(AggregateFinalFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.group_by.is_empty() { + true => DataDistribution::Serial, + false => DataDistribution::Random, + } 
+ } + fn get_desc(&self) -> Result { Ok(self.agg_funcs.iter().map(|x| x.display.clone()).join(", ")) } diff --git a/src/query/service/src/physical_plans/physical_aggregate_partial.rs b/src/query/service/src/physical_plans/physical_aggregate_partial.rs index c2d39d7430af3..247099bf3405d 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_partial.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_partial.rs @@ -32,6 +32,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::sorts::TransformRankLimitSort; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_storage::DataOperator; use itertools::Itertools; @@ -81,6 +82,10 @@ impl IPhysicalPlan for AggregatePartial { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_async_func.rs b/src/query/service/src/physical_plans/physical_async_func.rs index b32fd60af7ddd..08ec88a4edc95 100644 --- a/src/query/service/src/physical_plans/physical_async_func.rs +++ b/src/query/service/src/physical_plans/physical_async_func.rs @@ -24,6 +24,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::ColumnSet; use databend_common_sql::ScalarExpr; use databend_common_sql::binder::AsyncFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -59,6 +60,10 @@ impl IPhysicalPlan for AsyncFunction { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + 
self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_broadcast.rs b/src/query/service/src/physical_plans/physical_broadcast.rs index 88f961c5103ed..37a59c46261e7 100644 --- a/src/query/service/src/physical_plans/physical_broadcast.rs +++ b/src/query/service/src/physical_plans/physical_broadcast.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use super::Exchange; @@ -48,6 +49,10 @@ impl IPhysicalPlan for BroadcastSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(BroadcastSource { @@ -88,6 +93,10 @@ impl IPhysicalPlan for BroadcastSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_cache_scan.rs b/src/query/service/src/physical_plans/physical_cache_scan.rs index 0a01747df77e3..5110acb590559 100644 --- a/src/query/service/src/physical_plans/physical_cache_scan.rs +++ b/src/query/service/src/physical_plans/physical_cache_scan.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CacheSource; use 
crate::physical_plans::format::CacheScanFormatter; @@ -59,6 +60,10 @@ impl IPhysicalPlan for CacheScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(CacheScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_column_mutation.rs b/src/query/service/src/physical_plans/physical_column_mutation.rs index c1bb9b709ec8b..8fa9af0fd175e 100644 --- a/src/query/service/src/physical_plans/physical_column_mutation.rs +++ b/src/query/service/src/physical_plans/physical_column_mutation.rs @@ -25,6 +25,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::TransformSerializeBlock; @@ -65,6 +66,10 @@ impl IPhysicalPlan for ColumnMutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_commit_sink.rs b/src/query/service/src/physical_plans/physical_commit_sink.rs index c8f584484b0f0..59303d641a1e7 100644 --- a/src/query/service/src/physical_plans/physical_commit_sink.rs +++ b/src/query/service/src/physical_plans/physical_commit_sink.rs @@ -23,6 +23,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::UpdateStreamMetaReq; use databend_common_pipeline::core::ExecutionInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::plans::TruncateMode; use databend_common_storages_fuse::FuseTable; @@ -71,6 +72,10 @@ impl IPhysicalPlan for CommitSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_compact_source.rs b/src/query/service/src/physical_plans/physical_compact_source.rs index 52fa41a4b8533..94593db34eb87 100644 --- a/src/query/service/src/physical_plans/physical_compact_source.rs +++ b/src/query/service/src/physical_plans/physical_compact_source.rs @@ -31,6 +31,7 @@ use databend_common_pipeline::sources::EmptySource; use databend_common_pipeline::sources::PrefetchAsyncSourcer; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; @@ -72,6 +73,10 @@ impl IPhysicalPlan for CompactSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(CompactSource { diff --git a/src/query/service/src/physical_plans/physical_constant_table_scan.rs b/src/query/service/src/physical_plans/physical_constant_table_scan.rs index 2024c4c5fe1d1..c568618b1ca67 100644 --- a/src/query/service/src/physical_plans/physical_constant_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_constant_table_scan.rs @@ -21,6 +21,7 @@ use databend_common_expression::DataSchemaRef; use 
databend_common_pipeline::sources::OneBlockSource; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::format::ConstantTableScanFormatter; @@ -56,6 +57,10 @@ impl IPhysicalPlan for ConstantTableScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ConstantTableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_copy_into_location.rs b/src/query/service/src/physical_plans/physical_copy_into_location.rs index 873dcdd3e4f16..678b8d81d3263 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_location.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_location.rs @@ -27,6 +27,7 @@ use databend_common_expression::TableSchemaRef; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_stage::StageSinkTable; use databend_storages_common_stage::CopyIntoLocationInfo; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -60,6 +61,10 @@ impl IPhysicalPlan for CopyIntoLocation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![ diff --git a/src/query/service/src/physical_plans/physical_copy_into_table.rs b/src/query/service/src/physical_plans/physical_copy_into_table.rs index a5b3bdaa34e3a..2c34ec02500e8 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_table.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_table.rs @@ -23,6 +23,7 @@ use 
databend_common_expression::DataSchemaRefExt; use databend_common_expression::Scalar; use databend_common_meta_app::schema::TableInfo; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CopyIntoTableMode; use databend_common_sql::plans::ValidationMode; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -64,6 +65,10 @@ impl IPhysicalPlan for CopyIntoTable { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![])) diff --git a/src/query/service/src/physical_plans/physical_cte_consumer.rs b/src/query/service/src/physical_plans/physical_cte_consumer.rs index 19f399c6b4fbb..252c52fddb057 100644 --- a/src/query/service/src/physical_plans/physical_cte_consumer.rs +++ b/src/query/service/src/physical_plans/physical_cte_consumer.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataField; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::IPhysicalPlan; use crate::physical_plans::PhysicalPlan; @@ -62,6 +63,10 @@ impl IPhysicalPlan for MaterializeCTERef { Ok(self.cte_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(MaterializeCTERef { diff --git a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs index 3c7280778f4a6..795744908abfb 100644 --- a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs +++ 
b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs @@ -21,6 +21,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for DistributedInsertSelect { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_eval_scalar.rs b/src/query/service/src/physical_plans/physical_eval_scalar.rs index 92f411baf6d2d..0b2ca0667c161 100644 --- a/src/query/service/src/physical_plans/physical_eval_scalar.rs +++ b/src/query/service/src/physical_plans/physical_eval_scalar.rs @@ -33,6 +33,7 @@ use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::Matcher; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Filter; @@ -78,6 +79,10 @@ impl IPhysicalPlan for EvalScalar { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { if self.exprs.is_empty() { diff --git a/src/query/service/src/physical_plans/physical_exchange.rs b/src/query/service/src/physical_plans/physical_exchange.rs index d2c691dde0870..0fb0f69014d7d 100644 --- a/src/query/service/src/physical_plans/physical_exchange.rs +++ 
b/src/query/service/src/physical_plans/physical_exchange.rs @@ -21,6 +21,7 @@ use databend_common_expression::RemoteExpr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::optimizer::ir::SExpr; @@ -75,6 +76,16 @@ impl IPhysicalPlan for Exchange { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_sink.rs b/src/query/service/src/physical_plans/physical_exchange_sink.rs index 17f2e3b51d9e5..9c110e77b5682 100644 --- a/src/query/service/src/physical_plans/physical_exchange_sink.rs +++ b/src/query/service/src/physical_plans/physical_exchange_sink.rs @@ -18,6 +18,7 @@ use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::RemoteExpr; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use crate::physical_plans::format::ExchangeSinkFormatter; @@ -84,6 +85,16 @@ impl IPhysicalPlan for ExchangeSink { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => 
DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_source.rs b/src/query/service/src/physical_plans/physical_exchange_source.rs index 79076f92fb2b4..772e32978026a 100644 --- a/src/query/service/src/physical_plans/physical_exchange_source.rs +++ b/src/query/service/src/physical_plans/physical_exchange_source.rs @@ -16,7 +16,10 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_expression::RemoteExpr; use databend_common_pipeline::core::PlanScope; +use databend_common_sql::executor::physical_plans::DataDistribution; +use databend_common_sql::executor::physical_plans::FragmentKind; use crate::physical_plans::format::ExchangeSourceFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -35,6 +38,8 @@ pub struct ExchangeSource { // Fragment ID of source fragment pub source_fragment_id: usize, pub query_id: String, + pub kind: FragmentKind, + pub keys: Vec, } #[typetag::serde] @@ -63,6 +68,16 @@ impl IPhysicalPlan for ExchangeSource { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } @@ -74,6 +89,8 @@ impl IPhysicalPlan for ExchangeSource { schema: self.schema.clone(), source_fragment_id: self.source_fragment_id, query_id: self.query_id.clone(), + kind: self.kind.clone(), + keys: self.keys.clone(), }) } diff --git a/src/query/service/src/physical_plans/physical_expression_scan.rs 
b/src/query/service/src/physical_plans/physical_expression_scan.rs index 8d13ffbe9ed7f..955a5885d2f31 100644 --- a/src/query/service/src/physical_plans/physical_expression_scan.rs +++ b/src/query/service/src/physical_plans/physical_expression_scan.rs @@ -22,6 +22,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -59,6 +60,10 @@ impl IPhysicalPlan for ExpressionScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_filter.rs b/src/query/service/src/physical_plans/physical_filter.rs index 8002add0b72a2..f4eb46c95cc55 100644 --- a/src/query/service/src/physical_plans/physical_filter.rs +++ b/src/query/service/src/physical_plans/physical_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -62,6 +63,10 @@ impl IPhysicalPlan for Filter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 0a86fe90db3c7..058a0bf8b5756 
100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -43,6 +43,7 @@ use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Join; @@ -71,9 +72,11 @@ use crate::pipelines::processors::HashJoinState; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HashJoinProbeState; use crate::pipelines::processors::transforms::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::SharedRuntimeFilterPackets; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; use crate::pipelines::processors::transforms::TransformHashJoinProbe; +use crate::pipelines::processors::transforms::TransformPartitionedHashJoin; use crate::sessions::QueryContext; // Type aliases to simplify complex return types @@ -195,6 +198,24 @@ impl IPhysicalPlan for HashJoin { Ok(HashJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + let build_dist = self.build.output_data_distribution(); + let probe_dist = self.probe.output_data_distribution(); + + let can_preserve_global_hash = matches!( + &build_dist, + DataDistribution::GlobalHash(keys) if keys == &self.build_keys + ) && matches!( + &probe_dist, + DataDistribution::GlobalHash(keys) if keys == &self.probe_keys + ); + + match can_preserve_global_hash { + true => probe_dist, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { let mut conditions = self .build_keys @@ -312,7 +333,7 @@ impl IPhysicalPlan for HashJoin { && !enable_optimization && !self.need_hold_hash_table { - return 
self.build_new_join_pipeline(builder, desc); + return self.build_join(builder, desc); } // Create the join state with optimization flags @@ -436,18 +457,105 @@ impl HashJoin { Ok(()) } - fn build_new_join_pipeline( - &self, - builder: &mut PipelineBuilder, - desc: Arc, - ) -> Result<()> { - let factory = self.join_factory(builder, desc)?; + fn build_join(&self, pb: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let build_distribution = self.build.output_data_distribution(); + let global_hash_build = matches!(build_distribution, DataDistribution::GlobalHash(_)); + + let probe_distribution = self.probe.output_data_distribution(); + let global_hash_probe = matches!(probe_distribution, DataDistribution::GlobalHash(_)); - // We must build the runtime filter before constructing the child nodes, - // as we will inject some runtime filter information into the context for the child nodes to use. + let enable_partitioned = pb.settings.get_enable_partitioned_hash_join()?; + match global_hash_build + && global_hash_probe + && self.build_side_cache_info.is_none() + && enable_partitioned + { + true => self.shuffle_join(pb, desc), + false => self.broadcast_join(pb, desc), + } + } + + fn shuffle_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; + + let hash_key_types = self + .build_keys + .iter() + .zip(&desc.is_null_equal) + .map(|(expr, is_null_equal)| { + let expr = expr.as_expr(&BUILTIN_FUNCTIONS); + if *is_null_equal { + expr.data_type().clone() + } else { + expr.data_type().remove_nullable() + } + }) + .collect::>(); + let hash_method = DataBlock::choose_hash_method_with_types(&hash_key_types)?; + let max_block_size = builder.settings.get_max_block_size()? 
as usize; + + let mut sub_query_ctx = QueryContext::create_from(&builder.ctx); + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + self.build.build_pipeline(builder)?; + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + let build_sinks = builder.main_pipeline.take_sinks(); + + self.probe.build_pipeline(builder)?; + let probe_sinks = builder.main_pipeline.take_sinks(); + + assert_eq!(build_sinks.len(), probe_sinks.len()); + let output_len = build_sinks.len(); + + let barrier = databend_common_base::base::Barrier::new(output_len); + let stage_sync_barrier = Arc::new(barrier); + let shared_rf_packets = SharedRuntimeFilterPackets::create(); + let mut join_sinks = Vec::with_capacity(output_len * 2); + let mut join_pipe_items = Vec::with_capacity(output_len); + for (build_sink, probe_sink) in build_sinks.into_iter().zip(probe_sinks.into_iter()) { + join_sinks.push(build_sink); + join_sinks.push(probe_sink); + + let build_input = InputPort::create(); + let probe_input = InputPort::create(); + let joined_output = OutputPort::create(); + + let join = TransformPartitionedHashJoin::create_join( + self.join_type, + hash_method.clone(), + desc.clone(), + builder.func_ctx.clone(), + max_block_size, + ); + + let hash_join = TransformPartitionedHashJoin::create( + build_input.clone(), + probe_input.clone(), + joined_output.clone(), + join, + stage_sync_barrier.clone(), + self.projections.clone(), + rf_desc.clone(), + shared_rf_packets.clone(), + )?; + + join_pipe_items.push(PipeItem::create( + hash_join, + vec![build_input, probe_input], + vec![joined_output], + )) + } + + builder.main_pipeline.extend_sinks(join_sinks); + let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); + builder.main_pipeline.add_pipe(join_pipe); + + Ok(()) + } + + fn broadcast_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let factory = self.join_factory(builder, desc)?; let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; - // After common 
subexpression elimination is completed, we can delete this type of code. { let state = factory.create_basic_state(0)?; @@ -467,7 +575,6 @@ impl HashJoin { self.probe.build_pipeline(builder)?; - // Aligning hash join build and probe parallelism let output_len = std::cmp::max(build_sinks.len(), builder.main_pipeline.output_len()); builder.main_pipeline.resize(output_len, false)?; @@ -514,11 +621,8 @@ impl HashJoin { let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); builder.main_pipeline.add_pipe(join_pipe); - // In the case of spilling, we need to share state among multiple threads - // Quickly fetch all data from this round to quickly start the next round - builder - .main_pipeline - .resize(builder.main_pipeline.output_len(), true) + let item_size = builder.main_pipeline.output_len(); + builder.main_pipeline.resize(item_size, true) } fn join_factory( @@ -1409,7 +1513,7 @@ impl PhysicalPlanBuilder { } for scalar in &join.non_equi_conditions { - predicates.push(resolve_scalar(scalar, &merged).map_err(|err|{ + predicates.push(resolve_scalar(scalar, &merged).map_err(|err| { err.add_message(format!( "Failed build nested loop filter schema: {merged:#?} non_equi_conditions: {:#?}", join.non_equi_conditions diff --git a/src/query/service/src/physical_plans/physical_limit.rs b/src/query/service/src/physical_plans/physical_limit.rs index 90e741c5772e1..bd5d93f466d3f 100644 --- a/src/query/service/src/physical_plans/physical_limit.rs +++ b/src/query/service/src/physical_plans/physical_limit.rs @@ -26,6 +26,7 @@ use databend_common_sql::ColumnEntry; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -75,6 +76,10 @@ impl IPhysicalPlan for Limit { Ok(LimitFormatter::create(self)) } + fn output_data_distribution(&self) -> 
DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_materialized_cte.rs b/src/query/service/src/physical_plans/physical_materialized_cte.rs index f0d31445eb829..66e9a37b88b44 100644 --- a/src/query/service/src/physical_plans/physical_materialized_cte.rs +++ b/src/query/service/src/physical_plans/physical_materialized_cte.rs @@ -20,6 +20,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::Symbol; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -60,6 +61,10 @@ impl IPhysicalPlan for MaterializedCTE { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.input.output_schema() diff --git a/src/query/service/src/physical_plans/physical_multi_table_insert.rs b/src/query/service/src/physical_plans/physical_multi_table_insert.rs index cefb3817f2e22..12f24daa36a0a 100644 --- a/src/query/service/src/physical_plans/physical_multi_table_insert.rs +++ b/src/query/service/src/physical_plans/physical_multi_table_insert.rs @@ -39,6 +39,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::columns::TransformAddComputedColumns; use databend_common_pipeline_transforms::sorts::TransformSortPartial; use databend_common_sql::DefaultExprBinder; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::CommitMultiTableInsert; use 
databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -86,6 +87,10 @@ impl IPhysicalPlan for Duplicate { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(DuplicateFormatter::create(self)) } @@ -135,6 +140,10 @@ impl IPhysicalPlan for Shuffle { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ShuffleFormatter::create(self)) } @@ -218,6 +227,10 @@ impl IPhysicalPlan for ChunkFilter { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFilterFormatter::create(self)) } @@ -283,6 +296,10 @@ impl IPhysicalPlan for ChunkEvalScalar { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkEvalScalarFormatter::create(self)) } @@ -355,6 +372,10 @@ impl IPhysicalPlan for ChunkCastSchema { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkCastSchemaFormatter::create(self)) } @@ -496,6 +517,10 @@ impl IPhysicalPlan for ChunkFillAndReorder { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFillAndReorderFormatter::create(self)) } @@ -645,6 +670,10 @@ impl IPhysicalPlan for ChunkAppendData { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn 
formatter(&self) -> Result> { Ok(ChunkAppendDataFormatter::create(self)) } @@ -804,6 +833,10 @@ impl IPhysicalPlan for ChunkMerge { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ChunkMergeFormatter::create(self)) } @@ -876,6 +909,10 @@ impl IPhysicalPlan for ChunkCommitInsert { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + fn derive(&self, mut children: Vec) -> PhysicalPlan { assert_eq!(children.len(), 1); let input = children.pop().unwrap(); diff --git a/src/query/service/src/physical_plans/physical_mutation.rs b/src/query/service/src/physical_plans/physical_mutation.rs index 30988393b8119..2025aeff75c55 100644 --- a/src/query/service/src/physical_plans/physical_mutation.rs +++ b/src/query/service/src/physical_plans/physical_mutation.rs @@ -55,6 +55,7 @@ use databend_common_sql::Visibility; use databend_common_sql::binder::MutationStrategy; use databend_common_sql::binder::MutationType; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::optimizer::ir::SExpr; @@ -120,6 +121,10 @@ impl IPhysicalPlan for Mutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs index b5770451d56bf..cdff50ce57401 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs +++ 
b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs @@ -16,6 +16,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::format::MutationOrganizeFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -44,6 +45,10 @@ impl IPhysicalPlan for MutationOrganize { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_into_split.rs b/src/query/service/src/physical_plans/physical_mutation_into_split.rs index e86c70641ab64..cc3e6a5a13ca6 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_split.rs +++ b/src/query/service/src/physical_plans/physical_mutation_into_split.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_pipeline::core::Pipe; use databend_common_sql::IndexType; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::MutationSplitProcessor; use crate::physical_plans::format::MutationSplitFormatter; @@ -46,6 +47,10 @@ impl IPhysicalPlan for MutationSplit { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs index cad8392e9133a..a45d607c91f8a 100644 --- a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs +++ b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs @@ -24,6 +24,7 @@ use databend_common_expression::RemoteExpr; use 
databend_common_meta_app::schema::TableInfo; use databend_common_pipeline::core::Pipe; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MatchExpr; use databend_common_storages_fuse::operations::MatchedSplitProcessor; use databend_common_storages_fuse::operations::MergeIntoNotMatchedProcessor; @@ -67,6 +68,10 @@ impl IPhysicalPlan for MutationManipulate { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_source.rs b/src/query/service/src/physical_plans/physical_mutation_source.rs index 8d2da1f34432e..2be37d0c8e240 100644 --- a/src/query/service/src/physical_plans/physical_mutation_source.rs +++ b/src/query/service/src/physical_plans/physical_mutation_source.rs @@ -43,6 +43,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::StreamContext; use databend_common_sql::binder::MutationType; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseLazyPartInfo; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::SegmentLocation; @@ -96,6 +97,10 @@ impl IPhysicalPlan for MutationSource { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(MutationSourceFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_plan.rs b/src/query/service/src/physical_plans/physical_plan.rs index 27cc04c134d07..2a2e6e4e539c6 100644 --- a/src/query/service/src/physical_plans/physical_plan.rs +++ b/src/query/service/src/physical_plans/physical_plan.rs @@ -30,6 +30,7 @@ use 
databend_common_expression::DataSchemaRef; use databend_common_pipeline::core::PlanProfile; use databend_common_pipeline::core::PlanScope; use databend_common_sql::Metadata; +use databend_common_sql::executor::physical_plans::DataDistribution; use dyn_clone::DynClone; use serde::Deserializer; use serde::Serializer; @@ -168,6 +169,8 @@ pub trait IPhysicalPlan: DynClone + Debug + Send + Sync + 'static { .any(|child| child.is_warehouse_distributed_plan()) } + fn output_data_distribution(&self) -> DataDistribution; + fn display_in_profile(&self) -> bool { true } diff --git a/src/query/service/src/physical_plans/physical_project_set.rs b/src/query/service/src/physical_plans/physical_project_set.rs index 4e66842b41d41..714d77d5c394e 100644 --- a/src/query/service/src/physical_plans/physical_project_set.rs +++ b/src/query/service/src/physical_plans/physical_project_set.rs @@ -27,6 +27,7 @@ use databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -64,6 +65,10 @@ impl IPhysicalPlan for ProjectSet { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_r_cte_scan.rs b/src/query/service/src/physical_plans/physical_r_cte_scan.rs index b0e251232e216..9c8afca3e5fe2 100644 --- a/src/query/service/src/physical_plans/physical_r_cte_scan.rs +++ b/src/query/service/src/physical_plans/physical_r_cte_scan.rs @@ -18,6 +18,7 @@ use std::fmt::Display; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::explain::PlanStatsInfo; @@ -54,6 +55,10 @@ impl IPhysicalPlan for RecursiveCteScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(RecursiveCteScan { diff --git a/src/query/service/src/physical_plans/physical_range_join.rs b/src/query/service/src/physical_plans/physical_range_join.rs index 4ed7b99c7f33f..b5ce47e6f2255 100644 --- a/src/query/service/src/physical_plans/physical_range_join.rs +++ b/src/query/service/src/physical_plans/physical_range_join.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::TypeCheck; use databend_common_sql::binder::JoinPredicate; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::RelExpr; use databend_common_sql::optimizer::ir::RelationalProperty; use databend_common_sql::optimizer::ir::SExpr; @@ -95,6 +96,10 @@ impl IPhysicalPlan for RangeJoin { Ok(RangeJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn get_desc(&self) -> Result { let mut condition = self .conditions diff --git a/src/query/service/src/physical_plans/physical_recluster.rs b/src/query/service/src/physical_plans/physical_recluster.rs index 827ad456244cf..513ee651c2ce6 100644 --- a/src/query/service/src/physical_plans/physical_recluster.rs +++ b/src/query/service/src/physical_plans/physical_recluster.rs @@ -40,6 +40,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::build_ordered_compact_pipeline; use databend_common_pipeline_transforms::columns::TransformAddStreamColumns; use 
databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_common_storages_fuse::FuseTable; @@ -80,6 +81,10 @@ impl IPhysicalPlan for Recluster { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(Recluster { @@ -281,6 +286,10 @@ impl IPhysicalPlan for HilbertPartition { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_async_source.rs b/src/query/service/src/physical_plans/physical_replace_async_source.rs index e57f78f04fb6f..5d1599aeaa74b 100644 --- a/src/query/service/src/physical_plans/physical_replace_async_source.rs +++ b/src/query/service/src/physical_plans/physical_replace_async_source.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_pipeline::sources::AsyncSourcer; use databend_common_sql::NameResolutionContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::InsertValue; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -48,6 +49,10 @@ impl IPhysicalPlan for ReplaceAsyncSourcer { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(ReplaceAsyncSourcer { diff --git a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs 
index 29f1ce6d2ec89..a9e8d3b35e757 100644 --- a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs +++ b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs @@ -31,6 +31,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_pipeline_transforms::build_compact_block_pipeline; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::ReplaceIntoProcessor; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceDeduplicate { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_into.rs b/src/query/service/src/physical_plans/physical_replace_into.rs index 9e502fb467a81..7c916ac4361cc 100644 --- a/src/query/service/src/physical_plans/physical_replace_into.rs +++ b/src/query/service/src/physical_plans/physical_replace_into.rs @@ -26,6 +26,7 @@ use databend_common_pipeline::core::InputPort; use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceInto { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { 
Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_row_fetch.rs b/src/query/service/src/physical_plans/physical_row_fetch.rs index 11fef083c340a..e1e687f55d8c8 100644 --- a/src/query/service/src/physical_plans/physical_row_fetch.rs +++ b/src/query/service/src/physical_plans/physical_row_fetch.rs @@ -25,6 +25,7 @@ use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline::core::PipeItem; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::row_fetch_processor; use itertools::Itertools; @@ -67,6 +68,10 @@ impl IPhysicalPlan for RowFetch { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let mut fields = self.input.output_schema()?.fields().clone(); diff --git a/src/query/service/src/physical_plans/physical_secure_filter.rs b/src/query/service/src/physical_plans/physical_secure_filter.rs index 88e8f871574d6..ef2daeaf0ab22 100644 --- a/src/query/service/src/physical_plans/physical_secure_filter.rs +++ b/src/query/service/src/physical_plans/physical_secure_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use sha2::Digest; use sha2::Sha256; @@ -64,6 +65,10 @@ impl IPhysicalPlan for SecureFilter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git 
a/src/query/service/src/physical_plans/physical_sequence.rs b/src/query/service/src/physical_plans/physical_sequence.rs index 4642019c961f5..d201fa082a415 100644 --- a/src/query/service/src/physical_plans/physical_sequence.rs +++ b/src/query/service/src/physical_plans/physical_sequence.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for Sequence { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.right.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.right.output_schema() diff --git a/src/query/service/src/physical_plans/physical_sort.rs b/src/query/service/src/physical_plans/physical_sort.rs index 5e049c9daf3ce..d808d47d50633 100644 --- a/src/query/service/src/physical_plans/physical_sort.rs +++ b/src/query/service/src/physical_plans/physical_sort.rs @@ -28,6 +28,7 @@ use databend_common_pipeline_transforms::sorts::core::SortKeyDescription; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; @@ -148,6 +149,15 @@ impl IPhysicalPlan for Sort { Ok(SortFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.step { + SortStep::Single | SortStep::Partial | SortStep::Final | SortStep::Shuffled => { + DataDistribution::Serial + } + SortStep::Sample | SortStep::Route => DataDistribution::Random, + } + } + #[recursive::recursive] fn 
try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_table_scan.rs b/src/query/service/src/physical_plans/physical_table_scan.rs index 73a7ad54f5262..d9121abaf971e 100644 --- a/src/query/service/src/physical_plans/physical_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_table_scan.rs @@ -64,6 +64,7 @@ use databend_common_sql::VirtualColumn; use databend_common_sql::binder::INTERNAL_COLUMN_FACTORY; use databend_common_sql::evaluator::BlockOperator; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::table_read_plan::ToReadDataSourcePlan; use databend_common_sql::plans::FunctionCall; use databend_common_storages_fuse::FuseTable; @@ -111,6 +112,10 @@ impl IPhysicalPlan for TableScan { Self::output_fields(self.source.schema(), &self.name_mapping).map(DataSchema::new_ref) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(TableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_udf.rs b/src/query/service/src/physical_plans/physical_udf.rs index 1b62115d6f0c1..d82a3c416eb64 100644 --- a/src/query/service/src/physical_plans/physical_udf.rs +++ b/src/query/service/src/physical_plans/physical_udf.rs @@ -28,6 +28,7 @@ use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::UDFType; use itertools::Itertools; @@ -66,6 +67,10 @@ impl IPhysicalPlan for Udf { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + 
#[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_union_all.rs b/src/query/service/src/physical_plans/physical_union_all.rs index 1965739f66fe9..6c73f614feca5 100644 --- a/src/query/service/src/physical_plans/physical_union_all.rs +++ b/src/query/service/src/physical_plans/physical_union_all.rs @@ -28,8 +28,10 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; +use recursive::recursive; use crate::physical_plans::Exchange; use crate::physical_plans::PhysicalPlanBuilder; @@ -89,6 +91,16 @@ impl IPhysicalPlan for UnionAll { Ok(UnionAllFormatter::create(self)) } + #[recursive] + fn output_data_distribution(&self) -> DataDistribution { + let left_dist = self.left.output_data_distribution(); + let right_dist = self.right.output_data_distribution(); + match left_dist == right_dist { + true => left_dist, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { Ok(self .left_outputs diff --git a/src/query/service/src/physical_plans/physical_window.rs b/src/query/service/src/physical_plans/physical_window.rs index 35a9608e95e19..1267afe96c1de 100644 --- a/src/query/service/src/physical_plans/physical_window.rs +++ b/src/query/service/src/physical_plans/physical_window.rs @@ -37,6 +37,7 @@ use databend_common_sql::TypeCheck; use databend_common_sql::binder::wrap_cast; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; 
use databend_common_sql::plans::WindowFuncFrame; @@ -110,6 +111,14 @@ impl IPhysicalPlan for Window { Ok(WindowFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + if self.partition_by.is_empty() { + DataDistribution::Random + } else { + self.input.output_data_distribution() + } + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_window_partition.rs b/src/query/service/src/physical_plans/physical_window_partition.rs index 734c5299258e7..bc12f224d3de7 100644 --- a/src/query/service/src/physical_plans/physical_window_partition.rs +++ b/src/query/service/src/physical_plans/physical_window_partition.rs @@ -24,6 +24,7 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline::core::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_storages_common_cache::TempDirManager; @@ -77,6 +78,10 @@ impl IPhysicalPlan for WindowPartition { Ok(WindowPartitionFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs index 82dab8970d2b6..2174f39789a0e 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs @@ -21,7 +21,6 @@ use 
databend_common_catalog::runtime_filter_info::RuntimeFilterInfo; use databend_common_catalog::runtime_filter_info::RuntimeFilterSpatial; use databend_common_catalog::runtime_filter_info::RuntimeFilterStats; use databend_common_catalog::sbbf::Sbbf; -use databend_common_catalog::sbbf::SbbfAtomic; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::Column; @@ -55,7 +54,7 @@ pub async fn build_runtime_filter_infos( packet: JoinRuntimeFilterPacket, runtime_filter_descs: HashMap, selectivity_threshold: u64, - max_threads: usize, + _max_threads: usize, ) -> Result> { let total_build_rows = packet.build_rows; let Some(packets) = packet.packets else { @@ -104,7 +103,7 @@ pub async fn build_runtime_filter_infos( }; let bloom = if bloom_enabled { if let Some(ref bloom) = packet.bloom { - Some(build_bloom_filter(bloom.clone(), probe_key, max_threads, desc.id).await?) + Some(build_bloom_filter(bloom.clone(), probe_key)?) } else { None } @@ -278,37 +277,14 @@ fn build_min_max_filter( Ok(min_max_filter) } -async fn build_bloom_filter( - bloom: Vec, +fn build_bloom_filter( + bloom_words: Vec, probe_key: &Expr, - max_threads: usize, - filter_id: usize, ) -> Result { let probe_column = resolve_probe_column_ref(probe_key); let column_name = probe_column.id.to_string(); - let total_items = bloom.len(); - - if total_items < 3_000_000 { - let mut filter = Sbbf::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))?; - filter.insert_hash_batch(&bloom); - return Ok(RuntimeFilterBloom { - column_name, - filter: Arc::new(filter), - }); - } - - let start = std::time::Instant::now(); - let builder = SbbfAtomic::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))? 
- .insert_hash_batch_parallel(bloom, max_threads); - let filter = builder.finish(); - log::info!( - "filter_id: {}, build_time: {:?}", - filter_id, - start.elapsed() - ); - + let filter = Sbbf::from_u32s(bloom_words) + .ok_or_else(|| ErrorCode::Internal("Invalid bloom filter data in runtime filter"))?; Ok(RuntimeFilterBloom { column_name, filter: Arc::new(filter), @@ -331,6 +307,7 @@ fn resolve_probe_column_ref(probe_key: &Expr) -> &ColumnRef { mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::ColumnRef; use databend_common_expression::Constant; @@ -392,7 +369,11 @@ mod tests { min: Scalar::Number(1i32.into()), max: Scalar::Number(10i32.into()), }), - bloom: Some(vec![11, 22]), + bloom: Some({ + let mut f = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + f.insert_hash_batch(&[11, 22]); + f.into_u32s() + }), spatial: None, }); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs index 7b2e724ec63d7..7cd9bd9e81455 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use databend_common_catalog::sbbf::Sbbf; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::Column; @@ -47,8 +49,9 @@ struct SingleFilterBuilder { inlist_builder: Option, inlist_threshold: usize, - bloom_hashes: Option>, + bloom_filter: Option, bloom_threshold: usize, + bloom_disabled: bool, is_spatial: bool, spatial_rects: Vec<(f64, f64, f64, f64)>, @@ -90,12 +93,21 @@ impl SingleFilterBuilder { } else { 0 }, - bloom_hashes: None, + bloom_filter: if desc.enable_bloom_runtime_filter && bloom_threshold > 0 { + let ndv = match desc.build_table_rows { + Some(rows) => rows.min(bloom_threshold as u64), + None => bloom_threshold as u64, + }; + Some(Sbbf::new_with_ndv_fpp(ndv, 0.01).map_err(ErrorCode::Internal)?) + } else { + None + }, bloom_threshold: if desc.enable_bloom_runtime_filter { bloom_threshold } else { 0 }, + bloom_disabled: !desc.enable_bloom_runtime_filter || bloom_threshold == 0, is_spatial: desc.is_spatial, spatial_rects: Vec::new(), spatial_srid: None, @@ -142,22 +154,21 @@ impl SingleFilterBuilder { } fn add_bloom(&mut self, column: &Column, new_total: usize) -> Result<()> { - if new_total > self.bloom_threshold { - self.bloom_hashes = None; + if self.bloom_disabled || new_total > self.bloom_threshold { + self.bloom_filter = None; + self.bloom_disabled = true; return Ok(()); } - let mut hashes = match self.bloom_hashes.take() { - Some(h) => h, - None => Vec::with_capacity(column.len()), - }; - hashes.reserve(column.len()); - let entry = BlockEntry::from(column.clone()); - let hash_method = self - .hash_method - .as_ref() - .expect("hash_method must exist for non-spatial filters"); - hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; - self.bloom_hashes = Some(hashes); + if let Some(ref mut filter) = self.bloom_filter { + let mut hashes = Vec::with_capacity(column.len()); + let entry = 
BlockEntry::from(column.clone()); + let hash_method = self + .hash_method + .as_ref() + .expect("hash_method must exist for non-spatial filters"); + hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; + filter.insert_hash_batch(&hashes); + } Ok(()) } @@ -219,7 +230,7 @@ impl SingleFilterBuilder { None }; - let bloom = self.bloom_hashes.take(); + let bloom = self.bloom_filter.take().map(|f| f.into_u32s()); Ok(RuntimeFilterPacket { id: self.id, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs index 18282b1194a64..acf623ed5d2a1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use databend_common_catalog::sbbf::Sbbf; use databend_common_exception::Result; use databend_common_expression::Column; @@ -50,7 +51,7 @@ pub fn merge_join_runtime_filter_packets( let should_merge_bloom = total_build_rows < bloom_threshold; let should_merge_min_max = total_build_rows < min_max_threshold; - let packets = packets + let mut packets = packets .into_iter() .filter_map(|packet| packet.packets) .collect::>(); @@ -62,26 +63,27 @@ pub fn merge_join_runtime_filter_packets( )); } + let keys: Vec = packets[0].keys().copied().collect(); let mut result = HashMap::new(); - for id in packets[0].keys() { - result.insert(*id, RuntimeFilterPacket { - id: *id, + for id in keys { + result.insert(id, RuntimeFilterPacket { + id, inlist: if should_merge_inlist { - merge_inlist(&packets, *id)? + merge_inlist(&packets, id)? 
} else { None }, min_max: if should_merge_min_max { - merge_min_max(&packets, *id) + merge_min_max(&packets, id) } else { None }, bloom: if should_merge_bloom { - merge_bloom(&packets, *id) + merge_bloom(&mut packets, id) } else { None }, - spatial: merge_spatial(&packets, *id, spatial_threshold)?, + spatial: merge_spatial(&packets, id, spatial_threshold)?, }); } @@ -158,25 +160,25 @@ fn merge_min_max( Some(SerializableDomain { min, max }) } -fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { +fn merge_bloom( + packets: &mut [HashMap], + rf_id: usize, +) -> Option> { if packets .iter() .any(|packet| packet.get(&rf_id).unwrap().bloom.is_none()) { return None; } - let mut bloom = packets[0] - .get(&rf_id) - .unwrap() - .bloom - .as_ref() - .unwrap() - .clone(); - for packet in packets.iter().skip(1) { - let other = packet.get(&rf_id).unwrap().bloom.as_ref().unwrap(); - bloom.extend_from_slice(other); + + let first = packets[0].get_mut(&rf_id).unwrap().bloom.take().unwrap(); + let mut merged = Sbbf::from_u32s(first)?; + for packet in packets.iter_mut().skip(1) { + let other_words = packet.get_mut(&rf_id).unwrap().bloom.take().unwrap(); + let other = Sbbf::from_u32s(other_words)?; + merged.union(&other); } - Some(bloom) + Some(merged.into_u32s()) } fn merge_spatial( @@ -223,10 +225,110 @@ fn merge_spatial( })) } +/// Pairwise merge of two runtime filter packets without threshold checks. +/// Used for work-stealing incremental merge within a node. 
+pub fn merge_two_runtime_filter_packets( + mut a: JoinRuntimeFilterPacket, + mut b: JoinRuntimeFilterPacket, +) -> Result { + let total_build_rows = a.build_rows + b.build_rows; + let disable_all = a.disable_all_due_to_spill || b.disable_all_due_to_spill; + + if disable_all { + return Ok(JoinRuntimeFilterPacket::disable_all(total_build_rows)); + } + + let (a_packets, b_packets) = match (a.packets.take(), b.packets.take()) { + (None, None) => { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + (Some(p), None) | (None, Some(p)) => { + return Ok(JoinRuntimeFilterPacket::complete(p, total_build_rows)); + } + (Some(a), Some(b)) => (a, b), + }; + + let mut result = HashMap::new(); + for (id, mut a_pkt) in a_packets { + if let Some(mut b_pkt) = b_packets.get(&id).cloned() { + // Merge bloom via Sbbf::union + let bloom = match (a_pkt.bloom.take(), b_pkt.bloom.take()) { + (Some(a_words), Some(b_words)) => { + if let (Some(mut a_filter), Some(b_filter)) = + (Sbbf::from_u32s(a_words), Sbbf::from_u32s(b_words)) + { + a_filter.union(&b_filter); + Some(a_filter.into_u32s()) + } else { + None + } + } + _ => None, + }; + + // Merge inlist via concat + let inlist = match (a_pkt.inlist.take(), b_pkt.inlist.take()) { + (Some(a_col), Some(b_col)) => { + Some(Column::concat_columns([a_col, b_col].into_iter())?) 
+ } + _ => None, + }; + + // Merge min_max + let min_max = match (a_pkt.min_max.take(), b_pkt.min_max.take()) { + (Some(a_mm), Some(b_mm)) => Some(SerializableDomain { + min: a_mm.min.min(b_mm.min), + max: a_mm.max.max(b_mm.max), + }), + _ => None, + }; + + // Merge spatial + let spatial = match (a_pkt.spatial.take(), b_pkt.spatial.take()) { + (Some(a_sp), Some(b_sp)) => { + if a_sp.valid && b_sp.valid && a_sp.srid == b_sp.srid { + let rtrees = merge_rtrees_to_threshold( + vec![a_sp.rtrees.as_slice(), b_sp.rtrees.as_slice()], + usize::MAX, + )?; + Some(SpatialPacket { + valid: true, + srid: a_sp.srid, + rtrees, + }) + } else { + None + } + } + _ => None, + }; + + result.insert(id, RuntimeFilterPacket { + id, + bloom, + inlist, + min_max, + spatial, + }); + } + } + + if result.is_empty() { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + + Ok(JoinRuntimeFilterPacket::complete(result, total_build_rows)) +} + #[cfg(test)] mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::Scalar; use databend_common_expression::types::DataType; @@ -244,6 +346,12 @@ mod tests { builder.build() } + fn make_bloom(hashes: &[u64]) -> Vec { + let mut filter = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + filter.insert_hash_batch(hashes); + filter.into_u32s() + } + #[test] fn test_merge_short_circuit_all_types() -> Result<()> { let mut runtime_filters = HashMap::new(); @@ -254,7 +362,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(3)), }), - bloom: Some(vec![11, 22, 33]), + bloom: Some(make_bloom(&[11, 22, 33])), spatial: None, }); @@ -272,6 +380,9 @@ mod tests { #[test] fn test_merge_short_circuit_inlist_only() -> Result<()> { + let bloom1 = make_bloom(&[1, 2]); + let bloom2 = make_bloom(&[3, 4]); + let mut runtime_filters_1 = HashMap::new(); runtime_filters_1.insert(7, 
RuntimeFilterPacket { id: 7, @@ -280,7 +391,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(5)), }), - bloom: Some(vec![1, 2]), + bloom: Some(bloom1.clone()), spatial: None, }); let mut runtime_filters_2 = HashMap::new(); @@ -291,7 +402,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(-1)), max: Scalar::Number(NumberScalar::Int32(8)), }), - bloom: Some(vec![3, 4]), + bloom: Some(bloom2.clone()), spatial: None, }); @@ -309,7 +420,11 @@ mod tests { let packet = merged.packets.unwrap().remove(&7).unwrap(); assert_eq!(merged.build_rows, 11); assert!(packet.inlist.is_none()); - assert_eq!(packet.bloom, Some(vec![1, 2, 3, 4])); + // Bloom should be a merged Sbbf containing all hashes + let merged_filter = Sbbf::from_u32s(packet.bloom.unwrap()).unwrap(); + for h in &[1u64, 2, 3, 4] { + assert!(merged_filter.check_hash(*h)); + } assert_eq!( packet.min_max, Some(SerializableDomain { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs index 0fbf2e9b84717..0fb339b4f21e1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs @@ -26,4 +26,5 @@ pub use global::get_global_runtime_filter_packet; pub use interface::build_and_push_down_runtime_filter; pub use local_builder::RuntimeFilterLocalBuilder; pub use merge::merge_join_runtime_filter_packets; +pub use merge::merge_two_runtime_filter_packets; pub use packet::JoinRuntimeFilterPacket; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs index 707eff18a212f..ffe5263cf6bd3 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs @@ -45,7 +45,7 @@ pub struct RuntimeFilterPacket { pub id: usize, pub inlist: Option, pub min_max: Option, - pub bloom: Option>, + pub bloom: Option>, pub spatial: Option, } @@ -161,7 +161,7 @@ impl TryInto for JoinRuntimeFilterPacket { bloom_pos = Some(entities.len()); let builder = ArrayColumnBuilder { - builder: ColumnBuilder::Number(NumberColumnBuilder::UInt64(bloom_filter)), + builder: ColumnBuilder::Number(NumberColumnBuilder::UInt32(bloom_filter)), offsets: vec![0, len], }; entities.push(Column::Array(Box::new(builder.build()))); @@ -228,7 +228,7 @@ impl TryFrom for JoinRuntimeFilterPacket { let array_column = column.into_array().expect("it's a bug"); let bloom_value_column = array_column.index(0).expect("It's a bug"); bloom = Some(match bloom_value_column { - Column::Number(NumberColumn::UInt64(v)) => v.to_vec(), + Column::Number(NumberColumn::UInt32(v)) => v.to_vec(), _ => unreachable!("Unexpected runtime bloom filter column type"), }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs similarity index 73% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs index d3e4cc5bf9cc8..a3a8493f4f731 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs @@ -16,6 +16,7 @@ use databend_common_base::base::ProgressValues; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; use 
crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; @@ -31,7 +32,9 @@ pub trait Join: Send + Sync + 'static { /// returns its progress. Once all batches are consumed it returns `None` to signal completion. fn final_build(&mut self) -> Result>; - fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) {} + fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) -> Result<()> { + Ok(()) + } /// Generate runtime filter packet for the given filter description. fn build_runtime_filter(&self) -> Result { @@ -89,3 +92,42 @@ impl Join for FinishedJoin { Err(ErrorCode::Internal("Join is finished")) } } + +pub struct InnerHashJoinFilterStream<'a> { + inner: Box, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> InnerHashJoinFilterStream<'a> { + pub fn create( + inner: Box, + filter_executor: &'a mut FilterExecutor, + ) -> Box { + Box::new(InnerHashJoinFilterStream { + inner, + filter_executor, + }) + } +} + +impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { + fn next(&mut self) -> Result> { + loop { + let Some(data_block) = self.inner.next()? else { + return Ok(None); + }; + + if data_block.is_empty() { + continue; + } + + let data_block = self.filter_executor.filter(data_block)?; + + if data_block.is_empty() { + continue; + } + + return Ok(Some(data_block)); + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs index f0127c9d681cd..c76453cc9b693 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs @@ -13,6 +13,9 @@ // limitations under the License. 
mod cstyle_cell; +pub mod join; +pub mod probe_stream; +pub mod runtime_filter; mod squash_blocks; pub use cstyle_cell::CStyleCell; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs similarity index 67% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs index 72320f145d165..d1960ef572853 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs @@ -61,41 +61,6 @@ impl ProbedRows { } } -pub trait ProbeStream { +pub trait ProbeStream: Send + Sync { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()>; } - -pub struct EmptyProbeStream; - -impl ProbeStream for EmptyProbeStream { - fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { - Ok(()) - } -} - -pub struct AllUnmatchedProbeStream { - idx: u64, - size: u64, -} - -impl AllUnmatchedProbeStream { - pub fn create(size: usize) -> Box { - Box::new(AllUnmatchedProbeStream { - idx: 0, - size: size as u64, - }) - } -} - -impl ProbeStream for AllUnmatchedProbeStream { - fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { - if self.idx >= self.size { - return Ok(()); - } - - let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); - rows.unmatched.extend(self.idx..self.idx + unmatched_rows); - self.idx += unmatched_rows; - Ok(()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs similarity index 100% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index 790c0ce9e3eb0..1857969bb6059 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -13,24 +13,19 @@ // limitations under the License. mod common; -mod grace; -mod hash_join_factory; -mod hashtable; -mod hybrid; -mod join; -pub mod memory; -mod performance; -mod runtime_filter; -mod transform_hash_join; +pub mod partitioned; +pub mod unpartitioned; -pub use grace::GraceHashJoin; -pub use grace::GraceMemoryJoin; -pub use hash_join_factory::HashJoinFactory; -pub use hybrid::HybridHashJoin; -pub use hybrid::HybridHashJoinState; -pub use join::Join; -pub use join::JoinStream; -pub use memory::BasicHashJoinState; -pub use memory::InnerHashJoin; -pub use runtime_filter::RuntimeFiltersDesc; -pub use transform_hash_join::TransformHashJoin; +pub use common::join::Join; +pub use common::join::JoinStream; +pub use common::runtime_filter::RuntimeFiltersDesc; +pub use partitioned::SharedRuntimeFilterPackets; +pub use partitioned::TransformPartitionedHashJoin; +pub use unpartitioned::HashJoinFactory; +pub use unpartitioned::TransformHashJoin; +pub use unpartitioned::grace::GraceHashJoin; +pub use unpartitioned::grace::GraceMemoryJoin; +pub use unpartitioned::hybrid::HybridHashJoin; +pub use unpartitioned::hybrid::HybridHashJoinState; +pub use unpartitioned::memory::BasicHashJoinState; +pub use unpartitioned::memory::InnerHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs new file mode 100644 index 0000000000000..02223c3b9d037 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs @@ -0,0 +1,219 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::ColumnBuilder; +use databend_common_expression::DataBlock; + +/// Accumulates rows from input blocks into fixed-size output chunks +/// using mutable ColumnBuilders. When the accumulated rows reach +/// `chunk_size`, a chunk is flushed and returned. 
+pub struct FixedSizeChunkAccumulator { + chunk_size: usize, + builder_rows: usize, + builders: Vec, +} + +impl FixedSizeChunkAccumulator { + pub fn new(chunk_size: usize) -> Self { + FixedSizeChunkAccumulator { + chunk_size, + builders: vec![], + builder_rows: 0, + } + } + + pub fn accumulate(&mut self, block: DataBlock) -> Vec { + let mut output = Vec::new(); + self.append_block(block, &mut output); + output + } + + pub fn finalize(&mut self) -> Option { + match self.builder_rows { + 0 => None, + _ => Some(self.build_chunk()), + } + } + + fn ensure_builders(&mut self, block: &DataBlock) { + if self.builders.is_empty() { + self.builders = block + .columns() + .iter() + .map(|entry| ColumnBuilder::with_capacity(&entry.data_type(), self.chunk_size)) + .collect(); + } + } + + fn append_block(&mut self, block: DataBlock, output: &mut Vec) { + self.ensure_builders(&block); + + let block_rows = block.num_rows(); + let columns: Vec = block + .take_columns() + .into_iter() + .map(|e| e.to_column()) + .collect(); + + let mut offset = 0; + while offset < block_rows { + let remaining_capacity = self.chunk_size - self.builder_rows; + let rows_to_copy = (block_rows - offset).min(remaining_capacity); + + for (builder, col) in self.builders.iter_mut().zip(columns.iter()) { + let sliced = col.slice(offset..offset + rows_to_copy); + builder.append_column(&sliced); + } + + self.builder_rows += rows_to_copy; + offset += rows_to_copy; + + if self.builder_rows == self.chunk_size { + output.push(self.build_chunk()); + } + } + } + + fn build_chunk(&mut self) -> DataBlock { + let num_rows = self.builder_rows; + + let builders = std::mem::take(&mut self.builders); + + // Reinitialize builders with same column types for next chunk. 
+ let mut new_builders = Vec::with_capacity(builders.len()); + let mut columns = Vec::with_capacity(builders.len()); + for b in builders { + let dt = b.data_type(); + columns.push(BlockEntry::from(b.build())); + new_builders.push(ColumnBuilder::with_capacity(&dt, self.chunk_size)); + } + + self.builder_rows = 0; + self.builders = new_builders; + + DataBlock::new(columns, num_rows) + } +} + +#[cfg(test)] +mod tests { + use databend_common_expression::DataBlock; + use databend_common_expression::FromData; + use databend_common_expression::types::AccessType; + use databend_common_expression::types::Int32Type; + use databend_common_expression::types::StringType; + + use super::*; + + fn make_int_block(values: Vec) -> DataBlock { + DataBlock::new_from_columns(vec![Int32Type::from_data(values)]) + } + + fn extract_int_col(block: &DataBlock) -> Vec { + let col = block.get_by_offset(0).to_column(); + let col = Int32Type::try_downcast_column(&col).unwrap(); + col.iter().copied().collect() + } + + #[test] + fn test_single_block_under_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(4); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert!(chunks.is_empty()); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![1, 2, 3]); + } + + #[test] + fn test_exact_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + + assert!(acc.finalize().is_none()); + } + + #[test] + fn test_block_larger_than_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3, 4, 5, 6, 7])); + assert_eq!(chunks.len(), 2); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + assert_eq!(extract_int_col(&chunks[1]), vec![4, 5, 6]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![7]); + 
} + + #[test] + fn test_multiple_small_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.accumulate(make_int_block(vec![1, 2])).is_empty()); + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3, 4]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } + + #[test] + fn test_flush_empty() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.finalize().is_none()); + } + + #[test] + fn test_multi_column_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let block = DataBlock::new_from_columns(vec![ + Int32Type::from_data(vec![1, 2, 3, 4, 5]), + StringType::from_data(vec!["a", "b", "c", "d", "e"]), + ]); + let chunks = acc.accumulate(block); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].num_rows(), 3); + assert_eq!(chunks[0].num_columns(), 2); + + let last = acc.finalize().unwrap(); + assert_eq!(last.num_rows(), 2); + assert_eq!(last.num_columns(), 2); + + let int_col = Int32Type::try_downcast_column(&last.get_by_offset(0).to_column()).unwrap(); + let str_col = StringType::try_downcast_column(&last.get_by_offset(1).to_column()).unwrap(); + assert_eq!(int_col.iter().copied().collect::>(), vec![4, 5]); + let strs: Vec<&str> = str_col.iter().collect(); + assert_eq!(strs, vec!["d", "e"]); + } + + #[test] + fn test_reuse_after_flush() { + let mut acc = FixedSizeChunkAccumulator::new(2); + let chunks = acc.accumulate(make_int_block(vec![1, 2])); + assert_eq!(chunks.len(), 1); + assert!(acc.finalize().is_none()); + + // Accumulator can be reused after flush + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![3, 4]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs new file mode 100644 index 0000000000000..201e832b0e9da --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -0,0 +1,176 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_column::bitmap::Bitmap; + +/// Index 0 is a sentinel (empty/chain-end). Actual rows are indexed from 1. +/// Memory per row: 4 bytes (next chain) vs current ~32 bytes (pointer-based entry). +/// +/// The table is single-threaded (no atomics) — designed for per-thread use +/// under hash shuffle where each thread independently builds and probes. +/// Trait for row index types. Supports u32 (up to ~4B rows) and u64. 
+pub trait RowIndex: Copy + Default + Eq + Send + Sync + 'static + std::fmt::Debug { + const ZERO: Self; + fn from_usize(v: usize) -> Self; + fn to_usize(self) -> usize; +} + +impl RowIndex for u32 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u32 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +impl RowIndex for u64 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u64 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +/// Compact join hash table using index-based chaining. +/// +/// `first[bucket]` stores the first row index in that bucket's chain. +/// `next[row_index]` stores the next row index in the same bucket's chain. +/// Chain ends when the value is `I::ZERO` (sentinel). +pub struct CompactJoinHashTable { + /// Bucket array: first[hash & mask] = first row index (1-based) + first: Vec, + /// Chain array: next[row_index] = next row in same bucket (0 = end) + pub next: Vec, + /// Bucket count minus one, for masking + bucket_mask: usize, +} + +impl CompactJoinHashTable { + /// Create a new compact hash table for `num_rows` rows. + /// Bucket count is next power of 2 >= num_rows + (num_rows - 1) / 7. + pub fn new(num_rows: usize) -> Self { + let bucket_count = Self::calc_bucket_count(num_rows); + CompactJoinHashTable { + first: vec![I::ZERO; bucket_count], + // Index 0 is sentinel, so we need num_rows + 1 entries + next: vec![I::ZERO; num_rows + 1], + bucket_mask: bucket_count - 1, + } + } + + /// Create a direct-mapping hash table where keys are used as array indices. + /// `range` is `max_key - min_key`; the caller subtracts min_key before insertion/probe. 
+ pub fn new_direct(num_rows: usize, range: usize) -> Self { + CompactJoinHashTable { + first: vec![I::ZERO; range + 1], + next: vec![I::ZERO; num_rows + 1], + bucket_mask: 0, + } + } + + pub fn insert_chunk(&mut self, vals: &[u64], row_offset: usize) { + for (i, v) in vals.iter().enumerate() { + let row_index = row_offset + i; + let bucket = match DIRECT { + true => *v as usize, + false => (*v as usize) & self.bucket_mask, + }; + + self.next[row_index] = self.first[bucket]; + self.first[bucket] = I::from_usize(row_index); + } + } + + fn calc_bucket_count(num_rows: usize) -> usize { + if num_rows == 0 { + return 1; + } + + let target = num_rows + (num_rows.saturating_sub(1)) / 7; + target.next_power_of_two() + } + + pub fn probe(&self, vals: &mut [u64], bitmap: Option) -> usize { + let mut valids = None; + + if let Some(bitmap) = bitmap { + if bitmap.null_count() == bitmap.len() { + vals.iter_mut().for_each(|v| { + *v = 0; + }); + return 0; + } else if bitmap.null_count() > 0 { + valids = Some(bitmap); + } + } + + let mut count = 0; + let first_len = self.first.len(); + + match valids { + Some(valids) => { + for (valid, val) in valids.iter().zip(vals.iter_mut()) { + if valid { + let bucket = match DIRECT { + false => (*val as usize) & self.bucket_mask, + true if (*val as usize) < first_len => *val as usize, + true => { + *val = 0; + continue; + } + }; + + if self.first[bucket] != I::default() { + *val = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *val = 0; + } + } else { + *val = 0; + } + } + } + None => { + vals.iter_mut().for_each(|val| { + let bucket = if DIRECT { + let b = *val as usize; + if b >= first_len { + *val = 0; + return; + } + b + } else { + (*val as usize) & self.bucket_mask + }; + if self.first[bucket] != I::default() { + *val = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *val = 0; + } + }); + } + } + count + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs new file mode 100644 index 0000000000000..c8160edaae275 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -0,0 +1,207 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use databend_common_base::base::ProgressValues;
use databend_common_column::bitmap::Bitmap;
use databend_common_exception::Result;
use databend_common_expression::BlockEntry;
use databend_common_expression::DataBlock;
use databend_common_expression::FunctionContext;
use databend_common_expression::HashMethodKind;
use databend_common_expression::types::NullableColumn;

use super::partitioned_build::PartitionedHashJoinState;
use super::partitioned_build::ProbeData;
use crate::pipelines::processors::HashJoinDesc;
use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::Join;
use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows;
use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext;

/// Inner hash join operating over per-thread partitioned build state.
///
/// NOTE(review): all generic argument lists in this file were lost in
/// extraction (angle brackets stripped); they are reconstructed from usage.
pub struct PartitionedInnerJoin {
    build: PartitionedHashJoinState,
    desc: Arc<HashJoinDesc>,
    function_ctx: Arc<FunctionContext>,
    context: PerformanceContext,
}

impl PartitionedInnerJoin {
    /// Build the join operator and its per-operator performance context.
    pub fn create(
        method: HashMethodKind,
        desc: Arc<HashJoinDesc>,
        function_ctx: FunctionContext,
        max_block_size: usize,
    ) -> Self {
        let context =
            PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone());

        let function_ctx = Arc::new(function_ctx);

        PartitionedInnerJoin {
            function_ctx: function_ctx.clone(),
            build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx),
            desc,
            context,
        }
    }
}

impl Join for PartitionedInnerJoin {
    fn add_block(&mut self, data: Option<DataBlock>) -> Result<()> {
        // NOTE(review): the const/turbofish argument was lost in extraction;
        // `false` assumed (inner join needs no build-side match flags) — confirm.
        self.build.add_block::<false>(data)
    }

    fn final_build(&mut self) -> Result<Option<ProgressValues>> {
        self.build.final_build()
    }

    fn probe_block(&mut self, data: DataBlock) -> Result<Box<dyn JoinStream + '_>> {
        // Inner join emits nothing for an empty probe block or an empty build side.
        if data.is_empty() || self.build.num_rows == 0 {
            return Ok(Box::new(EmptyJoinStream));
        }

        let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?;

        let mut keys = DataBlock::new(probe_keys, data.num_rows());
        // Correlated subqueries never produce NULL join keys, so skip validity.
        let valids = match self.desc.from_correlated_subquery {
            true => None,
            false => self.desc.build_valids_by_keys(&keys)?,
        };

        self.desc.remove_keys_nullable(&mut keys);
        let probe_block = data.project(&self.desc.probe_projection);

        let probe_data = ProbeData::new(keys, valids);
        // NOTE(review): turbofish argument lost in extraction; `false` assumed.
        let probe_keys_stream = self.build.probe::<false>(probe_data)?;
        let joined_stream = PartitionedInnerJoinStream::create(
            probe_block,
            &self.build,
            probe_keys_stream,
            self.desc.clone(),
            &mut self.context.probe_result,
        );

        // With a non-equi residual condition, wrap the stream in the filter.
        match &mut self.context.filter_executor {
            None => Ok(joined_stream),
            Some(filter_executor) => Ok(InnerHashJoinFilterStream::create(
                joined_stream,
                filter_executor,
            )),
        }
    }
}

/// Pull-based stream producing joined blocks batch by batch.
struct PartitionedInnerJoinStream<'a> {
    desc: Arc<HashJoinDesc>,
    probe_data_block: DataBlock,
    build: &'a PartitionedHashJoinState,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
}

impl<'a> PartitionedInnerJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        build: &'a PartitionedHashJoinState,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        desc: Arc<HashJoinDesc>,
        probed_rows: &'a mut ProbedRows,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(PartitionedInnerJoinStream {
            desc,
            build,
            probed_rows,
            probe_data_block,
            probe_keys_stream,
        })
    }
}

impl<'a> JoinStream for PartitionedInnerJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        loop {
            // Reuse the shared ProbedRows scratch buffer across batches.
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                // Probe stream exhausted.
                return Ok(None);
            }

            if self.probed_rows.is_all_unmatched() {
                // Nothing joined in this batch; pull the next one.
                continue;
            }

            // Gather probe-side rows for the matched pairs.
            let probe_block = match self.probe_data_block.num_columns() {
                0 => None,
                _ => Some(DataBlock::take(
                    &self.probe_data_block,
                    self.probed_rows.matched_probe.as_slice(),
                )?),
            };

            // Gather build-side rows for the matched pairs.
            let build_block = match self.build.columns.is_empty() {
                true => None,
                false => {
                    let row_ptrs = self.probed_rows.matched_build.as_slice();
                    Some(DataBlock::take_column_vec(
                        self.build.columns.as_slice(),
                        self.build.column_types.as_slice(),
                        row_ptrs,
                    ))
                }
            };

            let mut result_block = match (probe_block, build_block) {
                (Some(mut probe_block), Some(build_block)) => {
                    probe_block.merge_block(build_block);
                    probe_block
                }
                (Some(probe_block), None) => probe_block,
                (None, Some(build_block)) => build_block,
                (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()),
            };

            // Duplicate probe columns that the plan maps onto build outputs,
            // adjusting nullability to the build side's expectation.
            for (index, (is_probe_nullable, is_build_nullable)) in
                self.desc.probe_to_build.iter().cloned()
            {
                let entry = match (is_probe_nullable, is_build_nullable) {
                    (true, true) | (false, false) => result_block.get_by_offset(index).clone(),
                    (true, false) => result_block.get_by_offset(index).clone().remove_nullable(),
                    (false, true) => {
                        let entry = result_block.get_by_offset(index);
                        let col = entry.to_column();

                        // Already nullable/null: reuse; otherwise wrap with an
                        // all-valid bitmap.
                        match col.is_null() || col.is_nullable() {
                            true => entry.clone(),
                            false => BlockEntry::from(NullableColumn::new_column(
                                col,
                                Bitmap::new_constant(true, result_block.num_rows()),
                            )),
                        }
                    }
                };

                result_block.add_entry(entry);
            }

            return Ok(Some(result_block));
        }
    }
}
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs new file mode 100644 index 0000000000000..4552607a8e17c --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -0,0 +1,360 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::Scalar; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NullableColumn; + +use super::partitioned_build::PartitionedHashJoinState; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::partitioned_build::ProbeData; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; +use crate::pipelines::processors::transforms::wrap_true_validity; + +pub struct PartitionedLeftJoin { + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: FunctionContext, + performance_context: PerformanceContext, +} + +impl PartitionedLeftJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + PartitionedLeftJoin { + desc: desc.clone(), + function_ctx: function_ctx.clone(), + performance_context: context, + build: 
PartitionedHashJoinState::create(method, desc, Arc::new(function_ctx)), + } + } +} + +impl Join for PartitionedLeftJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + if self.build.num_rows == 0 { + let num_rows = data.num_rows(); + + let types = self + .desc + .build_schema + .fields + .iter() + .map(|x| x.data_type().clone()) + .collect::>(); + + let build_block = + match crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + &types, + data.num_rows(), + ) { + None => None, + Some(data_block) => Some(data_block.project(&self.desc.build_projection)), + }; + + let probe_block = Some(data.project(&self.desc.probe_projection)); + let result_block = final_result_block(&self.desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(result_block)))); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_stream = self.build.probe::(probe_data)?; + + match self.performance_context.filter_executor.as_mut() { + None => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + Some(filter_executor), + )), + } + 
} +} + +struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + conjunct_unmatched: Vec, + unmatched_rows: Vec, + filter_executor: Option<&'a mut FilterExecutor>, +} + +unsafe impl<'a, const CONJUNCT: bool> Send for OuterLeftHashJoinStream<'a, CONJUNCT> {} +unsafe impl<'a, const CONJUNCT: bool> Sync for OuterLeftHashJoinStream<'a, CONJUNCT> {} + +impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if !CONJUNCT && !self.probed_rows.unmatched.is_empty() { + self.unmatched_rows + .extend_from_slice(&self.probed_rows.unmatched); + } + + if self.probed_rows.is_empty() { + if self.conjunct_unmatched.is_empty() && self.unmatched_rows.is_empty() { + return Ok(None); + } + + let unmatched_row_id = match CONJUNCT { + true => std::mem::take(&mut self.conjunct_unmatched) + .into_iter() + .enumerate() + .filter(|(_, matched)| *matched == 0) + .map(|(row_id, _)| row_id as u64) + .collect::>(), + false => std::mem::take(&mut self.unmatched_rows), + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + unmatched_row_id.as_slice(), + )?), + }; + + let types = &self.join_state.column_types; + let build_block = + crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + types, + unmatched_row_id.len(), + ); + + return Ok(Some(crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + unmatched_row_id.len(), + ))); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let probe_block = match 
self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + let build_block = DataBlock::take_column_vec( + self.join_state.columns.as_slice(), + self.join_state.column_types.as_slice(), + row_ptrs, + ); + + let true_validity = Bitmap::new_constant(true, row_ptrs.len()); + let entries = build_block + .columns() + .iter() + .map(|c| wrap_true_validity(c, row_ptrs.len(), &true_validity)); + Some(DataBlock::from_iter(entries, row_ptrs.len())) + } + }; + + let mut result_block = crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if CONJUNCT && let Some(filter_executor) = self.filter_executor.as_mut() { + let result_count = filter_executor.select(&result_block)?; + + if result_count == 0 { + continue; + } + + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; + self.conjunct_unmatched[row_id] = 1; + } + + let origin_rows = result_block.num_rows(); + result_block = filter_executor.take(result_block, origin_rows, result_count)?; + } + + return Ok(Some(result_block)); + } + } +} + +impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + let num_rows = probe_data_block.num_rows(); + let pending_unmatched = match CONJUNCT { + true => vec![0; num_rows], + false => Vec::new(), + }; + + let unmatched_rows = match CONJUNCT { + true => 
Vec::new(), + false => Vec::with_capacity(num_rows), + }; + + probed_rows.unmatched.reserve(num_rows); + Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + unmatched_rows, + conjunct_unmatched: pending_unmatched, + }) + } +} + +pub fn final_result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + if !desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + + result_block.add_entry(entry); + } + } + result_block +} + +#[allow(dead_code)] +pub fn null_block(types: &[DataType], num_rows: usize) -> Option { + match types.is_empty() { + true => None, + false => { + let columns = types + .iter() + .map(|column_type| { + BlockEntry::new_const_column( + column_type.wrap_nullable(), + Scalar::Null, + num_rows, + ) + }) + .collect::>(); + + Some(DataBlock::new(columns, num_rows)) + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs new file mode 100644 index 0000000000000..0a9bfec2c6d88 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -0,0 +1,270 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use databend_common_base::base::ProgressValues;
use databend_common_base::hints::assume;
use databend_common_column::bitmap::Bitmap;
use databend_common_exception::Result;
use databend_common_expression::DataBlock;
use databend_common_expression::FilterExecutor;
use databend_common_expression::FunctionContext;
use databend_common_expression::HashMethodKind;

use super::partitioned_build::PartitionedHashJoinState;
use super::partitioned_build::ProbeData;
use crate::pipelines::processors::HashJoinDesc;
use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::Join;
use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows;
use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block;
use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext;

/// Left anti join: emit probe rows that have NO surviving match on the build side.
///
/// NOTE(review): generic argument lists in this file were lost in extraction
/// (angle brackets stripped); reconstructed from usage.
pub struct PartitionedLeftAntiJoin {
    build: PartitionedHashJoinState,
    desc: Arc<HashJoinDesc>,
    function_ctx: Arc<FunctionContext>,
    context: PerformanceContext,
}

impl PartitionedLeftAntiJoin {
    /// Build the join operator and its per-operator performance context.
    pub fn create(
        method: HashMethodKind,
        desc: Arc<HashJoinDesc>,
        function_ctx: FunctionContext,
        max_block_size: usize,
    ) -> Self {
        let context =
            PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone());

        let function_ctx = Arc::new(function_ctx);

        PartitionedLeftAntiJoin {
            function_ctx: function_ctx.clone(),
            build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx),
            desc,
            context,
        }
    }
}

impl Join for PartitionedLeftAntiJoin {
    fn add_block(&mut self, data: Option<DataBlock>) -> Result<()> {
        // NOTE(review): turbofish argument lost in extraction; `false` assumed.
        self.build.add_block::<false>(data)
    }

    fn final_build(&mut self) -> Result<Option<ProgressValues>> {
        self.build.final_build()
    }

    fn probe_block(&mut self, data: DataBlock) -> Result<Box<dyn JoinStream + '_>> {
        if data.is_empty() {
            return Ok(Box::new(EmptyJoinStream));
        }

        // Empty build side: anti join keeps every probe row as-is.
        if self.build.num_rows == 0 {
            let probe_projected = data.project(&self.desc.probe_projection);
            return Ok(Box::new(OneBlockJoinStream(Some(probe_projected))));
        }

        let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?;

        let mut keys = DataBlock::new(probe_keys, data.num_rows());
        let valids = match self.desc.from_correlated_subquery {
            true => None,
            false => self.desc.build_valids_by_keys(&keys)?,
        };

        self.desc.remove_keys_nullable(&mut keys);
        let probe_block = data.project(&self.desc.probe_projection);

        let probe_data = ProbeData::new(keys, valids);

        match &mut self.context.filter_executor {
            None => {
                // NOTE(review): turbofish arguments on `probe` lost in
                // extraction; `false` assumed at both call sites — confirm.
                let probe_keys_stream = self.build.probe::<false>(probe_data)?;
                Ok(LeftAntiHashJoinStream::create(
                    probe_block,
                    probe_keys_stream,
                    &mut self.context.probe_result,
                ))
            }
            Some(filter_executor) => {
                let probe_keys_stream = self.build.probe::<false>(probe_data)?;
                Ok(LeftAntiFilterHashJoinStream::create(
                    probe_block,
                    &self.build,
                    probe_keys_stream,
                    self.desc.clone(),
                    &mut self.context.probe_result,
                    filter_executor,
                ))
            }
        }
    }
}

/// Filter-less anti join: a probe row qualifies iff the probe stream reported
/// it as unmatched.
struct LeftAntiHashJoinStream<'a> {
    probe_data_block: Option<DataBlock>,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
}

impl<'a> LeftAntiHashJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        probed_rows: &'a mut ProbedRows,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(LeftAntiHashJoinStream {
            probed_rows,
            probe_data_block: Some(probe_data_block),
            probe_keys_stream,
        })
    }
}

impl<'a> JoinStream for LeftAntiHashJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        // Single-shot stream: the block is taken on the first call.
        let Some(probe_data_block) = self.probe_data_block.take() else {
            return Ok(None);
        };

        let num_rows = probe_data_block.num_rows();
        let mut selected = vec![false; num_rows];

        loop {
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                // Exhausted: emit exactly the unmatched probe rows.
                let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter());
                return Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?));
            }

            for idx in &self.probed_rows.unmatched {
                selected[*idx as usize] = true;
            }
        }
    }
}

/// Anti join with residual filter: a probe row qualifies iff NONE of its
/// joined candidate rows survives the filter.
struct LeftAntiFilterHashJoinStream<'a> {
    desc: Arc<HashJoinDesc>,
    probe_data_block: Option<DataBlock>,
    build: &'a PartitionedHashJoinState,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
    filter_executor: &'a mut FilterExecutor,
}

impl<'a> LeftAntiFilterHashJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        build: &'a PartitionedHashJoinState,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        desc: Arc<HashJoinDesc>,
        probed_rows: &'a mut ProbedRows,
        filter_executor: &'a mut FilterExecutor,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(LeftAntiFilterHashJoinStream {
            desc,
            build,
            probed_rows,
            filter_executor,
            probe_keys_stream,
            probe_data_block: Some(probe_data_block),
        })
    }
}

impl<'a> JoinStream for LeftAntiFilterHashJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        let Some(probe_data_block) = self.probe_data_block.take() else {
            return Ok(None);
        };

        let num_rows = probe_data_block.num_rows();
        // Start with every row selected; clear rows with a surviving match.
        let mut selected = vec![true; num_rows];

        loop {
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                break;
            }

            if self.probed_rows.is_all_unmatched() {
                continue;
            }

            let probe_block = match probe_data_block.num_columns() {
                0 => None,
                _ => Some(DataBlock::take(
                    &probe_data_block,
                    self.probed_rows.matched_probe.as_slice(),
                )?),
            };

            let build_block = match self.build.columns.is_empty() {
                true => None,
                false => {
                    let row_ptrs = self.probed_rows.matched_build.as_slice();
                    Some(DataBlock::take_column_vec(
                        self.build.columns.as_slice(),
                        self.build.column_types.as_slice(),
                        row_ptrs,
                    ))
                }
            };

            let result_block = final_result_block(
                &self.desc,
                probe_block,
                build_block,
                self.probed_rows.matched_build.len(),
            );

            let selected_rows = self.filter_executor.select(&result_block)?;

            if selected_rows == result_block.num_rows() {
                // All pairs survived: every involved probe row is disqualified.
                for probe_idx in &self.probed_rows.matched_probe {
                    assume((*probe_idx as usize) < selected.len());
                    selected[*probe_idx as usize] = false;
                }
            } else if selected_rows != 0 {
                let selection = self.filter_executor.true_selection();
                for idx in selection[..selected_rows].iter() {
                    assume((*idx as usize) < self.probed_rows.matched_probe.len());
                    let idx = self.probed_rows.matched_probe[*idx as usize];
                    assume((idx as usize) < selected.len());
                    selected[idx as usize] = false;
                }
            }
        }

        let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied());
        match bitmap.true_count() {
            0 => Ok(None),
            _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)),
        }
    }
}
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs new file mode 100644 index 0000000000000..74cc501f616df --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -0,0 +1,258 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedLeftSemiJoin { + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, +} + +impl PartitionedLeftSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedLeftSemiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + desc, + context, + } + } +} + +impl Join for PartitionedLeftSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> 
{ + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + + match &mut self.context.filter_executor { + None => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )) + } + Some(filter_executor) => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + )) + } + } + } +} + +struct LeftSemiHashJoinStream<'a> { + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +impl<'a> LeftSemiHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(LeftSemiHashJoinStream { + probed_rows, + probe_data_block, + probe_keys_stream, + }) + } +} + +impl<'a> JoinStream for LeftSemiHashJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + return Ok(Some(DataBlock::take( + &self.probe_data_block, + 
self.probed_rows.matched_probe.as_slice(), + )?)); + } + } +} + +struct LeftSemiFilterHashJoinStream<'a> { + desc: Arc, + probe_data_block: Option, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> LeftSemiFilterHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, + ) -> Box { + Box::new(LeftSemiFilterHashJoinStream { + desc, + build, + probed_rows, + filter_executor, + probe_keys_stream, + probe_data_block: Some(probe_data_block), + }) + } +} + +impl<'a> JoinStream for LeftSemiFilterHashJoinStream<'a> { + fn next(&mut self) -> Result> { + let Some(probe_data_block) = self.probe_data_block.take() else { + return Ok(None); + }; + + let num_rows = probe_data_block.num_rows(); + let mut selected = vec![false; num_rows]; + + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + break; + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + let probe_block = match probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let num_matched = self.probed_rows.matched_probe.len(); + let result = final_result_block(&self.desc, probe_block, build_block, num_matched); + + let selected_rows = self.filter_executor.select(&result)?; + + if selected_rows == result.num_rows() { + 
for probe_idx in &self.probed_rows.matched_probe { + assume((*probe_idx as usize) < selected.len()); + selected[*probe_idx as usize] = true; + } + } else if selected_rows != 0 { + let selection = self.filter_executor.true_selection(); + for idx in selection[..selected_rows].iter() { + assume((*idx as usize) < self.probed_rows.matched_probe.len()); + let idx = self.probed_rows.matched_probe[*idx as usize]; + assume((idx as usize) < selected.len()); + selected[idx as usize] = true; + } + } + } + + let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied()); + + match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs new file mode 100644 index 0000000000000..a896bf13bc191 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs @@ -0,0 +1,38 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod chunk_accumulator; +mod compact_hash_table; +mod inner_join; +mod left_join; +mod left_join_anti; +mod left_join_semi; +mod partitioned_build; +mod right_join; +mod right_join_anti; +mod right_join_semi; +mod transform_hash_join; + +pub use compact_hash_table::CompactJoinHashTable; +pub use compact_hash_table::RowIndex; +pub use inner_join::PartitionedInnerJoin; +pub use left_join::PartitionedLeftJoin; +pub use left_join_anti::PartitionedLeftAntiJoin; +pub use left_join_semi::PartitionedLeftSemiJoin; +pub use partitioned_build::PartitionedHashJoinState; +pub use right_join::PartitionedRightJoin; +pub use right_join_anti::PartitionedRightAntiJoin; +pub use right_join_semi::PartitionedRightSemiJoin; +pub use transform_hash_join::SharedRuntimeFilterPackets; +pub use transform_hash_join::TransformPartitionedHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs new file mode 100644 index 0000000000000..94d792bc826ba --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -0,0 +1,945 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; +use databend_common_column::binary::BinaryColumn; +use databend_common_column::bitmap::Bitmap; +use databend_common_column::buffer::Buffer; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FixedKey; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodKind; +use databend_common_expression::KeysState; +use databend_common_expression::ProjectedBlock; +use databend_common_expression::types::DataType; +use databend_common_expression::with_hash_method; +use ethnum::u256; + +use super::chunk_accumulator::FixedSizeChunkAccumulator; +use super::compact_hash_table::CompactJoinHashTable; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::RowIndex; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::EmptyProbeStream; + +pub const CHUNK_BITS: usize = 16; +pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536 + +/// Convert a 1-based flat index to RowPtr (chunk_index, row_offset). 
+#[inline(always)] +pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr { + let zero_based = flat_index - 1; + RowPtr { + chunk_index: (zero_based >> CHUNK_BITS) as u32, + row_index: (zero_based & (CHUNK_SIZE - 1)) as u32, + } +} + +pub struct ProbeData { + keys: DataBlock, + valids: Option, +} + +impl ProbeData { + pub fn new(keys: DataBlock, valids: Option) -> Self { + ProbeData { keys, valids } + } + + pub fn num_rows(&self) -> usize { + self.keys.num_rows() + } + + pub fn columns(&self) -> &[BlockEntry] { + self.keys.columns() + } + + pub fn non_null_rows(&self) -> usize { + match &self.valids { + None => self.keys.num_rows(), + Some(valids) => valids.len() - valids.null_count(), + } + } + + pub fn into_raw(self) -> (DataBlock, Option) { + (self.keys, self.valids) + } +} + +pub enum BuildKeysStates { + UInt8(Vec>), + UInt16(Vec>), + UInt32(Vec>), + UInt64(Vec>), + UInt128(Vec>), + UInt256(Vec>), + Binary(Vec), +} + +impl BuildKeysStates { + pub fn get(&self, idx: usize) -> KeysState { + match self { + BuildKeysStates::UInt8(v) => u8::upcast(v[idx].clone()), + BuildKeysStates::UInt16(v) => u16::upcast(v[idx].clone()), + BuildKeysStates::UInt32(v) => u32::upcast(v[idx].clone()), + BuildKeysStates::UInt64(v) => u64::upcast(v[idx].clone()), + BuildKeysStates::UInt128(v) => u128::upcast(v[idx].clone()), + BuildKeysStates::UInt256(v) => u256::upcast(v[idx].clone()), + BuildKeysStates::Binary(v) => KeysState::Column(Column::Binary(v[idx].clone())), + } + } +} + +impl BuildKeysStates { + pub fn new(method: &HashMethodKind) -> Self { + match method { + HashMethodKind::Serializer(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::SingleBinary(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::KeysU8(_) => BuildKeysStates::UInt8(vec![]), + HashMethodKind::KeysU16(_) => BuildKeysStates::UInt16(vec![]), + HashMethodKind::KeysU32(_) => BuildKeysStates::UInt32(vec![]), + HashMethodKind::KeysU64(_) => BuildKeysStates::UInt64(vec![]), + HashMethodKind::KeysU128(_) 
=> BuildKeysStates::UInt128(vec![]), + HashMethodKind::KeysU256(_) => BuildKeysStates::UInt256(vec![]), + } + } +} + +/// Maximum key range for direct hash join (same as Doris: 1 << 23 = 8M). +const DIRECT_JOIN_MAX_RANGE: u64 = 1 << 23; + +/// Per-thread build state for partitioned hash join. +pub struct PartitionedHashJoinState { + pub chunks: Vec, + pub method: HashMethodKind, + pub build_keys_states: BuildKeysStates, + pub hash_table: CompactJoinHashTable, + + pub columns: Vec, + pub column_types: Vec, + + pub num_rows: usize, + pub build_block_idx: usize, + + pub direct_join: bool, + pub min_key: u256, + + pub visited: Vec>, + pub desc: Arc, + pub function_ctx: Arc, + + pub accumulator: FixedSizeChunkAccumulator, +} + +impl PartitionedHashJoinState { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: Arc, + ) -> Self { + PartitionedHashJoinState { + chunks: Vec::new(), + build_keys_states: BuildKeysStates::new(&method), + hash_table: CompactJoinHashTable::new(0), + columns: Vec::new(), + column_types: Vec::new(), + num_rows: 0, + method, + desc, + function_ctx, + build_block_idx: 0, + direct_join: false, + min_key: u256::ZERO, + visited: vec![], + accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), + } + } + + pub fn add_block(&mut self, data: Option) -> Result<()> { + let Some(data_block) = data else { + if let Some(chunk) = self.accumulator.finalize() { + self.ingest_chunk::(chunk)?; + } + + return Ok(()); + }; + + let data_block = self.prepare_data::(data_block)?; + for ready_block in self.accumulator.accumulate(data_block) { + self.ingest_chunk::(ready_block)?; + } + + Ok(()) + } + + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + let num_rows = chunk.num_rows(); + let mut columns = chunk.take_columns(); + let data_columns = columns.split_off(self.desc.build_keys.len()); + + let mut keys_block = DataBlock::new(columns, num_rows); + let mut chunk = DataBlock::new(data_columns, num_rows); + if VISITED { + if let 
Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; + chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; + } + } + self.desc.remove_keys_nullable(&mut keys_block); + } + + let keys = ProjectedBlock::from(keys_block.columns()); + + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => method.build_keys_state(keys, keys_block.num_rows())?, + }); + + if VISITED { + self.visited.push(vec![0u8; num_rows]); + } + + self.num_rows += num_rows; + self.chunks.push(chunk); + self.add_build_state(keys_state); + Ok(()) + } + + fn prepare_data(&self, mut chunk: DataBlock) -> Result { + let num_rows = chunk.num_rows(); + + let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, num_rows); + + chunk = chunk.project(&self.desc.build_projection); + + if !VISITED { + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? 
{ + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + chunk = chunk.filter_with_bitmap(&bitmap)?; + } + } + self.desc.remove_keys_nullable(&mut keys_block); + } + + keys_block.merge_block(chunk); + Ok(keys_block) + } + + pub fn final_build(&mut self) -> Result> { + if self.num_rows == 0 { + return Ok(None); + } + + if self.build_block_idx == 0 { + if let Some(first_chunk) = self.chunks.first() { + self.column_types = (0..first_chunk.num_columns()) + .map(|offset| first_chunk.get_by_offset(offset).data_type()) + .collect(); + + let num_cols = first_chunk.num_columns(); + let mut columns = Vec::with_capacity(num_cols); + for offset in 0..num_cols { + let full_columns: Vec = self + .chunks + .iter() + .map(|chunk| chunk.get_by_offset(offset).to_column()) + .collect(); + columns.push(Column::take_downcast_column_vec(&full_columns)); + } + self.columns = columns; + } + + // Decide whether to use direct mapping + let direct_range = match &self.build_keys_states { + BuildKeysStates::UInt8(_) => Some((u256::ZERO, u8::MAX as u64)), + BuildKeysStates::UInt16(_) => Some((u256::ZERO, u16::MAX as u64)), + BuildKeysStates::UInt32(bufs) => scan_min_max_u32(bufs), + BuildKeysStates::UInt64(bufs) => scan_min_max_u64(bufs), + BuildKeysStates::UInt128(bufs) => scan_min_max_u128(bufs), + BuildKeysStates::UInt256(bufs) => scan_min_max_u256(bufs), + _ => None, + }; + + match direct_range { + Some((min_key, range)) => { + self.direct_join = true; + self.min_key = min_key; + self.hash_table = + CompactJoinHashTable::new_direct(self.num_rows, range as usize); + } + None => { + self.hash_table = CompactJoinHashTable::new(self.num_rows); + } + }; + } + + let row_offset = CHUNK_SIZE * self.build_block_idx + 1; + let idx = self.build_block_idx; + + if self.direct_join { + match &self.build_keys_states { + BuildKeysStates::UInt8(states) => { + let min_t = self.min_key.as_u8(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| 
k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt16(states) => { + let min_t = self.min_key.as_u16(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt32(states) => { + let min_t = self.min_key.as_u32(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt64(states) => { + let min_t = self.min_key.as_u64(); + let adjusted: Vec = + states[idx].iter().map(|k| k.wrapping_sub(min_t)).collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt128(states) => { + let min_t = self.min_key.as_u128(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt256(states) => { + let min_t = self.min_key; + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + _ => unreachable!(), + } + } else { + let keys_state = self.build_keys_states.get(idx); + with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let mut hashes = Vec::with_capacity(CHUNK_SIZE); + method.build_keys_hashes(&keys_state, &mut hashes); + self.hash_table.insert_chunk::(&hashes, row_offset); + } + }); + } + + self.build_block_idx += 1; + match self.build_block_idx == self.chunks.len() { + true => Ok(None), + false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), + } + } + + pub fn probe<'a, const MATCHED: bool, const MATCH_FIRST: bool>( + &'a self, + data: ProbeData, + ) -> Result> { + let num_rows = data.num_rows(); + let (keys_block, valids) = data.into_raw(); + let keys = 
ProjectedBlock::from(keys_block.columns()); + + if self.direct_join { + return self.probe_direct::(keys, num_rows, valids); + } + + let mut hashes = Vec::with_capacity(num_rows); + let (keys_state, matched_rows) = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let keys_state = method.build_keys_state(keys, num_rows)?; + method.build_keys_hashes(&keys_state, &mut hashes); + ( + keys_state, + self.hash_table.probe::(&mut hashes, valids), + ) + } + }); + + if matched_rows == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(hashes.len())), + }; + } + + Ok(match (&self.method, &self.build_keys_states) { + (HashMethodKind::KeysU8(_), BuildKeysStates::UInt8(states)) => { + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU16(_), BuildKeysStates::UInt16(states)) => { + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU32(_), BuildKeysStates::UInt32(states)) => { + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU64(_), BuildKeysStates::UInt64(states)) => { + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU128(_), BuildKeysStates::UInt128(states)) => { + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, false, u32>::new( + 
hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU256(_), BuildKeysStates::UInt256(states)) => { + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + ( + HashMethodKind::Serializer(_) | HashMethodKind::SingleBinary(_), + BuildKeysStates::Binary(states), + ) => match keys_state { + KeysState::Column(Column::Binary(probe_keys)) + | KeysState::Column(Column::Variant(probe_keys)) + | KeysState::Column(Column::Bitmap(probe_keys)) => { + BinaryProbeStream::<'a, MATCHED, MATCH_FIRST, u32>::create( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }, + _ => unreachable!(), + }) + } + + fn probe_direct<'a, const MATCHED: bool, const MATCH_FIRST: bool>( + &'a self, + keys: ProjectedBlock<'_>, + num_rows: usize, + valids: Option, + ) -> Result> { + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, + }); + + Ok(match &self.build_keys_states { + BuildKeysStates::UInt8(bufs) => { + let min_t = self.min_key.as_u8(); + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt16(bufs) => { + let min_t = self.min_key.as_u16(); + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if 
self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt32(bufs) => { + let min_t = self.min_key.as_u32(); + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt64(bufs) => { + let min_t = self.min_key.as_u64(); + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = + probe_keys.iter().map(|k| k.wrapping_sub(min_t)).collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt128(bufs) => { + let min_t = self.min_key.as_u128(); + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, true, u32>::new( + 
adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt256(bufs) => { + let min_t = self.min_key; + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }) + } + + fn add_build_state(&mut self, state: KeysState) { + match &mut self.build_keys_states { + BuildKeysStates::UInt8(states) => { + states.push(u8::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt16(states) => { + states.push(u16::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt32(states) => { + states.push(u32::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt64(states) => { + states.push(u64::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt128(states) => { + states.push(u128::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt256(states) => { + states.push(u256::downcast_owned(state).unwrap()); + } + BuildKeysStates::Binary(states) => match state { + KeysState::Column(Column::Binary(build_keys)) + | KeysState::Column(Column::Variant(build_keys)) + | KeysState::Column(Column::Bitmap(build_keys)) => { + states.push(build_keys); + } + _ => unreachable!(), + }, + }; + } +} + +struct PrimitiveProbeStream< + 'a, + T: Send + Sync + PartialEq, + const MATCHED: bool, + const MATCH_FIRST: bool, + const DIRECT: bool, + I: RowIndex = u32, +> { + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: Buffer, + build_keys: &'a [Buffer], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const 
DIRECT: bool, I> + PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + T: Send + Sync + PartialEq, + I: RowIndex, +{ + #[allow(clippy::new_ret_no_self)] + pub fn new( + pointers: Vec, + build_keys: &'a [Buffer], + probe_keys: Buffer, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const DIRECT: bool, I> ProbeStream + for PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + I: RowIndex, + T: Send + Sync + PartialEq, +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + + let key_match = DIRECT + || self.probe_keys[self.key_idx] + == self.build_keys[row_ptr.chunk_index as usize] + [row_ptr.row_index as usize]; + + if key_match { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + + if res.matched_probe.len() == max_rows { + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + + if MATCH_FIRST { + self.build_idx = 0; + break; + } + } + + self.build_idx 
= self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } +} + +struct BinaryProbeStream<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex = u32> { + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: BinaryColumn, + build_keys: &'a [BinaryColumn], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> + BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ + pub fn create( + pointers: Vec, + build_keys: &'a [BinaryColumn], + probe_keys: BinaryColumn, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> ProbeStream + for BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + if self.probe_keys.value(self.key_idx) + == self.build_keys[row_ptr.chunk_index as usize] + .value(row_ptr.row_index as usize) + { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + 
+ if res.matched_probe.len() == max_rows { + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + + if MATCH_FIRST { + self.build_idx = 0; + break; + } + } + + self.build_idx = self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } +} + +/// Scan min/max with short-circuit per chunk. Returns Some((min as u256, range)) if range <= threshold. +fn scan_min_max_u32(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u32::MAX; + let mut max_val = u32::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if (max_val as u64).wrapping_sub(min_val as u64) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val as u64) - (min_val as u64))) +} + +fn scan_min_max_u64(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u64::MAX; + let mut max_val = u64::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), max_val - min_val)) +} + +fn scan_min_max_u128(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u128::MAX; + let mut max_val = u128::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE as u128 { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val - min_val) as u64)) +} + +fn scan_min_max_u256(buffers: &[Buffer]) 
-> Option<(u256, u64)> { + let mut min_val = u256::MAX; + let mut max_val = u256::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > u256::from(DIRECT_JOIN_MAX_RANGE) { + return None; + } + } + if min_val > max_val { + return None; + } + Some((min_val, (max_val - min_val).as_u64())) +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs new file mode 100644 index 0000000000000..6f5399e32623d --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -0,0 +1,322 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::null_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; +use crate::pipelines::processors::transforms::wrap_nullable_block; + +pub struct PartitionedRightJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightJoin { + function_ctx: 
function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join for PartitionedRightJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let mut probe_keys = { + let nullable_block = wrap_nullable_block(&data); + let probe_keys = self.desc.probe_key(&nullable_block, &self.function_ctx)?; + DataBlock::new(probe_keys, data.num_rows()) + }; + + let valids = self.desc.build_valids_by_keys(&probe_keys)?; + + self.desc.remove_keys_nullable(&mut probe_keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(probe_keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let mut probe_types = Vec::new(); + for (i, field) in self.desc.probe_schema.fields().iter().enumerate() { + if self.desc.probe_projection.contains(&i) { + probe_types.push(field.data_type().clone()); + } + } + + Ok(Some(Box::new(PartitionedRightFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, 
+ max_block_size: self.max_block_size, + desc: self.desc.clone(), + probe_types, + }))) + } +} + +struct OuterRightHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, +} + +impl<'a, const CONJUNCT: bool> OuterRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(OuterRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) + } +} + +impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(wrap_nullable_block(&DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?)), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let data_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + 
.add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + return Ok(Some(data_block)); + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + return Ok(Some(data_block)); + }; + + if !data_block.is_empty() { + let res_rows = filter_executor.select(&data_block)?; + + if res_rows == 0 { + continue; + } + + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(res_rows) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + let num_rows = data_block.num_rows(); + return Ok(Some(filter_executor.take(data_block, num_rows, res_rows)?)); + } + } + } +} + +struct PartitionedRightFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, + desc: Arc, + probe_types: Vec, +} + +impl<'a> JoinStream for PartitionedRightFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + let probe_block = null_block(&self.probe_types, row_ptrs.len()); + let build_block = if self.columns.is_empty() { + None + } else { + Some(DataBlock::take_column_vec( + 
self.columns, + self.column_types, + &row_ptrs, + )) + }; + + Ok(Some(final_result_block( + &self.desc, + probe_block, + build_block, + row_ptrs.len(), + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs new file mode 100644 index 0000000000000..525a956837151 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -0,0 +1,170 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use super::right_join_semi::SemiRightHashJoinStream; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedRightAntiJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightAntiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightAntiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join for PartitionedRightAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 
{ + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightAntiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, + max_block_size: self.max_block_size, + }))) + } +} + +struct PartitionedRightAntiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightAntiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= 
chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs new file mode 100644 index 0000000000000..c41d5995ad605 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -0,0 +1,291 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedRightSemiJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightSemiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join 
for PartitionedRightSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightSemiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, + max_block_size: self.max_block_size, + }))) + } +} + +pub(super) struct SemiRightHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, +} + +impl<'a, const CONJUNCT: bool> SemiRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a 
PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(SemiRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) + } +} + +impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + continue; + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + continue; + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let result_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !result_block.is_empty() { + let result_count = filter_executor.select(&result_block)?; + + if result_count == 0 { + continue; + } + + let true_sel = 
filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + } + } + } +} + +struct PartitionedRightSemiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightSemiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] != 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs new file mode 100644 index 0000000000000..27a6edaa92a5a --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -0,0 +1,544 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::BTreeSet; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::PoisonError; +use std::time::Instant; + +use databend_common_base::base::Barrier; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_pipeline::core::Event; +use databend_common_pipeline::core::InputPort; +use databend_common_pipeline::core::OutputPort; +use databend_common_pipeline::core::Processor; +use databend_common_pipeline::core::ProcessorPtr; +use databend_common_sql::plans::JoinType; +use log::info; + +use super::PartitionedInnerJoin; +use super::PartitionedLeftAntiJoin; +use super::PartitionedLeftJoin; +use super::PartitionedLeftSemiJoin; +use super::PartitionedRightAntiJoin; +use super::PartitionedRightJoin; +use super::PartitionedRightSemiJoin; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; +use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; + +pub struct SharedRuntimeFilterPackets { + packets: Mutex>, +} + +impl SharedRuntimeFilterPackets { + pub fn create() -> Arc { + Arc::new(SharedRuntimeFilterPackets { + packets: Mutex::new(Vec::new()), + }) + } + + pub fn merge_packet(&self, mut my_packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.packets.lock(); + let mut guard = locked.unwrap_or_else(PoisonError::into_inner); + + if guard.is_empty() { + guard.push(my_packet); + return Ok(()); + } + + let other = guard.pop().unwrap(); + drop(guard); + my_packet = merge_two_runtime_filter_packets(my_packet, other)?; + } + } + + pub fn take_packet(&self) -> Option { + let mut guard = self.packets.lock().unwrap_or_else(PoisonError::into_inner); + guard.pop() + } +} + +pub struct TransformPartitionedHashJoin { + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + + stage: Stage, + join: Box, + joined_data: Option, + + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + runtime_filter_builder: Option, + shared_rf_packets: Arc, + instant: Instant, +} + +impl TransformPartitionedHashJoin { + pub fn create( + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + join: Box, + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + shared_rf_packets: Arc, + ) -> Result { + let runtime_filter_builder = RuntimeFilterLocalBuilder::try_create( + &rf_desc.func_ctx, + rf_desc.filters_desc.clone(), + rf_desc.inlist_threshold, + rf_desc.bloom_threshold, + rf_desc.min_max_threshold, + rf_desc.spatial_threshold, + )?; + + Ok(ProcessorPtr::create(Box::new( + TransformPartitionedHashJoin { + build_port, + probe_port, + joined_port, + join, + rf_desc, + projection, + stage_sync_barrier, + shared_rf_packets, + joined_data: None, + runtime_filter_builder, + stage: Stage::Build(BuildState { + finished: false, + build_data: None, + }), + instant: Instant::now(), + }, + ))) + } 
+ + pub fn create_join( + typ: JoinType, + hash_method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Box { + match typ { + JoinType::Inner => Box::new(PartitionedInnerJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Left => Box::new(PartitionedLeftJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftAnti => Box::new(PartitionedLeftAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftSemi => Box::new(PartitionedLeftSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Right => Box::new(PartitionedRightJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightSemi => Box::new(PartitionedRightSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightAnti => Box::new(PartitionedRightAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + _ => unreachable!(), + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformPartitionedHashJoin { + fn name(&self) -> String { + String::from("TransformPartitionedHashJoin") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.joined_port.is_finished() { + self.build_port.finish(); + self.probe_port.finish(); + + if !matches!(self.stage, Stage::Finished) { + self.stage = Stage::Finished; + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + self.stage_sync_barrier.reduce_quorum(1); + drop(finished); + } + + return Ok(Event::Finished); + } + + if !self.joined_port.can_push() { + match self.stage { + Stage::Build(_) => self.build_port.set_not_need_data(), + Stage::Probe(_) => self.probe_port.set_not_need_data(), + Stage::BuildFinal(_) | Stage::ProbeFinal(_) | Stage::Finished => (), + } + return Ok(Event::NeedConsume); + } 
+ + if let Some(joined_data) = self.joined_data.take() { + let joined_data = joined_data.project(&self.projection); + self.joined_port.push_data(Ok(joined_data)); + return Ok(Event::NeedConsume); + } + + match &mut self.stage { + Stage::Build(state) => state.event(&self.build_port), + Stage::BuildFinal(state) => state.event(), + Stage::Probe(state) => state.event(&self.probe_port), + Stage::ProbeFinal(state) => state.event(&self.joined_port), + Stage::Finished => Ok(Event::Finished), + } + } + + fn process(&mut self) -> Result<()> { + match &mut self.stage { + Stage::Finished => Ok(()), + Stage::Build(state) => { + let Some(data_block) = state.build_data.take() else { + if !state.finished { + state.finished = true; + self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.shared_rf_packets.merge_packet(packet)?; + } + } + return Ok(()); + }; + + if !data_block.is_empty() { + if let Some(builder) = self.runtime_filter_builder.as_mut() { + builder.add_block(&data_block)?; + } + self.join.add_block(Some(data_block))?; + } + + Ok(()) + } + Stage::BuildFinal(state) => { + state.finished = self.join.final_build()?.is_none(); + Ok(()) + } + Stage::Probe(state) => { + if let Some(probe_data) = state.input_data.take() { + let stream = self.join.probe_block(probe_data)?; + state.stream = Some(unsafe { + std::mem::transmute::, Box>(stream) + }); + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? { + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } + } + + Ok(()) + } + Stage::ProbeFinal(state) => { + if state.stream.is_none() { + if let Some(final_stream) = self.join.final_probe()? 
{ + state.initialize = true; + state.stream = Some(unsafe { + std::mem::transmute::, Box>( + final_stream, + ) + }); + } else { + state.finished = true; + } + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? { + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } else { + state.initialize = false; + } + } + + Ok(()) + } + } + } + + async fn async_process(&mut self) -> Result<()> { + let elapsed = self.instant.elapsed(); + + self.stage = match &mut self.stage { + Stage::Build(_) => { + let wait_res = self.stage_sync_barrier.wait().await; + + let rf_build_elapsed = self.instant.elapsed() - elapsed; + let _wait_res = self.stage_sync_barrier.wait().await; + let before_wait = self.instant.elapsed(); + + if wait_res.is_leader() { + let packet = self + .shared_rf_packets + .take_packet() + .unwrap_or_else(|| JoinRuntimeFilterPacket::complete_without_filters(0)); + info!( + "spilled: false, globalize runtime filter: total {}, disable_all_due_to_spill: {}", + packet.packets.as_ref().map_or(0, |p| p.len()), + packet.disable_all_due_to_spill + ); + self.rf_desc.globalization(packet).await?; + } + + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_rf_elapsed = self.instant.elapsed() - before_wait; + + log::info!( + "PartitionedHashJoin build stage, sync work elapsed: {:?}, build rf elapsed: {:?}, wait other node rf elapsed: {:?}", + elapsed, + rf_build_elapsed, + wait_rf_elapsed + ); + + self.instant = Instant::now(); + Stage::BuildFinal(BuildFinalState::new()) + } + // BuildFinal → Probe: barrier + Stage::BuildFinal(_) => { + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_elapsed = self.instant.elapsed() - elapsed; + log::info!( + "PartitionedHashJoin build final stage, sync work elapsed: {:?}, wait elapsed: {:?}", + elapsed, + wait_elapsed + ); + + self.instant = Instant::now(); + Stage::Probe(ProbeState::new()) + } + // Probe → ProbeFinal: no barrier + Stage::Probe(_) => 
{ + log::info!("PartitionedHashJoin probe stage elapsed: {:?}", elapsed); + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState::new()) + } + // ProbeFinal → Finished or continue: no barrier + Stage::ProbeFinal(state) => match state.finished { + true => { + log::info!( + "PartitionedHashJoin probe final stage elapsed: {:?}", + elapsed + ); + self.instant = Instant::now(); + + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + drop(finished); + + Stage::Finished + } + false => { + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState { + initialize: true, + finished: state.finished, + stream: state.stream.take(), + }) + } + }, + Stage::Finished => Stage::Finished, + }; + + Ok(()) + } +} + +#[derive(Debug)] +enum Stage { + Build(BuildState), + BuildFinal(BuildFinalState), + Probe(ProbeState), + ProbeFinal(ProbeFinalState), + Finished, +} + +#[derive(Debug)] +struct BuildState { + finished: bool, + build_data: Option, +} + +impl BuildState { + pub fn event(&mut self, input: &InputPort) -> Result { + if self.build_data.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.build_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + }; + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +#[derive(Debug)] +struct BuildFinalState { + finished: bool, +} + +impl BuildFinalState { + pub fn new() -> BuildFinalState { + BuildFinalState { finished: false } + } + + pub fn event(&mut self) -> Result { + match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + } + } +} + +struct ProbeState { + input_data: Option, + stream: Option>, +} + +impl Debug for ProbeState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeState").finish() + } +} + +impl ProbeState { + pub fn new() -> ProbeState { + 
ProbeState { + input_data: None, + stream: None, + } + } + + pub fn event(&mut self, input: &InputPort) -> Result { + if self.input_data.is_some() || self.stream.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.input_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return Ok(Event::Async); + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +struct ProbeFinalState { + finished: bool, + initialize: bool, + stream: Option>, +} + +impl Debug for ProbeFinalState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeFinalState") + .field("initialized", &self.finished) + .finish() + } +} + +impl ProbeFinalState { + pub fn new() -> ProbeFinalState { + ProbeFinalState { + stream: None, + finished: false, + initialize: false, + } + } + + pub fn event(&mut self, output_port: &OutputPort) -> Result { + if self.stream.is_some() { + return Ok(Event::Sync); + } + + if self.finished { + output_port.finish(); + return Ok(Event::Async); + } + + match self.initialize { + true => Ok(Event::Sync), + false => Ok(Event::Async), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs index e60310020408d..b5e98ef2e7587 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs @@ -29,15 +29,15 @@ use databend_common_pipeline_transforms::traits::Location; use databend_common_storage::DataOperator; use 
databend_common_storages_parquet::ReadSettings; +use super::grace_memory::GraceMemoryJoin; +use super::grace_state::GraceHashJoinState; +use super::grace_state::SpillMetadata; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::get_hashes; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::GraceHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::SpillMetadata; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::sessions::QueryContext; use crate::spillers::Layout; use crate::spillers::SpillAdapter; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs similarity index 87% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs index cfc7fae6c05da..e4b7d4fe7a0f5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs @@ -14,16 +14,16 @@ use std::sync::PoisonError; +use super::super::memory::AntiLeftHashJoin; +use super::super::memory::AntiRightHashJoin; +use 
super::super::memory::OuterRightHashJoin; +use super::super::memory::SemiLeftHashJoin; +use super::super::memory::SemiRightHashJoin; +use super::super::memory::left_join::OuterLeftHashJoin; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; pub trait GraceMemoryJoin: Join { fn reset_memory(&mut self); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs index 33482a95cd320..fc14bc04d2d75 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs @@ -27,7 +27,7 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; use databend_common_storages_fuse::TableContext; -use super::common::CStyleCell; +use super::super::common::CStyleCell; use super::grace::GraceHashJoinState; use super::grace::GraceMemoryJoin; use super::hybrid::HybridHashJoin; @@ -39,12 +39,12 @@ use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::OuterRightHashJoin; +use 
crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::OuterLeftHashJoin; use crate::sessions::QueryContext; pub struct HashJoinFactory { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs new file mode 100644 index 0000000000000..cb6f7c35e7cbc --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs @@ -0,0 +1,53 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_exception::Result; + +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; + +pub struct EmptyProbeStream; + +impl ProbeStream for EmptyProbeStream { + fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { + Ok(()) + } +} + +pub struct AllUnmatchedProbeStream { + idx: u64, + size: u64, +} + +impl AllUnmatchedProbeStream { + pub fn create(size: usize) -> Box { + Box::new(AllUnmatchedProbeStream { + idx: 0, + size: size as u64, + }) + } +} + +impl ProbeStream for AllUnmatchedProbeStream { + fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { + if self.idx >= self.size { + return Ok(()); + } + + let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); + rows.unmatched.extend(self.idx..self.idx + unmatched_rows); + self.idx += unmatched_rows; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs index 82ea33dbe7ae6..a428a109385c1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs @@ -22,16 +22,16 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::ProjectedBlock; use databend_common_hashtable::HashtableKeyable; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use 
crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RawEntry; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl FixedKeyHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs index 
0416169c7cb23..079240f90ec68 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs @@ -22,17 +22,17 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl SerializerHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs similarity index 92% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs index fe6d12d307560..523d428258045 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs @@ -20,18 +20,18 @@ use databend_common_expression::HashMethodSingleBinary; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; +use super::serialize_keys::BinaryKeyProbeStream; +use super::serialize_keys::EarlyFilteringProbeStream; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::BinaryKeyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::EarlyFilteringProbeStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; impl SingleBinaryHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs index d78a4b1c625ab..c8d2bd46e102c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs @@ -24,14 +24,14 @@ use databend_common_expression::HashMethodKind; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; +use super::hybrid_state::HybridHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hybrid::hybrid_state::HybridHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceMemoryJoin; use crate::sessions::QueryContext; /// Hybrid hash join mode: @@ -212,7 +212,7 @@ impl 
Join for HybridHashJoin { } } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { match &self.mode { HybridJoinMode::Memory(join) => join.add_runtime_filter_packet(packet), HybridJoinMode::Grace(join) => join.add_runtime_filter_packet(packet), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs index 6d6cf34197208..4917ebfd0bda8 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs @@ -24,7 +24,7 @@ use databend_common_sql::plans::JoinType; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HybridHashJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceHashJoinState; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceHashJoinState; use crate::sessions::QueryContext; pub struct HybridHashJoinState { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs similarity index 75% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index a51e593766a7b..5881035096fd0 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -22,26 +22,26 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use 
databend_common_settings::Settings; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::basic_state::BasicHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; pub struct InnerHashJoin { pub(crate) basic_hash_join: BasicHashJoin, @@ -50,10 +50,6 @@ pub struct InnerHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: 
PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl InnerHashJoin { @@ -66,10 +62,6 @@ impl InnerHashJoin { nested_loop_join_threshold: usize, ) -> Result { let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -88,10 +80,6 @@ impl InnerHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } @@ -105,21 +93,27 @@ impl Join for InnerHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - 
self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { @@ -275,42 +269,3 @@ impl<'a> JoinStream for InnerHashJoinStream<'a> { } } } - -pub(super) struct InnerHashJoinFilterStream<'a> { - inner: Box, - filter_executor: &'a mut FilterExecutor, -} - -impl<'a> InnerHashJoinFilterStream<'a> { - pub fn create( - inner: Box, - filter_executor: &'a mut FilterExecutor, - ) -> Box { - Box::new(InnerHashJoinFilterStream { - inner, - filter_executor, - }) - } -} - -impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { - fn next(&mut self) -> Result> { - loop { - let Some(data_block) = self.inner.next()? else { - return Ok(None); - }; - - if data_block.is_empty() { - continue; - } - - let data_block = self.filter_executor.filter(data_block)?; - - if data_block.is_empty() { - continue; - } - - return Ok(Some(data_block)); - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs index f0447e941c71b..9dfca00690583 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs @@ -30,18 +30,18 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use 
crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs similarity index 93% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs index fe569f901d7ad..5977c3930c588 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs @@ -27,19 +27,19 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiLeftHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs similarity index 86% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs index 4ee2e762ac589..dfae7ab8aae78 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs @@ -30,19 +30,19 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiLeftHashJoin { @@ -52,10 +52,6 @@ pub struct SemiLeftHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl SemiLeftHashJoin { @@ -68,10 +64,6 @@ impl SemiLeftHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiLeftHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } @@ -107,21 +95,27 @@ impl Join for SemiLeftHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs index 150758af79896..5c09205bdc733 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod basic; +pub(crate) mod basic; mod basic_state; mod inner_join; pub mod left_join; mod left_join_anti; mod left_join_semi; +mod nested_loop; mod right_join; mod right_join_anti; mod right_join_semi; @@ -26,9 +27,7 @@ pub use basic_state::BasicHashJoinState; pub use inner_join::InnerHashJoin; pub use left_join_anti::AntiLeftHashJoin; pub use left_join_semi::SemiLeftHashJoin; +pub use nested_loop::*; pub use right_join::OuterRightHashJoin; pub use right_join_anti::AntiRightHashJoin; pub use right_join_semi::SemiRightHashJoin; -mod nested_loop; - -pub use nested_loop::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs index 89c67354391ed..21529fc43bd9d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs @@ -30,7 +30,7 @@ use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::JoinStream; use crate::pipelines::processors::transforms::NestedLoopDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; pub struct NestedLoopJoin { inner: T, @@ -89,8 +89,8 @@ impl Join for NestedLoopJoin { self.inner.final_build() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - self.inner.add_runtime_filter_packet(packet); + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { + self.inner.add_runtime_filter_packet(packet) } fn build_runtime_filter(&self) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs similarity index 87% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs index 0bc9b37712b7a..cef266511b80b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs @@ -28,21 +28,21 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; +use super::left_join::null_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use 
crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::memory::left_join::null_block; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_nullable_block; use crate::sessions::QueryContext; @@ -53,10 +53,6 @@ pub struct OuterRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -71,10 +67,6 @@ impl OuterRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? 
as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -93,10 +85,6 @@ impl OuterRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -111,21 +99,27 @@ impl Join for OuterRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs similarity index 82% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs index 74b2f1365bd87..bd35eda235a20 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs @@ -26,18 +26,18 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::right_join_semi::SemiRightHashJoinStream; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiRightHashJoin { @@ -47,10 +47,6 @@ pub struct AntiRightHashJoin { pub(crate) function_ctx: FunctionContext, 
pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -65,10 +61,6 @@ impl AntiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -87,10 +79,6 @@ impl AntiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -105,21 +93,27 @@ impl Join for AntiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - 
packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs similarity index 86% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs index 2f3d57c38e452..df860823ec4c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs @@ -27,20 +27,20 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiRightHashJoin { @@ -50,10 +50,6 @@ pub struct SemiRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -68,10 +64,6 @@ impl SemiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -108,21 +96,27 @@ impl Join for SemiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs new file mode 100644 index 0000000000000..0cef379072498 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs @@ -0,0 +1,26 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod grace; +mod hash_join_factory; +pub mod hashtable; +pub mod hybrid; +pub mod memory; +mod performance; +mod transform_hash_join; + +pub use hash_join_factory::HashJoinFactory; +pub use memory::*; +pub use performance::PerformanceContext; +pub use transform_hash_join::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs index 0f3f7cb8f560b..92743c6f2e9d4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs @@ -19,8 +19,8 @@ use databend_common_expression::FunctionContext; use databend_common_functions::BUILTIN_FUNCTIONS; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeHashStatistics; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use 
crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeHashStatistics; pub struct PerformanceContext { pub probe_result: ProbedRows, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs similarity index 94% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs index d8051592155c9..de806c34d66b5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs @@ -31,10 +31,10 @@ use databend_common_pipeline::core::ProcessorPtr; use log::info; use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; -use crate::pipelines::processors::transforms::new_hash_join::join::FinishedJoin; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; pub struct TransformHashJoin { build_port: Arc, @@ -154,6 +154,12 @@ impl Processor for TransformHashJoin { if !state.finished { state.finished = true; self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = 
self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.join.add_runtime_filter_packet(packet)?; + } } return Ok(()); }; @@ -224,14 +230,6 @@ impl Processor for TransformHashJoin { self.stage = match &mut self.stage { Stage::Build(_) => { - if let Some(builder) = self.runtime_filter_builder.take() { - let spill_happened = self.join.is_spill_happened(); - // Disable runtime filters once spilling occurs to avoid partial-build filters - // being globalized across the cluster, which can prune valid probe rows. - let packet = builder.finish(spill_happened)?; - self.join.add_runtime_filter_packet(packet); - } - let rf_build_elapsed = self.instant.elapsed() - elapsed; let _wait_res = self.stage_sync_barrier.wait().await; let before_wait = self.instant.elapsed(); diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index 0cbce7f5e7a18..b20a3738441d3 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -283,6 +283,8 @@ impl DeriveHandle for FragmentDeriveHandle { let plan_id = v.get_id(); let source_fragment_id = self.ctx.get_fragment_id(); + let exchange_kind = exchange.kind.clone(); + let exchange_keys = exchange.keys.clone(); let plan: PhysicalPlan = PhysicalPlan::new(ExchangeSink { input, @@ -327,6 +329,8 @@ impl DeriveHandle for FragmentDeriveHandle { source_fragment_id, meta: PhysicalPlanMeta::with_plan_id("ExchangeSource", plan_id), + kind: exchange_kind, + keys: exchange_keys, })); } diff --git a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs index 963363e7267d1..290924a7b261f 100644 --- a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs +++ b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs @@ -18,6 +18,7 @@ use std::collections::VecDeque; use 
std::fmt::Debug; use std::fmt::Formatter; +use databend_common_sql::executor::physical_plans::DataDistribution; use serde::Deserializer; use serde::Serializer; use serde::de::Error; @@ -69,6 +70,10 @@ impl IPhysicalPlan for SerializedPhysicalPlanRef { fn derive(&self, _: Vec) -> PhysicalPlan { unimplemented!() } + + fn output_data_distribution(&self) -> DataDistribution { + unimplemented!() + } } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs index 533ca56929a0d..0b73c7cdabc25 100644 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs @@ -25,6 +25,7 @@ use databend_common_expression::FunctionID; use databend_common_expression::RemoteExpr; use databend_common_expression::Scalar; use databend_common_expression::Value; +use databend_common_expression::aggregate::combine_group_hash_column; use databend_common_expression::type_check::check_function; use databend_common_expression::types::AccessType; use databend_common_expression::types::AnyType; @@ -35,6 +36,7 @@ use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::number::NumberScalar; use databend_common_functions::BUILTIN_FUNCTIONS; +use strength_reduce::StrengthReducedU64; use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; @@ -43,6 +45,8 @@ pub struct HashFlightScatter { func_ctx: FunctionContext, hash_key: Vec, scatter_size: usize, + raw_hash_keys: Vec, + hash_key_data_types: Vec, } impl HashFlightScatter { @@ -60,23 +64,25 @@ impl HashFlightScatter { local_pos, ); } - let hash_key = hash_keys + let raw_hash_keys: Vec = hash_keys .iter() - .map(|key| { - check_function( - None, - "siphash", - &[], - &[key.as_expr(&BUILTIN_FUNCTIONS)], 
- &BUILTIN_FUNCTIONS, - ) - }) + .map(|key| key.as_expr(&BUILTIN_FUNCTIONS)) + .collect(); + let hash_key_data_types: Vec = raw_hash_keys + .iter() + .map(|expr| expr.data_type().clone()) + .collect(); + let hash_key = raw_hash_keys + .iter() + .map(|expr| check_function(None, "siphash", &[], &[expr.clone()], &BUILTIN_FUNCTIONS)) .collect::>()?; Ok(Box::new(Self { func_ctx, scatter_size, hash_key, + raw_hash_keys, + hash_key_data_types, })) } } @@ -87,6 +93,8 @@ struct OneHashKeyFlightScatter { func_ctx: FunctionContext, indices_scalar: Expr, default_scatter_index: u64, + hash_key_expr: Expr, + hash_key_data_type: DataType, } impl OneHashKeyFlightScatter { @@ -101,6 +109,8 @@ impl OneHashKeyFlightScatter { } else { 0 }; + let hash_key_expr = hash_key.as_expr(&BUILTIN_FUNCTIONS); + let hash_key_data_type = hash_key_expr.data_type().clone(); let indices_scalar = check_function( None, "modulo", @@ -110,7 +120,7 @@ impl OneHashKeyFlightScatter { None, "siphash", &[], - &[hash_key.as_expr(&BUILTIN_FUNCTIONS)], + &[hash_key_expr.clone()], &BUILTIN_FUNCTIONS, )?, Expr::constant( @@ -126,6 +136,8 @@ impl OneHashKeyFlightScatter { func_ctx, indices_scalar, default_scatter_index, + hash_key_expr, + hash_key_data_type, })) } } @@ -155,9 +167,15 @@ impl FlightScatter for OneHashKeyFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = evaluator.run(&self.indices_scalar).unwrap(); - let indices = get_hash_values(indices, num, self.default_scatter_index)?; - Ok(Some(indices.to_vec())) + let value = evaluator.run(&self.hash_key_expr)?; + let column = value.convert_to_full_column(&self.hash_key_data_type, num); + let mut hashes = vec![0u64; num]; + combine_group_hash_column::(&column, &mut hashes); + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } 
@@ -195,18 +213,26 @@ impl FlightScatter for HashFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = if !self.hash_key.is_empty() { - let mut hash_keys = Vec::with_capacity(self.hash_key.len()); - for expr in &self.hash_key { - let indices = evaluator.run(expr).unwrap(); - let indices = get_hash_values(indices, num, 0)?; - hash_keys.push(indices) + let mut hashes = vec![0u64; num]; + for (i, (expr, dt)) in self + .raw_hash_keys + .iter() + .zip(&self.hash_key_data_types) + .enumerate() + { + let value = evaluator.run(expr)?; + let column = value.convert_to_full_column(dt, num); + if i == 0 { + combine_group_hash_column::(&column, &mut hashes); + } else { + combine_group_hash_column::(&column, &mut hashes); } - self.combine_hash_keys(&hash_keys, num) - } else { - Ok(vec![0; num]) - }?; - Ok(Some(indices)) + } + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } diff --git a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs index e20caf0a88cee..941a04b9a3bc2 100644 --- a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs +++ b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs @@ -70,7 +70,8 @@ async fn test_prewhere() -> Result<()> { let _ = _fixture; // Create ReadState which combines prewhere and runtime filter logic - let read_state = ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), &block_reader)?; + let mut read_state = + ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), &block_reader)?; // Use the new unified API that handles all states internally let (data_block, _row_selection, bitmap_selection) = diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs 
index b14f575c6206e..b3d5107774615 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -574,6 +574,20 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=u64::MAX)), }), + ("bloom_runtime_filter_selectivity_threshold", DefaultSettingValue { + value: UserSettingValue::UInt64(40), + desc: "Probe-side selectivity threshold (percentage) for bloom runtime filters. If a bloom filter filters less than this percentage of rows, it is temporarily disabled. Default 40 means 40%.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=100)), + }), + ("bloom_runtime_filter_sampling_frequency", DefaultSettingValue { + value: UserSettingValue::UInt64(32), + desc: "Number of block evaluations between re-checks of bloom runtime filter selectivity. After this many evaluations, counters reset and selectivity is re-evaluated.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(1..=u64::MAX)), + }), ("max_execute_time_in_seconds", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Sets the maximum query execution time in seconds. Setting it to 0 means no limit.", @@ -623,6 +637,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("broadcast_join_max_build_rows", DefaultSettingValue { + value: UserSettingValue::UInt64(30_000_000), + desc: "Maximum estimated build-side rows for broadcast join when partitioned hash join is enabled. 
0 means no limit.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), ("grouping_sets_to_union", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Enables grouping sets to union.", @@ -1601,6 +1622,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("enable_partitioned_hash_join", DefaultSettingValue { + value: UserSettingValue::UInt64(1), + desc: "Enables partitioned hash join for shuffle join.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("s3_storage_class", DefaultSettingValue { value: { let storage_class = Self::extract_s3_storage_class_config(&global_conf).unwrap_or_default(); diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index dca74c1d5491d..d5bb231373e95 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -365,6 +365,14 @@ impl Settings { self.try_get_u64("bloom_runtime_filter_threshold") } + pub fn get_bloom_runtime_filter_selectivity_threshold(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_selectivity_threshold") + } + + pub fn get_bloom_runtime_filter_sampling_frequency(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_sampling_frequency") + } + pub fn get_min_max_runtime_filter_threshold(&self) -> Result { self.try_get_u64("min_max_runtime_filter_threshold") } @@ -466,6 +474,10 @@ impl Settings { Ok(self.try_get_u64("enforce_shuffle_join")? != 0) } + pub fn get_broadcast_join_max_build_rows(&self) -> Result { + self.try_get_u64("broadcast_join_max_build_rows") + } + pub fn get_enable_merge_into_row_fetch(&self) -> Result { Ok(self.try_get_u64("enable_merge_into_row_fetch")? != 0) } @@ -1178,6 +1190,10 @@ impl Settings { Ok(self.try_get_u64("enable_experimental_new_join")? 
== 1) } + pub fn get_enable_partitioned_hash_join(&self) -> Result { + Ok(self.try_get_u64("enable_partitioned_hash_join")? != 0) + } + pub fn get_s3_storage_class(&self) -> Result { let s3_storage_class_setting = self.try_get_string("s3_storage_class")?; S3StorageClass::from_str(&s3_storage_class_setting).map_err(|e| { diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 3023ae01d72b2..14173aad0e89d 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -71,10 +71,19 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, - // Ping-pong based hash shuffle (used by hash join) + // Ping-pong based hash shuffle GlobalShuffle, } +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum DataDistribution { + Random, + NodeHash(Vec), + GlobalHash(Vec), + Broadcast, + Serial, +} + #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] pub enum MutationKind { Delete, diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs index af67786a076f0..58ab3875ba103 100644 --- a/src/query/sql/src/planner/plans/join.rs +++ b/src/query/sql/src/planner/plans/join.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_settings::Settings; use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS; use databend_common_statistics::Datum; use databend_common_statistics::Histogram; @@ -547,6 +548,18 @@ impl Join { .iter() .any(|expr| expr.has_subquery()) } + + fn enforce_shuffle_join(settings: &Settings, right_stat_info: &Arc) -> Result { + let max_build_rows = settings.get_broadcast_join_max_build_rows()?; + if max_build_rows > 0 + && settings.get_enable_partitioned_hash_join()? 
+ && right_stat_info.cardinality >= max_build_rows as f64 + { + return Ok(true); + } + + settings.get_enforce_shuffle_join() + } } impl Operator for Join { @@ -715,7 +728,8 @@ impl Operator for Join { // Use a very large value to prevent broadcast join. 1000.0 }; - if !settings.get_enforce_shuffle_join()? + + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? && (right_stat_info.cardinality * broadcast_join_threshold < left_stat_info.cardinality || settings.get_enforce_broadcast_join()?) @@ -752,7 +766,7 @@ impl Operator for Join { fn compute_required_prop_children( &self, ctx: Arc, - _rel_expr: &RelExpr, + rel_expr: &RelExpr, _required: &RequiredProperty, ) -> Result>> { let mut children_required = vec![]; @@ -838,19 +852,21 @@ impl Operator for Join { | JoinType::Asof | JoinType::LeftAsof | JoinType::RightAsof - ) && !settings.get_enforce_shuffle_join()? - { - // (Any, Broadcast) - let left_distribution = Distribution::Any; - let right_distribution = Distribution::Broadcast; - children_required.push(vec![ - RequiredProperty { - distribution: left_distribution, - }, - RequiredProperty { - distribution: right_distribution, - }, - ]); + ) { + let right_stat_info = rel_expr.derive_cardinality_child(1)?; + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? 
{ + // (Any, Broadcast) + let left_distribution = Distribution::Any; + let right_distribution = Distribution::Broadcast; + children_required.push(vec![ + RequiredProperty { + distribution: left_distribution, + }, + RequiredProperty { + distribution: right_distribution, + }, + ]); + } } if children_required.is_empty() { diff --git a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs index 80c51a926b9f7..87f78b1729809 100644 --- a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs +++ b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs @@ -233,7 +233,7 @@ impl Processor for DeserializeDataTransform { let (mut data_block, row_selection, bitmap_selection) = self .read_state - .as_ref() + .as_mut() .unwrap() .deserialize_and_filter(columns_chunks, part)?; diff --git a/src/query/storages/fuse/src/operations/read/read_state.rs b/src/query/storages/fuse/src/operations/read/read_state.rs index 1218f605a5920..f23a66e383834 100644 --- a/src/query/storages/fuse/src/operations/read/read_state.rs +++ b/src/query/storages/fuse/src/operations/read/read_state.rs @@ -44,11 +44,67 @@ use crate::io::DataItem; use crate::io::RowSelection; use crate::pruning::ExprBloomFilter; +const DEFAULT_MIN_INPUT_ROWS: usize = 40960; + +#[derive(Clone)] +pub struct BloomFilterSelectivity { + input_rows: usize, + filtered_rows: usize, + eval_counter: usize, + always_true: bool, + sampling_frequency: usize, + selectivity_threshold: usize, + min_input_rows: usize, +} + +impl BloomFilterSelectivity { + pub fn new(selectivity_threshold: usize, sampling_frequency: usize) -> Self { + Self { + input_rows: 0, + filtered_rows: 0, + eval_counter: 0, + always_true: false, + sampling_frequency, + selectivity_threshold, + min_input_rows: DEFAULT_MIN_INPUT_ROWS, + } + } + + pub fn should_skip(&self) -> bool { + self.always_true + } + + pub fn 
update(&mut self, block_input_rows: usize, block_filtered_rows: usize) { + self.input_rows += block_input_rows; + self.filtered_rows += block_filtered_rows; + self.eval_counter += 1; + + if self.eval_counter >= self.sampling_frequency { + self.judge_selectivity(); + self.reset(); + } + } + + fn judge_selectivity(&mut self) { + if self.input_rows >= self.min_input_rows { + let selectivity_pct = (self.filtered_rows * 100) / self.input_rows; + self.always_true = selectivity_pct < self.selectivity_threshold; + } + } + + fn reset(&mut self) { + self.input_rows = 0; + self.filtered_rows = 0; + self.eval_counter = 0; + } +} + #[derive(Clone)] pub struct BloomRuntimeFilterRef { pub column_index: FieldIndex, pub filter: RuntimeBloomFilter, pub stats: Arc, + pub selectivity: BloomFilterSelectivity, } pub struct ReadState { @@ -98,6 +154,11 @@ impl ReadState { let prewhere_schema: DataSchema = (prewhere_reader.schema().as_ref()).into(); + let settings = ctx.get_settings(); + let selectivity_threshold = + settings.get_bloom_runtime_filter_selectivity_threshold()? as usize; + let sampling_frequency = settings.get_bloom_runtime_filter_sampling_frequency()? 
as usize; + let runtime_filters: Vec = runtime_filter_entries .into_iter() .filter_map(|entry| { @@ -107,6 +168,10 @@ impl ReadState { column_index, filter: bloom.filter, stats: entry.stats, + selectivity: BloomFilterSelectivity::new( + selectivity_threshold, + sampling_frequency, + ), }) }) .collect(); @@ -147,16 +212,24 @@ impl ReadState { } pub fn runtime_filter( - &self, + &mut self, block: &DataBlock, - _num_rows: usize, + num_rows: usize, ) -> Result> { let bloom_start = Instant::now(); let mut bitmaps = vec![]; - for runtime_filter in &self.runtime_filters { + for runtime_filter in &mut self.runtime_filters { + if runtime_filter.selectivity.should_skip() { + continue; + } + let probe_column = block.get_by_offset(runtime_filter.column_index).to_column(); let bitmap = ExprBloomFilter::new(&runtime_filter.filter).apply(probe_column)?; + + let filtered_rows = bitmap.null_count(); + runtime_filter.selectivity.update(num_rows, filtered_rows); + bitmaps.push(bitmap); } @@ -175,7 +248,7 @@ impl ReadState { } pub fn deserialize_and_filter( - &self, + &mut self, columns_chunks: HashMap, part: &FuseBlockPartInfo, ) -> Result<(DataBlock, Option, Option)> {