From d0bf5dc77d86a499ba2b2237314e2c9c0dd32c8d Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 16 Mar 2026 11:04:14 +0800 Subject: [PATCH 01/38] refactor(query): add per-thread compact hash join for hash shuffle Under hash shuffle, build and probe data are already partitioned by thread. This replaces the shared hash table (atomic CAS, Mutex, Barrier) with a Doris-style compact hash table (4 bytes/row index-based chain) that each thread builds and probes independently, eliminating all synchronization overhead. - Reorganize memory/ into unpartitioned/ (broadcast) and partitioned/ (shuffle) - Add CompactJoinHashTable with index-based chaining - Add PartitionedBuild with fixed 65536-row chunks and bit-shift addressing - Implement all 7 join types for the partitioned path - Route hash shuffle joins through partitioned pipeline in physical_hash_join Co-Authored-By: Claude Opus 4.6 --- .../src/physical_plans/physical_hash_join.rs | 6 +- .../new_hash_join/hash_join_factory.rs | 60 ++++ .../transforms/new_hash_join/memory/mod.rs | 22 +- .../memory/partitioned/compact_hash_table.rs | 146 ++++++++ .../memory/partitioned/inner_join.rs | 153 ++++++++ .../memory/partitioned/left_join.rs | 283 +++++++++++++++ .../memory/partitioned/left_join_anti.rs | 141 ++++++++ .../memory/partitioned/left_join_semi.rs | 138 +++++++ .../new_hash_join/memory/partitioned/mod.rs | 34 ++ .../memory/partitioned/partitioned_build.rs | 338 ++++++++++++++++++ .../memory/partitioned/right_join.rs | 165 +++++++++ .../memory/partitioned/right_join_anti.rs | 145 ++++++++ .../memory/partitioned/right_join_semi.rs | 144 ++++++++ .../memory/{ => unpartitioned}/basic.rs | 0 .../memory/{ => unpartitioned}/basic_state.rs | 0 .../memory/{ => unpartitioned}/inner_join.rs | 0 .../memory/{ => unpartitioned}/left_join.rs | 2 +- .../{ => unpartitioned}/left_join_anti.rs | 4 +- .../{ => unpartitioned}/left_join_semi.rs | 2 +- .../new_hash_join/memory/unpartitioned/mod.rs | 33 ++ .../memory/{ => 
unpartitioned}/nested_loop.rs | 0 .../memory/{ => unpartitioned}/right_join.rs | 6 +- .../{ => unpartitioned}/right_join_anti.rs | 4 +- .../{ => unpartitioned}/right_join_semi.rs | 4 +- 24 files changed, 1799 insertions(+), 31 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/basic.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/basic_state.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/inner_join.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/left_join.rs (99%) rename 
src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/left_join_anti.rs (98%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/left_join_semi.rs (99%) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/nested_loop.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/right_join.rs (98%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/right_join_anti.rs (98%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/{ => unpartitioned}/right_join_semi.rs (98%) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index dc8f91c89eb8d..10f6f5e4c2904 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -488,7 +488,11 @@ impl HashJoin { build_input.clone(), probe_input.clone(), joined_output.clone(), - factory.create_hash_join(self.join_type, 0)?, + if self.broadcast_id.is_some() { + factory.create_partitioned_join(self.join_type)? + } else { + factory.create_hash_join(self.join_type, 0)? 
+ }, stage_sync_barrier.clone(), self.projections.clone(), rf_desc.clone(), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs index 33482a95cd320..b7d03c1aba14d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs @@ -45,6 +45,13 @@ use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedInnerJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftAntiJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftSemiJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightAntiJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightSemiJoin; use crate::sessions::QueryContext; pub struct HashJoinFactory { @@ -405,4 +412,57 @@ impl HashJoinFactory { typ, )) } + + /// Create a partitioned (per-thread) join for hash shuffle mode. + /// No shared state — each thread independently builds and probes. + pub fn create_partitioned_join(self: &Arc, typ: JoinType) -> Result> { + let settings = self.ctx.get_settings(); + let max_block_size = settings.get_max_block_size()? 
as usize; + + match typ { + JoinType::Inner => Ok(Box::new(PartitionedInnerJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::Left => Ok(Box::new(PartitionedLeftJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::LeftAnti => Ok(Box::new(PartitionedLeftAntiJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::LeftSemi => Ok(Box::new(PartitionedLeftSemiJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::Right => Ok(Box::new(PartitionedRightJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::RightSemi => Ok(Box::new(PartitionedRightSemiJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + JoinType::RightAnti => Ok(Box::new(PartitionedRightAntiJoin::create( + self.hash_method.clone(), + self.desc.clone(), + self.function_ctx.clone(), + max_block_size, + ))), + _ => unreachable!(), + } + } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs index 150758af79896..f4f32d914b27c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs @@ -12,23 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod basic; -mod basic_state; -mod inner_join; -pub mod left_join; -mod left_join_anti; -mod left_join_semi; -mod right_join; -mod right_join_anti; -mod right_join_semi; +pub mod unpartitioned; +pub mod partitioned; -pub use basic_state::BasicHashJoinState; -pub use inner_join::InnerHashJoin; -pub use left_join_anti::AntiLeftHashJoin; -pub use left_join_semi::SemiLeftHashJoin; -pub use right_join::OuterRightHashJoin; -pub use right_join_anti::AntiRightHashJoin; -pub use right_join_semi::SemiRightHashJoin; -mod nested_loop; - -pub use nested_loop::*; +pub use unpartitioned::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs new file mode 100644 index 0000000000000..2c4ca7ae8024e --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs @@ -0,0 +1,146 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// Doris-style compact hash table for join. +/// +/// Index 0 is a sentinel (empty/chain-end). Actual rows are indexed from 1. +/// Memory per row: 4 bytes (next chain) vs current ~32 bytes (pointer-based entry). +/// +/// The table is single-threaded (no atomics) — designed for per-thread use +/// under hash shuffle where each thread independently builds and probes. 
+/// Trait for row index types. Supports u32 (up to ~4B rows) and u64. +pub trait RowIndex: + Copy + Default + Eq + Send + Sync + 'static + std::fmt::Debug +{ + const ZERO: Self; + fn from_usize(v: usize) -> Self; + fn to_usize(self) -> usize; +} + +impl RowIndex for u32 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u32 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +impl RowIndex for u64 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u64 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +/// Compact join hash table using index-based chaining. +/// +/// `first[bucket]` stores the first row index in that bucket's chain. +/// `next[row_index]` stores the next row index in the same bucket's chain. +/// Chain ends when the value is `I::ZERO` (sentinel). +pub struct CompactJoinHashTable { + /// Bucket array: first[hash & mask] = first row index (1-based) + first: Vec, + /// Chain array: next[row_index] = next row in same bucket (0 = end) + next: Vec, + /// Visited bitmap for right outer/semi/anti joins + visited: Vec, + /// Bucket count minus one, for masking + bucket_mask: usize, +} + +impl CompactJoinHashTable { + /// Create a new compact hash table for `num_rows` rows. + /// Bucket count is next power of 2 >= num_rows + (num_rows - 1) / 7. + pub fn new(num_rows: usize) -> Self { + let bucket_count = Self::calc_bucket_count(num_rows); + CompactJoinHashTable { + first: vec![I::ZERO; bucket_count], + // Index 0 is sentinel, so we need num_rows + 1 entries + next: vec![I::ZERO; num_rows + 1], + visited: Vec::new(), + bucket_mask: bucket_count - 1, + } + } + + /// Get the bucket mask for external hash computation. + pub fn bucket_mask(&self) -> usize { + self.bucket_mask + } + + /// Build the hash table from precomputed bucket numbers. + /// `bucket_nums[i]` is the bucket for row i (1-based indexing, skip index 0). 
+ pub fn build(&mut self, bucket_nums: &[usize]) { + // bucket_nums[0] is unused (sentinel), actual rows start at index 1 + for (i, bucket_num) in bucket_nums.iter().enumerate().skip(1) { + let bucket = bucket_num & self.bucket_mask; + self.next[i] = self.first[bucket]; + self.first[bucket] = I::from_usize(i); + } + } + + /// Get the first row index in the given bucket. + #[inline(always)] + pub fn first_index(&self, bucket: usize) -> I { + unsafe { *self.first.get_unchecked(bucket & self.bucket_mask) } + } + + /// Get the next row index in the chain. + #[inline(always)] + pub fn next_index(&self, row_index: I) -> I { + unsafe { *self.next.get_unchecked(row_index.to_usize()) } + } + + /// Initialize visited array for right-side join types. + pub fn init_visited(&mut self, num_rows: usize) { + self.visited = vec![0u8; num_rows + 1]; + } + + /// Mark a row as visited. + #[inline(always)] + pub fn set_visited(&mut self, row_index: I) { + unsafe { + *self.visited.get_unchecked_mut(row_index.to_usize()) = 1; + } + } + + /// Check if a row has been visited. + #[inline(always)] + pub fn is_visited(&self, row_index: usize) -> bool { + unsafe { *self.visited.get_unchecked(row_index) != 0 } + } + + /// Get a reference to the visited array (for final_probe scanning). 
+ pub fn visited(&self) -> &[u8] { + &self.visited + } + + fn calc_bucket_count(num_rows: usize) -> usize { + if num_rows == 0 { + return 1; + } + // Doris formula: num_elem + (num_elem - 1) / 7, then round up to power of 2 + let target = num_rows + (num_rows.saturating_sub(1)) / 7; + target.next_power_of_two() + } +} + diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs new file mode 100644 index 0000000000000..524ff505716ba --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs @@ -0,0 +1,153 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::NullableColumn; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::partitioned_build::PartitionedBuild; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; + +pub struct PartitionedInnerJoin { + build: PartitionedBuild, + filter_executor: Option, +} + +impl PartitionedInnerJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedInnerJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + } + } + + fn result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, + ) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut p), Some(b)) => { + p.merge_block(b); + p + } + (Some(p), None) => p, + (None, Some(b)) => b, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + for (index, (is_probe_nullable, 
is_build_nullable)) in desc.probe_to_build.iter().cloned() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(index).clone(), + (true, false) => result_block.get_by_offset(index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(index); + let col = entry.to_column(); + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + result_block.add_entry(entry); + } + result_block + } +} + +impl Join for PartitionedInnerJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let (matched_probe, matched_build, _) = self.build.probe(&data)?; + + if matched_probe.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + let projected = data.project(&self.build.desc.probe_projection); + let probe_block = match projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&projected, matched_probe.as_slice())?), + }; + + let build_block = self.build.gather_build_block(&matched_build); + + let mut result = Self::result_block( + &self.build.desc, + probe_block, + build_block, + matched_probe.len(), + ); + + if let Some(filter) = self.filter_executor.as_mut() { + result = filter.filter(result)?; + if result.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + } + + Ok(Box::new(OneBlockJoinStream(Some(result)))) + } +} + +impl GraceMemoryJoin for PartitionedInnerJoin { + fn reset_memory(&mut self) { + self.build.reset(); + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs new file mode 100644 index 0000000000000..603ee81b04e05 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs @@ -0,0 +1,283 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::Scalar; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NullableColumn; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::partitioned_build::PartitionedBuild; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::wrap_true_validity; + +pub struct PartitionedLeftJoin { + build: PartitionedBuild, + filter_executor: Option, +} + +impl PartitionedLeftJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedLeftJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + } + } +} + +impl Join for PartitionedLeftJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + let desc = &self.build.desc; + + if self.build.num_rows == 0 { + let num_rows = data.num_rows(); + let types: Vec<_> = desc + .build_schema + .fields + .iter() + .map(|x| x.data_type().clone()) + .collect(); + let build_block = null_block(&types, num_rows) + .map(|b| b.project(&desc.build_projection)); + let probe_block = Some(data.project(&desc.probe_projection)); + let result = final_result_block(desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(result)))); + } + + let (matched_probe, matched_build, unmatched) = self.build.probe(&data)?; + let num_rows = data.num_rows(); + let probe_projected = data.project(&desc.probe_projection); + + // Build matched result + let mut result_blocks = Vec::new(); + + if !matched_probe.is_empty() { + let probe_block = match 
probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), + }; + + let build_block = self.build.gather_build_block(&matched_build); + let build_block = build_block.map(|b| { + let true_validity = Bitmap::new_constant(true, matched_build.len()); + let entries = b + .columns() + .iter() + .map(|c| wrap_true_validity(c, matched_build.len(), &true_validity)); + DataBlock::from_iter(entries, matched_build.len()) + }); + + let mut matched_result = final_result_block( + desc, + probe_block, + build_block, + matched_build.len(), + ); + + if let Some(filter) = self.filter_executor.as_mut() { + let count = filter.select(&matched_result)?; + if count > 0 { + // Track which probe rows passed the filter + let true_sel = filter.true_selection(); + let mut passed = vec![false; num_rows]; + for idx in true_sel.iter().take(count) { + passed[matched_probe[*idx as usize] as usize] = true; + } + + let origin_rows = matched_result.num_rows(); + matched_result = filter.take(matched_result, origin_rows, count)?; + result_blocks.push(matched_result); + + // Unmatched = original unmatched + matched rows that failed ALL filter checks + let mut all_unmatched: Vec = unmatched; + // Rows that were matched but never passed filter + let mut matched_set = vec![false; num_rows]; + for idx in &matched_probe { + matched_set[*idx as usize] = true; + } + for i in 0..num_rows { + if matched_set[i] && !passed[i] { + all_unmatched.push(i as u64); + } + } + + if !all_unmatched.is_empty() { + let unmatched_probe = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, all_unmatched.as_slice())?), + }; + let types = &self.build.column_types; + let unmatched_build = null_block(types, all_unmatched.len()); + result_blocks.push(final_result_block( + desc, + unmatched_probe, + unmatched_build, + all_unmatched.len(), + )); + } + } else { + // All matched rows failed filter, treat all as unmatched + let 
all_indices: Vec = (0..num_rows as u64).collect(); + let unmatched_probe = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, all_indices.as_slice())?), + }; + let types = &self.build.column_types; + let unmatched_build = null_block(types, all_indices.len()); + result_blocks.push(final_result_block( + desc, + unmatched_probe, + unmatched_build, + all_indices.len(), + )); + } + } else { + result_blocks.push(matched_result); + + // Append unmatched rows with NULL build side + if !unmatched.is_empty() { + let unmatched_probe = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, unmatched.as_slice())?), + }; + let types = &self.build.column_types; + let unmatched_build = null_block(types, unmatched.len()); + result_blocks.push(final_result_block( + desc, + unmatched_probe, + unmatched_build, + unmatched.len(), + )); + } + } + } else { + // All rows unmatched + let unmatched_probe = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, unmatched.as_slice())?), + }; + let types = &self.build.column_types; + let unmatched_build = null_block(types, unmatched.len()); + result_blocks.push(final_result_block( + desc, + unmatched_probe, + unmatched_build, + unmatched.len(), + )); + } + + let result = DataBlock::concat(&result_blocks)?; + Ok(Box::new(OneBlockJoinStream(Some(result)))) + } +} + +impl GraceMemoryJoin for PartitionedLeftJoin { + fn reset_memory(&mut self) { + self.build.reset(); + } +} + +pub fn final_result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + if 
!desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + result_block.add_entry(entry); + } + } + result_block +} + +pub fn null_block(types: &[DataType], num_rows: usize) -> Option { + if types.is_empty() { + return None; + } + let columns = types + .iter() + .map(|column_type| { + BlockEntry::new_const_column(column_type.wrap_nullable(), Scalar::Null, num_rows) + }) + .collect::>(); + Some(DataBlock::new(columns, num_rows)) +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs new file mode 100644 index 0000000000000..fa32b533c84c6 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs @@ -0,0 +1,141 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::left_join::final_result_block; +use super::partitioned_build::PartitionedBuild; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; + +pub struct PartitionedLeftAntiJoin { + build: PartitionedBuild, + filter_executor: Option, +} + +impl PartitionedLeftAntiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedLeftAntiJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + } + } +} + +impl Join for PartitionedLeftAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return 
Ok(Box::new(EmptyJoinStream)); + } + + let desc = &self.build.desc; + + if self.build.num_rows == 0 { + let probe_projected = data.project(&desc.probe_projection); + return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); + } + + let (matched_probe, matched_build, unmatched) = self.build.probe(&data)?; + let probe_projected = data.project(&desc.probe_projection); + + if matched_probe.is_empty() { + // All rows are unmatched + return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); + } + + if let Some(filter) = self.filter_executor.as_mut() { + // With filter: rows that match but fail filter are still "anti" rows + let probe_block = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), + }; + let build_block = self.build.gather_build_block(&matched_build); + let result = final_result_block( + &self.build.desc, + probe_block, + build_block, + matched_probe.len(), + ); + + let count = filter.select(&result)?; + + // selected[i] = true means probe row i should be EXCLUDED (it matched and passed filter) + let mut excluded = vec![false; probe_projected.num_rows()]; + if count > 0 { + let true_sel = filter.true_selection(); + for idx in true_sel.iter().take(count) { + excluded[matched_probe[*idx as usize] as usize] = true; + } + } + + let bitmap = Bitmap::from_trusted_len_iter(excluded.iter().map(|e| !e)); + match bitmap.true_count() { + 0 => Ok(Box::new(EmptyJoinStream)), + _ => Ok(Box::new(OneBlockJoinStream(Some( + probe_projected.filter_with_bitmap(&bitmap)?, + )))), + } + } else { + // Without filter: output only unmatched rows + if unmatched.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + let result = DataBlock::take(&probe_projected, unmatched.as_slice())?; + Ok(Box::new(OneBlockJoinStream(Some(result)))) + } + } +} + +impl GraceMemoryJoin for PartitionedLeftAntiJoin { + fn reset_memory(&mut self) { + self.build.reset(); + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs new file mode 100644 index 0000000000000..82a1ef8c84c66 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs @@ -0,0 +1,138 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::left_join::final_result_block; +use super::partitioned_build::PartitionedBuild; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; + +pub struct PartitionedLeftSemiJoin { + build: PartitionedBuild, + filter_executor: Option, +} + +impl PartitionedLeftSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedLeftSemiJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + } + } +} + +impl Join for PartitionedLeftSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let desc = &self.build.desc; + let (matched_probe, 
matched_build, _) = self.build.probe(&data)?; + + if matched_probe.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_projected = data.project(&desc.probe_projection); + + if let Some(filter) = self.filter_executor.as_mut() { + // With filter: build full result, apply filter, deduplicate probe indices + let probe_block = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), + }; + let build_block = self.build.gather_build_block(&matched_build); + let result = final_result_block( + &self.build.desc, + probe_block, + build_block, + matched_probe.len(), + ); + + let count = filter.select(&result)?; + if count == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let true_sel = filter.true_selection(); + let mut selected = vec![false; probe_projected.num_rows()]; + for idx in true_sel.iter().take(count) { + selected[matched_probe[*idx as usize] as usize] = true; + } + + let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter()); + match bitmap.true_count() { + 0 => Ok(Box::new(EmptyJoinStream)), + _ => Ok(Box::new(OneBlockJoinStream(Some( + probe_projected.filter_with_bitmap(&bitmap)?, + )))), + } + } else { + // Without filter: deduplicate matched probe indices + let mut selected = vec![false; probe_projected.num_rows()]; + for idx in &matched_probe { + selected[*idx as usize] = true; + } + let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter()); + Ok(Box::new(OneBlockJoinStream(Some( + probe_projected.filter_with_bitmap(&bitmap)?, + )))) + } + } +} + +impl GraceMemoryJoin for PartitionedLeftSemiJoin { + fn reset_memory(&mut self) { + self.build.reset(); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs new file mode 100644 index 0000000000000..00c83007a4c65 --- /dev/null +++ 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs @@ -0,0 +1,34 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod compact_hash_table; +mod partitioned_build; +mod inner_join; +mod left_join; +mod left_join_anti; +mod left_join_semi; +mod right_join; +mod right_join_anti; +mod right_join_semi; + +pub use compact_hash_table::CompactJoinHashTable; +pub use compact_hash_table::RowIndex; +pub use partitioned_build::PartitionedBuild; +pub use inner_join::PartitionedInnerJoin; +pub use left_join::PartitionedLeftJoin; +pub use left_join_anti::PartitionedLeftAntiJoin; +pub use left_join_semi::PartitionedLeftSemiJoin; +pub use right_join::PartitionedRightJoin; +pub use right_join_anti::PartitionedRightAntiJoin; +pub use right_join_semi::PartitionedRightSemiJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs new file mode 100644 index 0000000000000..8069c815a7ceb --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -0,0 +1,338 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use databend_common_exception::Result;
+use databend_common_expression::Column;
+use databend_common_expression::ColumnVec;
+use databend_common_expression::DataBlock;
+use databend_common_expression::FunctionContext;
+use databend_common_expression::HashMethod;
+use databend_common_expression::HashMethodKind;
+use databend_common_expression::KeysState;
+use databend_common_expression::ProjectedBlock;
+use databend_common_expression::types::DataType;
+use databend_common_expression::with_hash_method;
+
+use super::compact_hash_table::CompactJoinHashTable;
+use crate::pipelines::processors::HashJoinDesc;
+use crate::pipelines::processors::transforms::hash_join_table::RowPtr;
+
+pub const CHUNK_BITS: usize = 16;
+pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536
+const CHUNK_MASK: usize = CHUNK_SIZE - 1;
+
+/// Convert a 1-based flat index to RowPtr (chunk_index, row_offset).
+#[inline(always)]
+pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr {
+    let zero_based = flat_index - 1;
+    RowPtr {
+        chunk_index: (zero_based >> CHUNK_BITS) as u32,
+        row_index: (zero_based & CHUNK_MASK) as u32,
+    }
+}
+
+/// Per-thread build state for partitioned hash join.
+pub struct PartitionedBuild {
+    /// Build blocks, each strictly CHUNK_SIZE rows (last may be shorter).
+    pub chunks: Vec<DataBlock>,
+    /// Per-chunk build key states for key comparison during probe.
+    pub build_keys_states: Vec<KeysState>,
+    /// Compact hash table (u32 row indices, 1-based).
+    pub hash_table: CompactJoinHashTable,
+    /// Build columns in ColumnVec format for fast gather.
+    pub columns: Vec<ColumnVec>,
+    /// Column types for build side.
+    pub column_types: Vec<DataType>,
+    /// Total build rows.
+    pub num_rows: usize,
+    /// Hash method for key extraction.
+    pub method: HashMethodKind,
+    /// Join descriptor.
+    pub desc: Arc<HashJoinDesc>,
+    /// Function context.
+    pub function_ctx: FunctionContext,
+    /// Accumulator for fixed-size chunks.
+    squash_buffer: Vec<DataBlock>,
+    squash_rows: usize,
+}
+
+impl PartitionedBuild {
+    pub fn create(
+        method: HashMethodKind,
+        desc: Arc<HashJoinDesc>,
+        function_ctx: FunctionContext,
+    ) -> Self {
+        PartitionedBuild {
+            chunks: Vec::new(),
+            build_keys_states: Vec::new(),
+            hash_table: CompactJoinHashTable::new(0),
+            columns: Vec::new(),
+            column_types: Vec::new(),
+            num_rows: 0,
+            method,
+            desc,
+            function_ctx,
+            squash_buffer: Vec::new(),
+            squash_rows: 0,
+        }
+    }
+
+    /// Push a build block. None signals end of input.
+    pub fn add_block(&mut self, data: Option<DataBlock>) -> Result<()> {
+        match data {
+            Some(block) if !block.is_empty() => {
+                self.squash_rows += block.num_rows();
+                self.squash_buffer.push(block);
+                while self.squash_rows >= CHUNK_SIZE {
+                    self.flush_one_chunk()?;
+                }
+            }
+            _ => {
+                if !self.squash_buffer.is_empty() {
+                    let block = DataBlock::concat(&std::mem::take(&mut self.squash_buffer))?;
+                    if !block.is_empty() {
+                        self.num_rows += block.num_rows();
+                        self.chunks.push(block);
+                    }
+                    self.squash_rows = 0;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Finalize build: extract keys, compute hashes, build compact hash table, extract ColumnVec.
+ pub fn final_build(&mut self) -> Result<()> { + if self.num_rows == 0 { + return Ok(()); + } + + let mut all_keys_states = Vec::with_capacity(self.chunks.len()); + let mut all_hashes = Vec::with_capacity(self.num_rows); + + with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + for chunk in &self.chunks { + let keys_entries = self.desc.build_key(chunk, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, chunk.num_rows()); + self.desc.remove_keys_nullable(&mut keys_block); + let keys = ProjectedBlock::from(keys_block.columns()); + let keys_state = method.build_keys_state(keys, chunk.num_rows())?; + method.build_keys_hashes(&keys_state, &mut all_hashes); + all_keys_states.push(keys_state); + } + } + }); + + // Build compact hash table (1-indexed) + let mut bucket_nums = vec![0usize; self.num_rows + 1]; + self.hash_table = CompactJoinHashTable::new(self.num_rows); + let bucket_mask = self.hash_table.bucket_mask(); + for (i, h) in all_hashes.iter().enumerate() { + bucket_nums[i + 1] = (*h as usize) & bucket_mask; + } + self.hash_table.build(&bucket_nums); + + // Project build columns and extract ColumnVec + if let Some(first_chunk) = self.chunks.first() { + let first_projected = first_chunk.clone().project(&self.desc.build_projection); + self.column_types = (0..first_projected.num_columns()) + .map(|offset| first_projected.get_by_offset(offset).data_type()) + .collect(); + + let num_cols = first_projected.num_columns(); + let mut columns = Vec::with_capacity(num_cols); + for offset in 0..num_cols { + let full_columns: Vec = self + .chunks + .iter() + .map(|chunk| { + chunk + .clone() + .project(&self.desc.build_projection) + .get_by_offset(offset) + .to_column() + }) + .collect(); + columns.push(Column::take_downcast_column_vec(&full_columns)); + } + self.columns = columns; + } + + self.build_keys_states = all_keys_states; + Ok(()) + } + + fn flush_one_chunk(&mut self) -> Result<()> { + let concat = 
DataBlock::concat(&std::mem::take(&mut self.squash_buffer))?;
+        let chunk = concat.slice(0..CHUNK_SIZE).maybe_gc();
+        let remain_rows = concat.num_rows() - CHUNK_SIZE;
+        if remain_rows > 0 {
+            let remain = concat.slice(CHUNK_SIZE..concat.num_rows()).maybe_gc();
+            self.squash_buffer.push(remain);
+        }
+        self.squash_rows = remain_rows;
+        self.num_rows += chunk.num_rows();
+        self.chunks.push(chunk);
+        Ok(())
+    }
+
+    pub fn reset(&mut self) {
+        self.chunks.clear();
+        self.build_keys_states.clear();
+        self.hash_table = CompactJoinHashTable::new(0);
+        self.columns.clear();
+        self.column_types.clear();
+        self.num_rows = 0;
+        self.squash_buffer.clear();
+        self.squash_rows = 0;
+    }
+
+    /// Probe the hash table with a data block. Returns matched pairs and unmatched probe indices.
+    /// For each probe row, walks the hash chain and compares keys.
+    pub fn probe(
+        &self,
+        data: &DataBlock,
+    ) -> Result<(Vec<u64>, Vec<RowPtr>, Vec<u64>)> {
+        if self.num_rows == 0 {
+            let unmatched: Vec<u64> = (0..data.num_rows() as u64).collect();
+            return Ok((vec![], vec![], unmatched));
+        }
+
+        let probe_keys_entries = self.desc.probe_key(data, &self.function_ctx)?;
+        let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows());
+        self.desc.remove_keys_nullable(&mut probe_keys_block);
+
+        let mut matched_probe = Vec::new();
+        let mut matched_build = Vec::new();
+        let mut has_match = vec![false; data.num_rows()];
+
+        with_hash_method!(|T| match &self.method {
+            HashMethodKind::T(method) => {
+                let keys = ProjectedBlock::from(probe_keys_block.columns());
+                let probe_ks = method.build_keys_state(keys, data.num_rows())?;
+                let mut probe_hashes = Vec::with_capacity(data.num_rows());
+                method.build_keys_hashes(&probe_ks, &mut probe_hashes);
+
+                let probe_acc = method.build_keys_accessor(probe_ks)?;
+                let build_accs: Vec<_> = self
+                    .build_keys_states
+                    .iter()
+                    .map(|ks| method.build_keys_accessor(ks.clone()))
+                    .collect::<Result<Vec<_>>>()?;
+
+                let bucket_mask = self.hash_table.bucket_mask();
+                for probe_idx in 0..data.num_rows() {
+                    let bucket = (probe_hashes[probe_idx] as usize) & bucket_mask;
+                    let mut build_idx = self.hash_table.first_index(bucket);
+                    while build_idx != 0 {
+                        let bi = build_idx as usize;
+                        let chunk_idx = (bi - 1) >> CHUNK_BITS;
+                        let offset = (bi - 1) & CHUNK_MASK;
+                        let build_key = unsafe { build_accs[chunk_idx].key_unchecked(offset) };
+                        let probe_key = unsafe { probe_acc.key_unchecked(probe_idx) };
+                        if build_key == probe_key {
+                            has_match[probe_idx] = true;
+                            matched_probe.push(probe_idx as u64);
+                            matched_build.push(flat_to_row_ptr(bi));
+                        }
+                        build_idx = self.hash_table.next_index(build_idx);
+                    }
+                }
+            }
+        });
+
+        let unmatched: Vec<u64> = has_match
+            .iter()
+            .enumerate()
+            .filter(|(_, m)| !**m)
+            .map(|(i, _)| i as u64)
+            .collect();
+
+        Ok((matched_probe, matched_build, unmatched))
+    }
+
+    /// Probe and mark visited build rows (for right join types).
+    pub fn probe_and_mark_visited(
+        &mut self,
+        data: &DataBlock,
+    ) -> Result<(Vec<u64>, Vec<RowPtr>)> {
+        if self.num_rows == 0 {
+            return Ok((vec![], vec![]));
+        }
+
+        let probe_keys_entries = self.desc.probe_key(data, &self.function_ctx)?;
+        let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows());
+        self.desc.remove_keys_nullable(&mut probe_keys_block);
+
+        let mut matched_probe = Vec::new();
+        let mut matched_build = Vec::new();
+
+        with_hash_method!(|T| match &self.method {
+            HashMethodKind::T(method) => {
+                let keys = ProjectedBlock::from(probe_keys_block.columns());
+                let probe_ks = method.build_keys_state(keys, data.num_rows())?;
+                let mut probe_hashes = Vec::with_capacity(data.num_rows());
+                method.build_keys_hashes(&probe_ks, &mut probe_hashes);
+
+                let probe_acc = method.build_keys_accessor(probe_ks)?;
+                let build_accs: Vec<_> = self
+                    .build_keys_states
+                    .iter()
+                    .map(|ks| method.build_keys_accessor(ks.clone()))
+                    .collect::<Result<Vec<_>>>()?;
+
+                let bucket_mask = self.hash_table.bucket_mask();
+                for probe_idx in 0..data.num_rows() {
+                    let bucket = (probe_hashes[probe_idx] as usize) & bucket_mask;
+                    let mut build_idx = self.hash_table.first_index(bucket);
+                    while build_idx != 0 {
+                        let bi = build_idx as usize;
+                        let chunk_idx = (bi - 1) >> CHUNK_BITS;
+                        let offset = (bi - 1) & CHUNK_MASK;
+                        let build_key = unsafe { build_accs[chunk_idx].key_unchecked(offset) };
+                        let probe_key = unsafe { probe_acc.key_unchecked(probe_idx) };
+                        if build_key == probe_key {
+                            matched_probe.push(probe_idx as u64);
+                            matched_build.push(flat_to_row_ptr(bi));
+                            self.hash_table.set_visited(build_idx);
+                        }
+                        build_idx = self.hash_table.next_index(build_idx);
+                    }
+                }
+            }
+        });
+
+        Ok((matched_probe, matched_build))
+    }
+
+    /// Initialize visited tracking for right-side join types.
+    pub fn init_visited(&mut self) {
+        self.hash_table.init_visited(self.num_rows);
+    }
+
+    /// Gather build columns for the given row pointers.
+    pub fn gather_build_block(&self, row_ptrs: &[RowPtr]) -> Option<DataBlock> {
+        if self.columns.is_empty() {
+            return None;
+        }
+        Some(DataBlock::take_column_vec(
+            &self.columns,
+            &self.column_types,
+            row_ptrs,
+        ))
+    }
+}
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs
new file mode 100644
index 0000000000000..97dcd7774b9b7
--- /dev/null
+++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs
@@ -0,0 +1,165 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::left_join::final_result_block; +use super::left_join::null_block; +use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::flat_to_row_ptr; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::wrap_nullable_block; + +pub struct PartitionedRightJoin { + build: PartitionedBuild, + filter_executor: Option, + finished: bool, +} + +impl PartitionedRightJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedRightJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + finished: false, + } + } +} + +impl Join for PartitionedRightJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + 
self.build.final_build()?; + self.build.init_visited(); + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let desc = self.build.desc.clone(); + let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; + + if matched_probe.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_projected = data.project(&desc.probe_projection); + let probe_block = match probe_projected.num_columns() { + 0 => None, + _ => Some(wrap_nullable_block(&DataBlock::take( + &probe_projected, + matched_probe.as_slice(), + )?)), + }; + + let build_block = self.build.gather_build_block(&matched_build); + + let mut result = final_result_block( + &desc, + probe_block, + build_block, + matched_build.len(), + ); + + if let Some(filter) = self.filter_executor.as_mut() { + result = filter.filter(result)?; + if result.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + } + + Ok(Box::new(OneBlockJoinStream(Some(result)))) + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let desc = self.build.desc.clone(); + let mut unvisited_ptrs = Vec::new(); + + // Scan visited array (1-based indexing) + for i in 1..=self.build.num_rows { + if !self.build.hash_table.is_visited(i) { + unvisited_ptrs.push(flat_to_row_ptr(i)); + } + } + + if unvisited_ptrs.is_empty() { + return Ok(None); + } + + // Build NULL probe block + let mut probe_types = Vec::new(); + for (i, field) in desc.probe_schema.fields().iter().enumerate() { + if desc.probe_projection.contains(&i) { + probe_types.push(field.data_type().clone()); + } + } + let probe_block = null_block(&probe_types, unvisited_ptrs.len()); + let build_block = self.build.gather_build_block(&unvisited_ptrs); + + let result = final_result_block( + &desc, + probe_block, + build_block, + unvisited_ptrs.len(), + ); + + 
Ok(Some(Box::new(OneBlockJoinStream(Some(result))))) + } +} + +impl GraceMemoryJoin for PartitionedRightJoin { + fn reset_memory(&mut self) { + self.finished = false; + self.build.reset(); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs new file mode 100644 index 0000000000000..1bb91bd0b59a9 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs @@ -0,0 +1,145 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::left_join::final_result_block; +use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::flat_to_row_ptr; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; + +pub struct PartitionedRightAntiJoin { + build: PartitionedBuild, + filter_executor: Option, + finished: bool, +} + +impl PartitionedRightAntiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedRightAntiJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + finished: false, + } + } +} + +impl Join for PartitionedRightAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + self.build.init_visited(); + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return 
Ok(Box::new(EmptyJoinStream)); + } + + let desc = self.build.desc.clone(); + + if let Some(filter) = self.filter_executor.as_mut() { + let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; + if !matched_probe.is_empty() { + let probe_projected = data.project(&desc.probe_projection); + let probe_block = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), + }; + let build_block = self.build.gather_build_block(&matched_build); + let result = final_result_block(&desc, probe_block, build_block, matched_probe.len()); + + if !result.is_empty() { + let _count = filter.select(&result)?; + // For anti join with filter: probe_and_mark_visited marks all matches. + // Ideally we'd only mark rows that pass the filter, but since + // probe_and_mark_visited already marked them, this is conservative + // (may exclude some rows that should be in anti output). + // This matches the unpartitioned behavior where scan_map is set + // during probe regardless of filter. 
+ } + } + } else { + let _ = self.build.probe_and_mark_visited(&data)?; + } + + // Right anti doesn't output during probe phase + Ok(Box::new(EmptyJoinStream)) + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let mut unvisited_ptrs = Vec::new(); + for i in 1..=self.build.num_rows { + if !self.build.hash_table.is_visited(i) { + unvisited_ptrs.push(flat_to_row_ptr(i)); + } + } + + if unvisited_ptrs.is_empty() { + return Ok(None); + } + + let build_block = self.build.gather_build_block(&unvisited_ptrs); + match build_block { + Some(block) => Ok(Some(Box::new(OneBlockJoinStream(Some(block))))), + None => Ok(Some(Box::new(OneBlockJoinStream(Some( + DataBlock::new(vec![], unvisited_ptrs.len()), + ))))), + } + } +} + +impl GraceMemoryJoin for PartitionedRightAntiJoin { + fn reset_memory(&mut self) { + self.finished = false; + self.build.reset(); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs new file mode 100644 index 0000000000000..b755b7862d8e3 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs @@ -0,0 +1,144 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use super::left_join::final_result_block; +use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::flat_to_row_ptr; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; + +pub struct PartitionedRightSemiJoin { + build: PartitionedBuild, + filter_executor: Option, + finished: bool, +} + +impl PartitionedRightSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let filter_executor = desc.other_predicate.as_ref().map(|predicate| { + FilterExecutor::new( + predicate.clone(), + function_ctx.clone(), + max_block_size, + None, + &BUILTIN_FUNCTIONS, + false, + ) + }); + PartitionedRightSemiJoin { + build: PartitionedBuild::create(method, desc, function_ctx), + filter_executor, + finished: false, + } + } +} + +impl Join for PartitionedRightSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build()?; + self.build.init_visited(); + Ok(None) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return 
Ok(Box::new(EmptyJoinStream)); + } + + let desc = self.build.desc.clone(); + + if let Some(filter) = self.filter_executor.as_mut() { + let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; + if !matched_probe.is_empty() { + let probe_projected = data.project(&desc.probe_projection); + let probe_block = match probe_projected.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), + }; + let build_block = self.build.gather_build_block(&matched_build); + let result = final_result_block(&desc, probe_block, build_block, matched_probe.len()); + + if !result.is_empty() { + let count = filter.select(&result)?; + if count > 0 { + // probe_and_mark_visited already marked all matches as visited, + // which is correct for semi join (any match suffices) + } + } + } + } else { + // Without filter, probe_and_mark_visited already marks all matched build rows + let _ = self.build.probe_and_mark_visited(&data)?; + } + + // Right semi doesn't output during probe phase + Ok(Box::new(EmptyJoinStream)) + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let mut visited_ptrs = Vec::new(); + for i in 1..=self.build.num_rows { + if self.build.hash_table.is_visited(i) { + visited_ptrs.push(flat_to_row_ptr(i)); + } + } + + if visited_ptrs.is_empty() { + return Ok(None); + } + + let build_block = self.build.gather_build_block(&visited_ptrs); + match build_block { + Some(block) => Ok(Some(Box::new(OneBlockJoinStream(Some(block))))), + None => Ok(Some(Box::new(OneBlockJoinStream(Some( + DataBlock::new(vec![], visited_ptrs.len()), + ))))), + } + } +} + +impl GraceMemoryJoin for PartitionedRightSemiJoin { + fn reset_memory(&mut self) { + self.finished = false; + self.build.reset(); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/inner_join.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/inner_join.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs index f0447e941c71b..ce62243ee82f2 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs @@ -40,7 +40,7 @@ use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::P 
use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; +use super::basic::BasicHashJoin; use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs index fe569f901d7ad..024374a26c0e4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs @@ -31,8 +31,8 @@ use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs index 0d0046cf24752..dfd0f50f03e5e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs @@ -35,7 +35,7 @@ use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; +use super::basic::BasicHashJoin; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs new file mode 100644 index 0000000000000..0a1b217a2cb02 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs @@ -0,0 +1,33 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod basic; +mod basic_state; +mod inner_join; +pub mod left_join; +mod left_join_anti; +mod left_join_semi; +mod right_join; +mod right_join_anti; +mod right_join_semi; +mod nested_loop; + +pub use basic_state::BasicHashJoinState; +pub use inner_join::InnerHashJoin; +pub use left_join_anti::AntiLeftHashJoin; +pub use left_join_semi::SemiLeftHashJoin; +pub use right_join::OuterRightHashJoin; +pub use right_join_anti::AntiRightHashJoin; +pub use right_join_semi::SemiRightHashJoin; +pub use nested_loop::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/nested_loop.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/nested_loop.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs index d718d3bf7de6a..c3046b7ea3417 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs @@ -34,9 +34,9 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::memory::left_join::null_block; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; +use super::left_join::null_block; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs index 5a9389954fbdc..efb8a48013fc2 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs @@ -32,8 +32,8 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use 
crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::right_join_semi::SemiRightHashJoinStream; +use super::basic::BasicHashJoin; +use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs index 12e780d398a63..255b079a92504 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs @@ -33,8 +33,8 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; From 
1f9f2f392e8cf9c5209a6fc316e9037ea5b62778 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 16 Mar 2026 15:10:37 +0800 Subject: [PATCH 02/38] refactor(query): streaming CompactProbeStream + per-join-type JoinStream for partitioned hash join Move visited bitmap from CompactJoinHashTable to PartitionedBuild so the hash table is fully immutable after build. Introduce CompactProbeStream implementing the ProbeStream trait for streaming probe with index-based chaining. Replace eager probe()/probe_and_mark_visited() with streaming create_probe_matched/create_probe factory methods. Rewrite all 7 join types (inner, left, left semi, left anti, right, right semi, right anti) with dedicated streaming JoinStream implementations. Right-side joins use field-level split borrowing to avoid borrow conflicts between immutable hash table access and mutable visited marking. Co-Authored-By: Claude Opus 4.6 --- .../memory/partitioned/chunk_accumulator.rs | 244 ++++++++++++ .../memory/partitioned/compact_hash_table.rs | 48 +-- .../partitioned/compact_probe_stream.rs | 190 ++++++++++ .../memory/partitioned/inner_join.rs | 159 +++++--- .../memory/partitioned/left_join.rs | 356 +++++++++--------- .../memory/partitioned/left_join_anti.rs | 147 +++++--- .../memory/partitioned/left_join_semi.rs | 149 +++++--- .../new_hash_join/memory/partitioned/mod.rs | 6 +- .../memory/partitioned/partitioned_build.rs | 253 +++++-------- .../memory/partitioned/right_join.rs | 232 +++++++++--- .../memory/partitioned/right_join_anti.rs | 199 +++++++--- .../memory/partitioned/right_join_semi.rs | 199 +++++++--- 12 files changed, 1470 insertions(+), 712 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs new file mode 100644 index 0000000000000..9e53190dc92f5 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs @@ -0,0 +1,244 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::ColumnBuilder; +use databend_common_expression::DataBlock; + +/// Accumulates rows from input blocks into fixed-size output chunks +/// using mutable ColumnBuilders. When the accumulated rows reach +/// `chunk_size`, a chunk is flushed and returned. +/// +/// This avoids the overhead of `DataBlock::concat()` + `slice()` by +/// directly appending rows into builders. +pub struct FixedSizeChunkAccumulator { + chunk_size: usize, + builders: Option>, + builder_rows: usize, +} + +impl FixedSizeChunkAccumulator { + pub fn new(chunk_size: usize) -> Self { + FixedSizeChunkAccumulator { + chunk_size, + builders: None, + builder_rows: 0, + } + } + + /// Accumulate a block. Returns any full chunks that were flushed. 
+ pub fn accumulate(&mut self, block: DataBlock) -> Vec { + let mut output = Vec::new(); + self.append_block(block, &mut output); + output + } + + /// Flush remaining rows as the last (possibly shorter) chunk. + pub fn flush(&mut self) -> Option { + if self.builder_rows == 0 { + return None; + } + Some(self.build_chunk()) + } + + /// Reset the accumulator, discarding any buffered rows. + pub fn reset(&mut self) { + self.builders = None; + self.builder_rows = 0; + } + + fn ensure_builders(&mut self, block: &DataBlock) { + if self.builders.is_none() { + let builders = block + .columns() + .iter() + .map(|entry| ColumnBuilder::with_capacity(&entry.data_type(), self.chunk_size)) + .collect(); + self.builders = Some(builders); + } + } + + fn append_block(&mut self, block: DataBlock, output: &mut Vec) { + self.ensure_builders(&block); + + let block_rows = block.num_rows(); + let columns: Vec = block + .take_columns() + .into_iter() + .map(|e| e.to_column()) + .collect(); + + let mut offset = 0; + while offset < block_rows { + let remaining_capacity = self.chunk_size - self.builder_rows; + let rows_to_copy = (block_rows - offset).min(remaining_capacity); + + let builders = self.builders.as_mut().unwrap(); + if offset == 0 && rows_to_copy == block_rows { + for (builder, col) in builders.iter_mut().zip(columns.iter()) { + builder.append_column(col); + } + } else { + for (builder, col) in builders.iter_mut().zip(columns.iter()) { + let sliced = col.slice(offset..offset + rows_to_copy); + builder.append_column(&sliced); + } + } + + self.builder_rows += rows_to_copy; + offset += rows_to_copy; + + if self.builder_rows == self.chunk_size { + output.push(self.build_chunk()); + } + } + } + + fn build_chunk(&mut self) -> DataBlock { + let builders = self.builders.take().unwrap(); + let num_rows = self.builder_rows; + self.builder_rows = 0; + + // Reinitialize builders with same column types for next chunk. 
+ let mut new_builders = Vec::with_capacity(builders.len()); + let mut columns = Vec::with_capacity(builders.len()); + for b in builders { + let dt = b.data_type(); + columns.push(BlockEntry::from(b.build())); + new_builders.push(ColumnBuilder::with_capacity(&dt, self.chunk_size)); + } + self.builders = Some(new_builders); + + DataBlock::new(columns, num_rows) + } +} + +#[cfg(test)] +mod tests { + use databend_common_expression::DataBlock; + use databend_common_expression::FromData; + use databend_common_expression::types::AccessType; + use databend_common_expression::types::Int32Type; + use databend_common_expression::types::StringType; + + use super::*; + + fn make_int_block(values: Vec) -> DataBlock { + DataBlock::new_from_columns(vec![Int32Type::from_data(values)]) + } + + fn extract_int_col(block: &DataBlock) -> Vec { + let col = block.get_by_offset(0).to_column(); + let col = Int32Type::try_downcast_column(&col).unwrap(); + col.iter().copied().collect() + } + + #[test] + fn test_single_block_under_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(4); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert!(chunks.is_empty()); + + let last = acc.flush().unwrap(); + assert_eq!(extract_int_col(&last), vec![1, 2, 3]); + } + + #[test] + fn test_exact_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + + assert!(acc.flush().is_none()); + } + + #[test] + fn test_block_larger_than_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3, 4, 5, 6, 7])); + assert_eq!(chunks.len(), 2); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + assert_eq!(extract_int_col(&chunks[1]), vec![4, 5, 6]); + + let last = acc.flush().unwrap(); + assert_eq!(extract_int_col(&last), vec![7]); + } + + #[test] + fn 
test_multiple_small_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.accumulate(make_int_block(vec![1, 2])).is_empty()); + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3, 4]); + + let last = acc.flush().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } + + #[test] + fn test_flush_empty() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.flush().is_none()); + } + + #[test] + fn test_reset() { + let mut acc = FixedSizeChunkAccumulator::new(4); + acc.accumulate(make_int_block(vec![1, 2, 3])); + acc.reset(); + assert!(acc.flush().is_none()); + } + + #[test] + fn test_multi_column_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let block = DataBlock::new_from_columns(vec![ + Int32Type::from_data(vec![1, 2, 3, 4, 5]), + StringType::from_data(vec!["a", "b", "c", "d", "e"]), + ]); + let chunks = acc.accumulate(block); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].num_rows(), 3); + assert_eq!(chunks[0].num_columns(), 2); + + let last = acc.flush().unwrap(); + assert_eq!(last.num_rows(), 2); + assert_eq!(last.num_columns(), 2); + + let int_col = Int32Type::try_downcast_column(&last.get_by_offset(0).to_column()).unwrap(); + let str_col = StringType::try_downcast_column(&last.get_by_offset(1).to_column()).unwrap(); + assert_eq!(int_col.iter().copied().collect::>(), vec![4, 5]); + let strs: Vec<&str> = str_col.iter().collect(); + assert_eq!(strs, vec!["d", "e"]); + } + + #[test] + fn test_reuse_after_flush() { + let mut acc = FixedSizeChunkAccumulator::new(2); + let chunks = acc.accumulate(make_int_block(vec![1, 2])); + assert_eq!(chunks.len(), 1); + assert!(acc.flush().is_none()); + + // Accumulator can be reused after flush + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![3, 4]); + + let last = 
acc.flush().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs index 2c4ca7ae8024e..bf3229d7264cd 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs @@ -12,17 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -/// Doris-style compact hash table for join. -/// /// Index 0 is a sentinel (empty/chain-end). Actual rows are indexed from 1. /// Memory per row: 4 bytes (next chain) vs current ~32 bytes (pointer-based entry). /// /// The table is single-threaded (no atomics) — designed for per-thread use /// under hash shuffle where each thread independently builds and probes. /// Trait for row index types. Supports u32 (up to ~4B rows) and u64. 
-pub trait RowIndex: - Copy + Default + Eq + Send + Sync + 'static + std::fmt::Debug -{ +pub trait RowIndex: Copy + Default + Eq + Send + Sync + 'static + std::fmt::Debug { const ZERO: Self; fn from_usize(v: usize) -> Self; fn to_usize(self) -> usize; @@ -62,8 +58,6 @@ pub struct CompactJoinHashTable { first: Vec, /// Chain array: next[row_index] = next row in same bucket (0 = end) next: Vec, - /// Visited bitmap for right outer/semi/anti joins - visited: Vec, /// Bucket count minus one, for masking bucket_mask: usize, } @@ -77,7 +71,6 @@ impl CompactJoinHashTable { first: vec![I::ZERO; bucket_count], // Index 0 is sentinel, so we need num_rows + 1 entries next: vec![I::ZERO; num_rows + 1], - visited: Vec::new(), bucket_mask: bucket_count - 1, } } @@ -98,6 +91,18 @@ impl CompactJoinHashTable { } } + /// Insert a chunk of rows starting at `row_offset` (1-based). + /// `hashes[i]` is the hash for the row at flat index `row_offset + i`. + pub fn insert_chunk(&mut self, hashes: &[u64], row_offset: usize) { + let mask = self.bucket_mask; + for (i, h) in hashes.iter().enumerate() { + let row_index = row_offset + i; + let bucket = (*h as usize) & mask; + self.next[row_index] = self.first[bucket]; + self.first[bucket] = I::from_usize(row_index); + } + } + /// Get the first row index in the given bucket. #[inline(always)] pub fn first_index(&self, bucket: usize) -> I { @@ -110,37 +115,12 @@ impl CompactJoinHashTable { unsafe { *self.next.get_unchecked(row_index.to_usize()) } } - /// Initialize visited array for right-side join types. - pub fn init_visited(&mut self, num_rows: usize) { - self.visited = vec![0u8; num_rows + 1]; - } - - /// Mark a row as visited. - #[inline(always)] - pub fn set_visited(&mut self, row_index: I) { - unsafe { - *self.visited.get_unchecked_mut(row_index.to_usize()) = 1; - } - } - - /// Check if a row has been visited. 
- #[inline(always)] - pub fn is_visited(&self, row_index: usize) -> bool { - unsafe { *self.visited.get_unchecked(row_index) != 0 } - } - - /// Get a reference to the visited array (for final_probe scanning). - pub fn visited(&self) -> &[u8] { - &self.visited - } - fn calc_bucket_count(num_rows: usize) -> usize { if num_rows == 0 { return 1; } - // Doris formula: num_elem + (num_elem - 1) / 7, then round up to power of 2 + let target = num_rows + (num_rows.saturating_sub(1)) / 7; target.next_power_of_two() } } - diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs new file mode 100644 index 0000000000000..8f85880d46c3e --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs @@ -0,0 +1,190 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodKind; +use databend_common_expression::KeyAccessor; +use databend_common_expression::KeysState; +use databend_common_expression::ProjectedBlock; +use databend_common_expression::with_hash_method; + +use super::compact_hash_table::CompactJoinHashTable; +use super::partitioned_build::CHUNK_BITS; +use super::partitioned_build::CHUNK_SIZE; +use super::partitioned_build::flat_to_row_ptr; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; + +const CHUNK_MASK: usize = CHUNK_SIZE - 1; + +struct CompactProbeStream<'a, Key: ?Sized + Eq, const MATCHED: bool> { + key_idx: usize, + build_idx: u32, + matched_num_rows: usize, + + probe_hashes: Vec, + bucket_mask: usize, + probe_acc: Box>, + build_accs: Vec>>, + + hash_table: &'a CompactJoinHashTable, +} + +impl<'a, Key: ?Sized + Eq + Send + Sync + 'static, const MATCHED: bool> ProbeStream + for CompactProbeStream<'a, Key, MATCHED> +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_acc.len() { + if res.matched_probe.len() >= max_rows { + break; + } + + if self.build_idx == 0 { + let bucket = (self.probe_hashes[self.key_idx] as usize) & self.bucket_mask; + self.build_idx = self.hash_table.first_index(bucket); + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + self.key_idx += 1; + continue; + } + } + + let probe_key = unsafe { self.probe_acc.key_unchecked(self.key_idx) }; + + while self.build_idx != 0 { + let bi = self.build_idx as usize; + let chunk_idx = (bi - 1) >> CHUNK_BITS; + let offset = (bi - 1) & CHUNK_MASK; + let 
build_key = unsafe { self.build_accs[chunk_idx].key_unchecked(offset) }; + + if build_key == probe_key { + res.matched_probe.push(self.key_idx as u64); + res.matched_build.push(flat_to_row_ptr(bi)); + self.matched_num_rows += 1; + + if res.matched_probe.len() >= max_rows { + self.build_idx = self.hash_table.next_index(self.build_idx); + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + return Ok(()); + } + } + self.build_idx = self.hash_table.next_index(self.build_idx); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + self.key_idx += 1; + self.matched_num_rows = 0; + } + Ok(()) + } +} + +fn create_probe_stream_inner<'a, M: HashMethod, const MATCHED: bool>( + method: &M, + hash_table: &'a CompactJoinHashTable, + build_keys_states: &'a [KeysState], + desc: &HashJoinDesc, + function_ctx: &FunctionContext, + data: &DataBlock, +) -> Result> +where + M::HashKey: Send + Sync, +{ + let probe_keys_entries = desc.probe_key(data, function_ctx)?; + let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows()); + desc.remove_keys_nullable(&mut probe_keys_block); + + let keys = ProjectedBlock::from(probe_keys_block.columns()); + let probe_ks = method.build_keys_state(keys, data.num_rows())?; + let mut probe_hashes = Vec::with_capacity(data.num_rows()); + method.build_keys_hashes(&probe_ks, &mut probe_hashes); + + let probe_acc = method.build_keys_accessor(probe_ks)?; + let build_accs = build_keys_states + .iter() + .map(|ks| method.build_keys_accessor(ks.clone())) + .collect::>>()?; + + let bucket_mask = hash_table.bucket_mask(); + + Ok(Box::new(CompactProbeStream::<'a, M::HashKey, MATCHED> { + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + probe_hashes, + bucket_mask, + probe_acc, + build_accs, + hash_table, + })) +} + +/// Create a CompactProbeStream that only tracks matched rows (MATCHED=true). +/// For inner join, left semi, right series. 
+pub fn create_compact_probe_matched<'a>( + hash_table: &'a CompactJoinHashTable, + build_keys_states: &'a [KeysState], + method: &HashMethodKind, + desc: &HashJoinDesc, + function_ctx: &FunctionContext, + data: &DataBlock, +) -> Result> { + with_hash_method!(|T| match method { + HashMethodKind::T(method) => { + create_probe_stream_inner::<_, true>( + method, + hash_table, + build_keys_states, + desc, + function_ctx, + data, + ) + } + }) +} + +/// Create a CompactProbeStream that also tracks unmatched rows (MATCHED=false). +/// For left join, left anti. +pub fn create_compact_probe<'a>( + hash_table: &'a CompactJoinHashTable, + build_keys_states: &'a [KeysState], + method: &HashMethodKind, + desc: &HashJoinDesc, + function_ctx: &FunctionContext, + data: &DataBlock, +) -> Result> { + with_hash_method!(|T| match method { + HashMethodKind::T(method) => { + create_probe_stream_inner::<_, false>( + method, + hash_table, + build_keys_states, + desc, + function_ctx, + data, + ) + } + }) +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs index 524ff505716ba..0b5ce268ed40f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs @@ -27,15 +27,17 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use 
crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; pub struct PartitionedInnerJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, } impl PartitionedInnerJoin { @@ -58,44 +60,92 @@ impl PartitionedInnerJoin { PartitionedInnerJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, } } +} - fn result_block( - desc: &HashJoinDesc, - probe_block: Option, - build_block: Option, - num_rows: usize, - ) -> DataBlock { - let mut result_block = match (probe_block, build_block) { - (Some(mut p), Some(b)) => { - p.merge_block(b); - p +pub fn result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut p), Some(b)) => { + p.merge_block(b); + p + } + (Some(p), None) => p, + (None, Some(b)) => b, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter().cloned() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(index).clone(), + (true, false) => result_block.get_by_offset(index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(index); + let col = entry.to_column(); + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } } - (Some(p), None) => p, - (None, Some(b)) => b, - (None, 
None) => DataBlock::new(vec![], num_rows), }; + result_block.add_entry(entry); + } + result_block +} - for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter().cloned() { - let entry = match (is_probe_nullable, is_build_nullable) { - (true, true) | (false, false) => result_block.get_by_offset(index).clone(), - (true, false) => result_block.get_by_offset(index).clone().remove_nullable(), - (false, true) => { - let entry = result_block.get_by_offset(index); - let col = entry.to_column(); - match col.is_null() || col.is_nullable() { - true => entry.clone(), - false => BlockEntry::from(NullableColumn::new_column( - col, - Bitmap::new_constant(true, result_block.num_rows()), - )), - } - } +struct PartitionedInnerJoinStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedBuild, + probe_stream: Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedInnerJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), }; - result_block.add_entry(entry); + let build_block = self + .build + .gather_build_block(&self.probed_rows.matched_build); + let num_rows = self.probed_rows.matched_probe.len(); + + let mut block = result_block(&self.desc, probe_block, build_block, num_rows); + + if let Some(filter) = self.filter_executor.as_mut() { + block = filter.filter(block)?; + if block.is_empty() { + continue; + } + } + + return Ok(Some(block)); } - result_block } } @@ -114,35 +164,22 @@ impl Join for PartitionedInnerJoin { return Ok(Box::new(EmptyJoinStream)); } - let (matched_probe, matched_build, 
_) = self.build.probe(&data)?; - - if matched_probe.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - - let projected = data.project(&self.build.desc.probe_projection); - let probe_block = match projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&projected, matched_probe.as_slice())?), - }; - - let build_block = self.build.gather_build_block(&matched_build); - - let mut result = Self::result_block( - &self.build.desc, - probe_block, - build_block, - matched_probe.len(), - ); - - if let Some(filter) = self.filter_executor.as_mut() { - result = filter.filter(result)?; - if result.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - } - - Ok(Box::new(OneBlockJoinStream(Some(result)))) + let probe_stream = self.build.create_probe_matched(&data)?; + let probe_data_block = data.project(&self.build.desc.probe_projection); + + Ok(Box::new(PartitionedInnerJoinStream { + desc: self.build.desc.clone(), + probe_data_block, + build: &self.build, + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + })) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs index 603ee81b04e05..6147c5ff865de 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs @@ -24,21 +24,35 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::Scalar; use databend_common_expression::types::DataType; -use databend_common_expression::types::NullableColumn; use 
databend_common_functions::BUILTIN_FUNCTIONS; +use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; use crate::pipelines::processors::transforms::wrap_true_validity; +pub fn null_block(types: &[DataType], num_rows: usize) -> Option { + if types.is_empty() { + return None; + } + let columns = types + .iter() + .map(|t| BlockEntry::new_const_column(t.wrap_nullable(), Scalar::Null, num_rows)) + .collect::>(); + Some(DataBlock::new(columns, num_rows)) +} + pub struct PartitionedLeftJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, } impl PartitionedLeftJoin { @@ -61,6 +75,137 @@ impl PartitionedLeftJoin { PartitionedLeftJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, + } + } +} + +fn wrap_nullable_build(build_block: DataBlock, num_rows: usize) -> DataBlock { + let true_validity = Bitmap::new_constant(true, num_rows); + let entries = build_block + .columns() + .iter() + .map(|c| wrap_true_validity(c, num_rows, &true_validity)); + DataBlock::from_iter(entries, num_rows) +} + +struct PartitionedLeftJoinStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedBuild, + probe_stream: Box, + 
probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, + // Accumulated unmatched probe indices (no hash match) + unmatched_indices: Vec, + // Per-probe-row state for filter case: 0=unseen, 1=matched-no-pass, 2=passed + row_state: Vec, + has_filter: bool, + probe_done: bool, + unmatched_offset: usize, +} + +impl<'a> PartitionedLeftJoinStream<'a> { + fn output_unmatched(&mut self) -> Result> { + // Collect all unmatched indices: from ProbeStream + filter-failed matched rows + if self.unmatched_offset == 0 && self.has_filter { + for i in 0..self.row_state.len() { + if self.row_state[i] == 1 { + self.unmatched_indices.push(i as u64); + } + } + } + + if self.unmatched_offset >= self.unmatched_indices.len() { + return Ok(None); + } + + let end = (self.unmatched_offset + self.max_block_size).min(self.unmatched_indices.len()); + let batch = &self.unmatched_indices[self.unmatched_offset..end]; + self.unmatched_offset = end; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take(&self.probe_data_block, batch)?), + }; + let build_block = null_block(&self.build.column_types, batch.len()); + Ok(Some(result_block( + &self.desc, + probe_block, + build_block, + batch.len(), + ))) + } +} + +impl<'a> JoinStream for PartitionedLeftJoinStream<'a> { + fn next(&mut self) -> Result> { + if self.probe_done { + return self.output_unmatched(); + } + + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + self.probe_done = true; + return self.output_unmatched(); + } + + // Save unmatched indices + self.unmatched_indices + .extend_from_slice(&self.probed_rows.unmatched); + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let num_matched = self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( 
+ &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = self + .build + .gather_build_block(&self.probed_rows.matched_build); + let build_block = build_block.map(|b| wrap_nullable_build(b, num_matched)); + + let mut block = result_block(&self.desc, probe_block, build_block, num_matched); + + if let Some(filter) = self.filter_executor.as_mut() { + // Track matched rows + for &idx in &self.probed_rows.matched_probe { + let i = idx as usize; + if self.row_state[i] == 0 { + self.row_state[i] = 1; + } + } + + let count = filter.select(&block)?; + if count == 0 { + continue; + } + + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; + self.row_state[probe_idx] = 2; + } + + let origin_rows = block.num_rows(); + block = filter.take(block, origin_rows, count)?; + } + + if !block.is_empty() { + return Ok(Some(block)); + } } } } @@ -90,137 +235,40 @@ impl Join for PartitionedLeftJoin { .iter() .map(|x| x.data_type().clone()) .collect(); - let build_block = null_block(&types, num_rows) - .map(|b| b.project(&desc.build_projection)); + let build_block = + null_block(&types, num_rows).map(|b| b.project(&desc.build_projection)); let probe_block = Some(data.project(&desc.probe_projection)); - let result = final_result_block(desc, probe_block, build_block, num_rows); - return Ok(Box::new(OneBlockJoinStream(Some(result)))); + let block = result_block(desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(block)))); } - let (matched_probe, matched_build, unmatched) = self.build.probe(&data)?; - let num_rows = data.num_rows(); - let probe_projected = data.project(&desc.probe_projection); - - // Build matched result - let mut result_blocks = Vec::new(); - - if !matched_probe.is_empty() { - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => 
Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), - }; + let num_probe_rows = data.num_rows(); + let has_filter = self.filter_executor.is_some(); + let probe_stream = self.build.create_probe(&data)?; + let probe_data_block = data.project(&desc.probe_projection); - let build_block = self.build.gather_build_block(&matched_build); - let build_block = build_block.map(|b| { - let true_validity = Bitmap::new_constant(true, matched_build.len()); - let entries = b - .columns() - .iter() - .map(|c| wrap_true_validity(c, matched_build.len(), &true_validity)); - DataBlock::from_iter(entries, matched_build.len()) - }); - - let mut matched_result = final_result_block( - desc, - probe_block, - build_block, - matched_build.len(), - ); - - if let Some(filter) = self.filter_executor.as_mut() { - let count = filter.select(&matched_result)?; - if count > 0 { - // Track which probe rows passed the filter - let true_sel = filter.true_selection(); - let mut passed = vec![false; num_rows]; - for idx in true_sel.iter().take(count) { - passed[matched_probe[*idx as usize] as usize] = true; - } - - let origin_rows = matched_result.num_rows(); - matched_result = filter.take(matched_result, origin_rows, count)?; - result_blocks.push(matched_result); - - // Unmatched = original unmatched + matched rows that failed ALL filter checks - let mut all_unmatched: Vec = unmatched; - // Rows that were matched but never passed filter - let mut matched_set = vec![false; num_rows]; - for idx in &matched_probe { - matched_set[*idx as usize] = true; - } - for i in 0..num_rows { - if matched_set[i] && !passed[i] { - all_unmatched.push(i as u64); - } - } - - if !all_unmatched.is_empty() { - let unmatched_probe = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, all_unmatched.as_slice())?), - }; - let types = &self.build.column_types; - let unmatched_build = null_block(types, all_unmatched.len()); - result_blocks.push(final_result_block( - desc, - 
unmatched_probe, - unmatched_build, - all_unmatched.len(), - )); - } - } else { - // All matched rows failed filter, treat all as unmatched - let all_indices: Vec = (0..num_rows as u64).collect(); - let unmatched_probe = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, all_indices.as_slice())?), - }; - let types = &self.build.column_types; - let unmatched_build = null_block(types, all_indices.len()); - result_blocks.push(final_result_block( - desc, - unmatched_probe, - unmatched_build, - all_indices.len(), - )); - } + Ok(Box::new(PartitionedLeftJoinStream { + desc: self.build.desc.clone(), + probe_data_block, + build: &self.build, + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + unmatched_indices: Vec::new(), + row_state: if has_filter { + vec![0u8; num_probe_rows] } else { - result_blocks.push(matched_result); - - // Append unmatched rows with NULL build side - if !unmatched.is_empty() { - let unmatched_probe = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, unmatched.as_slice())?), - }; - let types = &self.build.column_types; - let unmatched_build = null_block(types, unmatched.len()); - result_blocks.push(final_result_block( - desc, - unmatched_probe, - unmatched_build, - unmatched.len(), - )); - } - } - } else { - // All rows unmatched - let unmatched_probe = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, unmatched.as_slice())?), - }; - let types = &self.build.column_types; - let unmatched_build = null_block(types, unmatched.len()); - result_blocks.push(final_result_block( - desc, - unmatched_probe, - unmatched_build, - unmatched.len(), - )); - } - - let result = DataBlock::concat(&result_blocks)?; 
- Ok(Box::new(OneBlockJoinStream(Some(result)))) + Vec::new() + }, + has_filter, + probe_done: false, + unmatched_offset: 0, + })) } } @@ -229,55 +277,3 @@ impl GraceMemoryJoin for PartitionedLeftJoin { self.build.reset(); } } - -pub fn final_result_block( - desc: &HashJoinDesc, - probe_block: Option, - build_block: Option, - num_rows: usize, -) -> DataBlock { - let mut result_block = match (probe_block, build_block) { - (Some(mut probe_block), Some(build_block)) => { - probe_block.merge_block(build_block); - probe_block - } - (Some(probe_block), None) => probe_block, - (None, Some(build_block)) => build_block, - (None, None) => DataBlock::new(vec![], num_rows), - }; - - if !desc.probe_to_build.is_empty() { - for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { - let entry = match (is_probe_nullable, is_build_nullable) { - (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), - (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), - (false, true) => { - let entry = result_block.get_by_offset(*index); - let col = entry.to_column(); - match col.is_null() || col.is_nullable() { - true => entry.clone(), - false => BlockEntry::from(NullableColumn::new_column( - col, - Bitmap::new_constant(true, result_block.num_rows()), - )), - } - } - }; - result_block.add_entry(entry); - } - } - result_block -} - -pub fn null_block(types: &[DataType], num_rows: usize) -> Option { - if types.is_empty() { - return None; - } - let columns = types - .iter() - .map(|column_type| { - BlockEntry::new_const_column(column_type.wrap_nullable(), Scalar::Null, num_rows) - }) - .collect::>(); - Some(DataBlock::new(columns, num_rows)) -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs index fa32b533c84c6..ac9302842d752 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs @@ -23,18 +23,21 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_functions::BUILTIN_FUNCTIONS; -use super::left_join::final_result_block; +use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; pub struct PartitionedLeftAntiJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, } impl PartitionedLeftAntiJoin { @@ -57,6 +60,75 @@ impl PartitionedLeftAntiJoin { PartitionedLeftAntiJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, + } + } +} +struct PartitionedLeftAntiJoinStream<'a> { + probe_data_block: DataBlock, + build: &'a PartitionedBuild, + desc: Arc, + probe_stream: Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, + excluded: Vec, + probe_done: bool, +} + +impl<'a> JoinStream for PartitionedLeftAntiJoinStream<'a> { + fn next(&mut self) -> Result> { + if self.probe_done 
{ + return Ok(None); + } + + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + self.probe_done = true; + let bitmap = Bitmap::from_trusted_len_iter(self.excluded.iter().map(|e| !e)); + return match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some( + self.probe_data_block.clone().filter_with_bitmap(&bitmap)?, + )), + }; + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if let Some(filter) = self.filter_executor.as_mut() { + let num_matched = self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + let build_block = self + .build + .gather_build_block(&self.probed_rows.matched_build); + let block = result_block(&self.desc, probe_block, build_block, num_matched); + + let count = filter.select(&block)?; + if count > 0 { + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; + self.excluded[probe_idx] = true; + } + } + } else { + for &idx in &self.probed_rows.matched_probe { + self.excluded[idx as usize] = true; + } + } } } } @@ -76,61 +148,30 @@ impl Join for PartitionedLeftAntiJoin { return Ok(Box::new(EmptyJoinStream)); } - let desc = &self.build.desc; - if self.build.num_rows == 0 { - let probe_projected = data.project(&desc.probe_projection); - return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); - } - - let (matched_probe, matched_build, unmatched) = self.build.probe(&data)?; - let probe_projected = data.project(&desc.probe_projection); - - if matched_probe.is_empty() { - // All rows are unmatched + let probe_projected = data.project(&self.build.desc.probe_projection); return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); } - if let 
Some(filter) = self.filter_executor.as_mut() { - // With filter: rows that match but fail filter are still "anti" rows - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), - }; - let build_block = self.build.gather_build_block(&matched_build); - let result = final_result_block( - &self.build.desc, - probe_block, - build_block, - matched_probe.len(), - ); - - let count = filter.select(&result)?; - - // selected[i] = true means probe row i should be EXCLUDED (it matched and passed filter) - let mut excluded = vec![false; probe_projected.num_rows()]; - if count > 0 { - let true_sel = filter.true_selection(); - for idx in true_sel.iter().take(count) { - excluded[matched_probe[*idx as usize] as usize] = true; - } - } - - let bitmap = Bitmap::from_trusted_len_iter(excluded.iter().map(|e| !e)); - match bitmap.true_count() { - 0 => Ok(Box::new(EmptyJoinStream)), - _ => Ok(Box::new(OneBlockJoinStream(Some( - probe_projected.filter_with_bitmap(&bitmap)?, - )))), - } - } else { - // Without filter: output only unmatched rows - if unmatched.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - let result = DataBlock::take(&probe_projected, unmatched.as_slice())?; - Ok(Box::new(OneBlockJoinStream(Some(result)))) - } + let num_probe_rows = data.num_rows(); + let probe_stream = self.build.create_probe_matched(&data)?; + let probe_data_block = data.project(&self.build.desc.probe_projection); + + Ok(Box::new(PartitionedLeftAntiJoinStream { + probe_data_block, + build: &self.build, + desc: self.build.desc.clone(), + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + excluded: vec![false; num_probe_rows], + probe_done: false, + })) } } diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs index 82a1ef8c84c66..ce710a8db212b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs @@ -23,18 +23,20 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_functions::BUILTIN_FUNCTIONS; -use super::left_join::final_result_block; +use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; pub struct PartitionedLeftSemiJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, } impl PartitionedLeftSemiJoin { @@ -57,6 +59,76 @@ impl PartitionedLeftSemiJoin { PartitionedLeftSemiJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, + } + } +} + +struct PartitionedLeftSemiJoinStream<'a> { + probe_data_block: DataBlock, + build: &'a PartitionedBuild, + desc: Arc, + probe_stream: 
Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, + selected: Vec, + probe_done: bool, +} + +impl<'a> JoinStream for PartitionedLeftSemiJoinStream<'a> { + fn next(&mut self) -> Result> { + if self.probe_done { + return Ok(None); + } + + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + self.probe_done = true; + let bitmap = Bitmap::from_trusted_len_iter(self.selected.iter().copied()); + return match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some( + self.probe_data_block.clone().filter_with_bitmap(&bitmap)?, + )), + }; + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if let Some(filter) = self.filter_executor.as_mut() { + let num_matched = self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + let build_block = self + .build + .gather_build_block(&self.probed_rows.matched_build); + let block = result_block(&self.desc, probe_block, build_block, num_matched); + + let count = filter.select(&block)?; + if count > 0 { + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; + self.selected[probe_idx] = true; + } + } + } else { + for &idx in &self.probed_rows.matched_probe { + self.selected[idx as usize] = true; + } + } } } } @@ -76,58 +148,25 @@ impl Join for PartitionedLeftSemiJoin { return Ok(Box::new(EmptyJoinStream)); } - let desc = &self.build.desc; - let (matched_probe, matched_build, _) = self.build.probe(&data)?; - - if matched_probe.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - - let probe_projected = data.project(&desc.probe_projection); - - if let Some(filter) = 
self.filter_executor.as_mut() { - // With filter: build full result, apply filter, deduplicate probe indices - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), - }; - let build_block = self.build.gather_build_block(&matched_build); - let result = final_result_block( - &self.build.desc, - probe_block, - build_block, - matched_probe.len(), - ); - - let count = filter.select(&result)?; - if count == 0 { - return Ok(Box::new(EmptyJoinStream)); - } - - let true_sel = filter.true_selection(); - let mut selected = vec![false; probe_projected.num_rows()]; - for idx in true_sel.iter().take(count) { - selected[matched_probe[*idx as usize] as usize] = true; - } - - let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter()); - match bitmap.true_count() { - 0 => Ok(Box::new(EmptyJoinStream)), - _ => Ok(Box::new(OneBlockJoinStream(Some( - probe_projected.filter_with_bitmap(&bitmap)?, - )))), - } - } else { - // Without filter: deduplicate matched probe indices - let mut selected = vec![false; probe_projected.num_rows()]; - for idx in &matched_probe { - selected[*idx as usize] = true; - } - let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter()); - Ok(Box::new(OneBlockJoinStream(Some( - probe_projected.filter_with_bitmap(&bitmap)?, - )))) - } + let num_probe_rows = data.num_rows(); + let probe_stream = self.build.create_probe_matched(&data)?; + let probe_data_block = data.project(&self.build.desc.probe_projection); + + Ok(Box::new(PartitionedLeftSemiJoinStream { + probe_data_block, + build: &self.build, + desc: self.build.desc.clone(), + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + selected: vec![false; num_probe_rows], + probe_done: false, + })) } } 
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs index 00c83007a4c65..9b41ddd7a0832 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs @@ -12,23 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod chunk_accumulator; mod compact_hash_table; -mod partitioned_build; +mod compact_probe_stream; mod inner_join; mod left_join; mod left_join_anti; mod left_join_semi; +mod partitioned_build; mod right_join; mod right_join_anti; mod right_join_semi; pub use compact_hash_table::CompactJoinHashTable; pub use compact_hash_table::RowIndex; -pub use partitioned_build::PartitionedBuild; pub use inner_join::PartitionedInnerJoin; pub use left_join::PartitionedLeftJoin; pub use left_join_anti::PartitionedLeftAntiJoin; pub use left_join_semi::PartitionedLeftSemiJoin; +pub use partitioned_build::PartitionedBuild; pub use right_join::PartitionedRightJoin; pub use right_join_anti::PartitionedRightAntiJoin; pub use right_join_semi::PartitionedRightSemiJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs index 8069c815a7ceb..ec18ed228e460 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -26,13 +26,16 @@ use databend_common_expression::ProjectedBlock; use databend_common_expression::types::DataType; use databend_common_expression::with_hash_method; +use 
super::chunk_accumulator::FixedSizeChunkAccumulator; use super::compact_hash_table::CompactJoinHashTable; +use super::compact_probe_stream::create_compact_probe; +use super::compact_probe_stream::create_compact_probe_matched; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; pub const CHUNK_BITS: usize = 16; pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536 -const CHUNK_MASK: usize = CHUNK_SIZE - 1; /// Convert a 1-based flat index to RowPtr (chunk_index, row_offset). #[inline(always)] @@ -40,7 +43,7 @@ pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr { let zero_based = flat_index - 1; RowPtr { chunk_index: (zero_based >> CHUNK_BITS) as u32, - row_index: (zero_based & CHUNK_MASK) as u32, + row_index: (zero_based & (CHUNK_SIZE - 1)) as u32, } } @@ -64,9 +67,10 @@ pub struct PartitionedBuild { pub desc: Arc, /// Function context. pub function_ctx: FunctionContext, - /// Accumulator for fixed-size chunks. - squash_buffer: Vec, - squash_rows: usize, + /// Visited bitmap for right outer/semi/anti joins (1-based indexing, index 0 unused). + pub visited: Vec, + /// Fixed-size chunk accumulator using mutable columns. 
+ accumulator: FixedSizeChunkAccumulator, } impl PartitionedBuild { @@ -85,8 +89,8 @@ impl PartitionedBuild { method, desc, function_ctx, - squash_buffer: Vec::new(), - squash_rows: 0, + visited: Vec::new(), + accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), } } @@ -94,58 +98,60 @@ impl PartitionedBuild { pub fn add_block(&mut self, data: Option) -> Result<()> { match data { Some(block) if !block.is_empty() => { - self.squash_rows += block.num_rows(); - self.squash_buffer.push(block); - while self.squash_rows >= CHUNK_SIZE { - self.flush_one_chunk()?; + let flushed = self.accumulator.accumulate(block); + for chunk in flushed { + self.ingest_chunk(chunk)?; } } _ => { - if !self.squash_buffer.is_empty() { - let block = DataBlock::concat(&std::mem::take(&mut self.squash_buffer))?; - if !block.is_empty() { - self.num_rows += block.num_rows(); - self.chunks.push(block); - } - self.squash_rows = 0; + if let Some(chunk) = self.accumulator.flush() { + self.ingest_chunk(chunk)?; } } } Ok(()) } - /// Finalize build: extract keys, compute hashes, build compact hash table, extract ColumnVec. + /// Process a flushed chunk: compute KeysState immediately. + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + let num_rows = chunk.num_rows(); + with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, num_rows); + self.desc.remove_keys_nullable(&mut keys_block); + let keys = ProjectedBlock::from(keys_block.columns()); + let keys_state = method.build_keys_state(keys, num_rows)?; + self.build_keys_states.push(keys_state); + } + }); + self.num_rows += num_rows; + self.chunks.push(chunk); + Ok(()) + } + + /// Finalize build: build hash table chunk by chunk, extract ColumnVec. 
pub fn final_build(&mut self) -> Result<()> { if self.num_rows == 0 { return Ok(()); } - let mut all_keys_states = Vec::with_capacity(self.chunks.len()); - let mut all_hashes = Vec::with_capacity(self.num_rows); + // Allocate hash table with known total rows + self.hash_table = CompactJoinHashTable::new(self.num_rows); + // Process one chunk at a time: compute hashes, insert into table + let mut row_offset = 1; // 1-based indexing with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => { - for chunk in &self.chunks { - let keys_entries = self.desc.build_key(chunk, &self.function_ctx)?; - let mut keys_block = DataBlock::new(keys_entries, chunk.num_rows()); - self.desc.remove_keys_nullable(&mut keys_block); - let keys = ProjectedBlock::from(keys_block.columns()); - let keys_state = method.build_keys_state(keys, chunk.num_rows())?; - method.build_keys_hashes(&keys_state, &mut all_hashes); - all_keys_states.push(keys_state); + for keys_state in &self.build_keys_states { + let mut hashes = Vec::new(); + method.build_keys_hashes(keys_state, &mut hashes); + self.hash_table.insert_chunk(&hashes, row_offset); + row_offset += hashes.len(); } } }); - // Build compact hash table (1-indexed) - let mut bucket_nums = vec![0usize; self.num_rows + 1]; - self.hash_table = CompactJoinHashTable::new(self.num_rows); - let bucket_mask = self.hash_table.bucket_mask(); - for (i, h) in all_hashes.iter().enumerate() { - bucket_nums[i + 1] = (*h as usize) & bucket_mask; - } - self.hash_table.build(&bucket_nums); - // Project build columns and extract ColumnVec if let Some(first_chunk) = self.chunks.first() { let first_projected = first_chunk.clone().project(&self.desc.build_projection); @@ -172,21 +178,6 @@ impl PartitionedBuild { self.columns = columns; } - self.build_keys_states = all_keys_states; - Ok(()) - } - - fn flush_one_chunk(&mut self) -> Result<()> { - let concat = DataBlock::concat(&std::mem::take(&mut self.squash_buffer))?; - let chunk = 
concat.slice(0..CHUNK_SIZE).maybe_gc(); - let remain_rows = concat.num_rows() - CHUNK_SIZE; - if remain_rows > 0 { - let remain = concat.slice(CHUNK_SIZE..concat.num_rows()).maybe_gc(); - self.squash_buffer.push(remain); - } - self.squash_rows = remain_rows; - self.num_rows += chunk.num_rows(); - self.chunks.push(chunk); Ok(()) } @@ -197,131 +188,57 @@ impl PartitionedBuild { self.columns.clear(); self.column_types.clear(); self.num_rows = 0; - self.squash_buffer.clear(); - self.squash_rows = 0; + self.visited.clear(); + self.accumulator.reset(); } - /// Probe the hash table with a data block. Returns matched pairs and unmatched probe indices. - /// For each probe row, walks the hash chain and compares keys. - pub fn probe( - &self, + /// Create a probe stream that only tracks matched rows (for inner, left semi, right series). + pub fn create_probe_matched<'a>( + &'a self, data: &DataBlock, - ) -> Result<(Vec, Vec, Vec)> { - if self.num_rows == 0 { - let unmatched: Vec = (0..data.num_rows() as u64).collect(); - return Ok((vec![], vec![], unmatched)); - } - - let probe_keys_entries = self.desc.probe_key(data, &self.function_ctx)?; - let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows()); - self.desc.remove_keys_nullable(&mut probe_keys_block); - - let mut matched_probe = Vec::new(); - let mut matched_build = Vec::new(); - let mut has_match = vec![false; data.num_rows()]; - - with_hash_method!(|T| match &self.method { - HashMethodKind::T(method) => { - let keys = ProjectedBlock::from(probe_keys_block.columns()); - let probe_ks = method.build_keys_state(keys, data.num_rows())?; - let mut probe_hashes = Vec::with_capacity(data.num_rows()); - method.build_keys_hashes(&probe_ks, &mut probe_hashes); - - let probe_acc = method.build_keys_accessor(probe_ks)?; - let build_accs: Vec<_> = self - .build_keys_states - .iter() - .map(|ks| method.build_keys_accessor(ks.clone())) - .collect::>>()?; - - let bucket_mask = self.hash_table.bucket_mask(); - for 
probe_idx in 0..data.num_rows() { - let bucket = (probe_hashes[probe_idx] as usize) & bucket_mask; - let mut build_idx = self.hash_table.first_index(bucket); - while build_idx != 0 { - let bi = build_idx as usize; - let chunk_idx = (bi - 1) >> CHUNK_BITS; - let offset = (bi - 1) & CHUNK_MASK; - let build_key = unsafe { build_accs[chunk_idx].key_unchecked(offset) }; - let probe_key = unsafe { probe_acc.key_unchecked(probe_idx) }; - if build_key == probe_key { - has_match[probe_idx] = true; - matched_probe.push(probe_idx as u64); - matched_build.push(flat_to_row_ptr(bi)); - } - build_idx = self.hash_table.next_index(build_idx); - } - } - } - }); - - let unmatched: Vec = has_match - .iter() - .enumerate() - .filter(|(_, m)| !**m) - .map(|(i, _)| i as u64) - .collect(); - - Ok((matched_probe, matched_build, unmatched)) + ) -> Result> { + create_compact_probe_matched( + &self.hash_table, + &self.build_keys_states, + &self.method, + &self.desc, + &self.function_ctx, + data, + ) } - /// Probe and mark visited build rows (for right join types). - pub fn probe_and_mark_visited( - &mut self, + /// Create a probe stream that also tracks unmatched rows (for left, left anti). 
+ pub fn create_probe<'a>( + &'a self, data: &DataBlock, - ) -> Result<(Vec, Vec)> { - if self.num_rows == 0 { - return Ok((vec![], vec![])); - } - - let probe_keys_entries = self.desc.probe_key(data, &self.function_ctx)?; - let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows()); - self.desc.remove_keys_nullable(&mut probe_keys_block); - - let mut matched_probe = Vec::new(); - let mut matched_build = Vec::new(); - - with_hash_method!(|T| match &self.method { - HashMethodKind::T(method) => { - let keys = ProjectedBlock::from(probe_keys_block.columns()); - let probe_ks = method.build_keys_state(keys, data.num_rows())?; - let mut probe_hashes = Vec::with_capacity(data.num_rows()); - method.build_keys_hashes(&probe_ks, &mut probe_hashes); - - let probe_acc = method.build_keys_accessor(probe_ks)?; - let build_accs: Vec<_> = self - .build_keys_states - .iter() - .map(|ks| method.build_keys_accessor(ks.clone())) - .collect::>>()?; - - let bucket_mask = self.hash_table.bucket_mask(); - for probe_idx in 0..data.num_rows() { - let bucket = (probe_hashes[probe_idx] as usize) & bucket_mask; - let mut build_idx = self.hash_table.first_index(bucket); - while build_idx != 0 { - let bi = build_idx as usize; - let chunk_idx = (bi - 1) >> CHUNK_BITS; - let offset = (bi - 1) & CHUNK_MASK; - let build_key = unsafe { build_accs[chunk_idx].key_unchecked(offset) }; - let probe_key = unsafe { probe_acc.key_unchecked(probe_idx) }; - if build_key == probe_key { - matched_probe.push(probe_idx as u64); - matched_build.push(flat_to_row_ptr(bi)); - self.hash_table.set_visited(build_idx); - } - build_idx = self.hash_table.next_index(build_idx); - } - } - } - }); - - Ok((matched_probe, matched_build)) + ) -> Result> { + create_compact_probe( + &self.hash_table, + &self.build_keys_states, + &self.method, + &self.desc, + &self.function_ctx, + data, + ) } /// Initialize visited tracking for right-side join types. 
pub fn init_visited(&mut self) { - self.hash_table.init_visited(self.num_rows); + self.visited = vec![0u8; self.num_rows + 1]; + } + + /// Mark a build row as visited (1-based index). + #[inline(always)] + pub fn set_visited(&mut self, row_index: usize) { + unsafe { + *self.visited.get_unchecked_mut(row_index) = 1; + } + } + + /// Check if a build row has been visited (1-based index). + #[inline(always)] + pub fn is_visited(&self, row_index: usize) -> bool { + unsafe { *self.visited.get_unchecked(row_index) != 0 } } /// Gather build columns for the given row pointers. diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs index 97dcd7774b9b7..0eac0ce88d14a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs @@ -16,30 +16,35 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; use databend_common_exception::Result; +use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; use databend_common_functions::BUILTIN_FUNCTIONS; -use super::left_join::final_result_block; +use super::compact_probe_stream::create_compact_probe_matched; +use super::inner_join::result_block; use super::left_join::null_block; use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use 
crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; use crate::pipelines::processors::transforms::wrap_nullable_block; pub struct PartitionedRightJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, finished: bool, } - impl PartitionedRightJoin { pub fn create( method: HashMethodKind, @@ -60,11 +65,140 @@ impl PartitionedRightJoin { PartitionedRightJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, finished: false, } } } +struct PartitionedRightJoinStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a mut Vec, + probe_stream: Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, +} + +impl<'a> PartitionedRightJoinStream<'a> { + fn gather_build_block(&self, row_ptrs: &[RowPtr]) -> Option { + if self.columns.is_empty() { + return None; + } + Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + row_ptrs, + )) + } +} + +impl<'a> JoinStream for PartitionedRightJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + let num_matched = 
self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(wrap_nullable_block(&DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?)), + }; + let build_block = self.gather_build_block(&self.probed_rows.matched_build); + + let mut block = result_block(&self.desc, probe_block, build_block, num_matched); + + if let Some(filter) = self.filter_executor.as_mut() { + let count = filter.select(&block)?; + if count == 0 { + continue; + } + // Mark visited only for rows that pass filter + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + let origin_rows = block.num_rows(); + block = filter.take(block, origin_rows, count)?; + } else { + // Mark all matched as visited + for row_ptr in &self.probed_rows.matched_build { + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + } + + if !block.is_empty() { + return Ok(Some(block)); + } + } + } +} + +struct PartitionedRightFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a [u8], + num_rows: usize, + scan_idx: usize, + max_block_size: usize, + desc: Arc, + probe_types: Vec, +} + +impl<'a> JoinStream for PartitionedRightFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { + if self.visited[self.scan_idx] == 0 { + row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + } + self.scan_idx += 1; + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + let probe_block = null_block(&self.probe_types, 
row_ptrs.len()); + let build_block = if self.columns.is_empty() { + None + } else { + Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + )) + }; + + Ok(Some(result_block( + &self.desc, + probe_block, + build_block, + row_ptrs.len(), + ))) + } +} + impl Join for PartitionedRightJoin { fn add_block(&mut self, data: Option) -> Result<()> { self.build.add_block(data) @@ -81,39 +215,31 @@ impl Join for PartitionedRightJoin { return Ok(Box::new(EmptyJoinStream)); } - let desc = self.build.desc.clone(); - let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; + let probe_stream = create_compact_probe_matched( + &self.build.hash_table, + &self.build.build_keys_states, + &self.build.method, + &self.build.desc, + &self.build.function_ctx, + &data, + )?; + let probe_data_block = data.project(&self.build.desc.probe_projection); - if matched_probe.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - - let probe_projected = data.project(&desc.probe_projection); - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => Some(wrap_nullable_block(&DataBlock::take( - &probe_projected, - matched_probe.as_slice(), - )?)), - }; - - let build_block = self.build.gather_build_block(&matched_build); - - let mut result = final_result_block( - &desc, - probe_block, - build_block, - matched_build.len(), - ); - - if let Some(filter) = self.filter_executor.as_mut() { - result = filter.filter(result)?; - if result.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } - } - - Ok(Box::new(OneBlockJoinStream(Some(result)))) + Ok(Box::new(PartitionedRightJoinStream { + desc: self.build.desc.clone(), + probe_data_block, + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &mut self.build.visited, + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + 
filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + })) } fn final_probe(&mut self) -> Result>> { @@ -123,37 +249,23 @@ impl Join for PartitionedRightJoin { self.finished = true; let desc = self.build.desc.clone(); - let mut unvisited_ptrs = Vec::new(); - - // Scan visited array (1-based indexing) - for i in 1..=self.build.num_rows { - if !self.build.hash_table.is_visited(i) { - unvisited_ptrs.push(flat_to_row_ptr(i)); - } - } - - if unvisited_ptrs.is_empty() { - return Ok(None); - } - - // Build NULL probe block let mut probe_types = Vec::new(); for (i, field) in desc.probe_schema.fields().iter().enumerate() { if desc.probe_projection.contains(&i) { probe_types.push(field.data_type().clone()); } } - let probe_block = null_block(&probe_types, unvisited_ptrs.len()); - let build_block = self.build.gather_build_block(&unvisited_ptrs); - - let result = final_result_block( - &desc, - probe_block, - build_block, - unvisited_ptrs.len(), - ); - Ok(Some(Box::new(OneBlockJoinStream(Some(result))))) + Ok(Some(Box::new(PartitionedRightFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + desc, + probe_types, + }))) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs index 1bb91bd0b59a9..dc4cffb42b8a2 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs @@ -16,25 +16,30 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; use databend_common_exception::Result; +use databend_common_expression::ColumnVec; use 
databend_common_expression::DataBlock; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; use databend_common_functions::BUILTIN_FUNCTIONS; -use super::left_join::final_result_block; +use super::compact_probe_stream::create_compact_probe_matched; +use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; pub struct PartitionedRightAntiJoin { build: PartitionedBuild, filter_executor: Option, + max_block_size: usize, finished: bool, } @@ -58,10 +63,118 @@ impl PartitionedRightAntiJoin { PartitionedRightAntiJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, finished: false, } } } +/// Probe stream that marks visited build rows, outputs nothing. 
+struct PartitionedRightAntiProbeStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a mut Vec, + probe_stream: Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightAntiProbeStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if let Some(filter) = self.filter_executor.as_mut() { + let num_matched = self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + let build_block = if self.columns.is_empty() { + None + } else { + Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &self.probed_rows.matched_build, + )) + }; + let block = result_block(&self.desc, probe_block, build_block, num_matched); + let count = filter.select(&block)?; + if count > 0 { + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + } + } else { + for row_ptr in &self.probed_rows.matched_build { + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + } + // Right anti outputs nothing during probe + } + } +} + +/// Final stream: output unvisited build rows. 
+struct PartitionedRightAntiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a [u8], + num_rows: usize, + scan_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightAntiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { + if self.visited[self.scan_idx] == 0 { + row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + } + self.scan_idx += 1; + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} impl Join for PartitionedRightAntiJoin { fn add_block(&mut self, data: Option) -> Result<()> { @@ -79,35 +192,31 @@ impl Join for PartitionedRightAntiJoin { return Ok(Box::new(EmptyJoinStream)); } - let desc = self.build.desc.clone(); - - if let Some(filter) = self.filter_executor.as_mut() { - let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; - if !matched_probe.is_empty() { - let probe_projected = data.project(&desc.probe_projection); - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), - }; - let build_block = self.build.gather_build_block(&matched_build); - let result = final_result_block(&desc, probe_block, build_block, matched_probe.len()); - - if !result.is_empty() { - let _count = filter.select(&result)?; - // For anti join with filter: probe_and_mark_visited marks all matches. - // Ideally we'd only mark rows that pass the filter, but since - // probe_and_mark_visited already marked them, this is conservative - // (may exclude some rows that should be in anti output). 
- // This matches the unpartitioned behavior where scan_map is set - // during probe regardless of filter. - } - } - } else { - let _ = self.build.probe_and_mark_visited(&data)?; - } - - // Right anti doesn't output during probe phase - Ok(Box::new(EmptyJoinStream)) + let probe_stream = create_compact_probe_matched( + &self.build.hash_table, + &self.build.build_keys_states, + &self.build.method, + &self.build.desc, + &self.build.function_ctx, + &data, + )?; + let probe_data_block = data.project(&self.build.desc.probe_projection); + + Ok(Box::new(PartitionedRightAntiProbeStream { + desc: self.build.desc.clone(), + probe_data_block, + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &mut self.build.visited, + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + })) } fn final_probe(&mut self) -> Result>> { @@ -116,24 +225,14 @@ impl Join for PartitionedRightAntiJoin { } self.finished = true; - let mut unvisited_ptrs = Vec::new(); - for i in 1..=self.build.num_rows { - if !self.build.hash_table.is_visited(i) { - unvisited_ptrs.push(flat_to_row_ptr(i)); - } - } - - if unvisited_ptrs.is_empty() { - return Ok(None); - } - - let build_block = self.build.gather_build_block(&unvisited_ptrs); - match build_block { - Some(block) => Ok(Some(Box::new(OneBlockJoinStream(Some(block))))), - None => Ok(Some(Box::new(OneBlockJoinStream(Some( - DataBlock::new(vec![], unvisited_ptrs.len()), - ))))), - } + Ok(Some(Box::new(PartitionedRightAntiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + }))) } } diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs index b755b7862d8e3..e693279ccf26b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs @@ -16,25 +16,30 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; use databend_common_exception::Result; +use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; use databend_common_functions::BUILTIN_FUNCTIONS; -use super::left_join::final_result_block; +use super::compact_probe_stream::create_compact_probe_matched; +use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::Join; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; pub struct PartitionedRightSemiJoin { build: 
PartitionedBuild, filter_executor: Option, + max_block_size: usize, finished: bool, } @@ -58,11 +63,120 @@ impl PartitionedRightSemiJoin { PartitionedRightSemiJoin { build: PartitionedBuild::create(method, desc, function_ctx), filter_executor, + max_block_size, finished: false, } } } +/// Probe stream that marks visited build rows, outputs nothing. +struct PartitionedRightSemiProbeStream<'a> { + desc: Arc, + probe_data_block: DataBlock, + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a mut Vec, + probe_stream: Box, + probed_rows: ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightSemiProbeStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + self.probe_stream + .advance(&mut self.probed_rows, self.max_block_size)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if let Some(filter) = self.filter_executor.as_mut() { + let num_matched = self.probed_rows.matched_probe.len(); + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + let build_block = if self.columns.is_empty() { + None + } else { + Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &self.probed_rows.matched_build, + )) + }; + let block = result_block(&self.desc, probe_block, build_block, num_matched); + let count = filter.select(&block)?; + if count > 0 { + let true_sel = filter.true_selection(); + for &sel_idx in true_sel.iter().take(count) { + let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + } + } else { + for row_ptr in &self.probed_rows.matched_build { + let flat_idx 
= (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + self.visited[flat_idx] = 1; + } + } + // Right semi outputs nothing during probe + } + } +} + +/// Final stream: output visited build rows. +struct PartitionedRightSemiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a [u8], + num_rows: usize, + scan_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightSemiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { + if self.visited[self.scan_idx] != 0 { + row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + } + self.scan_idx += 1; + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} + impl Join for PartitionedRightSemiJoin { fn add_block(&mut self, data: Option) -> Result<()> { self.build.add_block(data) @@ -79,34 +193,31 @@ impl Join for PartitionedRightSemiJoin { return Ok(Box::new(EmptyJoinStream)); } - let desc = self.build.desc.clone(); - - if let Some(filter) = self.filter_executor.as_mut() { - let (matched_probe, matched_build) = self.build.probe_and_mark_visited(&data)?; - if !matched_probe.is_empty() { - let probe_projected = data.project(&desc.probe_projection); - let probe_block = match probe_projected.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&probe_projected, matched_probe.as_slice())?), - }; - let build_block = self.build.gather_build_block(&matched_build); - let result = final_result_block(&desc, probe_block, build_block, matched_probe.len()); - - if !result.is_empty() { - let count = filter.select(&result)?; - if count > 0 { - // probe_and_mark_visited already marked all matches as visited, 
- // which is correct for semi join (any match suffices) - } - } - } - } else { - // Without filter, probe_and_mark_visited already marks all matched build rows - let _ = self.build.probe_and_mark_visited(&data)?; - } - - // Right semi doesn't output during probe phase - Ok(Box::new(EmptyJoinStream)) + let probe_stream = create_compact_probe_matched( + &self.build.hash_table, + &self.build.build_keys_states, + &self.build.method, + &self.build.desc, + &self.build.function_ctx, + &data, + )?; + let probe_data_block = data.project(&self.build.desc.probe_projection); + + Ok(Box::new(PartitionedRightSemiProbeStream { + desc: self.build.desc.clone(), + probe_data_block, + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &mut self.build.visited, + probe_stream, + probed_rows: ProbedRows::new( + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + Vec::with_capacity(self.max_block_size), + ), + filter_executor: self.filter_executor.as_mut(), + max_block_size: self.max_block_size, + })) } fn final_probe(&mut self) -> Result>> { @@ -115,24 +226,14 @@ impl Join for PartitionedRightSemiJoin { } self.finished = true; - let mut visited_ptrs = Vec::new(); - for i in 1..=self.build.num_rows { - if self.build.hash_table.is_visited(i) { - visited_ptrs.push(flat_to_row_ptr(i)); - } - } - - if visited_ptrs.is_empty() { - return Ok(None); - } - - let build_block = self.build.gather_build_block(&visited_ptrs); - match build_block { - Some(block) => Ok(Some(Box::new(OneBlockJoinStream(Some(block))))), - None => Ok(Some(Box::new(OneBlockJoinStream(Some( - DataBlock::new(vec![], visited_ptrs.len()), - ))))), - } + Ok(Some(Box::new(PartitionedRightSemiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + }))) } } From 3551243bc4b7af8c9f8d733242b66d6d7ead46f1 Mon Sep 17 
00:00:00 2001 From: zhang2014 Date: Mon, 16 Mar 2026 15:12:36 +0800 Subject: [PATCH 03/38] z --- .../transforms/new_hash_join/hash_join_factory.rs | 4 ++-- .../processors/transforms/new_hash_join/memory/mod.rs | 2 +- .../new_hash_join/memory/unpartitioned/left_join.rs | 2 +- .../new_hash_join/memory/unpartitioned/left_join_anti.rs | 4 ++-- .../new_hash_join/memory/unpartitioned/left_join_semi.rs | 2 +- .../transforms/new_hash_join/memory/unpartitioned/mod.rs | 4 ++-- .../new_hash_join/memory/unpartitioned/right_join.rs | 6 +++--- .../new_hash_join/memory/unpartitioned/right_join_anti.rs | 4 ++-- .../new_hash_join/memory/unpartitioned/right_join_semi.rs | 4 ++-- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs index b7d03c1aba14d..919743ace6384 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs @@ -46,11 +46,11 @@ use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; use crate::pipelines::processors::transforms::memory::partitioned::PartitionedInnerJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftJoin; use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftAntiJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftJoin; use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftSemiJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightJoin; use 
crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightAntiJoin; +use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightJoin; use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightSemiJoin; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs index f4f32d914b27c..d1890d4a56dc1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod unpartitioned; pub mod partitioned; +pub mod unpartitioned; pub use unpartitioned::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs index ce62243ee82f2..ea4502f8133c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs @@ -30,6 +30,7 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; @@ -40,7 +41,6 @@ use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::P use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; use 
crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use super::basic::BasicHashJoin; use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs index 024374a26c0e4..6131995f23c2e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs @@ -27,12 +27,12 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use super::basic::BasicHashJoin; -use super::left_join::final_result_block; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs index dfd0f50f03e5e..8efce103b03a9 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs @@ -30,12 +30,12 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use super::basic::BasicHashJoin; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs index 0a1b217a2cb02..5c09205bdc733 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs @@ -18,16 +18,16 @@ mod inner_join; pub mod left_join; mod left_join_anti; mod left_join_semi; +mod nested_loop; mod right_join; mod right_join_anti; mod right_join_semi; -mod nested_loop; pub use basic_state::BasicHashJoinState; pub use inner_join::InnerHashJoin; pub use left_join_anti::AntiLeftHashJoin; pub use left_join_semi::SemiLeftHashJoin; +pub use nested_loop::*; pub use right_join::OuterRightHashJoin; pub use right_join_anti::AntiRightHashJoin; pub use right_join_semi::SemiRightHashJoin; -pub use 
nested_loop::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs index c3046b7ea3417..9d1e49147d140 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs @@ -28,15 +28,15 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; +use super::left_join::null_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use super::basic::BasicHashJoin; -use super::left_join::final_result_block; -use super::left_join::null_block; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs index efb8a48013fc2..864e68ec8fe7a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs @@ -26,14 +26,14 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; +use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use super::basic::BasicHashJoin; -use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs index 255b079a92504..ef0d999894e4e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs @@ -27,14 +27,14 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use 
crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use super::basic::BasicHashJoin; -use super::left_join::final_result_block; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; From f4b332bdafcbb36ac7bdb79af853f4f5222cd91c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 22 Mar 2026 12:30:46 +0800 Subject: [PATCH 04/38] z --- .../memory/partitioned/chunk_accumulator.rs | 63 +++---- .../memory/partitioned/compact_hash_table.rs | 2 - .../memory/partitioned/partitioned_build.rs | 168 ++++++++++-------- 3 files changed, 116 insertions(+), 117 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs index 9e53190dc92f5..5cbb27bcbfd04 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs @@ -20,53 +20,46 @@ use databend_common_expression::DataBlock; /// Accumulates rows from input blocks into fixed-size output chunks /// using mutable ColumnBuilders. When the accumulated rows reach /// `chunk_size`, a chunk is flushed and returned. -/// -/// This avoids the overhead of `DataBlock::concat()` + `slice()` by -/// directly appending rows into builders. 
pub struct FixedSizeChunkAccumulator { chunk_size: usize, - builders: Option>, builder_rows: usize, + builders: Vec, } impl FixedSizeChunkAccumulator { pub fn new(chunk_size: usize) -> Self { FixedSizeChunkAccumulator { chunk_size, - builders: None, + builders: vec![], builder_rows: 0, } } - /// Accumulate a block. Returns any full chunks that were flushed. pub fn accumulate(&mut self, block: DataBlock) -> Vec { let mut output = Vec::new(); self.append_block(block, &mut output); output } - /// Flush remaining rows as the last (possibly shorter) chunk. - pub fn flush(&mut self) -> Option { - if self.builder_rows == 0 { - return None; + pub fn finalize(&mut self) -> Option { + match self.builder_rows { + 0 => None, + _ => Some(self.build_chunk()), } - Some(self.build_chunk()) } - /// Reset the accumulator, discarding any buffered rows. pub fn reset(&mut self) { - self.builders = None; self.builder_rows = 0; + self.builders = vec![]; } fn ensure_builders(&mut self, block: &DataBlock) { - if self.builders.is_none() { - let builders = block + if self.builders.is_empty() { + self.builders = block .columns() .iter() .map(|entry| ColumnBuilder::with_capacity(&entry.data_type(), self.chunk_size)) .collect(); - self.builders = Some(builders); } } @@ -85,16 +78,9 @@ impl FixedSizeChunkAccumulator { let remaining_capacity = self.chunk_size - self.builder_rows; let rows_to_copy = (block_rows - offset).min(remaining_capacity); - let builders = self.builders.as_mut().unwrap(); - if offset == 0 && rows_to_copy == block_rows { - for (builder, col) in builders.iter_mut().zip(columns.iter()) { - builder.append_column(col); - } - } else { - for (builder, col) in builders.iter_mut().zip(columns.iter()) { - let sliced = col.slice(offset..offset + rows_to_copy); - builder.append_column(&sliced); - } + for (builder, col) in self.builders.iter_mut().zip(columns.iter()) { + let sliced = col.slice(offset..offset + rows_to_copy); + builder.append_column(&sliced); } self.builder_rows += 
rows_to_copy; @@ -107,9 +93,10 @@ impl FixedSizeChunkAccumulator { } fn build_chunk(&mut self) -> DataBlock { - let builders = self.builders.take().unwrap(); - let num_rows = self.builder_rows; self.builder_rows = 0; + let num_rows = self.builder_rows; + + let builders = std::mem::take(&mut self.builders); // Reinitialize builders with same column types for next chunk. let mut new_builders = Vec::with_capacity(builders.len()); @@ -119,7 +106,7 @@ impl FixedSizeChunkAccumulator { columns.push(BlockEntry::from(b.build())); new_builders.push(ColumnBuilder::with_capacity(&dt, self.chunk_size)); } - self.builders = Some(new_builders); + self.builders = new_builders; DataBlock::new(columns, num_rows) } @@ -151,7 +138,7 @@ mod tests { let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); assert!(chunks.is_empty()); - let last = acc.flush().unwrap(); + let last = acc.finalize().unwrap(); assert_eq!(extract_int_col(&last), vec![1, 2, 3]); } @@ -162,7 +149,7 @@ mod tests { assert_eq!(chunks.len(), 1); assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); - assert!(acc.flush().is_none()); + assert!(acc.finalize().is_none()); } #[test] @@ -173,7 +160,7 @@ mod tests { assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); assert_eq!(extract_int_col(&chunks[1]), vec![4, 5, 6]); - let last = acc.flush().unwrap(); + let last = acc.finalize().unwrap(); assert_eq!(extract_int_col(&last), vec![7]); } @@ -185,14 +172,14 @@ mod tests { assert_eq!(chunks.len(), 1); assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3, 4]); - let last = acc.flush().unwrap(); + let last = acc.finalize().unwrap(); assert_eq!(extract_int_col(&last), vec![5]); } #[test] fn test_flush_empty() { let mut acc = FixedSizeChunkAccumulator::new(4); - assert!(acc.flush().is_none()); + assert!(acc.finalize().is_none()); } #[test] @@ -200,7 +187,7 @@ mod tests { let mut acc = FixedSizeChunkAccumulator::new(4); acc.accumulate(make_int_block(vec![1, 2, 3])); acc.reset(); - assert!(acc.flush().is_none()); + 
assert!(acc.finalize().is_none()); } #[test] @@ -215,7 +202,7 @@ mod tests { assert_eq!(chunks[0].num_rows(), 3); assert_eq!(chunks[0].num_columns(), 2); - let last = acc.flush().unwrap(); + let last = acc.finalize().unwrap(); assert_eq!(last.num_rows(), 2); assert_eq!(last.num_columns(), 2); @@ -231,14 +218,14 @@ mod tests { let mut acc = FixedSizeChunkAccumulator::new(2); let chunks = acc.accumulate(make_int_block(vec![1, 2])); assert_eq!(chunks.len(), 1); - assert!(acc.flush().is_none()); + assert!(acc.finalize().is_none()); // Accumulator can be reused after flush let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); assert_eq!(chunks.len(), 1); assert_eq!(extract_int_col(&chunks[0]), vec![3, 4]); - let last = acc.flush().unwrap(); + let last = acc.finalize().unwrap(); assert_eq!(extract_int_col(&last), vec![5]); } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs index bf3229d7264cd..ce1e385cb7fdf 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs @@ -91,8 +91,6 @@ impl CompactJoinHashTable { } } - /// Insert a chunk of rows starting at `row_offset` (1-based). - /// `hashes[i]` is the hash for the row at flat index `row_offset + i`. 
pub fn insert_chunk(&mut self, hashes: &[u64], row_offset: usize) { let mask = self.bucket_mask; for (i, h) in hashes.iter().enumerate() { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs index ec18ed228e460..dc49966d019b1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use databend_common_base::base::ProgressValues; use databend_common_exception::Result; use databend_common_expression::Column; use databend_common_expression::ColumnVec; @@ -49,27 +50,21 @@ pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr { /// Per-thread build state for partitioned hash join. pub struct PartitionedBuild { - /// Build blocks, each strictly CHUNK_SIZE rows (last may be shorter). pub chunks: Vec, - /// Per-chunk build key states for key comparison during probe. + pub method: HashMethodKind, pub build_keys_states: Vec, - /// Compact hash table (u32 row indices, 1-based). pub hash_table: CompactJoinHashTable, - /// Build columns in ColumnVec format for fast gather. + pub columns: Vec, - /// Column types for build side. pub column_types: Vec, - /// Total build rows. + pub num_rows: usize, - /// Hash method for key extraction. - pub method: HashMethodKind, - /// Join descriptor. + pub build_block_idx: usize, + + pub visited: Vec, pub desc: Arc, - /// Function context. pub function_ctx: FunctionContext, - /// Visited bitmap for right outer/semi/anti joins (1-based indexing, index 0 unused). - pub visited: Vec, - /// Fixed-size chunk accumulator using mutable columns. 
+ accumulator: FixedSizeChunkAccumulator, } @@ -91,94 +86,113 @@ impl PartitionedBuild { function_ctx, visited: Vec::new(), accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), + build_block_idx: 0, } } - /// Push a build block. None signals end of input. - pub fn add_block(&mut self, data: Option) -> Result<()> { - match data { - Some(block) if !block.is_empty() => { - let flushed = self.accumulator.accumulate(block); - for chunk in flushed { - self.ingest_chunk(chunk)?; - } - } - _ => { - if let Some(chunk) = self.accumulator.flush() { - self.ingest_chunk(chunk)?; - } + pub fn add_block(&mut self, data: Option) -> Result<()> { + let Some(data_block) = data else { + if let Some(chunk) = self.accumulator.finalize() { + self.ingest_chunk::(chunk)?; } + + return Ok(()); + }; + + let data_block = self.prepare_data(data_block)?; + for ready_block in self.accumulator.accumulate(data_block) { + self.ingest_chunk::(ready_block)?; } + Ok(()) } - /// Process a flushed chunk: compute KeysState immediately. 
- fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { let num_rows = chunk.num_rows(); - with_hash_method!(|T| match &self.method { - HashMethodKind::T(method) => { - let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; - let mut keys_block = DataBlock::new(keys_entries, num_rows); - self.desc.remove_keys_nullable(&mut keys_block); - let keys = ProjectedBlock::from(keys_block.columns()); - let keys_state = method.build_keys_state(keys, num_rows)?; - self.build_keys_states.push(keys_state); - } + let mut columns = chunk.take_columns(); + let data_columns = columns.split_off(self.desc.build_keys.len()); + let keys = ProjectedBlock::from(&columns); + + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, }); + self.num_rows += num_rows; - self.chunks.push(chunk); + self.build_keys_states.push(keys_state); + self.chunks.push(DataBlock::new(data_columns, num_rows)); Ok(()) } - /// Finalize build: build hash table chunk by chunk, extract ColumnVec. - pub fn final_build(&mut self) -> Result<()> { + fn prepare_data(&self, mut chunk: DataBlock) -> Result { + let num_rows = chunk.num_rows(); + + let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, num_rows); + + chunk = chunk.project(&self.desc.build_projection); + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + + chunk = match SCAN_MAP { + true => { + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + DataBlock::concat(&[chunk.filter_with_bitmap(&bitmap)?, null_keys])? 
+ } + false => chunk.filter_with_bitmap(&bitmap)?, + }; + } + } + + self.desc.remove_keys_nullable(&mut keys_block); + keys_block.merge_block(chunk); + Ok(keys_block) + } + + pub fn final_build(&mut self) -> Result> { if self.num_rows == 0 { - return Ok(()); + return Ok(None); } - // Allocate hash table with known total rows - self.hash_table = CompactJoinHashTable::new(self.num_rows); + if self.build_block_idx == 0 { + // Allocate hash table with known total rows + self.hash_table = CompactJoinHashTable::new(self.num_rows); + + if let Some(first_chunk) = self.chunks.first() { + self.column_types = (0..first_chunk.num_columns()) + .map(|offset| first_chunk.get_by_offset(offset).data_type()) + .collect(); + + let num_cols = first_chunk.num_columns(); + let mut columns = Vec::with_capacity(num_cols); + for offset in 0..num_cols { + let full_columns: Vec = self + .chunks + .iter() + .map(|chunk| chunk.get_by_offset(offset).to_column()) + .collect(); + columns.push(Column::take_downcast_column_vec(&full_columns)); + } + self.columns = columns; + } + } + + let row_offset = CHUNK_SIZE * self.build_block_idx + 1; + let keys_state = &self.build_keys_states[self.build_block_idx]; - // Process one chunk at a time: compute hashes, insert into table - let mut row_offset = 1; // 1-based indexing with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => { - for keys_state in &self.build_keys_states { - let mut hashes = Vec::new(); - method.build_keys_hashes(keys_state, &mut hashes); - self.hash_table.insert_chunk(&hashes, row_offset); - row_offset += hashes.len(); - } + let mut hashes = Vec::new(); + method.build_keys_hashes(keys_state, &mut hashes); + self.hash_table.insert_chunk(&hashes, row_offset)?; + self.build_block_idx += 1; } }); - // Project build columns and extract ColumnVec - if let Some(first_chunk) = self.chunks.first() { - let first_projected = first_chunk.clone().project(&self.desc.build_projection); - self.column_types = 
(0..first_projected.num_columns()) - .map(|offset| first_projected.get_by_offset(offset).data_type()) - .collect(); - - let num_cols = first_projected.num_columns(); - let mut columns = Vec::with_capacity(num_cols); - for offset in 0..num_cols { - let full_columns: Vec = self - .chunks - .iter() - .map(|chunk| { - chunk - .clone() - .project(&self.desc.build_projection) - .get_by_offset(offset) - .to_column() - }) - .collect(); - columns.push(Column::take_downcast_column_vec(&full_columns)); - } - self.columns = columns; + match self.build_block_idx == self.chunks.len() { + true => Ok(None), + false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), } - - Ok(()) } pub fn reset(&mut self) { From 5e06faee11f26785434f96e78db9cb4bd2c2e82a Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 22 Mar 2026 19:52:20 +0800 Subject: [PATCH 05/38] z --- .../memory/partitioned/partitioned_build.rs | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs index dc49966d019b1..649ce2c14755a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -90,10 +90,10 @@ impl PartitionedBuild { } } - pub fn add_block(&mut self, data: Option) -> Result<()> { + pub fn add_block(&mut self, data: Option) -> Result<()> { let Some(data_block) = data else { if let Some(chunk) = self.accumulator.finalize() { - self.ingest_chunk::(chunk)?; + self.ingest_chunk(chunk)?; } return Ok(()); @@ -101,13 +101,13 @@ impl PartitionedBuild { let data_block = self.prepare_data(data_block)?; for ready_block in self.accumulator.accumulate(data_block) { - self.ingest_chunk::(ready_block)?; + 
self.ingest_chunk(ready_block)?; } Ok(()) } - fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { let num_rows = chunk.num_rows(); let mut columns = chunk.take_columns(); let data_columns = columns.split_off(self.desc.build_keys.len()); @@ -123,7 +123,7 @@ impl PartitionedBuild { Ok(()) } - fn prepare_data(&self, mut chunk: DataBlock) -> Result { + fn prepare_data(&self, mut chunk: DataBlock) -> Result { let num_rows = chunk.num_rows(); let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; @@ -133,14 +133,7 @@ impl PartitionedBuild { if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { if bitmap.true_count() != bitmap.len() { keys_block = keys_block.filter_with_bitmap(&bitmap)?; - - chunk = match SCAN_MAP { - true => { - let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; - DataBlock::concat(&[chunk.filter_with_bitmap(&bitmap)?, null_keys])? - } - false => chunk.filter_with_bitmap(&bitmap)?, - }; + chunk = chunk.filter_with_bitmap(&bitmap)?; } } @@ -184,7 +177,7 @@ impl PartitionedBuild { HashMethodKind::T(method) => { let mut hashes = Vec::new(); method.build_keys_hashes(keys_state, &mut hashes); - self.hash_table.insert_chunk(&hashes, row_offset)?; + self.hash_table.insert_chunk(&hashes, row_offset); self.build_block_idx += 1; } }); From 1036f529aa7b4cda3c1368bc36ca58d2021921b6 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 22 Mar 2026 20:10:16 +0800 Subject: [PATCH 06/38] z --- src/query/service/src/physical_plans/physical_hash_join.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 6baaadda4623d..23524ab2f0114 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -497,7 +497,7 @@ impl HashJoin { build_input.clone(), 
probe_input.clone(), joined_output.clone(), - if self.broadcast_id.is_some() { + if self.broadcast_id.is_some() && self.join_type == JoinType::Inner { factory.create_partitioned_join(self.join_type)? } else { factory.create_hash_join(self.join_type, 0)? From 8d29c42bb64df8e8a593813187aaf94df927e6f6 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 22 Mar 2026 21:24:32 +0800 Subject: [PATCH 07/38] z --- .../new_hash_join/memory/partitioned/chunk_accumulator.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs index 5cbb27bcbfd04..0a0999fa25a5d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs @@ -93,7 +93,6 @@ impl FixedSizeChunkAccumulator { } fn build_chunk(&mut self) -> DataBlock { - self.builder_rows = 0; let num_rows = self.builder_rows; let builders = std::mem::take(&mut self.builders); @@ -106,6 +105,8 @@ impl FixedSizeChunkAccumulator { columns.push(BlockEntry::from(b.build())); new_builders.push(ColumnBuilder::with_capacity(&dt, self.chunk_size)); } + + self.builder_rows = 0; self.builders = new_builders; DataBlock::new(columns, num_rows) From bc7aae79f44e72223692f4702cb6e222686129df Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Sun, 22 Mar 2026 23:04:32 +0800 Subject: [PATCH 08/38] z --- .../src/physical_plans/physical_hash_join.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 23524ab2f0114..22cace21abff5 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ 
b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -481,6 +481,8 @@ impl HashJoin { debug_assert_eq!(build_sinks.len(), probe_sinks.len()); + let use_partitioned_join = self.broadcast_id.is_some() && self.join_type == JoinType::Inner; + let barrier = databend_common_base::base::Barrier::new(output_len); let stage_sync_barrier = Arc::new(barrier); let mut join_sinks = Vec::with_capacity(output_len * 2); @@ -497,7 +499,7 @@ impl HashJoin { build_input.clone(), probe_input.clone(), joined_output.clone(), - if self.broadcast_id.is_some() && self.join_type == JoinType::Inner { + if use_partitioned_join { factory.create_partitioned_join(self.join_type)? } else { factory.create_hash_join(self.join_type, 0)? @@ -518,11 +520,12 @@ impl HashJoin { let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); builder.main_pipeline.add_pipe(join_pipe); - // In the case of spilling, we need to share state among multiple threads - // Quickly fetch all data from this round to quickly start the next round - builder - .main_pipeline - .resize(builder.main_pipeline.output_len(), true) + if !use_partitioned_join { + let item_size = builder.main_pipeline.output_len(); + builder.main_pipeline.resize(item_size, true)?; + } + + Ok(()) } fn join_factory( From 221d591a2026563a6b17cfe6b48c8a0ae5fe719a Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 23 Mar 2026 14:32:14 +0800 Subject: [PATCH 09/38] z --- .../src/physical_plans/physical_exchange.rs | 11 +++++++++++ .../physical_plans/physical_exchange_sink.rs | 11 +++++++++++ .../physical_plans/physical_exchange_source.rs | 17 +++++++++++++++++ .../src/physical_plans/physical_hash_join.rs | 18 ++++++++++++++++-- .../src/physical_plans/physical_plan.rs | 9 +++++++++ .../src/physical_plans/physical_range_join.rs | 5 +++++ .../src/physical_plans/physical_union_all.rs | 12 ++++++++++++ .../physical_window_partition.rs | 5 +++++ .../src/schedulers/fragments/fragmenter.rs | 4 ++++ 
.../sql/src/executor/physical_plans/common.rs | 11 ++++++++++- 10 files changed, 100 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_exchange.rs b/src/query/service/src/physical_plans/physical_exchange.rs index d2c691dde0870..0fb0f69014d7d 100644 --- a/src/query/service/src/physical_plans/physical_exchange.rs +++ b/src/query/service/src/physical_plans/physical_exchange.rs @@ -21,6 +21,7 @@ use databend_common_expression::RemoteExpr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::optimizer::ir::SExpr; @@ -75,6 +76,16 @@ impl IPhysicalPlan for Exchange { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_sink.rs b/src/query/service/src/physical_plans/physical_exchange_sink.rs index 17f2e3b51d9e5..9c110e77b5682 100644 --- a/src/query/service/src/physical_plans/physical_exchange_sink.rs +++ b/src/query/service/src/physical_plans/physical_exchange_sink.rs @@ -18,6 +18,7 @@ use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::RemoteExpr; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use 
crate::physical_plans::format::ExchangeSinkFormatter; @@ -84,6 +85,16 @@ impl IPhysicalPlan for ExchangeSink { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_source.rs b/src/query/service/src/physical_plans/physical_exchange_source.rs index 79076f92fb2b4..772e32978026a 100644 --- a/src/query/service/src/physical_plans/physical_exchange_source.rs +++ b/src/query/service/src/physical_plans/physical_exchange_source.rs @@ -16,7 +16,10 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_expression::RemoteExpr; use databend_common_pipeline::core::PlanScope; +use databend_common_sql::executor::physical_plans::DataDistribution; +use databend_common_sql::executor::physical_plans::FragmentKind; use crate::physical_plans::format::ExchangeSourceFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -35,6 +38,8 @@ pub struct ExchangeSource { // Fragment ID of source fragment pub source_fragment_id: usize, pub query_id: String, + pub kind: FragmentKind, + pub keys: Vec, } #[typetag::serde] @@ -63,6 +68,16 @@ impl IPhysicalPlan for ExchangeSource { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => 
DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } @@ -74,6 +89,8 @@ impl IPhysicalPlan for ExchangeSource { schema: self.schema.clone(), source_fragment_id: self.source_fragment_id, query_id: self.query_id.clone(), + kind: self.kind.clone(), + keys: self.keys.clone(), }) } diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 22cace21abff5..d6d7d525b702e 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -43,6 +43,7 @@ use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Join; @@ -195,6 +196,10 @@ impl IPhysicalPlan for HashJoin { Ok(HashJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + self.probe.output_data_distribution() + } + fn get_desc(&self) -> Result { let mut conditions = self .build_keys @@ -481,7 +486,16 @@ impl HashJoin { debug_assert_eq!(build_sinks.len(), probe_sinks.len()); - let use_partitioned_join = self.broadcast_id.is_some() && self.join_type == JoinType::Inner; + // let use_partitioned_join = self.join_type == JoinType::Inner && self.broadcast_id.is_some(); + let use_partitioned_join = self.join_type == JoinType::Inner + && matches!( + self.build.output_data_distribution(), + DataDistribution::GlobalHash(_) + ) + && matches!( + self.probe.output_data_distribution(), + DataDistribution::GlobalHash(_) + ); let barrier = databend_common_base::base::Barrier::new(output_len); let stage_sync_barrier = Arc::new(barrier); @@ -1416,7 +1430,7 @@ impl PhysicalPlanBuilder { } for scalar in 
&join.non_equi_conditions { - predicates.push(resolve_scalar(scalar, &merged).map_err(|err|{ + predicates.push(resolve_scalar(scalar, &merged).map_err(|err| { err.add_message(format!( "Failed build nested loop filter schema: {merged:#?} non_equi_conditions: {:#?}", join.non_equi_conditions diff --git a/src/query/service/src/physical_plans/physical_plan.rs b/src/query/service/src/physical_plans/physical_plan.rs index 27cc04c134d07..aa828044687aa 100644 --- a/src/query/service/src/physical_plans/physical_plan.rs +++ b/src/query/service/src/physical_plans/physical_plan.rs @@ -30,6 +30,7 @@ use databend_common_expression::DataSchemaRef; use databend_common_pipeline::core::PlanProfile; use databend_common_pipeline::core::PlanScope; use databend_common_sql::Metadata; +use databend_common_sql::executor::physical_plans::DataDistribution; use dyn_clone::DynClone; use serde::Deserializer; use serde::Serializer; @@ -168,6 +169,14 @@ pub trait IPhysicalPlan: DynClone + Debug + Send + Sync + 'static { .any(|child| child.is_warehouse_distributed_plan()) } + #[recursive::recursive] + fn output_data_distribution(&self) -> DataDistribution { + match self.children().next() { + None => DataDistribution::Random, + Some(child) => child.output_data_distribution(), + } + } + fn display_in_profile(&self) -> bool { true } diff --git a/src/query/service/src/physical_plans/physical_range_join.rs b/src/query/service/src/physical_plans/physical_range_join.rs index 4ed7b99c7f33f..b5ce47e6f2255 100644 --- a/src/query/service/src/physical_plans/physical_range_join.rs +++ b/src/query/service/src/physical_plans/physical_range_join.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::TypeCheck; use databend_common_sql::binder::JoinPredicate; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::RelExpr; use databend_common_sql::optimizer::ir::RelationalProperty; use 
databend_common_sql::optimizer::ir::SExpr; @@ -95,6 +96,10 @@ impl IPhysicalPlan for RangeJoin { Ok(RangeJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn get_desc(&self) -> Result { let mut condition = self .conditions diff --git a/src/query/service/src/physical_plans/physical_union_all.rs b/src/query/service/src/physical_plans/physical_union_all.rs index 1965739f66fe9..6c73f614feca5 100644 --- a/src/query/service/src/physical_plans/physical_union_all.rs +++ b/src/query/service/src/physical_plans/physical_union_all.rs @@ -28,8 +28,10 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; +use recursive::recursive; use crate::physical_plans::Exchange; use crate::physical_plans::PhysicalPlanBuilder; @@ -89,6 +91,16 @@ impl IPhysicalPlan for UnionAll { Ok(UnionAllFormatter::create(self)) } + #[recursive] + fn output_data_distribution(&self) -> DataDistribution { + let left_dist = self.left.output_data_distribution(); + let right_dist = self.right.output_data_distribution(); + match left_dist == right_dist { + true => left_dist, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { Ok(self .left_outputs diff --git a/src/query/service/src/physical_plans/physical_window_partition.rs b/src/query/service/src/physical_plans/physical_window_partition.rs index 734c5299258e7..bc12f224d3de7 100644 --- a/src/query/service/src/physical_plans/physical_window_partition.rs +++ b/src/query/service/src/physical_plans/physical_window_partition.rs @@ -24,6 +24,7 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline::core::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use 
databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_storages_common_cache::TempDirManager; @@ -77,6 +78,10 @@ impl IPhysicalPlan for WindowPartition { Ok(WindowPartitionFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index 0cbce7f5e7a18..b20a3738441d3 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -283,6 +283,8 @@ impl DeriveHandle for FragmentDeriveHandle { let plan_id = v.get_id(); let source_fragment_id = self.ctx.get_fragment_id(); + let exchange_kind = exchange.kind.clone(); + let exchange_keys = exchange.keys.clone(); let plan: PhysicalPlan = PhysicalPlan::new(ExchangeSink { input, @@ -327,6 +329,8 @@ impl DeriveHandle for FragmentDeriveHandle { source_fragment_id, meta: PhysicalPlanMeta::with_plan_id("ExchangeSource", plan_id), + kind: exchange_kind, + keys: exchange_keys, })); } diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 3023ae01d72b2..14173aad0e89d 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -71,10 +71,19 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, - // Ping-pong based hash shuffle (used by hash join) + // Ping-pong based hash shuffle GlobalShuffle, } +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum DataDistribution { + Random, + NodeHash(Vec), + GlobalHash(Vec), + Broadcast, + Serial, +} + #[derive(Clone, 
Debug, serde::Serialize, serde::Deserialize, Copy)] pub enum MutationKind { Delete, From e2b965fa047afc81dfa74ce511e54b6475dbdf24 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Tue, 24 Mar 2026 18:13:52 +0800 Subject: [PATCH 10/38] z --- .../physical_add_stream_column.rs | 5 +++ .../physical_aggregate_expand.rs | 5 +++ .../physical_aggregate_final.rs | 8 ++++ .../physical_aggregate_partial.rs | 5 +++ .../src/physical_plans/physical_async_func.rs | 5 +++ .../src/physical_plans/physical_broadcast.rs | 9 +++++ .../src/physical_plans/physical_cache_scan.rs | 5 +++ .../physical_column_mutation.rs | 5 +++ .../physical_plans/physical_commit_sink.rs | 5 +++ .../physical_plans/physical_compact_source.rs | 5 +++ .../physical_constant_table_scan.rs | 5 +++ .../physical_copy_into_location.rs | 5 +++ .../physical_copy_into_table.rs | 5 +++ .../physical_plans/physical_cte_consumer.rs | 5 +++ .../physical_distributed_insert_select.rs | 5 +++ .../physical_plans/physical_eval_scalar.rs | 5 +++ .../physical_expression_scan.rs | 5 +++ .../src/physical_plans/physical_filter.rs | 5 +++ .../src/physical_plans/physical_hash_join.rs | 20 +++++++++- .../src/physical_plans/physical_limit.rs | 5 +++ .../physical_materialized_cte.rs | 5 +++ .../physical_multi_table_insert.rs | 37 +++++++++++++++++++ .../src/physical_plans/physical_mutation.rs | 5 +++ .../physical_mutation_into_organize.rs | 5 +++ .../physical_mutation_into_split.rs | 5 +++ .../physical_mutation_manipulate.rs | 5 +++ .../physical_mutation_source.rs | 5 +++ .../src/physical_plans/physical_plan.rs | 8 +--- .../physical_plans/physical_project_set.rs | 5 +++ .../src/physical_plans/physical_r_cte_scan.rs | 5 +++ .../src/physical_plans/physical_recluster.rs | 9 +++++ .../physical_replace_async_source.rs | 5 +++ .../physical_replace_deduplicate.rs | 5 +++ .../physical_plans/physical_replace_into.rs | 5 +++ .../src/physical_plans/physical_row_fetch.rs | 5 +++ .../physical_plans/physical_secure_filter.rs | 5 +++ 
.../src/physical_plans/physical_sequence.rs | 5 +++ .../src/physical_plans/physical_sort.rs | 10 +++++ .../src/physical_plans/physical_table_scan.rs | 5 +++ .../src/physical_plans/physical_udf.rs | 5 +++ .../src/physical_plans/physical_window.rs | 9 +++++ .../partitioned/compact_probe_stream.rs | 25 ++++++++++++- .../memory/partitioned/inner_join.rs | 3 +- .../memory/partitioned/left_join.rs | 3 +- .../memory/partitioned/left_join_anti.rs | 3 +- .../memory/partitioned/left_join_semi.rs | 3 +- .../memory/partitioned/partitioned_build.rs | 1 + .../memory/partitioned/right_join.rs | 8 ++-- .../memory/partitioned/right_join_anti.rs | 8 ++-- .../memory/partitioned/right_join_semi.rs | 8 ++-- .../flight/v1/packets/packet_fragment.rs | 5 +++ 51 files changed, 314 insertions(+), 28 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_add_stream_column.rs b/src/query/service/src/physical_plans/physical_add_stream_column.rs index cf11d49dad7b9..50ec67fa873f0 100644 --- a/src/query/service/src/physical_plans/physical_add_stream_column.rs +++ b/src/query/service/src/physical_plans/physical_add_stream_column.rs @@ -36,6 +36,7 @@ use databend_common_sql::StreamContext; use databend_common_sql::Symbol; use databend_common_sql::Visibility; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::BoundColumnRef; use databend_common_sql::plans::ConstantExpr; use databend_common_sql::plans::FunctionCall; @@ -69,6 +70,10 @@ impl IPhysicalPlan for AddStreamColumn { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_aggregate_expand.rs b/src/query/service/src/physical_plans/physical_aggregate_expand.rs index ea747aace91eb..0585b1ea85503 100644 --- 
a/src/query/service/src/physical_plans/physical_aggregate_expand.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_expand.rs @@ -22,6 +22,7 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::GroupingSets; use crate::physical_plans::explain::PlanStatsInfo; @@ -58,6 +59,10 @@ impl IPhysicalPlan for AggregateExpand { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_aggregate_final.rs b/src/query/service/src/physical_plans/physical_aggregate_final.rs index 67008d6f96690..f37b41671fd70 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_final.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_final.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Aggregate; @@ -111,6 +112,13 @@ impl IPhysicalPlan for AggregateFinal { Ok(AggregateFinalFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.group_by.is_empty() { + true => DataDistribution::Serial, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { Ok(self.agg_funcs.iter().map(|x| x.display.clone()).join(", ")) } diff --git 
a/src/query/service/src/physical_plans/physical_aggregate_partial.rs b/src/query/service/src/physical_plans/physical_aggregate_partial.rs index c2d39d7430af3..247099bf3405d 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_partial.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_partial.rs @@ -32,6 +32,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::sorts::TransformRankLimitSort; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_storage::DataOperator; use itertools::Itertools; @@ -81,6 +82,10 @@ impl IPhysicalPlan for AggregatePartial { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_async_func.rs b/src/query/service/src/physical_plans/physical_async_func.rs index b32fd60af7ddd..08ec88a4edc95 100644 --- a/src/query/service/src/physical_plans/physical_async_func.rs +++ b/src/query/service/src/physical_plans/physical_async_func.rs @@ -24,6 +24,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::ColumnSet; use databend_common_sql::ScalarExpr; use databend_common_sql::binder::AsyncFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -59,6 +60,10 @@ impl IPhysicalPlan for AsyncFunction { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = 
self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_broadcast.rs b/src/query/service/src/physical_plans/physical_broadcast.rs index 88f961c5103ed..37a59c46261e7 100644 --- a/src/query/service/src/physical_plans/physical_broadcast.rs +++ b/src/query/service/src/physical_plans/physical_broadcast.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use super::Exchange; @@ -48,6 +49,10 @@ impl IPhysicalPlan for BroadcastSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(BroadcastSource { @@ -88,6 +93,10 @@ impl IPhysicalPlan for BroadcastSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_cache_scan.rs b/src/query/service/src/physical_plans/physical_cache_scan.rs index 0a01747df77e3..5110acb590559 100644 --- a/src/query/service/src/physical_plans/physical_cache_scan.rs +++ b/src/query/service/src/physical_plans/physical_cache_scan.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CacheSource; use crate::physical_plans::format::CacheScanFormatter; @@ -59,6 +60,10 @@ impl IPhysicalPlan for CacheScan { Ok(self.output_schema.clone()) } + fn 
output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(CacheScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_column_mutation.rs b/src/query/service/src/physical_plans/physical_column_mutation.rs index c1bb9b709ec8b..8fa9af0fd175e 100644 --- a/src/query/service/src/physical_plans/physical_column_mutation.rs +++ b/src/query/service/src/physical_plans/physical_column_mutation.rs @@ -25,6 +25,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::TransformSerializeBlock; @@ -65,6 +66,10 @@ impl IPhysicalPlan for ColumnMutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_commit_sink.rs b/src/query/service/src/physical_plans/physical_commit_sink.rs index c8f584484b0f0..59303d641a1e7 100644 --- a/src/query/service/src/physical_plans/physical_commit_sink.rs +++ b/src/query/service/src/physical_plans/physical_commit_sink.rs @@ -23,6 +23,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::UpdateStreamMetaReq; use databend_common_pipeline::core::ExecutionInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use 
databend_common_sql::plans::TruncateMode; use databend_common_storages_fuse::FuseTable; @@ -71,6 +72,10 @@ impl IPhysicalPlan for CommitSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_compact_source.rs b/src/query/service/src/physical_plans/physical_compact_source.rs index 52fa41a4b8533..94593db34eb87 100644 --- a/src/query/service/src/physical_plans/physical_compact_source.rs +++ b/src/query/service/src/physical_plans/physical_compact_source.rs @@ -31,6 +31,7 @@ use databend_common_pipeline::sources::EmptySource; use databend_common_pipeline::sources::PrefetchAsyncSourcer; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; @@ -72,6 +73,10 @@ impl IPhysicalPlan for CompactSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(CompactSource { diff --git a/src/query/service/src/physical_plans/physical_constant_table_scan.rs b/src/query/service/src/physical_plans/physical_constant_table_scan.rs index 2024c4c5fe1d1..c568618b1ca67 100644 --- a/src/query/service/src/physical_plans/physical_constant_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_constant_table_scan.rs @@ -21,6 +21,7 @@ use databend_common_expression::DataSchemaRef; use databend_common_pipeline::sources::OneBlockSource; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::format::ConstantTableScanFormatter; @@ -56,6 +57,10 @@ impl IPhysicalPlan for ConstantTableScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ConstantTableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_copy_into_location.rs b/src/query/service/src/physical_plans/physical_copy_into_location.rs index 873dcdd3e4f16..678b8d81d3263 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_location.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_location.rs @@ -27,6 +27,7 @@ use databend_common_expression::TableSchemaRef; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_stage::StageSinkTable; use databend_storages_common_stage::CopyIntoLocationInfo; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -60,6 +61,10 @@ impl IPhysicalPlan for CopyIntoLocation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![ diff --git a/src/query/service/src/physical_plans/physical_copy_into_table.rs b/src/query/service/src/physical_plans/physical_copy_into_table.rs index a5b3bdaa34e3a..2c34ec02500e8 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_table.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_table.rs @@ -23,6 +23,7 @@ use databend_common_expression::DataSchemaRefExt; use databend_common_expression::Scalar; use databend_common_meta_app::schema::TableInfo; use 
databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CopyIntoTableMode; use databend_common_sql::plans::ValidationMode; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -64,6 +65,10 @@ impl IPhysicalPlan for CopyIntoTable { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![])) diff --git a/src/query/service/src/physical_plans/physical_cte_consumer.rs b/src/query/service/src/physical_plans/physical_cte_consumer.rs index 19f399c6b4fbb..252c52fddb057 100644 --- a/src/query/service/src/physical_plans/physical_cte_consumer.rs +++ b/src/query/service/src/physical_plans/physical_cte_consumer.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataField; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::IPhysicalPlan; use crate::physical_plans::PhysicalPlan; @@ -62,6 +63,10 @@ impl IPhysicalPlan for MaterializeCTERef { Ok(self.cte_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(MaterializeCTERef { diff --git a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs index 3c7280778f4a6..795744908abfb 100644 --- a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs +++ b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs @@ -21,6 +21,7 @@ use databend_common_meta_app::schema::TableInfo; use 
databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for DistributedInsertSelect { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_eval_scalar.rs b/src/query/service/src/physical_plans/physical_eval_scalar.rs index 92f411baf6d2d..0b2ca0667c161 100644 --- a/src/query/service/src/physical_plans/physical_eval_scalar.rs +++ b/src/query/service/src/physical_plans/physical_eval_scalar.rs @@ -33,6 +33,7 @@ use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::Matcher; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Filter; @@ -78,6 +79,10 @@ impl IPhysicalPlan for EvalScalar { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { if self.exprs.is_empty() { diff --git a/src/query/service/src/physical_plans/physical_expression_scan.rs b/src/query/service/src/physical_plans/physical_expression_scan.rs index 8d13ffbe9ed7f..955a5885d2f31 100644 --- a/src/query/service/src/physical_plans/physical_expression_scan.rs +++ b/src/query/service/src/physical_plans/physical_expression_scan.rs @@ -22,6 +22,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use 
databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -59,6 +60,10 @@ impl IPhysicalPlan for ExpressionScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_filter.rs b/src/query/service/src/physical_plans/physical_filter.rs index 1213b43e7f793..42db796e6f9d9 100644 --- a/src/query/service/src/physical_plans/physical_filter.rs +++ b/src/query/service/src/physical_plans/physical_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -62,6 +63,10 @@ impl IPhysicalPlan for Filter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index d6d7d525b702e..ff4048e1d9348 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -197,7 +197,24 @@ impl IPhysicalPlan for HashJoin { } fn output_data_distribution(&self) -> DataDistribution { - self.probe.output_data_distribution() + let build_dist = 
self.build.output_data_distribution(); + let probe_dist = self.probe.output_data_distribution(); + + let can_preserve_global_hash = self.join_type == JoinType::Inner + && matches!( + &build_dist, + DataDistribution::GlobalHash(keys) if keys == &self.build_keys + ) + && matches!( + &probe_dist, + DataDistribution::GlobalHash(keys) if keys == &self.probe_keys + ); + + if can_preserve_global_hash { + probe_dist + } else { + DataDistribution::Random + } } fn get_desc(&self) -> Result { @@ -486,7 +503,6 @@ impl HashJoin { debug_assert_eq!(build_sinks.len(), probe_sinks.len()); - // let use_partitioned_join = self.join_type == JoinType::Inner && self.broadcast_id.is_some(); let use_partitioned_join = self.join_type == JoinType::Inner && matches!( self.build.output_data_distribution(), diff --git a/src/query/service/src/physical_plans/physical_limit.rs b/src/query/service/src/physical_plans/physical_limit.rs index 90e741c5772e1..bd5d93f466d3f 100644 --- a/src/query/service/src/physical_plans/physical_limit.rs +++ b/src/query/service/src/physical_plans/physical_limit.rs @@ -26,6 +26,7 @@ use databend_common_sql::ColumnEntry; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -75,6 +76,10 @@ impl IPhysicalPlan for Limit { Ok(LimitFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_materialized_cte.rs b/src/query/service/src/physical_plans/physical_materialized_cte.rs index f0d31445eb829..66e9a37b88b44 100644 --- a/src/query/service/src/physical_plans/physical_materialized_cte.rs +++ 
b/src/query/service/src/physical_plans/physical_materialized_cte.rs @@ -20,6 +20,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::Symbol; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -60,6 +61,10 @@ impl IPhysicalPlan for MaterializedCTE { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.input.output_schema() diff --git a/src/query/service/src/physical_plans/physical_multi_table_insert.rs b/src/query/service/src/physical_plans/physical_multi_table_insert.rs index 688e2d156d8e9..bffeebe447b66 100644 --- a/src/query/service/src/physical_plans/physical_multi_table_insert.rs +++ b/src/query/service/src/physical_plans/physical_multi_table_insert.rs @@ -39,6 +39,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::columns::TransformAddComputedColumns; use databend_common_pipeline_transforms::sorts::TransformSortPartial; use databend_common_sql::DefaultExprBinder; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::CommitMultiTableInsert; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -86,6 +87,10 @@ impl IPhysicalPlan for Duplicate { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(DuplicateFormatter::create(self)) } @@ -135,6 +140,10 @@ impl IPhysicalPlan for Shuffle { Box::new(std::iter::once(&mut self.input)) } + fn 
output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ShuffleFormatter::create(self)) } @@ -218,6 +227,10 @@ impl IPhysicalPlan for ChunkFilter { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFilterFormatter::create(self)) } @@ -283,6 +296,10 @@ impl IPhysicalPlan for ChunkEvalScalar { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkEvalScalarFormatter::create(self)) } @@ -355,6 +372,10 @@ impl IPhysicalPlan for ChunkCastSchema { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkCastSchemaFormatter::create(self)) } @@ -496,6 +517,10 @@ impl IPhysicalPlan for ChunkFillAndReorder { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFillAndReorderFormatter::create(self)) } @@ -645,6 +670,10 @@ impl IPhysicalPlan for ChunkAppendData { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkAppendDataFormatter::create(self)) } @@ -804,6 +833,10 @@ impl IPhysicalPlan for ChunkMerge { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ChunkMergeFormatter::create(self)) } @@ -876,6 +909,10 @@ impl IPhysicalPlan for ChunkCommitInsert { Box::new(std::iter::once(&mut self.input)) } + 
fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + fn derive(&self, mut children: Vec) -> PhysicalPlan { assert_eq!(children.len(), 1); let input = children.pop().unwrap(); diff --git a/src/query/service/src/physical_plans/physical_mutation.rs b/src/query/service/src/physical_plans/physical_mutation.rs index 30988393b8119..2025aeff75c55 100644 --- a/src/query/service/src/physical_plans/physical_mutation.rs +++ b/src/query/service/src/physical_plans/physical_mutation.rs @@ -55,6 +55,7 @@ use databend_common_sql::Visibility; use databend_common_sql::binder::MutationStrategy; use databend_common_sql::binder::MutationType; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::optimizer::ir::SExpr; @@ -120,6 +121,10 @@ impl IPhysicalPlan for Mutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs index b5770451d56bf..cdff50ce57401 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs +++ b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs @@ -16,6 +16,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::format::MutationOrganizeFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -44,6 +45,10 @@ impl IPhysicalPlan for MutationOrganize { &mut self.meta } + fn output_data_distribution(&self) -> 
DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_into_split.rs b/src/query/service/src/physical_plans/physical_mutation_into_split.rs index e86c70641ab64..cc3e6a5a13ca6 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_split.rs +++ b/src/query/service/src/physical_plans/physical_mutation_into_split.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_pipeline::core::Pipe; use databend_common_sql::IndexType; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::MutationSplitProcessor; use crate::physical_plans::format::MutationSplitFormatter; @@ -46,6 +47,10 @@ impl IPhysicalPlan for MutationSplit { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs index cad8392e9133a..a45d607c91f8a 100644 --- a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs +++ b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs @@ -24,6 +24,7 @@ use databend_common_expression::RemoteExpr; use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline::core::Pipe; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MatchExpr; use databend_common_storages_fuse::operations::MatchedSplitProcessor; use databend_common_storages_fuse::operations::MergeIntoNotMatchedProcessor; @@ -67,6 +68,10 @@ impl IPhysicalPlan for MutationManipulate { &mut self.meta } + fn 
output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_source.rs b/src/query/service/src/physical_plans/physical_mutation_source.rs index a6cb8911c0585..2d0ad0888f057 100644 --- a/src/query/service/src/physical_plans/physical_mutation_source.rs +++ b/src/query/service/src/physical_plans/physical_mutation_source.rs @@ -43,6 +43,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::StreamContext; use databend_common_sql::binder::MutationType; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseLazyPartInfo; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::SegmentLocation; @@ -93,6 +94,10 @@ impl IPhysicalPlan for MutationSource { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(MutationSourceFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_plan.rs b/src/query/service/src/physical_plans/physical_plan.rs index aa828044687aa..2a2e6e4e539c6 100644 --- a/src/query/service/src/physical_plans/physical_plan.rs +++ b/src/query/service/src/physical_plans/physical_plan.rs @@ -169,13 +169,7 @@ pub trait IPhysicalPlan: DynClone + Debug + Send + Sync + 'static { .any(|child| child.is_warehouse_distributed_plan()) } - #[recursive::recursive] - fn output_data_distribution(&self) -> DataDistribution { - match self.children().next() { - None => DataDistribution::Random, - Some(child) => child.output_data_distribution(), - } - } + fn output_data_distribution(&self) -> DataDistribution; fn display_in_profile(&self) -> bool { true diff --git 
a/src/query/service/src/physical_plans/physical_project_set.rs b/src/query/service/src/physical_plans/physical_project_set.rs index 4e66842b41d41..714d77d5c394e 100644 --- a/src/query/service/src/physical_plans/physical_project_set.rs +++ b/src/query/service/src/physical_plans/physical_project_set.rs @@ -27,6 +27,7 @@ use databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -64,6 +65,10 @@ impl IPhysicalPlan for ProjectSet { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_r_cte_scan.rs b/src/query/service/src/physical_plans/physical_r_cte_scan.rs index b0e251232e216..9c8afca3e5fe2 100644 --- a/src/query/service/src/physical_plans/physical_r_cte_scan.rs +++ b/src/query/service/src/physical_plans/physical_r_cte_scan.rs @@ -18,6 +18,7 @@ use std::fmt::Display; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::explain::PlanStatsInfo; @@ -54,6 +55,10 @@ impl IPhysicalPlan for RecursiveCteScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(RecursiveCteScan { diff --git a/src/query/service/src/physical_plans/physical_recluster.rs b/src/query/service/src/physical_plans/physical_recluster.rs index 
1c9475a15f187..5d400e182b5ed 100644 --- a/src/query/service/src/physical_plans/physical_recluster.rs +++ b/src/query/service/src/physical_plans/physical_recluster.rs @@ -40,6 +40,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::build_compact_block_no_split_pipeline; use databend_common_pipeline_transforms::columns::TransformAddStreamColumns; use databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_common_storages_fuse::FuseTable; @@ -80,6 +81,10 @@ impl IPhysicalPlan for Recluster { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(Recluster { @@ -279,6 +284,10 @@ impl IPhysicalPlan for HilbertPartition { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_async_source.rs b/src/query/service/src/physical_plans/physical_replace_async_source.rs index e57f78f04fb6f..5d1599aeaa74b 100644 --- a/src/query/service/src/physical_plans/physical_replace_async_source.rs +++ b/src/query/service/src/physical_plans/physical_replace_async_source.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_pipeline::sources::AsyncSourcer; use databend_common_sql::NameResolutionContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::InsertValue; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -48,6 +49,10 @@ impl 
IPhysicalPlan for ReplaceAsyncSourcer { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(ReplaceAsyncSourcer { diff --git a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs index 29f1ce6d2ec89..a9e8d3b35e757 100644 --- a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs +++ b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs @@ -31,6 +31,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_pipeline_transforms::build_compact_block_pipeline; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::ReplaceIntoProcessor; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceDeduplicate { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_into.rs b/src/query/service/src/physical_plans/physical_replace_into.rs index 9e502fb467a81..7c916ac4361cc 100644 --- a/src/query/service/src/physical_plans/physical_replace_into.rs +++ b/src/query/service/src/physical_plans/physical_replace_into.rs @@ -26,6 +26,7 @@ use databend_common_pipeline::core::InputPort; use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use 
databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceInto { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_row_fetch.rs b/src/query/service/src/physical_plans/physical_row_fetch.rs index 11fef083c340a..e1e687f55d8c8 100644 --- a/src/query/service/src/physical_plans/physical_row_fetch.rs +++ b/src/query/service/src/physical_plans/physical_row_fetch.rs @@ -25,6 +25,7 @@ use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline::core::PipeItem; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::row_fetch_processor; use itertools::Itertools; @@ -67,6 +68,10 @@ impl IPhysicalPlan for RowFetch { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let mut fields = self.input.output_schema()?.fields().clone(); diff --git a/src/query/service/src/physical_plans/physical_secure_filter.rs b/src/query/service/src/physical_plans/physical_secure_filter.rs index d3b382da91737..0f0572641c668 100644 --- a/src/query/service/src/physical_plans/physical_secure_filter.rs +++ b/src/query/service/src/physical_plans/physical_secure_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use sha2::Digest; use sha2::Sha256; @@ -64,6 +65,10 @@ impl IPhysicalPlan for SecureFilter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_sequence.rs b/src/query/service/src/physical_plans/physical_sequence.rs index 4642019c961f5..d201fa082a415 100644 --- a/src/query/service/src/physical_plans/physical_sequence.rs +++ b/src/query/service/src/physical_plans/physical_sequence.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for Sequence { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.right.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.right.output_schema() diff --git a/src/query/service/src/physical_plans/physical_sort.rs b/src/query/service/src/physical_plans/physical_sort.rs index 5e049c9daf3ce..d808d47d50633 100644 --- a/src/query/service/src/physical_plans/physical_sort.rs +++ b/src/query/service/src/physical_plans/physical_sort.rs @@ -28,6 +28,7 @@ use databend_common_pipeline_transforms::sorts::core::SortKeyDescription; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use 
databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; @@ -148,6 +149,15 @@ impl IPhysicalPlan for Sort { Ok(SortFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.step { + SortStep::Single | SortStep::Partial | SortStep::Final | SortStep::Shuffled => { + DataDistribution::Serial + } + SortStep::Sample | SortStep::Route => DataDistribution::Random, + } + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_table_scan.rs b/src/query/service/src/physical_plans/physical_table_scan.rs index 73a7ad54f5262..d9121abaf971e 100644 --- a/src/query/service/src/physical_plans/physical_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_table_scan.rs @@ -64,6 +64,7 @@ use databend_common_sql::VirtualColumn; use databend_common_sql::binder::INTERNAL_COLUMN_FACTORY; use databend_common_sql::evaluator::BlockOperator; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::table_read_plan::ToReadDataSourcePlan; use databend_common_sql::plans::FunctionCall; use databend_common_storages_fuse::FuseTable; @@ -111,6 +112,10 @@ impl IPhysicalPlan for TableScan { Self::output_fields(self.source.schema(), &self.name_mapping).map(DataSchema::new_ref) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(TableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_udf.rs b/src/query/service/src/physical_plans/physical_udf.rs index 1b62115d6f0c1..d82a3c416eb64 100644 --- a/src/query/service/src/physical_plans/physical_udf.rs +++ b/src/query/service/src/physical_plans/physical_udf.rs @@ -28,6 +28,7 @@ use 
databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::UDFType; use itertools::Itertools; @@ -66,6 +67,10 @@ impl IPhysicalPlan for Udf { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_window.rs b/src/query/service/src/physical_plans/physical_window.rs index 35a9608e95e19..1267afe96c1de 100644 --- a/src/query/service/src/physical_plans/physical_window.rs +++ b/src/query/service/src/physical_plans/physical_window.rs @@ -37,6 +37,7 @@ use databend_common_sql::TypeCheck; use databend_common_sql::binder::wrap_cast; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::WindowFuncFrame; @@ -110,6 +111,14 @@ impl IPhysicalPlan for Window { Ok(WindowFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + if self.partition_by.is_empty() { + DataDistribution::Random + } else { + self.input.output_data_distribution() + } + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs index 8f85880d46c3e..cb0718878c6f5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_expression::FunctionContext; @@ -36,6 +37,7 @@ struct CompactProbeStream<'a, Key: ?Sized + Eq, const MATCHED: bool> { key_idx: usize, build_idx: u32, matched_num_rows: usize, + probe_validity: Option, probe_hashes: Vec, bucket_mask: usize, @@ -54,9 +56,22 @@ impl<'a, Key: ?Sized + Eq + Send + Sync + 'static, const MATCHED: bool> ProbeStr break; } + if self + .probe_validity + .as_ref() + .is_some_and(|validity| !validity.get_bit(self.key_idx)) + { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + self.key_idx += 1; + continue; + } + if self.build_idx == 0 { let bucket = (self.probe_hashes[self.key_idx] as usize) & self.bucket_mask; self.build_idx = self.hash_table.first_index(bucket); + if self.build_idx == 0 { if !MATCHED { res.unmatched.push(self.key_idx as u64); @@ -70,9 +85,9 @@ impl<'a, Key: ?Sized + Eq + Send + Sync + 'static, const MATCHED: bool> ProbeStr while self.build_idx != 0 { let bi = self.build_idx as usize; + let row_idx = (bi - 1) & CHUNK_MASK; let chunk_idx = (bi - 1) >> CHUNK_BITS; - let offset = (bi - 1) & CHUNK_MASK; - let build_key = unsafe { self.build_accs[chunk_idx].key_unchecked(offset) }; + let build_key = unsafe { self.build_accs[chunk_idx].key_unchecked(row_idx) }; if build_key == probe_key { res.matched_probe.push(self.key_idx as u64); @@ -114,6 +129,11 @@ where { let 
probe_keys_entries = desc.probe_key(data, function_ctx)?; let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows()); + let probe_validity = match desc.from_correlated_subquery { + true => None, + false => desc.build_valids_by_keys(&probe_keys_block)?, + }; + desc.remove_keys_nullable(&mut probe_keys_block); let keys = ProjectedBlock::from(probe_keys_block.columns()); @@ -133,6 +153,7 @@ where key_idx: 0, build_idx: 0, matched_num_rows: 0, + probe_validity, probe_hashes, bucket_mask, probe_acc, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs index 0b5ce268ed40f..d3c8d755ecffa 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs @@ -155,8 +155,7 @@ impl Join for PartitionedInnerJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - Ok(None) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs index 6147c5ff865de..adc824d13bc4d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs @@ -216,8 +216,7 @@ impl Join for PartitionedLeftJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - Ok(None) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs index ac9302842d752..fc9e931773111 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs @@ -139,8 +139,7 @@ impl Join for PartitionedLeftAntiJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - Ok(None) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs index ce710a8db212b..1f17e0a336509 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs @@ -139,8 +139,7 @@ impl Join for PartitionedLeftSemiJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - Ok(None) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs index 649ce2c14755a..66a66feaf4fc6 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -195,6 +195,7 @@ impl PartitionedBuild { self.columns.clear(); 
self.column_types.clear(); self.num_rows = 0; + self.build_block_idx = 0; self.visited.clear(); self.accumulator.reset(); } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs index 0eac0ce88d14a..ebc1be1c57b2f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs @@ -205,9 +205,11 @@ impl Join for PartitionedRightJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - self.build.init_visited(); - Ok(None) + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs index dc4cffb42b8a2..80f933383aa2a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs @@ -182,9 +182,11 @@ impl Join for PartitionedRightAntiJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - self.build.init_visited(); - Ok(None) + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs index e693279ccf26b..04cf18bc8c89a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs @@ -183,9 +183,11 @@ impl Join for PartitionedRightSemiJoin { } fn final_build(&mut self) -> Result> { - self.build.final_build()?; - self.build.init_visited(); - Ok(None) + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs index 963363e7267d1..290924a7b261f 100644 --- a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs +++ b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs @@ -18,6 +18,7 @@ use std::collections::VecDeque; use std::fmt::Debug; use std::fmt::Formatter; +use databend_common_sql::executor::physical_plans::DataDistribution; use serde::Deserializer; use serde::Serializer; use serde::de::Error; @@ -69,6 +70,10 @@ impl IPhysicalPlan for SerializedPhysicalPlanRef { fn derive(&self, _: Vec) -> PhysicalPlan { unimplemented!() } + + fn output_data_distribution(&self) -> DataDistribution { + unimplemented!() + } } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] From 36d7c78d33967b615b8630b2d4d784bb9dd858cf Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Tue, 24 Mar 2026 19:52:10 +0800 Subject: [PATCH 11/38] z --- .../src/physical_plans/physical_hash_join.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 
ff4048e1d9348..26e9eb8c1c332 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -503,15 +503,13 @@ impl HashJoin { debug_assert_eq!(build_sinks.len(), probe_sinks.len()); - let use_partitioned_join = self.join_type == JoinType::Inner - && matches!( - self.build.output_data_distribution(), - DataDistribution::GlobalHash(_) - ) - && matches!( - self.probe.output_data_distribution(), - DataDistribution::GlobalHash(_) - ); + let use_partitioned_join = matches!( + self.build.output_data_distribution(), + DataDistribution::GlobalHash(_) + ) && matches!( + self.probe.output_data_distribution(), + DataDistribution::GlobalHash(_) + ); let barrier = databend_common_base::base::Barrier::new(output_len); let stage_sync_barrier = Arc::new(barrier); From 381b2e8c4296b395e04300a4e11862f429a56ebf Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 25 Mar 2026 14:09:38 +0800 Subject: [PATCH 12/38] z --- .../memory/partitioned/compact_hash_table.rs | 18 ++++ .../memory/partitioned/partitioned_build.rs | 84 +++++++++++++++++-- .../memory/partitioned/right_join.rs | 2 +- .../memory/partitioned/right_join_anti.rs | 2 +- 4 files changed, 99 insertions(+), 7 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs index ce1e385cb7fdf..62b375f3f8d45 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs @@ -101,6 +101,24 @@ impl CompactJoinHashTable { } } + pub fn insert_chunk_with_validity( + &mut self, + hashes: &[u64], + row_offset: usize, + validity: &databend_common_column::bitmap::Bitmap, + ) { + let mask = self.bucket_mask; + for 
(i, h) in hashes.iter().enumerate() { + if !validity.get_bit(i) { + continue; + } + let row_index = row_offset + i; + let bucket = (*h as usize) & mask; + self.next[row_index] = self.first[bucket]; + self.first[bucket] = I::from_usize(row_index); + } + } + /// Get the first row index in the given bucket. #[inline(always)] pub fn first_index(&self, bucket: usize) -> I { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs index 66a66feaf4fc6..d0b5fab2ce0c4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs @@ -15,15 +15,21 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; +use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; +use databend_common_expression::FromData; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethod; use databend_common_expression::HashMethodKind; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use databend_common_expression::Scalar; +use databend_common_expression::types::AccessType; +use databend_common_expression::types::BooleanType; use databend_common_expression::types::DataType; use databend_common_expression::with_hash_method; @@ -65,6 +71,14 @@ pub struct PartitionedBuild { pub desc: Arc, pub function_ctx: FunctionContext, + /// When true, NULL build keys are kept in the data (not filtered out). 
+ /// Required for RIGHT and RIGHT ANTI joins where unmatched build rows + /// (including those with NULL keys) must be output in final_probe. + keep_null_keys: bool, + /// Per-chunk validity bitmaps for build keys (only used when keep_null_keys is true). + /// Rows with invalid (NULL) keys are skipped during hash table insertion. + chunk_validities: Vec>, + accumulator: FixedSizeChunkAccumulator, } @@ -73,6 +87,23 @@ impl PartitionedBuild { method: HashMethodKind, desc: Arc, function_ctx: FunctionContext, + ) -> Self { + Self::create_with_options(method, desc, function_ctx, false) + } + + pub fn create_keep_null_keys( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + ) -> Self { + Self::create_with_options(method, desc, function_ctx, true) + } + + fn create_with_options( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + keep_null_keys: bool, ) -> Self { PartitionedBuild { chunks: Vec::new(), @@ -85,6 +116,8 @@ impl PartitionedBuild { desc, function_ctx, visited: Vec::new(), + keep_null_keys, + chunk_validities: Vec::new(), accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), build_block_idx: 0, } @@ -110,6 +143,16 @@ impl PartitionedBuild { fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { let num_rows = chunk.num_rows(); let mut columns = chunk.take_columns(); + + // Extract the trailing validity column if keep_null_keys is enabled. 
+ let chunk_validity = if self.keep_null_keys { + let valid_entry = columns.pop().unwrap(); + let col = valid_entry.to_column(); + Some(BooleanType::try_downcast_column(&col).unwrap()) + } else { + None + }; + let data_columns = columns.split_off(self.desc.build_keys.len()); let keys = ProjectedBlock::from(&columns); @@ -120,6 +163,7 @@ impl PartitionedBuild { self.num_rows += num_rows; self.build_keys_states.push(keys_state); self.chunks.push(DataBlock::new(data_columns, num_rows)); + self.chunk_validities.push(chunk_validity); Ok(()) } @@ -130,15 +174,36 @@ impl PartitionedBuild { let mut keys_block = DataBlock::new(keys_entries, num_rows); chunk = chunk.project(&self.desc.build_projection); - if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { - if bitmap.true_count() != bitmap.len() { - keys_block = keys_block.filter_with_bitmap(&bitmap)?; - chunk = chunk.filter_with_bitmap(&bitmap)?; + + let validity = self.desc.build_valids_by_keys(&keys_block)?; + if !self.keep_null_keys { + if let Some(ref bitmap) = validity { + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(bitmap)?; + chunk = chunk.filter_with_bitmap(bitmap)?; + } } } self.desc.remove_keys_nullable(&mut keys_block); keys_block.merge_block(chunk); + + // When keeping NULL keys, append a boolean validity column so it flows + // through the accumulator and can be extracted in ingest_chunk. 
+ if self.keep_null_keys { + let valid_col = match validity { + Some(bitmap) => { + BlockEntry::from(BooleanType::from_data(bitmap.iter().collect::>())) + } + None => BlockEntry::new_const_column( + DataType::Boolean, + Scalar::Boolean(true), + keys_block.num_rows(), + ), + }; + keys_block.add_entry(valid_col); + } + Ok(keys_block) } @@ -177,7 +242,15 @@ impl PartitionedBuild { HashMethodKind::T(method) => { let mut hashes = Vec::new(); method.build_keys_hashes(keys_state, &mut hashes); - self.hash_table.insert_chunk(&hashes, row_offset); + match &self.chunk_validities[self.build_block_idx] { + Some(validity) => { + self.hash_table + .insert_chunk_with_validity(&hashes, row_offset, validity); + } + None => { + self.hash_table.insert_chunk(&hashes, row_offset); + } + } self.build_block_idx += 1; } }); @@ -197,6 +270,7 @@ impl PartitionedBuild { self.num_rows = 0; self.build_block_idx = 0; self.visited.clear(); + self.chunk_validities.clear(); self.accumulator.reset(); } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs index ebc1be1c57b2f..61c7f01cf7b67 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs @@ -63,7 +63,7 @@ impl PartitionedRightJoin { ) }); PartitionedRightJoin { - build: PartitionedBuild::create(method, desc, function_ctx), + build: PartitionedBuild::create_keep_null_keys(method, desc, function_ctx), filter_executor, max_block_size, finished: false, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs index 80f933383aa2a..c9f266c6a365f 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs @@ -61,7 +61,7 @@ impl PartitionedRightAntiJoin { ) }); PartitionedRightAntiJoin { - build: PartitionedBuild::create(method, desc, function_ctx), + build: PartitionedBuild::create_keep_null_keys(method, desc, function_ctx), filter_executor, max_block_size, finished: false, From f801b4b1a59070dc6afb8523d9c081ae20be390e Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 25 Mar 2026 21:12:13 +0800 Subject: [PATCH 13/38] z --- src/query/service/src/physical_plans/physical_hash_join.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 26e9eb8c1c332..162a8a03e1ca2 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -548,10 +548,8 @@ impl HashJoin { let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); builder.main_pipeline.add_pipe(join_pipe); - if !use_partitioned_join { - let item_size = builder.main_pipeline.output_len(); - builder.main_pipeline.resize(item_size, true)?; - } + let item_size = builder.main_pipeline.output_len(); + builder.main_pipeline.resize(item_size, true)?; Ok(()) } From 53a8957c8311469231e5ed78aa9f4eef96b6ebfb Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 26 Mar 2026 00:07:06 +0800 Subject: [PATCH 14/38] z --- .../src/physical_plans/physical_hash_join.rs | 143 +++-- .../new_hash_join/{ => common}/join.rs | 0 .../transforms/new_hash_join/common/mod.rs | 3 + .../basic.rs => common/probe_stream.rs} | 35 -- .../{ => common}/runtime_filter.rs | 0 .../transforms/new_hash_join/mod.rs | 34 +- .../partitioned/chunk_accumulator.rs | 0 .../partitioned/compact_hash_table.rs | 0 
.../partitioned/compact_probe_stream.rs | 4 +- .../{memory => }/partitioned/inner_join.rs | 17 +- .../{memory => }/partitioned/left_join.rs | 19 +- .../partitioned/left_join_anti.rs | 19 +- .../partitioned/left_join_semi.rs | 17 +- .../{memory => }/partitioned/mod.rs | 3 + .../partitioned/partitioned_build.rs | 2 +- .../{memory => }/partitioned/right_join.rs | 18 +- .../partitioned/right_join_anti.rs | 18 +- .../partitioned/right_join_semi.rs | 18 +- .../partitioned/transform_hash_join.rs | 502 ++++++++++++++++++ .../{ => unpartitioned}/grace/grace_join.rs | 10 +- .../{ => unpartitioned}/grace/grace_memory.rs | 12 +- .../{ => unpartitioned}/grace/grace_state.rs | 0 .../{ => unpartitioned}/grace/mod.rs | 0 .../{ => unpartitioned}/hash_join_factory.rs | 74 +-- .../unpartitioned/hashtable/basic.rs | 53 ++ .../hashtable/fixed_keys.rs | 10 +- .../{ => unpartitioned}/hashtable/mod.rs | 0 .../hashtable/serialize_keys.rs | 10 +- .../hashtable/single_binary_key.rs | 12 +- .../{ => unpartitioned}/hybrid/hybrid_join.rs | 8 +- .../hybrid/hybrid_state.rs | 2 +- .../{ => unpartitioned}/hybrid/mod.rs | 0 .../memory}/basic.rs | 0 .../memory}/basic_state.rs | 0 .../memory}/inner_join.rs | 14 +- .../memory}/left_join.rs | 14 +- .../memory}/left_join_anti.rs | 14 +- .../memory}/left_join_semi.rs | 12 +- .../memory}/mod.rs | 0 .../memory}/nested_loop.rs | 2 +- .../memory}/right_join.rs | 10 +- .../memory}/right_join_anti.rs | 6 +- .../memory}/right_join_semi.rs | 10 +- .../{memory => unpartitioned}/mod.rs | 13 +- .../{ => unpartitioned}/performance.rs | 4 +- .../transform_hash_join.rs | 8 +- 46 files changed, 814 insertions(+), 336 deletions(-) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => common}/join.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{hashtable/basic.rs => common/probe_stream.rs} (68%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => common}/runtime_filter.rs (100%) 
rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/chunk_accumulator.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/compact_hash_table.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/compact_probe_stream.rs (97%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/inner_join.rs (90%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/left_join.rs (92%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/left_join_anti.rs (89%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/left_join_semi.rs (90%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/mod.rs (92%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/partitioned_build.rs (99%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/right_join.rs (93%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/right_join_anti.rs (92%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => }/partitioned/right_join_semi.rs (92%) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/grace/grace_join.rs (97%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/grace/grace_memory.rs (87%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/grace/grace_state.rs (100%) rename 
src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/grace/mod.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hash_join_factory.rs (81%) create mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hashtable/fixed_keys.rs (96%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hashtable/mod.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hashtable/serialize_keys.rs (97%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hashtable/single_binary_key.rs (92%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hybrid/hybrid_join.rs (96%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hybrid/hybrid_state.rs (96%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/hybrid/mod.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/basic.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/basic_state.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/inner_join.rs (94%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/left_join.rs (95%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/left_join_anti.rs (94%) rename 
src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/left_join_semi.rs (96%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/mod.rs (100%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/nested_loop.rs (99%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/right_join.rs (97%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/right_join_anti.rs (97%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory/unpartitioned => unpartitioned/memory}/right_join_semi.rs (96%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{memory => unpartitioned}/mod.rs (70%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/performance.rs (90%) rename src/query/service/src/pipelines/processors/transforms/new_hash_join/{ => unpartitioned}/transform_hash_join.rs (97%) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 162a8a03e1ca2..f3fac7b0d606c 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -75,6 +75,7 @@ use crate::pipelines::processors::transforms::RuntimeFiltersDesc; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; use crate::pipelines::processors::transforms::TransformHashJoinProbe; +use crate::pipelines::processors::transforms::TransformPartitionedHashJoin; use crate::sessions::QueryContext; // Type aliases to simplify complex return types @@ -200,20 +201,17 @@ impl 
IPhysicalPlan for HashJoin { let build_dist = self.build.output_data_distribution(); let probe_dist = self.probe.output_data_distribution(); - let can_preserve_global_hash = self.join_type == JoinType::Inner - && matches!( - &build_dist, - DataDistribution::GlobalHash(keys) if keys == &self.build_keys - ) - && matches!( - &probe_dist, - DataDistribution::GlobalHash(keys) if keys == &self.probe_keys - ); + let can_preserve_global_hash = matches!( + &build_dist, + DataDistribution::GlobalHash(keys) if keys == &self.build_keys + ) && matches!( + &probe_dist, + DataDistribution::GlobalHash(keys) if keys == &self.probe_keys + ); - if can_preserve_global_hash { - probe_dist - } else { - DataDistribution::Random + match can_preserve_global_hash { + true => probe_dist, + false => DataDistribution::Random, } } @@ -334,7 +332,7 @@ impl IPhysicalPlan for HashJoin { && !enable_optimization && !self.need_hold_hash_table { - return self.build_new_join_pipeline(builder, desc); + return self.build_join(builder, desc); } // Create the join state with optimization flags @@ -458,18 +456,98 @@ impl HashJoin { Ok(()) } - fn build_new_join_pipeline( - &self, - builder: &mut PipelineBuilder, - desc: Arc, - ) -> Result<()> { - let factory = self.join_factory(builder, desc)?; + fn build_join(&self, pb: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let build_distribution = self.build.output_data_distribution(); + let global_hash_build = matches!(build_distribution, DataDistribution::GlobalHash(_)); + + let probe_distribution = self.probe.output_data_distribution(); + let global_hash_probe = matches!(probe_distribution, DataDistribution::GlobalHash(_)); - // We must build the runtime filter before constructing the child nodes, - // as we will inject some runtime filter information into the context for the child nodes to use. 
+ match (global_hash_build && global_hash_probe) || self.build_side_cache_info.is_some() { + true => self.shuffle_join(pb, desc), + false => self.broadcast_join(pb, desc), + } + } + + fn shuffle_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; + + let hash_key_types = self + .build_keys + .iter() + .zip(&desc.is_null_equal) + .map(|(expr, is_null_equal)| { + let expr = expr.as_expr(&BUILTIN_FUNCTIONS); + if *is_null_equal { + expr.data_type().clone() + } else { + expr.data_type().remove_nullable() + } + }) + .collect::>(); + let hash_method = DataBlock::choose_hash_method_with_types(&hash_key_types)?; + let max_block_size = builder.settings.get_max_block_size()? as usize; + + let mut sub_query_ctx = QueryContext::create_from(&builder.ctx); + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + self.build.build_pipeline(builder)?; + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + let build_sinks = builder.main_pipeline.take_sinks(); + + self.probe.build_pipeline(builder)?; + let probe_sinks = builder.main_pipeline.take_sinks(); + + assert_eq!(build_sinks.len(), probe_sinks.len()); + let output_len = build_sinks.len(); + + let barrier = databend_common_base::base::Barrier::new(output_len); + let stage_sync_barrier = Arc::new(barrier); + let mut join_sinks = Vec::with_capacity(output_len * 2); + let mut join_pipe_items = Vec::with_capacity(output_len); + for (build_sink, probe_sink) in build_sinks.into_iter().zip(probe_sinks.into_iter()) { + join_sinks.push(build_sink); + join_sinks.push(probe_sink); + + let build_input = InputPort::create(); + let probe_input = InputPort::create(); + let joined_output = OutputPort::create(); + + let join = TransformPartitionedHashJoin::create_join( + self.join_type, + hash_method.clone(), + desc.clone(), + builder.func_ctx.clone(), + max_block_size, + ); + + let hash_join = TransformPartitionedHashJoin::create( + build_input.clone(), + 
probe_input.clone(), + joined_output.clone(), + join, + stage_sync_barrier.clone(), + self.projections.clone(), + rf_desc.clone(), + )?; + + join_pipe_items.push(PipeItem::create( + hash_join, + vec![build_input, probe_input], + vec![joined_output], + )) + } + + builder.main_pipeline.extend_sinks(join_sinks); + let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); + builder.main_pipeline.add_pipe(join_pipe); + + Ok(()) + } + + fn broadcast_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let factory = self.join_factory(builder, desc)?; let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; - // After common subexpression elimination is completed, we can delete this type of code. { let state = factory.create_basic_state(0)?; @@ -489,7 +567,6 @@ impl HashJoin { self.probe.build_pipeline(builder)?; - // Aligning hash join build and probe parallelism let output_len = std::cmp::max(build_sinks.len(), builder.main_pipeline.output_len()); builder.main_pipeline.resize(output_len, false)?; @@ -503,14 +580,6 @@ impl HashJoin { debug_assert_eq!(build_sinks.len(), probe_sinks.len()); - let use_partitioned_join = matches!( - self.build.output_data_distribution(), - DataDistribution::GlobalHash(_) - ) && matches!( - self.probe.output_data_distribution(), - DataDistribution::GlobalHash(_) - ); - let barrier = databend_common_base::base::Barrier::new(output_len); let stage_sync_barrier = Arc::new(barrier); let mut join_sinks = Vec::with_capacity(output_len * 2); @@ -527,11 +596,7 @@ impl HashJoin { build_input.clone(), probe_input.clone(), joined_output.clone(), - if use_partitioned_join { - factory.create_partitioned_join(self.join_type)? - } else { - factory.create_hash_join(self.join_type, 0)? 
- }, + factory.create_hash_join(self.join_type, 0)?, stage_sync_barrier.clone(), self.projections.clone(), rf_desc.clone(), @@ -549,9 +614,7 @@ impl HashJoin { builder.main_pipeline.add_pipe(join_pipe); let item_size = builder.main_pipeline.output_len(); - builder.main_pipeline.resize(item_size, true)?; - - Ok(()) + builder.main_pipeline.resize(item_size, true) } fn join_factory( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs index f0127c9d681cd..c76453cc9b693 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs @@ -13,6 +13,9 @@ // limitations under the License. 
mod cstyle_cell; +pub mod join; +pub mod probe_stream; +pub mod runtime_filter; mod squash_blocks; pub use cstyle_cell::CStyleCell; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs similarity index 68% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs index 72320f145d165..9911ee8eaa133 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs @@ -64,38 +64,3 @@ impl ProbedRows { pub trait ProbeStream { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()>; } - -pub struct EmptyProbeStream; - -impl ProbeStream for EmptyProbeStream { - fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { - Ok(()) - } -} - -pub struct AllUnmatchedProbeStream { - idx: u64, - size: u64, -} - -impl AllUnmatchedProbeStream { - pub fn create(size: usize) -> Box { - Box::new(AllUnmatchedProbeStream { - idx: 0, - size: size as u64, - }) - } -} - -impl ProbeStream for AllUnmatchedProbeStream { - fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { - if self.idx >= self.size { - return Ok(()); - } - - let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); - rows.unmatched.extend(self.idx..self.idx + unmatched_rows); - self.idx += unmatched_rows; - Ok(()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index 790c0ce9e3eb0..acb1eb5e7e57e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -13,24 +13,18 @@ // limitations under the License. mod common; -mod grace; -mod hash_join_factory; -mod hashtable; -mod hybrid; -mod join; -pub mod memory; -mod performance; -mod runtime_filter; -mod transform_hash_join; +pub mod partitioned; +pub mod unpartitioned; -pub use grace::GraceHashJoin; -pub use grace::GraceMemoryJoin; -pub use hash_join_factory::HashJoinFactory; -pub use hybrid::HybridHashJoin; -pub use hybrid::HybridHashJoinState; -pub use join::Join; -pub use join::JoinStream; -pub use memory::BasicHashJoinState; -pub use memory::InnerHashJoin; -pub use runtime_filter::RuntimeFiltersDesc; -pub use transform_hash_join::TransformHashJoin; +pub use common::join::Join; +pub use common::join::JoinStream; +pub use common::runtime_filter::RuntimeFiltersDesc; +pub use partitioned::TransformPartitionedHashJoin; +pub use unpartitioned::HashJoinFactory; +pub use unpartitioned::TransformHashJoin; +pub use unpartitioned::grace::GraceHashJoin; +pub use unpartitioned::grace::GraceMemoryJoin; +pub use unpartitioned::hybrid::HybridHashJoin; +pub use unpartitioned::hybrid::HybridHashJoinState; +pub use unpartitioned::memory::BasicHashJoinState; +pub use unpartitioned::memory::InnerHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs similarity index 100% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/chunk_accumulator.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_hash_table.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs index cb0718878c6f5..60c13f837c345 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/compact_probe_stream.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs @@ -28,8 +28,8 @@ use super::partitioned_build::CHUNK_BITS; use super::partitioned_build::CHUNK_SIZE; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; const CHUNK_MASK: usize = CHUNK_SIZE - 1; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs index d3c8d755ecffa..df070235d1363 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -27,12 +27,11 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; pub struct 
PartitionedInnerJoin { build: PartitionedBuild, @@ -181,9 +180,3 @@ impl Join for PartitionedInnerJoin { })) } } - -impl GraceMemoryJoin for PartitionedInnerJoin { - fn reset_memory(&mut self) { - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs index adc824d13bc4d..81e7c6f3d8c2e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -29,13 +29,12 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; use crate::pipelines::processors::transforms::wrap_true_validity; pub fn null_block(types: &[DataType], num_rows: usize) -> Option { @@ -270,9 +269,3 @@ impl Join for PartitionedLeftJoin { })) } } - -impl GraceMemoryJoin for PartitionedLeftJoin { - fn reset_memory(&mut self) { - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs similarity index 89% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs index fc9e931773111..c70fab517df2e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -26,13 +26,12 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use 
crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; pub struct PartitionedLeftAntiJoin { build: PartitionedBuild, @@ -173,9 +172,3 @@ impl Join for PartitionedLeftAntiJoin { })) } } - -impl GraceMemoryJoin for PartitionedLeftAntiJoin { - fn reset_memory(&mut self) { - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs index 1f17e0a336509..30b39c6de86ce 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -26,12 +26,11 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; pub struct PartitionedLeftSemiJoin { build: PartitionedBuild, @@ -168,9 +167,3 @@ impl Join for PartitionedLeftSemiJoin { })) } } - -impl GraceMemoryJoin for PartitionedLeftSemiJoin { - fn reset_memory(&mut self) { - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs index 9b41ddd7a0832..6ba6c62f824f6 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs @@ -23,6 +23,8 @@ mod partitioned_build; mod right_join; mod right_join_anti; mod right_join_semi; +#[allow(dead_code)] +mod transform_hash_join; pub use compact_hash_table::CompactJoinHashTable; pub use 
compact_hash_table::RowIndex; @@ -34,3 +36,4 @@ pub use partitioned_build::PartitionedBuild; pub use right_join::PartitionedRightJoin; pub use right_join_anti::PartitionedRightAntiJoin; pub use right_join_semi::PartitionedRightSemiJoin; +pub use transform_hash_join::TransformPartitionedHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index d0b5fab2ce0c4..85dfca8507507 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -39,7 +39,7 @@ use super::compact_probe_stream::create_compact_probe; use super::compact_probe_stream::create_compact_probe_matched; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; pub const CHUNK_BITS: usize = 16; pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536 diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs similarity index 93% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs index 61c7f01cf7b67..3e03a5dad958f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -31,12 +31,11 @@ use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; use crate::pipelines::processors::transforms::wrap_nullable_block; pub struct PartitionedRightJoin { @@ -270,10 +269,3 @@ impl Join for PartitionedRightJoin { }))) } } - -impl GraceMemoryJoin for PartitionedRightJoin { - fn reset_memory(&mut self) { - self.finished = false; - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs index c9f266c6a365f..9f36b2abe82fc 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -29,12 +29,11 @@ use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; pub struct PartitionedRightAntiJoin { build: PartitionedBuild, @@ -237,10 +236,3 @@ impl Join for PartitionedRightAntiJoin { }))) } } - -impl GraceMemoryJoin for PartitionedRightAntiJoin { - fn reset_memory(&mut 
self) { - self.finished = false; - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs index 04cf18bc8c89a..82fee79bba989 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -29,12 +29,11 @@ use super::inner_join::result_block; use super::partitioned_build::PartitionedBuild; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; pub struct 
PartitionedRightSemiJoin { build: PartitionedBuild, @@ -238,10 +237,3 @@ impl Join for PartitionedRightSemiJoin { }))) } } - -impl GraceMemoryJoin for PartitionedRightSemiJoin { - fn reset_memory(&mut self) { - self.finished = false; - self.build.reset(); - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs new file mode 100644 index 0000000000000..e0155aa6e85f7 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -0,0 +1,502 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::collections::BTreeSet; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Instant; + +use databend_common_base::base::Barrier; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_pipeline::core::Event; +use databend_common_pipeline::core::InputPort; +use databend_common_pipeline::core::OutputPort; +use databend_common_pipeline::core::Processor; +use databend_common_pipeline::core::ProcessorPtr; +use databend_common_sql::plans::JoinType; +use log::info; + +use super::PartitionedInnerJoin; +use super::PartitionedLeftAntiJoin; +use super::PartitionedLeftJoin; +use super::PartitionedLeftSemiJoin; +use super::PartitionedRightAntiJoin; +use super::PartitionedRightJoin; +use super::PartitionedRightSemiJoin; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; + +pub struct TransformPartitionedHashJoin { + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + + stage: Stage, + join: Box, + joined_data: Option, + + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + runtime_filter_builder: Option, + instant: Instant, +} + +impl TransformPartitionedHashJoin { + pub fn create( + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + join: Box, + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + ) -> Result { + let runtime_filter_builder = RuntimeFilterLocalBuilder::try_create( + 
&rf_desc.func_ctx, + rf_desc.filters_desc.clone(), + rf_desc.inlist_threshold, + rf_desc.bloom_threshold, + rf_desc.min_max_threshold, + rf_desc.spatial_threshold, + )?; + + Ok(ProcessorPtr::create(Box::new( + TransformPartitionedHashJoin { + build_port, + probe_port, + joined_port, + join, + rf_desc, + projection, + stage_sync_barrier, + joined_data: None, + runtime_filter_builder, + stage: Stage::Build(BuildState { + finished: false, + build_data: None, + }), + instant: Instant::now(), + }, + ))) + } + + pub fn create_join( + typ: JoinType, + hash_method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Box { + match typ { + JoinType::Inner => Box::new(PartitionedInnerJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Left => Box::new(PartitionedLeftJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftAnti => Box::new(PartitionedLeftAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftSemi => Box::new(PartitionedLeftSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Right => Box::new(PartitionedRightJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightSemi => Box::new(PartitionedRightSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightAnti => Box::new(PartitionedRightAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + _ => unreachable!(), + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformPartitionedHashJoin { + fn name(&self) -> String { + String::from("TransformPartitionedHashJoin") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.joined_port.is_finished() { + self.build_port.finish(); + self.probe_port.finish(); + + if !matches!(self.stage, Stage::Finished) { + 
self.stage = Stage::Finished; + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + drop(finished); + } + + return Ok(Event::Finished); + } + + if !self.joined_port.can_push() { + match self.stage { + Stage::Build(_) => self.build_port.set_not_need_data(), + Stage::Probe(_) => self.probe_port.set_not_need_data(), + Stage::BuildFinal(_) | Stage::ProbeFinal(_) | Stage::Finished => (), + } + return Ok(Event::NeedConsume); + } + + if let Some(joined_data) = self.joined_data.take() { + let joined_data = joined_data.project(&self.projection); + self.joined_port.push_data(Ok(joined_data)); + return Ok(Event::NeedConsume); + } + + match &mut self.stage { + Stage::Build(state) => state.event(&self.build_port), + Stage::BuildFinal(state) => state.event(), + Stage::Probe(state) => state.event(&self.probe_port), + Stage::ProbeFinal(state) => state.event(&self.joined_port), + Stage::Finished => Ok(Event::Finished), + } + } + + fn process(&mut self) -> Result<()> { + match &mut self.stage { + Stage::Finished => Ok(()), + Stage::Build(state) => { + let Some(data_block) = state.build_data.take() else { + if !state.finished { + state.finished = true; + self.join.add_block(None)?; + } + return Ok(()); + }; + + if !data_block.is_empty() { + if let Some(builder) = self.runtime_filter_builder.as_mut() { + builder.add_block(&data_block)?; + } + self.join.add_block(Some(data_block))?; + } + + Ok(()) + } + Stage::BuildFinal(state) => { + state.finished = self.join.final_build()?.is_none(); + Ok(()) + } + Stage::Probe(state) => { + if let Some(probe_data) = state.input_data.take() { + let stream = self.join.probe_block(probe_data)?; + state.stream = Some(unsafe { + std::mem::transmute::, Box>(stream) + }); + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? 
{ + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } + } + + Ok(()) + } + Stage::ProbeFinal(state) => { + if state.stream.is_none() { + if let Some(final_stream) = self.join.final_probe()? { + state.initialize = true; + state.stream = Some(unsafe { + std::mem::transmute::, Box>( + final_stream, + ) + }); + } else { + state.finished = true; + } + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? { + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } else { + state.initialize = false; + } + } + + Ok(()) + } + } + } + + async fn async_process(&mut self) -> Result<()> { + let elapsed = self.instant.elapsed(); + + self.stage = match &mut self.stage { + Stage::Build(_) => { + let wait_res = self.stage_sync_barrier.wait().await; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.join.add_runtime_filter_packet(packet); + } + + let rf_build_elapsed = self.instant.elapsed() - elapsed; + let _wait_res = self.stage_sync_barrier.wait().await; + let before_wait = self.instant.elapsed(); + + if wait_res.is_leader() { + let spilled = self.join.is_spill_happened(); + let packet = self.join.build_runtime_filter()?; + info!( + "spilled: {}, globalize runtime filter: total {}, disable_all_due_to_spill: {}", + spilled, + packet.packets.as_ref().map_or(0, |p| p.len()), + packet.disable_all_due_to_spill + ); + self.rf_desc.globalization(packet).await?; + } + + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_rf_elapsed = self.instant.elapsed() - before_wait; + + log::info!( + "PartitionedHashJoin build stage, sync work elapsed: {:?}, build rf elapsed: {:?}, wait other node rf elapsed: {:?}", + elapsed, + rf_build_elapsed, + wait_rf_elapsed + ); + + self.instant = Instant::now(); + Stage::BuildFinal(BuildFinalState::new()) + } + // BuildFinal → Probe: barrier + 
Stage::BuildFinal(_) => { + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_elapsed = self.instant.elapsed() - elapsed; + log::info!( + "PartitionedHashJoin build final stage, sync work elapsed: {:?}, wait elapsed: {:?}", + elapsed, + wait_elapsed + ); + + self.instant = Instant::now(); + Stage::Probe(ProbeState::new()) + } + // Probe → ProbeFinal: no barrier + Stage::Probe(_) => { + log::info!("PartitionedHashJoin probe stage elapsed: {:?}", elapsed); + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState::new()) + } + // ProbeFinal → Finished or continue: no barrier + Stage::ProbeFinal(state) => match state.finished { + true => { + log::info!( + "PartitionedHashJoin probe final stage elapsed: {:?}", + elapsed + ); + self.instant = Instant::now(); + + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + drop(finished); + + Stage::Finished + } + false => { + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState { + initialize: true, + finished: state.finished, + stream: state.stream.take(), + }) + } + }, + Stage::Finished => Stage::Finished, + }; + + Ok(()) + } +} + +#[derive(Debug)] +enum Stage { + Build(BuildState), + BuildFinal(BuildFinalState), + Probe(ProbeState), + ProbeFinal(ProbeFinalState), + Finished, +} + +#[derive(Debug)] +struct BuildState { + finished: bool, + build_data: Option, +} + +impl BuildState { + pub fn event(&mut self, input: &InputPort) -> Result { + if self.build_data.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.build_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + }; + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +#[derive(Debug)] +struct BuildFinalState { + finished: bool, +} + +impl BuildFinalState { + pub fn new() -> BuildFinalState { + BuildFinalState { finished: 
false } + } + + pub fn event(&mut self) -> Result { + match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + } + } +} + +struct ProbeState { + input_data: Option, + stream: Option>, +} + +impl Debug for ProbeState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeState").finish() + } +} + +impl ProbeState { + pub fn new() -> ProbeState { + ProbeState { + input_data: None, + stream: None, + } + } + + pub fn event(&mut self, input: &InputPort) -> Result { + if self.input_data.is_some() || self.stream.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.input_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return Ok(Event::Async); + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +struct ProbeFinalState { + finished: bool, + initialize: bool, + stream: Option>, +} + +impl Debug for ProbeFinalState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeFinalState") + .field("initialized", &self.finished) + .finish() + } +} + +impl ProbeFinalState { + pub fn new() -> ProbeFinalState { + ProbeFinalState { + stream: None, + finished: false, + initialize: false, + } + } + + pub fn event(&mut self, output_port: &OutputPort) -> Result { + if self.stream.is_some() { + return Ok(Event::Sync); + } + + if self.finished { + output_port.finish(); + return Ok(Event::Async); + } + + match self.initialize { + true => Ok(Event::Sync), + false => Ok(Event::Async), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs 
index e60310020408d..b5e98ef2e7587 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs @@ -29,15 +29,15 @@ use databend_common_pipeline_transforms::traits::Location; use databend_common_storage::DataOperator; use databend_common_storages_parquet::ReadSettings; +use super::grace_memory::GraceMemoryJoin; +use super::grace_state::GraceHashJoinState; +use super::grace_state::SpillMetadata; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::get_hashes; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::GraceHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::SpillMetadata; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::sessions::QueryContext; use crate::spillers::Layout; use crate::spillers::SpillAdapter; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs similarity index 87% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs index cfc7fae6c05da..e4b7d4fe7a0f5 
100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs @@ -14,16 +14,16 @@ use std::sync::PoisonError; +use super::super::memory::AntiLeftHashJoin; +use super::super::memory::AntiRightHashJoin; +use super::super::memory::OuterRightHashJoin; +use super::super::memory::SemiLeftHashJoin; +use super::super::memory::SemiRightHashJoin; +use super::super::memory::left_join::OuterLeftHashJoin; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; pub trait GraceMemoryJoin: Join { fn reset_memory(&mut self); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs similarity index 100% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs similarity index 81% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs index 919743ace6384..fc14bc04d2d75 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs @@ -27,7 +27,7 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; use databend_common_storages_fuse::TableContext; -use super::common::CStyleCell; +use super::super::common::CStyleCell; use super::grace::GraceHashJoinState; use super::grace::GraceMemoryJoin; use super::hybrid::HybridHashJoin; @@ -39,19 +39,12 @@ use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; -use 
crate::pipelines::processors::transforms::memory::partitioned::PartitionedInnerJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftAntiJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedLeftSemiJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightAntiJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightJoin; -use crate::pipelines::processors::transforms::memory::partitioned::PartitionedRightSemiJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::OuterRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::OuterLeftHashJoin; use crate::sessions::QueryContext; pub struct HashJoinFactory { @@ -412,57 +405,4 @@ impl HashJoinFactory { typ, )) } - - /// Create a partitioned (per-thread) join for hash shuffle mode. - /// No shared state — each thread independently builds and probes. - pub fn create_partitioned_join(self: &Arc, typ: JoinType) -> Result> { - let settings = self.ctx.get_settings(); - let max_block_size = settings.get_max_block_size()? 
as usize; - - match typ { - JoinType::Inner => Ok(Box::new(PartitionedInnerJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::Left => Ok(Box::new(PartitionedLeftJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::LeftAnti => Ok(Box::new(PartitionedLeftAntiJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::LeftSemi => Ok(Box::new(PartitionedLeftSemiJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::Right => Ok(Box::new(PartitionedRightJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::RightSemi => Ok(Box::new(PartitionedRightSemiJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - JoinType::RightAnti => Ok(Box::new(PartitionedRightAntiJoin::create( - self.hash_method.clone(), - self.desc.clone(), - self.function_ctx.clone(), - max_block_size, - ))), - _ => unreachable!(), - } - } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs new file mode 100644 index 0000000000000..cb6f7c35e7cbc --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs @@ -0,0 +1,53 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; + +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; + +pub struct EmptyProbeStream; + +impl ProbeStream for EmptyProbeStream { + fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { + Ok(()) + } +} + +pub struct AllUnmatchedProbeStream { + idx: u64, + size: u64, +} + +impl AllUnmatchedProbeStream { + pub fn create(size: usize) -> Box { + Box::new(AllUnmatchedProbeStream { + idx: 0, + size: size as u64, + }) + } +} + +impl ProbeStream for AllUnmatchedProbeStream { + fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { + if self.idx >= self.size { + return Ok(()); + } + + let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); + rows.unmatched.extend(self.idx..self.idx + unmatched_rows); + self.idx += unmatched_rows; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs index 82ea33dbe7ae6..a428a109385c1 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs @@ -22,16 +22,16 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::ProjectedBlock; use databend_common_hashtable::HashtableKeyable; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RawEntry; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl FixedKeyHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs diff 
--git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs index 0416169c7cb23..079240f90ec68 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs @@ -22,17 +22,17 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl SerializerHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs similarity index 92% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs index fe6d12d307560..523d428258045 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs @@ -20,18 +20,18 @@ use databend_common_expression::HashMethodSingleBinary; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; +use super::serialize_keys::BinaryKeyProbeStream; +use super::serialize_keys::EarlyFilteringProbeStream; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::BinaryKeyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::EarlyFilteringProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; impl SingleBinaryHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs index d78a4b1c625ab..85c0268910685 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs @@ -24,14 +24,14 @@ use databend_common_expression::HashMethodKind; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; +use super::hybrid_state::HybridHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceMemoryJoin; -use 
crate::pipelines::processors::transforms::new_hash_join::hybrid::hybrid_state::HybridHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceMemoryJoin; use crate::sessions::QueryContext; /// Hybrid hash join mode: diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs index 6d6cf34197208..4917ebfd0bda8 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs @@ -24,7 +24,7 @@ use databend_common_sql::plans::JoinType; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HybridHashJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceHashJoinState; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceHashJoinState; use crate::sessions::QueryContext; pub struct HybridHashJoinState { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs similarity index 100% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/basic_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs similarity index 94% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index a51e593766a7b..ebe8b2c13dc99 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -29,19 +29,19 @@ use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use 
databend_common_settings::Settings; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::basic_state::BasicHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; pub struct InnerHashJoin { pub(crate) basic_hash_join: BasicHashJoin, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs similarity index 95% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs index ea4502f8133c3..9dfca00690583 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs @@ -30,18 +30,18 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs similarity index 94% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs index 6131995f23c2e..5977c3930c588 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs @@ -27,19 +27,19 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use 
crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiLeftHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs index 9ad5d568da19d..d4245609a7e29 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs @@ -30,6 +30,7 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use 
crate::pipelines::processors::transforms::BasicHashJoinState; @@ -37,12 +38,11 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiLeftHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/nested_loop.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs similarity index 99% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/nested_loop.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs index 89c67354391ed..75ad373691556 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/nested_loop.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs @@ -30,7 +30,7 @@ use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::JoinStream; use crate::pipelines::processors::transforms::NestedLoopDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; pub struct NestedLoopJoin { inner: T, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs index 8030027624ec6..b84b8cd422991 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs @@ -28,6 +28,7 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; use 
databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::left_join::final_result_block; use super::left_join::null_block; @@ -38,11 +39,10 @@ use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_nullable_block; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs index 183dccff7230a..dc24e6d27b990 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs @@ -26,6 +26,7 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; @@ -35,9 +36,8 @@ use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiRightHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs index 74b4871db4955..8338ca158541d 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/unpartitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs @@ -27,6 +27,7 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; @@ -36,11 +37,10 @@ use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiRightHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs similarity index 70% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs index d1890d4a56dc1..872a33dc34f6e 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs @@ -12,7 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod partitioned; -pub mod unpartitioned; +pub mod grace; +mod hash_join_factory; +pub mod hashtable; +pub mod hybrid; +pub mod memory; +mod performance; +mod transform_hash_join; -pub use unpartitioned::*; +pub use hash_join_factory::HashJoinFactory; +pub use memory::*; +pub use transform_hash_join::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs index 0f3f7cb8f560b..92743c6f2e9d4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs @@ -19,8 +19,8 @@ use databend_common_expression::FunctionContext; use databend_common_functions::BUILTIN_FUNCTIONS; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeHashStatistics; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use 
crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeHashStatistics; pub struct PerformanceContext { pub probe_result: ProbedRows, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs index 0bc94bf6e99eb..815827b3fbe48 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs @@ -31,10 +31,10 @@ use databend_common_pipeline::core::ProcessorPtr; use log::info; use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; -use crate::pipelines::processors::transforms::new_hash_join::join::FinishedJoin; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; pub struct TransformHashJoin { build_port: Arc, From cf692d8dec7d9dc3d7157dcbfd2256a56d2f98cf Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 26 Mar 2026 23:58:06 +0800 Subject: [PATCH 15/38] z --- .../transforms/new_hash_join/common/join.rs | 40 ++ 
.../new_hash_join/common/probe_stream.rs | 2 +- .../partitioned/chunk_accumulator.rs | 13 - .../partitioned/compact_hash_table.rs | 54 ++- .../partitioned/compact_probe_stream.rs | 211 --------- .../new_hash_join/partitioned/inner_join.rs | 226 +++++---- .../new_hash_join/partitioned/left_join.rs | 421 ++++++++++------- .../partitioned/left_join_anti.rs | 297 +++++++----- .../partitioned/left_join_semi.rs | 274 +++++++---- .../new_hash_join/partitioned/mod.rs | 4 +- .../partitioned/partitioned_build.rs | 437 +++++++++++++++--- .../new_hash_join/partitioned/right_join.rs | 317 +++++++------ .../partitioned/right_join_anti.rs | 224 +++------ .../partitioned/right_join_semi.rs | 297 +++++++----- .../unpartitioned/memory/inner_join.rs | 41 +- .../new_hash_join/unpartitioned/mod.rs | 1 + 16 files changed, 1645 insertions(+), 1214 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs index d3e4cc5bf9cc8..524b026b8a158 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs @@ -16,6 +16,7 @@ use databend_common_base::base::ProgressValues; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; @@ -89,3 +90,42 @@ impl Join for FinishedJoin { Err(ErrorCode::Internal("Join is finished")) } } + +pub struct InnerHashJoinFilterStream<'a> { + inner: Box, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> InnerHashJoinFilterStream<'a> { + pub fn create( + inner: Box, + filter_executor: &'a mut 
FilterExecutor, + ) -> Box { + Box::new(InnerHashJoinFilterStream { + inner, + filter_executor, + }) + } +} + +impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { + fn next(&mut self) -> Result> { + loop { + let Some(data_block) = self.inner.next()? else { + return Ok(None); + }; + + if data_block.is_empty() { + continue; + } + + let data_block = self.filter_executor.filter(data_block)?; + + if data_block.is_empty() { + continue; + } + + return Ok(Some(data_block)); + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs index 9911ee8eaa133..d1960ef572853 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs @@ -61,6 +61,6 @@ impl ProbedRows { } } -pub trait ProbeStream { +pub trait ProbeStream: Send + Sync { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()>; } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs index 0a0999fa25a5d..02223c3b9d037 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs @@ -48,11 +48,6 @@ impl FixedSizeChunkAccumulator { } } - pub fn reset(&mut self) { - self.builder_rows = 0; - self.builders = vec![]; - } - fn ensure_builders(&mut self, block: &DataBlock) { if self.builders.is_empty() { self.builders = block @@ -183,14 +178,6 @@ mod tests { assert!(acc.finalize().is_none()); } - #[test] - fn test_reset() { - let mut acc = FixedSizeChunkAccumulator::new(4); - 
acc.accumulate(make_int_block(vec![1, 2, 3])); - acc.reset(); - assert!(acc.finalize().is_none()); - } - #[test] fn test_multi_column_blocks() { let mut acc = FixedSizeChunkAccumulator::new(3); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs index 62b375f3f8d45..02a7f141faff3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_column::bitmap::Bitmap; + /// Index 0 is a sentinel (empty/chain-end). Actual rows are indexed from 1. /// Memory per row: 4 bytes (next chain) vs current ~32 bytes (pointer-based entry). 
/// @@ -57,7 +59,7 @@ pub struct CompactJoinHashTable { /// Bucket array: first[hash & mask] = first row index (1-based) first: Vec, /// Chain array: next[row_index] = next row in same bucket (0 = end) - next: Vec, + pub next: Vec, /// Bucket count minus one, for masking bucket_mask: usize, } @@ -139,4 +141,54 @@ impl CompactJoinHashTable { let target = num_rows + (num_rows.saturating_sub(1)) / 7; target.next_power_of_two() } + + pub fn probe(&self, hashes: &mut [u64], bitmap: Option) -> usize { + let mut valids = None; + + if let Some(bitmap) = bitmap { + if bitmap.null_count() == bitmap.len() { + hashes.iter_mut().for_each(|hash| { + *hash = 0; + }); + return 0; + } else if bitmap.null_count() > 0 { + valids = Some(bitmap); + } + } + + let mut count = 0; + + match valids { + Some(valids) => { + valids + .iter() + .zip(hashes.iter_mut()) + .for_each(|(valid, hash)| { + if valid { + let bucket = (*hash as usize) & self.bucket_mask; + if self.first[bucket] != I::default() { + *hash = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *hash = 0; + } + } else { + *hash = 0; + } + }); + } + None => { + hashes.iter_mut().for_each(|hash| { + let bucket = (*hash as usize) & self.bucket_mask; + if self.first[bucket] != I::default() { + *hash = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *hash = 0; + } + }); + } + } + count + } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs deleted file mode 100644 index 60c13f837c345..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_probe_stream.rs +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use databend_common_column::bitmap::Bitmap; -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_expression::FunctionContext; -use databend_common_expression::HashMethod; -use databend_common_expression::HashMethodKind; -use databend_common_expression::KeyAccessor; -use databend_common_expression::KeysState; -use databend_common_expression::ProjectedBlock; -use databend_common_expression::with_hash_method; - -use super::compact_hash_table::CompactJoinHashTable; -use super::partitioned_build::CHUNK_BITS; -use super::partitioned_build::CHUNK_SIZE; -use super::partitioned_build::flat_to_row_ptr; -use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; - -const CHUNK_MASK: usize = CHUNK_SIZE - 1; - -struct CompactProbeStream<'a, Key: ?Sized + Eq, const MATCHED: bool> { - key_idx: usize, - build_idx: u32, - matched_num_rows: usize, - probe_validity: Option, - - probe_hashes: Vec, - bucket_mask: usize, - probe_acc: Box>, - build_accs: Vec>>, - - hash_table: &'a CompactJoinHashTable, -} - -impl<'a, Key: ?Sized + Eq + Send + Sync + 'static, const MATCHED: bool> ProbeStream - for CompactProbeStream<'a, Key, MATCHED> -{ - fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { - while self.key_idx < self.probe_acc.len() { - if res.matched_probe.len() >= max_rows { - break; - } - - if self - 
.probe_validity - .as_ref() - .is_some_and(|validity| !validity.get_bit(self.key_idx)) - { - if !MATCHED { - res.unmatched.push(self.key_idx as u64); - } - self.key_idx += 1; - continue; - } - - if self.build_idx == 0 { - let bucket = (self.probe_hashes[self.key_idx] as usize) & self.bucket_mask; - self.build_idx = self.hash_table.first_index(bucket); - - if self.build_idx == 0 { - if !MATCHED { - res.unmatched.push(self.key_idx as u64); - } - self.key_idx += 1; - continue; - } - } - - let probe_key = unsafe { self.probe_acc.key_unchecked(self.key_idx) }; - - while self.build_idx != 0 { - let bi = self.build_idx as usize; - let row_idx = (bi - 1) & CHUNK_MASK; - let chunk_idx = (bi - 1) >> CHUNK_BITS; - let build_key = unsafe { self.build_accs[chunk_idx].key_unchecked(row_idx) }; - - if build_key == probe_key { - res.matched_probe.push(self.key_idx as u64); - res.matched_build.push(flat_to_row_ptr(bi)); - self.matched_num_rows += 1; - - if res.matched_probe.len() >= max_rows { - self.build_idx = self.hash_table.next_index(self.build_idx); - if self.build_idx == 0 { - self.key_idx += 1; - self.matched_num_rows = 0; - } - return Ok(()); - } - } - self.build_idx = self.hash_table.next_index(self.build_idx); - } - - if !MATCHED && self.matched_num_rows == 0 { - res.unmatched.push(self.key_idx as u64); - } - self.key_idx += 1; - self.matched_num_rows = 0; - } - Ok(()) - } -} - -fn create_probe_stream_inner<'a, M: HashMethod, const MATCHED: bool>( - method: &M, - hash_table: &'a CompactJoinHashTable, - build_keys_states: &'a [KeysState], - desc: &HashJoinDesc, - function_ctx: &FunctionContext, - data: &DataBlock, -) -> Result> -where - M::HashKey: Send + Sync, -{ - let probe_keys_entries = desc.probe_key(data, function_ctx)?; - let mut probe_keys_block = DataBlock::new(probe_keys_entries, data.num_rows()); - let probe_validity = match desc.from_correlated_subquery { - true => None, - false => desc.build_valids_by_keys(&probe_keys_block)?, - }; - - 
desc.remove_keys_nullable(&mut probe_keys_block); - - let keys = ProjectedBlock::from(probe_keys_block.columns()); - let probe_ks = method.build_keys_state(keys, data.num_rows())?; - let mut probe_hashes = Vec::with_capacity(data.num_rows()); - method.build_keys_hashes(&probe_ks, &mut probe_hashes); - - let probe_acc = method.build_keys_accessor(probe_ks)?; - let build_accs = build_keys_states - .iter() - .map(|ks| method.build_keys_accessor(ks.clone())) - .collect::>>()?; - - let bucket_mask = hash_table.bucket_mask(); - - Ok(Box::new(CompactProbeStream::<'a, M::HashKey, MATCHED> { - key_idx: 0, - build_idx: 0, - matched_num_rows: 0, - probe_validity, - probe_hashes, - bucket_mask, - probe_acc, - build_accs, - hash_table, - })) -} - -/// Create a CompactProbeStream that only tracks matched rows (MATCHED=true). -/// For inner join, left semi, right series. -pub fn create_compact_probe_matched<'a>( - hash_table: &'a CompactJoinHashTable, - build_keys_states: &'a [KeysState], - method: &HashMethodKind, - desc: &HashJoinDesc, - function_ctx: &FunctionContext, - data: &DataBlock, -) -> Result> { - with_hash_method!(|T| match method { - HashMethodKind::T(method) => { - create_probe_stream_inner::<_, true>( - method, - hash_table, - build_keys_states, - desc, - function_ctx, - data, - ) - } - }) -} - -/// Create a CompactProbeStream that also tracks unmatched rows (MATCHED=false). -/// For left join, left anti. 
-pub fn create_compact_probe<'a>( - hash_table: &'a CompactJoinHashTable, - build_keys_states: &'a [KeysState], - method: &HashMethodKind, - desc: &HashJoinDesc, - function_ctx: &FunctionContext, - data: &DataBlock, -) -> Result> { - with_hash_method!(|T| match method { - HashMethodKind::T(method) => { - create_probe_stream_inner::<_, false>( - method, - hash_table, - build_keys_states, - desc, - function_ctx, - data, - ) - } - }) -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs index df070235d1363..e0856759ac803 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -19,24 +19,26 @@ use databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; -use databend_common_functions::BUILTIN_FUNCTIONS; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; pub struct PartitionedInnerJoin { - build: PartitionedBuild, - filter_executor: Option, - max_block_size: usize, + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, } impl PartitionedInnerJoin { @@ -46,82 +48,107 @@ impl PartitionedInnerJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + PartitionedInnerJoin { - build: PartitionedBuild::create(method, desc, function_ctx), - filter_executor, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), max_block_size, + desc, + context, } } } -pub fn result_block( - desc: &HashJoinDesc, - probe_block: Option, - build_block: Option, - num_rows: usize, -) -> DataBlock { - let mut result_block = match (probe_block, build_block) { - (Some(mut p), Some(b)) => { - p.merge_block(b); - p +impl Join for PartitionedInnerJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); } - (Some(p), None) => p, - (None, Some(b)) => b, - (None, None) => DataBlock::new(vec![], num_rows), - }; - - for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter().cloned() { - let entry = match (is_probe_nullable, 
is_build_nullable) { - (true, true) | (false, false) => result_block.get_by_offset(index).clone(), - (true, false) => result_block.get_by_offset(index).clone().remove_nullable(), - (false, true) => { - let entry = result_block.get_by_offset(index); - let col = entry.to_column(); - match col.is_null() || col.is_nullable() { - true => entry.clone(), - false => BlockEntry::from(NullableColumn::new_column( - col, - Bitmap::new_constant(true, result_block.num_rows()), - )), - } - } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, }; - result_block.add_entry(entry); + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + let joined_stream = PartitionedInnerJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + ); + + match &mut self.context.filter_executor { + None => Ok(joined_stream), + Some(filter_executor) => Ok(InnerHashJoinFilterStream::create( + joined_stream, + filter_executor, + )), + } } - result_block } struct PartitionedInnerJoinStream<'a> { desc: Arc, probe_data_block: DataBlock, - build: &'a PartitionedBuild, - probe_stream: Box, - probed_rows: ProbedRows, - filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +impl<'a> PartitionedInnerJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(PartitionedInnerJoinStream { + desc, + build, + probed_rows, 
+ probe_data_block, + probe_keys_stream, + }) + } } impl<'a> JoinStream for PartitionedInnerJoinStream<'a> { fn next(&mut self) -> Result> { loop { self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if self.probed_rows.is_empty() { return Ok(None); } + if self.probed_rows.is_all_unmatched() { + continue; + } + let probe_block = match self.probe_data_block.num_columns() { 0 => None, _ => Some(DataBlock::take( @@ -129,54 +156,53 @@ impl<'a> JoinStream for PartitionedInnerJoinStream<'a> { self.probed_rows.matched_probe.as_slice(), )?), }; - let build_block = self - .build - .gather_build_block(&self.probed_rows.matched_build); - let num_rows = self.probed_rows.matched_probe.len(); - - let mut block = result_block(&self.desc, probe_block, build_block, num_rows); - if let Some(filter) = self.filter_executor.as_mut() { - block = filter.filter(block)?; - if block.is_empty() { - continue; + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) } - } - - return Ok(Some(block)); - } - } -} + }; -impl Join for PartitionedInnerJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()), + }; - fn final_build(&mut self) -> Result> { - self.build.final_build() - } + for (index, (is_probe_nullable, is_build_nullable)) in + 
self.desc.probe_to_build.iter().cloned() + { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(index).clone(), + (true, false) => result_block.get_by_offset(index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + + result_block.add_entry(entry); + } - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() || self.build.num_rows == 0 { - return Ok(Box::new(EmptyJoinStream)); + return Ok(Some(result_block)); } - - let probe_stream = self.build.create_probe_matched(&data)?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedInnerJoinStream { - desc: self.build.desc.clone(), - probe_data_block, - build: &self.build, - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - })) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs index 81e7c6f3d8c2e..5e9a304d58537 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -24,10 +24,9 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::Scalar; use databend_common_expression::types::DataType; -use 
databend_common_functions::BUILTIN_FUNCTIONS; +use databend_common_expression::types::NullableColumn; -use super::inner_join::result_block; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; @@ -35,23 +34,15 @@ use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinS use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::partitioned_build::ProbeData; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; use crate::pipelines::processors::transforms::wrap_true_validity; -pub fn null_block(types: &[DataType], num_rows: usize) -> Option { - if types.is_empty() { - return None; - } - let columns = types - .iter() - .map(|t| BlockEntry::new_const_column(t.wrap_nullable(), Scalar::Null, num_rows)) - .collect::>(); - Some(DataBlock::new(columns, num_rows)) -} - pub struct PartitionedLeftJoin { - build: PartitionedBuild, - filter_executor: Option, - max_block_size: usize, + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: FunctionContext, + performance_context: PerformanceContext, } impl PartitionedLeftJoin { @@ -61,108 +52,160 @@ impl PartitionedLeftJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + 
PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + PartitionedLeftJoin { - build: PartitionedBuild::create(method, desc, function_ctx), - filter_executor, - max_block_size, + desc: desc.clone(), + function_ctx: function_ctx.clone(), + performance_context: context, + build: PartitionedHashJoinState::create(method, desc, Arc::new(function_ctx)), } } } -fn wrap_nullable_build(build_block: DataBlock, num_rows: usize) -> DataBlock { - let true_validity = Bitmap::new_constant(true, num_rows); - let entries = build_block - .columns() - .iter() - .map(|c| wrap_true_validity(c, num_rows, &true_validity)); - DataBlock::from_iter(entries, num_rows) -} +impl Join for PartitionedLeftJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } -struct PartitionedLeftJoinStream<'a> { - desc: Arc, - probe_data_block: DataBlock, - build: &'a PartitionedBuild, - probe_stream: Box, - probed_rows: ProbedRows, - filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, - // Accumulated unmatched probe indices (no hash match) - unmatched_indices: Vec, - // Per-probe-row state for filter case: 0=unseen, 1=matched-no-pass, 2=passed - row_state: Vec, - has_filter: bool, - probe_done: bool, - unmatched_offset: usize, -} + fn final_build(&mut self) -> Result> { + self.build.final_build() + } -impl<'a> PartitionedLeftJoinStream<'a> { - fn output_unmatched(&mut self) -> Result> { - // Collect all unmatched indices: from ProbeStream + filter-failed matched rows - if self.unmatched_offset == 0 && self.has_filter { - for i in 0..self.row_state.len() { - if self.row_state[i] == 1 { - self.unmatched_indices.push(i as u64); - } - } + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); } - if self.unmatched_offset >= self.unmatched_indices.len() { - return Ok(None); + if self.build.num_rows == 0 { + let num_rows = data.num_rows(); + + let types = self + 
.desc + .build_schema + .fields + .iter() + .map(|x| x.data_type().clone()) + .collect::>(); + + let build_block = + match crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + &types, + data.num_rows(), + ) { + None => None, + Some(data_block) => Some(data_block.project(&self.desc.build_projection)), + }; + + let probe_block = Some(data.project(&self.desc.probe_projection)); + let result_block = final_result_block(&self.desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(result_block)))); } - let end = (self.unmatched_offset + self.max_block_size).min(self.unmatched_indices.len()); - let batch = &self.unmatched_indices[self.unmatched_offset..end]; - self.unmatched_offset = end; + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; - let probe_block = match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take(&self.probe_data_block, batch)?), + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, }; - let build_block = null_block(&self.build.column_types, batch.len()); - Ok(Some(result_block( - &self.desc, - probe_block, - build_block, - batch.len(), - ))) + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_stream = self.build.probe::(probe_data)?; + + match self.performance_context.filter_executor.as_mut() { + None => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + Some(filter_executor), + )), + } } } -impl<'a> JoinStream 
for PartitionedLeftJoinStream<'a> { - fn next(&mut self) -> Result> { - if self.probe_done { - return self.output_unmatched(); - } +struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + conjunct_unmatched: Vec, + unmatched_rows: Vec, + filter_executor: Option<&'a mut FilterExecutor>, +} + +unsafe impl<'a, const CONJUNCT: bool> Send for OuterLeftHashJoinStream<'a, CONJUNCT> {} +unsafe impl<'a, const CONJUNCT: bool> Sync for OuterLeftHashJoinStream<'a, CONJUNCT> {} +impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { loop { self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; - if self.probed_rows.is_empty() { - self.probe_done = true; - return self.output_unmatched(); + if !CONJUNCT && !self.probed_rows.unmatched.is_empty() { + self.unmatched_rows + .extend_from_slice(&self.probed_rows.unmatched); } - // Save unmatched indices - self.unmatched_indices - .extend_from_slice(&self.probed_rows.unmatched); + if self.probed_rows.is_empty() { + if self.conjunct_unmatched.is_empty() && self.unmatched_rows.is_empty() { + return Ok(None); + } + + let unmatched_row_id = match CONJUNCT { + true => std::mem::take(&mut self.conjunct_unmatched) + .into_iter() + .enumerate() + .filter(|(_, matched)| *matched == 0) + .map(|(row_id, _)| row_id as u64) + .collect::>(), + false => std::mem::take(&mut self.unmatched_rows), + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + unmatched_row_id.as_slice(), + )?), + }; + + let types = &self.join_state.column_types; + let build_block = + 
crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + types, + unmatched_row_id.len(), + ); + + return Ok(Some(crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + unmatched_row_id.len(), + ))); + } if self.probed_rows.matched_probe.is_empty() { continue; } - let num_matched = self.probed_rows.matched_probe.len(); let probe_block = match self.probe_data_block.num_columns() { 0 => None, _ => Some(DataBlock::take( @@ -171,101 +214,147 @@ impl<'a> JoinStream for PartitionedLeftJoinStream<'a> { )?), }; - let build_block = self - .build - .gather_build_block(&self.probed_rows.matched_build); - let build_block = build_block.map(|b| wrap_nullable_build(b, num_matched)); + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + let build_block = DataBlock::take_column_vec( + self.join_state.columns.as_slice(), + self.join_state.column_types.as_slice(), + row_ptrs, + ); + + let true_validity = Bitmap::new_constant(true, row_ptrs.len()); + let entries = build_block + .columns() + .iter() + .map(|c| wrap_true_validity(c, row_ptrs.len(), &true_validity)); + Some(DataBlock::from_iter(entries, row_ptrs.len())) + } + }; - let mut block = result_block(&self.desc, probe_block, build_block, num_matched); + let mut result_block = crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); - if let Some(filter) = self.filter_executor.as_mut() { - // Track matched rows - for &idx in &self.probed_rows.matched_probe { - let i = idx as usize; - if self.row_state[i] == 0 { - self.row_state[i] = 1; - } - } + if CONJUNCT && let Some(filter_executor) = self.filter_executor.as_mut() { + let result_count = filter_executor.select(&result_block)?; - let count = 
filter.select(&block)?; - if count == 0 { + if result_count == 0 { continue; } - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; - self.row_state[probe_idx] = 2; + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; + self.conjunct_unmatched[row_id] = 1; } - let origin_rows = block.num_rows(); - block = filter.take(block, origin_rows, count)?; + let origin_rows = result_block.num_rows(); + result_block = filter_executor.take(result_block, origin_rows, result_count)?; } - if !block.is_empty() { - return Ok(Some(block)); - } + return Ok(Some(result_block)); } } } -impl Join for PartitionedLeftJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } +impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + let num_rows = probe_data_block.num_rows(); + let pending_unmatched = match CONJUNCT { + true => vec![0; num_rows], + false => Vec::new(), + }; - fn final_build(&mut self) -> Result> { - self.build.final_build() + let unmatched_rows = match CONJUNCT { + true => Vec::new(), + false => Vec::with_capacity(num_rows), + }; + + probed_rows.unmatched.reserve(num_rows); + Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + unmatched_rows, + conjunct_unmatched: pending_unmatched, + }) } +} - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() { - return Ok(Box::new(EmptyJoinStream)); +pub fn final_result_block( + desc: &HashJoinDesc, + 
probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + if !desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; - let desc = &self.build.desc; + result_block.add_entry(entry); + } + } + result_block +} - if self.build.num_rows == 0 { - let num_rows = data.num_rows(); - let types: Vec<_> = desc - .build_schema - .fields +#[allow(dead_code)] +pub fn null_block(types: &[DataType], num_rows: usize) -> Option { + match types.is_empty() { + true => None, + false => { + let columns = types .iter() - .map(|x| x.data_type().clone()) - .collect(); - let build_block = - null_block(&types, num_rows).map(|b| b.project(&desc.build_projection)); - let probe_block = Some(data.project(&desc.probe_projection)); - let block = result_block(desc, probe_block, build_block, num_rows); - return Ok(Box::new(OneBlockJoinStream(Some(block)))); + .map(|column_type| { + BlockEntry::new_const_column( + column_type.wrap_nullable(), + Scalar::Null, + num_rows, + ) + }) + .collect::>(); + + Some(DataBlock::new(columns, num_rows)) } - - let num_probe_rows = data.num_rows(); - 
let has_filter = self.filter_executor.is_some(); - let probe_stream = self.build.create_probe(&data)?; - let probe_data_block = data.project(&desc.probe_projection); - - Ok(Box::new(PartitionedLeftJoinStream { - desc: self.build.desc.clone(), - probe_data_block, - build: &self.build, - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - unmatched_indices: Vec::new(), - row_state: if has_filter { - vec![0u8; num_probe_rows] - } else { - Vec::new() - }, - has_filter, - probe_done: false, - unmatched_offset: 0, - })) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs index c70fab517df2e..a904da5635d4a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -21,10 +21,9 @@ use databend_common_expression::DataBlock; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; -use databend_common_functions::BUILTIN_FUNCTIONS; -use super::inner_join::result_block; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; @@ -32,11 +31,14 @@ use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinS use 
crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; pub struct PartitionedLeftAntiJoin { - build: PartitionedBuild, - filter_executor: Option, - max_block_size: usize, + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, } impl PartitionedLeftAntiJoin { @@ -46,129 +48,216 @@ impl PartitionedLeftAntiJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + PartitionedLeftAntiJoin { - build: PartitionedBuild::create(method, desc, function_ctx), - filter_executor, - max_block_size, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + desc, + context, } } } -struct PartitionedLeftAntiJoinStream<'a> { - probe_data_block: DataBlock, - build: &'a PartitionedBuild, - desc: Arc, - probe_stream: Box, - probed_rows: ProbedRows, - filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, - excluded: Vec, - probe_done: bool, + +impl Join for PartitionedLeftAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: 
DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + if self.build.num_rows == 0 { + let probe_projected = data.project(&self.desc.probe_projection); + return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); + } + + let num_probe_rows = data.num_rows(); + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match &mut self.context.filter_executor { + None => Ok(LeftAntiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )), + Some(filter_executor) => Ok(LeftAntiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + num_probe_rows, + )), + } + } } -impl<'a> JoinStream for PartitionedLeftAntiJoinStream<'a> { +struct LeftAntiHashJoinStream<'a> { + probe_data_block: Option, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +impl<'a> LeftAntiHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(LeftAntiHashJoinStream { + probed_rows, + probe_data_block: Some(probe_data_block), + probe_keys_stream, + }) + } +} + +impl<'a> JoinStream for LeftAntiHashJoinStream<'a> { fn next(&mut self) -> Result> { - if self.probe_done { + let Some(probe_data_block) = self.probe_data_block.take() else { return Ok(None); - } + }; + + let num_rows = probe_data_block.num_rows(); + let mut selected = vec![false; num_rows]; loop { self.probed_rows.clear(); - 
self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if self.probed_rows.is_empty() { - self.probe_done = true; - let bitmap = Bitmap::from_trusted_len_iter(self.excluded.iter().map(|e| !e)); - return match bitmap.true_count() { - 0 => Ok(None), - _ => Ok(Some( - self.probe_data_block.clone().filter_with_bitmap(&bitmap)?, - )), - }; + let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter()); + return Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)); } - if self.probed_rows.matched_probe.is_empty() { - continue; - } - - if let Some(filter) = self.filter_executor.as_mut() { - let num_matched = self.probed_rows.matched_probe.len(); - let probe_block = match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take( - &self.probe_data_block, - self.probed_rows.matched_probe.as_slice(), - )?), - }; - let build_block = self - .build - .gather_build_block(&self.probed_rows.matched_build); - let block = result_block(&self.desc, probe_block, build_block, num_matched); - - let count = filter.select(&block)?; - if count > 0 { - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; - self.excluded[probe_idx] = true; - } - } - } else { - for &idx in &self.probed_rows.matched_probe { - self.excluded[idx as usize] = true; - } + for idx in &self.probed_rows.unmatched { + selected[*idx as usize] = true; } } } } -impl Join for PartitionedLeftAntiJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } +struct LeftAntiFilterHashJoinStream<'a> { + desc: Arc, + probe_data_block: Option, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, + selected: Vec, +} - fn 
final_build(&mut self) -> Result> { - self.build.final_build() +impl<'a> LeftAntiFilterHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, + num_probe_rows: usize, + ) -> Box { + Box::new(LeftAntiFilterHashJoinStream { + desc, + build, + probed_rows, + filter_executor, + probe_keys_stream, + probe_data_block: Some(probe_data_block), + selected: vec![true; num_probe_rows], + }) } +} - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() { - return Ok(Box::new(EmptyJoinStream)); - } +impl<'a> JoinStream for LeftAntiFilterHashJoinStream<'a> { + fn next(&mut self) -> Result> { + let Some(probe_data_block) = self.probe_data_block.take() else { + return Ok(None); + }; - if self.build.num_rows == 0 { - let probe_projected = data.project(&self.build.desc.probe_projection); - return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + break; + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + let probe_block = match probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let result_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + let selected_rows = self.filter_executor.select(&result_block)?; + + if selected_rows == 
result_block.num_rows() { + for probe_idx in &self.probed_rows.matched_probe { + self.selected[*probe_idx as usize] = false; + } + } else if selected_rows != 0 { + let selection = self.filter_executor.true_selection(); + for idx in selection[..selected_rows].iter() { + let idx = self.probed_rows.matched_probe[*idx as usize]; + self.selected[idx as usize] = false; + } + } } - let num_probe_rows = data.num_rows(); - let probe_stream = self.build.create_probe_matched(&data)?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedLeftAntiJoinStream { - probe_data_block, - build: &self.build, - desc: self.build.desc.clone(), - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - excluded: vec![false; num_probe_rows], - probe_done: false, - })) + let bitmap = Bitmap::from_trusted_len_iter(self.selected.iter().copied()); + match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)), + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs index 30b39c6de86ce..7baffe38c2fea 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -21,21 +21,23 @@ use databend_common_expression::DataBlock; use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; -use databend_common_functions::BUILTIN_FUNCTIONS; -use super::inner_join::result_block; -use 
super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; pub struct PartitionedLeftSemiJoin { - build: PartitionedBuild, - filter_executor: Option, - max_block_size: usize, + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, } impl PartitionedLeftSemiJoin { @@ -45,125 +47,205 @@ impl PartitionedLeftSemiJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + PartitionedLeftSemiJoin { - build: PartitionedBuild::create(method, desc, function_ctx), - filter_executor, - max_block_size, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + desc, + context, } } } -struct PartitionedLeftSemiJoinStream<'a> { +impl Join for PartitionedLeftSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) 
+ } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let num_probe_rows = data.num_rows(); + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match &mut self.context.filter_executor { + None => Ok(LeftSemiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )), + Some(filter_executor) => Ok(LeftSemiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + num_probe_rows, + )), + } + } +} + +struct LeftSemiHashJoinStream<'a> { probe_data_block: DataBlock, - build: &'a PartitionedBuild, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +impl<'a> LeftSemiHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(LeftSemiHashJoinStream { + probed_rows, + probe_data_block, + probe_keys_stream, + }) + } +} + +impl<'a> JoinStream for LeftSemiHashJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + return Ok(Some(DataBlock::take( + 
&self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?)); + } + } +} + +struct LeftSemiFilterHashJoinStream<'a> { desc: Arc, - probe_stream: Box, - probed_rows: ProbedRows, - filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, + probe_data_block: Option, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, selected: Vec, - probe_done: bool, } -impl<'a> JoinStream for PartitionedLeftSemiJoinStream<'a> { +impl<'a> LeftSemiFilterHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, + num_probe_rows: usize, + ) -> Box { + Box::new(LeftSemiFilterHashJoinStream { + desc, + build, + probed_rows, + filter_executor, + probe_keys_stream, + probe_data_block: Some(probe_data_block), + selected: vec![false; num_probe_rows], + }) + } +} + +impl<'a> JoinStream for LeftSemiFilterHashJoinStream<'a> { fn next(&mut self) -> Result> { - if self.probe_done { + let Some(probe_data_block) = self.probe_data_block.take() else { return Ok(None); - } + }; loop { self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if self.probed_rows.is_empty() { - self.probe_done = true; - let bitmap = Bitmap::from_trusted_len_iter(self.selected.iter().copied()); - return match bitmap.true_count() { - 0 => Ok(None), - _ => Ok(Some( - self.probe_data_block.clone().filter_with_bitmap(&bitmap)?, - )), - }; + break; } - if self.probed_rows.matched_probe.is_empty() { + if self.probed_rows.is_all_unmatched() { continue; } - if let Some(filter) = self.filter_executor.as_mut() { - let num_matched = self.probed_rows.matched_probe.len(); - let 
probe_block = match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take( - &self.probe_data_block, - self.probed_rows.matched_probe.as_slice(), - )?), - }; - let build_block = self - .build - .gather_build_block(&self.probed_rows.matched_build); - let block = result_block(&self.desc, probe_block, build_block, num_matched); - - let count = filter.select(&block)?; - if count > 0 { - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let probe_idx = self.probed_rows.matched_probe[sel_idx as usize] as usize; - self.selected[probe_idx] = true; - } + let probe_block = match probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let num_matched = self.probed_rows.matched_probe.len(); + let result = final_result_block(&self.desc, probe_block, build_block, num_matched); + + let selected_rows = self.filter_executor.select(&result)?; + + if selected_rows == result.num_rows() { + for probe_idx in &self.probed_rows.matched_probe { + self.selected[*probe_idx as usize] = true; } - } else { - for &idx in &self.probed_rows.matched_probe { + } else if selected_rows != 0 { + let selection = self.filter_executor.true_selection(); + for idx in selection[..selected_rows].iter() { + let idx = self.probed_rows.matched_probe[*idx as usize]; self.selected[idx as usize] = true; } } } - } -} -impl Join for PartitionedLeftSemiJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } - - fn final_build(&mut self) -> Result> { - self.build.final_build() - } + let bitmap = 
Bitmap::from_trusted_len_iter(self.selected.iter().copied()); - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() || self.build.num_rows == 0 { - return Ok(Box::new(EmptyJoinStream)); + match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)), } - - let num_probe_rows = data.num_rows(); - let probe_stream = self.build.create_probe_matched(&data)?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedLeftSemiJoinStream { - probe_data_block, - build: &self.build, - desc: self.build.desc.clone(), - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - selected: vec![false; num_probe_rows], - probe_done: false, - })) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs index 6ba6c62f824f6..ad53dcd1ac78f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs @@ -14,7 +14,6 @@ mod chunk_accumulator; mod compact_hash_table; -mod compact_probe_stream; mod inner_join; mod left_join; mod left_join_anti; @@ -23,7 +22,6 @@ mod partitioned_build; mod right_join; mod right_join_anti; mod right_join_semi; -#[allow(dead_code)] mod transform_hash_join; pub use compact_hash_table::CompactJoinHashTable; @@ -32,7 +30,7 @@ pub use inner_join::PartitionedInnerJoin; pub use left_join::PartitionedLeftJoin; pub use left_join_anti::PartitionedLeftAntiJoin; pub use left_join_semi::PartitionedLeftSemiJoin; -pub use partitioned_build::PartitionedBuild; +pub use 
partitioned_build::PartitionedHashJoinState; pub use right_join::PartitionedRightJoin; pub use right_join_anti::PartitionedRightAntiJoin; pub use right_join_semi::PartitionedRightSemiJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 85dfca8507507..0ca1e39304efe 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -15,12 +15,16 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; +use databend_common_column::binary::BinaryColumn; use databend_common_column::bitmap::Bitmap; +use databend_common_column::buffer::Buffer; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; +use databend_common_expression::FixedKey; use databend_common_expression::FromData; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethod; @@ -32,14 +36,17 @@ use databend_common_expression::types::AccessType; use databend_common_expression::types::BooleanType; use databend_common_expression::types::DataType; use databend_common_expression::with_hash_method; +use ethnum::u256; use super::chunk_accumulator::FixedSizeChunkAccumulator; use super::compact_hash_table::CompactJoinHashTable; -use super::compact_probe_stream::create_compact_probe; -use super::compact_probe_stream::create_compact_probe_matched; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::RowIndex; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::EmptyProbeStream; pub const CHUNK_BITS: usize = 16; pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536 @@ -54,11 +61,67 @@ pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr { } } +pub struct ProbeData { + keys: DataBlock, + valids: Option, +} + +impl ProbeData { + pub fn new(keys: DataBlock, valids: Option) -> Self { + ProbeData { keys, valids } + } + + pub fn num_rows(&self) -> usize { + self.keys.num_rows() + } + + pub fn columns(&self) -> &[BlockEntry] { + self.keys.columns() + } + + pub fn non_null_rows(&self) -> usize { + match &self.valids { + None => self.keys.num_rows(), + Some(valids) => valids.len() - valids.null_count(), + } + } + + pub fn into_raw(self) -> (DataBlock, Option) { + (self.keys, self.valids) + } +} + +pub enum BuildKeysStates { + UInt8(Vec>), + UInt16(Vec>), + UInt32(Vec>), + UInt64(Vec>), + UInt128(Vec>), + UInt256(Vec>), + Binary(Vec), +} + +impl BuildKeysStates { + pub fn new(method: &HashMethodKind) -> Self { + match method { + HashMethodKind::Serializer(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::SingleBinary(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::KeysU8(_) => BuildKeysStates::UInt8(vec![]), + HashMethodKind::KeysU16(_) => BuildKeysStates::UInt16(vec![]), + HashMethodKind::KeysU32(_) => BuildKeysStates::UInt32(vec![]), + HashMethodKind::KeysU64(_) => BuildKeysStates::UInt64(vec![]), + HashMethodKind::KeysU128(_) => BuildKeysStates::UInt128(vec![]), + HashMethodKind::KeysU256(_) => BuildKeysStates::UInt256(vec![]), + } + } +} + /// Per-thread build state for 
partitioned hash join. -pub struct PartitionedBuild { +pub struct PartitionedHashJoinState { pub chunks: Vec, pub method: HashMethodKind, - pub build_keys_states: Vec, + pub build_keys_states: BuildKeysStates, + pub chunk_keys_states: Vec, pub hash_table: CompactJoinHashTable, pub columns: Vec, @@ -69,7 +132,7 @@ pub struct PartitionedBuild { pub visited: Vec, pub desc: Arc, - pub function_ctx: FunctionContext, + pub function_ctx: Arc, /// When true, NULL build keys are kept in the data (not filtered out). /// Required for RIGHT and RIGHT ANTI joins where unmatched build rows @@ -82,32 +145,16 @@ pub struct PartitionedBuild { accumulator: FixedSizeChunkAccumulator, } -impl PartitionedBuild { +impl PartitionedHashJoinState { pub fn create( method: HashMethodKind, desc: Arc, - function_ctx: FunctionContext, - ) -> Self { - Self::create_with_options(method, desc, function_ctx, false) - } - - pub fn create_keep_null_keys( - method: HashMethodKind, - desc: Arc, - function_ctx: FunctionContext, + function_ctx: Arc, ) -> Self { - Self::create_with_options(method, desc, function_ctx, true) - } - - fn create_with_options( - method: HashMethodKind, - desc: Arc, - function_ctx: FunctionContext, - keep_null_keys: bool, - ) -> Self { - PartitionedBuild { + PartitionedHashJoinState { chunks: Vec::new(), - build_keys_states: Vec::new(), + build_keys_states: BuildKeysStates::new(&method), + chunk_keys_states: Vec::new(), hash_table: CompactJoinHashTable::new(0), columns: Vec::new(), column_types: Vec::new(), @@ -116,7 +163,7 @@ impl PartitionedBuild { desc, function_ctx, visited: Vec::new(), - keep_null_keys, + keep_null_keys: false, chunk_validities: Vec::new(), accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), build_block_idx: 0, @@ -161,7 +208,7 @@ impl PartitionedBuild { }); self.num_rows += num_rows; - self.build_keys_states.push(keys_state); + self.chunk_keys_states.push(keys_state); self.chunks.push(DataBlock::new(data_columns, num_rows)); 
self.chunk_validities.push(chunk_validity); Ok(()) @@ -236,7 +283,7 @@ impl PartitionedBuild { } let row_offset = CHUNK_SIZE * self.build_block_idx + 1; - let keys_state = &self.build_keys_states[self.build_block_idx]; + let keys_state = &self.chunk_keys_states[self.build_block_idx]; with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => { @@ -261,49 +308,6 @@ impl PartitionedBuild { } } - pub fn reset(&mut self) { - self.chunks.clear(); - self.build_keys_states.clear(); - self.hash_table = CompactJoinHashTable::new(0); - self.columns.clear(); - self.column_types.clear(); - self.num_rows = 0; - self.build_block_idx = 0; - self.visited.clear(); - self.chunk_validities.clear(); - self.accumulator.reset(); - } - - /// Create a probe stream that only tracks matched rows (for inner, left semi, right series). - pub fn create_probe_matched<'a>( - &'a self, - data: &DataBlock, - ) -> Result> { - create_compact_probe_matched( - &self.hash_table, - &self.build_keys_states, - &self.method, - &self.desc, - &self.function_ctx, - data, - ) - } - - /// Create a probe stream that also tracks unmatched rows (for left, left anti). - pub fn create_probe<'a>( - &'a self, - data: &DataBlock, - ) -> Result> { - create_compact_probe( - &self.hash_table, - &self.build_keys_states, - &self.method, - &self.desc, - &self.function_ctx, - data, - ) - } - /// Initialize visited tracking for right-side join types. 
pub fn init_visited(&mut self) { self.visited = vec![0u8; self.num_rows + 1]; @@ -334,4 +338,293 @@ impl PartitionedBuild { row_ptrs, )) } + + pub fn probe<'a, const MATCHED: bool>( + &'a self, + data: ProbeData, + ) -> Result> { + let num_rows = data.num_rows(); + let (keys_block, valids) = data.into_raw(); + let keys = ProjectedBlock::from(keys_block.columns()); + let mut hashes = Vec::with_capacity(num_rows); + + let (keys_state, matched_rows) = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let keys_state = method.build_keys_state(keys, num_rows)?; + method.build_keys_hashes(&keys_state, &mut hashes); + (keys_state, self.hash_table.probe(&mut hashes, valids)) + } + }); + + if matched_rows == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(hashes.len())), + }; + } + + Ok(match (&self.method, &self.build_keys_states) { + (HashMethodKind::KeysU8(_), BuildKeysStates::UInt8(states)) => { + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u8, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU16(_), BuildKeysStates::UInt16(states)) => { + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u16, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU32(_), BuildKeysStates::UInt32(states)) => { + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u32, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU64(_), BuildKeysStates::UInt64(states)) => { + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u64, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU128(_), BuildKeysStates::UInt128(states)) 
=> { + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u128, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU256(_), BuildKeysStates::UInt256(states)) => { + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u256, MATCHED, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + ( + HashMethodKind::Serializer(_) | HashMethodKind::SingleBinary(_), + BuildKeysStates::Binary(states), + ) => match keys_state { + KeysState::Column(Column::Binary(probe_keys)) + | KeysState::Column(Column::Variant(probe_keys)) + | KeysState::Column(Column::Bitmap(probe_keys)) => { + BinaryProbeStream::<'a, MATCHED, u32>::create( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }, + _ => unreachable!(), + }) + } +} + +struct PrimitiveProbeStream<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex = u32> +{ + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: Buffer, + build_keys: &'a [Buffer], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> + PrimitiveProbeStream<'a, T, MATCHED, I> +{ + #[allow(clippy::new_ret_no_self)] + pub fn new( + pointers: Vec, + build_keys: &'a [Buffer], + probe_keys: Buffer, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> ProbeStream + for PrimitiveProbeStream<'a, T, MATCHED, I> +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < 
res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + + if self.probe_keys[self.key_idx] + == self.build_keys[row_ptr.chunk_index as usize][row_ptr.row_index as usize] + { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + + if res.matched_probe.len() == max_rows { + self.build_idx = self.next[self.build_idx].to_usize(); + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + } + + self.build_idx = self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } +} + +struct BinaryProbeStream<'a, const MATCHED: bool, I: RowIndex = u32> { + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: BinaryColumn, + build_keys: &'a [BinaryColumn], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, const MATCHED: bool, I: RowIndex> BinaryProbeStream<'a, MATCHED, I> { + pub fn create( + pointers: Vec, + build_keys: &'a [BinaryColumn], + probe_keys: BinaryColumn, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, const MATCHED: bool, I: RowIndex> ProbeStream for BinaryProbeStream<'a, MATCHED, I> { + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == 
res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + if self.probe_keys.value(self.key_idx) + == self.build_keys[row_ptr.chunk_index as usize] + .value(row_ptr.row_index as usize) + { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + + if res.matched_probe.len() == max_rows { + self.build_idx = self.next[self.build_idx].to_usize(); + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + } + + self.build_idx = self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs index 3e03a5dad958f..e2197d21ff1c9 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -22,28 +22,30 @@ use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; -use databend_common_functions::BUILTIN_FUNCTIONS; -use 
super::compact_probe_stream::create_compact_probe_matched; -use super::inner_join::result_block; -use super::left_join::null_block; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::null_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; use crate::pipelines::processors::transforms::wrap_nullable_block; pub struct PartitionedRightJoin { - build: PartitionedBuild, - filter_executor: Option, + build: PartitionedHashJoinState, max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, finished: bool, } + impl PartitionedRightJoin { pub fn create( method: HashMethodKind, @@ -51,62 +53,144 @@ impl PartitionedRightJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let 
function_ctx = Arc::new(function_ctx); + PartitionedRightJoin { - build: PartitionedBuild::create_keep_null_keys(method, desc, function_ctx), - filter_executor, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), max_block_size, + desc, + context, finished: false, } } } -struct PartitionedRightJoinStream<'a> { +impl Join for PartitionedRightJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let mut probe_keys = { + let nullable_block = wrap_nullable_block(&data); + let probe_keys = self.desc.probe_key(&nullable_block, &self.function_ctx)?; + DataBlock::new(probe_keys, data.num_rows()) + }; + + let valids = self.desc.build_valids_by_keys(&probe_keys)?; + + self.desc.remove_keys_nullable(&mut probe_keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(probe_keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let mut probe_types = Vec::new(); + for (i, field) in 
self.desc.probe_schema.fields().iter().enumerate() { + if self.desc.probe_projection.contains(&i) { + probe_types.push(field.data_type().clone()); + } + } + + Ok(Some(Box::new(PartitionedRightFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + desc: self.desc.clone(), + probe_types, + }))) + } +} + +struct OuterRightHashJoinStream<'a, const CONJUNCT: bool> { desc: Arc, probe_data_block: DataBlock, - columns: &'a Vec, - column_types: &'a Vec, - visited: &'a mut Vec, - probe_stream: Box, - probed_rows: ProbedRows, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, } -impl<'a> PartitionedRightJoinStream<'a> { - fn gather_build_block(&self, row_ptrs: &[RowPtr]) -> Option { - if self.columns.is_empty() { - return None; - } - Some(DataBlock::take_column_vec( - self.columns, - self.column_types, - row_ptrs, - )) +impl<'a, const CONJUNCT: bool> OuterRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(OuterRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) } } -impl<'a> JoinStream for PartitionedRightJoinStream<'a> { +impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJUNCT> { fn next(&mut self) -> Result> { loop { self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if self.probed_rows.is_empty() { 
return Ok(None); } - let num_matched = self.probed_rows.matched_probe.len(); + if self.probed_rows.matched_probe.is_empty() { + continue; + } + let probe_block = match self.probe_data_block.num_columns() { 0 => None, _ => Some(wrap_nullable_block(&DataBlock::take( @@ -114,40 +198,77 @@ impl<'a> JoinStream for PartitionedRightJoinStream<'a> { self.probed_rows.matched_probe.as_slice(), )?)), }; - let build_block = self.gather_build_block(&self.probed_rows.matched_build); - let mut block = result_block(&self.desc, probe_block, build_block, num_matched); - - if let Some(filter) = self.filter_executor.as_mut() { - let count = filter.select(&block)?; - if count == 0 { - continue; + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) } - // Mark visited only for rows that pass filter - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; + }; + + let data_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { let flat_idx = (row_ptr.chunk_index as usize) * super::partitioned_build::CHUNK_SIZE + row_ptr.row_index as usize + 1; - self.visited[flat_idx] = 1; + self.build.visited.as_ptr(); + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + } } - let origin_rows = block.num_rows(); - block = filter.take(block, origin_rows, count)?; - } else { - // Mark all matched as visited + + return Ok(Some(data_block)); + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { for row_ptr in &self.probed_rows.matched_build { let flat_idx = (row_ptr.chunk_index as usize) * super::partitioned_build::CHUNK_SIZE + 
row_ptr.row_index as usize + 1; - self.visited[flat_idx] = 1; + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + } + } + + return Ok(Some(data_block)); + }; + + if !data_block.is_empty() { + let res_rows = filter_executor.select(&data_block)?; + + if res_rows == 0 { + continue; } - } - if !block.is_empty() { - return Ok(Some(block)); + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(res_rows) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + } + } + + let num_rows = data_block.num_rows(); + return Ok(Some(filter_executor.take(data_block, num_rows, res_rows)?)); } } } @@ -189,7 +310,7 @@ impl<'a> JoinStream for PartitionedRightFinalStream<'a> { )) }; - Ok(Some(result_block( + Ok(Some(final_result_block( &self.desc, probe_block, build_block, @@ -197,75 +318,3 @@ impl<'a> JoinStream for PartitionedRightFinalStream<'a> { ))) } } - -impl Join for PartitionedRightJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } - - fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) - } - - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() || self.build.num_rows == 0 { - return Ok(Box::new(EmptyJoinStream)); - } - - let probe_stream = create_compact_probe_matched( - &self.build.hash_table, - &self.build.build_keys_states, - &self.build.method, - &self.build.desc, - &self.build.function_ctx, - &data, - )?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedRightJoinStream { - desc: self.build.desc.clone(), - probe_data_block, - columns: &self.build.columns, - column_types: 
&self.build.column_types, - visited: &mut self.build.visited, - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - })) - } - - fn final_probe(&mut self) -> Result>> { - if self.finished || self.build.num_rows == 0 { - return Ok(None); - } - self.finished = true; - - let desc = self.build.desc.clone(); - let mut probe_types = Vec::new(); - for (i, field) in desc.probe_schema.fields().iter().enumerate() { - if desc.probe_projection.contains(&i) { - probe_types.push(field.data_type().clone()); - } - } - - Ok(Some(Box::new(PartitionedRightFinalStream { - columns: &self.build.columns, - column_types: &self.build.column_types, - visited: &self.build.visited, - num_rows: self.build.num_rows, - scan_idx: 1, - max_block_size: self.max_block_size, - desc, - probe_types, - }))) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs index 9f36b2abe82fc..95f4f24c1144c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -18,27 +18,26 @@ use databend_common_base::base::ProgressValues; use databend_common_exception::Result; use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; -use databend_common_functions::BUILTIN_FUNCTIONS; -use super::compact_probe_stream::create_compact_probe_matched; -use 
super::inner_join::result_block; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use super::partitioned_build::flat_to_row_ptr; +use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; pub struct PartitionedRightAntiJoin { - build: PartitionedBuild, - filter_executor: Option, + build: PartitionedHashJoinState, max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, finished: bool, } @@ -49,98 +48,87 @@ impl PartitionedRightAntiJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + PartitionedRightAntiJoin { - build: PartitionedBuild::create_keep_null_keys(method, desc, function_ctx), - filter_executor, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), max_block_size, + desc, + context, finished: false, } } } -/// Probe stream that marks visited build rows, outputs nothing. 
-struct PartitionedRightAntiProbeStream<'a> { - desc: Arc, - probe_data_block: DataBlock, - columns: &'a Vec, - column_types: &'a Vec, - visited: &'a mut Vec, - probe_stream: Box, - probed_rows: ProbedRows, - filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, -} -impl<'a> JoinStream for PartitionedRightAntiProbeStream<'a> { - fn next(&mut self) -> Result> { - loop { - self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; +impl Join for PartitionedRightAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } - if self.probed_rows.is_empty() { - return Ok(None); - } + fn final_build(&mut self) -> Result> { + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) + } - if self.probed_rows.matched_probe.is_empty() { - continue; - } + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } - if let Some(filter) = self.filter_executor.as_mut() { - let num_matched = self.probed_rows.matched_probe.len(); - let probe_block = match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take( - &self.probe_data_block, - self.probed_rows.matched_probe.as_slice(), - )?), - }; - let build_block = if self.columns.is_empty() { - None - } else { - Some(DataBlock::take_column_vec( - self.columns, - self.column_types, - &self.probed_rows.matched_build, - )) - }; - let block = result_block(&self.desc, probe_block, build_block, num_matched); - let count = filter.select(&block)?; - if count > 0 { - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; - self.visited[flat_idx] = 
1; - } - } - } else { - for row_ptr in &self.probed_rows.matched_build { - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; - self.visited[flat_idx] = 1; - } - } - // Right anti outputs nothing during probe + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightAntiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + }))) } } -/// Final stream: output unvisited build rows. 
struct PartitionedRightAntiFinalStream<'a> { columns: &'a Vec, column_types: &'a Vec, @@ -174,65 +162,3 @@ impl<'a> JoinStream for PartitionedRightAntiFinalStream<'a> { ))) } } - -impl Join for PartitionedRightAntiJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } - - fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) - } - - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() || self.build.num_rows == 0 { - return Ok(Box::new(EmptyJoinStream)); - } - - let probe_stream = create_compact_probe_matched( - &self.build.hash_table, - &self.build.build_keys_states, - &self.build.method, - &self.build.desc, - &self.build.function_ctx, - &data, - )?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedRightAntiProbeStream { - desc: self.build.desc.clone(), - probe_data_block, - columns: &self.build.columns, - column_types: &self.build.column_types, - visited: &mut self.build.visited, - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - })) - } - - fn final_probe(&mut self) -> Result>> { - if self.finished || self.build.num_rows == 0 { - return Ok(None); - } - self.finished = true; - - Ok(Some(Box::new(PartitionedRightAntiFinalStream { - columns: &self.build.columns, - column_types: &self.build.column_types, - visited: &self.build.visited, - num_rows: self.build.num_rows, - scan_idx: 1, - max_block_size: self.max_block_size, - }))) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs index 82fee79bba989..9717f162c1ba1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -22,11 +22,9 @@ use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; -use databend_common_functions::BUILTIN_FUNCTIONS; -use super::compact_probe_stream::create_compact_probe_matched; -use super::inner_join::result_block; -use super::partitioned_build::PartitionedBuild; +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; @@ -34,11 +32,15 @@ use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; pub struct PartitionedRightSemiJoin { - build: PartitionedBuild, - filter_executor: Option, + build: PartitionedHashJoinState, max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, finished: bool, } @@ -49,44 +51,122 @@ impl PartitionedRightSemiJoin { function_ctx: FunctionContext, max_block_size: usize, ) -> Self { - let 
filter_executor = desc.other_predicate.as_ref().map(|predicate| { - FilterExecutor::new( - predicate.clone(), - function_ctx.clone(), - max_block_size, - None, - &BUILTIN_FUNCTIONS, - false, - ) - }); + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + PartitionedRightSemiJoin { - build: PartitionedBuild::create(method, desc, function_ctx), - filter_executor, + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), max_block_size, + desc, + context, finished: false, } } } -/// Probe stream that marks visited build rows, outputs nothing. -struct PartitionedRightSemiProbeStream<'a> { +impl Join for PartitionedRightSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block(data) + } + + fn final_build(&mut self) -> Result> { + let progress = self.build.final_build()?; + if progress.is_none() { + self.build.init_visited(); + } + Ok(progress) + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + 
Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightSemiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + num_rows: self.build.num_rows, + scan_idx: 1, + max_block_size: self.max_block_size, + }))) + } +} + +pub(super) struct SemiRightHashJoinStream<'a, const CONJUNCT: bool> { desc: Arc, probe_data_block: DataBlock, - columns: &'a Vec, - column_types: &'a Vec, - visited: &'a mut Vec, - probe_stream: Box, - probed_rows: ProbedRows, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, filter_executor: Option<&'a mut FilterExecutor>, - max_block_size: usize, } -impl<'a> JoinStream for PartitionedRightSemiProbeStream<'a> { +impl<'a, const CONJUNCT: bool> SemiRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(SemiRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) + } +} + +impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUNCT> { fn next(&mut self) -> Result> { loop { self.probed_rows.clear(); - self.probe_stream - .advance(&mut self.probed_rows, self.max_block_size)?; + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; if self.probed_rows.is_empty() { return Ok(None); @@ -96,52 +176,83 @@ impl<'a> JoinStream for PartitionedRightSemiProbeStream<'a> { continue; } - if let Some(filter) = self.filter_executor.as_mut() { - let num_matched = self.probed_rows.matched_probe.len(); - let probe_block = 
match self.probe_data_block.num_columns() { - 0 => None, - _ => Some(DataBlock::take( - &self.probe_data_block, - self.probed_rows.matched_probe.as_slice(), - )?), - }; - let build_block = if self.columns.is_empty() { - None - } else { - Some(DataBlock::take_column_vec( - self.columns, - self.column_types, - &self.probed_rows.matched_build, - )) - }; - let block = result_block(&self.desc, probe_block, build_block, num_matched); - let count = filter.select(&block)?; - if count > 0 { - let true_sel = filter.true_selection(); - for &sel_idx in true_sel.iter().take(count) { - let row_ptr = &self.probed_rows.matched_build[sel_idx as usize]; - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; - self.visited[flat_idx] = 1; + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; } } - } else { + continue; + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { for row_ptr in &self.probed_rows.matched_build { let flat_idx = (row_ptr.chunk_index as usize) * super::partitioned_build::CHUNK_SIZE + row_ptr.row_index as usize + 1; - self.visited[flat_idx] = 1; + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + } + } + continue; + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let result_block = final_result_block( + &self.desc, + 
probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !result_block.is_empty() { + let result_count = filter_executor.select(&result_block)?; + + if result_count == 0 { + continue; + } + + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + let flat_idx = (row_ptr.chunk_index as usize) + * super::partitioned_build::CHUNK_SIZE + + row_ptr.row_index as usize + + 1; + unsafe { + *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + } } } - // Right semi outputs nothing during probe } } } -/// Final stream: output visited build rows. struct PartitionedRightSemiFinalStream<'a> { columns: &'a Vec, column_types: &'a Vec, @@ -175,65 +286,3 @@ impl<'a> JoinStream for PartitionedRightSemiFinalStream<'a> { ))) } } - -impl Join for PartitionedRightSemiJoin { - fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) - } - - fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) - } - - fn probe_block(&mut self, data: DataBlock) -> Result> { - if data.is_empty() || self.build.num_rows == 0 { - return Ok(Box::new(EmptyJoinStream)); - } - - let probe_stream = create_compact_probe_matched( - &self.build.hash_table, - &self.build.build_keys_states, - &self.build.method, - &self.build.desc, - &self.build.function_ctx, - &data, - )?; - let probe_data_block = data.project(&self.build.desc.probe_projection); - - Ok(Box::new(PartitionedRightSemiProbeStream { - desc: self.build.desc.clone(), - probe_data_block, - columns: &self.build.columns, - column_types: &self.build.column_types, - visited: &mut self.build.visited, - probe_stream, - probed_rows: ProbedRows::new( - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - Vec::with_capacity(self.max_block_size), - ), - filter_executor: 
self.filter_executor.as_mut(), - max_block_size: self.max_block_size, - })) - } - - fn final_probe(&mut self) -> Result>> { - if self.finished || self.build.num_rows == 0 { - return Ok(None); - } - self.finished = true; - - Ok(Some(Box::new(PartitionedRightSemiFinalStream { - columns: &self.build.columns, - column_types: &self.build.column_types, - visited: &self.build.visited, - num_rows: self.build.num_rows, - scan_idx: 1, - max_block_size: self.max_block_size, - }))) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index ebe8b2c13dc99..a039492b46d56 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -22,7 +22,6 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; @@ -37,6 +36,7 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; @@ -275,42 +275,3 @@ impl<'a> JoinStream for InnerHashJoinStream<'a> { } } } - -pub(super) struct InnerHashJoinFilterStream<'a> { - inner: Box, - filter_executor: &'a mut FilterExecutor, -} - -impl<'a> InnerHashJoinFilterStream<'a> { - pub fn create( - inner: Box, - filter_executor: &'a mut FilterExecutor, - ) -> Box { - Box::new(InnerHashJoinFilterStream { - inner, - filter_executor, - }) - } -} - -impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { - fn next(&mut self) -> Result> { - loop { - let Some(data_block) = self.inner.next()? else { - return Ok(None); - }; - - if data_block.is_empty() { - continue; - } - - let data_block = self.filter_executor.filter(data_block)?; - - if data_block.is_empty() { - continue; - } - - return Ok(Some(data_block)); - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs index 872a33dc34f6e..0cef379072498 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs @@ -22,4 +22,5 @@ mod transform_hash_join; pub use hash_join_factory::HashJoinFactory; pub use memory::*; +pub use performance::PerformanceContext; pub use transform_hash_join::TransformHashJoin; From 4c985d772564a6f9a72160fee647389742d57506 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 01:15:32 +0800 Subject: [PATCH 16/38] z --- .../new_hash_join/partitioned/inner_join.rs | 3 +- .../new_hash_join/partitioned/left_join.rs | 2 +- .../partitioned/left_join_anti.rs | 2 +- .../partitioned/left_join_semi.rs | 2 +- .../partitioned/partitioned_build.rs | 191 ++++++++---------- .../new_hash_join/partitioned/right_join.rs | 66 +++--- .../partitioned/right_join_anti.rs | 38 ++-- 
.../partitioned/right_join_semi.rs | 65 +++--- 8 files changed, 182 insertions(+), 187 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs index e0856759ac803..d467448ef3f85 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -56,7 +56,6 @@ impl PartitionedInnerJoin { PartitionedInnerJoin { function_ctx: function_ctx.clone(), build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), - max_block_size, desc, context, } @@ -65,7 +64,7 @@ impl PartitionedInnerJoin { impl Join for PartitionedInnerJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs index 5e9a304d58537..3b4ba5fec2406 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -66,7 +66,7 @@ impl PartitionedLeftJoin { impl Join for PartitionedLeftJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs index a904da5635d4a..d442884ace864 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -64,7 +64,7 @@ impl PartitionedLeftAntiJoin { impl Join for PartitionedLeftAntiJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs index 7baffe38c2fea..96e166a421fb0 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -63,7 +63,7 @@ impl PartitionedLeftSemiJoin { impl Join for PartitionedLeftSemiJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 0ca1e39304efe..9454bf83da675 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -25,13 +25,11 @@ use databend_common_expression::Column; use databend_common_expression::ColumnVec; use databend_common_expression::DataBlock; use databend_common_expression::FixedKey; -use databend_common_expression::FromData; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethod; use databend_common_expression::HashMethodKind; 
use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; -use databend_common_expression::Scalar; use databend_common_expression::types::AccessType; use databend_common_expression::types::BooleanType; use databend_common_expression::types::DataType; @@ -101,6 +99,20 @@ pub enum BuildKeysStates { Binary(Vec), } +impl BuildKeysStates { + pub fn get(&self, idx: usize) -> KeysState { + match self { + BuildKeysStates::UInt8(v) => u8::upcast(v[idx].clone()), + BuildKeysStates::UInt16(v) => u16::upcast(v[idx].clone()), + BuildKeysStates::UInt32(v) => u32::upcast(v[idx].clone()), + BuildKeysStates::UInt64(v) => u64::upcast(v[idx].clone()), + BuildKeysStates::UInt128(v) => u128::upcast(v[idx].clone()), + BuildKeysStates::UInt256(v) => u256::upcast(v[idx].clone()), + BuildKeysStates::Binary(v) => KeysState::Column(Column::Binary(v[idx].clone())), + } + } +} + impl BuildKeysStates { pub fn new(method: &HashMethodKind) -> Self { match method { @@ -121,7 +133,6 @@ pub struct PartitionedHashJoinState { pub chunks: Vec, pub method: HashMethodKind, pub build_keys_states: BuildKeysStates, - pub chunk_keys_states: Vec, pub hash_table: CompactJoinHashTable, pub columns: Vec, @@ -130,19 +141,11 @@ pub struct PartitionedHashJoinState { pub num_rows: usize, pub build_block_idx: usize, - pub visited: Vec, + pub visited: Vec>, pub desc: Arc, pub function_ctx: Arc, - /// When true, NULL build keys are kept in the data (not filtered out). - /// Required for RIGHT and RIGHT ANTI joins where unmatched build rows - /// (including those with NULL keys) must be output in final_probe. - keep_null_keys: bool, - /// Per-chunk validity bitmaps for build keys (only used when keep_null_keys is true). - /// Rows with invalid (NULL) keys are skipped during hash table insertion. 
- chunk_validities: Vec>, - - accumulator: FixedSizeChunkAccumulator, + pub accumulator: FixedSizeChunkAccumulator, } impl PartitionedHashJoinState { @@ -154,7 +157,6 @@ impl PartitionedHashJoinState { PartitionedHashJoinState { chunks: Vec::new(), build_keys_states: BuildKeysStates::new(&method), - chunk_keys_states: Vec::new(), hash_table: CompactJoinHashTable::new(0), columns: Vec::new(), column_types: Vec::new(), @@ -162,59 +164,59 @@ impl PartitionedHashJoinState { method, desc, function_ctx, - visited: Vec::new(), - keep_null_keys: false, - chunk_validities: Vec::new(), - accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), build_block_idx: 0, + visited: vec![], + accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), } } - pub fn add_block(&mut self, data: Option) -> Result<()> { + pub fn add_block(&mut self, data: Option) -> Result<()> { let Some(data_block) = data else { if let Some(chunk) = self.accumulator.finalize() { - self.ingest_chunk(chunk)?; + self.ingest_chunk::(chunk)?; } return Ok(()); }; - let data_block = self.prepare_data(data_block)?; + let data_block = self.prepare_data::(data_block)?; for ready_block in self.accumulator.accumulate(data_block) { - self.ingest_chunk(ready_block)?; + self.ingest_chunk::(ready_block)?; } Ok(()) } - fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { let num_rows = chunk.num_rows(); let mut columns = chunk.take_columns(); + let mut data_columns = columns.split_off(self.desc.build_keys.len()); + + if VISITED && data_columns.len() != self.desc.probe_keys.len() { + let valid_entry = data_columns.pop().unwrap(); + let valid_column = valid_entry.to_column(); + let valid_bitmap = BooleanType::try_downcast_column(&valid_column).unwrap(); + let keys_block = DataBlock::new(columns, num_rows); + columns = keys_block.filter_with_bitmap(&valid_bitmap)?.take_columns(); + } - // Extract the trailing validity column if keep_null_keys is enabled. 
- let chunk_validity = if self.keep_null_keys { - let valid_entry = columns.pop().unwrap(); - let col = valid_entry.to_column(); - Some(BooleanType::try_downcast_column(&col).unwrap()) - } else { - None - }; - - let data_columns = columns.split_off(self.desc.build_keys.len()); let keys = ProjectedBlock::from(&columns); let keys_state = with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, }); + if VISITED { + self.visited.push(vec![0u8; num_rows]); + } + self.num_rows += num_rows; - self.chunk_keys_states.push(keys_state); + self.add_build_state(keys_state); self.chunks.push(DataBlock::new(data_columns, num_rows)); - self.chunk_validities.push(chunk_validity); Ok(()) } - fn prepare_data(&self, mut chunk: DataBlock) -> Result { + fn prepare_data(&self, mut chunk: DataBlock) -> Result { let num_rows = chunk.num_rows(); let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; @@ -222,35 +224,26 @@ impl PartitionedHashJoinState { chunk = chunk.project(&self.desc.build_projection); - let validity = self.desc.build_valids_by_keys(&keys_block)?; - if !self.keep_null_keys { - if let Some(ref bitmap) = validity { - if bitmap.true_count() != bitmap.len() { - keys_block = keys_block.filter_with_bitmap(bitmap)?; - chunk = chunk.filter_with_bitmap(bitmap)?; - } + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if bitmap.true_count() != bitmap.len() { + chunk = match VISITED { + true => { + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; + let mut chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; + chunk.add_column(Column::Boolean(bitmap)); + chunk + } + false => { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + chunk.filter_with_bitmap(&bitmap)? 
+ } + }; } } self.desc.remove_keys_nullable(&mut keys_block); keys_block.merge_block(chunk); - - // When keeping NULL keys, append a boolean validity column so it flows - // through the accumulator and can be extracted in ingest_chunk. - if self.keep_null_keys { - let valid_col = match validity { - Some(bitmap) => { - BlockEntry::from(BooleanType::from_data(bitmap.iter().collect::>())) - } - None => BlockEntry::new_const_column( - DataType::Boolean, - Scalar::Boolean(true), - keys_block.num_rows(), - ), - }; - keys_block.add_entry(valid_col); - } - Ok(keys_block) } @@ -283,21 +276,13 @@ impl PartitionedHashJoinState { } let row_offset = CHUNK_SIZE * self.build_block_idx + 1; - let keys_state = &self.chunk_keys_states[self.build_block_idx]; + let keys_state = self.build_keys_states.get(self.build_block_idx); with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => { - let mut hashes = Vec::new(); - method.build_keys_hashes(keys_state, &mut hashes); - match &self.chunk_validities[self.build_block_idx] { - Some(validity) => { - self.hash_table - .insert_chunk_with_validity(&hashes, row_offset, validity); - } - None => { - self.hash_table.insert_chunk(&hashes, row_offset); - } - } + let mut hashes = Vec::with_capacity(CHUNK_SIZE); + method.build_keys_hashes(&keys_state, &mut hashes); + self.hash_table.insert_chunk(&hashes, row_offset); self.build_block_idx += 1; } }); @@ -308,37 +293,6 @@ impl PartitionedHashJoinState { } } - /// Initialize visited tracking for right-side join types. - pub fn init_visited(&mut self) { - self.visited = vec![0u8; self.num_rows + 1]; - } - - /// Mark a build row as visited (1-based index). - #[inline(always)] - pub fn set_visited(&mut self, row_index: usize) { - unsafe { - *self.visited.get_unchecked_mut(row_index) = 1; - } - } - - /// Check if a build row has been visited (1-based index). 
- #[inline(always)] - pub fn is_visited(&self, row_index: usize) -> bool { - unsafe { *self.visited.get_unchecked(row_index) != 0 } - } - - /// Gather build columns for the given row pointers. - pub fn gather_build_block(&self, row_ptrs: &[RowPtr]) -> Option { - if self.columns.is_empty() { - return None; - } - Some(DataBlock::take_column_vec( - &self.columns, - &self.column_types, - row_ptrs, - )) - } - pub fn probe<'a, const MATCHED: bool>( &'a self, data: ProbeData, @@ -437,6 +391,37 @@ impl PartitionedHashJoinState { _ => unreachable!(), }) } + + fn add_build_state(&mut self, state: KeysState) { + match &mut self.build_keys_states { + BuildKeysStates::UInt8(states) => { + states.push(u8::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt16(states) => { + states.push(u16::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt32(states) => { + states.push(u32::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt64(states) => { + states.push(u64::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt128(states) => { + states.push(u128::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt256(states) => { + states.push(u256::downcast_owned(state).unwrap()); + } + BuildKeysStates::Binary(states) => match state { + KeysState::Column(Column::Binary(build_keys)) + | KeysState::Column(Column::Variant(build_keys)) + | KeysState::Column(Column::Bitmap(build_keys)) => { + states.push(build_keys); + } + _ => unreachable!(), + }, + }; + } } struct PrimitiveProbeStream<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex = u32> diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs index e2197d21ff1c9..26983d65f5117 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -25,8 +25,8 @@ use databend_common_expression::types::DataType; use super::partitioned_build::PartitionedHashJoinState; use super::partitioned_build::ProbeData; -use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; @@ -71,15 +71,11 @@ impl PartitionedRightJoin { impl Join for PartitionedRightJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { @@ -138,8 +134,8 @@ impl Join for PartitionedRightJoin { columns: &self.build.columns, column_types: &self.build.column_types, visited: &self.build.visited, - num_rows: self.build.num_rows, - scan_idx: 1, + chunk_idx: 0, + row_idx: 0, max_block_size: self.max_block_size, desc: self.desc.clone(), probe_types, @@ -220,13 +216,11 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJU if !CONJUNCT { for row_ptr in &self.probed_rows.matched_build { - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; - self.build.visited.as_ptr(); unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } @@ -235,12 +229,11 @@ impl<'a, const 
CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJU let Some(filter_executor) = self.filter_executor.as_mut() else { for row_ptr in &self.probed_rows.matched_build { - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } @@ -258,12 +251,11 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJU for idx in true_sel.iter().take(res_rows) { let row_ptr = self.probed_rows.matched_build[*idx as usize]; - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } @@ -277,9 +269,9 @@ impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJU struct PartitionedRightFinalStream<'a> { columns: &'a Vec, column_types: &'a Vec, - visited: &'a [u8], - num_rows: usize, - scan_idx: usize, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, max_block_size: usize, desc: Arc, probe_types: Vec, @@ -288,11 +280,21 @@ struct PartitionedRightFinalStream<'a> { impl<'a> JoinStream for PartitionedRightFinalStream<'a> { fn next(&mut self) -> Result> { let mut row_ptrs = Vec::with_capacity(self.max_block_size); - while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { - if self.visited[self.scan_idx] == 0 { - row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 
{ + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; } - self.scan_idx += 1; } if row_ptrs.is_empty() { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs index 95f4f24c1144c..734d72cd8b036 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -24,9 +24,9 @@ use databend_common_expression::types::DataType; use super::partitioned_build::PartitionedHashJoinState; use super::partitioned_build::ProbeData; -use super::partitioned_build::flat_to_row_ptr; use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; @@ -66,15 +66,11 @@ impl PartitionedRightAntiJoin { impl Join for PartitionedRightAntiJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { @@ -122,8 +118,8 @@ impl Join for PartitionedRightAntiJoin { columns: &self.build.columns, column_types: &self.build.column_types, visited: &self.build.visited, - num_rows: 
self.build.num_rows, - scan_idx: 1, + chunk_idx: 0, + row_idx: 0, max_block_size: self.max_block_size, }))) } @@ -132,20 +128,30 @@ impl Join for PartitionedRightAntiJoin { struct PartitionedRightAntiFinalStream<'a> { columns: &'a Vec, column_types: &'a Vec, - visited: &'a [u8], - num_rows: usize, - scan_idx: usize, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, max_block_size: usize, } impl<'a> JoinStream for PartitionedRightAntiFinalStream<'a> { fn next(&mut self) -> Result> { let mut row_ptrs = Vec::with_capacity(self.max_block_size); - while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { - if self.visited[self.scan_idx] == 0 { - row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; } - self.scan_idx += 1; } if row_ptrs.is_empty() { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs index 9717f162c1ba1..2758eed1f6bbe 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -25,8 +25,8 @@ use databend_common_expression::types::DataType; use super::partitioned_build::PartitionedHashJoinState; use super::partitioned_build::ProbeData; -use super::partitioned_build::flat_to_row_ptr; use crate::pipelines::processors::HashJoinDesc; +use 
crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; @@ -69,15 +69,11 @@ impl PartitionedRightSemiJoin { impl Join for PartitionedRightSemiJoin { fn add_block(&mut self, data: Option) -> Result<()> { - self.build.add_block(data) + self.build.add_block::(data) } fn final_build(&mut self) -> Result> { - let progress = self.build.final_build()?; - if progress.is_none() { - self.build.init_visited(); - } - Ok(progress) + self.build.final_build() } fn probe_block(&mut self, data: DataBlock) -> Result> { @@ -125,8 +121,8 @@ impl Join for PartitionedRightSemiJoin { columns: &self.build.columns, column_types: &self.build.column_types, visited: &self.build.visited, - num_rows: self.build.num_rows, - scan_idx: 1, + chunk_idx: 0, + row_idx: 0, max_block_size: self.max_block_size, }))) } @@ -178,12 +174,11 @@ impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUN if !CONJUNCT { for row_ptr in &self.probed_rows.matched_build { - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } continue; @@ -191,12 +186,11 @@ impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUN let Some(filter_executor) = self.filter_executor.as_mut() else { for row_ptr in &self.probed_rows.matched_build { - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + 
*self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } continue; @@ -240,12 +234,11 @@ impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUN for idx in true_sel.iter().take(result_count) { let row_ptr = self.probed_rows.matched_build[*idx as usize]; - let flat_idx = (row_ptr.chunk_index as usize) - * super::partitioned_build::CHUNK_SIZE - + row_ptr.row_index as usize - + 1; unsafe { - *self.build.visited.as_ptr().add(flat_idx).cast_mut() = 1; + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; } } } @@ -256,20 +249,30 @@ impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUN struct PartitionedRightSemiFinalStream<'a> { columns: &'a Vec, column_types: &'a Vec, - visited: &'a [u8], - num_rows: usize, - scan_idx: usize, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, max_block_size: usize, } impl<'a> JoinStream for PartitionedRightSemiFinalStream<'a> { fn next(&mut self) -> Result> { let mut row_ptrs = Vec::with_capacity(self.max_block_size); - while self.scan_idx <= self.num_rows && row_ptrs.len() < self.max_block_size { - if self.visited[self.scan_idx] != 0 { - row_ptrs.push(flat_to_row_ptr(self.scan_idx)); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] != 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; } - self.scan_idx += 1; } if row_ptrs.is_empty() { From 210931021afd8f369fccfd16a45cbc23411c44ca Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 11:21:32 +0800 Subject: [PATCH 17/38] z --- 
.../partitioned/partitioned_build.rs | 43 ++++++++----------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 9454bf83da675..4ce20a3b58332 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -190,17 +190,21 @@ impl PartitionedHashJoinState { fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { let num_rows = chunk.num_rows(); let mut columns = chunk.take_columns(); - let mut data_columns = columns.split_off(self.desc.build_keys.len()); - - if VISITED && data_columns.len() != self.desc.probe_keys.len() { - let valid_entry = data_columns.pop().unwrap(); - let valid_column = valid_entry.to_column(); - let valid_bitmap = BooleanType::try_downcast_column(&valid_column).unwrap(); - let keys_block = DataBlock::new(columns, num_rows); - columns = keys_block.filter_with_bitmap(&valid_bitmap)?.take_columns(); + let data_columns = columns.split_off(self.desc.build_keys.len()); + + let mut keys_block = DataBlock::new(columns, num_rows); + let mut chunk = DataBlock::new(data_columns, num_rows); + if VISITED && let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? 
{ + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; + chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; + self.desc.remove_keys_nullable(&mut keys_block); + } } - let keys = ProjectedBlock::from(&columns); + let keys = ProjectedBlock::from(keys_block.columns()); let keys_state = with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, @@ -211,8 +215,8 @@ impl PartitionedHashJoinState { } self.num_rows += num_rows; + self.chunks.push(chunk); self.add_build_state(keys_state); - self.chunks.push(DataBlock::new(data_columns, num_rows)); Ok(()) } @@ -224,25 +228,14 @@ impl PartitionedHashJoinState { chunk = chunk.project(&self.desc.build_projection); - if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if !VISITED && let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { if bitmap.true_count() != bitmap.len() { - chunk = match VISITED { - true => { - let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; - let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; - let mut chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; - chunk.add_column(Column::Boolean(bitmap)); - chunk - } - false => { - keys_block = keys_block.filter_with_bitmap(&bitmap)?; - chunk.filter_with_bitmap(&bitmap)? 
- } - }; + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + chunk = chunk.filter_with_bitmap(&bitmap)?; + self.desc.remove_keys_nullable(&mut keys_block); } } - self.desc.remove_keys_nullable(&mut keys_block); keys_block.merge_block(chunk); Ok(keys_block) } From f900c1dfb819e880539a8d2b00837b534ab41215 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 11:55:21 +0800 Subject: [PATCH 18/38] z --- .../transforms/new_hash_join/partitioned/partitioned_build.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 4ce20a3b58332..23d779142018d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -30,8 +30,6 @@ use databend_common_expression::HashMethod; use databend_common_expression::HashMethodKind; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::BooleanType; use databend_common_expression::types::DataType; use databend_common_expression::with_hash_method; use ethnum::u256; From 0f3361e95705f0a228603c9975ccef4d86bd3d6b Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 17:23:03 +0800 Subject: [PATCH 19/38] z --- .../transforms/new_hash_join/partitioned/partitioned_build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 23d779142018d..b538f08136dc1 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -205,7 +205,7 @@ impl PartitionedHashJoinState { let keys = ProjectedBlock::from(keys_block.columns()); let keys_state = with_hash_method!(|T| match &self.method { - HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, + HashMethodKind::T(method) => method.build_keys_state(keys, keys_block.num_rows())?, }); if VISITED { From abd629f6b7791b9d309df243eec9830f85539bda Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 18:26:37 +0800 Subject: [PATCH 20/38] z --- .../partitioned/partitioned_build.rs | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index b538f08136dc1..3d6b0811bbaa0 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -192,14 +192,16 @@ impl PartitionedHashJoinState { let mut keys_block = DataBlock::new(columns, num_rows); let mut chunk = DataBlock::new(data_columns, num_rows); - if VISITED && let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { - if bitmap.true_count() != bitmap.len() { - keys_block = keys_block.filter_with_bitmap(&bitmap)?; - let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; - let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; - chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; - self.desc.remove_keys_nullable(&mut keys_block); + if VISITED { + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? 
{ + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; + chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; + } } + self.desc.remove_keys_nullable(&mut keys_block); } let keys = ProjectedBlock::from(keys_block.columns()); @@ -226,12 +228,14 @@ impl PartitionedHashJoinState { chunk = chunk.project(&self.desc.build_projection); - if !VISITED && let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { - if bitmap.true_count() != bitmap.len() { - keys_block = keys_block.filter_with_bitmap(&bitmap)?; - chunk = chunk.filter_with_bitmap(&bitmap)?; - self.desc.remove_keys_nullable(&mut keys_block); + if !VISITED { + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + chunk = chunk.filter_with_bitmap(&bitmap)?; + } } + self.desc.remove_keys_nullable(&mut keys_block); } keys_block.merge_block(chunk); From 4ad0955b96970ecdcc88d7d07a134b855020b7bb Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 20:02:48 +0800 Subject: [PATCH 21/38] z --- .../new_hash_join/partitioned/inner_join.rs | 2 +- .../new_hash_join/partitioned/left_join.rs | 2 +- .../partitioned/left_join_anti.rs | 51 ++++++++------- .../partitioned/left_join_semi.rs | 51 ++++++++------- .../partitioned/partitioned_build.rs | 63 +++++++++++++------ .../new_hash_join/partitioned/right_join.rs | 2 +- .../partitioned/right_join_anti.rs | 2 +- .../partitioned/right_join_semi.rs | 2 +- 8 files changed, 107 insertions(+), 68 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs index d467448ef3f85..c8160edaae275 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -88,7 +88,7 @@ impl Join for PartitionedInnerJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; + let probe_keys_stream = self.build.probe::(probe_data)?; let joined_stream = PartitionedInnerJoinStream::create( probe_block, &self.build, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs index 3b4ba5fec2406..4552607a8e17c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -115,7 +115,7 @@ impl Join for PartitionedLeftJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_stream = self.build.probe::(probe_data)?; + let probe_stream = self.build.probe::(probe_data)?; match self.performance_context.filter_executor.as_mut() { None => Ok(OuterLeftHashJoinStream::::create( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs index d442884ace864..0a9bfec2c6d88 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; use 
databend_common_column::bitmap::Bitmap; use databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -81,7 +82,6 @@ impl Join for PartitionedLeftAntiJoin { return Ok(Box::new(OneBlockJoinStream(Some(probe_projected)))); } - let num_probe_rows = data.num_rows(); let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; let mut keys = DataBlock::new(probe_keys, data.num_rows()); @@ -94,23 +94,27 @@ impl Join for PartitionedLeftAntiJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; match &mut self.context.filter_executor { - None => Ok(LeftAntiHashJoinStream::create( - probe_block, - probe_keys_stream, - &mut self.context.probe_result, - )), - Some(filter_executor) => Ok(LeftAntiFilterHashJoinStream::create( - probe_block, - &self.build, - probe_keys_stream, - self.desc.clone(), - &mut self.context.probe_result, - filter_executor, - num_probe_rows, - )), + None => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftAntiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )) + } + Some(filter_executor) => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftAntiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + )) + } } } } @@ -168,7 +172,6 @@ struct LeftAntiFilterHashJoinStream<'a> { probe_keys_stream: Box, probed_rows: &'a mut ProbedRows, filter_executor: &'a mut FilterExecutor, - selected: Vec, } impl<'a> LeftAntiFilterHashJoinStream<'a> { @@ -179,7 +182,6 @@ impl<'a> LeftAntiFilterHashJoinStream<'a> { desc: Arc, probed_rows: &'a mut ProbedRows, filter_executor: &'a mut FilterExecutor, - num_probe_rows: usize, ) -> Box { Box::new(LeftAntiFilterHashJoinStream { desc, @@ -188,7 +190,6 @@ impl<'a> LeftAntiFilterHashJoinStream<'a> { 
filter_executor, probe_keys_stream, probe_data_block: Some(probe_data_block), - selected: vec![true; num_probe_rows], }) } } @@ -199,6 +200,9 @@ impl<'a> JoinStream for LeftAntiFilterHashJoinStream<'a> { return Ok(None); }; + let num_rows = probe_data_block.num_rows(); + let mut selected = vec![true; num_rows]; + loop { self.probed_rows.clear(); let max_rows = self.probed_rows.matched_probe.capacity(); @@ -243,18 +247,21 @@ impl<'a> JoinStream for LeftAntiFilterHashJoinStream<'a> { if selected_rows == result_block.num_rows() { for probe_idx in &self.probed_rows.matched_probe { - self.selected[*probe_idx as usize] = false; + assume((*probe_idx as usize) < selected.len()); + selected[*probe_idx as usize] = false; } } else if selected_rows != 0 { let selection = self.filter_executor.true_selection(); for idx in selection[..selected_rows].iter() { + assume((*idx as usize) < self.probed_rows.matched_probe.len()); let idx = self.probed_rows.matched_probe[*idx as usize]; - self.selected[idx as usize] = false; + assume((idx as usize) < selected.len()); + selected[idx as usize] = false; } } } - let bitmap = Bitmap::from_trusted_len_iter(self.selected.iter().copied()); + let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied()); match bitmap.true_count() { 0 => Ok(None), _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs index 96e166a421fb0..74cc501f616df 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; use databend_common_column::bitmap::Bitmap; use 
databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -75,7 +76,6 @@ impl Join for PartitionedLeftSemiJoin { return Ok(Box::new(EmptyJoinStream)); } - let num_probe_rows = data.num_rows(); let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; let mut keys = DataBlock::new(probe_keys, data.num_rows()); @@ -88,23 +88,27 @@ impl Join for PartitionedLeftSemiJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; match &mut self.context.filter_executor { - None => Ok(LeftSemiHashJoinStream::create( - probe_block, - probe_keys_stream, - &mut self.context.probe_result, - )), - Some(filter_executor) => Ok(LeftSemiFilterHashJoinStream::create( - probe_block, - &self.build, - probe_keys_stream, - self.desc.clone(), - &mut self.context.probe_result, - filter_executor, - num_probe_rows, - )), + None => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )) + } + Some(filter_executor) => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + )) + } } } } @@ -159,7 +163,6 @@ struct LeftSemiFilterHashJoinStream<'a> { probe_keys_stream: Box, probed_rows: &'a mut ProbedRows, filter_executor: &'a mut FilterExecutor, - selected: Vec, } impl<'a> LeftSemiFilterHashJoinStream<'a> { @@ -170,7 +173,6 @@ impl<'a> LeftSemiFilterHashJoinStream<'a> { desc: Arc, probed_rows: &'a mut ProbedRows, filter_executor: &'a mut FilterExecutor, - num_probe_rows: usize, ) -> Box { Box::new(LeftSemiFilterHashJoinStream { desc, @@ -179,7 +181,6 @@ impl<'a> LeftSemiFilterHashJoinStream<'a> { filter_executor, probe_keys_stream, probe_data_block: 
Some(probe_data_block), - selected: vec![false; num_probe_rows], }) } } @@ -190,6 +191,9 @@ impl<'a> JoinStream for LeftSemiFilterHashJoinStream<'a> { return Ok(None); }; + let num_rows = probe_data_block.num_rows(); + let mut selected = vec![false; num_rows]; + loop { self.probed_rows.clear(); let max_rows = self.probed_rows.matched_probe.capacity(); @@ -230,18 +234,21 @@ impl<'a> JoinStream for LeftSemiFilterHashJoinStream<'a> { if selected_rows == result.num_rows() { for probe_idx in &self.probed_rows.matched_probe { - self.selected[*probe_idx as usize] = true; + assume((*probe_idx as usize) < selected.len()); + selected[*probe_idx as usize] = true; } } else if selected_rows != 0 { let selection = self.filter_executor.true_selection(); for idx in selection[..selected_rows].iter() { + assume((*idx as usize) < self.probed_rows.matched_probe.len()); let idx = self.probed_rows.matched_probe[*idx as usize]; - self.selected[idx as usize] = true; + assume((idx as usize) < selected.len()); + selected[idx as usize] = true; } } } - let bitmap = Bitmap::from_trusted_len_iter(self.selected.iter().copied()); + let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied()); match bitmap.true_count() { 0 => Ok(None), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 3d6b0811bbaa0..1e8e7026eb613 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -288,7 +288,7 @@ impl PartitionedHashJoinState { } } - pub fn probe<'a, const MATCHED: bool>( + pub fn probe<'a, const MATCHED: bool, const MATCH_FIRST: bool>( &'a self, data: ProbeData, ) -> Result> { @@ -315,7 +315,7 @@ impl PartitionedHashJoinState { Ok(match (&self.method, 
&self.build_keys_states) { (HashMethodKind::KeysU8(_), BuildKeysStates::UInt8(states)) => { let probe_keys = u8::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u8, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -324,7 +324,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU16(_), BuildKeysStates::UInt16(states)) => { let probe_keys = u16::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u16, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -333,7 +333,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU32(_), BuildKeysStates::UInt32(states)) => { let probe_keys = u32::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u32, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -342,7 +342,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU64(_), BuildKeysStates::UInt64(states)) => { let probe_keys = u64::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u64, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -351,7 +351,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU128(_), BuildKeysStates::UInt128(states)) => { let probe_keys = u128::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u128, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -360,7 +360,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU256(_), BuildKeysStates::UInt256(states)) => { let probe_keys = u256::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u256, MATCHED, u32>::new( + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, u32>::new( hashes, states, probe_keys, @@ -374,7 +374,7 @@ impl 
PartitionedHashJoinState { KeysState::Column(Column::Binary(probe_keys)) | KeysState::Column(Column::Variant(probe_keys)) | KeysState::Column(Column::Bitmap(probe_keys)) => { - BinaryProbeStream::<'a, MATCHED, u32>::create( + BinaryProbeStream::<'a, MATCHED, MATCH_FIRST, u32>::create( hashes, states, probe_keys, @@ -419,8 +419,13 @@ impl PartitionedHashJoinState { } } -struct PrimitiveProbeStream<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex = u32> -{ +struct PrimitiveProbeStream< + 'a, + T: Send + Sync + PartialEq, + const MATCHED: bool, + const MATCH_FIRST: bool, + I: RowIndex = u32, +> { key_idx: usize, pointers: Vec, build_idx: usize, @@ -430,8 +435,8 @@ struct PrimitiveProbeStream<'a, T: Send + Sync + PartialEq, const MATCHED: bool, matched_num_rows: usize, } -impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> - PrimitiveProbeStream<'a, T, MATCHED, I> +impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> + PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, I> { #[allow(clippy::new_ret_no_self)] pub fn new( @@ -452,8 +457,8 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> } } -impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> ProbeStream - for PrimitiveProbeStream<'a, T, MATCHED, I> +impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> + ProbeStream for PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, I> { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { while self.key_idx < self.probe_keys.len() { @@ -491,7 +496,10 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> ProbeStre self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { - self.build_idx = self.next[self.build_idx].to_usize(); + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; if self.build_idx == 0 { self.key_idx += 
1; @@ -500,6 +508,11 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> ProbeStre return Ok(()); } + + if MATCH_FIRST { + self.build_idx = 0; + break; + } } self.build_idx = self.next[self.build_idx].to_usize(); @@ -517,7 +530,7 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, I: RowIndex> ProbeStre } } -struct BinaryProbeStream<'a, const MATCHED: bool, I: RowIndex = u32> { +struct BinaryProbeStream<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex = u32> { key_idx: usize, pointers: Vec, build_idx: usize, @@ -527,7 +540,9 @@ struct BinaryProbeStream<'a, const MATCHED: bool, I: RowIndex = u32> { matched_num_rows: usize, } -impl<'a, const MATCHED: bool, I: RowIndex> BinaryProbeStream<'a, MATCHED, I> { +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> + BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ pub fn create( pointers: Vec, build_keys: &'a [BinaryColumn], @@ -546,7 +561,9 @@ impl<'a, const MATCHED: bool, I: RowIndex> BinaryProbeStream<'a, MATCHED, I> { } } -impl<'a, const MATCHED: bool, I: RowIndex> ProbeStream for BinaryProbeStream<'a, MATCHED, I> { +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> ProbeStream + for BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { while self.key_idx < self.probe_keys.len() { assume(res.matched_probe.len() == res.matched_build.len()); @@ -583,7 +600,10 @@ impl<'a, const MATCHED: bool, I: RowIndex> ProbeStream for BinaryProbeStream<'a, self.matched_num_rows += 1; if res.matched_probe.len() == max_rows { - self.build_idx = self.next[self.build_idx].to_usize(); + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; if self.build_idx == 0 { self.key_idx += 1; @@ -592,6 +612,11 @@ impl<'a, const MATCHED: bool, I: RowIndex> ProbeStream for BinaryProbeStream<'a, return Ok(()); } + + if MATCH_FIRST { + self.build_idx = 0; + 
break; + } } self.build_idx = self.next[self.build_idx].to_usize(); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs index 26983d65f5117..6f5399e32623d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -95,7 +95,7 @@ impl Join for PartitionedRightJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(probe_keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; + let probe_keys_stream = self.build.probe::(probe_data)?; match self.context.filter_executor.as_mut() { None => Ok(OuterRightHashJoinStream::::create( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs index 734d72cd8b036..525a956837151 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -86,7 +86,7 @@ impl Join for PartitionedRightAntiJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; + let probe_keys_stream = self.build.probe::(probe_data)?; match self.context.filter_executor.as_mut() { None => Ok(SemiRightHashJoinStream::::create( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs index 2758eed1f6bbe..c41d5995ad605 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -89,7 +89,7 @@ impl Join for PartitionedRightSemiJoin { let probe_block = data.project(&self.desc.probe_projection); let probe_data = ProbeData::new(keys, valids); - let probe_keys_stream = self.build.probe::(probe_data)?; + let probe_keys_stream = self.build.probe::(probe_data)?; match self.context.filter_executor.as_mut() { None => Ok(SemiRightHashJoinStream::::create( From e076cc8882ba8b3fe249a9c182c5113cf46bebad Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Fri, 27 Mar 2026 20:56:13 +0800 Subject: [PATCH 22/38] z --- src/query/service/src/physical_plans/physical_hash_join.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index f3fac7b0d606c..25b17a9fcb473 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -463,7 +463,7 @@ impl HashJoin { let probe_distribution = self.probe.output_data_distribution(); let global_hash_probe = matches!(probe_distribution, DataDistribution::GlobalHash(_)); - match (global_hash_build && global_hash_probe) || self.build_side_cache_info.is_some() { + match global_hash_build && global_hash_probe && self.build_side_cache_info.is_none() { true => self.shuffle_join(pb, desc), false => self.broadcast_join(pb, desc), } From 4872d06868dc746931ce2d5c6bfc126b156f3dee Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Tue, 31 Mar 2026 13:39:07 +0800 Subject: [PATCH 23/38] z --- src/query/service/src/physical_plans/physical_hash_join.rs | 7 ++++++- src/query/settings/src/settings_default.rs | 7 +++++++ src/query/settings/src/settings_getter_setter.rs | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git 
a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 25b17a9fcb473..78dbc2fdca9ba 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -463,7 +463,12 @@ impl HashJoin { let probe_distribution = self.probe.output_data_distribution(); let global_hash_probe = matches!(probe_distribution, DataDistribution::GlobalHash(_)); - match global_hash_build && global_hash_probe && self.build_side_cache_info.is_none() { + let enable_partitioned = pb.settings.get_enable_partitioned_hash_join()?; + match global_hash_build + && global_hash_probe + && self.build_side_cache_info.is_none() + && enable_partitioned + { true => self.shuffle_join(pb, desc), false => self.broadcast_join(pb, desc), } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index a70982b090701..33800c1c17f4d 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1584,6 +1584,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("enable_partitioned_hash_join", DefaultSettingValue { + value: UserSettingValue::UInt64(0), + desc: "Enables partitioned hash join for shuffle join.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("s3_storage_class", DefaultSettingValue { value: { let storage_class = Self::extract_s3_storage_class_config(&global_conf).unwrap_or_default(); diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 81b6a00fb289d..50fc69ab2b53d 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -1168,6 +1168,10 @@ impl Settings { Ok(self.try_get_u64("enable_experimental_new_join")? 
== 1) } + pub fn get_enable_partitioned_hash_join(&self) -> Result { + Ok(self.try_get_u64("enable_partitioned_hash_join")? != 0) + } + pub fn get_s3_storage_class(&self) -> Result { let s3_storage_class_setting = self.try_get_string("s3_storage_class")?; S3StorageClass::from_str(&s3_storage_class_setting).map_err(|e| { From a364fcf9cb7dc3c279227a912c8a9f5b6178401c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Tue, 31 Mar 2026 22:23:35 +0800 Subject: [PATCH 24/38] z --- Cargo.lock | 1 + .../expression/src/aggregate/group_hash.rs | 2 +- src/query/service/Cargo.toml | 1 + .../flight/v1/scatter/flight_scatter_hash.rs | 76 +++++++++++++------ 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc2efbfde17c5..5ee961a3ea5c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6026,6 +6026,7 @@ dependencies = [ "sha2", "socket2 0.5.9", "sqlx", + "strength_reduce", "sysinfo", "tantivy", "temp-env", diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index cfb29554f3620..9b68cf7635b0d 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -60,7 +60,7 @@ pub fn group_hash_entries(entries: ProjectedBlock, values: &mut [u64]) { } } -fn combine_group_hash_column(c: &Column, values: &mut [u64]) { +pub fn combine_group_hash_column(c: &Column, values: &mut [u64]) { HashVisitor:: { values } .visit_column(c.clone()) .unwrap() diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 547d06c4f83af..c1b10b4f8a1af 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -164,6 +164,7 @@ serde_urlencoded = { workspace = true } sha2 = { workspace = true } socket2 = { workspace = true } sqlx = { workspace = true } +strength_reduce = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } diff --git 
a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs index 533ca56929a0d..0b73c7cdabc25 100644 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs @@ -25,6 +25,7 @@ use databend_common_expression::FunctionID; use databend_common_expression::RemoteExpr; use databend_common_expression::Scalar; use databend_common_expression::Value; +use databend_common_expression::aggregate::combine_group_hash_column; use databend_common_expression::type_check::check_function; use databend_common_expression::types::AccessType; use databend_common_expression::types::AnyType; @@ -35,6 +36,7 @@ use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::number::NumberScalar; use databend_common_functions::BUILTIN_FUNCTIONS; +use strength_reduce::StrengthReducedU64; use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; @@ -43,6 +45,8 @@ pub struct HashFlightScatter { func_ctx: FunctionContext, hash_key: Vec, scatter_size: usize, + raw_hash_keys: Vec, + hash_key_data_types: Vec, } impl HashFlightScatter { @@ -60,23 +64,25 @@ impl HashFlightScatter { local_pos, ); } - let hash_key = hash_keys + let raw_hash_keys: Vec = hash_keys .iter() - .map(|key| { - check_function( - None, - "siphash", - &[], - &[key.as_expr(&BUILTIN_FUNCTIONS)], - &BUILTIN_FUNCTIONS, - ) - }) + .map(|key| key.as_expr(&BUILTIN_FUNCTIONS)) + .collect(); + let hash_key_data_types: Vec = raw_hash_keys + .iter() + .map(|expr| expr.data_type().clone()) + .collect(); + let hash_key = raw_hash_keys + .iter() + .map(|expr| check_function(None, "siphash", &[], &[expr.clone()], &BUILTIN_FUNCTIONS)) .collect::>()?; Ok(Box::new(Self { func_ctx, scatter_size, hash_key, + raw_hash_keys, + hash_key_data_types, })) } } @@ -87,6 +93,8 @@ 
struct OneHashKeyFlightScatter { func_ctx: FunctionContext, indices_scalar: Expr, default_scatter_index: u64, + hash_key_expr: Expr, + hash_key_data_type: DataType, } impl OneHashKeyFlightScatter { @@ -101,6 +109,8 @@ impl OneHashKeyFlightScatter { } else { 0 }; + let hash_key_expr = hash_key.as_expr(&BUILTIN_FUNCTIONS); + let hash_key_data_type = hash_key_expr.data_type().clone(); let indices_scalar = check_function( None, "modulo", @@ -110,7 +120,7 @@ impl OneHashKeyFlightScatter { None, "siphash", &[], - &[hash_key.as_expr(&BUILTIN_FUNCTIONS)], + &[hash_key_expr.clone()], &BUILTIN_FUNCTIONS, )?, Expr::constant( @@ -126,6 +136,8 @@ impl OneHashKeyFlightScatter { func_ctx, indices_scalar, default_scatter_index, + hash_key_expr, + hash_key_data_type, })) } } @@ -155,9 +167,15 @@ impl FlightScatter for OneHashKeyFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = evaluator.run(&self.indices_scalar).unwrap(); - let indices = get_hash_values(indices, num, self.default_scatter_index)?; - Ok(Some(indices.to_vec())) + let value = evaluator.run(&self.hash_key_expr)?; + let column = value.convert_to_full_column(&self.hash_key_data_type, num); + let mut hashes = vec![0u64; num]; + combine_group_hash_column::(&column, &mut hashes); + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } @@ -195,18 +213,26 @@ impl FlightScatter for HashFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = if !self.hash_key.is_empty() { - let mut hash_keys = Vec::with_capacity(self.hash_key.len()); - for expr in &self.hash_key { - let indices = evaluator.run(expr).unwrap(); - let indices = 
get_hash_values(indices, num, 0)?; - hash_keys.push(indices) + let mut hashes = vec![0u64; num]; + for (i, (expr, dt)) in self + .raw_hash_keys + .iter() + .zip(&self.hash_key_data_types) + .enumerate() + { + let value = evaluator.run(expr)?; + let column = value.convert_to_full_column(dt, num); + if i == 0 { + combine_group_hash_column::(&column, &mut hashes); + } else { + combine_group_hash_column::(&column, &mut hashes); } - self.combine_hash_keys(&hash_keys, num) - } else { - Ok(vec![0; num]) - }?; - Ok(Some(indices)) + } + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } From 7504545fcf24216735ca78b279feb2b81c5af815 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Apr 2026 14:56:44 +0800 Subject: [PATCH 25/38] z --- .../src/physical_plans/physical_hash_join.rs | 3 ++ .../transforms/new_hash_join/mod.rs | 1 + .../new_hash_join/partitioned/mod.rs | 1 + .../partitioned/transform_hash_join.rs | 47 +++++++++++++++++-- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 78dbc2fdca9ba..058a0bf8b5756 100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -72,6 +72,7 @@ use crate::pipelines::processors::HashJoinState; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HashJoinProbeState; use crate::pipelines::processors::transforms::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::SharedRuntimeFilterPackets; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; use crate::pipelines::processors::transforms::TransformHashJoinProbe; @@ -507,6 +508,7 @@ impl HashJoin { let barrier = 
databend_common_base::base::Barrier::new(output_len); let stage_sync_barrier = Arc::new(barrier); + let shared_rf_packets = SharedRuntimeFilterPackets::create(); let mut join_sinks = Vec::with_capacity(output_len * 2); let mut join_pipe_items = Vec::with_capacity(output_len); for (build_sink, probe_sink) in build_sinks.into_iter().zip(probe_sinks.into_iter()) { @@ -533,6 +535,7 @@ impl HashJoin { stage_sync_barrier.clone(), self.projections.clone(), rf_desc.clone(), + shared_rf_packets.clone(), )?; join_pipe_items.push(PipeItem::create( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index acb1eb5e7e57e..1857969bb6059 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -19,6 +19,7 @@ pub mod unpartitioned; pub use common::join::Join; pub use common::join::JoinStream; pub use common::runtime_filter::RuntimeFiltersDesc; +pub use partitioned::SharedRuntimeFilterPackets; pub use partitioned::TransformPartitionedHashJoin; pub use unpartitioned::HashJoinFactory; pub use unpartitioned::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs index ad53dcd1ac78f..a896bf13bc191 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs @@ -34,4 +34,5 @@ pub use partitioned_build::PartitionedHashJoinState; pub use right_join::PartitionedRightJoin; pub use right_join_anti::PartitionedRightAntiJoin; pub use right_join_semi::PartitionedRightSemiJoin; +pub use transform_hash_join::SharedRuntimeFilterPackets; pub use transform_hash_join::TransformPartitionedHashJoin; 
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs index e0155aa6e85f7..bd58c52a57dbb 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -17,6 +17,8 @@ use std::collections::BTreeSet; use std::fmt::Debug; use std::fmt::Formatter; use std::sync::Arc; +use std::sync::Mutex; +use std::sync::PoisonError; use std::time::Instant; use databend_common_base::base::Barrier; @@ -40,12 +42,38 @@ use super::PartitionedRightAntiJoin; use super::PartitionedRightJoin; use super::PartitionedRightSemiJoin; use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; +use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; +pub struct SharedRuntimeFilterPackets { + packets: Mutex>, +} + +impl SharedRuntimeFilterPackets { + pub fn create() -> Arc { + Arc::new(SharedRuntimeFilterPackets { + packets: Mutex::new(Vec::new()), + }) + } + + pub fn add_packet(&self, packet: JoinRuntimeFilterPacket) { + let locked = self.packets.lock(); + let mut locked = locked.unwrap_or_else(PoisonError::into_inner); + locked.push(packet); + } + + pub fn take_packets(&self) -> Vec { + let locked = self.packets.lock(); + let mut locked = 
locked.unwrap_or_else(PoisonError::into_inner); + std::mem::take(&mut *locked) + } +} + pub struct TransformPartitionedHashJoin { build_port: Arc, probe_port: Arc, @@ -59,6 +87,7 @@ pub struct TransformPartitionedHashJoin { projection: BTreeSet, rf_desc: Arc, runtime_filter_builder: Option, + shared_rf_packets: Arc, instant: Instant, } @@ -71,6 +100,7 @@ impl TransformPartitionedHashJoin { stage_sync_barrier: Arc, projection: BTreeSet, rf_desc: Arc, + shared_rf_packets: Arc, ) -> Result { let runtime_filter_builder = RuntimeFilterLocalBuilder::try_create( &rf_desc.func_ctx, @@ -90,6 +120,7 @@ impl TransformPartitionedHashJoin { rf_desc, projection, stage_sync_barrier, + shared_rf_packets, joined_data: None, runtime_filter_builder, stage: Stage::Build(BuildState { @@ -175,6 +206,7 @@ impl Processor for TransformPartitionedHashJoin { self.stage = Stage::Finished; let mut finished = FinishedJoin::create(); std::mem::swap(&mut finished, &mut self.join); + self.stage_sync_barrier.reduce_quorum(1); drop(finished); } @@ -285,7 +317,7 @@ impl Processor for TransformPartitionedHashJoin { if let Some(builder) = self.runtime_filter_builder.take() { let spill_happened = self.join.is_spill_happened(); let packet = builder.finish(spill_happened)?; - self.join.add_runtime_filter_packet(packet); + self.shared_rf_packets.add_packet(packet); } let rf_build_elapsed = self.instant.elapsed() - elapsed; @@ -293,11 +325,16 @@ impl Processor for TransformPartitionedHashJoin { let before_wait = self.instant.elapsed(); if wait_res.is_leader() { - let spilled = self.join.is_spill_happened(); - let packet = self.join.build_runtime_filter()?; + let packets = self.shared_rf_packets.take_packets(); + let packet = merge_join_runtime_filter_packets( + packets, + self.rf_desc.inlist_threshold, + self.rf_desc.bloom_threshold, + self.rf_desc.min_max_threshold, + self.rf_desc.spatial_threshold, + )?; info!( - "spilled: {}, globalize runtime filter: total {}, disable_all_due_to_spill: {}", - spilled, + 
"spilled: false, globalize runtime filter: total {}, disable_all_due_to_spill: {}", packet.packets.as_ref().map_or(0, |p| p.len()), packet.disable_all_due_to_spill ); From 5f1c088ff3e1151fe0bf256eae9cca3445e05847 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Apr 2026 19:16:07 +0800 Subject: [PATCH 26/38] z --- src/query/catalog/src/sbbf.rs | 78 +++++++++- .../hash_join/runtime_filter/convert.rs | 43 ++---- .../hash_join/runtime_filter/local_builder.rs | 45 +++--- .../hash_join/runtime_filter/merge.rs | 140 ++++++++++++++++-- .../hash_join/runtime_filter/mod.rs | 1 + .../hash_join/runtime_filter/packet.rs | 6 +- .../transforms/new_hash_join/common/join.rs | 4 +- .../partitioned/transform_hash_join.rs | 41 ++--- .../unpartitioned/hybrid/hybrid_join.rs | 2 +- .../unpartitioned/memory/inner_join.rs | 32 ++-- .../unpartitioned/memory/left_join_semi.rs | 32 ++-- .../unpartitioned/memory/nested_loop.rs | 4 +- .../unpartitioned/memory/right_join.rs | 32 ++-- .../unpartitioned/memory/right_join_anti.rs | 32 ++-- .../unpartitioned/memory/right_join_semi.rs | 32 ++-- .../unpartitioned/transform_hash_join.rs | 2 +- 16 files changed, 370 insertions(+), 156 deletions(-) diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index 5156d3196e221..3f176f6fb01a7 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -199,7 +199,7 @@ pub struct Sbbf(Vec); pub struct SbbfAtomic(Vec); pub(crate) const BITSET_MIN_LENGTH: usize = 32; -pub(crate) const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; +pub(crate) const BITSET_MAX_LENGTH: usize = 64 * 1024 * 1024; #[inline] fn hash_to_block_index_for_blocks(hash: u64, num_blocks: usize) -> usize { @@ -306,6 +306,35 @@ impl Sbbf { pub fn estimated_memory_size(&self) -> usize { self.0.capacity() * std::mem::size_of::() } + + /// Serialize the bloom filter to bytes (little-endian). 
+ pub fn to_bytes(&self) -> Vec { + let mut bytes = Vec::with_capacity(self.0.len() * size_of::()); + for block in &self.0 { + for word in &block.0 { + bytes.extend_from_slice(&word.to_le_bytes()); + } + } + bytes + } + + /// Deserialize a bloom filter from bytes (little-endian). + /// Returns None if bytes length is not a multiple of 32 (Block size). + pub fn from_bytes(bytes: &[u8]) -> Option { + if bytes.is_empty() || bytes.len() % size_of::() != 0 { + return None; + } + let num_blocks = bytes.len() / size_of::(); + let mut blocks = Vec::with_capacity(num_blocks); + for chunk in bytes.chunks_exact(size_of::()) { + let mut words = [0u32; 8]; + for (i, word_bytes) in chunk.chunks_exact(4).enumerate() { + words[i] = u32::from_le_bytes(word_bytes.try_into().unwrap()); + } + blocks.push(Block(words)); + } + Some(Self(blocks)) + } } impl SbbfAtomic { @@ -497,7 +526,7 @@ mod tests { (33, 64), (99, 128), (1024, 1024), - (999_000_000, 128 * 1024 * 1024), + (999_000_000, 64 * 1024 * 1024), ] { assert_eq!(*expected, optimal_num_of_bytes(*input)); } @@ -529,4 +558,49 @@ mod tests { assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64); } } + + #[test] + fn test_sbbf_to_bytes_from_bytes_roundtrip() { + let mut filter = Sbbf::new_with_ndv_fpp(1000, 0.01).unwrap(); + let hashes: Vec = (0..500).collect(); + filter.insert_hash_batch(&hashes); + + let bytes = filter.to_bytes(); + let restored = Sbbf::from_bytes(&bytes).unwrap(); + + for hash in &hashes { + assert_eq!(filter.check_hash(*hash), restored.check_hash(*hash)); + } + } + + #[test] + fn test_sbbf_from_bytes_invalid() { + assert!(Sbbf::from_bytes(&[]).is_none()); + assert!(Sbbf::from_bytes(&[0; 31]).is_none()); + assert!(Sbbf::from_bytes(&[0; 33]).is_none()); + assert!(Sbbf::from_bytes(&[0; 32]).is_some()); + assert!(Sbbf::from_bytes(&[0; 64]).is_some()); + } + + #[test] + fn test_sbbf_union_after_serialization() { + let mut f1 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 0..50 { + 
f1.insert_hash(i); + } + let mut f2 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 50..100 { + f2.insert_hash(i); + } + + let bytes1 = f1.to_bytes(); + let bytes2 = f2.to_bytes(); + let mut restored1 = Sbbf::from_bytes(&bytes1).unwrap(); + let restored2 = Sbbf::from_bytes(&bytes2).unwrap(); + restored1.union(&restored2); + + for i in 0..100 { + assert!(restored1.check_hash(i)); + } + } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs index 82dab8970d2b6..58c717d61267a 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs @@ -21,7 +21,6 @@ use databend_common_catalog::runtime_filter_info::RuntimeFilterInfo; use databend_common_catalog::runtime_filter_info::RuntimeFilterSpatial; use databend_common_catalog::runtime_filter_info::RuntimeFilterStats; use databend_common_catalog::sbbf::Sbbf; -use databend_common_catalog::sbbf::SbbfAtomic; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::Column; @@ -55,7 +54,7 @@ pub async fn build_runtime_filter_infos( packet: JoinRuntimeFilterPacket, runtime_filter_descs: HashMap, selectivity_threshold: u64, - max_threads: usize, + _max_threads: usize, ) -> Result> { let total_build_rows = packet.build_rows; let Some(packets) = packet.packets else { @@ -104,7 +103,7 @@ pub async fn build_runtime_filter_infos( }; let bloom = if bloom_enabled { if let Some(ref bloom) = packet.bloom { - Some(build_bloom_filter(bloom.clone(), probe_key, max_threads, desc.id).await?) + Some(build_bloom_filter(bloom.clone(), probe_key)?) 
} else { None } @@ -278,37 +277,14 @@ fn build_min_max_filter( Ok(min_max_filter) } -async fn build_bloom_filter( - bloom: Vec, +fn build_bloom_filter( + bloom_bytes: Vec, probe_key: &Expr, - max_threads: usize, - filter_id: usize, ) -> Result { let probe_column = resolve_probe_column_ref(probe_key); let column_name = probe_column.id.to_string(); - let total_items = bloom.len(); - - if total_items < 3_000_000 { - let mut filter = Sbbf::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))?; - filter.insert_hash_batch(&bloom); - return Ok(RuntimeFilterBloom { - column_name, - filter: Arc::new(filter), - }); - } - - let start = std::time::Instant::now(); - let builder = SbbfAtomic::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))? - .insert_hash_batch_parallel(bloom, max_threads); - let filter = builder.finish(); - log::info!( - "filter_id: {}, build_time: {:?}", - filter_id, - start.elapsed() - ); - + let filter = Sbbf::from_bytes(&bloom_bytes) + .ok_or_else(|| ErrorCode::Internal("Invalid bloom filter bytes in runtime filter"))?; Ok(RuntimeFilterBloom { column_name, filter: Arc::new(filter), @@ -331,6 +307,7 @@ fn resolve_probe_column_ref(probe_key: &Expr) -> &ColumnRef { mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::ColumnRef; use databend_common_expression::Constant; @@ -392,7 +369,11 @@ mod tests { min: Scalar::Number(1i32.into()), max: Scalar::Number(10i32.into()), }), - bloom: Some(vec![11, 22]), + bloom: Some({ + let mut f = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + f.insert_hash_batch(&[11, 22]); + f.to_bytes() + }), spatial: None, }); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs index 
7b2e724ec63d7..704e5e4bb2e3b 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use databend_common_catalog::sbbf::Sbbf; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::Column; @@ -47,8 +49,9 @@ struct SingleFilterBuilder { inlist_builder: Option, inlist_threshold: usize, - bloom_hashes: Option>, + bloom_filter: Option, bloom_threshold: usize, + bloom_disabled: bool, is_spatial: bool, spatial_rects: Vec<(f64, f64, f64, f64)>, @@ -90,12 +93,21 @@ impl SingleFilterBuilder { } else { 0 }, - bloom_hashes: None, + bloom_filter: if desc.enable_bloom_runtime_filter && bloom_threshold > 0 { + let ndv = match desc.build_table_rows { + Some(rows) => (rows as u64).min(bloom_threshold as u64), + None => bloom_threshold as u64, + }; + Some(Sbbf::new_with_ndv_fpp(ndv, 0.01).map_err(|e| ErrorCode::Internal(e))?) 
+ } else { + None + }, bloom_threshold: if desc.enable_bloom_runtime_filter { bloom_threshold } else { 0 }, + bloom_disabled: !desc.enable_bloom_runtime_filter || bloom_threshold == 0, is_spatial: desc.is_spatial, spatial_rects: Vec::new(), spatial_srid: None, @@ -142,22 +154,21 @@ impl SingleFilterBuilder { } fn add_bloom(&mut self, column: &Column, new_total: usize) -> Result<()> { - if new_total > self.bloom_threshold { - self.bloom_hashes = None; + if self.bloom_disabled || new_total > self.bloom_threshold { + self.bloom_filter = None; + self.bloom_disabled = true; return Ok(()); } - let mut hashes = match self.bloom_hashes.take() { - Some(h) => h, - None => Vec::with_capacity(column.len()), - }; - hashes.reserve(column.len()); - let entry = BlockEntry::from(column.clone()); - let hash_method = self - .hash_method - .as_ref() - .expect("hash_method must exist for non-spatial filters"); - hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; - self.bloom_hashes = Some(hashes); + if let Some(ref mut filter) = self.bloom_filter { + let mut hashes = Vec::with_capacity(column.len()); + let entry = BlockEntry::from(column.clone()); + let hash_method = self + .hash_method + .as_ref() + .expect("hash_method must exist for non-spatial filters"); + hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; + filter.insert_hash_batch(&hashes); + } Ok(()) } @@ -219,7 +230,7 @@ impl SingleFilterBuilder { None }; - let bloom = self.bloom_hashes.take(); + let bloom = self.bloom_filter.take().map(|f| f.to_bytes()); Ok(RuntimeFilterPacket { id: self.id, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs index 18282b1194a64..f528478021cf9 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use databend_common_catalog::sbbf::Sbbf; use databend_common_exception::Result; use databend_common_expression::Column; @@ -158,25 +159,21 @@ fn merge_min_max( Some(SerializableDomain { min, max }) } -fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { +fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { if packets .iter() .any(|packet| packet.get(&rf_id).unwrap().bloom.is_none()) { return None; } - let mut bloom = packets[0] - .get(&rf_id) - .unwrap() - .bloom - .as_ref() - .unwrap() - .clone(); + let first_bytes = packets[0].get(&rf_id).unwrap().bloom.as_ref().unwrap(); + let mut merged = Sbbf::from_bytes(first_bytes)?; for packet in packets.iter().skip(1) { - let other = packet.get(&rf_id).unwrap().bloom.as_ref().unwrap(); - bloom.extend_from_slice(other); + let other_bytes = packet.get(&rf_id).unwrap().bloom.as_ref().unwrap(); + let other = Sbbf::from_bytes(other_bytes)?; + merged.union(&other); } - Some(bloom) + Some(merged.to_bytes()) } fn merge_spatial( @@ -223,10 +220,110 @@ fn merge_spatial( })) } +/// Pairwise merge of two runtime filter packets without threshold checks. +/// Used for work-stealing incremental merge within a node. 
+pub fn merge_two_runtime_filter_packets( + mut a: JoinRuntimeFilterPacket, + mut b: JoinRuntimeFilterPacket, +) -> Result { + let total_build_rows = a.build_rows + b.build_rows; + let disable_all = a.disable_all_due_to_spill || b.disable_all_due_to_spill; + + if disable_all { + return Ok(JoinRuntimeFilterPacket::disable_all(total_build_rows)); + } + + let (a_packets, b_packets) = match (a.packets.take(), b.packets.take()) { + (None, None) => { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + (Some(p), None) | (None, Some(p)) => { + return Ok(JoinRuntimeFilterPacket::complete(p, total_build_rows)); + } + (Some(a), Some(b)) => (a, b), + }; + + let mut result = HashMap::new(); + for (id, mut a_pkt) in a_packets { + if let Some(mut b_pkt) = b_packets.get(&id).cloned() { + // Merge bloom via Sbbf::union + let bloom = match (a_pkt.bloom.take(), b_pkt.bloom.take()) { + (Some(a_bytes), Some(b_bytes)) => { + if let (Some(mut a_filter), Some(b_filter)) = + (Sbbf::from_bytes(&a_bytes), Sbbf::from_bytes(&b_bytes)) + { + a_filter.union(&b_filter); + Some(a_filter.to_bytes()) + } else { + None + } + } + _ => None, + }; + + // Merge inlist via concat + let inlist = match (a_pkt.inlist.take(), b_pkt.inlist.take()) { + (Some(a_col), Some(b_col)) => { + Some(Column::concat_columns([a_col, b_col].into_iter())?) 
+ } + _ => None, + }; + + // Merge min_max + let min_max = match (a_pkt.min_max.take(), b_pkt.min_max.take()) { + (Some(a_mm), Some(b_mm)) => Some(SerializableDomain { + min: a_mm.min.min(b_mm.min), + max: a_mm.max.max(b_mm.max), + }), + _ => None, + }; + + // Merge spatial + let spatial = match (a_pkt.spatial.take(), b_pkt.spatial.take()) { + (Some(a_sp), Some(b_sp)) => { + if a_sp.valid && b_sp.valid && a_sp.srid == b_sp.srid { + let rtrees = merge_rtrees_to_threshold( + vec![a_sp.rtrees.as_slice(), b_sp.rtrees.as_slice()], + usize::MAX, + )?; + Some(SpatialPacket { + valid: true, + srid: a_sp.srid, + rtrees, + }) + } else { + None + } + } + _ => None, + }; + + result.insert(id, RuntimeFilterPacket { + id, + bloom, + inlist, + min_max, + spatial, + }); + } + } + + if result.is_empty() { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + + Ok(JoinRuntimeFilterPacket::complete(result, total_build_rows)) +} + #[cfg(test)] mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::Scalar; use databend_common_expression::types::DataType; @@ -244,6 +341,12 @@ mod tests { builder.build() } + fn make_bloom(hashes: &[u64]) -> Vec { + let mut filter = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + filter.insert_hash_batch(hashes); + filter.to_bytes() + } + #[test] fn test_merge_short_circuit_all_types() -> Result<()> { let mut runtime_filters = HashMap::new(); @@ -254,7 +357,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(3)), }), - bloom: Some(vec![11, 22, 33]), + bloom: Some(make_bloom(&[11, 22, 33])), spatial: None, }); @@ -272,6 +375,9 @@ mod tests { #[test] fn test_merge_short_circuit_inlist_only() -> Result<()> { + let bloom1 = make_bloom(&[1, 2]); + let bloom2 = make_bloom(&[3, 4]); + let mut runtime_filters_1 = HashMap::new(); runtime_filters_1.insert(7, 
RuntimeFilterPacket { id: 7, @@ -280,7 +386,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(5)), }), - bloom: Some(vec![1, 2]), + bloom: Some(bloom1.clone()), spatial: None, }); let mut runtime_filters_2 = HashMap::new(); @@ -291,7 +397,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(-1)), max: Scalar::Number(NumberScalar::Int32(8)), }), - bloom: Some(vec![3, 4]), + bloom: Some(bloom2.clone()), spatial: None, }); @@ -309,7 +415,11 @@ mod tests { let packet = merged.packets.unwrap().remove(&7).unwrap(); assert_eq!(merged.build_rows, 11); assert!(packet.inlist.is_none()); - assert_eq!(packet.bloom, Some(vec![1, 2, 3, 4])); + // Bloom should be a merged Sbbf containing all hashes + let merged_filter = Sbbf::from_bytes(packet.bloom.as_ref().unwrap()).unwrap(); + for h in &[1u64, 2, 3, 4] { + assert!(merged_filter.check_hash(*h)); + } assert_eq!( packet.min_max, Some(SerializableDomain { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs index 0fbf2e9b84717..0fb339b4f21e1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs @@ -26,4 +26,5 @@ pub use global::get_global_runtime_filter_packet; pub use interface::build_and_push_down_runtime_filter; pub use local_builder::RuntimeFilterLocalBuilder; pub use merge::merge_join_runtime_filter_packets; +pub use merge::merge_two_runtime_filter_packets; pub use packet::JoinRuntimeFilterPacket; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs index 707eff18a212f..1c0eacea8cdf2 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs @@ -45,7 +45,7 @@ pub struct RuntimeFilterPacket { pub id: usize, pub inlist: Option, pub min_max: Option, - pub bloom: Option>, + pub bloom: Option>, pub spatial: Option, } @@ -161,7 +161,7 @@ impl TryInto for JoinRuntimeFilterPacket { bloom_pos = Some(entities.len()); let builder = ArrayColumnBuilder { - builder: ColumnBuilder::Number(NumberColumnBuilder::UInt64(bloom_filter)), + builder: ColumnBuilder::Number(NumberColumnBuilder::UInt8(bloom_filter)), offsets: vec![0, len], }; entities.push(Column::Array(Box::new(builder.build()))); @@ -228,7 +228,7 @@ impl TryFrom for JoinRuntimeFilterPacket { let array_column = column.into_array().expect("it's a bug"); let bloom_value_column = array_column.index(0).expect("It's a bug"); bloom = Some(match bloom_value_column { - Column::Number(NumberColumn::UInt64(v)) => v.to_vec(), + Column::Number(NumberColumn::UInt8(v)) => v.to_vec(), _ => unreachable!("Unexpected runtime bloom filter column type"), }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs index 524b026b8a158..a3a8493f4f731 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs @@ -32,7 +32,9 @@ pub trait Join: Send + Sync + 'static { /// returns its progress. Once all batches are consumed it returns `None` to signal completion. fn final_build(&mut self) -> Result>; - fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) {} + fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) -> Result<()> { + Ok(()) + } /// Generate runtime filter packet for the given filter description. 
fn build_runtime_filter(&self) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs index bd58c52a57dbb..292fbf4c0cdff 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -44,7 +44,7 @@ use super::PartitionedRightSemiJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; @@ -61,16 +61,25 @@ impl SharedRuntimeFilterPackets { }) } - pub fn add_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.packets.lock(); - let mut locked = locked.unwrap_or_else(PoisonError::into_inner); - locked.push(packet); + pub fn merge_packet(&self, mut my_packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.packets.lock(); + let mut guard = locked.unwrap_or_else(PoisonError::into_inner); + + if guard.is_empty() { + guard.push(my_packet); + return Ok(()); + } + + let other = guard.pop().unwrap(); + drop(guard); + my_packet = merge_two_runtime_filter_packets(my_packet, other)?; + } } - pub fn take_packets(&self) -> Vec { - let locked = self.packets.lock(); - let mut locked = locked.unwrap_or_else(PoisonError::into_inner); - std::mem::take(&mut 
*locked) + pub fn take_packet(&self) -> Option { + let mut guard = self.packets.lock().unwrap_or_else(PoisonError::into_inner); + guard.pop() } } @@ -317,7 +326,7 @@ impl Processor for TransformPartitionedHashJoin { if let Some(builder) = self.runtime_filter_builder.take() { let spill_happened = self.join.is_spill_happened(); let packet = builder.finish(spill_happened)?; - self.shared_rf_packets.add_packet(packet); + self.shared_rf_packets.merge_packet(packet)?; } let rf_build_elapsed = self.instant.elapsed() - elapsed; @@ -325,14 +334,10 @@ impl Processor for TransformPartitionedHashJoin { let before_wait = self.instant.elapsed(); if wait_res.is_leader() { - let packets = self.shared_rf_packets.take_packets(); - let packet = merge_join_runtime_filter_packets( - packets, - self.rf_desc.inlist_threshold, - self.rf_desc.bloom_threshold, - self.rf_desc.min_max_threshold, - self.rf_desc.spatial_threshold, - )?; + let packet = self + .shared_rf_packets + .take_packet() + .unwrap_or_else(|| JoinRuntimeFilterPacket::complete_without_filters(0)); info!( "spilled: false, globalize runtime filter: total {}, disable_all_due_to_spill: {}", packet.packets.as_ref().map_or(0, |p| p.len()), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs index 85c0268910685..c8d2bd46e102c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs @@ -212,7 +212,7 @@ impl Join for HybridHashJoin { } } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { match &self.mode { HybridJoinMode::Memory(join) => join.add_runtime_filter_packet(packet), 
HybridJoinMode::Grace(join) => join.add_runtime_filter_packet(packet), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index a039492b46d56..35e93ea256938 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -34,7 +34,7 @@ use super::basic_state::BasicHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; @@ -105,21 +105,27 @@ impl Join for InnerHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = 
merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs index d4245609a7e29..6c4dbfaaf8400 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs @@ -37,7 +37,7 @@ use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; @@ -107,21 +107,27 @@ impl Join for SemiLeftHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - 
self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs index 75ad373691556..21529fc43bd9d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs @@ -89,8 +89,8 @@ impl Join for NestedLoopJoin { self.inner.final_build() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - self.inner.add_runtime_filter_packet(packet); + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { + self.inner.add_runtime_filter_packet(packet) } fn build_runtime_filter(&self) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs index b84b8cd422991..884d14a056eaf 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs @@ -38,7 +38,7 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; @@ -111,21 +111,27 @@ impl Join for OuterRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - 
merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs index dc24e6d27b990..f64991c528174 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs @@ -35,7 +35,7 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; @@ -105,21 +105,27 @@ impl Join for AntiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = 
locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs index 8338ca158541d..dfd4861acd55f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs @@ -36,7 +36,7 @@ use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; @@ -108,21 +108,27 @@ impl Join for 
SemiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs index 815827b3fbe48..84c9fa35b87aa 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs @@ -227,7 +227,7 @@ impl Processor for TransformHashJoin { // Disable runtime filters once spilling occurs to avoid partial-build filters // being globalized across the cluster, which can prune valid probe rows. 
let packet = builder.finish(spill_happened)?; - self.join.add_runtime_filter_packet(packet); + self.join.add_runtime_filter_packet(packet)?; } let rf_build_elapsed = self.instant.elapsed() - elapsed; From b17090c90b6283850032219560a3b58066fe602a Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Apr 2026 19:37:39 +0800 Subject: [PATCH 27/38] z --- src/query/catalog/src/sbbf.rs | 2 +- .../hash_join/runtime_filter/local_builder.rs | 4 ++-- .../new_hash_join/unpartitioned/memory/inner_join.rs | 12 ------------ .../unpartitioned/memory/left_join_semi.rs | 12 ------------ .../new_hash_join/unpartitioned/memory/right_join.rs | 12 ------------ .../unpartitioned/memory/right_join_anti.rs | 12 ------------ .../unpartitioned/memory/right_join_semi.rs | 12 ------------ 7 files changed, 3 insertions(+), 63 deletions(-) diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index 3f176f6fb01a7..510a8d1397f5a 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -321,7 +321,7 @@ impl Sbbf { /// Deserialize a bloom filter from bytes (little-endian). /// Returns None if bytes length is not a multiple of 32 (Block size). 
pub fn from_bytes(bytes: &[u8]) -> Option { - if bytes.is_empty() || bytes.len() % size_of::() != 0 { + if bytes.is_empty() || !bytes.len().is_multiple_of(size_of::()) { return None; } let num_blocks = bytes.len() / size_of::(); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs index 704e5e4bb2e3b..d973bad3c7e14 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs @@ -95,10 +95,10 @@ impl SingleFilterBuilder { }, bloom_filter: if desc.enable_bloom_runtime_filter && bloom_threshold > 0 { let ndv = match desc.build_table_rows { - Some(rows) => (rows as u64).min(bloom_threshold as u64), + Some(rows) => rows.min(bloom_threshold as u64), None => bloom_threshold as u64, }; - Some(Sbbf::new_with_ndv_fpp(ndv, 0.01).map_err(|e| ErrorCode::Internal(e))?) + Some(Sbbf::new_with_ndv_fpp(ndv, 0.01).map_err(ErrorCode::Internal)?) 
} else { None }, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index 35e93ea256938..5881035096fd0 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -50,10 +50,6 @@ pub struct InnerHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl InnerHashJoin { @@ -66,10 +62,6 @@ impl InnerHashJoin { nested_loop_join_threshold: usize, ) -> Result { let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -88,10 +80,6 @@ impl InnerHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs index 6c4dbfaaf8400..dfae7ab8aae78 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs @@ -52,10 +52,6 @@ pub struct SemiLeftHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl SemiLeftHashJoin { @@ -68,10 +64,6 @@ impl SemiLeftHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiLeftHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs index 884d14a056eaf..cef266511b80b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs @@ -53,10 +53,6 @@ pub struct OuterRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -71,10 +67,6 @@ impl OuterRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -93,10 +85,6 @@ impl OuterRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs index f64991c528174..bd35eda235a20 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs @@ -47,10 +47,6 @@ pub struct AntiRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -65,10 +61,6 @@ impl AntiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -87,10 +79,6 @@ impl AntiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs index dfd4861acd55f..df860823ec4c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs @@ -50,10 +50,6 @@ pub struct SemiRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -68,10 +64,6 @@ impl SemiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } From de4bd6e57dc28b7a36f952b27f7f39ca8c696891 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Apr 2026 20:35:43 +0800 Subject: [PATCH 28/38] z --- src/query/settings/src/settings_default.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 33800c1c17f4d..8e62886c70216 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1585,7 +1585,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=1)), }), ("enable_partitioned_hash_join", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enables partitioned hash join for shuffle join.", mode: SettingMode::Both, scope: SettingScope::Both, From 471a2c933f4bfce08041fd4ae953cbf6b21c19a3 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 1 Apr 2026 23:23:28 +0800 Subject: [PATCH 29/38] z --- .../partitioned/transform_hash_join.rs | 12 ++++++------ .../unpartitioned/transform_hash_join.rs | 14 ++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs index 292fbf4c0cdff..27a6edaa92a5a 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -254,6 +254,12 @@ impl Processor for 
TransformPartitionedHashJoin { if !state.finished { state.finished = true; self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.shared_rf_packets.merge_packet(packet)?; + } } return Ok(()); }; @@ -323,12 +329,6 @@ impl Processor for TransformPartitionedHashJoin { Stage::Build(_) => { let wait_res = self.stage_sync_barrier.wait().await; - if let Some(builder) = self.runtime_filter_builder.take() { - let spill_happened = self.join.is_spill_happened(); - let packet = builder.finish(spill_happened)?; - self.shared_rf_packets.merge_packet(packet)?; - } - let rf_build_elapsed = self.instant.elapsed() - elapsed; let _wait_res = self.stage_sync_barrier.wait().await; let before_wait = self.instant.elapsed(); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs index 84c9fa35b87aa..25d9074c1a639 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs @@ -152,6 +152,12 @@ impl Processor for TransformHashJoin { if !state.finished { state.finished = true; self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.join.add_runtime_filter_packet(packet)?; + } } return Ok(()); }; @@ -222,14 +228,6 @@ impl Processor for TransformHashJoin { self.stage = match &mut self.stage { Stage::Build(_) => { - if let Some(builder) = self.runtime_filter_builder.take() { - let spill_happened = self.join.is_spill_happened(); - // Disable runtime filters once spilling occurs 
to avoid partial-build filters - // being globalized across the cluster, which can prune valid probe rows. - let packet = builder.finish(spill_happened)?; - self.join.add_runtime_filter_packet(packet)?; - } - let rf_build_elapsed = self.instant.elapsed() - elapsed; let _wait_res = self.stage_sync_barrier.wait().await; let before_wait = self.instant.elapsed(); From 363ba4440e9ebaf7257289a5de3269ea3f29e1d8 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 00:24:27 +0800 Subject: [PATCH 30/38] z --- src/query/catalog/src/sbbf.rs | 69 +++++++++---------- .../hash_join/runtime_filter/convert.rs | 8 +-- .../hash_join/runtime_filter/local_builder.rs | 2 +- .../hash_join/runtime_filter/merge.rs | 24 +++---- .../hash_join/runtime_filter/packet.rs | 6 +- 5 files changed, 51 insertions(+), 58 deletions(-) diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index 510a8d1397f5a..d3646cb31785f 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -307,33 +307,26 @@ impl Sbbf { self.0.capacity() * std::mem::size_of::() } - /// Serialize the bloom filter to bytes (little-endian). - pub fn to_bytes(&self) -> Vec { - let mut bytes = Vec::with_capacity(self.0.len() * size_of::()); - for block in &self.0 { - for word in &block.0 { - bytes.extend_from_slice(&word.to_le_bytes()); - } - } - bytes - } - - /// Deserialize a bloom filter from bytes (little-endian). - /// Returns None if bytes length is not a multiple of 32 (Block size). - pub fn from_bytes(bytes: &[u8]) -> Option { - if bytes.is_empty() || !bytes.len().is_multiple_of(size_of::()) { + /// Zero-copy serialize to Vec, consuming self. + pub fn into_u32s(self) -> Vec { + let mut blocks = std::mem::ManuallyDrop::new(self.0); + let ptr = blocks.as_mut_ptr() as *mut u32; + let len = blocks.len() * 8; + let cap = blocks.capacity() * 8; + unsafe { Vec::from_raw_parts(ptr, len, cap) } + } + + /// Zero-copy deserialize from Vec. 
+ /// Returns None if length is not a multiple of 8 (one Block = 8 x u32). + pub fn from_u32s(words: Vec) -> Option { + if words.is_empty() || !words.len().is_multiple_of(8) { return None; } - let num_blocks = bytes.len() / size_of::(); - let mut blocks = Vec::with_capacity(num_blocks); - for chunk in bytes.chunks_exact(size_of::()) { - let mut words = [0u32; 8]; - for (i, word_bytes) in chunk.chunks_exact(4).enumerate() { - words[i] = u32::from_le_bytes(word_bytes.try_into().unwrap()); - } - blocks.push(Block(words)); - } - Some(Self(blocks)) + let mut words = std::mem::ManuallyDrop::new(words); + let len = words.len() / 8; + let cap = words.capacity() / 8; + let ptr = words.as_mut_ptr() as *mut Block; + Some(Self(unsafe { Vec::from_raw_parts(ptr, len, cap) })) } } @@ -565,21 +558,21 @@ mod tests { let hashes: Vec = (0..500).collect(); filter.insert_hash_batch(&hashes); - let bytes = filter.to_bytes(); - let restored = Sbbf::from_bytes(&bytes).unwrap(); + let words = filter.into_u32s(); + let restored = Sbbf::from_u32s(words).unwrap(); for hash in &hashes { - assert_eq!(filter.check_hash(*hash), restored.check_hash(*hash)); + assert!(restored.check_hash(*hash)); } } #[test] - fn test_sbbf_from_bytes_invalid() { - assert!(Sbbf::from_bytes(&[]).is_none()); - assert!(Sbbf::from_bytes(&[0; 31]).is_none()); - assert!(Sbbf::from_bytes(&[0; 33]).is_none()); - assert!(Sbbf::from_bytes(&[0; 32]).is_some()); - assert!(Sbbf::from_bytes(&[0; 64]).is_some()); + fn test_sbbf_from_u32s_invalid() { + assert!(Sbbf::from_u32s(vec![]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 7]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 9]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 8]).is_some()); + assert!(Sbbf::from_u32s(vec![0; 16]).is_some()); } #[test] @@ -593,10 +586,10 @@ mod tests { f2.insert_hash(i); } - let bytes1 = f1.to_bytes(); - let bytes2 = f2.to_bytes(); - let mut restored1 = Sbbf::from_bytes(&bytes1).unwrap(); - let restored2 = Sbbf::from_bytes(&bytes2).unwrap(); + let 
words1 = f1.into_u32s(); + let words2 = f2.into_u32s(); + let mut restored1 = Sbbf::from_u32s(words1).unwrap(); + let restored2 = Sbbf::from_u32s(words2).unwrap(); restored1.union(&restored2); for i in 0..100 { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs index 58c717d61267a..2174f39789a0e 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs @@ -278,13 +278,13 @@ fn build_min_max_filter( } fn build_bloom_filter( - bloom_bytes: Vec, + bloom_words: Vec, probe_key: &Expr, ) -> Result { let probe_column = resolve_probe_column_ref(probe_key); let column_name = probe_column.id.to_string(); - let filter = Sbbf::from_bytes(&bloom_bytes) - .ok_or_else(|| ErrorCode::Internal("Invalid bloom filter bytes in runtime filter"))?; + let filter = Sbbf::from_u32s(bloom_words) + .ok_or_else(|| ErrorCode::Internal("Invalid bloom filter data in runtime filter"))?; Ok(RuntimeFilterBloom { column_name, filter: Arc::new(filter), @@ -372,7 +372,7 @@ mod tests { bloom: Some({ let mut f = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); f.insert_hash_batch(&[11, 22]); - f.to_bytes() + f.into_u32s() }), spatial: None, }); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs index d973bad3c7e14..7cd9bd9e81455 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs @@ -230,7 +230,7 @@ impl SingleFilterBuilder { None }; - let bloom = self.bloom_filter.take().map(|f| f.to_bytes()); + let bloom = 
self.bloom_filter.take().map(|f| f.into_u32s()); Ok(RuntimeFilterPacket { id: self.id, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs index f528478021cf9..02dd73b810650 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs @@ -159,21 +159,21 @@ fn merge_min_max( Some(SerializableDomain { min, max }) } -fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { +fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { if packets .iter() .any(|packet| packet.get(&rf_id).unwrap().bloom.is_none()) { return None; } - let first_bytes = packets[0].get(&rf_id).unwrap().bloom.as_ref().unwrap(); - let mut merged = Sbbf::from_bytes(first_bytes)?; + let first = packets[0].get(&rf_id).unwrap().bloom.clone().unwrap(); + let mut merged = Sbbf::from_u32s(first)?; for packet in packets.iter().skip(1) { - let other_bytes = packet.get(&rf_id).unwrap().bloom.as_ref().unwrap(); - let other = Sbbf::from_bytes(other_bytes)?; + let other_words = packet.get(&rf_id).unwrap().bloom.clone().unwrap(); + let other = Sbbf::from_u32s(other_words)?; merged.union(&other); } - Some(merged.to_bytes()) + Some(merged.into_u32s()) } fn merge_spatial( @@ -250,12 +250,12 @@ pub fn merge_two_runtime_filter_packets( if let Some(mut b_pkt) = b_packets.get(&id).cloned() { // Merge bloom via Sbbf::union let bloom = match (a_pkt.bloom.take(), b_pkt.bloom.take()) { - (Some(a_bytes), Some(b_bytes)) => { + (Some(a_words), Some(b_words)) => { if let (Some(mut a_filter), Some(b_filter)) = - (Sbbf::from_bytes(&a_bytes), Sbbf::from_bytes(&b_bytes)) + (Sbbf::from_u32s(a_words), Sbbf::from_u32s(b_words)) { a_filter.union(&b_filter); - Some(a_filter.to_bytes()) + Some(a_filter.into_u32s()) } else { None } @@ -341,10 +341,10 @@ mod 
tests { builder.build() } - fn make_bloom(hashes: &[u64]) -> Vec { + fn make_bloom(hashes: &[u64]) -> Vec { let mut filter = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); filter.insert_hash_batch(hashes); - filter.to_bytes() + filter.into_u32s() } #[test] @@ -416,7 +416,7 @@ mod tests { assert_eq!(merged.build_rows, 11); assert!(packet.inlist.is_none()); // Bloom should be a merged Sbbf containing all hashes - let merged_filter = Sbbf::from_bytes(packet.bloom.as_ref().unwrap()).unwrap(); + let merged_filter = Sbbf::from_u32s(packet.bloom.unwrap()).unwrap(); for h in &[1u64, 2, 3, 4] { assert!(merged_filter.check_hash(*h)); } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs index 1c0eacea8cdf2..ffe5263cf6bd3 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs @@ -45,7 +45,7 @@ pub struct RuntimeFilterPacket { pub id: usize, pub inlist: Option, pub min_max: Option, - pub bloom: Option>, + pub bloom: Option>, pub spatial: Option, } @@ -161,7 +161,7 @@ impl TryInto for JoinRuntimeFilterPacket { bloom_pos = Some(entities.len()); let builder = ArrayColumnBuilder { - builder: ColumnBuilder::Number(NumberColumnBuilder::UInt8(bloom_filter)), + builder: ColumnBuilder::Number(NumberColumnBuilder::UInt32(bloom_filter)), offsets: vec![0, len], }; entities.push(Column::Array(Box::new(builder.build()))); @@ -228,7 +228,7 @@ impl TryFrom for JoinRuntimeFilterPacket { let array_column = column.into_array().expect("it's a bug"); let bloom_value_column = array_column.index(0).expect("It's a bug"); bloom = Some(match bloom_value_column { - Column::Number(NumberColumn::UInt8(v)) => v.to_vec(), + Column::Number(NumberColumn::UInt32(v)) => v.to_vec(), _ => unreachable!("Unexpected runtime bloom filter 
column type"), }) } From fe0d92083f5f525b72d220149855004b3ef12002 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 00:54:27 +0800 Subject: [PATCH 31/38] z --- src/query/catalog/src/sbbf.rs | 77 +++++++++++++++++-- .../hash_join/runtime_filter/merge.rs | 29 ++++--- 2 files changed, 89 insertions(+), 17 deletions(-) diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index d3646cb31785f..30f123a259c99 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -73,9 +73,15 @@ //! [sbbf-paper]: https://arxiv.org/pdf/2101.01719 //! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf -use core::simd::Simd; -use core::simd::cmp::SimdPartialEq; +// Use NEON intrinsics on aarch64 for better performance +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; use std::mem::size_of; +// Use portable SIMD on other platforms +#[cfg(not(target_arch = "aarch64"))] +use std::simd::Simd; +#[cfg(not(target_arch = "aarch64"))] +use std::simd::cmp::SimdPartialEq; use std::sync::Arc; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; @@ -83,7 +89,11 @@ use std::sync::atomic::Ordering; use databend_common_base::runtime::Runtime; /// Salt values as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). -const SALT: [u32; 8] = [ +/// 32-byte aligned for optimal SIMD load performance. +#[repr(C, align(32))] +struct AlignedSalt([u32; 8]); + +static SALT: AlignedSalt = AlignedSalt([ 0x47b6137b_u32, 0x44974d91_u32, 0x8824ad5b_u32, @@ -92,7 +102,10 @@ const SALT: [u32; 8] = [ 0x2df1424b_u32, 0x9efc4947_u32, 0x5c6bfb31_u32, -]; +]); + +/// Shift amount for extracting bit index: (hash * salt) >> 27 gives 5 bits (0-31) +const SHIFT_NUM: i32 = 27; /// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. /// Each word is thought of as an array of bits; each bit is either "set" or "not set". 
@@ -100,6 +113,7 @@ const SALT: [u32; 8] = [ #[repr(transparent)] struct Block([u32; 8]); +#[cfg(not(target_arch = "aarch64"))] type U32x8 = Simd; impl Block { @@ -107,6 +121,33 @@ impl Block { /// takes as its argument a single unsigned 32-bit integer and returns a block in which each /// word has exactly one bit set. + #[cfg(target_arch = "aarch64")] + #[inline] + fn mask(x: u32) -> Self { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(x); + let mut result = [0u32; 8]; + vst1q_u32_x2(result.as_mut_ptr(), uint32x4x2_t(mask_lo, mask_hi)); + Self(result) + } + } + + #[cfg(target_arch = "aarch64")] + #[inline(always)] + unsafe fn mask_neon(x: u32) -> (uint32x4_t, uint32x4_t) { + unsafe { + let ones = vdupq_n_u32(1); + let hash_data = vdupq_n_u32(x); + let salt = vld1q_u32_x2(SALT.0.as_ptr()); + let bit_index_lo = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.0, hash_data))); + let bit_index_hi = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.1, hash_data))); + (vshlq_u32(ones, bit_index_lo), vshlq_u32(ones, bit_index_hi)) + } + } + + #[cfg(not(target_arch = "aarch64"))] fn mask(x: u32) -> Self { Self(Self::mask_simd(x).to_array()) } @@ -132,6 +173,18 @@ impl Block { } /// Setting every bit in the block that was also set in the result from mask + #[cfg(target_arch = "aarch64")] + #[inline] + fn insert(&mut self, hash: u32) { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + let result = uint32x4x2_t(vorrq_u32(data.0, mask_lo), vorrq_u32(data.1, mask_hi)); + vst1q_u32_x2(self.0.as_mut_ptr(), result); + } + } + + #[cfg(not(target_arch = "aarch64"))] fn insert(&mut self, hash: u32) { let mask = Self::mask(hash); for i in 0..8 { @@ -140,16 +193,30 @@ impl Block { } /// Returns true when every bit that is set in the result of mask is also set in the block. 
+ #[cfg(target_arch = "aarch64")] + #[inline] + fn check(&self, hash: u32) -> bool { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + // vbicq_u32(a, b) = a & !b: bits set in mask but not in data + let miss = vorrq_u32(vbicq_u32(mask_lo, data.0), vbicq_u32(mask_hi, data.1)); + vmaxvq_u32(miss) == 0 + } + } + + #[cfg(not(target_arch = "aarch64"))] fn check(&self, hash: u32) -> bool { let mask = Self::mask_simd(hash); let block_vec = U32x8::from_array(self.0); (block_vec & mask).simd_ne(U32x8::splat(0)).all() } + #[cfg(not(target_arch = "aarch64"))] #[inline(always)] fn mask_simd(x: u32) -> U32x8 { let hash_vec = U32x8::splat(x); - let salt_vec = U32x8::from_array(SALT); + let salt_vec = U32x8::from_array(SALT.0); let bit_index = (hash_vec * salt_vec) >> U32x8::splat(27); U32x8::splat(1) << bit_index } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs index 02dd73b810650..acf623ed5d2a1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs @@ -51,7 +51,7 @@ pub fn merge_join_runtime_filter_packets( let should_merge_bloom = total_build_rows < bloom_threshold; let should_merge_min_max = total_build_rows < min_max_threshold; - let packets = packets + let mut packets = packets .into_iter() .filter_map(|packet| packet.packets) .collect::>(); @@ -63,26 +63,27 @@ pub fn merge_join_runtime_filter_packets( )); } + let keys: Vec = packets[0].keys().copied().collect(); let mut result = HashMap::new(); - for id in packets[0].keys() { - result.insert(*id, RuntimeFilterPacket { - id: *id, + for id in keys { + result.insert(id, RuntimeFilterPacket { + id, inlist: if should_merge_inlist { - merge_inlist(&packets, *id)? + merge_inlist(&packets, id)? 
} else { None }, min_max: if should_merge_min_max { - merge_min_max(&packets, *id) + merge_min_max(&packets, id) } else { None }, bloom: if should_merge_bloom { - merge_bloom(&packets, *id) + merge_bloom(&mut packets, id) } else { None }, - spatial: merge_spatial(&packets, *id, spatial_threshold)?, + spatial: merge_spatial(&packets, id, spatial_threshold)?, }); } @@ -159,17 +160,21 @@ fn merge_min_max( Some(SerializableDomain { min, max }) } -fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { +fn merge_bloom( + packets: &mut [HashMap], + rf_id: usize, +) -> Option> { if packets .iter() .any(|packet| packet.get(&rf_id).unwrap().bloom.is_none()) { return None; } - let first = packets[0].get(&rf_id).unwrap().bloom.clone().unwrap(); + + let first = packets[0].get_mut(&rf_id).unwrap().bloom.take().unwrap(); let mut merged = Sbbf::from_u32s(first)?; - for packet in packets.iter().skip(1) { - let other_words = packet.get(&rf_id).unwrap().bloom.clone().unwrap(); + for packet in packets.iter_mut().skip(1) { + let other_words = packet.get_mut(&rf_id).unwrap().bloom.take().unwrap(); let other = Sbbf::from_u32s(other_words)?; merged.union(&other); } From 510dc4fb7c5b06a8d4e950abfb059043dc8add29 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 10:44:44 +0800 Subject: [PATCH 32/38] z --- src/query/settings/src/settings_default.rs | 7 +++++++ src/query/settings/src/settings_getter_setter.rs | 4 ++++ src/query/sql/src/planner/plans/join.rs | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 8e62886c70216..c493f1024c27a 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -623,6 +623,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("broadcast_join_max_build_rows", DefaultSettingValue { + value: 
UserSettingValue::UInt64(30_000_000), + desc: "Maximum estimated build-side rows for broadcast join when partitioned hash join is enabled. 0 means no limit.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), ("grouping_sets_to_union", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Enables grouping sets to union.", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 50fc69ab2b53d..bb226bd36a2cd 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -465,6 +465,10 @@ impl Settings { Ok(self.try_get_u64("enforce_shuffle_join")? != 0) } + pub fn get_broadcast_join_max_build_rows(&self) -> Result { + self.try_get_u64("broadcast_join_max_build_rows") + } + pub fn get_enable_merge_into_row_fetch(&self) -> Result { Ok(self.try_get_u64("enable_merge_into_row_fetch")? != 0) } diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs index af67786a076f0..434b8455786eb 100644 --- a/src/query/sql/src/planner/plans/join.rs +++ b/src/query/sql/src/planner/plans/join.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_settings::Settings; use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS; use databend_common_statistics::Datum; use databend_common_statistics::Histogram; @@ -547,6 +548,18 @@ impl Join { .iter() .any(|expr| expr.has_subquery()) } + + fn enforce_shuffle_join(settings: &Settings, right_stat_info: &Arc) -> Result { + let max_build_rows = settings.get_broadcast_join_max_build_rows()?; + if max_build_rows > 0 + && settings.get_enable_partitioned_hash_join()? 
+ && right_stat_info.cardinality > max_build_rows as f64 + { + return Ok(true); + } + + settings.get_enforce_shuffle_join() + } } impl Operator for Join { @@ -715,7 +728,8 @@ impl Operator for Join { // Use a very large value to prevent broadcast join. 1000.0 }; - if !settings.get_enforce_shuffle_join()? + + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? && (right_stat_info.cardinality * broadcast_join_threshold < left_stat_info.cardinality || settings.get_enforce_broadcast_join()?) From e12d53cc7cdf09e83021ecb806b4c70bf7ba5761 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 14:17:06 +0800 Subject: [PATCH 33/38] z --- .../partitioned/compact_hash_table.rs | 114 +++--- .../partitioned/partitioned_build.rs | 362 ++++++++++++++++-- 2 files changed, 384 insertions(+), 92 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs index 02a7f141faff3..201e832b0e9da 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -77,62 +77,29 @@ impl CompactJoinHashTable { } } - /// Get the bucket mask for external hash computation. - pub fn bucket_mask(&self) -> usize { - self.bucket_mask - } - - /// Build the hash table from precomputed bucket numbers. - /// `bucket_nums[i]` is the bucket for row i (1-based indexing, skip index 0). 
- pub fn build(&mut self, bucket_nums: &[usize]) { - // bucket_nums[0] is unused (sentinel), actual rows start at index 1 - for (i, bucket_num) in bucket_nums.iter().enumerate().skip(1) { - let bucket = bucket_num & self.bucket_mask; - self.next[i] = self.first[bucket]; - self.first[bucket] = I::from_usize(i); + /// Create a direct-mapping hash table where keys are used as array indices. + /// `range` is `max_key - min_key`; the caller subtracts min_key before insertion/probe. + pub fn new_direct(num_rows: usize, range: usize) -> Self { + CompactJoinHashTable { + first: vec![I::ZERO; range + 1], + next: vec![I::ZERO; num_rows + 1], + bucket_mask: 0, } } - pub fn insert_chunk(&mut self, hashes: &[u64], row_offset: usize) { - let mask = self.bucket_mask; - for (i, h) in hashes.iter().enumerate() { + pub fn insert_chunk(&mut self, vals: &[u64], row_offset: usize) { + for (i, v) in vals.iter().enumerate() { let row_index = row_offset + i; - let bucket = (*h as usize) & mask; - self.next[row_index] = self.first[bucket]; - self.first[bucket] = I::from_usize(row_index); - } - } + let bucket = match DIRECT { + true => *v as usize, + false => (*v as usize) & self.bucket_mask, + }; - pub fn insert_chunk_with_validity( - &mut self, - hashes: &[u64], - row_offset: usize, - validity: &databend_common_column::bitmap::Bitmap, - ) { - let mask = self.bucket_mask; - for (i, h) in hashes.iter().enumerate() { - if !validity.get_bit(i) { - continue; - } - let row_index = row_offset + i; - let bucket = (*h as usize) & mask; self.next[row_index] = self.first[bucket]; self.first[bucket] = I::from_usize(row_index); } } - /// Get the first row index in the given bucket. - #[inline(always)] - pub fn first_index(&self, bucket: usize) -> I { - unsafe { *self.first.get_unchecked(bucket & self.bucket_mask) } - } - - /// Get the next row index in the chain. 
- #[inline(always)] - pub fn next_index(&self, row_index: I) -> I { - unsafe { *self.next.get_unchecked(row_index.to_usize()) } - } - fn calc_bucket_count(num_rows: usize) -> usize { if num_rows == 0 { return 1; @@ -142,13 +109,13 @@ impl CompactJoinHashTable { target.next_power_of_two() } - pub fn probe(&self, hashes: &mut [u64], bitmap: Option) -> usize { + pub fn probe(&self, vals: &mut [u64], bitmap: Option) -> usize { let mut valids = None; if let Some(bitmap) = bitmap { if bitmap.null_count() == bitmap.len() { - hashes.iter_mut().for_each(|hash| { - *hash = 0; + vals.iter_mut().for_each(|v| { + *v = 0; }); return 0; } else if bitmap.null_count() > 0 { @@ -157,34 +124,49 @@ impl CompactJoinHashTable { } let mut count = 0; + let first_len = self.first.len(); match valids { Some(valids) => { - valids - .iter() - .zip(hashes.iter_mut()) - .for_each(|(valid, hash)| { - if valid { - let bucket = (*hash as usize) & self.bucket_mask; - if self.first[bucket] != I::default() { - *hash = self.first[bucket].to_usize() as u64; - count += 1; - } else { - *hash = 0; + for (valid, val) in valids.iter().zip(vals.iter_mut()) { + if valid { + let bucket = match DIRECT { + false => (*val as usize) & self.bucket_mask, + true if (*val as usize) < first_len => *val as usize, + true => { + *val = 0; + continue; } + }; + + if self.first[bucket] != I::default() { + *val = self.first[bucket].to_usize() as u64; + count += 1; } else { - *hash = 0; + *val = 0; } - }); + } else { + *val = 0; + } + } } None => { - hashes.iter_mut().for_each(|hash| { - let bucket = (*hash as usize) & self.bucket_mask; + vals.iter_mut().for_each(|val| { + let bucket = if DIRECT { + let b = *val as usize; + if b >= first_len { + *val = 0; + return; + } + b + } else { + (*val as usize) & self.bucket_mask + }; if self.first[bucket] != I::default() { - *hash = self.first[bucket].to_usize() as u64; + *val = self.first[bucket].to_usize() as u64; count += 1; } else { - *hash = 0; + *val = 0; } }); } diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 1e8e7026eb613..94d792bc826ba 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -126,6 +126,9 @@ impl BuildKeysStates { } } +/// Maximum key range for direct hash join (same as Doris: 1 << 23 = 8M). +const DIRECT_JOIN_MAX_RANGE: u64 = 1 << 23; + /// Per-thread build state for partitioned hash join. pub struct PartitionedHashJoinState { pub chunks: Vec, @@ -139,6 +142,9 @@ pub struct PartitionedHashJoinState { pub num_rows: usize, pub build_block_idx: usize, + pub direct_join: bool, + pub min_key: u256, + pub visited: Vec>, pub desc: Arc, pub function_ctx: Arc, @@ -163,6 +169,8 @@ impl PartitionedHashJoinState { desc, function_ctx, build_block_idx: 0, + direct_join: false, + min_key: u256::ZERO, visited: vec![], accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), } @@ -248,9 +256,6 @@ impl PartitionedHashJoinState { } if self.build_block_idx == 0 { - // Allocate hash table with known total rows - self.hash_table = CompactJoinHashTable::new(self.num_rows); - if let Some(first_chunk) = self.chunks.first() { self.column_types = (0..first_chunk.num_columns()) .map(|offset| first_chunk.get_by_offset(offset).data_type()) @@ -268,20 +273,96 @@ impl PartitionedHashJoinState { } self.columns = columns; } + + // Decide whether to use direct mapping + let direct_range = match &self.build_keys_states { + BuildKeysStates::UInt8(_) => Some((u256::ZERO, u8::MAX as u64)), + BuildKeysStates::UInt16(_) => Some((u256::ZERO, u16::MAX as u64)), + BuildKeysStates::UInt32(bufs) => scan_min_max_u32(bufs), + BuildKeysStates::UInt64(bufs) => scan_min_max_u64(bufs), + BuildKeysStates::UInt128(bufs) => 
scan_min_max_u128(bufs), + BuildKeysStates::UInt256(bufs) => scan_min_max_u256(bufs), + _ => None, + }; + + match direct_range { + Some((min_key, range)) => { + self.direct_join = true; + self.min_key = min_key; + self.hash_table = + CompactJoinHashTable::new_direct(self.num_rows, range as usize); + } + None => { + self.hash_table = CompactJoinHashTable::new(self.num_rows); + } + }; } let row_offset = CHUNK_SIZE * self.build_block_idx + 1; - let keys_state = self.build_keys_states.get(self.build_block_idx); + let idx = self.build_block_idx; - with_hash_method!(|T| match &self.method { - HashMethodKind::T(method) => { - let mut hashes = Vec::with_capacity(CHUNK_SIZE); - method.build_keys_hashes(&keys_state, &mut hashes); - self.hash_table.insert_chunk(&hashes, row_offset); - self.build_block_idx += 1; + if self.direct_join { + match &self.build_keys_states { + BuildKeysStates::UInt8(states) => { + let min_t = self.min_key.as_u8(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt16(states) => { + let min_t = self.min_key.as_u16(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt32(states) => { + let min_t = self.min_key.as_u32(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt64(states) => { + let min_t = self.min_key.as_u64(); + let adjusted: Vec = + states[idx].iter().map(|k| k.wrapping_sub(min_t)).collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt128(states) => { + let min_t = self.min_key.as_u128(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + 
self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt256(states) => { + let min_t = self.min_key; + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + _ => unreachable!(), } - }); + } else { + let keys_state = self.build_keys_states.get(idx); + with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let mut hashes = Vec::with_capacity(CHUNK_SIZE); + method.build_keys_hashes(&keys_state, &mut hashes); + self.hash_table.insert_chunk::(&hashes, row_offset); + } + }); + } + self.build_block_idx += 1; match self.build_block_idx == self.chunks.len() { true => Ok(None), false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), @@ -295,13 +376,20 @@ impl PartitionedHashJoinState { let num_rows = data.num_rows(); let (keys_block, valids) = data.into_raw(); let keys = ProjectedBlock::from(keys_block.columns()); - let mut hashes = Vec::with_capacity(num_rows); + if self.direct_join { + return self.probe_direct::(keys, num_rows, valids); + } + + let mut hashes = Vec::with_capacity(num_rows); let (keys_state, matched_rows) = with_hash_method!(|T| match &self.method { HashMethodKind::T(method) => { let keys_state = method.build_keys_state(keys, num_rows)?; method.build_keys_hashes(&keys_state, &mut hashes); - (keys_state, self.hash_table.probe(&mut hashes, valids)) + ( + keys_state, + self.hash_table.probe::(&mut hashes, valids), + ) } }); @@ -315,7 +403,7 @@ impl PartitionedHashJoinState { Ok(match (&self.method, &self.build_keys_states) { (HashMethodKind::KeysU8(_), BuildKeysStates::UInt8(states)) => { let probe_keys = u8::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -324,7 +412,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU16(_), 
BuildKeysStates::UInt16(states)) => { let probe_keys = u16::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -333,7 +421,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU32(_), BuildKeysStates::UInt32(states)) => { let probe_keys = u32::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -342,7 +430,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU64(_), BuildKeysStates::UInt64(states)) => { let probe_keys = u64::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -351,7 +439,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU128(_), BuildKeysStates::UInt128(states)) => { let probe_keys = u128::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -360,7 +448,7 @@ impl PartitionedHashJoinState { } (HashMethodKind::KeysU256(_), BuildKeysStates::UInt256(states)) => { let probe_keys = u256::downcast_owned(keys_state).unwrap(); - PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, u32>::new( + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, false, u32>::new( hashes, states, probe_keys, @@ -387,6 +475,145 @@ impl PartitionedHashJoinState { }) } + fn probe_direct<'a, const MATCHED: bool, const MATCH_FIRST: bool>( + &'a self, + keys: ProjectedBlock<'_>, + num_rows: usize, + valids: Option, + ) -> Result> { + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => 
method.build_keys_state(keys, num_rows)?, + }); + + Ok(match &self.build_keys_states { + BuildKeysStates::UInt8(bufs) => { + let min_t = self.min_key.as_u8(); + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt16(bufs) => { + let min_t = self.min_key.as_u16(); + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt32(bufs) => { + let min_t = self.min_key.as_u32(); + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt64(bufs) => { + let min_t = self.min_key.as_u64(); + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = + 
probe_keys.iter().map(|k| k.wrapping_sub(min_t)).collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt128(bufs) => { + let min_t = self.min_key.as_u128(); + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt256(bufs) => { + let min_t = self.min_key; + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }) + } + fn add_build_state(&mut self, state: KeysState) { match &mut self.build_keys_states { BuildKeysStates::UInt8(states) => { @@ -424,6 +651,7 @@ struct PrimitiveProbeStream< T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: bool, + const DIRECT: bool, I: RowIndex = u32, > { key_idx: usize, @@ -435,8 +663,11 @@ struct PrimitiveProbeStream< matched_num_rows: usize, } -impl<'a, T: Send + Sync 
+ PartialEq, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> - PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, I> +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const DIRECT: bool, I> + PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + T: Send + Sync + PartialEq, + I: RowIndex, { #[allow(clippy::new_ret_no_self)] pub fn new( @@ -457,8 +688,11 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: boo } } -impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> - ProbeStream for PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, I> +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const DIRECT: bool, I> ProbeStream + for PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + I: RowIndex, + T: Send + Sync + PartialEq, { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { while self.key_idx < self.probe_keys.len() { @@ -488,9 +722,12 @@ impl<'a, T: Send + Sync + PartialEq, const MATCHED: bool, const MATCH_FIRST: boo while self.build_idx != 0 { let row_ptr = flat_to_row_ptr(self.build_idx); - if self.probe_keys[self.key_idx] - == self.build_keys[row_ptr.chunk_index as usize][row_ptr.row_index as usize] - { + let key_match = DIRECT + || self.probe_keys[self.key_idx] + == self.build_keys[row_ptr.chunk_index as usize] + [row_ptr.row_index as usize]; + + if key_match { res.matched_build.push(row_ptr); res.matched_probe.push(self.key_idx as u64); self.matched_num_rows += 1; @@ -633,3 +870,76 @@ impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> ProbeStream Ok(()) } } + +/// Scan min/max with short-circuit per chunk. Returns Some((min as u256, range)) if range <= threshold. 
+fn scan_min_max_u32(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u32::MAX; + let mut max_val = u32::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if (max_val as u64).wrapping_sub(min_val as u64) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val as u64) - (min_val as u64))) +} + +fn scan_min_max_u64(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u64::MAX; + let mut max_val = u64::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), max_val - min_val)) +} + +fn scan_min_max_u128(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u128::MAX; + let mut max_val = u128::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE as u128 { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val - min_val) as u64)) +} + +fn scan_min_max_u256(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u256::MAX; + let mut max_val = u256::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > u256::from(DIRECT_JOIN_MAX_RANGE) { + return None; + } + } + if min_val > max_val { + return None; + } + Some((min_val, (max_val - min_val).as_u64())) +} From 394bfe869a66ab22396f075e045ae4484f5b7c00 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 16:35:17 +0800 Subject: [PATCH 34/38] z --- src/query/sql/src/planner/plans/join.rs | 32 +++++++++++++------------ 1 file changed, 17 insertions(+), 15 
deletions(-) diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs index 434b8455786eb..58ab3875ba103 100644 --- a/src/query/sql/src/planner/plans/join.rs +++ b/src/query/sql/src/planner/plans/join.rs @@ -553,7 +553,7 @@ impl Join { let max_build_rows = settings.get_broadcast_join_max_build_rows()?; if max_build_rows > 0 && settings.get_enable_partitioned_hash_join()? - && right_stat_info.cardinality > max_build_rows as f64 + && right_stat_info.cardinality >= max_build_rows as f64 { return Ok(true); } @@ -766,7 +766,7 @@ impl Operator for Join { fn compute_required_prop_children( &self, ctx: Arc, - _rel_expr: &RelExpr, + rel_expr: &RelExpr, _required: &RequiredProperty, ) -> Result>> { let mut children_required = vec![]; @@ -852,19 +852,21 @@ impl Operator for Join { | JoinType::Asof | JoinType::LeftAsof | JoinType::RightAsof - ) && !settings.get_enforce_shuffle_join()? - { - // (Any, Broadcast) - let left_distribution = Distribution::Any; - let right_distribution = Distribution::Broadcast; - children_required.push(vec![ - RequiredProperty { - distribution: left_distribution, - }, - RequiredProperty { - distribution: right_distribution, - }, - ]); + ) { + let right_stat_info = rel_expr.derive_cardinality_child(1)?; + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? 
{ + // (Any, Broadcast) + let left_distribution = Distribution::Any; + let right_distribution = Distribution::Broadcast; + children_required.push(vec![ + RequiredProperty { + distribution: left_distribution, + }, + RequiredProperty { + distribution: right_distribution, + }, + ]); + } } if children_required.is_empty() { From 4be08eb529581873070a1009b21f7b172e59a46f Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 19:49:22 +0800 Subject: [PATCH 35/38] z --- src/query/settings/src/settings_default.rs | 14 ++++ .../settings/src/settings_getter_setter.rs | 8 ++ .../read/parquet_data_source_deserializer.rs | 2 +- .../fuse/src/operations/read/read_state.rs | 81 ++++++++++++++++++- 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index c493f1024c27a..721ab896e0606 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -574,6 +574,20 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=u64::MAX)), }), + ("bloom_runtime_filter_selectivity_threshold", DefaultSettingValue { + value: UserSettingValue::UInt64(40), + desc: "Probe-side selectivity threshold (percentage) for bloom runtime filters. If a bloom filter filters less than this percentage of rows, it is temporarily disabled. Default 40 means 40%.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=100)), + }), + ("bloom_runtime_filter_sampling_frequency", DefaultSettingValue { + value: UserSettingValue::UInt64(32), + desc: "Number of block evaluations between re-checks of bloom runtime filter selectivity. 
After this many evaluations, counters reset and selectivity is re-evaluated.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(1..=u64::MAX)), + }), ("max_execute_time_in_seconds", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Sets the maximum query execution time in seconds. Setting it to 0 means no limit.", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index bb226bd36a2cd..4b9b1bc78f776 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -364,6 +364,14 @@ impl Settings { self.try_get_u64("bloom_runtime_filter_threshold") } + pub fn get_bloom_runtime_filter_selectivity_threshold(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_selectivity_threshold") + } + + pub fn get_bloom_runtime_filter_sampling_frequency(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_sampling_frequency") + } + pub fn get_min_max_runtime_filter_threshold(&self) -> Result { self.try_get_u64("min_max_runtime_filter_threshold") } diff --git a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs index 80c51a926b9f7..87f78b1729809 100644 --- a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs +++ b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs @@ -233,7 +233,7 @@ impl Processor for DeserializeDataTransform { let (mut data_block, row_selection, bitmap_selection) = self .read_state - .as_ref() + .as_mut() .unwrap() .deserialize_and_filter(columns_chunks, part)?; diff --git a/src/query/storages/fuse/src/operations/read/read_state.rs b/src/query/storages/fuse/src/operations/read/read_state.rs index 1218f605a5920..f23a66e383834 100644 --- a/src/query/storages/fuse/src/operations/read/read_state.rs +++ 
b/src/query/storages/fuse/src/operations/read/read_state.rs @@ -44,11 +44,67 @@ use crate::io::DataItem; use crate::io::RowSelection; use crate::pruning::ExprBloomFilter; +const DEFAULT_MIN_INPUT_ROWS: usize = 40960; + +#[derive(Clone)] +pub struct BloomFilterSelectivity { + input_rows: usize, + filtered_rows: usize, + eval_counter: usize, + always_true: bool, + sampling_frequency: usize, + selectivity_threshold: usize, + min_input_rows: usize, +} + +impl BloomFilterSelectivity { + pub fn new(selectivity_threshold: usize, sampling_frequency: usize) -> Self { + Self { + input_rows: 0, + filtered_rows: 0, + eval_counter: 0, + always_true: false, + sampling_frequency, + selectivity_threshold, + min_input_rows: DEFAULT_MIN_INPUT_ROWS, + } + } + + pub fn should_skip(&self) -> bool { + self.always_true + } + + pub fn update(&mut self, block_input_rows: usize, block_filtered_rows: usize) { + self.input_rows += block_input_rows; + self.filtered_rows += block_filtered_rows; + self.eval_counter += 1; + + if self.eval_counter >= self.sampling_frequency { + self.judge_selectivity(); + self.reset(); + } + } + + fn judge_selectivity(&mut self) { + if self.input_rows >= self.min_input_rows { + let selectivity_pct = (self.filtered_rows * 100) / self.input_rows; + self.always_true = selectivity_pct < self.selectivity_threshold; + } + } + + fn reset(&mut self) { + self.input_rows = 0; + self.filtered_rows = 0; + self.eval_counter = 0; + } +} + #[derive(Clone)] pub struct BloomRuntimeFilterRef { pub column_index: FieldIndex, pub filter: RuntimeBloomFilter, pub stats: Arc, + pub selectivity: BloomFilterSelectivity, } pub struct ReadState { @@ -98,6 +154,11 @@ impl ReadState { let prewhere_schema: DataSchema = (prewhere_reader.schema().as_ref()).into(); + let settings = ctx.get_settings(); + let selectivity_threshold = + settings.get_bloom_runtime_filter_selectivity_threshold()? as usize; + let sampling_frequency = settings.get_bloom_runtime_filter_sampling_frequency()? 
as usize; + let runtime_filters: Vec = runtime_filter_entries .into_iter() .filter_map(|entry| { @@ -107,6 +168,10 @@ impl ReadState { column_index, filter: bloom.filter, stats: entry.stats, + selectivity: BloomFilterSelectivity::new( + selectivity_threshold, + sampling_frequency, + ), }) }) .collect(); @@ -147,16 +212,24 @@ impl ReadState { } pub fn runtime_filter( - &self, + &mut self, block: &DataBlock, - _num_rows: usize, + num_rows: usize, ) -> Result> { let bloom_start = Instant::now(); let mut bitmaps = vec![]; - for runtime_filter in &self.runtime_filters { + for runtime_filter in &mut self.runtime_filters { + if runtime_filter.selectivity.should_skip() { + continue; + } + let probe_column = block.get_by_offset(runtime_filter.column_index).to_column(); let bitmap = ExprBloomFilter::new(&runtime_filter.filter).apply(probe_column)?; + + let filtered_rows = bitmap.null_count(); + runtime_filter.selectivity.update(num_rows, filtered_rows); + bitmaps.push(bitmap); } @@ -175,7 +248,7 @@ impl ReadState { } pub fn deserialize_and_filter( - &self, + &mut self, columns_chunks: HashMap, part: &FuseBlockPartInfo, ) -> Result<(DataBlock, Option, Option)> { From f6ce8466591cd355b8f11111d2fc8a84a14f94a4 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Thu, 2 Apr 2026 20:12:21 +0800 Subject: [PATCH 36/38] z --- .../service/tests/it/storages/fuse/operations/prewhere.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs index e20caf0a88cee..941a04b9a3bc2 100644 --- a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs +++ b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs @@ -70,7 +70,8 @@ async fn test_prewhere() -> Result<()> { let _ = _fixture; // Create ReadState which combines prewhere and runtime filter logic - let read_state = ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), 
&block_reader)?; + let mut read_state = + ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), &block_reader)?; // Use the new unified API that handles all states internally let (data_block, _row_selection, bitmap_selection) = From 9941d2253b50e370267a537565abe107b873170c Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Mon, 6 Apr 2026 12:07:51 +0800 Subject: [PATCH 37/38] z --- .../src/pipelines/executor/executor_graph.rs | 46 ++++++++++++++++++- .../partitioned/compact_hash_table.rs | 37 +++++++++++++++ .../partitioned/partitioned_build.rs | 5 +- .../flight/v1/exchange/hash_send_transform.rs | 16 +++++++ .../fuse/src/operations/read/read_state.rs | 10 ++-- 5 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/query/service/src/pipelines/executor/executor_graph.rs b/src/query/service/src/pipelines/executor/executor_graph.rs index 7e272dbdb7b29..48dda22a0ae00 100644 --- a/src/query/service/src/pipelines/executor/executor_graph.rs +++ b/src/query/service/src/pipelines/executor/executor_graph.rs @@ -100,6 +100,9 @@ pub(crate) struct Node { updated_list: Arc, inputs_port: Vec>, outputs_port: Vec>, + + // Event counters: [NeedData, NeedConsume, Sync, Async, Finished] + pub(crate) event_counts: [AtomicU64; 5], } impl Node { @@ -148,6 +151,7 @@ impl Node { inputs_port: inputs_port.to_vec(), outputs_port: outputs_port.to_vec(), tracking_payload, + event_counts: Default::default(), }) } @@ -462,14 +466,22 @@ impl ExecutingGraph { ); let processor_state = match event { Event::Finished => { + node.event_counts[4].fetch_add(1, Ordering::Relaxed); if !matches!(state_guard_cache.as_deref(), Some(State::Finished)) { locker.finished_nodes.fetch_add(1, Ordering::SeqCst); } State::Finished } - Event::NeedData | Event::NeedConsume => State::Idle, + Event::NeedData | Event::NeedConsume => { + match event { + Event::NeedData => node.event_counts[0].fetch_add(1, Ordering::Relaxed), + _ => node.event_counts[1].fetch_add(1, Ordering::Relaxed), + }; + State::Idle + } 
Event::Sync => { + node.event_counts[2].fetch_add(1, Ordering::Relaxed); schedule_queue.push_sync(ProcessorWrapper { processor: node.processor.clone(), graph: graph.clone(), @@ -478,6 +490,7 @@ impl ExecutingGraph { State::Processing } Event::Async => { + node.event_counts[3].fetch_add(1, Ordering::Relaxed); schedule_queue.push_async(ProcessorWrapper { processor: node.processor.clone(), graph: graph.clone(), @@ -1087,6 +1100,37 @@ impl RunningGraph { impl Drop for RunningGraph { fn drop(&mut self) { + // Log per-processor event counts + { + use std::collections::BTreeMap; + let mut aggregated: BTreeMap = BTreeMap::new(); + for node in self.0.graph.node_weights() { + let name = unsafe { node.processor.name() }; + let counts = [ + node.event_counts[0].load(Ordering::Relaxed), + node.event_counts[1].load(Ordering::Relaxed), + node.event_counts[2].load(Ordering::Relaxed), + node.event_counts[3].load(Ordering::Relaxed), + node.event_counts[4].load(Ordering::Relaxed), + ]; + let total: u64 = counts.iter().sum(); + if total == 0 { + continue; + } + let entry = aggregated.entry(name).or_insert([0; 5]); + for i in 0..5 { + entry[i] += counts[i]; + } + } + for (name, c) in &aggregated { + let total: u64 = c.iter().sum(); + log::info!( + "Processor event stats: {} => total={}, NeedData={}, NeedConsume={}, Sync={}, Async={}, Finished={}", + name, total, c[0], c[1], c[2], c[3], c[4] + ); + } + } + let execution_stats = self.get_query_execution_stats(); if let Ok(queue) = QueryExecutionStatsQueue::instance() { let _ = queue.append_data((self.get_query_id().to_string(), execution_stats)); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs index 201e832b0e9da..d881365a47d5f 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -109,6 +109,43 @@ impl CompactJoinHashTable { target.next_power_of_two() } + pub fn log_stats(&self) { + let num_buckets = self.first.len(); + let mut non_empty = 0usize; + let mut max_chain = 0usize; + let mut total_chain = 0usize; + + for i in 0..num_buckets { + if self.first[i] != I::ZERO { + non_empty += 1; + let mut chain_len = 0usize; + let mut idx = self.first[i].to_usize(); + while idx != 0 { + chain_len += 1; + idx = self.next[idx].to_usize(); + } + max_chain = max_chain.max(chain_len); + total_chain += chain_len; + } + } + + let avg_chain = if non_empty > 0 { + total_chain as f64 / non_empty as f64 + } else { + 0.0 + }; + let occupancy = if num_buckets > 0 { + non_empty as f64 / num_buckets as f64 * 100.0 + } else { + 0.0 + }; + + log::info!( + "CompactJoinHashTable stats: buckets={}, non_empty={}, occupancy={:.1}%, total_rows={}, avg_chain={:.2}, max_chain={}", + num_buckets, non_empty, occupancy, total_chain, avg_chain, max_chain + ); + } + pub fn probe(&self, vals: &mut [u64], bitmap: Option) -> usize { let mut valids = None; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 94d792bc826ba..2d77de1a8c670 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -364,7 +364,10 @@ impl PartitionedHashJoinState { self.build_block_idx += 1; match self.build_block_idx == self.chunks.len() { - true => Ok(None), + true => { + self.hash_table.log_stats(); + Ok(None) + } false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), } } diff --git a/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs 
b/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs index 5cf4a674eb972..b47c5e1c806dd 100644 --- a/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs +++ b/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs @@ -44,6 +44,7 @@ pub struct HashSendTransform { tasks: SyncTaskSet, channels: Vec>, handle: Option>>>, + partition_row_counts: Vec, } impl HashSendTransform { @@ -73,6 +74,7 @@ impl HashSendTransform { ), handle: None, id: NodeIndex::default(), + partition_row_counts: vec![0; scatter_size], })); PipeItem::create(processor, vec![input], vec![output]) @@ -121,6 +123,7 @@ impl Processor for HashSendTransform { if block.is_empty() { continue; } + self.partition_row_counts[partition_id] += block.num_rows(); if partition_id == self.local_pos { if self.output.is_finished() { @@ -159,6 +162,19 @@ impl Processor for HashSendTransform { if self.input.is_finished() { self.output.finish(); + // Log partition row distribution + let total: usize = self.partition_row_counts.iter().sum(); + if total > 0 { + let avg = total / self.partition_row_counts.len().max(1); + let max_rows = self.partition_row_counts.iter().max().copied().unwrap_or(0); + let min_rows = self.partition_row_counts.iter().min().copied().unwrap_or(0); + let skew = if avg > 0 { max_rows as f64 / avg as f64 } else { 0.0 }; + log::info!( + "HashSendTransform partition distribution: total={}, partitions={}, min={}, max={}, avg={}, skew={:.2}x, counts={:?}", + total, self.partition_row_counts.len(), min_rows, max_rows, avg, skew, self.partition_row_counts + ); + } + let mut futures = Vec::new(); for partition_id in 0..self.channels.len() { diff --git a/src/query/storages/fuse/src/operations/read/read_state.rs b/src/query/storages/fuse/src/operations/read/read_state.rs index f23a66e383834..97584d26c56ad 100644 --- a/src/query/storages/fuse/src/operations/read/read_state.rs +++ b/src/query/storages/fuse/src/operations/read/read_state.rs @@ -81,7 +81,9 
@@ impl BloomFilterSelectivity { if self.eval_counter >= self.sampling_frequency { self.judge_selectivity(); - self.reset(); + self.input_rows = 0; + self.filtered_rows = 0; + self.eval_counter = 0; } } @@ -91,12 +93,6 @@ impl BloomFilterSelectivity { self.always_true = selectivity_pct < self.selectivity_threshold; } } - - fn reset(&mut self) { - self.input_rows = 0; - self.filtered_rows = 0; - self.eval_counter = 0; - } } #[derive(Clone)] From f5a9b789c98a3dd1a00c33734850467c2f7df5d4 Mon Sep 17 00:00:00 2001 From: zhang2014 Date: Wed, 8 Apr 2026 15:17:38 +0800 Subject: [PATCH 38/38] Revert "z" This reverts commit 9941d2253b50e370267a537565abe107b873170c. --- .../src/pipelines/executor/executor_graph.rs | 46 +------------------ .../partitioned/compact_hash_table.rs | 37 --------------- .../partitioned/partitioned_build.rs | 5 +- .../flight/v1/exchange/hash_send_transform.rs | 16 ------- .../fuse/src/operations/read/read_state.rs | 10 ++-- 5 files changed, 9 insertions(+), 105 deletions(-) diff --git a/src/query/service/src/pipelines/executor/executor_graph.rs b/src/query/service/src/pipelines/executor/executor_graph.rs index 48dda22a0ae00..7e272dbdb7b29 100644 --- a/src/query/service/src/pipelines/executor/executor_graph.rs +++ b/src/query/service/src/pipelines/executor/executor_graph.rs @@ -100,9 +100,6 @@ pub(crate) struct Node { updated_list: Arc, inputs_port: Vec>, outputs_port: Vec>, - - // Event counters: [NeedData, NeedConsume, Sync, Async, Finished] - pub(crate) event_counts: [AtomicU64; 5], } impl Node { @@ -151,7 +148,6 @@ impl Node { inputs_port: inputs_port.to_vec(), outputs_port: outputs_port.to_vec(), tracking_payload, - event_counts: Default::default(), }) } @@ -466,22 +462,14 @@ impl ExecutingGraph { ); let processor_state = match event { Event::Finished => { - node.event_counts[4].fetch_add(1, Ordering::Relaxed); if !matches!(state_guard_cache.as_deref(), Some(State::Finished)) { locker.finished_nodes.fetch_add(1, Ordering::SeqCst); } 
State::Finished } - Event::NeedData | Event::NeedConsume => { - match event { - Event::NeedData => node.event_counts[0].fetch_add(1, Ordering::Relaxed), - _ => node.event_counts[1].fetch_add(1, Ordering::Relaxed), - }; - State::Idle - } + Event::NeedData | Event::NeedConsume => State::Idle, Event::Sync => { - node.event_counts[2].fetch_add(1, Ordering::Relaxed); schedule_queue.push_sync(ProcessorWrapper { processor: node.processor.clone(), graph: graph.clone(), @@ -490,7 +478,6 @@ impl ExecutingGraph { State::Processing } Event::Async => { - node.event_counts[3].fetch_add(1, Ordering::Relaxed); schedule_queue.push_async(ProcessorWrapper { processor: node.processor.clone(), graph: graph.clone(), @@ -1100,37 +1087,6 @@ impl RunningGraph { impl Drop for RunningGraph { fn drop(&mut self) { - // Log per-processor event counts - { - use std::collections::BTreeMap; - let mut aggregated: BTreeMap = BTreeMap::new(); - for node in self.0.graph.node_weights() { - let name = unsafe { node.processor.name() }; - let counts = [ - node.event_counts[0].load(Ordering::Relaxed), - node.event_counts[1].load(Ordering::Relaxed), - node.event_counts[2].load(Ordering::Relaxed), - node.event_counts[3].load(Ordering::Relaxed), - node.event_counts[4].load(Ordering::Relaxed), - ]; - let total: u64 = counts.iter().sum(); - if total == 0 { - continue; - } - let entry = aggregated.entry(name).or_insert([0; 5]); - for i in 0..5 { - entry[i] += counts[i]; - } - } - for (name, c) in &aggregated { - let total: u64 = c.iter().sum(); - log::info!( - "Processor event stats: {} => total={}, NeedData={}, NeedConsume={}, Sync={}, Async={}, Finished={}", - name, total, c[0], c[1], c[2], c[3], c[4] - ); - } - } - let execution_stats = self.get_query_execution_stats(); if let Ok(queue) = QueryExecutionStatsQueue::instance() { let _ = queue.append_data((self.get_query_id().to_string(), execution_stats)); diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs index d881365a47d5f..201e832b0e9da 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -109,43 +109,6 @@ impl CompactJoinHashTable { target.next_power_of_two() } - pub fn log_stats(&self) { - let num_buckets = self.first.len(); - let mut non_empty = 0usize; - let mut max_chain = 0usize; - let mut total_chain = 0usize; - - for i in 0..num_buckets { - if self.first[i] != I::ZERO { - non_empty += 1; - let mut chain_len = 0usize; - let mut idx = self.first[i].to_usize(); - while idx != 0 { - chain_len += 1; - idx = self.next[idx].to_usize(); - } - max_chain = max_chain.max(chain_len); - total_chain += chain_len; - } - } - - let avg_chain = if non_empty > 0 { - total_chain as f64 / non_empty as f64 - } else { - 0.0 - }; - let occupancy = if num_buckets > 0 { - non_empty as f64 / num_buckets as f64 * 100.0 - } else { - 0.0 - }; - - log::info!( - "CompactJoinHashTable stats: buckets={}, non_empty={}, occupancy={:.1}%, total_rows={}, avg_chain={:.2}, max_chain={}", - num_buckets, non_empty, occupancy, total_chain, avg_chain, max_chain - ); - } - pub fn probe(&self, vals: &mut [u64], bitmap: Option) -> usize { let mut valids = None; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs index 2d77de1a8c670..94d792bc826ba 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ 
-364,10 +364,7 @@ impl PartitionedHashJoinState { self.build_block_idx += 1; match self.build_block_idx == self.chunks.len() { - true => { - self.hash_table.log_stats(); - Ok(None) - } + true => Ok(None), false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), } } diff --git a/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs b/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs index b47c5e1c806dd..5cf4a674eb972 100644 --- a/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs +++ b/src/query/service/src/servers/flight/v1/exchange/hash_send_transform.rs @@ -44,7 +44,6 @@ pub struct HashSendTransform { tasks: SyncTaskSet, channels: Vec>, handle: Option>>>, - partition_row_counts: Vec, } impl HashSendTransform { @@ -74,7 +73,6 @@ impl HashSendTransform { ), handle: None, id: NodeIndex::default(), - partition_row_counts: vec![0; scatter_size], })); PipeItem::create(processor, vec![input], vec![output]) @@ -123,7 +121,6 @@ impl Processor for HashSendTransform { if block.is_empty() { continue; } - self.partition_row_counts[partition_id] += block.num_rows(); if partition_id == self.local_pos { if self.output.is_finished() { @@ -162,19 +159,6 @@ impl Processor for HashSendTransform { if self.input.is_finished() { self.output.finish(); - // Log partition row distribution - let total: usize = self.partition_row_counts.iter().sum(); - if total > 0 { - let avg = total / self.partition_row_counts.len().max(1); - let max_rows = self.partition_row_counts.iter().max().copied().unwrap_or(0); - let min_rows = self.partition_row_counts.iter().min().copied().unwrap_or(0); - let skew = if avg > 0 { max_rows as f64 / avg as f64 } else { 0.0 }; - log::info!( - "HashSendTransform partition distribution: total={}, partitions={}, min={}, max={}, avg={}, skew={:.2}x, counts={:?}", - total, self.partition_row_counts.len(), min_rows, max_rows, avg, skew, self.partition_row_counts - ); - } - let mut futures = Vec::new(); for 
partition_id in 0..self.channels.len() { diff --git a/src/query/storages/fuse/src/operations/read/read_state.rs b/src/query/storages/fuse/src/operations/read/read_state.rs index 97584d26c56ad..f23a66e383834 100644 --- a/src/query/storages/fuse/src/operations/read/read_state.rs +++ b/src/query/storages/fuse/src/operations/read/read_state.rs @@ -81,9 +81,7 @@ impl BloomFilterSelectivity { if self.eval_counter >= self.sampling_frequency { self.judge_selectivity(); - self.input_rows = 0; - self.filtered_rows = 0; - self.eval_counter = 0; + self.reset(); } } @@ -93,6 +91,12 @@ impl BloomFilterSelectivity { self.always_true = selectivity_pct < self.selectivity_threshold; } } + + fn reset(&mut self) { + self.input_rows = 0; + self.filtered_rows = 0; + self.eval_counter = 0; + } } #[derive(Clone)]