diff --git a/Cargo.lock b/Cargo.lock index aac946db8ee27..204e5c4913bb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6023,6 +6023,7 @@ dependencies = [ "sha2", "socket2 0.5.9", "sqlx", + "strength_reduce", "sysinfo", "tantivy", "temp-env", diff --git a/src/query/catalog/src/sbbf.rs b/src/query/catalog/src/sbbf.rs index 5156d3196e221..30f123a259c99 100644 --- a/src/query/catalog/src/sbbf.rs +++ b/src/query/catalog/src/sbbf.rs @@ -73,9 +73,15 @@ //! [sbbf-paper]: https://arxiv.org/pdf/2101.01719 //! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf -use core::simd::Simd; -use core::simd::cmp::SimdPartialEq; +// Use NEON intrinsics on aarch64 for better performance +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; use std::mem::size_of; +// Use portable SIMD on other platforms +#[cfg(not(target_arch = "aarch64"))] +use std::simd::Simd; +#[cfg(not(target_arch = "aarch64"))] +use std::simd::cmp::SimdPartialEq; use std::sync::Arc; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; @@ -83,7 +89,11 @@ use std::sync::atomic::Ordering; use databend_common_base::runtime::Runtime; /// Salt values as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). -const SALT: [u32; 8] = [ +/// 32-byte aligned for optimal SIMD load performance. +#[repr(C, align(32))] +struct AlignedSalt([u32; 8]); + +static SALT: AlignedSalt = AlignedSalt([ 0x47b6137b_u32, 0x44974d91_u32, 0x8824ad5b_u32, @@ -92,7 +102,10 @@ const SALT: [u32; 8] = [ 0x2df1424b_u32, 0x9efc4947_u32, 0x5c6bfb31_u32, -]; +]); + +/// Shift amount for extracting bit index: (hash * salt) >> 27 gives 5 bits (0-31) +const SHIFT_NUM: i32 = 27; /// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. /// Each word is thought of as an array of bits; each bit is either "set" or "not set". 
@@ -100,6 +113,7 @@ const SALT: [u32; 8] = [ #[repr(transparent)] struct Block([u32; 8]); +#[cfg(not(target_arch = "aarch64"))] type U32x8 = Simd; impl Block { @@ -107,6 +121,33 @@ impl Block { /// takes as its argument a single unsigned 32-bit integer and returns a block in which each /// word has exactly one bit set. + #[cfg(target_arch = "aarch64")] + #[inline] + fn mask(x: u32) -> Self { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(x); + let mut result = [0u32; 8]; + vst1q_u32_x2(result.as_mut_ptr(), uint32x4x2_t(mask_lo, mask_hi)); + Self(result) + } + } + + #[cfg(target_arch = "aarch64")] + #[inline(always)] + unsafe fn mask_neon(x: u32) -> (uint32x4_t, uint32x4_t) { + unsafe { + let ones = vdupq_n_u32(1); + let hash_data = vdupq_n_u32(x); + let salt = vld1q_u32_x2(SALT.0.as_ptr()); + let bit_index_lo = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.0, hash_data))); + let bit_index_hi = + vreinterpretq_s32_u32(vshrq_n_u32::(vmulq_u32(salt.1, hash_data))); + (vshlq_u32(ones, bit_index_lo), vshlq_u32(ones, bit_index_hi)) + } + } + + #[cfg(not(target_arch = "aarch64"))] fn mask(x: u32) -> Self { Self(Self::mask_simd(x).to_array()) } @@ -132,6 +173,18 @@ impl Block { } /// Setting every bit in the block that was also set in the result from mask + #[cfg(target_arch = "aarch64")] + #[inline] + fn insert(&mut self, hash: u32) { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + let result = uint32x4x2_t(vorrq_u32(data.0, mask_lo), vorrq_u32(data.1, mask_hi)); + vst1q_u32_x2(self.0.as_mut_ptr(), result); + } + } + + #[cfg(not(target_arch = "aarch64"))] fn insert(&mut self, hash: u32) { let mask = Self::mask(hash); for i in 0..8 { @@ -140,16 +193,30 @@ impl Block { } /// Returns true when every bit that is set in the result of mask is also set in the block. 
+ #[cfg(target_arch = "aarch64")] + #[inline] + fn check(&self, hash: u32) -> bool { + unsafe { + let (mask_lo, mask_hi) = Self::mask_neon(hash); + let data = vld1q_u32_x2(self.0.as_ptr()); + // vbicq_u32(a, b) = a & !b: bits set in mask but not in data + let miss = vorrq_u32(vbicq_u32(mask_lo, data.0), vbicq_u32(mask_hi, data.1)); + vmaxvq_u32(miss) == 0 + } + } + + #[cfg(not(target_arch = "aarch64"))] fn check(&self, hash: u32) -> bool { let mask = Self::mask_simd(hash); let block_vec = U32x8::from_array(self.0); (block_vec & mask).simd_ne(U32x8::splat(0)).all() } + #[cfg(not(target_arch = "aarch64"))] #[inline(always)] fn mask_simd(x: u32) -> U32x8 { let hash_vec = U32x8::splat(x); - let salt_vec = U32x8::from_array(SALT); + let salt_vec = U32x8::from_array(SALT.0); let bit_index = (hash_vec * salt_vec) >> U32x8::splat(27); U32x8::splat(1) << bit_index } @@ -199,7 +266,7 @@ pub struct Sbbf(Vec); pub struct SbbfAtomic(Vec); pub(crate) const BITSET_MIN_LENGTH: usize = 32; -pub(crate) const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; +pub(crate) const BITSET_MAX_LENGTH: usize = 64 * 1024 * 1024; #[inline] fn hash_to_block_index_for_blocks(hash: u64, num_blocks: usize) -> usize { @@ -306,6 +373,28 @@ impl Sbbf { pub fn estimated_memory_size(&self) -> usize { self.0.capacity() * std::mem::size_of::() } + + /// Zero-copy serialize to Vec, consuming self. + pub fn into_u32s(self) -> Vec { + let mut blocks = std::mem::ManuallyDrop::new(self.0); + let ptr = blocks.as_mut_ptr() as *mut u32; + let len = blocks.len() * 8; + let cap = blocks.capacity() * 8; + unsafe { Vec::from_raw_parts(ptr, len, cap) } + } + + /// Zero-copy deserialize from Vec. + /// Returns None if length is not a multiple of 8 (one Block = 8 x u32). 
+ pub fn from_u32s(words: Vec) -> Option { + if words.is_empty() || !words.len().is_multiple_of(8) { + return None; + } + let mut words = std::mem::ManuallyDrop::new(words); + let len = words.len() / 8; + let cap = words.capacity() / 8; + let ptr = words.as_mut_ptr() as *mut Block; + Some(Self(unsafe { Vec::from_raw_parts(ptr, len, cap) })) + } } impl SbbfAtomic { @@ -497,7 +586,7 @@ mod tests { (33, 64), (99, 128), (1024, 1024), - (999_000_000, 128 * 1024 * 1024), + (999_000_000, 64 * 1024 * 1024), ] { assert_eq!(*expected, optimal_num_of_bytes(*input)); } @@ -529,4 +618,49 @@ mod tests { assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64); } } + + #[test] + fn test_sbbf_to_bytes_from_bytes_roundtrip() { + let mut filter = Sbbf::new_with_ndv_fpp(1000, 0.01).unwrap(); + let hashes: Vec = (0..500).collect(); + filter.insert_hash_batch(&hashes); + + let words = filter.into_u32s(); + let restored = Sbbf::from_u32s(words).unwrap(); + + for hash in &hashes { + assert!(restored.check_hash(*hash)); + } + } + + #[test] + fn test_sbbf_from_u32s_invalid() { + assert!(Sbbf::from_u32s(vec![]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 7]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 9]).is_none()); + assert!(Sbbf::from_u32s(vec![0; 8]).is_some()); + assert!(Sbbf::from_u32s(vec![0; 16]).is_some()); + } + + #[test] + fn test_sbbf_union_after_serialization() { + let mut f1 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 0..50 { + f1.insert_hash(i); + } + let mut f2 = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + for i in 50..100 { + f2.insert_hash(i); + } + + let words1 = f1.into_u32s(); + let words2 = f2.into_u32s(); + let mut restored1 = Sbbf::from_u32s(words1).unwrap(); + let restored2 = Sbbf::from_u32s(words2).unwrap(); + restored1.union(&restored2); + + for i in 0..100 { + assert!(restored1.check_hash(i)); + } + } } diff --git a/src/query/expression/src/aggregate/group_hash.rs b/src/query/expression/src/aggregate/group_hash.rs index 
cfb29554f3620..9b68cf7635b0d 100644 --- a/src/query/expression/src/aggregate/group_hash.rs +++ b/src/query/expression/src/aggregate/group_hash.rs @@ -60,7 +60,7 @@ pub fn group_hash_entries(entries: ProjectedBlock, values: &mut [u64]) { } } -fn combine_group_hash_column(c: &Column, values: &mut [u64]) { +pub fn combine_group_hash_column(c: &Column, values: &mut [u64]) { HashVisitor:: { values } .visit_column(c.clone()) .unwrap() diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 547d06c4f83af..c1b10b4f8a1af 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -164,6 +164,7 @@ serde_urlencoded = { workspace = true } sha2 = { workspace = true } socket2 = { workspace = true } sqlx = { workspace = true } +strength_reduce = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } diff --git a/src/query/service/src/physical_plans/physical_add_stream_column.rs b/src/query/service/src/physical_plans/physical_add_stream_column.rs index cf11d49dad7b9..50ec67fa873f0 100644 --- a/src/query/service/src/physical_plans/physical_add_stream_column.rs +++ b/src/query/service/src/physical_plans/physical_add_stream_column.rs @@ -36,6 +36,7 @@ use databend_common_sql::StreamContext; use databend_common_sql::Symbol; use databend_common_sql::Visibility; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::BoundColumnRef; use databend_common_sql::plans::ConstantExpr; use databend_common_sql::plans::FunctionCall; @@ -69,6 +70,10 @@ impl IPhysicalPlan for AddStreamColumn { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_aggregate_expand.rs 
b/src/query/service/src/physical_plans/physical_aggregate_expand.rs index ea747aace91eb..0585b1ea85503 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_expand.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_expand.rs @@ -22,6 +22,7 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::GroupingSets; use crate::physical_plans::explain::PlanStatsInfo; @@ -58,6 +59,10 @@ impl IPhysicalPlan for AggregateExpand { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_aggregate_final.rs b/src/query/service/src/physical_plans/physical_aggregate_final.rs index 67008d6f96690..f37b41671fd70 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_final.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_final.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Aggregate; @@ -111,6 +112,13 @@ impl IPhysicalPlan for AggregateFinal { Ok(AggregateFinalFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.group_by.is_empty() { + true => DataDistribution::Serial, + false => DataDistribution::Random, + } 
+ } + fn get_desc(&self) -> Result { Ok(self.agg_funcs.iter().map(|x| x.display.clone()).join(", ")) } diff --git a/src/query/service/src/physical_plans/physical_aggregate_partial.rs b/src/query/service/src/physical_plans/physical_aggregate_partial.rs index c2d39d7430af3..247099bf3405d 100644 --- a/src/query/service/src/physical_plans/physical_aggregate_partial.rs +++ b/src/query/service/src/physical_plans/physical_aggregate_partial.rs @@ -32,6 +32,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::sorts::TransformRankLimitSort; use databend_common_sql::Symbol; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_storage::DataOperator; use itertools::Itertools; @@ -81,6 +82,10 @@ impl IPhysicalPlan for AggregatePartial { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_async_func.rs b/src/query/service/src/physical_plans/physical_async_func.rs index b32fd60af7ddd..08ec88a4edc95 100644 --- a/src/query/service/src/physical_plans/physical_async_func.rs +++ b/src/query/service/src/physical_plans/physical_async_func.rs @@ -24,6 +24,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::ColumnSet; use databend_common_sql::ScalarExpr; use databend_common_sql::binder::AsyncFunctionDesc; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -59,6 +60,10 @@ impl IPhysicalPlan for AsyncFunction { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + 
self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_broadcast.rs b/src/query/service/src/physical_plans/physical_broadcast.rs index 88f961c5103ed..37a59c46261e7 100644 --- a/src/query/service/src/physical_plans/physical_broadcast.rs +++ b/src/query/service/src/physical_plans/physical_broadcast.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use super::Exchange; @@ -48,6 +49,10 @@ impl IPhysicalPlan for BroadcastSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(BroadcastSource { @@ -88,6 +93,10 @@ impl IPhysicalPlan for BroadcastSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_cache_scan.rs b/src/query/service/src/physical_plans/physical_cache_scan.rs index 0a01747df77e3..5110acb590559 100644 --- a/src/query/service/src/physical_plans/physical_cache_scan.rs +++ b/src/query/service/src/physical_plans/physical_cache_scan.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CacheSource; use 
crate::physical_plans::format::CacheScanFormatter; @@ -59,6 +60,10 @@ impl IPhysicalPlan for CacheScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(CacheScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_column_mutation.rs b/src/query/service/src/physical_plans/physical_column_mutation.rs index c1bb9b709ec8b..8fa9af0fd175e 100644 --- a/src/query/service/src/physical_plans/physical_column_mutation.rs +++ b/src/query/service/src/physical_plans/physical_column_mutation.rs @@ -25,6 +25,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::TransformSerializeBlock; @@ -65,6 +66,10 @@ impl IPhysicalPlan for ColumnMutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_commit_sink.rs b/src/query/service/src/physical_plans/physical_commit_sink.rs index c8f584484b0f0..59303d641a1e7 100644 --- a/src/query/service/src/physical_plans/physical_commit_sink.rs +++ b/src/query/service/src/physical_plans/physical_commit_sink.rs @@ -23,6 +23,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::UpdateStreamMetaReq; use databend_common_pipeline::core::ExecutionInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::plans::TruncateMode; use databend_common_storages_fuse::FuseTable; @@ -71,6 +72,10 @@ impl IPhysicalPlan for CommitSink { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_compact_source.rs b/src/query/service/src/physical_plans/physical_compact_source.rs index 52fa41a4b8533..94593db34eb87 100644 --- a/src/query/service/src/physical_plans/physical_compact_source.rs +++ b/src/query/service/src/physical_plans/physical_compact_source.rs @@ -31,6 +31,7 @@ use databend_common_pipeline::sources::EmptySource; use databend_common_pipeline::sources::PrefetchAsyncSourcer; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FuseTable; @@ -72,6 +73,10 @@ impl IPhysicalPlan for CompactSource { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(CompactSource { diff --git a/src/query/service/src/physical_plans/physical_constant_table_scan.rs b/src/query/service/src/physical_plans/physical_constant_table_scan.rs index 2024c4c5fe1d1..c568618b1ca67 100644 --- a/src/query/service/src/physical_plans/physical_constant_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_constant_table_scan.rs @@ -21,6 +21,7 @@ use databend_common_expression::DataSchemaRef; use 
databend_common_pipeline::sources::OneBlockSource; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::format::ConstantTableScanFormatter; @@ -56,6 +57,10 @@ impl IPhysicalPlan for ConstantTableScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ConstantTableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_copy_into_location.rs b/src/query/service/src/physical_plans/physical_copy_into_location.rs index 873dcdd3e4f16..678b8d81d3263 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_location.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_location.rs @@ -27,6 +27,7 @@ use databend_common_expression::TableSchemaRef; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_stage::StageSinkTable; use databend_storages_common_stage::CopyIntoLocationInfo; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -60,6 +61,10 @@ impl IPhysicalPlan for CopyIntoLocation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![ diff --git a/src/query/service/src/physical_plans/physical_copy_into_table.rs b/src/query/service/src/physical_plans/physical_copy_into_table.rs index a5b3bdaa34e3a..2c34ec02500e8 100644 --- a/src/query/service/src/physical_plans/physical_copy_into_table.rs +++ b/src/query/service/src/physical_plans/physical_copy_into_table.rs @@ -23,6 +23,7 @@ use 
databend_common_expression::DataSchemaRefExt; use databend_common_expression::Scalar; use databend_common_meta_app::schema::TableInfo; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::CopyIntoTableMode; use databend_common_sql::plans::ValidationMode; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -64,6 +65,10 @@ impl IPhysicalPlan for CopyIntoTable { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRefExt::create(vec![])) diff --git a/src/query/service/src/physical_plans/physical_cte_consumer.rs b/src/query/service/src/physical_plans/physical_cte_consumer.rs index 19f399c6b4fbb..252c52fddb057 100644 --- a/src/query/service/src/physical_plans/physical_cte_consumer.rs +++ b/src/query/service/src/physical_plans/physical_cte_consumer.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataField; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::IPhysicalPlan; use crate::physical_plans::PhysicalPlan; @@ -62,6 +63,10 @@ impl IPhysicalPlan for MaterializeCTERef { Ok(self.cte_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(MaterializeCTERef { diff --git a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs index 3c7280778f4a6..795744908abfb 100644 --- a/src/query/service/src/physical_plans/physical_distributed_insert_select.rs +++ 
b/src/query/service/src/physical_plans/physical_distributed_insert_select.rs @@ -21,6 +21,7 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for DistributedInsertSelect { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_eval_scalar.rs b/src/query/service/src/physical_plans/physical_eval_scalar.rs index 92f411baf6d2d..0b2ca0667c161 100644 --- a/src/query/service/src/physical_plans/physical_eval_scalar.rs +++ b/src/query/service/src/physical_plans/physical_eval_scalar.rs @@ -33,6 +33,7 @@ use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::Matcher; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::Filter; @@ -78,6 +79,10 @@ impl IPhysicalPlan for EvalScalar { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { if self.exprs.is_empty() { diff --git a/src/query/service/src/physical_plans/physical_exchange.rs b/src/query/service/src/physical_plans/physical_exchange.rs index d2c691dde0870..0fb0f69014d7d 100644 --- a/src/query/service/src/physical_plans/physical_exchange.rs +++ 
b/src/query/service/src/physical_plans/physical_exchange.rs @@ -21,6 +21,7 @@ use databend_common_expression::RemoteExpr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::optimizer::ir::SExpr; @@ -75,6 +76,16 @@ impl IPhysicalPlan for Exchange { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_sink.rs b/src/query/service/src/physical_plans/physical_exchange_sink.rs index 17f2e3b51d9e5..9c110e77b5682 100644 --- a/src/query/service/src/physical_plans/physical_exchange_sink.rs +++ b/src/query/service/src/physical_plans/physical_exchange_sink.rs @@ -18,6 +18,7 @@ use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::RemoteExpr; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use crate::physical_plans::format::ExchangeSinkFormatter; @@ -84,6 +85,16 @@ impl IPhysicalPlan for ExchangeSink { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => 
DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } diff --git a/src/query/service/src/physical_plans/physical_exchange_source.rs b/src/query/service/src/physical_plans/physical_exchange_source.rs index 79076f92fb2b4..772e32978026a 100644 --- a/src/query/service/src/physical_plans/physical_exchange_source.rs +++ b/src/query/service/src/physical_plans/physical_exchange_source.rs @@ -16,7 +16,10 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; +use databend_common_expression::RemoteExpr; use databend_common_pipeline::core::PlanScope; +use databend_common_sql::executor::physical_plans::DataDistribution; +use databend_common_sql::executor::physical_plans::FragmentKind; use crate::physical_plans::format::ExchangeSourceFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -35,6 +38,8 @@ pub struct ExchangeSource { // Fragment ID of source fragment pub source_fragment_id: usize, pub query_id: String, + pub kind: FragmentKind, + pub keys: Vec, } #[typetag::serde] @@ -63,6 +68,16 @@ impl IPhysicalPlan for ExchangeSource { true } + fn output_data_distribution(&self) -> DataDistribution { + match &self.kind { + FragmentKind::Init => DataDistribution::Random, + FragmentKind::Normal => DataDistribution::NodeHash(self.keys.clone()), + FragmentKind::Expansive => DataDistribution::Broadcast, + FragmentKind::Merge => DataDistribution::Serial, + FragmentKind::GlobalShuffle => DataDistribution::GlobalHash(self.keys.clone()), + } + } + fn display_in_profile(&self) -> bool { false } @@ -74,6 +89,8 @@ impl IPhysicalPlan for ExchangeSource { schema: self.schema.clone(), source_fragment_id: self.source_fragment_id, query_id: self.query_id.clone(), + kind: self.kind.clone(), + keys: self.keys.clone(), }) } diff --git a/src/query/service/src/physical_plans/physical_expression_scan.rs 
b/src/query/service/src/physical_plans/physical_expression_scan.rs index 8d13ffbe9ed7f..955a5885d2f31 100644 --- a/src/query/service/src/physical_plans/physical_expression_scan.rs +++ b/src/query/service/src/physical_plans/physical_expression_scan.rs @@ -22,6 +22,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -59,6 +60,10 @@ impl IPhysicalPlan for ExpressionScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_filter.rs b/src/query/service/src/physical_plans/physical_filter.rs index 8002add0b72a2..f4eb46c95cc55 100644 --- a/src/query/service/src/physical_plans/physical_filter.rs +++ b/src/query/service/src/physical_plans/physical_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -62,6 +63,10 @@ impl IPhysicalPlan for Filter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_hash_join.rs b/src/query/service/src/physical_plans/physical_hash_join.rs index 0a86fe90db3c7..058a0bf8b5756 
100644 --- a/src/query/service/src/physical_plans/physical_hash_join.rs +++ b/src/query/service/src/physical_plans/physical_hash_join.rs @@ -43,6 +43,7 @@ use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Join; @@ -71,9 +72,11 @@ use crate::pipelines::processors::HashJoinState; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HashJoinProbeState; use crate::pipelines::processors::transforms::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::SharedRuntimeFilterPackets; use crate::pipelines::processors::transforms::TransformHashJoin; use crate::pipelines::processors::transforms::TransformHashJoinBuild; use crate::pipelines::processors::transforms::TransformHashJoinProbe; +use crate::pipelines::processors::transforms::TransformPartitionedHashJoin; use crate::sessions::QueryContext; // Type aliases to simplify complex return types @@ -195,6 +198,24 @@ impl IPhysicalPlan for HashJoin { Ok(HashJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + let build_dist = self.build.output_data_distribution(); + let probe_dist = self.probe.output_data_distribution(); + + let can_preserve_global_hash = matches!( + &build_dist, + DataDistribution::GlobalHash(keys) if keys == &self.build_keys + ) && matches!( + &probe_dist, + DataDistribution::GlobalHash(keys) if keys == &self.probe_keys + ); + + match can_preserve_global_hash { + true => probe_dist, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { let mut conditions = self .build_keys @@ -312,7 +333,7 @@ impl IPhysicalPlan for HashJoin { && !enable_optimization && !self.need_hold_hash_table { - return 
self.build_new_join_pipeline(builder, desc); + return self.build_join(builder, desc); } // Create the join state with optimization flags @@ -436,18 +457,105 @@ impl HashJoin { Ok(()) } - fn build_new_join_pipeline( - &self, - builder: &mut PipelineBuilder, - desc: Arc, - ) -> Result<()> { - let factory = self.join_factory(builder, desc)?; + fn build_join(&self, pb: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let build_distribution = self.build.output_data_distribution(); + let global_hash_build = matches!(build_distribution, DataDistribution::GlobalHash(_)); + + let probe_distribution = self.probe.output_data_distribution(); + let global_hash_probe = matches!(probe_distribution, DataDistribution::GlobalHash(_)); - // We must build the runtime filter before constructing the child nodes, - // as we will inject some runtime filter information into the context for the child nodes to use. + let enable_partitioned = pb.settings.get_enable_partitioned_hash_join()?; + match global_hash_build + && global_hash_probe + && self.build_side_cache_info.is_none() + && enable_partitioned + { + true => self.shuffle_join(pb, desc), + false => self.broadcast_join(pb, desc), + } + } + + fn shuffle_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; + + let hash_key_types = self + .build_keys + .iter() + .zip(&desc.is_null_equal) + .map(|(expr, is_null_equal)| { + let expr = expr.as_expr(&BUILTIN_FUNCTIONS); + if *is_null_equal { + expr.data_type().clone() + } else { + expr.data_type().remove_nullable() + } + }) + .collect::>(); + let hash_method = DataBlock::choose_hash_method_with_types(&hash_key_types)?; + let max_block_size = builder.settings.get_max_block_size()? 
as usize; + + let mut sub_query_ctx = QueryContext::create_from(&builder.ctx); + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + self.build.build_pipeline(builder)?; + std::mem::swap(&mut builder.ctx, &mut sub_query_ctx); + let build_sinks = builder.main_pipeline.take_sinks(); + + self.probe.build_pipeline(builder)?; + let probe_sinks = builder.main_pipeline.take_sinks(); + + assert_eq!(build_sinks.len(), probe_sinks.len()); + let output_len = build_sinks.len(); + + let barrier = databend_common_base::base::Barrier::new(output_len); + let stage_sync_barrier = Arc::new(barrier); + let shared_rf_packets = SharedRuntimeFilterPackets::create(); + let mut join_sinks = Vec::with_capacity(output_len * 2); + let mut join_pipe_items = Vec::with_capacity(output_len); + for (build_sink, probe_sink) in build_sinks.into_iter().zip(probe_sinks.into_iter()) { + join_sinks.push(build_sink); + join_sinks.push(probe_sink); + + let build_input = InputPort::create(); + let probe_input = InputPort::create(); + let joined_output = OutputPort::create(); + + let join = TransformPartitionedHashJoin::create_join( + self.join_type, + hash_method.clone(), + desc.clone(), + builder.func_ctx.clone(), + max_block_size, + ); + + let hash_join = TransformPartitionedHashJoin::create( + build_input.clone(), + probe_input.clone(), + joined_output.clone(), + join, + stage_sync_barrier.clone(), + self.projections.clone(), + rf_desc.clone(), + shared_rf_packets.clone(), + )?; + + join_pipe_items.push(PipeItem::create( + hash_join, + vec![build_input, probe_input], + vec![joined_output], + )) + } + + builder.main_pipeline.extend_sinks(join_sinks); + let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); + builder.main_pipeline.add_pipe(join_pipe); + + Ok(()) + } + + fn broadcast_join(&self, builder: &mut PipelineBuilder, desc: Arc) -> Result<()> { + let factory = self.join_factory(builder, desc)?; let rf_desc = RuntimeFiltersDesc::create(&builder.ctx, self)?; - // After common 
subexpression elimination is completed, we can delete this type of code. { let state = factory.create_basic_state(0)?; @@ -467,7 +575,6 @@ impl HashJoin { self.probe.build_pipeline(builder)?; - // Aligning hash join build and probe parallelism let output_len = std::cmp::max(build_sinks.len(), builder.main_pipeline.output_len()); builder.main_pipeline.resize(output_len, false)?; @@ -514,11 +621,8 @@ impl HashJoin { let join_pipe = Pipe::create(output_len * 2, output_len, join_pipe_items); builder.main_pipeline.add_pipe(join_pipe); - // In the case of spilling, we need to share state among multiple threads - // Quickly fetch all data from this round to quickly start the next round - builder - .main_pipeline - .resize(builder.main_pipeline.output_len(), true) + let item_size = builder.main_pipeline.output_len(); + builder.main_pipeline.resize(item_size, true) } fn join_factory( @@ -1409,7 +1513,7 @@ impl PhysicalPlanBuilder { } for scalar in &join.non_equi_conditions { - predicates.push(resolve_scalar(scalar, &merged).map_err(|err|{ + predicates.push(resolve_scalar(scalar, &merged).map_err(|err| { err.add_message(format!( "Failed build nested loop filter schema: {merged:#?} non_equi_conditions: {:#?}", join.non_equi_conditions diff --git a/src/query/service/src/physical_plans/physical_limit.rs b/src/query/service/src/physical_plans/physical_limit.rs index 90e741c5772e1..bd5d93f466d3f 100644 --- a/src/query/service/src/physical_plans/physical_limit.rs +++ b/src/query/service/src/physical_plans/physical_limit.rs @@ -26,6 +26,7 @@ use databend_common_sql::ColumnEntry; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::PhysicalPlanBuilder; @@ -75,6 +76,10 @@ impl IPhysicalPlan for Limit { Ok(LimitFormatter::create(self)) } + fn output_data_distribution(&self) -> 
DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_materialized_cte.rs b/src/query/service/src/physical_plans/physical_materialized_cte.rs index f0d31445eb829..66e9a37b88b44 100644 --- a/src/query/service/src/physical_plans/physical_materialized_cte.rs +++ b/src/query/service/src/physical_plans/physical_materialized_cte.rs @@ -20,6 +20,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_sql::Symbol; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -60,6 +61,10 @@ impl IPhysicalPlan for MaterializedCTE { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.input.output_schema() diff --git a/src/query/service/src/physical_plans/physical_multi_table_insert.rs b/src/query/service/src/physical_plans/physical_multi_table_insert.rs index cefb3817f2e22..12f24daa36a0a 100644 --- a/src/query/service/src/physical_plans/physical_multi_table_insert.rs +++ b/src/query/service/src/physical_plans/physical_multi_table_insert.rs @@ -39,6 +39,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::columns::TransformAddComputedColumns; use databend_common_pipeline_transforms::sorts::TransformSortPartial; use databend_common_sql::DefaultExprBinder; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::CommitMultiTableInsert; use 
databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -86,6 +87,10 @@ impl IPhysicalPlan for Duplicate { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(DuplicateFormatter::create(self)) } @@ -135,6 +140,10 @@ impl IPhysicalPlan for Shuffle { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ShuffleFormatter::create(self)) } @@ -218,6 +227,10 @@ impl IPhysicalPlan for ChunkFilter { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFilterFormatter::create(self)) } @@ -283,6 +296,10 @@ impl IPhysicalPlan for ChunkEvalScalar { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkEvalScalarFormatter::create(self)) } @@ -355,6 +372,10 @@ impl IPhysicalPlan for ChunkCastSchema { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkCastSchemaFormatter::create(self)) } @@ -496,6 +517,10 @@ impl IPhysicalPlan for ChunkFillAndReorder { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn formatter(&self) -> Result> { Ok(ChunkFillAndReorderFormatter::create(self)) } @@ -645,6 +670,10 @@ impl IPhysicalPlan for ChunkAppendData { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + fn 
formatter(&self) -> Result> { Ok(ChunkAppendDataFormatter::create(self)) } @@ -804,6 +833,10 @@ impl IPhysicalPlan for ChunkMerge { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(ChunkMergeFormatter::create(self)) } @@ -876,6 +909,10 @@ impl IPhysicalPlan for ChunkCommitInsert { Box::new(std::iter::once(&mut self.input)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + fn derive(&self, mut children: Vec) -> PhysicalPlan { assert_eq!(children.len(), 1); let input = children.pop().unwrap(); diff --git a/src/query/service/src/physical_plans/physical_mutation.rs b/src/query/service/src/physical_plans/physical_mutation.rs index 30988393b8119..2025aeff75c55 100644 --- a/src/query/service/src/physical_plans/physical_mutation.rs +++ b/src/query/service/src/physical_plans/physical_mutation.rs @@ -55,6 +55,7 @@ use databend_common_sql::Visibility; use databend_common_sql::binder::MutationStrategy; use databend_common_sql::binder::MutationType; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::optimizer::ir::SExpr; @@ -120,6 +121,10 @@ impl IPhysicalPlan for Mutation { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs index b5770451d56bf..cdff50ce57401 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_organize.rs +++ 
b/src/query/service/src/physical_plans/physical_mutation_into_organize.rs @@ -16,6 +16,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::format::MutationOrganizeFormatter; use crate::physical_plans::format::PhysicalFormat; @@ -44,6 +45,10 @@ impl IPhysicalPlan for MutationOrganize { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_into_split.rs b/src/query/service/src/physical_plans/physical_mutation_into_split.rs index e86c70641ab64..cc3e6a5a13ca6 100644 --- a/src/query/service/src/physical_plans/physical_mutation_into_split.rs +++ b/src/query/service/src/physical_plans/physical_mutation_into_split.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_pipeline::core::Pipe; use databend_common_sql::IndexType; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::MutationSplitProcessor; use crate::physical_plans::format::MutationSplitFormatter; @@ -46,6 +47,10 @@ impl IPhysicalPlan for MutationSplit { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs index cad8392e9133a..a45d607c91f8a 100644 --- a/src/query/service/src/physical_plans/physical_mutation_manipulate.rs +++ b/src/query/service/src/physical_plans/physical_mutation_manipulate.rs @@ -24,6 +24,7 @@ use databend_common_expression::RemoteExpr; use 
databend_common_meta_app::schema::TableInfo; use databend_common_pipeline::core::Pipe; use databend_common_sql::binder::MutationStrategy; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MatchExpr; use databend_common_storages_fuse::operations::MatchedSplitProcessor; use databend_common_storages_fuse::operations::MergeIntoNotMatchedProcessor; @@ -67,6 +68,10 @@ impl IPhysicalPlan for MutationManipulate { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn children<'a>(&'a self) -> Box + 'a> { Box::new(std::iter::once(&self.input)) } diff --git a/src/query/service/src/physical_plans/physical_mutation_source.rs b/src/query/service/src/physical_plans/physical_mutation_source.rs index 8d2da1f34432e..2be37d0c8e240 100644 --- a/src/query/service/src/physical_plans/physical_mutation_source.rs +++ b/src/query/service/src/physical_plans/physical_mutation_source.rs @@ -43,6 +43,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::StreamContext; use databend_common_sql::binder::MutationType; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::FuseLazyPartInfo; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::SegmentLocation; @@ -96,6 +97,10 @@ impl IPhysicalPlan for MutationSource { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(MutationSourceFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_plan.rs b/src/query/service/src/physical_plans/physical_plan.rs index 27cc04c134d07..2a2e6e4e539c6 100644 --- a/src/query/service/src/physical_plans/physical_plan.rs +++ b/src/query/service/src/physical_plans/physical_plan.rs @@ -30,6 +30,7 @@ use 
databend_common_expression::DataSchemaRef; use databend_common_pipeline::core::PlanProfile; use databend_common_pipeline::core::PlanScope; use databend_common_sql::Metadata; +use databend_common_sql::executor::physical_plans::DataDistribution; use dyn_clone::DynClone; use serde::Deserializer; use serde::Serializer; @@ -168,6 +169,8 @@ pub trait IPhysicalPlan: DynClone + Debug + Send + Sync + 'static { .any(|child| child.is_warehouse_distributed_plan()) } + fn output_data_distribution(&self) -> DataDistribution; + fn display_in_profile(&self) -> bool { true } diff --git a/src/query/service/src/physical_plans/physical_project_set.rs b/src/query/service/src/physical_plans/physical_project_set.rs index 4e66842b41d41..714d77d5c394e 100644 --- a/src/query/service/src/physical_plans/physical_project_set.rs +++ b/src/query/service/src/physical_plans/physical_project_set.rs @@ -27,6 +27,7 @@ use databend_common_pipeline::core::ProcessorPtr; use databend_common_sql::ColumnSet; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; @@ -64,6 +65,10 @@ impl IPhysicalPlan for ProjectSet { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_r_cte_scan.rs b/src/query/service/src/physical_plans/physical_r_cte_scan.rs index b0e251232e216..9c8afca3e5fe2 100644 --- a/src/query/service/src/physical_plans/physical_r_cte_scan.rs +++ b/src/query/service/src/physical_plans/physical_r_cte_scan.rs @@ -18,6 +18,7 @@ use std::fmt::Display; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; +use 
databend_common_sql::executor::physical_plans::DataDistribution; use crate::physical_plans::PhysicalPlanBuilder; use crate::physical_plans::explain::PlanStatsInfo; @@ -54,6 +55,10 @@ impl IPhysicalPlan for RecursiveCteScan { Ok(self.output_schema.clone()) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(RecursiveCteScan { diff --git a/src/query/service/src/physical_plans/physical_range_join.rs b/src/query/service/src/physical_plans/physical_range_join.rs index 4ed7b99c7f33f..b5ce47e6f2255 100644 --- a/src/query/service/src/physical_plans/physical_range_join.rs +++ b/src/query/service/src/physical_plans/physical_range_join.rs @@ -29,6 +29,7 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::TypeCheck; use databend_common_sql::binder::JoinPredicate; use databend_common_sql::binder::wrap_cast; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::RelExpr; use databend_common_sql::optimizer::ir::RelationalProperty; use databend_common_sql::optimizer::ir::SExpr; @@ -95,6 +96,10 @@ impl IPhysicalPlan for RangeJoin { Ok(RangeJoinFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn get_desc(&self) -> Result { let mut condition = self .conditions diff --git a/src/query/service/src/physical_plans/physical_recluster.rs b/src/query/service/src/physical_plans/physical_recluster.rs index 827ad456244cf..513ee651c2ce6 100644 --- a/src/query/service/src/physical_plans/physical_recluster.rs +++ b/src/query/service/src/physical_plans/physical_recluster.rs @@ -40,6 +40,7 @@ use databend_common_pipeline_transforms::blocks::CompoundBlockOperator; use databend_common_pipeline_transforms::build_ordered_compact_pipeline; use databend_common_pipeline_transforms::columns::TransformAddStreamColumns; use 
databend_common_sql::StreamContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_common_storages_fuse::FuseTable; @@ -80,6 +81,10 @@ impl IPhysicalPlan for Recluster { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(Recluster { @@ -281,6 +286,10 @@ impl IPhysicalPlan for HilbertPartition { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_async_source.rs b/src/query/service/src/physical_plans/physical_replace_async_source.rs index e57f78f04fb6f..5d1599aeaa74b 100644 --- a/src/query/service/src/physical_plans/physical_replace_async_source.rs +++ b/src/query/service/src/physical_plans/physical_replace_async_source.rs @@ -19,6 +19,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_pipeline::sources::AsyncSourcer; use databend_common_sql::NameResolutionContext; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::plans::InsertValue; use crate::physical_plans::physical_plan::IPhysicalPlan; @@ -48,6 +49,10 @@ impl IPhysicalPlan for ReplaceAsyncSourcer { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn derive(&self, children: Vec) -> PhysicalPlan { assert!(children.is_empty()); PhysicalPlan::new(ReplaceAsyncSourcer { diff --git a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs 
index 29f1ce6d2ec89..a9e8d3b35e757 100644 --- a/src/query/service/src/physical_plans/physical_replace_deduplicate.rs +++ b/src/query/service/src/physical_plans/physical_replace_deduplicate.rs @@ -31,6 +31,7 @@ use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_pipeline_transforms::blocks::TransformCastSchema; use databend_common_pipeline_transforms::build_compact_block_pipeline; use databend_common_sql::ColumnBinding; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::operations::ReplaceIntoProcessor; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceDeduplicate { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Serial + } + #[recursive::recursive] fn output_schema(&self) -> Result { Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_replace_into.rs b/src/query/service/src/physical_plans/physical_replace_into.rs index 9e502fb467a81..7c916ac4361cc 100644 --- a/src/query/service/src/physical_plans/physical_replace_into.rs +++ b/src/query/service/src/physical_plans/physical_replace_into.rs @@ -26,6 +26,7 @@ use databend_common_pipeline::core::InputPort; use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::OnConflictField; use databend_common_storages_fuse::FuseTable; @@ -70,6 +71,10 @@ impl IPhysicalPlan for ReplaceInto { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn output_schema(&self) -> Result { 
Ok(DataSchemaRef::default()) diff --git a/src/query/service/src/physical_plans/physical_row_fetch.rs b/src/query/service/src/physical_plans/physical_row_fetch.rs index 11fef083c340a..e1e687f55d8c8 100644 --- a/src/query/service/src/physical_plans/physical_row_fetch.rs +++ b/src/query/service/src/physical_plans/physical_row_fetch.rs @@ -25,6 +25,7 @@ use databend_common_pipeline::core::OutputPort; use databend_common_pipeline::core::Pipe; use databend_common_pipeline::core::PipeItem; use databend_common_pipeline_transforms::create_dummy_item; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_storages_fuse::operations::row_fetch_processor; use itertools::Itertools; @@ -67,6 +68,10 @@ impl IPhysicalPlan for RowFetch { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let mut fields = self.input.output_schema()?.fields().clone(); diff --git a/src/query/service/src/physical_plans/physical_secure_filter.rs b/src/query/service/src/physical_plans/physical_secure_filter.rs index 88e8f871574d6..ef2daeaf0ab22 100644 --- a/src/query/service/src/physical_plans/physical_secure_filter.rs +++ b/src/query/service/src/physical_plans/physical_secure_filter.rs @@ -26,6 +26,7 @@ use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::ColumnSet; use databend_common_sql::TypeCheck; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use sha2::Digest; use sha2::Sha256; @@ -64,6 +65,10 @@ impl IPhysicalPlan for SecureFilter { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git 
a/src/query/service/src/physical_plans/physical_sequence.rs b/src/query/service/src/physical_plans/physical_sequence.rs index 4642019c961f5..d201fa082a415 100644 --- a/src/query/service/src/physical_plans/physical_sequence.rs +++ b/src/query/service/src/physical_plans/physical_sequence.rs @@ -17,6 +17,7 @@ use std::any::Any; use databend_common_exception::Result; use databend_common_expression::DataSchemaRef; use databend_common_sql::ColumnSet; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use crate::physical_plans::IPhysicalPlan; @@ -53,6 +54,10 @@ impl IPhysicalPlan for Sequence { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.right.output_data_distribution() + } + #[recursive::recursive] fn output_schema(&self) -> Result { self.right.output_schema() diff --git a/src/query/service/src/physical_plans/physical_sort.rs b/src/query/service/src/physical_plans/physical_sort.rs index 5e049c9daf3ce..d808d47d50633 100644 --- a/src/query/service/src/physical_plans/physical_sort.rs +++ b/src/query/service/src/physical_plans/physical_sort.rs @@ -28,6 +28,7 @@ use databend_common_pipeline_transforms::sorts::core::SortKeyDescription; use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; @@ -148,6 +149,15 @@ impl IPhysicalPlan for Sort { Ok(SortFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + match self.step { + SortStep::Single | SortStep::Partial | SortStep::Final | SortStep::Shuffled => { + DataDistribution::Serial + } + SortStep::Sample | SortStep::Route => DataDistribution::Random, + } + } + #[recursive::recursive] fn 
try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_table_scan.rs b/src/query/service/src/physical_plans/physical_table_scan.rs index 73a7ad54f5262..d9121abaf971e 100644 --- a/src/query/service/src/physical_plans/physical_table_scan.rs +++ b/src/query/service/src/physical_plans/physical_table_scan.rs @@ -64,6 +64,7 @@ use databend_common_sql::VirtualColumn; use databend_common_sql::binder::INTERNAL_COLUMN_FACTORY; use databend_common_sql::evaluator::BlockOperator; use databend_common_sql::executor::cast_expr_to_non_null_boolean; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::table_read_plan::ToReadDataSourcePlan; use databend_common_sql::plans::FunctionCall; use databend_common_storages_fuse::FuseTable; @@ -111,6 +112,10 @@ impl IPhysicalPlan for TableScan { Self::output_fields(self.source.schema(), &self.name_mapping).map(DataSchema::new_ref) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + fn formatter(&self) -> Result> { Ok(TableScanFormatter::create(self)) } diff --git a/src/query/service/src/physical_plans/physical_udf.rs b/src/query/service/src/physical_plans/physical_udf.rs index 1b62115d6f0c1..d82a3c416eb64 100644 --- a/src/query/service/src/physical_plans/physical_udf.rs +++ b/src/query/service/src/physical_plans/physical_udf.rs @@ -28,6 +28,7 @@ use databend_common_sql::ColumnSet; use databend_common_sql::IndexType; use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use databend_common_sql::plans::UDFType; use itertools::Itertools; @@ -66,6 +67,10 @@ impl IPhysicalPlan for Udf { &mut self.meta } + fn output_data_distribution(&self) -> DataDistribution { + self.input.output_data_distribution() + } + 
#[recursive::recursive] fn output_schema(&self) -> Result { let input_schema = self.input.output_schema()?; diff --git a/src/query/service/src/physical_plans/physical_union_all.rs b/src/query/service/src/physical_plans/physical_union_all.rs index 1965739f66fe9..6c73f614feca5 100644 --- a/src/query/service/src/physical_plans/physical_union_all.rs +++ b/src/query/service/src/physical_plans/physical_union_all.rs @@ -28,8 +28,10 @@ use databend_common_sql::ScalarExpr; use databend_common_sql::Symbol; use databend_common_sql::TypeCheck; use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::optimizer::ir::SExpr; use itertools::Itertools; +use recursive::recursive; use crate::physical_plans::Exchange; use crate::physical_plans::PhysicalPlanBuilder; @@ -89,6 +91,16 @@ impl IPhysicalPlan for UnionAll { Ok(UnionAllFormatter::create(self)) } + #[recursive] + fn output_data_distribution(&self) -> DataDistribution { + let left_dist = self.left.output_data_distribution(); + let right_dist = self.right.output_data_distribution(); + match left_dist == right_dist { + true => left_dist, + false => DataDistribution::Random, + } + } + fn get_desc(&self) -> Result { Ok(self .left_outputs diff --git a/src/query/service/src/physical_plans/physical_window.rs b/src/query/service/src/physical_plans/physical_window.rs index 35a9608e95e19..1267afe96c1de 100644 --- a/src/query/service/src/physical_plans/physical_window.rs +++ b/src/query/service/src/physical_plans/physical_window.rs @@ -37,6 +37,7 @@ use databend_common_sql::TypeCheck; use databend_common_sql::binder::wrap_cast; use databend_common_sql::executor::physical_plans::AggregateFunctionDesc; use databend_common_sql::executor::physical_plans::AggregateFunctionSignature; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_common_sql::optimizer::ir::SExpr; 
use databend_common_sql::plans::WindowFuncFrame; @@ -110,6 +111,14 @@ impl IPhysicalPlan for Window { Ok(WindowFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + if self.partition_by.is_empty() { + DataDistribution::Random + } else { + self.input.output_data_distribution() + } + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/physical_plans/physical_window_partition.rs b/src/query/service/src/physical_plans/physical_window_partition.rs index 734c5299258e7..bc12f224d3de7 100644 --- a/src/query/service/src/physical_plans/physical_window_partition.rs +++ b/src/query/service/src/physical_plans/physical_window_partition.rs @@ -24,6 +24,7 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline::core::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::Symbol; +use databend_common_sql::executor::physical_plans::DataDistribution; use databend_common_sql::executor::physical_plans::SortDesc; use databend_storages_common_cache::TempDirManager; @@ -77,6 +78,10 @@ impl IPhysicalPlan for WindowPartition { Ok(WindowPartitionFormatter::create(self)) } + fn output_data_distribution(&self) -> DataDistribution { + DataDistribution::Random + } + #[recursive::recursive] fn try_find_single_data_source(&self) -> Option<&DataSourcePlan> { self.input.try_find_single_data_source() diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs index 82dab8970d2b6..2174f39789a0e 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/convert.rs @@ -21,7 +21,6 @@ use 
databend_common_catalog::runtime_filter_info::RuntimeFilterInfo; use databend_common_catalog::runtime_filter_info::RuntimeFilterSpatial; use databend_common_catalog::runtime_filter_info::RuntimeFilterStats; use databend_common_catalog::sbbf::Sbbf; -use databend_common_catalog::sbbf::SbbfAtomic; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::Column; @@ -55,7 +54,7 @@ pub async fn build_runtime_filter_infos( packet: JoinRuntimeFilterPacket, runtime_filter_descs: HashMap, selectivity_threshold: u64, - max_threads: usize, + _max_threads: usize, ) -> Result> { let total_build_rows = packet.build_rows; let Some(packets) = packet.packets else { @@ -104,7 +103,7 @@ pub async fn build_runtime_filter_infos( }; let bloom = if bloom_enabled { if let Some(ref bloom) = packet.bloom { - Some(build_bloom_filter(bloom.clone(), probe_key, max_threads, desc.id).await?) + Some(build_bloom_filter(bloom.clone(), probe_key)?) } else { None } @@ -278,37 +277,14 @@ fn build_min_max_filter( Ok(min_max_filter) } -async fn build_bloom_filter( - bloom: Vec, +fn build_bloom_filter( + bloom_words: Vec, probe_key: &Expr, - max_threads: usize, - filter_id: usize, ) -> Result { let probe_column = resolve_probe_column_ref(probe_key); let column_name = probe_column.id.to_string(); - let total_items = bloom.len(); - - if total_items < 3_000_000 { - let mut filter = Sbbf::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))?; - filter.insert_hash_batch(&bloom); - return Ok(RuntimeFilterBloom { - column_name, - filter: Arc::new(filter), - }); - } - - let start = std::time::Instant::now(); - let builder = SbbfAtomic::new_with_ndv_fpp(total_items as u64, 0.01) - .map_err(|e| ErrorCode::Internal(e.to_string()))? 
- .insert_hash_batch_parallel(bloom, max_threads); - let filter = builder.finish(); - log::info!( - "filter_id: {}, build_time: {:?}", - filter_id, - start.elapsed() - ); - + let filter = Sbbf::from_u32s(bloom_words) + .ok_or_else(|| ErrorCode::Internal("Invalid bloom filter data in runtime filter"))?; Ok(RuntimeFilterBloom { column_name, filter: Arc::new(filter), @@ -331,6 +307,7 @@ fn resolve_probe_column_ref(probe_key: &Expr) -> &ColumnRef { mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::ColumnRef; use databend_common_expression::Constant; @@ -392,7 +369,11 @@ mod tests { min: Scalar::Number(1i32.into()), max: Scalar::Number(10i32.into()), }), - bloom: Some(vec![11, 22]), + bloom: Some({ + let mut f = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + f.insert_hash_batch(&[11, 22]); + f.into_u32s() + }), spatial: None, }); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs index 7b2e724ec63d7..7cd9bd9e81455 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/local_builder.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use databend_common_catalog::sbbf::Sbbf; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::Column; @@ -47,8 +49,9 @@ struct SingleFilterBuilder { inlist_builder: Option, inlist_threshold: usize, - bloom_hashes: Option>, + bloom_filter: Option, bloom_threshold: usize, + bloom_disabled: bool, is_spatial: bool, spatial_rects: Vec<(f64, f64, f64, f64)>, @@ -90,12 +93,21 @@ impl SingleFilterBuilder { } else { 0 }, - bloom_hashes: None, + bloom_filter: if desc.enable_bloom_runtime_filter && bloom_threshold > 0 { + let ndv = match desc.build_table_rows { + Some(rows) => rows.min(bloom_threshold as u64), + None => bloom_threshold as u64, + }; + Some(Sbbf::new_with_ndv_fpp(ndv, 0.01).map_err(ErrorCode::Internal)?) + } else { + None + }, bloom_threshold: if desc.enable_bloom_runtime_filter { bloom_threshold } else { 0 }, + bloom_disabled: !desc.enable_bloom_runtime_filter || bloom_threshold == 0, is_spatial: desc.is_spatial, spatial_rects: Vec::new(), spatial_srid: None, @@ -142,22 +154,21 @@ impl SingleFilterBuilder { } fn add_bloom(&mut self, column: &Column, new_total: usize) -> Result<()> { - if new_total > self.bloom_threshold { - self.bloom_hashes = None; + if self.bloom_disabled || new_total > self.bloom_threshold { + self.bloom_filter = None; + self.bloom_disabled = true; return Ok(()); } - let mut hashes = match self.bloom_hashes.take() { - Some(h) => h, - None => Vec::with_capacity(column.len()), - }; - hashes.reserve(column.len()); - let entry = BlockEntry::from(column.clone()); - let hash_method = self - .hash_method - .as_ref() - .expect("hash_method must exist for non-spatial filters"); - hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; - self.bloom_hashes = Some(hashes); + if let Some(ref mut filter) = self.bloom_filter { + let mut hashes = Vec::with_capacity(column.len()); + let entry = 
BlockEntry::from(column.clone()); + let hash_method = self + .hash_method + .as_ref() + .expect("hash_method must exist for non-spatial filters"); + hash_by_method_for_bloom(hash_method, (&[entry]).into(), column.len(), &mut hashes)?; + filter.insert_hash_batch(&hashes); + } Ok(()) } @@ -219,7 +230,7 @@ impl SingleFilterBuilder { None }; - let bloom = self.bloom_hashes.take(); + let bloom = self.bloom_filter.take().map(|f| f.into_u32s()); Ok(RuntimeFilterPacket { id: self.id, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs index 18282b1194a64..acf623ed5d2a1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/merge.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use databend_common_catalog::sbbf::Sbbf; use databend_common_exception::Result; use databend_common_expression::Column; @@ -50,7 +51,7 @@ pub fn merge_join_runtime_filter_packets( let should_merge_bloom = total_build_rows < bloom_threshold; let should_merge_min_max = total_build_rows < min_max_threshold; - let packets = packets + let mut packets = packets .into_iter() .filter_map(|packet| packet.packets) .collect::>(); @@ -62,26 +63,27 @@ pub fn merge_join_runtime_filter_packets( )); } + let keys: Vec = packets[0].keys().copied().collect(); let mut result = HashMap::new(); - for id in packets[0].keys() { - result.insert(*id, RuntimeFilterPacket { - id: *id, + for id in keys { + result.insert(id, RuntimeFilterPacket { + id, inlist: if should_merge_inlist { - merge_inlist(&packets, *id)? + merge_inlist(&packets, id)? 
} else { None }, min_max: if should_merge_min_max { - merge_min_max(&packets, *id) + merge_min_max(&packets, id) } else { None }, bloom: if should_merge_bloom { - merge_bloom(&packets, *id) + merge_bloom(&mut packets, id) } else { None }, - spatial: merge_spatial(&packets, *id, spatial_threshold)?, + spatial: merge_spatial(&packets, id, spatial_threshold)?, }); } @@ -158,25 +160,25 @@ fn merge_min_max( Some(SerializableDomain { min, max }) } -fn merge_bloom(packets: &[HashMap], rf_id: usize) -> Option> { +fn merge_bloom( + packets: &mut [HashMap], + rf_id: usize, +) -> Option> { if packets .iter() .any(|packet| packet.get(&rf_id).unwrap().bloom.is_none()) { return None; } - let mut bloom = packets[0] - .get(&rf_id) - .unwrap() - .bloom - .as_ref() - .unwrap() - .clone(); - for packet in packets.iter().skip(1) { - let other = packet.get(&rf_id).unwrap().bloom.as_ref().unwrap(); - bloom.extend_from_slice(other); + + let first = packets[0].get_mut(&rf_id).unwrap().bloom.take().unwrap(); + let mut merged = Sbbf::from_u32s(first)?; + for packet in packets.iter_mut().skip(1) { + let other_words = packet.get_mut(&rf_id).unwrap().bloom.take().unwrap(); + let other = Sbbf::from_u32s(other_words)?; + merged.union(&other); } - Some(bloom) + Some(merged.into_u32s()) } fn merge_spatial( @@ -223,10 +225,110 @@ fn merge_spatial( })) } +/// Pairwise merge of two runtime filter packets without threshold checks. +/// Used for work-stealing incremental merge within a node. 
+pub fn merge_two_runtime_filter_packets( + mut a: JoinRuntimeFilterPacket, + mut b: JoinRuntimeFilterPacket, +) -> Result { + let total_build_rows = a.build_rows + b.build_rows; + let disable_all = a.disable_all_due_to_spill || b.disable_all_due_to_spill; + + if disable_all { + return Ok(JoinRuntimeFilterPacket::disable_all(total_build_rows)); + } + + let (a_packets, b_packets) = match (a.packets.take(), b.packets.take()) { + (None, None) => { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + (Some(p), None) | (None, Some(p)) => { + return Ok(JoinRuntimeFilterPacket::complete(p, total_build_rows)); + } + (Some(a), Some(b)) => (a, b), + }; + + let mut result = HashMap::new(); + for (id, mut a_pkt) in a_packets { + if let Some(mut b_pkt) = b_packets.get(&id).cloned() { + // Merge bloom via Sbbf::union + let bloom = match (a_pkt.bloom.take(), b_pkt.bloom.take()) { + (Some(a_words), Some(b_words)) => { + if let (Some(mut a_filter), Some(b_filter)) = + (Sbbf::from_u32s(a_words), Sbbf::from_u32s(b_words)) + { + a_filter.union(&b_filter); + Some(a_filter.into_u32s()) + } else { + None + } + } + _ => None, + }; + + // Merge inlist via concat + let inlist = match (a_pkt.inlist.take(), b_pkt.inlist.take()) { + (Some(a_col), Some(b_col)) => { + Some(Column::concat_columns([a_col, b_col].into_iter())?) 
+ } + _ => None, + }; + + // Merge min_max + let min_max = match (a_pkt.min_max.take(), b_pkt.min_max.take()) { + (Some(a_mm), Some(b_mm)) => Some(SerializableDomain { + min: a_mm.min.min(b_mm.min), + max: a_mm.max.max(b_mm.max), + }), + _ => None, + }; + + // Merge spatial + let spatial = match (a_pkt.spatial.take(), b_pkt.spatial.take()) { + (Some(a_sp), Some(b_sp)) => { + if a_sp.valid && b_sp.valid && a_sp.srid == b_sp.srid { + let rtrees = merge_rtrees_to_threshold( + vec![a_sp.rtrees.as_slice(), b_sp.rtrees.as_slice()], + usize::MAX, + )?; + Some(SpatialPacket { + valid: true, + srid: a_sp.srid, + rtrees, + }) + } else { + None + } + } + _ => None, + }; + + result.insert(id, RuntimeFilterPacket { + id, + bloom, + inlist, + min_max, + spatial, + }); + } + } + + if result.is_empty() { + return Ok(JoinRuntimeFilterPacket::complete_without_filters( + total_build_rows, + )); + } + + Ok(JoinRuntimeFilterPacket::complete(result, total_build_rows)) +} + #[cfg(test)] mod tests { use std::collections::HashMap; + use databend_common_catalog::sbbf::Sbbf; use databend_common_expression::ColumnBuilder; use databend_common_expression::Scalar; use databend_common_expression::types::DataType; @@ -244,6 +346,12 @@ mod tests { builder.build() } + fn make_bloom(hashes: &[u64]) -> Vec { + let mut filter = Sbbf::new_with_ndv_fpp(100, 0.01).unwrap(); + filter.insert_hash_batch(hashes); + filter.into_u32s() + } + #[test] fn test_merge_short_circuit_all_types() -> Result<()> { let mut runtime_filters = HashMap::new(); @@ -254,7 +362,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(3)), }), - bloom: Some(vec![11, 22, 33]), + bloom: Some(make_bloom(&[11, 22, 33])), spatial: None, }); @@ -272,6 +380,9 @@ mod tests { #[test] fn test_merge_short_circuit_inlist_only() -> Result<()> { + let bloom1 = make_bloom(&[1, 2]); + let bloom2 = make_bloom(&[3, 4]); + let mut runtime_filters_1 = HashMap::new(); runtime_filters_1.insert(7, 
RuntimeFilterPacket { id: 7, @@ -280,7 +391,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(1)), max: Scalar::Number(NumberScalar::Int32(5)), }), - bloom: Some(vec![1, 2]), + bloom: Some(bloom1.clone()), spatial: None, }); let mut runtime_filters_2 = HashMap::new(); @@ -291,7 +402,7 @@ mod tests { min: Scalar::Number(NumberScalar::Int32(-1)), max: Scalar::Number(NumberScalar::Int32(8)), }), - bloom: Some(vec![3, 4]), + bloom: Some(bloom2.clone()), spatial: None, }); @@ -309,7 +420,11 @@ mod tests { let packet = merged.packets.unwrap().remove(&7).unwrap(); assert_eq!(merged.build_rows, 11); assert!(packet.inlist.is_none()); - assert_eq!(packet.bloom, Some(vec![1, 2, 3, 4])); + // Bloom should be a merged Sbbf containing all hashes + let merged_filter = Sbbf::from_u32s(packet.bloom.unwrap()).unwrap(); + for h in &[1u64, 2, 3, 4] { + assert!(merged_filter.check_hash(*h)); + } assert_eq!( packet.min_max, Some(SerializableDomain { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs index 0fbf2e9b84717..0fb339b4f21e1 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/mod.rs @@ -26,4 +26,5 @@ pub use global::get_global_runtime_filter_packet; pub use interface::build_and_push_down_runtime_filter; pub use local_builder::RuntimeFilterLocalBuilder; pub use merge::merge_join_runtime_filter_packets; +pub use merge::merge_two_runtime_filter_packets; pub use packet::JoinRuntimeFilterPacket; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs index 707eff18a212f..ffe5263cf6bd3 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/runtime_filter/packet.rs @@ -45,7 +45,7 @@ pub struct RuntimeFilterPacket { pub id: usize, pub inlist: Option, pub min_max: Option, - pub bloom: Option>, + pub bloom: Option>, pub spatial: Option, } @@ -161,7 +161,7 @@ impl TryInto for JoinRuntimeFilterPacket { bloom_pos = Some(entities.len()); let builder = ArrayColumnBuilder { - builder: ColumnBuilder::Number(NumberColumnBuilder::UInt64(bloom_filter)), + builder: ColumnBuilder::Number(NumberColumnBuilder::UInt32(bloom_filter)), offsets: vec![0, len], }; entities.push(Column::Array(Box::new(builder.build()))); @@ -228,7 +228,7 @@ impl TryFrom for JoinRuntimeFilterPacket { let array_column = column.into_array().expect("it's a bug"); let bloom_value_column = array_column.index(0).expect("It's a bug"); bloom = Some(match bloom_value_column { - Column::Number(NumberColumn::UInt64(v)) => v.to_vec(), + Column::Number(NumberColumn::UInt32(v)) => v.to_vec(), _ => unreachable!("Unexpected runtime bloom filter column type"), }) } diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs similarity index 73% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs index d3e4cc5bf9cc8..a3a8493f4f731 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/join.rs @@ -16,6 +16,7 @@ use databend_common_base::base::ProgressValues; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; use 
crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; @@ -31,7 +32,9 @@ pub trait Join: Send + Sync + 'static { /// returns its progress. Once all batches are consumed it returns `None` to signal completion. fn final_build(&mut self) -> Result>; - fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) {} + fn add_runtime_filter_packet(&self, _packet: JoinRuntimeFilterPacket) -> Result<()> { + Ok(()) + } /// Generate runtime filter packet for the given filter description. fn build_runtime_filter(&self) -> Result { @@ -89,3 +92,42 @@ impl Join for FinishedJoin { Err(ErrorCode::Internal("Join is finished")) } } + +pub struct InnerHashJoinFilterStream<'a> { + inner: Box, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> InnerHashJoinFilterStream<'a> { + pub fn create( + inner: Box, + filter_executor: &'a mut FilterExecutor, + ) -> Box { + Box::new(InnerHashJoinFilterStream { + inner, + filter_executor, + }) + } +} + +impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { + fn next(&mut self) -> Result> { + loop { + let Some(data_block) = self.inner.next()? else { + return Ok(None); + }; + + if data_block.is_empty() { + continue; + } + + let data_block = self.filter_executor.filter(data_block)?; + + if data_block.is_empty() { + continue; + } + + return Ok(Some(data_block)); + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs index f0127c9d681cd..c76453cc9b693 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/mod.rs @@ -13,6 +13,9 @@ // limitations under the License. 
mod cstyle_cell; +pub mod join; +pub mod probe_stream; +pub mod runtime_filter; mod squash_blocks; pub use cstyle_cell::CStyleCell; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs similarity index 67% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs index 72320f145d165..d1960ef572853 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/basic.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/probe_stream.rs @@ -61,41 +61,6 @@ impl ProbedRows { } } -pub trait ProbeStream { +pub trait ProbeStream: Send + Sync { fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()>; } - -pub struct EmptyProbeStream; - -impl ProbeStream for EmptyProbeStream { - fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { - Ok(()) - } -} - -pub struct AllUnmatchedProbeStream { - idx: u64, - size: u64, -} - -impl AllUnmatchedProbeStream { - pub fn create(size: usize) -> Box { - Box::new(AllUnmatchedProbeStream { - idx: 0, - size: size as u64, - }) - } -} - -impl ProbeStream for AllUnmatchedProbeStream { - fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { - if self.idx >= self.size { - return Ok(()); - } - - let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); - rows.unmatched.extend(self.idx..self.idx + unmatched_rows); - self.idx += unmatched_rows; - Ok(()) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs similarity index 100% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/runtime_filter.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/common/runtime_filter.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs index 790c0ce9e3eb0..1857969bb6059 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/mod.rs @@ -13,24 +13,19 @@ // limitations under the License. mod common; -mod grace; -mod hash_join_factory; -mod hashtable; -mod hybrid; -mod join; -pub mod memory; -mod performance; -mod runtime_filter; -mod transform_hash_join; +pub mod partitioned; +pub mod unpartitioned; -pub use grace::GraceHashJoin; -pub use grace::GraceMemoryJoin; -pub use hash_join_factory::HashJoinFactory; -pub use hybrid::HybridHashJoin; -pub use hybrid::HybridHashJoinState; -pub use join::Join; -pub use join::JoinStream; -pub use memory::BasicHashJoinState; -pub use memory::InnerHashJoin; -pub use runtime_filter::RuntimeFiltersDesc; -pub use transform_hash_join::TransformHashJoin; +pub use common::join::Join; +pub use common::join::JoinStream; +pub use common::runtime_filter::RuntimeFiltersDesc; +pub use partitioned::SharedRuntimeFilterPackets; +pub use partitioned::TransformPartitionedHashJoin; +pub use unpartitioned::HashJoinFactory; +pub use unpartitioned::TransformHashJoin; +pub use unpartitioned::grace::GraceHashJoin; +pub use unpartitioned::grace::GraceMemoryJoin; +pub use unpartitioned::hybrid::HybridHashJoin; +pub use unpartitioned::hybrid::HybridHashJoinState; +pub use unpartitioned::memory::BasicHashJoinState; +pub use unpartitioned::memory::InnerHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs new file mode 100644 index 0000000000000..02223c3b9d037 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/chunk_accumulator.rs @@ -0,0 +1,219 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::ColumnBuilder; +use databend_common_expression::DataBlock; + +/// Accumulates rows from input blocks into fixed-size output chunks +/// using mutable ColumnBuilders. When the accumulated rows reach +/// `chunk_size`, a chunk is flushed and returned. 
+pub struct FixedSizeChunkAccumulator { + chunk_size: usize, + builder_rows: usize, + builders: Vec, +} + +impl FixedSizeChunkAccumulator { + pub fn new(chunk_size: usize) -> Self { + FixedSizeChunkAccumulator { + chunk_size, + builders: vec![], + builder_rows: 0, + } + } + + pub fn accumulate(&mut self, block: DataBlock) -> Vec { + let mut output = Vec::new(); + self.append_block(block, &mut output); + output + } + + pub fn finalize(&mut self) -> Option { + match self.builder_rows { + 0 => None, + _ => Some(self.build_chunk()), + } + } + + fn ensure_builders(&mut self, block: &DataBlock) { + if self.builders.is_empty() { + self.builders = block + .columns() + .iter() + .map(|entry| ColumnBuilder::with_capacity(&entry.data_type(), self.chunk_size)) + .collect(); + } + } + + fn append_block(&mut self, block: DataBlock, output: &mut Vec) { + self.ensure_builders(&block); + + let block_rows = block.num_rows(); + let columns: Vec = block + .take_columns() + .into_iter() + .map(|e| e.to_column()) + .collect(); + + let mut offset = 0; + while offset < block_rows { + let remaining_capacity = self.chunk_size - self.builder_rows; + let rows_to_copy = (block_rows - offset).min(remaining_capacity); + + for (builder, col) in self.builders.iter_mut().zip(columns.iter()) { + let sliced = col.slice(offset..offset + rows_to_copy); + builder.append_column(&sliced); + } + + self.builder_rows += rows_to_copy; + offset += rows_to_copy; + + if self.builder_rows == self.chunk_size { + output.push(self.build_chunk()); + } + } + } + + fn build_chunk(&mut self) -> DataBlock { + let num_rows = self.builder_rows; + + let builders = std::mem::take(&mut self.builders); + + // Reinitialize builders with same column types for next chunk. 
+ let mut new_builders = Vec::with_capacity(builders.len()); + let mut columns = Vec::with_capacity(builders.len()); + for b in builders { + let dt = b.data_type(); + columns.push(BlockEntry::from(b.build())); + new_builders.push(ColumnBuilder::with_capacity(&dt, self.chunk_size)); + } + + self.builder_rows = 0; + self.builders = new_builders; + + DataBlock::new(columns, num_rows) + } +} + +#[cfg(test)] +mod tests { + use databend_common_expression::DataBlock; + use databend_common_expression::FromData; + use databend_common_expression::types::AccessType; + use databend_common_expression::types::Int32Type; + use databend_common_expression::types::StringType; + + use super::*; + + fn make_int_block(values: Vec) -> DataBlock { + DataBlock::new_from_columns(vec![Int32Type::from_data(values)]) + } + + fn extract_int_col(block: &DataBlock) -> Vec { + let col = block.get_by_offset(0).to_column(); + let col = Int32Type::try_downcast_column(&col).unwrap(); + col.iter().copied().collect() + } + + #[test] + fn test_single_block_under_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(4); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert!(chunks.is_empty()); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![1, 2, 3]); + } + + #[test] + fn test_exact_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + + assert!(acc.finalize().is_none()); + } + + #[test] + fn test_block_larger_than_chunk_size() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let chunks = acc.accumulate(make_int_block(vec![1, 2, 3, 4, 5, 6, 7])); + assert_eq!(chunks.len(), 2); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3]); + assert_eq!(extract_int_col(&chunks[1]), vec![4, 5, 6]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![7]); + 
} + + #[test] + fn test_multiple_small_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.accumulate(make_int_block(vec![1, 2])).is_empty()); + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![1, 2, 3, 4]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } + + #[test] + fn test_flush_empty() { + let mut acc = FixedSizeChunkAccumulator::new(4); + assert!(acc.finalize().is_none()); + } + + #[test] + fn test_multi_column_blocks() { + let mut acc = FixedSizeChunkAccumulator::new(3); + let block = DataBlock::new_from_columns(vec![ + Int32Type::from_data(vec![1, 2, 3, 4, 5]), + StringType::from_data(vec!["a", "b", "c", "d", "e"]), + ]); + let chunks = acc.accumulate(block); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].num_rows(), 3); + assert_eq!(chunks[0].num_columns(), 2); + + let last = acc.finalize().unwrap(); + assert_eq!(last.num_rows(), 2); + assert_eq!(last.num_columns(), 2); + + let int_col = Int32Type::try_downcast_column(&last.get_by_offset(0).to_column()).unwrap(); + let str_col = StringType::try_downcast_column(&last.get_by_offset(1).to_column()).unwrap(); + assert_eq!(int_col.iter().copied().collect::>(), vec![4, 5]); + let strs: Vec<&str> = str_col.iter().collect(); + assert_eq!(strs, vec!["d", "e"]); + } + + #[test] + fn test_reuse_after_flush() { + let mut acc = FixedSizeChunkAccumulator::new(2); + let chunks = acc.accumulate(make_int_block(vec![1, 2])); + assert_eq!(chunks.len(), 1); + assert!(acc.finalize().is_none()); + + // Accumulator can be reused after flush + let chunks = acc.accumulate(make_int_block(vec![3, 4, 5])); + assert_eq!(chunks.len(), 1); + assert_eq!(extract_int_col(&chunks[0]), vec![3, 4]); + + let last = acc.finalize().unwrap(); + assert_eq!(extract_int_col(&last), vec![5]); + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs new file mode 100644 index 0000000000000..201e832b0e9da --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/compact_hash_table.rs @@ -0,0 +1,176 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_column::bitmap::Bitmap; + +/// Index 0 is a sentinel (empty/chain-end). Actual rows are indexed from 1. +/// Memory per row: 4 bytes (next chain) vs current ~32 bytes (pointer-based entry). +/// +/// The table is single-threaded (no atomics) — designed for per-thread use +/// under hash shuffle where each thread independently builds and probes. +/// Trait for row index types. Supports u32 (up to ~4B rows) and u64. 
+pub trait RowIndex: Copy + Default + Eq + Send + Sync + 'static + std::fmt::Debug { + const ZERO: Self; + fn from_usize(v: usize) -> Self; + fn to_usize(self) -> usize; +} + +impl RowIndex for u32 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u32 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +impl RowIndex for u64 { + const ZERO: Self = 0; + #[inline(always)] + fn from_usize(v: usize) -> Self { + v as u64 + } + #[inline(always)] + fn to_usize(self) -> usize { + self as usize + } +} + +/// Compact join hash table using index-based chaining. +/// +/// `first[bucket]` stores the first row index in that bucket's chain. +/// `next[row_index]` stores the next row index in the same bucket's chain. +/// Chain ends when the value is `I::ZERO` (sentinel). +pub struct CompactJoinHashTable { + /// Bucket array: first[hash & mask] = first row index (1-based) + first: Vec, + /// Chain array: next[row_index] = next row in same bucket (0 = end) + pub next: Vec, + /// Bucket count minus one, for masking + bucket_mask: usize, +} + +impl CompactJoinHashTable { + /// Create a new compact hash table for `num_rows` rows. + /// Bucket count is next power of 2 >= num_rows + (num_rows - 1) / 7. + pub fn new(num_rows: usize) -> Self { + let bucket_count = Self::calc_bucket_count(num_rows); + CompactJoinHashTable { + first: vec![I::ZERO; bucket_count], + // Index 0 is sentinel, so we need num_rows + 1 entries + next: vec![I::ZERO; num_rows + 1], + bucket_mask: bucket_count - 1, + } + } + + /// Create a direct-mapping hash table where keys are used as array indices. + /// `range` is `max_key - min_key`; the caller subtracts min_key before insertion/probe. 
+ pub fn new_direct(num_rows: usize, range: usize) -> Self { + CompactJoinHashTable { + first: vec![I::ZERO; range + 1], + next: vec![I::ZERO; num_rows + 1], + bucket_mask: 0, + } + } + + pub fn insert_chunk(&mut self, vals: &[u64], row_offset: usize) { + for (i, v) in vals.iter().enumerate() { + let row_index = row_offset + i; + let bucket = match DIRECT { + true => *v as usize, + false => (*v as usize) & self.bucket_mask, + }; + + self.next[row_index] = self.first[bucket]; + self.first[bucket] = I::from_usize(row_index); + } + } + + fn calc_bucket_count(num_rows: usize) -> usize { + if num_rows == 0 { + return 1; + } + + let target = num_rows + (num_rows.saturating_sub(1)) / 7; + target.next_power_of_two() + } + + pub fn probe(&self, vals: &mut [u64], bitmap: Option) -> usize { + let mut valids = None; + + if let Some(bitmap) = bitmap { + if bitmap.null_count() == bitmap.len() { + vals.iter_mut().for_each(|v| { + *v = 0; + }); + return 0; + } else if bitmap.null_count() > 0 { + valids = Some(bitmap); + } + } + + let mut count = 0; + let first_len = self.first.len(); + + match valids { + Some(valids) => { + for (valid, val) in valids.iter().zip(vals.iter_mut()) { + if valid { + let bucket = match DIRECT { + false => (*val as usize) & self.bucket_mask, + true if (*val as usize) < first_len => *val as usize, + true => { + *val = 0; + continue; + } + }; + + if self.first[bucket] != I::default() { + *val = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *val = 0; + } + } else { + *val = 0; + } + } + } + None => { + vals.iter_mut().for_each(|val| { + let bucket = if DIRECT { + let b = *val as usize; + if b >= first_len { + *val = 0; + return; + } + b + } else { + (*val as usize) & self.bucket_mask + }; + if self.first[bucket] != I::default() { + *val = self.first[bucket].to_usize() as u64; + count += 1; + } else { + *val = 0; + } + }); + } + } + count + } +} diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs new file mode 100644 index 0000000000000..c8160edaae275 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/inner_join.rs @@ -0,0 +1,207 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use databend_common_base::base::ProgressValues;
use databend_common_column::bitmap::Bitmap;
use databend_common_exception::Result;
use databend_common_expression::BlockEntry;
use databend_common_expression::DataBlock;
use databend_common_expression::FunctionContext;
use databend_common_expression::HashMethodKind;
use databend_common_expression::types::NullableColumn;

use super::partitioned_build::PartitionedHashJoinState;
use super::partitioned_build::ProbeData;
use crate::pipelines::processors::HashJoinDesc;
use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::Join;
use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows;
use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext;

/// Inner hash join operating over per-thread partitioned build state.
///
/// NOTE(review): all generic argument lists in this file were lost in
/// extraction (angle brackets stripped); they are reconstructed from usage.
pub struct PartitionedInnerJoin {
    build: PartitionedHashJoinState,
    desc: Arc<HashJoinDesc>,
    function_ctx: Arc<FunctionContext>,
    context: PerformanceContext,
}

impl PartitionedInnerJoin {
    /// Build the join operator and its per-operator performance context.
    pub fn create(
        method: HashMethodKind,
        desc: Arc<HashJoinDesc>,
        function_ctx: FunctionContext,
        max_block_size: usize,
    ) -> Self {
        let context =
            PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone());

        let function_ctx = Arc::new(function_ctx);

        PartitionedInnerJoin {
            function_ctx: function_ctx.clone(),
            build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx),
            desc,
            context,
        }
    }
}

impl Join for PartitionedInnerJoin {
    fn add_block(&mut self, data: Option<DataBlock>) -> Result<()> {
        // NOTE(review): the const/turbofish argument was lost in extraction;
        // `false` assumed (inner join needs no build-side match flags) — confirm.
        self.build.add_block::<false>(data)
    }

    fn final_build(&mut self) -> Result<Option<ProgressValues>> {
        self.build.final_build()
    }

    fn probe_block(&mut self, data: DataBlock) -> Result<Box<dyn JoinStream + '_>> {
        // Inner join emits nothing for an empty probe block or an empty build side.
        if data.is_empty() || self.build.num_rows == 0 {
            return Ok(Box::new(EmptyJoinStream));
        }

        let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?;

        let mut keys = DataBlock::new(probe_keys, data.num_rows());
        // Correlated subqueries never produce NULL join keys, so skip validity.
        let valids = match self.desc.from_correlated_subquery {
            true => None,
            false => self.desc.build_valids_by_keys(&keys)?,
        };

        self.desc.remove_keys_nullable(&mut keys);
        let probe_block = data.project(&self.desc.probe_projection);

        let probe_data = ProbeData::new(keys, valids);
        // NOTE(review): turbofish argument lost in extraction; `false` assumed.
        let probe_keys_stream = self.build.probe::<false>(probe_data)?;
        let joined_stream = PartitionedInnerJoinStream::create(
            probe_block,
            &self.build,
            probe_keys_stream,
            self.desc.clone(),
            &mut self.context.probe_result,
        );

        // With a non-equi residual condition, wrap the stream in the filter.
        match &mut self.context.filter_executor {
            None => Ok(joined_stream),
            Some(filter_executor) => Ok(InnerHashJoinFilterStream::create(
                joined_stream,
                filter_executor,
            )),
        }
    }
}

/// Pull-based stream producing joined blocks batch by batch.
struct PartitionedInnerJoinStream<'a> {
    desc: Arc<HashJoinDesc>,
    probe_data_block: DataBlock,
    build: &'a PartitionedHashJoinState,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
}

impl<'a> PartitionedInnerJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        build: &'a PartitionedHashJoinState,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        desc: Arc<HashJoinDesc>,
        probed_rows: &'a mut ProbedRows,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(PartitionedInnerJoinStream {
            desc,
            build,
            probed_rows,
            probe_data_block,
            probe_keys_stream,
        })
    }
}

impl<'a> JoinStream for PartitionedInnerJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        loop {
            // Reuse the shared ProbedRows scratch buffer across batches.
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                // Probe stream exhausted.
                return Ok(None);
            }

            if self.probed_rows.is_all_unmatched() {
                // Nothing joined in this batch; pull the next one.
                continue;
            }

            // Gather probe-side rows for the matched pairs.
            let probe_block = match self.probe_data_block.num_columns() {
                0 => None,
                _ => Some(DataBlock::take(
                    &self.probe_data_block,
                    self.probed_rows.matched_probe.as_slice(),
                )?),
            };

            // Gather build-side rows for the matched pairs.
            let build_block = match self.build.columns.is_empty() {
                true => None,
                false => {
                    let row_ptrs = self.probed_rows.matched_build.as_slice();
                    Some(DataBlock::take_column_vec(
                        self.build.columns.as_slice(),
                        self.build.column_types.as_slice(),
                        row_ptrs,
                    ))
                }
            };

            let mut result_block = match (probe_block, build_block) {
                (Some(mut probe_block), Some(build_block)) => {
                    probe_block.merge_block(build_block);
                    probe_block
                }
                (Some(probe_block), None) => probe_block,
                (None, Some(build_block)) => build_block,
                (None, None) => DataBlock::new(vec![], self.probed_rows.matched_build.len()),
            };

            // Duplicate probe columns that the plan maps onto build outputs,
            // adjusting nullability to the build side's expectation.
            for (index, (is_probe_nullable, is_build_nullable)) in
                self.desc.probe_to_build.iter().cloned()
            {
                let entry = match (is_probe_nullable, is_build_nullable) {
                    (true, true) | (false, false) => result_block.get_by_offset(index).clone(),
                    (true, false) => result_block.get_by_offset(index).clone().remove_nullable(),
                    (false, true) => {
                        let entry = result_block.get_by_offset(index);
                        let col = entry.to_column();

                        // Already nullable/null: reuse; otherwise wrap with an
                        // all-valid bitmap.
                        match col.is_null() || col.is_nullable() {
                            true => entry.clone(),
                            false => BlockEntry::from(NullableColumn::new_column(
                                col,
                                Bitmap::new_constant(true, result_block.num_rows()),
                            )),
                        }
                    }
                };

                result_block.add_entry(entry);
            }

            return Ok(Some(result_block));
        }
    }
}
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs new file mode 100644 index 0000000000000..4552607a8e17c --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join.rs @@ -0,0 +1,360 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::Scalar; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NullableColumn; + +use super::partitioned_build::PartitionedHashJoinState; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::partitioned_build::ProbeData; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; +use crate::pipelines::processors::transforms::wrap_true_validity; + +pub struct PartitionedLeftJoin { + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: FunctionContext, + performance_context: PerformanceContext, +} + +impl PartitionedLeftJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + PartitionedLeftJoin { + desc: desc.clone(), + function_ctx: function_ctx.clone(), + performance_context: context, + build: 
PartitionedHashJoinState::create(method, desc, Arc::new(function_ctx)), + } + } +} + +impl Join for PartitionedLeftJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() { + return Ok(Box::new(EmptyJoinStream)); + } + + if self.build.num_rows == 0 { + let num_rows = data.num_rows(); + + let types = self + .desc + .build_schema + .fields + .iter() + .map(|x| x.data_type().clone()) + .collect::>(); + + let build_block = + match crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + &types, + data.num_rows(), + ) { + None => None, + Some(data_block) => Some(data_block.project(&self.desc.build_projection)), + }; + + let probe_block = Some(data.project(&self.desc.probe_projection)); + let result_block = final_result_block(&self.desc, probe_block, build_block, num_rows); + return Ok(Box::new(OneBlockJoinStream(Some(result_block)))); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_stream = self.build.probe::(probe_data)?; + + match self.performance_context.filter_executor.as_mut() { + None => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterLeftHashJoinStream::::create( + probe_block, + &self.build, + probe_stream, + self.desc.clone(), + &mut self.performance_context.probe_result, + Some(filter_executor), + )), + } + 
} +} + +struct OuterLeftHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + conjunct_unmatched: Vec, + unmatched_rows: Vec, + filter_executor: Option<&'a mut FilterExecutor>, +} + +unsafe impl<'a, const CONJUNCT: bool> Send for OuterLeftHashJoinStream<'a, CONJUNCT> {} +unsafe impl<'a, const CONJUNCT: bool> Sync for OuterLeftHashJoinStream<'a, CONJUNCT> {} + +impl<'a, const CONJUNCT: bool> JoinStream for OuterLeftHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if !CONJUNCT && !self.probed_rows.unmatched.is_empty() { + self.unmatched_rows + .extend_from_slice(&self.probed_rows.unmatched); + } + + if self.probed_rows.is_empty() { + if self.conjunct_unmatched.is_empty() && self.unmatched_rows.is_empty() { + return Ok(None); + } + + let unmatched_row_id = match CONJUNCT { + true => std::mem::take(&mut self.conjunct_unmatched) + .into_iter() + .enumerate() + .filter(|(_, matched)| *matched == 0) + .map(|(row_id, _)| row_id as u64) + .collect::>(), + false => std::mem::take(&mut self.unmatched_rows), + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + unmatched_row_id.as_slice(), + )?), + }; + + let types = &self.join_state.column_types; + let build_block = + crate::pipelines::processors::transforms::unpartitioned::left_join::null_block( + types, + unmatched_row_id.len(), + ); + + return Ok(Some(crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + unmatched_row_id.len(), + ))); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let probe_block = match 
self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.join_state.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + let build_block = DataBlock::take_column_vec( + self.join_state.columns.as_slice(), + self.join_state.column_types.as_slice(), + row_ptrs, + ); + + let true_validity = Bitmap::new_constant(true, row_ptrs.len()); + let entries = build_block + .columns() + .iter() + .map(|c| wrap_true_validity(c, row_ptrs.len(), &true_validity)); + Some(DataBlock::from_iter(entries, row_ptrs.len())) + } + }; + + let mut result_block = crate::pipelines::processors::transforms::unpartitioned::left_join::final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if CONJUNCT && let Some(filter_executor) = self.filter_executor.as_mut() { + let result_count = filter_executor.select(&result_block)?; + + if result_count == 0 { + continue; + } + + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_id = self.probed_rows.matched_probe[*idx as usize] as usize; + self.conjunct_unmatched[row_id] = 1; + } + + let origin_rows = result_block.num_rows(); + result_block = filter_executor.take(result_block, origin_rows, result_count)?; + } + + return Ok(Some(result_block)); + } + } +} + +impl<'a, const CONJUNCT: bool> OuterLeftHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + join_state: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + let num_rows = probe_data_block.num_rows(); + let pending_unmatched = match CONJUNCT { + true => vec![0; num_rows], + false => Vec::new(), + }; + + let unmatched_rows = match CONJUNCT { + true => 
Vec::new(), + false => Vec::with_capacity(num_rows), + }; + + probed_rows.unmatched.reserve(num_rows); + Box::new(OuterLeftHashJoinStream::<'a, CONJUNCT> { + desc, + join_state, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + unmatched_rows, + conjunct_unmatched: pending_unmatched, + }) + } +} + +pub fn final_result_block( + desc: &HashJoinDesc, + probe_block: Option, + build_block: Option, + num_rows: usize, +) -> DataBlock { + let mut result_block = match (probe_block, build_block) { + (Some(mut probe_block), Some(build_block)) => { + probe_block.merge_block(build_block); + probe_block + } + (Some(probe_block), None) => probe_block, + (None, Some(build_block)) => build_block, + (None, None) => DataBlock::new(vec![], num_rows), + }; + + if !desc.probe_to_build.is_empty() { + for (index, (is_probe_nullable, is_build_nullable)) in desc.probe_to_build.iter() { + let entry = match (is_probe_nullable, is_build_nullable) { + (true, true) | (false, false) => result_block.get_by_offset(*index).clone(), + (true, false) => result_block.get_by_offset(*index).clone().remove_nullable(), + (false, true) => { + let entry = result_block.get_by_offset(*index); + let col = entry.to_column(); + + match col.is_null() || col.is_nullable() { + true => entry.clone(), + false => BlockEntry::from(NullableColumn::new_column( + col, + Bitmap::new_constant(true, result_block.num_rows()), + )), + } + } + }; + + result_block.add_entry(entry); + } + } + result_block +} + +#[allow(dead_code)] +pub fn null_block(types: &[DataType], num_rows: usize) -> Option { + match types.is_empty() { + true => None, + false => { + let columns = types + .iter() + .map(|column_type| { + BlockEntry::new_const_column( + column_type.wrap_nullable(), + Scalar::Null, + num_rows, + ) + }) + .collect::>(); + + Some(DataBlock::new(columns, num_rows)) + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs new file mode 100644 index 0000000000000..0a9bfec2c6d88 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_anti.rs @@ -0,0 +1,270 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use databend_common_base::base::ProgressValues;
use databend_common_base::hints::assume;
use databend_common_column::bitmap::Bitmap;
use databend_common_exception::Result;
use databend_common_expression::DataBlock;
use databend_common_expression::FilterExecutor;
use databend_common_expression::FunctionContext;
use databend_common_expression::HashMethodKind;

use super::partitioned_build::PartitionedHashJoinState;
use super::partitioned_build::ProbeData;
use crate::pipelines::processors::HashJoinDesc;
use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::Join;
use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream;
use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows;
use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block;
use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext;

/// Left anti join: emit probe rows that have NO surviving match on the build side.
///
/// NOTE(review): generic argument lists in this file were lost in extraction
/// (angle brackets stripped); reconstructed from usage.
pub struct PartitionedLeftAntiJoin {
    build: PartitionedHashJoinState,
    desc: Arc<HashJoinDesc>,
    function_ctx: Arc<FunctionContext>,
    context: PerformanceContext,
}

impl PartitionedLeftAntiJoin {
    /// Build the join operator and its per-operator performance context.
    pub fn create(
        method: HashMethodKind,
        desc: Arc<HashJoinDesc>,
        function_ctx: FunctionContext,
        max_block_size: usize,
    ) -> Self {
        let context =
            PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone());

        let function_ctx = Arc::new(function_ctx);

        PartitionedLeftAntiJoin {
            function_ctx: function_ctx.clone(),
            build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx),
            desc,
            context,
        }
    }
}

impl Join for PartitionedLeftAntiJoin {
    fn add_block(&mut self, data: Option<DataBlock>) -> Result<()> {
        // NOTE(review): turbofish argument lost in extraction; `false` assumed.
        self.build.add_block::<false>(data)
    }

    fn final_build(&mut self) -> Result<Option<ProgressValues>> {
        self.build.final_build()
    }

    fn probe_block(&mut self, data: DataBlock) -> Result<Box<dyn JoinStream + '_>> {
        if data.is_empty() {
            return Ok(Box::new(EmptyJoinStream));
        }

        // Empty build side: anti join keeps every probe row as-is.
        if self.build.num_rows == 0 {
            let probe_projected = data.project(&self.desc.probe_projection);
            return Ok(Box::new(OneBlockJoinStream(Some(probe_projected))));
        }

        let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?;

        let mut keys = DataBlock::new(probe_keys, data.num_rows());
        let valids = match self.desc.from_correlated_subquery {
            true => None,
            false => self.desc.build_valids_by_keys(&keys)?,
        };

        self.desc.remove_keys_nullable(&mut keys);
        let probe_block = data.project(&self.desc.probe_projection);

        let probe_data = ProbeData::new(keys, valids);

        match &mut self.context.filter_executor {
            None => {
                // NOTE(review): turbofish arguments on `probe` lost in
                // extraction; `false` assumed at both call sites — confirm.
                let probe_keys_stream = self.build.probe::<false>(probe_data)?;
                Ok(LeftAntiHashJoinStream::create(
                    probe_block,
                    probe_keys_stream,
                    &mut self.context.probe_result,
                ))
            }
            Some(filter_executor) => {
                let probe_keys_stream = self.build.probe::<false>(probe_data)?;
                Ok(LeftAntiFilterHashJoinStream::create(
                    probe_block,
                    &self.build,
                    probe_keys_stream,
                    self.desc.clone(),
                    &mut self.context.probe_result,
                    filter_executor,
                ))
            }
        }
    }
}

/// Filter-less anti join: a probe row qualifies iff the probe stream reported
/// it as unmatched.
struct LeftAntiHashJoinStream<'a> {
    probe_data_block: Option<DataBlock>,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
}

impl<'a> LeftAntiHashJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        probed_rows: &'a mut ProbedRows,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(LeftAntiHashJoinStream {
            probed_rows,
            probe_data_block: Some(probe_data_block),
            probe_keys_stream,
        })
    }
}

impl<'a> JoinStream for LeftAntiHashJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        // Single-shot stream: the block is taken on the first call.
        let Some(probe_data_block) = self.probe_data_block.take() else {
            return Ok(None);
        };

        let num_rows = probe_data_block.num_rows();
        let mut selected = vec![false; num_rows];

        loop {
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                // Exhausted: emit exactly the unmatched probe rows.
                let bitmap = Bitmap::from_trusted_len_iter(selected.into_iter());
                return Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?));
            }

            for idx in &self.probed_rows.unmatched {
                selected[*idx as usize] = true;
            }
        }
    }
}

/// Anti join with residual filter: a probe row qualifies iff NONE of its
/// joined candidate rows survives the filter.
struct LeftAntiFilterHashJoinStream<'a> {
    desc: Arc<HashJoinDesc>,
    probe_data_block: Option<DataBlock>,
    build: &'a PartitionedHashJoinState,
    probe_keys_stream: Box<dyn ProbeStream + 'a>,
    probed_rows: &'a mut ProbedRows,
    filter_executor: &'a mut FilterExecutor,
}

impl<'a> LeftAntiFilterHashJoinStream<'a> {
    pub fn create(
        probe_data_block: DataBlock,
        build: &'a PartitionedHashJoinState,
        probe_keys_stream: Box<dyn ProbeStream + 'a>,
        desc: Arc<HashJoinDesc>,
        probed_rows: &'a mut ProbedRows,
        filter_executor: &'a mut FilterExecutor,
    ) -> Box<dyn JoinStream + 'a> {
        Box::new(LeftAntiFilterHashJoinStream {
            desc,
            build,
            probed_rows,
            filter_executor,
            probe_keys_stream,
            probe_data_block: Some(probe_data_block),
        })
    }
}

impl<'a> JoinStream for LeftAntiFilterHashJoinStream<'a> {
    fn next(&mut self) -> Result<Option<DataBlock>> {
        let Some(probe_data_block) = self.probe_data_block.take() else {
            return Ok(None);
        };

        let num_rows = probe_data_block.num_rows();
        // Start with every row selected; clear rows with a surviving match.
        let mut selected = vec![true; num_rows];

        loop {
            self.probed_rows.clear();
            let max_rows = self.probed_rows.matched_probe.capacity();
            self.probe_keys_stream.advance(self.probed_rows, max_rows)?;

            if self.probed_rows.is_empty() {
                break;
            }

            if self.probed_rows.is_all_unmatched() {
                continue;
            }

            let probe_block = match probe_data_block.num_columns() {
                0 => None,
                _ => Some(DataBlock::take(
                    &probe_data_block,
                    self.probed_rows.matched_probe.as_slice(),
                )?),
            };

            let build_block = match self.build.columns.is_empty() {
                true => None,
                false => {
                    let row_ptrs = self.probed_rows.matched_build.as_slice();
                    Some(DataBlock::take_column_vec(
                        self.build.columns.as_slice(),
                        self.build.column_types.as_slice(),
                        row_ptrs,
                    ))
                }
            };

            let result_block = final_result_block(
                &self.desc,
                probe_block,
                build_block,
                self.probed_rows.matched_build.len(),
            );

            let selected_rows = self.filter_executor.select(&result_block)?;

            if selected_rows == result_block.num_rows() {
                // All pairs survived: every involved probe row is disqualified.
                for probe_idx in &self.probed_rows.matched_probe {
                    assume((*probe_idx as usize) < selected.len());
                    selected[*probe_idx as usize] = false;
                }
            } else if selected_rows != 0 {
                let selection = self.filter_executor.true_selection();
                for idx in selection[..selected_rows].iter() {
                    assume((*idx as usize) < self.probed_rows.matched_probe.len());
                    let idx = self.probed_rows.matched_probe[*idx as usize];
                    assume((idx as usize) < selected.len());
                    selected[idx as usize] = false;
                }
            }
        }

        let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied());
        match bitmap.true_count() {
            0 => Ok(None),
            _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)),
        }
    }
}
diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs new file mode 100644 index 0000000000000..74cc501f616df --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/left_join_semi.rs @@ -0,0 +1,258 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; +use databend_common_column::bitmap::Bitmap; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedLeftSemiJoin { + build: PartitionedHashJoinState, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, +} + +impl PartitionedLeftSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedLeftSemiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + desc, + context, + } + } +} + +impl Join for PartitionedLeftSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> 
{ + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = match self.desc.from_correlated_subquery { + true => None, + false => self.desc.build_valids_by_keys(&keys)?, + }; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + + match &mut self.context.filter_executor { + None => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiHashJoinStream::create( + probe_block, + probe_keys_stream, + &mut self.context.probe_result, + )) + } + Some(filter_executor) => { + let probe_keys_stream = self.build.probe::(probe_data)?; + Ok(LeftSemiFilterHashJoinStream::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + filter_executor, + )) + } + } + } +} + +struct LeftSemiHashJoinStream<'a> { + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, +} + +impl<'a> LeftSemiHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + ) -> Box { + Box::new(LeftSemiHashJoinStream { + probed_rows, + probe_data_block, + probe_keys_stream, + }) + } +} + +impl<'a> JoinStream for LeftSemiHashJoinStream<'a> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + return Ok(Some(DataBlock::take( + &self.probe_data_block, + 
self.probed_rows.matched_probe.as_slice(), + )?)); + } + } +} + +struct LeftSemiFilterHashJoinStream<'a> { + desc: Arc, + probe_data_block: Option, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, +} + +impl<'a> LeftSemiFilterHashJoinStream<'a> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: &'a mut FilterExecutor, + ) -> Box { + Box::new(LeftSemiFilterHashJoinStream { + desc, + build, + probed_rows, + filter_executor, + probe_keys_stream, + probe_data_block: Some(probe_data_block), + }) + } +} + +impl<'a> JoinStream for LeftSemiFilterHashJoinStream<'a> { + fn next(&mut self) -> Result> { + let Some(probe_data_block) = self.probe_data_block.take() else { + return Ok(None); + }; + + let num_rows = probe_data_block.num_rows(); + let mut selected = vec![false; num_rows]; + + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + break; + } + + if self.probed_rows.is_all_unmatched() { + continue; + } + + let probe_block = match probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let num_matched = self.probed_rows.matched_probe.len(); + let result = final_result_block(&self.desc, probe_block, build_block, num_matched); + + let selected_rows = self.filter_executor.select(&result)?; + + if selected_rows == result.num_rows() { + 
for probe_idx in &self.probed_rows.matched_probe { + assume((*probe_idx as usize) < selected.len()); + selected[*probe_idx as usize] = true; + } + } else if selected_rows != 0 { + let selection = self.filter_executor.true_selection(); + for idx in selection[..selected_rows].iter() { + assume((*idx as usize) < self.probed_rows.matched_probe.len()); + let idx = self.probed_rows.matched_probe[*idx as usize]; + assume((idx as usize) < selected.len()); + selected[idx as usize] = true; + } + } + } + + let bitmap = Bitmap::from_trusted_len_iter(selected.iter().copied()); + + match bitmap.true_count() { + 0 => Ok(None), + _ => Ok(Some(probe_data_block.filter_with_bitmap(&bitmap)?)), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs new file mode 100644 index 0000000000000..a896bf13bc191 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/mod.rs @@ -0,0 +1,38 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod chunk_accumulator; +mod compact_hash_table; +mod inner_join; +mod left_join; +mod left_join_anti; +mod left_join_semi; +mod partitioned_build; +mod right_join; +mod right_join_anti; +mod right_join_semi; +mod transform_hash_join; + +pub use compact_hash_table::CompactJoinHashTable; +pub use compact_hash_table::RowIndex; +pub use inner_join::PartitionedInnerJoin; +pub use left_join::PartitionedLeftJoin; +pub use left_join_anti::PartitionedLeftAntiJoin; +pub use left_join_semi::PartitionedLeftSemiJoin; +pub use partitioned_build::PartitionedHashJoinState; +pub use right_join::PartitionedRightJoin; +pub use right_join_anti::PartitionedRightAntiJoin; +pub use right_join_semi::PartitionedRightSemiJoin; +pub use transform_hash_join::SharedRuntimeFilterPackets; +pub use transform_hash_join::TransformPartitionedHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs new file mode 100644 index 0000000000000..94d792bc826ba --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/partitioned_build.rs @@ -0,0 +1,945 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_base::hints::assume; +use databend_common_column::binary::BinaryColumn; +use databend_common_column::bitmap::Bitmap; +use databend_common_column::buffer::Buffer; +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::Column; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FixedKey; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethod; +use databend_common_expression::HashMethodKind; +use databend_common_expression::KeysState; +use databend_common_expression::ProjectedBlock; +use databend_common_expression::types::DataType; +use databend_common_expression::with_hash_method; +use ethnum::u256; + +use super::chunk_accumulator::FixedSizeChunkAccumulator; +use super::compact_hash_table::CompactJoinHashTable; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::partitioned::RowIndex; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::AllUnmatchedProbeStream; +use crate::pipelines::processors::transforms::unpartitioned::hashtable::basic::EmptyProbeStream; + +pub const CHUNK_BITS: usize = 16; +pub const CHUNK_SIZE: usize = 1 << CHUNK_BITS; // 65536 + +/// Convert a 1-based flat index to RowPtr (chunk_index, row_offset). 
+#[inline(always)] +pub fn flat_to_row_ptr(flat_index: usize) -> RowPtr { + let zero_based = flat_index - 1; + RowPtr { + chunk_index: (zero_based >> CHUNK_BITS) as u32, + row_index: (zero_based & (CHUNK_SIZE - 1)) as u32, + } +} + +pub struct ProbeData { + keys: DataBlock, + valids: Option, +} + +impl ProbeData { + pub fn new(keys: DataBlock, valids: Option) -> Self { + ProbeData { keys, valids } + } + + pub fn num_rows(&self) -> usize { + self.keys.num_rows() + } + + pub fn columns(&self) -> &[BlockEntry] { + self.keys.columns() + } + + pub fn non_null_rows(&self) -> usize { + match &self.valids { + None => self.keys.num_rows(), + Some(valids) => valids.len() - valids.null_count(), + } + } + + pub fn into_raw(self) -> (DataBlock, Option) { + (self.keys, self.valids) + } +} + +pub enum BuildKeysStates { + UInt8(Vec>), + UInt16(Vec>), + UInt32(Vec>), + UInt64(Vec>), + UInt128(Vec>), + UInt256(Vec>), + Binary(Vec), +} + +impl BuildKeysStates { + pub fn get(&self, idx: usize) -> KeysState { + match self { + BuildKeysStates::UInt8(v) => u8::upcast(v[idx].clone()), + BuildKeysStates::UInt16(v) => u16::upcast(v[idx].clone()), + BuildKeysStates::UInt32(v) => u32::upcast(v[idx].clone()), + BuildKeysStates::UInt64(v) => u64::upcast(v[idx].clone()), + BuildKeysStates::UInt128(v) => u128::upcast(v[idx].clone()), + BuildKeysStates::UInt256(v) => u256::upcast(v[idx].clone()), + BuildKeysStates::Binary(v) => KeysState::Column(Column::Binary(v[idx].clone())), + } + } +} + +impl BuildKeysStates { + pub fn new(method: &HashMethodKind) -> Self { + match method { + HashMethodKind::Serializer(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::SingleBinary(_) => BuildKeysStates::Binary(vec![]), + HashMethodKind::KeysU8(_) => BuildKeysStates::UInt8(vec![]), + HashMethodKind::KeysU16(_) => BuildKeysStates::UInt16(vec![]), + HashMethodKind::KeysU32(_) => BuildKeysStates::UInt32(vec![]), + HashMethodKind::KeysU64(_) => BuildKeysStates::UInt64(vec![]), + HashMethodKind::KeysU128(_) 
=> BuildKeysStates::UInt128(vec![]), + HashMethodKind::KeysU256(_) => BuildKeysStates::UInt256(vec![]), + } + } +} + +/// Maximum key range for direct hash join (same as Doris: 1 << 23 = 8M). +const DIRECT_JOIN_MAX_RANGE: u64 = 1 << 23; + +/// Per-thread build state for partitioned hash join. +pub struct PartitionedHashJoinState { + pub chunks: Vec, + pub method: HashMethodKind, + pub build_keys_states: BuildKeysStates, + pub hash_table: CompactJoinHashTable, + + pub columns: Vec, + pub column_types: Vec, + + pub num_rows: usize, + pub build_block_idx: usize, + + pub direct_join: bool, + pub min_key: u256, + + pub visited: Vec>, + pub desc: Arc, + pub function_ctx: Arc, + + pub accumulator: FixedSizeChunkAccumulator, +} + +impl PartitionedHashJoinState { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: Arc, + ) -> Self { + PartitionedHashJoinState { + chunks: Vec::new(), + build_keys_states: BuildKeysStates::new(&method), + hash_table: CompactJoinHashTable::new(0), + columns: Vec::new(), + column_types: Vec::new(), + num_rows: 0, + method, + desc, + function_ctx, + build_block_idx: 0, + direct_join: false, + min_key: u256::ZERO, + visited: vec![], + accumulator: FixedSizeChunkAccumulator::new(CHUNK_SIZE), + } + } + + pub fn add_block(&mut self, data: Option) -> Result<()> { + let Some(data_block) = data else { + if let Some(chunk) = self.accumulator.finalize() { + self.ingest_chunk::(chunk)?; + } + + return Ok(()); + }; + + let data_block = self.prepare_data::(data_block)?; + for ready_block in self.accumulator.accumulate(data_block) { + self.ingest_chunk::(ready_block)?; + } + + Ok(()) + } + + fn ingest_chunk(&mut self, chunk: DataBlock) -> Result<()> { + let num_rows = chunk.num_rows(); + let mut columns = chunk.take_columns(); + let data_columns = columns.split_off(self.desc.build_keys.len()); + + let mut keys_block = DataBlock::new(columns, num_rows); + let mut chunk = DataBlock::new(data_columns, num_rows); + if VISITED { + if let 
Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? { + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + let null_keys = chunk.clone().filter_with_bitmap(&(!(&bitmap)))?; + let nonnull_keys = chunk.filter_with_bitmap(&bitmap)?; + chunk = DataBlock::concat(&[nonnull_keys, null_keys])?; + } + } + self.desc.remove_keys_nullable(&mut keys_block); + } + + let keys = ProjectedBlock::from(keys_block.columns()); + + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => method.build_keys_state(keys, keys_block.num_rows())?, + }); + + if VISITED { + self.visited.push(vec![0u8; num_rows]); + } + + self.num_rows += num_rows; + self.chunks.push(chunk); + self.add_build_state(keys_state); + Ok(()) + } + + fn prepare_data(&self, mut chunk: DataBlock) -> Result { + let num_rows = chunk.num_rows(); + + let keys_entries = self.desc.build_key(&chunk, &self.function_ctx)?; + let mut keys_block = DataBlock::new(keys_entries, num_rows); + + chunk = chunk.project(&self.desc.build_projection); + + if !VISITED { + if let Some(bitmap) = self.desc.build_valids_by_keys(&keys_block)? 
{ + if bitmap.true_count() != bitmap.len() { + keys_block = keys_block.filter_with_bitmap(&bitmap)?; + chunk = chunk.filter_with_bitmap(&bitmap)?; + } + } + self.desc.remove_keys_nullable(&mut keys_block); + } + + keys_block.merge_block(chunk); + Ok(keys_block) + } + + pub fn final_build(&mut self) -> Result> { + if self.num_rows == 0 { + return Ok(None); + } + + if self.build_block_idx == 0 { + if let Some(first_chunk) = self.chunks.first() { + self.column_types = (0..first_chunk.num_columns()) + .map(|offset| first_chunk.get_by_offset(offset).data_type()) + .collect(); + + let num_cols = first_chunk.num_columns(); + let mut columns = Vec::with_capacity(num_cols); + for offset in 0..num_cols { + let full_columns: Vec = self + .chunks + .iter() + .map(|chunk| chunk.get_by_offset(offset).to_column()) + .collect(); + columns.push(Column::take_downcast_column_vec(&full_columns)); + } + self.columns = columns; + } + + // Decide whether to use direct mapping + let direct_range = match &self.build_keys_states { + BuildKeysStates::UInt8(_) => Some((u256::ZERO, u8::MAX as u64)), + BuildKeysStates::UInt16(_) => Some((u256::ZERO, u16::MAX as u64)), + BuildKeysStates::UInt32(bufs) => scan_min_max_u32(bufs), + BuildKeysStates::UInt64(bufs) => scan_min_max_u64(bufs), + BuildKeysStates::UInt128(bufs) => scan_min_max_u128(bufs), + BuildKeysStates::UInt256(bufs) => scan_min_max_u256(bufs), + _ => None, + }; + + match direct_range { + Some((min_key, range)) => { + self.direct_join = true; + self.min_key = min_key; + self.hash_table = + CompactJoinHashTable::new_direct(self.num_rows, range as usize); + } + None => { + self.hash_table = CompactJoinHashTable::new(self.num_rows); + } + }; + } + + let row_offset = CHUNK_SIZE * self.build_block_idx + 1; + let idx = self.build_block_idx; + + if self.direct_join { + match &self.build_keys_states { + BuildKeysStates::UInt8(states) => { + let min_t = self.min_key.as_u8(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| 
k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt16(states) => { + let min_t = self.min_key.as_u16(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt32(states) => { + let min_t = self.min_key.as_u32(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt64(states) => { + let min_t = self.min_key.as_u64(); + let adjusted: Vec = + states[idx].iter().map(|k| k.wrapping_sub(min_t)).collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt128(states) => { + let min_t = self.min_key.as_u128(); + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + BuildKeysStates::UInt256(states) => { + let min_t = self.min_key; + let adjusted: Vec = states[idx] + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + self.hash_table.insert_chunk::(&adjusted, row_offset); + } + _ => unreachable!(), + } + } else { + let keys_state = self.build_keys_states.get(idx); + with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let mut hashes = Vec::with_capacity(CHUNK_SIZE); + method.build_keys_hashes(&keys_state, &mut hashes); + self.hash_table.insert_chunk::(&hashes, row_offset); + } + }); + } + + self.build_block_idx += 1; + match self.build_block_idx == self.chunks.len() { + true => Ok(None), + false => Ok(Some(ProgressValues { rows: 0, bytes: 0 })), + } + } + + pub fn probe<'a, const MATCHED: bool, const MATCH_FIRST: bool>( + &'a self, + data: ProbeData, + ) -> Result> { + let num_rows = data.num_rows(); + let (keys_block, valids) = data.into_raw(); + let keys = 
ProjectedBlock::from(keys_block.columns()); + + if self.direct_join { + return self.probe_direct::(keys, num_rows, valids); + } + + let mut hashes = Vec::with_capacity(num_rows); + let (keys_state, matched_rows) = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => { + let keys_state = method.build_keys_state(keys, num_rows)?; + method.build_keys_hashes(&keys_state, &mut hashes); + ( + keys_state, + self.hash_table.probe::(&mut hashes, valids), + ) + } + }); + + if matched_rows == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(hashes.len())), + }; + } + + Ok(match (&self.method, &self.build_keys_states) { + (HashMethodKind::KeysU8(_), BuildKeysStates::UInt8(states)) => { + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU16(_), BuildKeysStates::UInt16(states)) => { + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU32(_), BuildKeysStates::UInt32(states)) => { + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU64(_), BuildKeysStates::UInt64(states)) => { + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU128(_), BuildKeysStates::UInt128(states)) => { + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, false, u32>::new( + 
hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + (HashMethodKind::KeysU256(_), BuildKeysStates::UInt256(states)) => { + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, false, u32>::new( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + ( + HashMethodKind::Serializer(_) | HashMethodKind::SingleBinary(_), + BuildKeysStates::Binary(states), + ) => match keys_state { + KeysState::Column(Column::Binary(probe_keys)) + | KeysState::Column(Column::Variant(probe_keys)) + | KeysState::Column(Column::Bitmap(probe_keys)) => { + BinaryProbeStream::<'a, MATCHED, MATCH_FIRST, u32>::create( + hashes, + states, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }, + _ => unreachable!(), + }) + } + + fn probe_direct<'a, const MATCHED: bool, const MATCH_FIRST: bool>( + &'a self, + keys: ProjectedBlock<'_>, + num_rows: usize, + valids: Option, + ) -> Result> { + let keys_state = with_hash_method!(|T| match &self.method { + HashMethodKind::T(method) => method.build_keys_state(keys, num_rows)?, + }); + + Ok(match &self.build_keys_states { + BuildKeysStates::UInt8(bufs) => { + let min_t = self.min_key.as_u8(); + let probe_keys = u8::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u8, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt16(bufs) => { + let min_t = self.min_key.as_u16(); + let probe_keys = u16::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if 
self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u16, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt32(bufs) => { + let min_t = self.min_key.as_u32(); + let probe_keys = u32::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u32, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt64(bufs) => { + let min_t = self.min_key.as_u64(); + let probe_keys = u64::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = + probe_keys.iter().map(|k| k.wrapping_sub(min_t)).collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u64, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt128(bufs) => { + let min_t = self.min_key.as_u128(); + let probe_keys = u128::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t) as u64) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u128, MATCHED, MATCH_FIRST, true, u32>::new( + 
adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + BuildKeysStates::UInt256(bufs) => { + let min_t = self.min_key; + let probe_keys = u256::downcast_owned(keys_state).unwrap(); + let mut adjusted: Vec = probe_keys + .iter() + .map(|k| k.wrapping_sub(min_t).as_u64()) + .collect(); + + if self.hash_table.probe::(&mut adjusted, valids) == 0 { + return match MATCHED { + true => Ok(Box::new(EmptyProbeStream)), + false => Ok(AllUnmatchedProbeStream::create(adjusted.len())), + }; + } + PrimitiveProbeStream::<'a, u256, MATCHED, MATCH_FIRST, true, u32>::new( + adjusted, + bufs, + probe_keys, + &self.hash_table.next, + ) + } + _ => unreachable!(), + }) + } + + fn add_build_state(&mut self, state: KeysState) { + match &mut self.build_keys_states { + BuildKeysStates::UInt8(states) => { + states.push(u8::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt16(states) => { + states.push(u16::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt32(states) => { + states.push(u32::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt64(states) => { + states.push(u64::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt128(states) => { + states.push(u128::downcast_owned(state).unwrap()); + } + BuildKeysStates::UInt256(states) => { + states.push(u256::downcast_owned(state).unwrap()); + } + BuildKeysStates::Binary(states) => match state { + KeysState::Column(Column::Binary(build_keys)) + | KeysState::Column(Column::Variant(build_keys)) + | KeysState::Column(Column::Bitmap(build_keys)) => { + states.push(build_keys); + } + _ => unreachable!(), + }, + }; + } +} + +struct PrimitiveProbeStream< + 'a, + T: Send + Sync + PartialEq, + const MATCHED: bool, + const MATCH_FIRST: bool, + const DIRECT: bool, + I: RowIndex = u32, +> { + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: Buffer, + build_keys: &'a [Buffer], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const 
DIRECT: bool, I> + PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + T: Send + Sync + PartialEq, + I: RowIndex, +{ + #[allow(clippy::new_ret_no_self)] + pub fn new( + pointers: Vec, + build_keys: &'a [Buffer], + probe_keys: Buffer, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, T, const MATCHED: bool, const MATCH_FIRST: bool, const DIRECT: bool, I> ProbeStream + for PrimitiveProbeStream<'a, T, MATCHED, MATCH_FIRST, DIRECT, I> +where + I: RowIndex, + T: Send + Sync + PartialEq, +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + + let key_match = DIRECT + || self.probe_keys[self.key_idx] + == self.build_keys[row_ptr.chunk_index as usize] + [row_ptr.row_index as usize]; + + if key_match { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + + if res.matched_probe.len() == max_rows { + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + + if MATCH_FIRST { + self.build_idx = 0; + break; + } + } + + self.build_idx 
= self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } +} + +struct BinaryProbeStream<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex = u32> { + key_idx: usize, + pointers: Vec, + build_idx: usize, + probe_keys: BinaryColumn, + build_keys: &'a [BinaryColumn], + next: &'a [I], + matched_num_rows: usize, +} + +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> + BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ + pub fn create( + pointers: Vec, + build_keys: &'a [BinaryColumn], + probe_keys: BinaryColumn, + next: &'a [I], + ) -> Box { + Box::new(Self { + next, + pointers, + probe_keys, + build_keys, + key_idx: 0, + build_idx: 0, + matched_num_rows: 0, + }) + } +} + +impl<'a, const MATCHED: bool, const MATCH_FIRST: bool, I: RowIndex> ProbeStream + for BinaryProbeStream<'a, MATCHED, MATCH_FIRST, I> +{ + fn advance(&mut self, res: &mut ProbedRows, max_rows: usize) -> Result<()> { + while self.key_idx < self.probe_keys.len() { + assume(res.matched_probe.len() == res.matched_build.len()); + assume(res.matched_build.len() < res.matched_build.capacity()); + assume(res.matched_probe.len() < res.matched_probe.capacity()); + assume(self.key_idx < self.pointers.len()); + + if res.matched_probe.len() == max_rows { + break; + } + + if self.build_idx == 0 { + self.build_idx = self.pointers[self.key_idx].to_usize(); + + if self.build_idx == 0 { + if !MATCHED { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + continue; + } + } + + while self.build_idx != 0 { + let row_ptr = flat_to_row_ptr(self.build_idx); + if self.probe_keys.value(self.key_idx) + == self.build_keys[row_ptr.chunk_index as usize] + .value(row_ptr.row_index as usize) + { + res.matched_build.push(row_ptr); + res.matched_probe.push(self.key_idx as u64); + self.matched_num_rows += 1; + 
+ if res.matched_probe.len() == max_rows { + self.build_idx = match MATCH_FIRST { + true => 0, + false => self.next[self.build_idx].to_usize(), + }; + + if self.build_idx == 0 { + self.key_idx += 1; + self.matched_num_rows = 0; + } + + return Ok(()); + } + + if MATCH_FIRST { + self.build_idx = 0; + break; + } + } + + self.build_idx = self.next[self.build_idx].to_usize(); + } + + if !MATCHED && self.matched_num_rows == 0 { + res.unmatched.push(self.key_idx as u64); + } + + self.key_idx += 1; + self.matched_num_rows = 0; + } + + Ok(()) + } +} + +/// Scan min/max with short-circuit per chunk. Returns Some((min as u256, range)) if range <= threshold. +fn scan_min_max_u32(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u32::MAX; + let mut max_val = u32::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if (max_val as u64).wrapping_sub(min_val as u64) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val as u64) - (min_val as u64))) +} + +fn scan_min_max_u64(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u64::MAX; + let mut max_val = u64::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), max_val - min_val)) +} + +fn scan_min_max_u128(buffers: &[Buffer]) -> Option<(u256, u64)> { + let mut min_val = u128::MAX; + let mut max_val = u128::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > DIRECT_JOIN_MAX_RANGE as u128 { + return None; + } + } + if min_val > max_val { + return None; + } + Some((u256::from(min_val), (max_val - min_val) as u64)) +} + +fn scan_min_max_u256(buffers: &[Buffer]) 
-> Option<(u256, u64)> { + let mut min_val = u256::MAX; + let mut max_val = u256::MIN; + for buf in buffers { + for &k in buf.iter() { + min_val = min_val.min(k); + max_val = max_val.max(k); + } + if max_val.wrapping_sub(min_val) > u256::from(DIRECT_JOIN_MAX_RANGE) { + return None; + } + } + if min_val > max_val { + return None; + } + Some((min_val, (max_val - min_val).as_u64())) +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs new file mode 100644 index 0000000000000..6f5399e32623d --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join.rs @@ -0,0 +1,322 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::null_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; +use crate::pipelines::processors::transforms::wrap_nullable_block; + +pub struct PartitionedRightJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightJoin { + function_ctx: 
function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join for PartitionedRightJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let mut probe_keys = { + let nullable_block = wrap_nullable_block(&data); + let probe_keys = self.desc.probe_key(&nullable_block, &self.function_ctx)?; + DataBlock::new(probe_keys, data.num_rows()) + }; + + let valids = self.desc.build_valids_by_keys(&probe_keys)?; + + self.desc.remove_keys_nullable(&mut probe_keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(probe_keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(OuterRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + let mut probe_types = Vec::new(); + for (i, field) in self.desc.probe_schema.fields().iter().enumerate() { + if self.desc.probe_projection.contains(&i) { + probe_types.push(field.data_type().clone()); + } + } + + Ok(Some(Box::new(PartitionedRightFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, 
+ max_block_size: self.max_block_size, + desc: self.desc.clone(), + probe_types, + }))) + } +} + +struct OuterRightHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, +} + +impl<'a, const CONJUNCT: bool> OuterRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(OuterRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) + } +} + +impl<'a, const CONJUNCT: bool> JoinStream for OuterRightHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(wrap_nullable_block(&DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?)), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let data_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + 
.add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + return Ok(Some(data_block)); + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + return Ok(Some(data_block)); + }; + + if !data_block.is_empty() { + let res_rows = filter_executor.select(&data_block)?; + + if res_rows == 0 { + continue; + } + + let true_sel = filter_executor.true_selection(); + + for idx in true_sel.iter().take(res_rows) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + + let num_rows = data_block.num_rows(); + return Ok(Some(filter_executor.take(data_block, num_rows, res_rows)?)); + } + } + } +} + +struct PartitionedRightFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, + desc: Arc, + probe_types: Vec, +} + +impl<'a> JoinStream for PartitionedRightFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + let probe_block = null_block(&self.probe_types, row_ptrs.len()); + let build_block = if self.columns.is_empty() { + None + } else { + Some(DataBlock::take_column_vec( + 
self.columns, + self.column_types, + &row_ptrs, + )) + }; + + Ok(Some(final_result_block( + &self.desc, + probe_block, + build_block, + row_ptrs.len(), + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs new file mode 100644 index 0000000000000..525a956837151 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_anti.rs @@ -0,0 +1,170 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use super::right_join_semi::SemiRightHashJoinStream; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedRightAntiJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightAntiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightAntiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join for PartitionedRightAntiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 
{ + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightAntiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, + max_block_size: self.max_block_size, + }))) + } +} + +struct PartitionedRightAntiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightAntiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] == 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= 
chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs new file mode 100644 index 0000000000000..c41d5995ad605 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/right_join_semi.rs @@ -0,0 +1,291 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_base::base::ProgressValues; +use databend_common_exception::Result; +use databend_common_expression::ColumnVec; +use databend_common_expression::DataBlock; +use databend_common_expression::FilterExecutor; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_expression::types::DataType; + +use super::partitioned_build::PartitionedHashJoinState; +use super::partitioned_build::ProbeData; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::hash_join_table::RowPtr; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::final_result_block; +use crate::pipelines::processors::transforms::unpartitioned::PerformanceContext; + +pub struct PartitionedRightSemiJoin { + build: PartitionedHashJoinState, + max_block_size: usize, + desc: Arc, + function_ctx: Arc, + context: PerformanceContext, + finished: bool, +} + +impl PartitionedRightSemiJoin { + pub fn create( + method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Self { + let context = + PerformanceContext::create(max_block_size, desc.clone(), function_ctx.clone()); + + let function_ctx = Arc::new(function_ctx); + + PartitionedRightSemiJoin { + function_ctx: function_ctx.clone(), + build: PartitionedHashJoinState::create(method, desc.clone(), function_ctx), + max_block_size, + desc, + context, + finished: false, + } + } +} + +impl Join 
for PartitionedRightSemiJoin { + fn add_block(&mut self, data: Option) -> Result<()> { + self.build.add_block::(data) + } + + fn final_build(&mut self) -> Result> { + self.build.final_build() + } + + fn probe_block(&mut self, data: DataBlock) -> Result> { + if data.is_empty() || self.build.num_rows == 0 { + return Ok(Box::new(EmptyJoinStream)); + } + + let probe_keys = self.desc.probe_key(&data, &self.function_ctx)?; + let mut keys = DataBlock::new(probe_keys, data.num_rows()); + let valids = self.desc.build_valids_by_keys(&keys)?; + + self.desc.remove_keys_nullable(&mut keys); + let probe_block = data.project(&self.desc.probe_projection); + + let probe_data = ProbeData::new(keys, valids); + let probe_keys_stream = self.build.probe::(probe_data)?; + + match self.context.filter_executor.as_mut() { + None => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + None, + )), + Some(filter_executor) => Ok(SemiRightHashJoinStream::::create( + probe_block, + &self.build, + probe_keys_stream, + self.desc.clone(), + &mut self.context.probe_result, + Some(filter_executor), + )), + } + } + + fn final_probe(&mut self) -> Result>> { + if self.finished || self.build.num_rows == 0 { + return Ok(None); + } + self.finished = true; + + Ok(Some(Box::new(PartitionedRightSemiFinalStream { + columns: &self.build.columns, + column_types: &self.build.column_types, + visited: &self.build.visited, + chunk_idx: 0, + row_idx: 0, + max_block_size: self.max_block_size, + }))) + } +} + +pub(super) struct SemiRightHashJoinStream<'a, const CONJUNCT: bool> { + desc: Arc, + probe_data_block: DataBlock, + build: &'a PartitionedHashJoinState, + probe_keys_stream: Box, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, +} + +impl<'a, const CONJUNCT: bool> SemiRightHashJoinStream<'a, CONJUNCT> { + pub fn create( + probe_data_block: DataBlock, + build: &'a 
PartitionedHashJoinState, + probe_keys_stream: Box, + desc: Arc, + probed_rows: &'a mut ProbedRows, + filter_executor: Option<&'a mut FilterExecutor>, + ) -> Box { + Box::new(SemiRightHashJoinStream::<'a, CONJUNCT> { + desc, + build, + probed_rows, + probe_data_block, + probe_keys_stream, + filter_executor, + }) + } +} + +impl<'a, const CONJUNCT: bool> JoinStream for SemiRightHashJoinStream<'a, CONJUNCT> { + fn next(&mut self) -> Result> { + loop { + self.probed_rows.clear(); + let max_rows = self.probed_rows.matched_probe.capacity(); + self.probe_keys_stream.advance(self.probed_rows, max_rows)?; + + if self.probed_rows.is_empty() { + return Ok(None); + } + + if self.probed_rows.matched_probe.is_empty() { + continue; + } + + if !CONJUNCT { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + continue; + } + + let Some(filter_executor) = self.filter_executor.as_mut() else { + for row_ptr in &self.probed_rows.matched_build { + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + continue; + }; + + let probe_block = match self.probe_data_block.num_columns() { + 0 => None, + _ => Some(DataBlock::take( + &self.probe_data_block, + self.probed_rows.matched_probe.as_slice(), + )?), + }; + + let build_block = match self.build.columns.is_empty() { + true => None, + false => { + let row_ptrs = self.probed_rows.matched_build.as_slice(); + Some(DataBlock::take_column_vec( + self.build.columns.as_slice(), + self.build.column_types.as_slice(), + row_ptrs, + )) + } + }; + + let result_block = final_result_block( + &self.desc, + probe_block, + build_block, + self.probed_rows.matched_build.len(), + ); + + if !result_block.is_empty() { + let result_count = filter_executor.select(&result_block)?; + + if result_count == 0 { + continue; + } + + let true_sel = 
filter_executor.true_selection(); + + for idx in true_sel.iter().take(result_count) { + let row_ptr = self.probed_rows.matched_build[*idx as usize]; + unsafe { + *self.build.visited[row_ptr.chunk_index as usize] + .as_ptr() + .add(row_ptr.row_index as usize) + .cast_mut() = 1; + } + } + } + } + } +} + +struct PartitionedRightSemiFinalStream<'a> { + columns: &'a Vec, + column_types: &'a Vec, + visited: &'a Vec>, + chunk_idx: usize, + row_idx: usize, + max_block_size: usize, +} + +impl<'a> JoinStream for PartitionedRightSemiFinalStream<'a> { + fn next(&mut self) -> Result> { + let mut row_ptrs = Vec::with_capacity(self.max_block_size); + while self.chunk_idx < self.visited.len() && row_ptrs.len() < self.max_block_size { + let chunk = &self.visited[self.chunk_idx]; + while self.row_idx < chunk.len() && row_ptrs.len() < self.max_block_size { + if chunk[self.row_idx] != 0 { + row_ptrs.push(RowPtr { + chunk_index: self.chunk_idx as u32, + row_index: self.row_idx as u32, + }); + } + self.row_idx += 1; + } + if self.row_idx >= chunk.len() { + self.chunk_idx += 1; + self.row_idx = 0; + } + } + + if row_ptrs.is_empty() { + return Ok(None); + } + + if self.columns.is_empty() { + return Ok(Some(DataBlock::new(vec![], row_ptrs.len()))); + } + Ok(Some(DataBlock::take_column_vec( + self.columns, + self.column_types, + &row_ptrs, + ))) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs new file mode 100644 index 0000000000000..27a6edaa92a5a --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/partitioned/transform_hash_join.rs @@ -0,0 +1,544 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::BTreeSet; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::PoisonError; +use std::time::Instant; + +use databend_common_base::base::Barrier; +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_expression::FunctionContext; +use databend_common_expression::HashMethodKind; +use databend_common_pipeline::core::Event; +use databend_common_pipeline::core::InputPort; +use databend_common_pipeline::core::OutputPort; +use databend_common_pipeline::core::Processor; +use databend_common_pipeline::core::ProcessorPtr; +use databend_common_sql::plans::JoinType; +use log::info; + +use super::PartitionedInnerJoin; +use super::PartitionedLeftAntiJoin; +use super::PartitionedLeftJoin; +use super::PartitionedLeftSemiJoin; +use super::PartitionedRightAntiJoin; +use super::PartitionedRightJoin; +use super::PartitionedRightSemiJoin; +use crate::pipelines::processors::HashJoinDesc; +use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; +use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; + +pub struct SharedRuntimeFilterPackets { + packets: Mutex>, +} + +impl SharedRuntimeFilterPackets { + pub fn create() -> Arc { + Arc::new(SharedRuntimeFilterPackets { + packets: Mutex::new(Vec::new()), + }) + } + + pub fn merge_packet(&self, mut my_packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.packets.lock(); + let mut guard = locked.unwrap_or_else(PoisonError::into_inner); + + if guard.is_empty() { + guard.push(my_packet); + return Ok(()); + } + + let other = guard.pop().unwrap(); + drop(guard); + my_packet = merge_two_runtime_filter_packets(my_packet, other)?; + } + } + + pub fn take_packet(&self) -> Option { + let mut guard = self.packets.lock().unwrap_or_else(PoisonError::into_inner); + guard.pop() + } +} + +pub struct TransformPartitionedHashJoin { + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + + stage: Stage, + join: Box, + joined_data: Option, + + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + runtime_filter_builder: Option, + shared_rf_packets: Arc, + instant: Instant, +} + +impl TransformPartitionedHashJoin { + pub fn create( + build_port: Arc, + probe_port: Arc, + joined_port: Arc, + join: Box, + stage_sync_barrier: Arc, + projection: BTreeSet, + rf_desc: Arc, + shared_rf_packets: Arc, + ) -> Result { + let runtime_filter_builder = RuntimeFilterLocalBuilder::try_create( + &rf_desc.func_ctx, + rf_desc.filters_desc.clone(), + rf_desc.inlist_threshold, + rf_desc.bloom_threshold, + rf_desc.min_max_threshold, + rf_desc.spatial_threshold, + )?; + + Ok(ProcessorPtr::create(Box::new( + TransformPartitionedHashJoin { + build_port, + probe_port, + joined_port, + join, + rf_desc, + projection, + stage_sync_barrier, + shared_rf_packets, + joined_data: None, + runtime_filter_builder, + stage: Stage::Build(BuildState { + finished: false, + build_data: None, + }), + instant: Instant::now(), + }, + ))) + } 
+ + pub fn create_join( + typ: JoinType, + hash_method: HashMethodKind, + desc: Arc, + function_ctx: FunctionContext, + max_block_size: usize, + ) -> Box { + match typ { + JoinType::Inner => Box::new(PartitionedInnerJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Left => Box::new(PartitionedLeftJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftAnti => Box::new(PartitionedLeftAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::LeftSemi => Box::new(PartitionedLeftSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::Right => Box::new(PartitionedRightJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightSemi => Box::new(PartitionedRightSemiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + JoinType::RightAnti => Box::new(PartitionedRightAntiJoin::create( + hash_method, + desc, + function_ctx, + max_block_size, + )), + _ => unreachable!(), + } + } +} + +#[async_trait::async_trait] +impl Processor for TransformPartitionedHashJoin { + fn name(&self) -> String { + String::from("TransformPartitionedHashJoin") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.joined_port.is_finished() { + self.build_port.finish(); + self.probe_port.finish(); + + if !matches!(self.stage, Stage::Finished) { + self.stage = Stage::Finished; + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + self.stage_sync_barrier.reduce_quorum(1); + drop(finished); + } + + return Ok(Event::Finished); + } + + if !self.joined_port.can_push() { + match self.stage { + Stage::Build(_) => self.build_port.set_not_need_data(), + Stage::Probe(_) => self.probe_port.set_not_need_data(), + Stage::BuildFinal(_) | Stage::ProbeFinal(_) | Stage::Finished => (), + } + return Ok(Event::NeedConsume); + } 
+ + if let Some(joined_data) = self.joined_data.take() { + let joined_data = joined_data.project(&self.projection); + self.joined_port.push_data(Ok(joined_data)); + return Ok(Event::NeedConsume); + } + + match &mut self.stage { + Stage::Build(state) => state.event(&self.build_port), + Stage::BuildFinal(state) => state.event(), + Stage::Probe(state) => state.event(&self.probe_port), + Stage::ProbeFinal(state) => state.event(&self.joined_port), + Stage::Finished => Ok(Event::Finished), + } + } + + fn process(&mut self) -> Result<()> { + match &mut self.stage { + Stage::Finished => Ok(()), + Stage::Build(state) => { + let Some(data_block) = state.build_data.take() else { + if !state.finished { + state.finished = true; + self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.shared_rf_packets.merge_packet(packet)?; + } + } + return Ok(()); + }; + + if !data_block.is_empty() { + if let Some(builder) = self.runtime_filter_builder.as_mut() { + builder.add_block(&data_block)?; + } + self.join.add_block(Some(data_block))?; + } + + Ok(()) + } + Stage::BuildFinal(state) => { + state.finished = self.join.final_build()?.is_none(); + Ok(()) + } + Stage::Probe(state) => { + if let Some(probe_data) = state.input_data.take() { + let stream = self.join.probe_block(probe_data)?; + state.stream = Some(unsafe { + std::mem::transmute::, Box>(stream) + }); + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? { + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } + } + + Ok(()) + } + Stage::ProbeFinal(state) => { + if state.stream.is_none() { + if let Some(final_stream) = self.join.final_probe()? 
{ + state.initialize = true; + state.stream = Some(unsafe { + std::mem::transmute::, Box>( + final_stream, + ) + }); + } else { + state.finished = true; + } + } + + if let Some(mut stream) = state.stream.take() { + if let Some(joined_data) = stream.next()? { + self.joined_data = Some(joined_data); + state.stream = Some(stream); + } else { + state.initialize = false; + } + } + + Ok(()) + } + } + } + + async fn async_process(&mut self) -> Result<()> { + let elapsed = self.instant.elapsed(); + + self.stage = match &mut self.stage { + Stage::Build(_) => { + let wait_res = self.stage_sync_barrier.wait().await; + + let rf_build_elapsed = self.instant.elapsed() - elapsed; + let _wait_res = self.stage_sync_barrier.wait().await; + let before_wait = self.instant.elapsed(); + + if wait_res.is_leader() { + let packet = self + .shared_rf_packets + .take_packet() + .unwrap_or_else(|| JoinRuntimeFilterPacket::complete_without_filters(0)); + info!( + "spilled: false, globalize runtime filter: total {}, disable_all_due_to_spill: {}", + packet.packets.as_ref().map_or(0, |p| p.len()), + packet.disable_all_due_to_spill + ); + self.rf_desc.globalization(packet).await?; + } + + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_rf_elapsed = self.instant.elapsed() - before_wait; + + log::info!( + "PartitionedHashJoin build stage, sync work elapsed: {:?}, build rf elapsed: {:?}, wait other node rf elapsed: {:?}", + elapsed, + rf_build_elapsed, + wait_rf_elapsed + ); + + self.instant = Instant::now(); + Stage::BuildFinal(BuildFinalState::new()) + } + // BuildFinal → Probe: barrier + Stage::BuildFinal(_) => { + let _wait_res = self.stage_sync_barrier.wait().await; + let wait_elapsed = self.instant.elapsed() - elapsed; + log::info!( + "PartitionedHashJoin build final stage, sync work elapsed: {:?}, wait elapsed: {:?}", + elapsed, + wait_elapsed + ); + + self.instant = Instant::now(); + Stage::Probe(ProbeState::new()) + } + // Probe → ProbeFinal: no barrier + Stage::Probe(_) => 
{ + log::info!("PartitionedHashJoin probe stage elapsed: {:?}", elapsed); + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState::new()) + } + // ProbeFinal → Finished or continue: no barrier + Stage::ProbeFinal(state) => match state.finished { + true => { + log::info!( + "PartitionedHashJoin probe final stage elapsed: {:?}", + elapsed + ); + self.instant = Instant::now(); + + let mut finished = FinishedJoin::create(); + std::mem::swap(&mut finished, &mut self.join); + drop(finished); + + Stage::Finished + } + false => { + self.instant = Instant::now(); + Stage::ProbeFinal(ProbeFinalState { + initialize: true, + finished: state.finished, + stream: state.stream.take(), + }) + } + }, + Stage::Finished => Stage::Finished, + }; + + Ok(()) + } +} + +#[derive(Debug)] +enum Stage { + Build(BuildState), + BuildFinal(BuildFinalState), + Probe(ProbeState), + ProbeFinal(ProbeFinalState), + Finished, +} + +#[derive(Debug)] +struct BuildState { + finished: bool, + build_data: Option, +} + +impl BuildState { + pub fn event(&mut self, input: &InputPort) -> Result { + if self.build_data.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.build_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + }; + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +#[derive(Debug)] +struct BuildFinalState { + finished: bool, +} + +impl BuildFinalState { + pub fn new() -> BuildFinalState { + BuildFinalState { finished: false } + } + + pub fn event(&mut self) -> Result { + match self.finished { + true => Ok(Event::Async), + false => Ok(Event::Sync), + } + } +} + +struct ProbeState { + input_data: Option, + stream: Option>, +} + +impl Debug for ProbeState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeState").finish() + } +} + +impl ProbeState { + pub fn new() -> ProbeState { + 
ProbeState { + input_data: None, + stream: None, + } + } + + pub fn event(&mut self, input: &InputPort) -> Result { + if self.input_data.is_some() || self.stream.is_some() { + return Ok(Event::Sync); + } + + if input.has_data() { + self.input_data = Some(input.pull_data().unwrap()?); + return Ok(Event::Sync); + } + + if input.is_finished() { + return Ok(Event::Async); + } + + input.set_need_data(); + Ok(Event::NeedData) + } +} + +struct ProbeFinalState { + finished: bool, + initialize: bool, + stream: Option>, +} + +impl Debug for ProbeFinalState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProbeFinalState") + .field("initialized", &self.finished) + .finish() + } +} + +impl ProbeFinalState { + pub fn new() -> ProbeFinalState { + ProbeFinalState { + stream: None, + finished: false, + initialize: false, + } + } + + pub fn event(&mut self, output_port: &OutputPort) -> Result { + if self.stream.is_some() { + return Ok(Event::Sync); + } + + if self.finished { + output_port.finish(); + return Ok(Event::Async); + } + + match self.initialize { + true => Ok(Event::Sync), + false => Ok(Event::Async), + } + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs index e60310020408d..b5e98ef2e7587 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_join.rs @@ -29,15 +29,15 @@ use databend_common_pipeline_transforms::traits::Location; use databend_common_storage::DataOperator; use 
databend_common_storages_parquet::ReadSettings; +use super::grace_memory::GraceMemoryJoin; +use super::grace_state::GraceHashJoinState; +use super::grace_state::SpillMetadata; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::get_hashes; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_memory::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::GraceHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::grace::grace_state::SpillMetadata; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; use crate::sessions::QueryContext; use crate::spillers::Layout; use crate::spillers::SpillAdapter; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs similarity index 87% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs index cfc7fae6c05da..e4b7d4fe7a0f5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_memory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_memory.rs @@ -14,16 +14,16 @@ use std::sync::PoisonError; +use super::super::memory::AntiLeftHashJoin; +use super::super::memory::AntiRightHashJoin; +use 
super::super::memory::OuterRightHashJoin; +use super::super::memory::SemiLeftHashJoin; +use super::super::memory::SemiRightHashJoin; +use super::super::memory::left_join::OuterLeftHashJoin; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; pub trait GraceMemoryJoin: Join { fn reset_memory(&mut self); diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/grace_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/grace_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/grace/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/grace/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs 
b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs index 33482a95cd320..fc14bc04d2d75 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hash_join_factory.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hash_join_factory.rs @@ -27,7 +27,7 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; use databend_common_storages_fuse::TableContext; -use super::common::CStyleCell; +use super::super::common::CStyleCell; use super::grace::GraceHashJoinState; use super::grace::GraceMemoryJoin; use super::hybrid::HybridHashJoin; @@ -39,12 +39,12 @@ use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::InnerHashJoin; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::AntiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::AntiRightHashJoin; -use crate::pipelines::processors::transforms::memory::OuterRightHashJoin; -use crate::pipelines::processors::transforms::memory::SemiLeftHashJoin; -use crate::pipelines::processors::transforms::memory::SemiRightHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::OuterLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::AntiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::OuterRightHashJoin; +use 
crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiLeftHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::SemiRightHashJoin; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::memory::left_join::OuterLeftHashJoin; use crate::sessions::QueryContext; pub struct HashJoinFactory { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs new file mode 100644 index 0000000000000..cb6f7c35e7cbc --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/basic.rs @@ -0,0 +1,53 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use databend_common_exception::Result; + +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; + +pub struct EmptyProbeStream; + +impl ProbeStream for EmptyProbeStream { + fn advance(&mut self, _res: &mut ProbedRows, _max_rows: usize) -> Result<()> { + Ok(()) + } +} + +pub struct AllUnmatchedProbeStream { + idx: u64, + size: u64, +} + +impl AllUnmatchedProbeStream { + pub fn create(size: usize) -> Box { + Box::new(AllUnmatchedProbeStream { + idx: 0, + size: size as u64, + }) + } +} + +impl ProbeStream for AllUnmatchedProbeStream { + fn advance(&mut self, rows: &mut ProbedRows, max_rows: usize) -> Result<()> { + if self.idx >= self.size { + return Ok(()); + } + + let unmatched_rows = std::cmp::min(self.size - self.idx, max_rows as u64); + rows.unmatched.extend(self.idx..self.idx + unmatched_rows); + self.idx += unmatched_rows; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs index 82ea33dbe7ae6..a428a109385c1 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/fixed_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/fixed_keys.rs @@ -22,16 +22,16 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::ProjectedBlock; use databend_common_hashtable::HashtableKeyable; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use 
crate::pipelines::processors::transforms::FixedKeyHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RawEntry; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl FixedKeyHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/mod.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs index 
0416169c7cb23..079240f90ec68 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/serialize_keys.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/serialize_keys.rs @@ -22,17 +22,17 @@ use databend_common_expression::KeyAccessor; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; use crate::pipelines::processors::transforms::SerializerHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; impl SerializerHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs similarity index 92% rename from 
src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs index fe6d12d307560..523d428258045 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hashtable/single_binary_key.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hashtable/single_binary_key.rs @@ -20,18 +20,18 @@ use databend_common_expression::HashMethodSingleBinary; use databend_common_expression::KeysState; use databend_common_expression::ProjectedBlock; +use super::ProbeData; +use super::basic::AllUnmatchedProbeStream; +use super::basic::EmptyProbeStream; +use super::serialize_keys::BinaryKeyProbeStream; +use super::serialize_keys::EarlyFilteringProbeStream; use crate::pipelines::processors::transforms::SingleBinaryHashJoinHashTable; use crate::pipelines::processors::transforms::hash_join_table::BinaryHashJoinHashMap; use crate::pipelines::processors::transforms::hash_join_table::HashJoinHashtableLike; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; use crate::pipelines::processors::transforms::hash_join_table::STRING_EARLY_SIZE; use crate::pipelines::processors::transforms::hash_join_table::StringRawEntry; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::AllUnmatchedProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::EmptyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::BinaryKeyProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::serialize_keys::EarlyFilteringProbeStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; impl SingleBinaryHashJoinHashTable { pub fn new( diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs index d78a4b1c625ab..c8d2bd46e102c 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_join.rs @@ -24,14 +24,14 @@ use databend_common_expression::HashMethodKind; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::plans::JoinType; +use super::hybrid_state::HybridHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::GraceHashJoin; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceMemoryJoin; -use crate::pipelines::processors::transforms::new_hash_join::hybrid::hybrid_state::HybridHashJoinState; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceMemoryJoin; use crate::sessions::QueryContext; /// Hybrid hash join mode: @@ -212,7 +212,7 @@ impl 
Join for HybridHashJoin { } } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { match &self.mode { HybridJoinMode::Memory(join) => join.add_runtime_filter_packet(packet), HybridJoinMode::Grace(join) => join.add_runtime_filter_packet(packet), diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs similarity index 96% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs index 6d6cf34197208..4917ebfd0bda8 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/hybrid_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/hybrid_state.rs @@ -24,7 +24,7 @@ use databend_common_sql::plans::JoinType; use crate::pipelines::processors::transforms::HashJoinFactory; use crate::pipelines::processors::transforms::HybridHashJoin; -use crate::pipelines::processors::transforms::new_hash_join::grace::GraceHashJoinState; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::grace::GraceHashJoinState; use crate::sessions::QueryContext; pub struct HybridHashJoinState { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/hybrid/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/hybrid/mod.rs diff --git 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs similarity index 100% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/basic_state.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/basic_state.rs diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs similarity index 75% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs index a51e593766a7b..5881035096fd0 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/inner_join.rs @@ -22,26 +22,26 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; -use databend_common_expression::FilterExecutor; use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; use 
databend_common_settings::Settings; +use super::super::performance::PerformanceContext; use super::basic::BasicHashJoin; use super::basic_state::BasicHashJoinState; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::InnerHashJoinFilterStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; pub struct InnerHashJoin { pub(crate) basic_hash_join: BasicHashJoin, @@ -50,10 +50,6 @@ pub struct InnerHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: 
PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl InnerHashJoin { @@ -66,10 +62,6 @@ impl InnerHashJoin { nested_loop_join_threshold: usize, ) -> Result { let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -88,10 +80,6 @@ impl InnerHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } @@ -105,21 +93,27 @@ impl Join for InnerHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - 
self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { @@ -275,42 +269,3 @@ impl<'a> JoinStream for InnerHashJoinStream<'a> { } } } - -pub(super) struct InnerHashJoinFilterStream<'a> { - inner: Box, - filter_executor: &'a mut FilterExecutor, -} - -impl<'a> InnerHashJoinFilterStream<'a> { - pub fn create( - inner: Box, - filter_executor: &'a mut FilterExecutor, - ) -> Box { - Box::new(InnerHashJoinFilterStream { - inner, - filter_executor, - }) - } -} - -impl<'a> JoinStream for InnerHashJoinFilterStream<'a> { - fn next(&mut self) -> Result> { - loop { - let Some(data_block) = self.inner.next()? else { - return Ok(None); - }; - - if data_block.is_empty() { - continue; - } - - let data_block = self.filter_executor.filter(data_block)?; - - if data_block.is_empty() { - continue; - } - - return Ok(Some(data_block)); - } - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs similarity index 95% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs index f0447e941c71b..9dfca00690583 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join.rs @@ -30,18 +30,18 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use 
crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_true_validity; use crate::sessions::QueryContext; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs similarity index 93% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs index fe569f901d7ad..5977c3930c588 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_anti.rs @@ -27,19 +27,19 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::OneBlockJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::OneBlockJoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiLeftHashJoin { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs similarity index 86% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs index 4ee2e762ac589..dfae7ab8aae78 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/left_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/left_join_semi.rs @@ -30,19 +30,19 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::NullableColumn; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiLeftHashJoin { @@ -52,10 +52,6 @@ pub struct SemiLeftHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, } impl SemiLeftHashJoin { @@ -68,10 +64,6 @@ impl SemiLeftHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiLeftHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, }) } } @@ -107,21 +95,27 @@ impl Join for SemiLeftHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs similarity index 97% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs index 150758af79896..5c09205bdc733 100644 --- 
a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/mod.rs @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod basic; +pub(crate) mod basic; mod basic_state; mod inner_join; pub mod left_join; mod left_join_anti; mod left_join_semi; +mod nested_loop; mod right_join; mod right_join_anti; mod right_join_semi; @@ -26,9 +27,7 @@ pub use basic_state::BasicHashJoinState; pub use inner_join::InnerHashJoin; pub use left_join_anti::AntiLeftHashJoin; pub use left_join_semi::SemiLeftHashJoin; +pub use nested_loop::*; pub use right_join::OuterRightHashJoin; pub use right_join_anti::AntiRightHashJoin; pub use right_join_semi::SemiRightHashJoin; -mod nested_loop; - -pub use nested_loop::*; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs similarity index 98% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs index 89c67354391ed..21529fc43bd9d 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/nested_loop.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/nested_loop.rs @@ -30,7 +30,7 @@ use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::JoinStream; use crate::pipelines::processors::transforms::NestedLoopDesc; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::new_hash_join::join::EmptyJoinStream; +use 
crate::pipelines::processors::transforms::new_hash_join::common::join::EmptyJoinStream; pub struct NestedLoopJoin { inner: T, @@ -89,8 +89,8 @@ impl Join for NestedLoopJoin { self.inner.final_build() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - self.inner.add_runtime_filter_packet(packet); + fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) -> Result<()> { + self.inner.add_runtime_filter_packet(packet) } fn build_runtime_filter(&self) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs similarity index 87% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs index 0bc9b37712b7a..cef266511b80b 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join.rs @@ -28,21 +28,21 @@ use databend_common_expression::HashMethodKind; use databend_common_expression::types::DataType; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; +use super::left_join::null_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use 
crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::memory::left_join::null_block; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::pipelines::processors::transforms::wrap_nullable_block; use crate::sessions::QueryContext; @@ -53,10 +53,6 @@ pub struct OuterRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -71,10 +67,6 @@ impl OuterRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? 
as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -93,10 +85,6 @@ impl OuterRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -111,21 +99,27 @@ impl Join for OuterRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs similarity index 82% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs rename to 
src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs index 74b2f1365bd87..bd35eda235a20 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_anti.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_anti.rs @@ -26,18 +26,18 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::right_join_semi::SemiRightHashJoinStream; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::right_join_semi::SemiRightHashJoinStream; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct AntiRightHashJoin { @@ -47,10 +47,6 @@ pub struct AntiRightHashJoin { pub(crate) function_ctx: FunctionContext, 
pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -65,10 +61,6 @@ impl AntiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -87,10 +79,6 @@ impl AntiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -105,21 +93,27 @@ impl Join for AntiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - 
packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs similarity index 86% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs index 2f3d57c38e452..df860823ec4c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/memory/right_join_semi.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/memory/right_join_semi.rs @@ -27,20 +27,20 @@ use databend_common_expression::FunctionContext; use databend_common_expression::HashMethodKind; use databend_common_expression::with_join_hash_method; +use super::super::performance::PerformanceContext; +use super::basic::BasicHashJoin; +use super::left_join::final_result_block; use crate::pipelines::processors::HashJoinDesc; use crate::pipelines::processors::transforms::BasicHashJoinState; use crate::pipelines::processors::transforms::HashJoinHashTable; use crate::pipelines::processors::transforms::Join; use crate::pipelines::processors::transforms::JoinRuntimeFilterPacket; use crate::pipelines::processors::transforms::hash_join_table::RowPtr; -use crate::pipelines::processors::transforms::memory::basic::BasicHashJoin; -use crate::pipelines::processors::transforms::memory::left_join::final_result_block; -use crate::pipelines::processors::transforms::merge_join_runtime_filter_packets; -use 
crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeData; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbeStream; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::performance::PerformanceContext; +use crate::pipelines::processors::transforms::merge_two_runtime_filter_packets; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbeStream; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeData; use crate::sessions::QueryContext; pub struct SemiRightHashJoin { @@ -50,10 +50,6 @@ pub struct SemiRightHashJoin { pub(crate) function_ctx: FunctionContext, pub(crate) basic_state: Arc, pub(crate) performance_context: PerformanceContext, - pub(crate) inlist_threshold: usize, - pub(crate) bloom_threshold: usize, - pub(crate) min_max_threshold: usize, - pub(crate) spatial_threshold: usize, pub(crate) finished: bool, } @@ -68,10 +64,6 @@ impl SemiRightHashJoin { ) -> Result { let settings = ctx.get_settings(); let block_size = settings.get_max_block_size()? as usize; - let inlist_threshold = settings.get_inlist_runtime_filter_threshold()? as usize; - let bloom_threshold = settings.get_bloom_runtime_filter_threshold()? as usize; - let min_max_threshold = settings.get_min_max_runtime_filter_threshold()? as usize; - let spatial_threshold = settings.get_spatial_runtime_filter_threshold()? 
as usize; let context = PerformanceContext::create(block_size, desc.clone(), function_ctx.clone()); @@ -90,10 +82,6 @@ impl SemiRightHashJoin { function_ctx, basic_state: state, performance_context: context, - inlist_threshold, - bloom_threshold, - min_max_threshold, - spatial_threshold, finished: false, }) } @@ -108,21 +96,27 @@ impl Join for SemiRightHashJoin { self.basic_hash_join.final_build::() } - fn add_runtime_filter_packet(&self, packet: JoinRuntimeFilterPacket) { - let locked = self.basic_state.mutex.lock(); - let _locked = locked.unwrap_or_else(PoisonError::into_inner); - self.basic_state.packets.as_mut().push(packet); + fn add_runtime_filter_packet(&self, mut packet: JoinRuntimeFilterPacket) -> Result<()> { + loop { + let locked = self.basic_state.mutex.lock(); + let _locked = locked.unwrap_or_else(PoisonError::into_inner); + + let rf_packets = self.basic_state.packets.as_mut(); + + if rf_packets.is_empty() { + rf_packets.push(packet); + return Ok(()); + } + + let other = rf_packets.pop().unwrap(); + drop(_locked); + packet = merge_two_runtime_filter_packets(packet, other)?; + } } fn build_runtime_filter(&self) -> Result { - let packets = std::mem::take(self.basic_state.packets.as_mut()); - merge_join_runtime_filter_packets( - packets, - self.inlist_threshold, - self.bloom_threshold, - self.min_max_threshold, - self.spatial_threshold, - ) + let mut packets = std::mem::take(self.basic_state.packets.as_mut()); + Ok(packets.pop().unwrap_or_default()) } fn probe_block(&mut self, data: DataBlock) -> Result> { diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs new file mode 100644 index 0000000000000..0cef379072498 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/mod.rs @@ -0,0 +1,26 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod grace; +mod hash_join_factory; +pub mod hashtable; +pub mod hybrid; +pub mod memory; +mod performance; +mod transform_hash_join; + +pub use hash_join_factory::HashJoinFactory; +pub use memory::*; +pub use performance::PerformanceContext; +pub use transform_hash_join::TransformHashJoin; diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs index 0f3f7cb8f560b..92743c6f2e9d4 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/performance.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/performance.rs @@ -19,8 +19,8 @@ use databend_common_expression::FunctionContext; use databend_common_functions::BUILTIN_FUNCTIONS; use crate::pipelines::processors::HashJoinDesc; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::ProbeHashStatistics; -use crate::pipelines::processors::transforms::new_hash_join::hashtable::basic::ProbedRows; +use crate::pipelines::processors::transforms::new_hash_join::common::probe_stream::ProbedRows; +use 
crate::pipelines::processors::transforms::new_hash_join::unpartitioned::hashtable::ProbeHashStatistics; pub struct PerformanceContext { pub probe_result: ProbedRows, diff --git a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs similarity index 94% rename from src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs rename to src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs index d8051592155c9..de806c34d66b5 100644 --- a/src/query/service/src/pipelines/processors/transforms/new_hash_join/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/new_hash_join/unpartitioned/transform_hash_join.rs @@ -31,10 +31,10 @@ use databend_common_pipeline::core::ProcessorPtr; use log::info; use crate::pipelines::processors::transforms::RuntimeFilterLocalBuilder; -use crate::pipelines::processors::transforms::new_hash_join::join::FinishedJoin; -use crate::pipelines::processors::transforms::new_hash_join::join::Join; -use crate::pipelines::processors::transforms::new_hash_join::join::JoinStream; -use crate::pipelines::processors::transforms::new_hash_join::runtime_filter::RuntimeFiltersDesc; +use crate::pipelines::processors::transforms::new_hash_join::common::join::FinishedJoin; +use crate::pipelines::processors::transforms::new_hash_join::common::join::Join; +use crate::pipelines::processors::transforms::new_hash_join::common::join::JoinStream; +use crate::pipelines::processors::transforms::new_hash_join::common::runtime_filter::RuntimeFiltersDesc; pub struct TransformHashJoin { build_port: Arc, @@ -154,6 +154,12 @@ impl Processor for TransformHashJoin { if !state.finished { state.finished = true; self.join.add_block(None)?; + + if let Some(builder) = self.runtime_filter_builder.take() { + let spill_happened = 
self.join.is_spill_happened(); + let packet = builder.finish(spill_happened)?; + self.join.add_runtime_filter_packet(packet)?; + } } return Ok(()); }; @@ -224,14 +230,6 @@ impl Processor for TransformHashJoin { self.stage = match &mut self.stage { Stage::Build(_) => { - if let Some(builder) = self.runtime_filter_builder.take() { - let spill_happened = self.join.is_spill_happened(); - // Disable runtime filters once spilling occurs to avoid partial-build filters - // being globalized across the cluster, which can prune valid probe rows. - let packet = builder.finish(spill_happened)?; - self.join.add_runtime_filter_packet(packet); - } - let rf_build_elapsed = self.instant.elapsed() - elapsed; let _wait_res = self.stage_sync_barrier.wait().await; let before_wait = self.instant.elapsed(); diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index 0cbce7f5e7a18..b20a3738441d3 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -283,6 +283,8 @@ impl DeriveHandle for FragmentDeriveHandle { let plan_id = v.get_id(); let source_fragment_id = self.ctx.get_fragment_id(); + let exchange_kind = exchange.kind.clone(); + let exchange_keys = exchange.keys.clone(); let plan: PhysicalPlan = PhysicalPlan::new(ExchangeSink { input, @@ -327,6 +329,8 @@ impl DeriveHandle for FragmentDeriveHandle { source_fragment_id, meta: PhysicalPlanMeta::with_plan_id("ExchangeSource", plan_id), + kind: exchange_kind, + keys: exchange_keys, })); } diff --git a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs index 963363e7267d1..290924a7b261f 100644 --- a/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs +++ b/src/query/service/src/servers/flight/v1/packets/packet_fragment.rs @@ -18,6 +18,7 @@ use std::collections::VecDeque; use 
std::fmt::Debug; use std::fmt::Formatter; +use databend_common_sql::executor::physical_plans::DataDistribution; use serde::Deserializer; use serde::Serializer; use serde::de::Error; @@ -69,6 +70,10 @@ impl IPhysicalPlan for SerializedPhysicalPlanRef { fn derive(&self, _: Vec) -> PhysicalPlan { unimplemented!() } + + fn output_data_distribution(&self) -> DataDistribution { + unimplemented!() + } } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs index 533ca56929a0d..0b73c7cdabc25 100644 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_hash.rs @@ -25,6 +25,7 @@ use databend_common_expression::FunctionID; use databend_common_expression::RemoteExpr; use databend_common_expression::Scalar; use databend_common_expression::Value; +use databend_common_expression::aggregate::combine_group_hash_column; use databend_common_expression::type_check::check_function; use databend_common_expression::types::AccessType; use databend_common_expression::types::AnyType; @@ -35,6 +36,7 @@ use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::number::NumberScalar; use databend_common_functions::BUILTIN_FUNCTIONS; +use strength_reduce::StrengthReducedU64; use crate::servers::flight::v1::scatter::flight_scatter::FlightScatter; @@ -43,6 +45,8 @@ pub struct HashFlightScatter { func_ctx: FunctionContext, hash_key: Vec, scatter_size: usize, + raw_hash_keys: Vec, + hash_key_data_types: Vec, } impl HashFlightScatter { @@ -60,23 +64,25 @@ impl HashFlightScatter { local_pos, ); } - let hash_key = hash_keys + let raw_hash_keys: Vec = hash_keys .iter() - .map(|key| { - check_function( - None, - "siphash", - &[], - &[key.as_expr(&BUILTIN_FUNCTIONS)], 
- &BUILTIN_FUNCTIONS, - ) - }) + .map(|key| key.as_expr(&BUILTIN_FUNCTIONS)) + .collect(); + let hash_key_data_types: Vec = raw_hash_keys + .iter() + .map(|expr| expr.data_type().clone()) + .collect(); + let hash_key = raw_hash_keys + .iter() + .map(|expr| check_function(None, "siphash", &[], &[expr.clone()], &BUILTIN_FUNCTIONS)) .collect::>()?; Ok(Box::new(Self { func_ctx, scatter_size, hash_key, + raw_hash_keys, + hash_key_data_types, })) } } @@ -87,6 +93,8 @@ struct OneHashKeyFlightScatter { func_ctx: FunctionContext, indices_scalar: Expr, default_scatter_index: u64, + hash_key_expr: Expr, + hash_key_data_type: DataType, } impl OneHashKeyFlightScatter { @@ -101,6 +109,8 @@ impl OneHashKeyFlightScatter { } else { 0 }; + let hash_key_expr = hash_key.as_expr(&BUILTIN_FUNCTIONS); + let hash_key_data_type = hash_key_expr.data_type().clone(); let indices_scalar = check_function( None, "modulo", @@ -110,7 +120,7 @@ impl OneHashKeyFlightScatter { None, "siphash", &[], - &[hash_key.as_expr(&BUILTIN_FUNCTIONS)], + &[hash_key_expr.clone()], &BUILTIN_FUNCTIONS, )?, Expr::constant( @@ -126,6 +136,8 @@ impl OneHashKeyFlightScatter { func_ctx, indices_scalar, default_scatter_index, + hash_key_expr, + hash_key_data_type, })) } } @@ -155,9 +167,15 @@ impl FlightScatter for OneHashKeyFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = evaluator.run(&self.indices_scalar).unwrap(); - let indices = get_hash_values(indices, num, self.default_scatter_index)?; - Ok(Some(indices.to_vec())) + let value = evaluator.run(&self.hash_key_expr)?; + let column = value.convert_to_full_column(&self.hash_key_data_type, num); + let mut hashes = vec![0u64; num]; + combine_group_hash_column::(&column, &mut hashes); + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } 
@@ -195,18 +213,26 @@ impl FlightScatter for HashFlightScatter { fn scatter_indices(&self, data_block: &DataBlock) -> Result>> { let evaluator = Evaluator::new(data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); let num = data_block.num_rows(); - let indices = if !self.hash_key.is_empty() { - let mut hash_keys = Vec::with_capacity(self.hash_key.len()); - for expr in &self.hash_key { - let indices = evaluator.run(expr).unwrap(); - let indices = get_hash_values(indices, num, 0)?; - hash_keys.push(indices) + let mut hashes = vec![0u64; num]; + for (i, (expr, dt)) in self + .raw_hash_keys + .iter() + .zip(&self.hash_key_data_types) + .enumerate() + { + let value = evaluator.run(expr)?; + let column = value.convert_to_full_column(dt, num); + if i == 0 { + combine_group_hash_column::(&column, &mut hashes); + } else { + combine_group_hash_column::(&column, &mut hashes); } - self.combine_hash_keys(&hash_keys, num) - } else { - Ok(vec![0; num]) - }?; - Ok(Some(indices)) + } + let m = StrengthReducedU64::new(self.scatter_size as u64); + for h in hashes.iter_mut() { + *h = *h % m; + } + Ok(Some(hashes)) } } diff --git a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs index e20caf0a88cee..941a04b9a3bc2 100644 --- a/src/query/service/tests/it/storages/fuse/operations/prewhere.rs +++ b/src/query/service/tests/it/storages/fuse/operations/prewhere.rs @@ -70,7 +70,8 @@ async fn test_prewhere() -> Result<()> { let _ = _fixture; // Create ReadState which combines prewhere and runtime filter logic - let read_state = ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), &block_reader)?; + let mut read_state = + ReadState::create(ctx.clone(), scan_id, Some(&prewhere_info), &block_reader)?; // Use the new unified API that handles all states internally let (data_block, _row_selection, bitmap_selection) = diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs 
index b14f575c6206e..b3d5107774615 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -574,6 +574,20 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=u64::MAX)), }), + ("bloom_runtime_filter_selectivity_threshold", DefaultSettingValue { + value: UserSettingValue::UInt64(40), + desc: "Probe-side selectivity threshold (percentage) for bloom runtime filters. If a bloom filter filters less than this percentage of rows, it is temporarily disabled. Default 40 means 40%.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=100)), + }), + ("bloom_runtime_filter_sampling_frequency", DefaultSettingValue { + value: UserSettingValue::UInt64(32), + desc: "Number of block evaluations between re-checks of bloom runtime filter selectivity. After this many evaluations, counters reset and selectivity is re-evaluated.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(1..=u64::MAX)), + }), ("max_execute_time_in_seconds", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Sets the maximum query execution time in seconds. Setting it to 0 means no limit.", @@ -623,6 +637,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("broadcast_join_max_build_rows", DefaultSettingValue { + value: UserSettingValue::UInt64(30_000_000), + desc: "Maximum estimated build-side rows for broadcast join when partitioned hash join is enabled. 
0 means no limit.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), ("grouping_sets_to_union", DefaultSettingValue { value: UserSettingValue::UInt64(0), desc: "Enables grouping sets to union.", @@ -1601,6 +1622,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("enable_partitioned_hash_join", DefaultSettingValue { + value: UserSettingValue::UInt64(1), + desc: "Enables partitioned hash join for shuffle join.", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("s3_storage_class", DefaultSettingValue { value: { let storage_class = Self::extract_s3_storage_class_config(&global_conf).unwrap_or_default(); diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index dca74c1d5491d..d5bb231373e95 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -365,6 +365,14 @@ impl Settings { self.try_get_u64("bloom_runtime_filter_threshold") } + pub fn get_bloom_runtime_filter_selectivity_threshold(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_selectivity_threshold") + } + + pub fn get_bloom_runtime_filter_sampling_frequency(&self) -> Result { + self.try_get_u64("bloom_runtime_filter_sampling_frequency") + } + pub fn get_min_max_runtime_filter_threshold(&self) -> Result { self.try_get_u64("min_max_runtime_filter_threshold") } @@ -466,6 +474,10 @@ impl Settings { Ok(self.try_get_u64("enforce_shuffle_join")? != 0) } + pub fn get_broadcast_join_max_build_rows(&self) -> Result { + self.try_get_u64("broadcast_join_max_build_rows") + } + pub fn get_enable_merge_into_row_fetch(&self) -> Result { Ok(self.try_get_u64("enable_merge_into_row_fetch")? != 0) } @@ -1178,6 +1190,10 @@ impl Settings { Ok(self.try_get_u64("enable_experimental_new_join")? 
== 1) } + pub fn get_enable_partitioned_hash_join(&self) -> Result { + Ok(self.try_get_u64("enable_partitioned_hash_join")? != 0) + } + pub fn get_s3_storage_class(&self) -> Result { let s3_storage_class_setting = self.try_get_string("s3_storage_class")?; S3StorageClass::from_str(&s3_storage_class_setting).map_err(|e| { diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 3023ae01d72b2..14173aad0e89d 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -71,10 +71,19 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, - // Ping-pong based hash shuffle (used by hash join) + // Ping-pong based hash shuffle GlobalShuffle, } +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum DataDistribution { + Random, + NodeHash(Vec), + GlobalHash(Vec), + Broadcast, + Serial, +} + #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] pub enum MutationKind { Delete, diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs index af67786a076f0..58ab3875ba103 100644 --- a/src/query/sql/src/planner/plans/join.rs +++ b/src/query/sql/src/planner/plans/join.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_settings::Settings; use databend_common_statistics::DEFAULT_HISTOGRAM_BUCKETS; use databend_common_statistics::Datum; use databend_common_statistics::Histogram; @@ -547,6 +548,18 @@ impl Join { .iter() .any(|expr| expr.has_subquery()) } + + fn enforce_shuffle_join(settings: &Settings, right_stat_info: &Arc) -> Result { + let max_build_rows = settings.get_broadcast_join_max_build_rows()?; + if max_build_rows > 0 + && settings.get_enable_partitioned_hash_join()? 
+ && right_stat_info.cardinality >= max_build_rows as f64 + { + return Ok(true); + } + + settings.get_enforce_shuffle_join() + } } impl Operator for Join { @@ -715,7 +728,8 @@ impl Operator for Join { // Use a very large value to prevent broadcast join. 1000.0 }; - if !settings.get_enforce_shuffle_join()? + + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? && (right_stat_info.cardinality * broadcast_join_threshold < left_stat_info.cardinality || settings.get_enforce_broadcast_join()?) @@ -752,7 +766,7 @@ impl Operator for Join { fn compute_required_prop_children( &self, ctx: Arc, - _rel_expr: &RelExpr, + rel_expr: &RelExpr, _required: &RequiredProperty, ) -> Result>> { let mut children_required = vec![]; @@ -838,19 +852,21 @@ impl Operator for Join { | JoinType::Asof | JoinType::LeftAsof | JoinType::RightAsof - ) && !settings.get_enforce_shuffle_join()? - { - // (Any, Broadcast) - let left_distribution = Distribution::Any; - let right_distribution = Distribution::Broadcast; - children_required.push(vec![ - RequiredProperty { - distribution: left_distribution, - }, - RequiredProperty { - distribution: right_distribution, - }, - ]); + ) { + let right_stat_info = rel_expr.derive_cardinality_child(1)?; + if !Self::enforce_shuffle_join(&settings, &right_stat_info)? 
{ + // (Any, Broadcast) + let left_distribution = Distribution::Any; + let right_distribution = Distribution::Broadcast; + children_required.push(vec![ + RequiredProperty { + distribution: left_distribution, + }, + RequiredProperty { + distribution: right_distribution, + }, + ]); + } } if children_required.is_empty() { diff --git a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs index 80c51a926b9f7..87f78b1729809 100644 --- a/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs +++ b/src/query/storages/fuse/src/operations/read/parquet_data_source_deserializer.rs @@ -233,7 +233,7 @@ impl Processor for DeserializeDataTransform { let (mut data_block, row_selection, bitmap_selection) = self .read_state - .as_ref() + .as_mut() .unwrap() .deserialize_and_filter(columns_chunks, part)?; diff --git a/src/query/storages/fuse/src/operations/read/read_state.rs b/src/query/storages/fuse/src/operations/read/read_state.rs index 1218f605a5920..f23a66e383834 100644 --- a/src/query/storages/fuse/src/operations/read/read_state.rs +++ b/src/query/storages/fuse/src/operations/read/read_state.rs @@ -44,11 +44,67 @@ use crate::io::DataItem; use crate::io::RowSelection; use crate::pruning::ExprBloomFilter; +const DEFAULT_MIN_INPUT_ROWS: usize = 40960; + +#[derive(Clone)] +pub struct BloomFilterSelectivity { + input_rows: usize, + filtered_rows: usize, + eval_counter: usize, + always_true: bool, + sampling_frequency: usize, + selectivity_threshold: usize, + min_input_rows: usize, +} + +impl BloomFilterSelectivity { + pub fn new(selectivity_threshold: usize, sampling_frequency: usize) -> Self { + Self { + input_rows: 0, + filtered_rows: 0, + eval_counter: 0, + always_true: false, + sampling_frequency, + selectivity_threshold, + min_input_rows: DEFAULT_MIN_INPUT_ROWS, + } + } + + pub fn should_skip(&self) -> bool { + self.always_true + } + + pub fn 
update(&mut self, block_input_rows: usize, block_filtered_rows: usize) { + self.input_rows += block_input_rows; + self.filtered_rows += block_filtered_rows; + self.eval_counter += 1; + + if self.eval_counter >= self.sampling_frequency { + self.judge_selectivity(); + self.reset(); + } + } + + fn judge_selectivity(&mut self) { + if self.input_rows >= self.min_input_rows { + let selectivity_pct = (self.filtered_rows * 100) / self.input_rows; + self.always_true = selectivity_pct < self.selectivity_threshold; + } + } + + fn reset(&mut self) { + self.input_rows = 0; + self.filtered_rows = 0; + self.eval_counter = 0; + } +} + #[derive(Clone)] pub struct BloomRuntimeFilterRef { pub column_index: FieldIndex, pub filter: RuntimeBloomFilter, pub stats: Arc, + pub selectivity: BloomFilterSelectivity, } pub struct ReadState { @@ -98,6 +154,11 @@ impl ReadState { let prewhere_schema: DataSchema = (prewhere_reader.schema().as_ref()).into(); + let settings = ctx.get_settings(); + let selectivity_threshold = + settings.get_bloom_runtime_filter_selectivity_threshold()? as usize; + let sampling_frequency = settings.get_bloom_runtime_filter_sampling_frequency()? 
as usize; + let runtime_filters: Vec = runtime_filter_entries .into_iter() .filter_map(|entry| { @@ -107,6 +168,10 @@ impl ReadState { column_index, filter: bloom.filter, stats: entry.stats, + selectivity: BloomFilterSelectivity::new( + selectivity_threshold, + sampling_frequency, + ), }) }) .collect(); @@ -147,16 +212,24 @@ impl ReadState { } pub fn runtime_filter( - &self, + &mut self, block: &DataBlock, - _num_rows: usize, + num_rows: usize, ) -> Result> { let bloom_start = Instant::now(); let mut bitmaps = vec![]; - for runtime_filter in &self.runtime_filters { + for runtime_filter in &mut self.runtime_filters { + if runtime_filter.selectivity.should_skip() { + continue; + } + let probe_column = block.get_by_offset(runtime_filter.column_index).to_column(); let bitmap = ExprBloomFilter::new(&runtime_filter.filter).apply(probe_column)?; + + let filtered_rows = bitmap.null_count(); + runtime_filter.selectivity.update(num_rows, filtered_rows); + bitmaps.push(bitmap); } @@ -175,7 +248,7 @@ impl ReadState { } pub fn deserialize_and_filter( - &self, + &mut self, columns_chunks: HashMap, part: &FuseBlockPartInfo, ) -> Result<(DataBlock, Option, Option)> {