Is your feature request related to a problem or challenge?
Part of #10922
We are adding APIs to efficiently convert the data stored in Parquet's "PageIndex" into ArrayRefs -- which will make it significantly easier to use this information for pruning and other tasks.
Describe the solution you'd like
Add support to StatisticsConverter::min_page_statistics and StatisticsConverter::max_page_statistics for the types above
|
/// of parquet page [`Index`]'es to an [`ArrayRef`] |
|
pub(crate) fn min_page_statistics<'a, I>( |
|
data_type: Option<&DataType>, |
|
iterator: I, |
|
) -> Result<ArrayRef> |
|
where |
|
I: Iterator<Item = (usize, &'a Index)>, |
|
{ |
|
get_data_page_statistics!(Min, data_type, iterator) |
|
} |
|
|
|
/// Extracts the max statistics from an iterator |
|
/// of parquet page [`Index`]'es to an [`ArrayRef`] |
|
pub(crate) fn max_page_statistics<'a, I>( |
|
data_type: Option<&DataType>, |
|
iterator: I, |
|
) -> Result<ArrayRef> |
|
where |
|
I: Iterator<Item = (usize, &'a Index)>, |
|
{ |
Describe alternatives you've considered
You can follow the model from @Weijun-H in #10931
- Update the test for the listed data types following the model of
test_int64
|
async fn test_int_64() { |
|
// This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" |
|
let reader = TestReader { |
|
scenario: Scenario::Int, |
|
row_per_group: 5, |
|
} |
|
.build() |
|
.await; |
|
|
|
// since each row has only one data page, the statistics are the same |
|
Test { |
|
reader: &reader, |
|
// mins are [-5, -4, 0, 5] |
|
expected_min: Arc::new(Int64Array::from(vec![-5, -4, 0, 5])), |
|
// maxes are [-1, 0, 4, 9] |
|
expected_max: Arc::new(Int64Array::from(vec![-1, 0, 4, 9])), |
|
// nulls are [0, 0, 0, 0] |
|
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), |
|
// row counts are [5, 5, 5, 5] |
|
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), |
|
column_name: "i64", |
|
check: Check::Both, |
|
} |
|
.run(); |
- Add any required implementation in
|
make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min, Index::INT64, i64); |
|
make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max, Index::INT64, i64); |
|
|
|
macro_rules! get_data_page_statistics { |
|
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => { |
|
paste! { |
|
match $data_type { |
|
Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))), |
|
_ => unimplemented!() |
|
} |
|
} |
|
} |
(follow the model of the row counts,
|
macro_rules! make_stats_iterator { |
)
Additional context
No response
Is your feature request related to a problem or challenge?
Part of #10922
We are adding APIs to efficiently convert the data stored in Parquet's "PageIndex" into
ArrayRefs -- which will make it significantly easier to use this information for pruning and other tasks.
Describe the solution you'd like
Add support to
StatisticsConverter::min_page_statistics and StatisticsConverter::max_page_statistics for the types above
datafusion/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Lines 637 to 656 in a923c65
Describe alternatives you've considered
You can follow the model from @Weijun-H in #10931
test_int64
datafusion/datafusion/core/tests/parquet/arrow_statistics.rs
Lines 506 to 529 in a923c65
datafusion/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Lines 575 to 586 in 2f43476
datafusion/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Line 90 in 2f43476
Additional context
No response