From db78b1372144a6e981368d7789bc0e06e7a1363d Mon Sep 17 00:00:00 2001 From: alamb Date: Wed, 19 Aug 2020 11:06:13 -0400 Subject: [PATCH 1/2] ARROW-9790: [Rust][Parquet]: Increase test coverage in arrow_reader.rs --- rust/parquet/src/arrow/arrow_reader.rs | 145 +++++++++++++++---------- 1 file changed, 89 insertions(+), 56 deletions(-) diff --git a/rust/parquet/src/arrow/arrow_reader.rs b/rust/parquet/src/arrow/arrow_reader.rs index 180514b39883..7a64f2429214 100644 --- a/rust/parquet/src/arrow/arrow_reader.rs +++ b/rust/parquet/src/arrow/arrow_reader.rs @@ -296,51 +296,12 @@ mod tests { "; let converter = FromConverter::new(); - single_column_reader_test::< + run_single_column_reader_tests::< BoolType, BooleanArray, FromConverter>, BooleanArray>, BoolType, - >(2, 100, 2, message_type, 15, 50, converter); - } - - #[test] - fn test_bool_single_column_reader_test_batch_size_divides_into_row_group_size() { - let message_type = " - message test_schema { - REQUIRED BOOLEAN leaf; - } - "; - - // Use a batch size (5) so batches to fall on - // row group boundaries (25 rows in 3 row groups --> row - // groups of 10, 10, and 5) to test edge refilling edge cases. - let converter = FromConverter::new(); - single_column_reader_test::< - BoolType, - BooleanArray, - FromConverter>, BooleanArray>, - BoolType, - >(3, 25, 2, message_type, 5, 50, converter); - } - - #[test] - fn test_bool_single_column_reader_test_batch_size_divides_into_row_group_size2() { - let message_type = " - message test_schema { - REQUIRED BOOLEAN leaf; - } - "; - - // Ensure that every batch size (25) falls exactly a row group - // boundary (25 in this case) to test edge case. - let converter = FromConverter::new(); - single_column_reader_test::< - BoolType, - BooleanArray, - FromConverter>, BooleanArray>, - BoolType, - >(4, 100, 2, message_type, 25, 50, converter); + >(2, message_type, &converter); } struct RandFixedLenGen {} @@ -362,12 +323,12 @@ mod tests { "; let converter = FixedSizeArrayConverter::new(20); - single_column_reader_test::< + run_single_column_reader_tests::< FixedLenByteArrayType, FixedSizeBinaryArray, FixedSizeArrayConverter, RandFixedLenGen, - >(2, 100, 20, message_type, 15, 50, converter); + >(20, message_type, &converter); } struct RandUtf8Gen {} @@ -387,30 +348,101 @@ mod tests { "; let converter = Utf8ArrayConverter {}; - single_column_reader_test::< + run_single_column_reader_tests::< ByteArrayType, StringArray, Utf8ArrayConverter, RandUtf8Gen, - >(2, 100, 2, message_type, 15, 50, converter); + >(2, message_type, &converter); } - fn single_column_reader_test( + /// Parameters for single_column_reader_test + #[derive(Debug)] + struct TestOptions { + /// Number of row group to write to parquet (row group size = + /// num_row_groups / num_rows) num_row_groups: usize, + /// Total number of rows num_rows: usize, - rand_max: i32, - message_type: &str, + /// Size of batches to read back record_batch_size: usize, + /// Total number of batches to attempt to read. + /// `record_batch_size` * `num_iterations` should be greater + /// than `num_rows` to ensure the data can be read back completely num_iterations: usize, - converter: C, + } + + impl TestOptions { + fn new( + num_row_groups: usize, + num_rows: usize, + record_batch_size: usize, + num_iterations: usize, + ) -> Self { + TestOptions { + num_row_groups, + num_rows, + record_batch_size, + num_iterations, + } + } + } + + /// Create a parquet file and then read it using + /// `ParquetFileArrowReader` using a standard set of parameters + /// `opts`. + /// + /// `rand_max` represents the maximum size of value to pass to to + /// value generator + fn run_single_column_reader_tests( + rand_max: i32, + message_type: &str, + converter: &C, ) where T: DataType, G: RandGen, A: PartialEq + Array + 'static, C: Converter>, A> + 'static, { - let values: Vec> = (0..num_row_groups) - .map(|_| G::gen_vec(rand_max, num_rows)) + let all_options = vec![ + TestOptions::new(2, 100, 15, 50), + // batch size (5) so batches to fall on row group + // boundaries (25 rows in 3 row groups --> row groups of + // 10, 10, and 5) to test edge refilling edge cases. + TestOptions::new(3, 25, 5, 50), + // Ensure that every batch size (25) falls exactly a row group + // boundary (25 in this case) to test edge case. + TestOptions::new(4, 100, 25, 50), + ]; + + all_options.into_iter().for_each(|opts| { + // Print out options to facilitate debugging failures on CI + println!("Running with Test Options: {:?}", opts); + single_column_reader_test::( + opts, + rand_max, + message_type, + converter, + ) + }); + } + + /// Create a parquet file and then read it using + /// `ParquetFileArrowReader` using the parameters described in + /// `opts`. + fn single_column_reader_test( + opts: TestOptions, + rand_max: i32, + message_type: &str, + converter: &C, + ) where + T: DataType, + G: RandGen, + A: PartialEq + Array + 'static, + C: Converter>, A> + 'static, + { + let values: Vec> = (0..opts.num_row_groups) + .map(|_| G::gen_vec(rand_max, opts.num_rows)) .collect(); let path = get_temp_filename(); @@ -426,8 +458,9 @@ mod tests { SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap(); let mut arrow_reader = ParquetFileArrowReader::new(Rc::new(parquet_reader)); - let mut record_reader = - arrow_reader.get_record_reader(record_batch_size).unwrap(); + let mut record_reader = arrow_reader + .get_record_reader(opts.record_batch_size) + .unwrap(); let expected_data: Vec> = values .iter() @@ -435,12 +468,12 @@ mod tests { .map(|b| Some(b.clone())) .collect(); - for i in 0..num_iterations { - let start = i * record_batch_size; + for i in 0..opts.num_iterations { + let start = i * opts.record_batch_size; let batch = record_reader.next_batch().unwrap(); if start < expected_data.len() { - let end = min(start + record_batch_size, expected_data.len()); + let end = min(start + opts.record_batch_size, expected_data.len()); assert!(batch.is_some()); let mut data = vec![]; From bc384b3f0bcf5ffa90a0011ad315d62f5d51d619 Mon Sep 17 00:00:00 2001 From: alamb Date: Fri, 21 Aug 2020 11:05:00 -0400 Subject: [PATCH 2/2] Add additional documentation for sets of test parameters --- rust/parquet/src/arrow/arrow_reader.rs | 51 ++++++++++++++------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/rust/parquet/src/arrow/arrow_reader.rs b/rust/parquet/src/arrow/arrow_reader.rs index 7a64f2429214..06df784d598c 100644 --- a/rust/parquet/src/arrow/arrow_reader.rs +++ b/rust/parquet/src/arrow/arrow_reader.rs @@ -372,22 +372,6 @@ mod tests { num_iterations: usize, } - impl TestOptions { - fn new( - num_row_groups: usize, - num_rows: usize, - record_batch_size: usize, - num_iterations: usize, - ) -> Self { - TestOptions { - num_row_groups, - num_rows, - record_batch_size, - num_iterations, - } - } - } - /// Create a parquet file and then read it using /// `ParquetFileArrowReader` using a standard set of parameters /// `opts`. @@ -405,14 +389,33 @@ mod tests { C: Converter>, A> + 'static, { let all_options = vec![ - TestOptions::new(2, 100, 15, 50), - // batch size (5) so batches to fall on row group - // boundaries (25 rows in 3 row groups --> row groups of - // 10, 10, and 5) to test edge refilling edge cases. - TestOptions::new(3, 25, 5, 50), - // Ensure that every batch size (25) falls exactly a row group - // boundary (25 in this case) to test edge case. - TestOptions::new(4, 100, 25, 50), + // choose record_batch_batch (15) so batches cross row + // group boundaries (50 rows in 2 row groups) cases. + TestOptions { + num_row_groups: 2, + num_rows: 100, + record_batch_size: 15, + num_iterations: 50, + }, + // choose record_batch_batch (5) so batches sometime fall + // on row group boundaries and (25 rows in 3 row groups + // --> row groups of 10, 10, and 5). Tests buffer + // refilling edge cases. + TestOptions { + num_row_groups: 3, + num_rows: 25, + record_batch_size: 5, + num_iterations: 50, + }, + // Choose record_batch_size (25) so all batches fall + // exactly on row group boundary (25). Tests buffer + // refilling edge cases. + TestOptions { + num_row_groups: 4, + num_rows: 100, + record_batch_size: 25, + num_iterations: 50, + }, ]; all_options.into_iter().for_each(|opts| {