From 596b3e116ce9c95b2b897e0acebf91d50ffb0934 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 10 Dec 2022 06:37:39 -0500 Subject: [PATCH 1/2] Output sqllogictests with arrow display --- .../core/tests/sqllogictests/src/main.rs | 40 +-- .../core/tests/sqllogictests/src/normalize.rs | 115 ++++---- .../sqllogictests/test_files/aggregate.slt | 248 +++++++++--------- .../tests/sqllogictests/test_files/ddl.slt | 16 +- .../sqllogictests/test_files/prepare.slt | 3 +- .../sqllogictests/test_files/timestamps.slt | 26 +- 6 files changed, 218 insertions(+), 230 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/src/main.rs b/datafusion/core/tests/sqllogictests/src/main.rs index 47ae8f80db7a9..3d2fe492ca2e9 100644 --- a/datafusion/core/tests/sqllogictests/src/main.rs +++ b/datafusion/core/tests/sqllogictests/src/main.rs @@ -16,13 +16,12 @@ // under the License. use async_trait::async_trait; -use datafusion::arrow::csv::WriterBuilder; use datafusion::arrow::record_batch::RecordBatch; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_sql::parser::{DFParser, Statement}; use log::info; -use normalize::normalize_batch; -use sqllogictest::{ColumnType, DBOutput}; +use normalize::convert_batches; +use sqllogictest::DBOutput; use sqlparser::ast::Statement as SQLStatement; use std::path::{Path, PathBuf}; use std::time::Duration; @@ -173,41 +172,6 @@ async fn context_for_test_file(file_name: &str) -> SessionContext { } } -fn convert_batches(batches: Vec) -> Result { - let mut bytes = vec![]; - if batches.is_empty() { - return Ok(DBOutput::StatementComplete(0)); - } - // TODO: use the actual types - let types = vec![ColumnType::Any; batches[0].num_columns()]; - - { - let builder = WriterBuilder::new() - .has_headers(false) - .with_delimiter(b'\t'); - let mut writer = builder.build(&mut bytes); - for batch in batches { - writer.write(&normalize_batch(batch)).unwrap(); - } - } - let res = String::from_utf8(bytes).unwrap(); - let rows = res - .lines() - 
.map(|s| { - s.split('\t') - .map(|s| { - if s.is_empty() { - "NULL".to_string() - } else { - s.to_string() - } - }) - .collect() - }) - .collect(); - Ok(DBOutput::Rows { types, rows }) -} - async fn run_query(ctx: &SessionContext, sql: impl Into) -> Result { let sql = sql.into(); // Check if the sql is `insert` diff --git a/datafusion/core/tests/sqllogictests/src/normalize.rs b/datafusion/core/tests/sqllogictests/src/normalize.rs index 512de350e7fbb..3032a6e6dd579 100644 --- a/datafusion/core/tests/sqllogictests/src/normalize.rs +++ b/datafusion/core/tests/sqllogictests/src/normalize.rs @@ -15,61 +15,86 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use crate::error::{DFSqlLogicTestError, Result}; +use arrow::{array::ArrayRef, datatypes::DataType, record_batch::RecordBatch}; +use datafusion::error::DataFusionError; +use sqllogictest::{ColumnType, DBOutput}; -use arrow::{ - array::{ - as_largestring_array, as_string_array, ArrayRef, LargeStringArray, StringArray, - }, - datatypes::DataType, - record_batch::RecordBatch, -}; +/// Converts `batches` to a DBOutput as expected by sqllogicteset. +/// +/// Assumes empty record batches are a successful statement completion +/// +pub fn convert_batches(batches: Vec) -> Result { + if batches.is_empty() { + // DataFusion doesn't report number of rows complete + return Ok(DBOutput::StatementComplete(0)); + } + + let schema = batches[0].schema(); + + // TODO: report the the actual types of the result + // https://github.com/apache/arrow-datafusion/issues/4499 + let types = vec![ColumnType::Any; batches[0].num_columns()]; + + let mut rows = vec![]; + for batch in batches { + // Verify schema + if schema != batch.schema() { + return Err(DFSqlLogicTestError::DataFusion(DataFusionError::Internal( + format!( + "Schema mismatch. 
Previously had\n{:#?}\n\nGot:\n{:#?}", + schema, + batch.schema() + ), + ))); + } + rows.append(&mut convert_batch(batch)?); + } + + Ok(DBOutput::Rows { types, rows }) +} -/// Normalizes the content of a RecordBatch prior to printing. +/// Convert a single batch to a `Vec>` for comparison +fn convert_batch(batch: RecordBatch) -> Result>> { + (0..batch.num_rows()) + .map(|row| { + batch + .columns() + .iter() + .map(|col| cell_to_string(col, row)) + .collect::>>() + }) + .collect() +} + +/// Normalizes the content of a single cell in RecordBatch prior to printing. /// /// This is to make the output comparable to the semi-standard .slt format /// /// Normalizations applied to [NULL Values and empty strings] /// /// [NULL Values and empty strings]: https://duckdb.org/dev/sqllogictest/result_verification#null-values-and-empty-strings -pub fn normalize_batch(batch: RecordBatch) -> RecordBatch { - let new_columns = batch - .columns() - .iter() - .map(|array| { - match array.data_type() { - DataType::Utf8 => { - let arr: StringArray = as_string_array(array.as_ref()) - .iter() - .map(normalize_string) - .collect(); - Arc::new(arr) as ArrayRef - } - DataType::LargeUtf8 => { - let arr: LargeStringArray = as_largestring_array(array.as_ref()) - .iter() - .map(normalize_string) - .collect(); - Arc::new(arr) as ArrayRef - } - // todo normalize dictionary values +/// +pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result { + // represent any null value with the string "NULL" + if !col.is_valid(row) { + return Ok("NULL".into()); + } - // no normalization on this type - _ => array.clone(), - } - }) - .collect(); + // Convert to normal string representation + let s = arrow::util::display::array_value_to_string(col, row) + .map_err(DFSqlLogicTestError::Arrow)?; - RecordBatch::try_new(batch.schema(), new_columns).expect("creating normalized batch") -} + // apply subsequent normalization depending on type -fn normalize_string(v: Option<&str>) -> Option<&str> { - v.map(|v| { - // 
All empty strings are replaced with this value - if v.is_empty() { - "(empty)" - } else { - v + let s = match col.data_type() { + DataType::Utf8 | DataType::LargeUtf8 if s.is_empty() => { + // All empty strings are replaced with this value + "(empty)".to_string() } - }) + // no other normalization + _ => s, + }; + + Ok(s) } diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt index 26c4b258bdce7..90241e3e8a6aa 100644 --- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt +++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt @@ -52,7 +52,7 @@ SELECT var_pop(c2) FROM aggregate_test_100 query R SELECT var_pop(c6) FROM aggregate_test_100 ---- -2.615633434202189e37 +26156334342021890000000000000000000000 # csv_query_variance_3 query R @@ -82,7 +82,7 @@ SELECT stddev_pop(c2) FROM aggregate_test_100 query R SELECT stddev_pop(c6) FROM aggregate_test_100 ---- -5.114326382039172e18 +5114326382039172000 # csv_query_stddev_3 query R @@ -523,98 +523,98 @@ true query TIR SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) ORDER BY c1, c2 ---- -a 1 -17.6 -a 2 -15.333333333333334 -a 3 -4.5 -a 4 -32.0 -a 5 -32.0 -a NULL -18.333333333333332 -b 1 31.666666666666668 -b 2 25.5 -b 3 -42.0 -b 4 -44.6 -b 5 -0.2 -b NULL -5.842105263157895 -c 1 47.5 -c 2 -55.57142857142857 -c 3 47.5 -c 4 -10.75 -c 5 12.0 -c NULL -1.3333333333333333 -d 1 -8.142857142857142 -d 2 109.33333333333333 -d 3 41.333333333333336 -d 4 54.0 -d 5 -49.5 -d NULL 25.444444444444443 -e 1 75.66666666666667 -e 2 37.8 -e 3 48.0 -e 4 37.285714285714285 -e 5 -11.0 -e NULL 40.333333333333336 -NULL 1 16.681818181818183 -NULL 2 8.363636363636363 -NULL 3 20.789473684210527 -NULL 4 1.2608695652173914 -NULL 5 -13.857142857142858 +a 1 -17.6 +a 2 -15.333333333333334 +a 3 -4.5 +a 4 -32 +a 5 -32 +a NULL -18.333333333333332 +b 1 31.666666666666668 +b 2 25.5 +b 3 -42 +b 4 -44.6 +b 5 -0.2 +b NULL 
-5.842105263157895 +c 1 47.5 +c 2 -55.57142857142857 +c 3 47.5 +c 4 -10.75 +c 5 12 +c NULL -1.3333333333333333 +d 1 -8.142857142857142 +d 2 109.33333333333333 +d 3 41.333333333333336 +d 4 54 +d 5 -49.5 +d NULL 25.444444444444443 +e 1 75.66666666666667 +e 2 37.8 +e 3 48 +e 4 37.285714285714285 +e 5 -11 +e NULL 40.333333333333336 +NULL 1 16.681818181818183 +NULL 2 8.363636363636363 +NULL 3 20.789473684210527 +NULL 4 1.2608695652173914 +NULL 5 -13.857142857142858 NULL NULL 7.81 # csv_query_rollup_avg query TIIR SELECT c1, c2, c3, AVG(c4) FROM aggregate_test_100_by_sql WHERE c1 IN ('a', 'b', NULL) GROUP BY ROLLUP (c1, c2, c3) ORDER BY c1, c2, c3 ---- -a 1 -85 -15154.0 -a 1 -56 8692.0 -a 1 -25 15295.0 -a 1 -5 12636.0 -a 1 83 -14704.0 -a 1 NULL 1353.0 -a 2 -48 -18025.0 -a 2 -43 13080.0 -a 2 45 15673.0 -a 2 NULL 3576.0 -a 3 -72 -11122.0 -a 3 -12 -9168.0 -a 3 13 22338.5 -a 3 14 28162.0 -a 3 17 -22796.0 -a 3 NULL 4958.833333333333 -a 4 -101 11640.0 -a 4 -54 -2376.0 -a 4 -38 20744.0 -a 4 65 -28462.0 -a 4 NULL 386.5 -a 5 -101 -12484.0 -a 5 -31 -12907.0 -a 5 36 -16974.0 -a 5 NULL -14121.666666666666 -a NULL NULL 306.04761904761904 -b 1 12 7652.0 -b 1 29 -18218.0 -b 1 54 -18410.0 -b 1 NULL -9658.666666666666 -b 2 -60 -21739.0 -b 2 31 23127.0 -b 2 63 21456.0 -b 2 68 15874.0 -b 2 NULL 9679.5 -b 3 -101 -13217.0 -b 3 17 14457.0 -b 3 NULL 620.0 -b 4 -117 19316.0 -b 4 -111 -1967.0 -b 4 -59 25286.0 -b 4 17 -28070.0 -b 4 47 20690.0 -b 4 NULL 7051.0 -b 5 -82 22080.0 -b 5 -44 15788.0 -b 5 -5 24896.0 -b 5 62 16337.0 -b 5 68 21576.0 -b 5 NULL 20135.4 -b NULL NULL 7732.315789473684 +a 1 -85 -15154 +a 1 -56 8692 +a 1 -25 15295 +a 1 -5 12636 +a 1 83 -14704 +a 1 NULL 1353 +a 2 -48 -18025 +a 2 -43 13080 +a 2 45 15673 +a 2 NULL 3576 +a 3 -72 -11122 +a 3 -12 -9168 +a 3 13 22338.5 +a 3 14 28162 +a 3 17 -22796 +a 3 NULL 4958.833333333333 +a 4 -101 11640 +a 4 -54 -2376 +a 4 -38 20744 +a 4 65 -28462 +a 4 NULL 386.5 +a 5 -101 -12484 +a 5 -31 -12907 +a 5 36 -16974 +a 5 NULL -14121.666666666666 +a NULL 
NULL 306.04761904761904 +b 1 12 7652 +b 1 29 -18218 +b 1 54 -18410 +b 1 NULL -9658.666666666666 +b 2 -60 -21739 +b 2 31 23127 +b 2 63 21456 +b 2 68 15874 +b 2 NULL 9679.5 +b 3 -101 -13217 +b 3 17 14457 +b 3 NULL 620 +b 4 -117 19316 +b 4 -111 -1967 +b 4 -59 25286 +b 4 17 -28070 +b 4 47 20690 +b 4 NULL 7051 +b 5 -82 22080 +b 5 -44 15788 +b 5 -5 24896 +b 5 62 16337 +b 5 68 21576 +b 5 NULL 20135.4 +b NULL NULL 7732.315789473684 NULL NULL NULL 3833.525 # csv_query_approx_percentile_cont_with_weight @@ -885,42 +885,42 @@ select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count query TIIRIII select c1, c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by CUBE (c1,c2) order by c1, c2 ---- -a 1 -88 -17.6 83 -85 5 -a 2 -46 -15.333333333333334 45 -48 3 -a 3 -27 -4.5 17 -72 6 -a 4 -128 -32.0 65 -101 4 -a 5 -96 -32.0 36 -101 3 -a NULL -385 -18.333333333333332 83 -101 21 -b 1 95 31.666666666666668 54 12 3 -b 2 102 25.5 68 -60 4 -b 3 -84 -42.0 17 -101 2 -b 4 -223 -44.6 47 -117 5 -b 5 -1 -0.2 68 -82 5 -b NULL -111 -5.842105263157895 68 -117 19 -c 1 190 47.5 103 -24 4 -c 2 -389 -55.57142857142857 29 -117 7 -c 3 190 47.5 97 -2 4 -c 4 -43 -10.75 123 -90 4 -c 5 24 12.0 118 -94 2 -c NULL -28 -1.3333333333333333 123 -117 21 -d 1 -57 -8.142857142857142 125 -99 7 -d 2 328 109.33333333333333 122 93 3 -d 3 124 41.333333333333336 123 -76 3 -d 4 162 54.0 102 5 3 -d 5 -99 -49.5 -40 -59 2 -d NULL 458 25.444444444444443 125 -99 18 -e 1 227 75.66666666666667 120 36 3 -e 2 189 37.8 97 -61 5 -e 3 192 48.0 112 -95 4 -e 4 261 37.285714285714285 97 -56 7 -e 5 -22 -11.0 64 -86 2 -e NULL 847 40.333333333333336 120 -95 21 -NULL 1 367 16.681818181818183 125 -99 22 -NULL 2 184 8.363636363636363 122 -117 22 -NULL 3 395 20.789473684210527 123 -101 19 -NULL 4 29 1.2608695652173914 123 -117 23 -NULL 5 -194 -13.857142857142858 118 -101 14 -NULL NULL 781 7.81 125 -117 100 +a 1 -88 -17.6 83 -85 5 +a 2 -46 
-15.333333333333334 45 -48 3 +a 3 -27 -4.5 17 -72 6 +a 4 -128 -32 65 -101 4 +a 5 -96 -32 36 -101 3 +a NULL -385 -18.333333333333332 83 -101 21 +b 1 95 31.666666666666668 54 12 3 +b 2 102 25.5 68 -60 4 +b 3 -84 -42 17 -101 2 +b 4 -223 -44.6 47 -117 5 +b 5 -1 -0.2 68 -82 5 +b NULL -111 -5.842105263157895 68 -117 19 +c 1 190 47.5 103 -24 4 +c 2 -389 -55.57142857142857 29 -117 7 +c 3 190 47.5 97 -2 4 +c 4 -43 -10.75 123 -90 4 +c 5 24 12 118 -94 2 +c NULL -28 -1.3333333333333333 123 -117 21 +d 1 -57 -8.142857142857142 125 -99 7 +d 2 328 109.33333333333333 122 93 3 +d 3 124 41.333333333333336 123 -76 3 +d 4 162 54 102 5 3 +d 5 -99 -49.5 -40 -59 2 +d NULL 458 25.444444444444443 125 -99 18 +e 1 227 75.66666666666667 120 36 3 +e 2 189 37.8 97 -61 5 +e 3 192 48 112 -95 4 +e 4 261 37.285714285714285 97 -56 7 +e 5 -22 -11 64 -86 2 +e NULL 847 40.333333333333336 120 -95 21 +NULL 1 367 16.681818181818183 125 -99 22 +NULL 2 184 8.363636363636363 122 -117 22 +NULL 3 395 20.789473684210527 123 -101 19 +NULL 4 29 1.2608695652173914 123 -117 23 +NULL 5 -194 -13.857142857142858 118 -101 14 +NULL NULL 781 7.81 125 -117 100 # csv_query_array_agg_distinct # query T diff --git a/datafusion/core/tests/sqllogictests/test_files/ddl.slt b/datafusion/core/tests/sqllogictests/test_files/ddl.slt index 8af2a9dada676..7445157eec14b 100644 --- a/datafusion/core/tests/sqllogictests/test_files/ddl.slt +++ b/datafusion/core/tests/sqllogictests/test_files/ddl.slt @@ -212,7 +212,7 @@ CREATE TABLE my_table AS SELECT * FROM aggregate_simple query II rowsort SELECT * FROM my_table order by c1 LIMIT 1 ---- -0.00001 1.0e-12 true +0.00001 0.000000000001 true statement ok DROP TABLE my_table; @@ -237,7 +237,7 @@ CREATE TABLE my_table(c1 float, c2 double, c3 boolean, c4 varchar) AS SELECT *,c query II rowsort SELECT * FROM my_table order by c1 LIMIT 1 ---- -0.00001 1.0e-12 true 1 +0.00001 0.000000000001 true 1 statement ok DROP TABLE my_table; @@ -257,7 +257,7 @@ CREATE TABLE my_table(c1 int, c2 float, c3 
varchar) AS VALUES(1, 2, 'hello') query II rowsort SELECT * FROM my_table; ---- -1 2.0 hello +1 2 hello statement ok DROP TABLE my_table; @@ -343,8 +343,8 @@ ts TIMESTAMP query CC SELECT * from csv_with_timestamps ---- -Jorge 2018-12-13T12:12:10.011000000 -Andrew 2018-11-13T17:11:10.011000000 +Jorge 2018-12-13T12:12:10.011 +Andrew 2018-11-13T17:11:10.011 statement ok DROP TABLE csv_with_timestamps @@ -369,8 +369,8 @@ LOCATION 'tests/data/partitioned_table'; query CCC SELECT * from csv_with_timestamps where c_date='2018-11-13' ---- -Jorge 2018-12-13T12:12:10.011000000 2018-11-13 -Andrew 2018-11-13T17:11:10.011000000 2018-11-13 +Jorge 2018-12-13T12:12:10.011 2018-11-13 +Andrew 2018-11-13T17:11:10.011 2018-11-13 statement ok DROP TABLE csv_with_timestamps @@ -409,7 +409,7 @@ CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW DELIMITER ' query CCC SELECT * FROM aggregate_simple order by c1 LIMIT 1; ---- -0.00001 1.0e-12 true +0.00001 0.000000000001 true statement ok DROP TABLE aggregate_simple diff --git a/datafusion/core/tests/sqllogictests/test_files/prepare.slt b/datafusion/core/tests/sqllogictests/test_files/prepare.slt index 948a2e3bc830d..813288e386737 100644 --- a/datafusion/core/tests/sqllogictests/test_files/prepare.slt +++ b/datafusion/core/tests/sqllogictests/test_files/prepare.slt @@ -25,7 +25,7 @@ create table person (id int, first_name varchar, last_name varchar, age int, sta query C rowsort select * from person; ---- -1 jane smith 20 MA 100000.45 2000-11-12T00:00:00.000000000 99 +1 jane smith 20 MA 100000.45 2000-11-12T00:00:00 99 # Error due to syntax and semantic violation @@ -80,4 +80,3 @@ PREPARE my_plan(INT, DOUBLE, DOUBLE, DOUBLE) AS SELECT id, SUM(age) FROM person statement error PREPARE my_plan(STRING, STRING) AS SELECT * FROM (VALUES(1, $1), (2, $2)) AS t (num, letter); - diff --git a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt index 
f4d627df4523d..56c6690be18b6 100644 --- a/datafusion/core/tests/sqllogictests/test_files/timestamps.slt +++ b/datafusion/core/tests/sqllogictests/test_files/timestamps.slt @@ -25,42 +25,42 @@ create table foo (val int, ts timestamp) as values (1, '2000-01-01T00:00:00'::ti query C rowsort select * from foo; ---- -1 2000-01-01T00:00:00.000000000 -2 2000-02-01T00:00:00.000000000 -3 2000-03-01T00:00:00.000000000 +1 2000-01-01T00:00:00 +2 2000-02-01T00:00:00 +3 2000-03-01T00:00:00 # Test that we can compare a timestamp to a casted string query C rowsort select * from foo where ts > '2000-01-01T00:00:00'::timestamp; ---- -2 2000-02-01T00:00:00.000000000 -3 2000-03-01T00:00:00.000000000 +2 2000-02-01T00:00:00 +3 2000-03-01T00:00:00 # Test that we can compare a timestamp to a string and it will be coerced query C rowsort select * from foo where ts > '2000-01-01T00:00:00'; ---- -2 2000-02-01T00:00:00.000000000 -3 2000-03-01T00:00:00.000000000 +2 2000-02-01T00:00:00 +3 2000-03-01T00:00:00 query C rowsort select * from foo where ts < '2000-02-01T00:00:00'; ---- -1 2000-01-01T00:00:00.000000000 +1 2000-01-01T00:00:00 query C rowsort select * from foo where ts <= '2000-02-01T00:00:00'; ---- -1 2000-01-01T00:00:00.000000000 -2 2000-02-01T00:00:00.000000000 +1 2000-01-01T00:00:00 +2 2000-02-01T00:00:00 query C rowsort select * from foo where ts = '2000-02-01T00:00:00'; ---- -2 2000-02-01T00:00:00.000000000 +2 2000-02-01T00:00:00 query C rowsort select * from foo where ts != '2000-02-01T00:00:00'; ---- -1 2000-01-01T00:00:00.000000000 -3 2000-03-01T00:00:00.000000000 +1 2000-01-01T00:00:00 +3 2000-03-01T00:00:00 From d0e7db9f801bdef7e393a5d209c01557cc97a22b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Dec 2022 04:42:53 -0500 Subject: [PATCH 2/2] Improve normalization clarity --- .../core/tests/sqllogictests/src/normalize.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/src/normalize.rs 
b/datafusion/core/tests/sqllogictests/src/normalize.rs index 3032a6e6dd579..474ec6e85f9cd 100644 --- a/datafusion/core/tests/sqllogictests/src/normalize.rs +++ b/datafusion/core/tests/sqllogictests/src/normalize.rs @@ -82,19 +82,14 @@ pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result { } // Convert to normal string representation - let s = arrow::util::display::array_value_to_string(col, row) + let mut s = arrow::util::display::array_value_to_string(col, row) .map_err(DFSqlLogicTestError::Arrow)?; - // apply subsequent normalization depending on type - - let s = match col.data_type() { - DataType::Utf8 | DataType::LargeUtf8 if s.is_empty() => { - // All empty strings are replaced with this value - "(empty)".to_string() - } - // no other normalization - _ => s, - }; + // apply subsequent normalization depending on type + if matches!(col.data_type(), DataType::Utf8 | DataType::LargeUtf8) && s.is_empty() { + // All empty strings are replaced with this value + s = "(empty)".to_string(); + } Ok(s) }