diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index aa48384495e2..0d0c37621ce7 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1260,8 +1260,8 @@ RecordBatch__GetColumnByName <- function(batch, name){ .Call(`_arrow_RecordBatch__GetColumnByName` , batch, name) } -RecordBatch__select <- function(batch, indices){ - .Call(`_arrow_RecordBatch__select` , batch, indices) +RecordBatch__SelectColumns <- function(batch, indices){ + .Call(`_arrow_RecordBatch__SelectColumns` , batch, indices) } RecordBatch__Equals <- function(self, other, check_metadata){ @@ -1504,8 +1504,8 @@ Table__GetColumnByName <- function(table, name){ .Call(`_arrow_Table__GetColumnByName` , table, name) } -Table__select <- function(table, indices){ - .Call(`_arrow_Table__select` , table, indices) +Table__SelectColumns <- function(table, indices){ + .Call(`_arrow_Table__SelectColumns` , table, indices) } all_record_batches <- function(lst){ diff --git a/r/R/csv.R b/r/R/csv.R index e145a907e28c..85ba788025a9 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -129,7 +129,12 @@ read_delim_arrow <- function(file, convert_options = convert_options ) - tab <- reader$Read()$select(!!enquo(col_select)) + tab <- reader$Read() + + col_select <- enquo(col_select) + if (!quo_is_null(col_select)) { + tab <- tab[vars_select(names(tab), !!col_select)] + } if (isTRUE(as_data_frame)) { tab <- as.data.frame(tab) diff --git a/r/R/json.R b/r/R/json.R index 1f05f84cfd02..006a7d7cf956 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -36,7 +36,12 @@ #' df <- read_json_arrow(tf) #' } read_json_arrow <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { - tab <- JsonTableReader$create(file, ...)$Read()$select(!!enquo(col_select)) + tab <- JsonTableReader$create(file, ...)$Read() + + col_select <- enquo(col_select) + if (!quo_is_null(col_select)) { + tab <- tab[vars_select(names(tab), !!col_select)] + } if (isTRUE(as_data_frame)) { tab <- as.data.frame(tab) diff --git a/r/R/record-batch.R b/r/R/record-batch.R index 712000a97ab2..6996112d2685 100644 --- a/r/R/record-batch.R +++ b/r/R/record-batch.R @@ -48,9 +48,7 @@ #' - `$names()`: Get all column names (called by `names(batch)`) #' - `$GetColumnByName(name)`: Extract an `Array` by string name #' - `$RemoveColumn(i)`: Drops a column from the batch by integer position -#' - `$select(spec)`: Return a new record batch with a selection of columns. -#' This supports the usual `character`, `numeric`, and `logical` selection -#' methods as well as "tidy select" expressions. +#' - `$selectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers. #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the #' indicated integer offset and going for the given length, or to the end #' of the table if `NULL`, the default. @@ -84,21 +82,12 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject, assert_that(is.string(name)) shared_ptr(Array, RecordBatch__GetColumnByName(self, name)) }, - select = function(spec) { - spec <- enquo(spec) - if (quo_is_null(spec)) { - self - } else { - all_vars <- self$names() - vars <- vars_select(all_vars, !!spec) - indices <- match(vars, all_vars) - shared_ptr(RecordBatch, RecordBatch__select(self, indices)) - } + SelectColumns = function(indices) { + shared_ptr(RecordBatch, RecordBatch__SelectColumns(self, indices)) }, RemoveColumn = function(i){ shared_ptr(RecordBatch, RecordBatch__RemoveColumn(self, i)) }, - Slice = function(offset, length = NULL) { if (is.null(length)) { shared_ptr(RecordBatch, RecordBatch__Slice1(self, offset)) @@ -218,7 +207,16 @@ names.RecordBatch <- function(x) x$names() if (!missing(j)) { # Selecting columns is cheaper than filtering rows, so do it first. # That way, if we're filtering too, we have fewer arrays to filter/slice/take - x <- x$select(j) + if (is_integerish(j)) { + if (all(j < 0)) { + # in R, negative j means "everything but j" + j <- setdiff(seq_len(x$num_columns), -1 * j) + } + x <- x$SelectColumns(as.integer(j) - 1L) + } else if (is.character(j)) { + x <- x$SelectColumns(match(j, names(x)) - 1L) + } + if (drop && ncol(x) == 1L) { x <- x$column(0) } diff --git a/r/R/table.R b/r/R/table.R index 20eb9bb0a672..81ba7fdf030e 100644 --- a/r/R/table.R +++ b/r/R/table.R @@ -55,9 +55,7 @@ #' - `$ColumnNames()`: Get all column names (called by `names(tab)`) #' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name #' - `$field(i)`: Extract a `Field` from the table schema by integer position -#' - `$select(spec)`: Return a new table with a selection of columns. -#' This supports the usual `character`, `numeric`, and `logical` selection -#' methods as well as "tidy select" expressions. +#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers. #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the #' indicated integer offset and going for the given length, or to the end #' of the table if `NULL`, the default. @@ -115,16 +113,8 @@ Table <- R6Class("Table", inherit = ArrowObject, shared_ptr(Table, Table__cast(self, target_schema, options)) }, - select = function(spec) { - spec <- enquo(spec) - if (quo_is_null(spec)) { - self - } else { - all_vars <- self$ColumnNames() - vars <- vars_select(all_vars, !!spec) - indices <- match(vars, all_vars) - shared_ptr(Table, Table__select(self, indices)) - } + SelectColumns = function(indices) { + shared_ptr(Table, Table__SelectColumns(self, indices)) }, Slice = function(offset, length = NULL) { diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd index 40c34968d63e..1cf70a00a157 100644 --- a/r/man/RecordBatch.Rd +++ b/r/man/RecordBatch.Rd @@ -48,9 +48,8 @@ the following R6 methods that map onto the underlying C++ methods: \item \verb{$names()}: Get all column names (called by \code{names(batch)}) \item \verb{$GetColumnByName(name)}: Extract an \code{Array} by string name \item \verb{$RemoveColumn(i)}: Drops a column from the batch by integer position -\item \verb{$select(spec)}: Return a new record batch with a selection of columns. -This supports the usual \code{character}, \code{numeric}, and \code{logical} selection -methods as well as "tidy select" expressions. +\item \verb{$selectColumns(indices)}: Return a new record batch with a selection of columns. Supports +0-based integer indices and character vectors. \item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the table if \code{NULL}, the default. diff --git a/r/man/Table.Rd b/r/man/Table.Rd index 2014a30be9e8..e53cba6ba60a 100644 --- a/r/man/Table.Rd +++ b/r/man/Table.Rd @@ -46,9 +46,8 @@ the following R6 methods that map onto the underlying C++ methods: \item \verb{$ColumnNames()}: Get all column names (called by \code{names(tab)}) \item \verb{$GetColumnByName(name)}: Extract a \code{ChunkedArray} by string name \item \verb{$field(i)}: Extract a \code{Field} from the table schema by integer position -\item \verb{$select(spec)}: Return a new table with a selection of columns. -This supports the usual \code{character}, \code{numeric}, and \code{logical} selection -methods as well as "tidy select" expressions. +\item \verb{$SelectColumns(indices)}: Return new \code{Table} with specified columns. Supports +0-based integer indices and character vectors. \item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the indicated integer offset and going for the given length, or to the end of the table if \code{NULL}, the default. diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index b6c8f2b91c86..b162a795ac86 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4925,17 +4925,17 @@ extern "C" SEXP _arrow_RecordBatch__GetColumnByName(SEXP batch_sexp, SEXP name_s // recordbatch.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr RecordBatch__select(const std::shared_ptr& batch, cpp11::integers indices); -extern "C" SEXP _arrow_RecordBatch__select(SEXP batch_sexp, SEXP indices_sexp){ +std::shared_ptr RecordBatch__SelectColumns(const std::shared_ptr& batch, cpp11::integers indices); +extern "C" SEXP _arrow_RecordBatch__SelectColumns(SEXP batch_sexp, SEXP indices_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); arrow::r::Input::type indices(indices_sexp); - return cpp11::as_sexp(RecordBatch__select(batch, indices)); + return cpp11::as_sexp(RecordBatch__SelectColumns(batch, indices)); END_CPP11 } #else -extern "C" SEXP _arrow_RecordBatch__select(SEXP batch_sexp, SEXP indices_sexp){ - Rf_error("Cannot call RecordBatch__select(). Please use arrow::install_arrow() to install required runtime libraries. "); +extern "C" SEXP _arrow_RecordBatch__SelectColumns(SEXP batch_sexp, SEXP indices_sexp){ + Rf_error("Cannot call RecordBatch__SelectColumns(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -5880,17 +5880,17 @@ extern "C" SEXP _arrow_Table__GetColumnByName(SEXP table_sexp, SEXP name_sexp){ // table.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr Table__select(const std::shared_ptr& table, cpp11::integers indices); -extern "C" SEXP _arrow_Table__select(SEXP table_sexp, SEXP indices_sexp){ +std::shared_ptr Table__SelectColumns(const std::shared_ptr& table, const std::vector& indices); +extern "C" SEXP _arrow_Table__SelectColumns(SEXP table_sexp, SEXP indices_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type indices(indices_sexp); - return cpp11::as_sexp(Table__select(table, indices)); + arrow::r::Input&>::type indices(indices_sexp); + return cpp11::as_sexp(Table__SelectColumns(table, indices)); END_CPP11 } #else -extern "C" SEXP _arrow_Table__select(SEXP table_sexp, SEXP indices_sexp){ - Rf_error("Cannot call Table__select(). Please use arrow::install_arrow() to install required runtime libraries. "); +extern "C" SEXP _arrow_Table__SelectColumns(SEXP table_sexp, SEXP indices_sexp){ + Rf_error("Cannot call Table__SelectColumns(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif @@ -6310,7 +6310,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, - { "_arrow_RecordBatch__select", (DL_FUNC) &_arrow_RecordBatch__select, 2}, + { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, @@ -6371,7 +6371,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, - { "_arrow_Table__select", (DL_FUNC) &_arrow_Table__select, 2}, + { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 2}, diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index e9648da55ac3..02b61f60633d 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -77,7 +77,7 @@ std::shared_ptr RecordBatch__GetColumnByName( } // [[arrow::export]] -std::shared_ptr RecordBatch__select( +std::shared_ptr RecordBatch__SelectColumns( const std::shared_ptr& batch, cpp11::integers indices) { R_xlen_t n = indices.size(); auto nrows = batch->num_rows(); @@ -86,7 +86,7 @@ std::shared_ptr RecordBatch__select( std::vector> columns(n); for (R_xlen_t i = 0; i < n; i++) { - int pos = indices[i] - 1; + int pos = indices[i]; fields[i] = batch->schema()->field(pos); columns[i] = batch->column(pos); } diff --git a/r/src/table.cpp b/r/src/table.cpp index 6a56e8e8f2d9..5024df970092 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -115,21 +115,9 @@ std::shared_ptr Table__GetColumnByName( } // [[arrow::export]] -std::shared_ptr Table__select(const std::shared_ptr& table, - cpp11::integers indices) { - R_xlen_t n = indices.size(); - - std::vector> fields(n); - std::vector> columns(n); - - for (R_xlen_t i = 0; i < n; i++) { - int pos = indices[i] - 1; - fields[i] = table->schema()->field(pos); - columns[i] = table->column(pos); - } - - auto schema = std::make_shared(std::move(fields)); - return arrow::Table::Make(schema, columns); +std::shared_ptr Table__SelectColumns( + const std::shared_ptr& table, const std::vector& indices) { + return ValueOrStop(table->SelectColumns(indices)); } namespace arrow { diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index bd6206ab1979..0758edd45097 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -130,16 +130,16 @@ test_that("[, [[, $ for Table", { expect_null(tab[["asdf"]]) # List-like column slicing expect_data_frame(tab[2:4], tbl[2:4]) - expect_data_frame(tab[c(1, 0)], tbl[c(1, 0)]) + expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) + expect_data_frame(tab[-3], tbl[-3]) expect_error(tab[[c(4, 3)]]) expect_error(tab[[NA]], "'i' must be character or numeric, not logical") expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL") expect_error(tab[[c("asdf", "jkl;")]], 'length(name) not equal to 1', fixed = TRUE) - expect_error(tab[-3], "Selections can't have negative value") # From tidyselect - expect_error(tab[-3:3], "Selections can't have negative value") # From tidyselect - expect_error(tab[1000]) # This is caught in vctrs, assert more specifically when it stabilizes - expect_error(tab[1:1000]) # same as ^ + expect_error(tab[-3:3], "Invalid column index") + expect_error(tab[1000], "Invalid column index") + expect_error(tab[1:1000], "Invalid column index") skip("Table with 0 cols doesn't know how many rows it should have") expect_data_frame(tab[0], tbl[0]) @@ -349,3 +349,12 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { expect_identical(as.data.frame(tab), res) }) + +test_that("Table$SelectColumns()", { + tab <- Table$create(x = 1:10, y = 1:10) + + expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10)) + + expect_error(tab$SelectColumns(2:4)) + expect_error(tab$SelectColumns("")) +})