apache · romainfrancois · Sep 7, 2020 · Sep 11, 2020 · Sep 11, 2020 · Sep 11, 2020
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -129,7 +129,12 @@ read_delim_arrow <- function(file,
     convert_options = convert_options
   )
 
-  tab <- reader$Read()$select(!!enquo(col_select))
+  tab <- reader$Read()
+
+  col_select <- enquo(col_select)
+  if (!quo_is_null(col_select)) {
+    tab <- tab[vars_select(names(tab), !!col_select)]
+  }
 
   if (isTRUE(as_data_frame)) {
     tab <- as.data.frame(tab)

diff --git a/r/R/json.R b/r/R/json.R
@@ -36,7 +36,12 @@
 #'   df <- read_json_arrow(tf)
 #' }
 read_json_arrow <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
-  tab <- JsonTableReader$create(file, ...)$Read()$select(!!enquo(col_select))
+  tab <- JsonTableReader$create(file, ...)$Read()
+
+  col_select <- enquo(col_select)
+  if (!quo_is_null(col_select)) {
+    tab <- tab[vars_select(names(tab), !!col_select)]
+  }
 
   if (isTRUE(as_data_frame)) {
     tab <- as.data.frame(tab)

diff --git a/r/R/record-batch.R b/r/R/record-batch.R
@@ -48,9 +48,7 @@
 #' - `$names()`: Get all column names (called by `names(batch)`)
 #' - `$GetColumnByName(name)`: Extract an `Array` by string name
 #' - `$RemoveColumn(i)`: Drops a column from the batch by integer position
-#' - `$select(spec)`: Return a new record batch with a selection of columns.
-#'    This supports the usual `character`, `numeric`, and `logical` selection
-#'    methods as well as "tidy select" expressions.
+#' - `$SelectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers.
 #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
 #'    indicated integer offset and going for the given length, or to the end
 #'    of the table if `NULL`, the default.
@@ -84,21 +82,12 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject,
       assert_that(is.string(name))
       shared_ptr(Array, RecordBatch__GetColumnByName(self, name))
     },
-    select = function(spec) {
-      spec <- enquo(spec)
-      if (quo_is_null(spec)) {
-        self
-      } else {
-        all_vars <- self$names()
-        vars <- vars_select(all_vars, !!spec)
-        indices <- match(vars, all_vars)
-        shared_ptr(RecordBatch, RecordBatch__select(self, indices))
-      }
+    SelectColumns = function(indices) {
+      shared_ptr(RecordBatch, RecordBatch__SelectColumns(self, indices))
     },
     RemoveColumn = function(i){
       shared_ptr(RecordBatch, RecordBatch__RemoveColumn(self, i))
     },
-
     Slice = function(offset, length = NULL) {
       if (is.null(length)) {
         shared_ptr(RecordBatch, RecordBatch__Slice1(self, offset))
@@ -218,7 +207,16 @@ names.RecordBatch <- function(x) x$names()
   if (!missing(j)) {
     # Selecting columns is cheaper than filtering rows, so do it first.
     # That way, if we're filtering too, we have fewer arrays to filter/slice/take
-    x <- x$select(j)
+    if (is_integerish(j)) {
+      if (all(j < 0)) {
+        # in R, negative j means "everything but j"
+        j <- setdiff(seq_len(x$num_columns), -1 * j)
+      }
+      x <- x$SelectColumns(as.integer(j) - 1L)
+    } else if (is.character(j)) {
+      x <- x$SelectColumns(match(j, names(x)) - 1L)
+    }
+
     if (drop && ncol(x) == 1L) {
       x <- x$column(0)
     }

diff --git a/r/R/table.R b/r/R/table.R
@@ -55,9 +55,7 @@
 #' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
 #' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
 #' - `$field(i)`: Extract a `Field` from the table schema by integer position
-#' - `$select(spec)`: Return a new table with a selection of columns.
-#'    This supports the usual `character`, `numeric`, and `logical` selection
-#'    methods as well as "tidy select" expressions.
+#' - `$SelectColumns(indices)`: Return a new `Table` with the specified columns, expressed as 0-based integers.
 #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
 #'    indicated integer offset and going for the given length, or to the end
 #'    of the table if `NULL`, the default.
@@ -115,16 +113,8 @@ Table <- R6Class("Table", inherit = ArrowObject,
       shared_ptr(Table, Table__cast(self, target_schema, options))
     },
 
-    select = function(spec) {
-      spec <- enquo(spec)
-      if (quo_is_null(spec)) {
-        self
-      } else {
-        all_vars <- self$ColumnNames()
-        vars <- vars_select(all_vars, !!spec)
-        indices <- match(vars, all_vars)
-        shared_ptr(Table, Table__select(self, indices))
-      }
+    SelectColumns = function(indices) {
+      shared_ptr(Table, Table__SelectColumns(self, indices))
     },
 
     Slice = function(offset, length = NULL) {

diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd
diff --git a/r/man/Table.Rd b/r/man/Table.Rd
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp
@@ -77,7 +77,7 @@ std::shared_ptr<arrow::Array> RecordBatch__GetColumnByName(
 }
 
 // [[arrow::export]]
-std::shared_ptr<arrow::RecordBatch> RecordBatch__select(
+std::shared_ptr<arrow::RecordBatch> RecordBatch__SelectColumns(
     const std::shared_ptr<arrow::RecordBatch>& batch, cpp11::integers indices) {
   R_xlen_t n = indices.size();
   auto nrows = batch->num_rows();
@@ -86,7 +86,7 @@ std::shared_ptr<arrow::RecordBatch> RecordBatch__select(
   std::vector<std::shared_ptr<arrow::Array>> columns(n);
 
   for (R_xlen_t i = 0; i < n; i++) {
-    int pos = indices[i] - 1;
+    int pos = indices[i];
     fields[i] = batch->schema()->field(pos);
     columns[i] = batch->column(pos);
   }

diff --git a/r/src/table.cpp b/r/src/table.cpp
@@ -115,21 +115,9 @@ std::shared_ptr<arrow::ChunkedArray> Table__GetColumnByName(
 }
 
 // [[arrow::export]]
-std::shared_ptr<arrow::Table> Table__select(const std::shared_ptr<arrow::Table>& table,
-                                            cpp11::integers indices) {
-  R_xlen_t n = indices.size();
-
-  std::vector<std::shared_ptr<arrow::Field>> fields(n);
-  std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(n);
-
-  for (R_xlen_t i = 0; i < n; i++) {
-    int pos = indices[i] - 1;
-    fields[i] = table->schema()->field(pos);
-    columns[i] = table->column(pos);
-  }
-
-  auto schema = std::make_shared<arrow::Schema>(std::move(fields));
-  return arrow::Table::Make(schema, columns);
+std::shared_ptr<arrow::Table> Table__SelectColumns(
+    const std::shared_ptr<arrow::Table>& table, const std::vector<int>& indices) {
+  return ValueOrStop(table->SelectColumns(indices));
 }
 
 namespace arrow {

diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
@@ -130,16 +130,16 @@ test_that("[, [[, $ for Table", {
   expect_null(tab[["asdf"]])
   # List-like column slicing
   expect_data_frame(tab[2:4], tbl[2:4])
-  expect_data_frame(tab[c(1, 0)], tbl[c(1, 0)])
+  expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)])
+  expect_data_frame(tab[-3], tbl[-3])
 
   expect_error(tab[[c(4, 3)]])
   expect_error(tab[[NA]], "'i' must be character or numeric, not logical")
   expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL")
   expect_error(tab[[c("asdf", "jkl;")]], 'length(name) not equal to 1', fixed = TRUE)
-  expect_error(tab[-3], "Selections can't have negative value") # From tidyselect
-  expect_error(tab[-3:3], "Selections can't have negative value") # From tidyselect
-  expect_error(tab[1000]) # This is caught in vctrs, assert more specifically when it stabilizes
-  expect_error(tab[1:1000]) # same as ^
+  expect_error(tab[-3:3], "Invalid column index")
+  expect_error(tab[1000],  "Invalid column index")
+  expect_error(tab[1:1000], "Invalid column index")
 
   skip("Table with 0 cols doesn't know how many rows it should have")
   expect_data_frame(tab[0], tbl[0])
@@ -349,3 +349,12 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
 
   expect_identical(as.data.frame(tab), res)
 })
+
+test_that("Table$SelectColumns()", {
+  tab <- Table$create(x = 1:10, y = 1:10)
+  # Indices are 0-based: 0L selects the first column, `x`.
+  expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10))
+  # `tab` has only two columns, so 2:4 is out of range; "" is not an integer index.
+  expect_error(tab$SelectColumns(2:4))
+  expect_error(tab$SelectColumns(""))
+})