Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion r/R/csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,12 @@ read_delim_arrow <- function(file,
convert_options = convert_options
)

tab <- reader$Read()$select(!!enquo(col_select))
tab <- reader$Read()

col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
}

if (isTRUE(as_data_frame)) {
tab <- as.data.frame(tab)
Expand Down
7 changes: 6 additions & 1 deletion r/R/json.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@
#' df <- read_json_arrow(tf)
#' }
read_json_arrow <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
tab <- JsonTableReader$create(file, ...)$Read()$select(!!enquo(col_select))
tab <- JsonTableReader$create(file, ...)$Read()

col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
tab <- tab[vars_select(names(tab), !!col_select)]
}

if (isTRUE(as_data_frame)) {
tab <- as.data.frame(tab)
Expand Down
28 changes: 13 additions & 15 deletions r/R/record-batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@
#' - `$names()`: Get all column names (called by `names(batch)`)
#' - `$GetColumnByName(name)`: Extract an `Array` by string name
#' - `$RemoveColumn(i)`: Drops a column from the batch by integer position
#' - `$select(spec)`: Return a new record batch with a selection of columns.
#' This supports the usual `character`, `numeric`, and `logical` selection
#' methods as well as "tidy select" expressions.
#' - `$selectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers.
#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
#' indicated integer offset and going for the given length, or to the end
#' of the table if `NULL`, the default.
Expand Down Expand Up @@ -84,21 +82,12 @@ RecordBatch <- R6Class("RecordBatch", inherit = ArrowObject,
assert_that(is.string(name))
shared_ptr(Array, RecordBatch__GetColumnByName(self, name))
},
select = function(spec) {
spec <- enquo(spec)
if (quo_is_null(spec)) {
self
} else {
all_vars <- self$names()
vars <- vars_select(all_vars, !!spec)
indices <- match(vars, all_vars)
shared_ptr(RecordBatch, RecordBatch__select(self, indices))
}
SelectColumns = function(indices) {
shared_ptr(RecordBatch, RecordBatch__SelectColumns(self, indices))
},
RemoveColumn = function(i){
shared_ptr(RecordBatch, RecordBatch__RemoveColumn(self, i))
},

Slice = function(offset, length = NULL) {
if (is.null(length)) {
shared_ptr(RecordBatch, RecordBatch__Slice1(self, offset))
Expand Down Expand Up @@ -218,7 +207,16 @@ names.RecordBatch <- function(x) x$names()
if (!missing(j)) {
# Selecting columns is cheaper than filtering rows, so do it first.
# That way, if we're filtering too, we have fewer arrays to filter/slice/take
x <- x$select(j)
if (is_integerish(j)) {
if (all(j < 0)) {
# in R, negative j means "everything but j"
j <- setdiff(seq_len(x$num_columns), -1 * j)
}
x <- x$SelectColumns(as.integer(j) - 1L)
Comment thread
nealrichardson marked this conversation as resolved.
} else if (is.character(j)) {
x <- x$SelectColumns(match(j, names(x)) - 1L)
}

if (drop && ncol(x) == 1L) {
x <- x$column(0)
}
Expand Down
16 changes: 3 additions & 13 deletions r/R/table.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@
#' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
#' - `$field(i)`: Extract a `Field` from the table schema by integer position
#' - `$select(spec)`: Return a new table with a selection of columns.
#' This supports the usual `character`, `numeric`, and `logical` selection
#' methods as well as "tidy select" expressions.
#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
#' indicated integer offset and going for the given length, or to the end
#' of the table if `NULL`, the default.
Expand Down Expand Up @@ -115,16 +113,8 @@ Table <- R6Class("Table", inherit = ArrowObject,
shared_ptr(Table, Table__cast(self, target_schema, options))
},

select = function(spec) {
spec <- enquo(spec)
if (quo_is_null(spec)) {
self
} else {
all_vars <- self$ColumnNames()
vars <- vars_select(all_vars, !!spec)
indices <- match(vars, all_vars)
shared_ptr(Table, Table__select(self, indices))
}
SelectColumns = function(indices) {
shared_ptr(Table, Table__SelectColumns(self, indices))
},

Slice = function(offset, length = NULL) {
Expand Down
5 changes: 2 additions & 3 deletions r/man/RecordBatch.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions r/man/Table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 13 additions & 13 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions r/src/recordbatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ std::shared_ptr<arrow::Array> RecordBatch__GetColumnByName(
}

// [[arrow::export]]
std::shared_ptr<arrow::RecordBatch> RecordBatch__select(
std::shared_ptr<arrow::RecordBatch> RecordBatch__SelectColumns(
const std::shared_ptr<arrow::RecordBatch>& batch, cpp11::integers indices) {
R_xlen_t n = indices.size();
auto nrows = batch->num_rows();
Expand All @@ -86,7 +86,7 @@ std::shared_ptr<arrow::RecordBatch> RecordBatch__select(
std::vector<std::shared_ptr<arrow::Array>> columns(n);

for (R_xlen_t i = 0; i < n; i++) {
int pos = indices[i] - 1;
int pos = indices[i];
fields[i] = batch->schema()->field(pos);
columns[i] = batch->column(pos);
}
Expand Down
18 changes: 3 additions & 15 deletions r/src/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,21 +115,9 @@ std::shared_ptr<arrow::ChunkedArray> Table__GetColumnByName(
}

// [[arrow::export]]
std::shared_ptr<arrow::Table> Table__select(const std::shared_ptr<arrow::Table>& table,
cpp11::integers indices) {
R_xlen_t n = indices.size();

std::vector<std::shared_ptr<arrow::Field>> fields(n);
std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(n);

for (R_xlen_t i = 0; i < n; i++) {
int pos = indices[i] - 1;
fields[i] = table->schema()->field(pos);
columns[i] = table->column(pos);
}

auto schema = std::make_shared<arrow::Schema>(std::move(fields));
return arrow::Table::Make(schema, columns);
std::shared_ptr<arrow::Table> Table__SelectColumns(
const std::shared_ptr<arrow::Table>& table, const std::vector<int>& indices) {
return ValueOrStop(table->SelectColumns(indices));
}

namespace arrow {
Expand Down
19 changes: 14 additions & 5 deletions r/tests/testthat/test-Table.R
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,16 @@ test_that("[, [[, $ for Table", {
expect_null(tab[["asdf"]])
# List-like column slicing
expect_data_frame(tab[2:4], tbl[2:4])
expect_data_frame(tab[c(1, 0)], tbl[c(1, 0)])
expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we maintain the base R behavior for subsetting on column 0 (it is ignored)? I can't imagine why someone would do such a thing on purpose.

expect_data_frame(tab[-3], tbl[-3])

expect_error(tab[[c(4, 3)]])
expect_error(tab[[NA]], "'i' must be character or numeric, not logical")
expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL")
expect_error(tab[[c("asdf", "jkl;")]], 'length(name) not equal to 1', fixed = TRUE)
expect_error(tab[-3], "Selections can't have negative value") # From tidyselect
expect_error(tab[-3:3], "Selections can't have negative value") # From tidyselect
expect_error(tab[1000]) # This is caught in vctrs, assert more specifically when it stabilizes
expect_error(tab[1:1000]) # same as ^
expect_error(tab[-3:3], "Invalid column index")
expect_error(tab[1000], "Invalid column index")
expect_error(tab[1:1000], "Invalid column index")

skip("Table with 0 cols doesn't know how many rows it should have")
expect_data_frame(tab[0], tbl[0])
Expand Down Expand Up @@ -349,3 +349,12 @@ test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {

expect_identical(as.data.frame(tab), res)
})

test_that("Table$SelectColumns()", {
tab <- Table$create(x = 1:10, y = 1:10)

expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10))

expect_error(tab$SelectColumns(2:4))
expect_error(tab$SelectColumns(""))
})