aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorSun Rui <rui.sun@intel.com>2015-11-29 11:08:26 -0800
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2015-11-29 11:08:26 -0800
commitcc7a1bc9370b163f51230e5ca4be612d133a5086 (patch)
treec4220282a67132df0b93f843b9d4ce327a18e134 /R
parentc793d2d9a1ccc203fc103eb0636958fe8d71f471 (diff)
downloadspark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.gz
spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.bz2
spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.zip
[SPARK-11781][SPARKR] SparkR has problem in inferring type of raw type.
Author: Sun Rui <rui.sun@intel.com> Closes #9769 from sun-rui/SPARK-11781.
Diffstat (limited to 'R')
-rw-r--r--R/pkg/R/DataFrame.R34
-rw-r--r--R/pkg/R/SQLContext.R2
-rw-r--r--R/pkg/R/types.R37
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R6
4 files changed, 47 insertions, 32 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index f89e2682d9..a82ded9c51 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -793,8 +793,8 @@ setMethod("dim",
setMethod("collect",
signature(x = "DataFrame"),
function(x, stringsAsFactors = FALSE) {
- names <- columns(x)
- ncol <- length(names)
+ dtypes <- dtypes(x)
+ ncol <- length(dtypes)
if (ncol <= 0) {
# empty data.frame with 0 columns and 0 rows
data.frame()
@@ -817,25 +817,29 @@ setMethod("collect",
# data of complex type can be held. But getting a cell from a column
# of list type returns a list instead of a vector. So for columns of
# non-complex type, append them as vector.
+ #
+ # For columns of complex type, be careful to access them.
+ # Get a column of complex type returns a list.
+ # Get a cell from a column of complex type returns a list instead of a vector.
col <- listCols[[colIndex]]
+ colName <- dtypes[[colIndex]][[1]]
if (length(col) <= 0) {
- df[[names[colIndex]]] <- col
+ df[[colName]] <- col
} else {
- # TODO: more robust check on column of primitive types
- vec <- do.call(c, col)
- if (class(vec) != "list") {
- df[[names[colIndex]]] <- vec
+ colType <- dtypes[[colIndex]][[2]]
+ # Note that "binary" columns behave like complex types.
+ if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
+ vec <- do.call(c, col)
+ stopifnot(class(vec) != "list")
+ df[[colName]] <- vec
} else {
- # For columns of complex type, be careful to access them.
- # Get a column of complex type returns a list.
- # Get a cell from a column of complex type returns a list instead of a vector.
- df[[names[colIndex]]] <- col
- }
+ df[[colName]] <- col
+ }
+ }
}
+ df
}
- df
- }
- })
+ })
#' Limit
#'
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a62b25fde9..85541c8e22 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -63,7 +63,7 @@ infer_type <- function(x) {
})
type <- Reduce(paste0, type)
type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">")
- } else if (length(x) > 1) {
+ } else if (length(x) > 1 && type != "binary") {
paste0("array<", infer_type(x[[1]]), ">")
} else {
type
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index dae4fe858b..1f06af7e90 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -19,25 +19,30 @@
# values are equivalent R types. This is stored in an environment to allow for
# more efficient look up (environments use hashmaps).
PRIMITIVE_TYPES <- as.environment(list(
- "byte"="integer",
- "tinyint"="integer",
- "smallint"="integer",
- "integer"="integer",
- "bigint"="numeric",
- "float"="numeric",
- "double"="numeric",
- "decimal"="numeric",
- "string"="character",
- "binary"="raw",
- "boolean"="logical",
- "timestamp"="POSIXct",
- "date"="Date"))
+ "tinyint" = "integer",
+ "smallint" = "integer",
+ "int" = "integer",
+ "bigint" = "numeric",
+ "float" = "numeric",
+ "double" = "numeric",
+ "decimal" = "numeric",
+ "string" = "character",
+ "binary" = "raw",
+ "boolean" = "logical",
+ "timestamp" = "POSIXct",
+ "date" = "Date",
+ # following types are not SQL types returned by dtypes(). They are listed here for usage
+ # by checkType() in schema.R.
+ # TODO: refactor checkType() in schema.R.
+ "byte" = "integer",
+ "integer" = "integer"
+ ))
# The complex data types. These do not have any direct mapping to R's types.
COMPLEX_TYPES <- list(
- "map"=NA,
- "array"=NA,
- "struct"=NA)
+ "map" = NA,
+ "array" = NA,
+ "struct" = NA)
# The full list of data types.
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index d3b2f20bf8..92ec82096c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -72,6 +72,8 @@ test_that("infer types and check types", {
expect_equal(infer_type(e), "map<string,integer>")
expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
+
+ expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
})
test_that("structType and structField", {
@@ -250,6 +252,10 @@ test_that("create DataFrame from list or data.frame", {
mtcarsdf <- createDataFrame(sqlContext, mtcars)
expect_equivalent(collect(mtcarsdf), mtcars)
+
+ bytes <- as.raw(c(1, 2, 3))
+ df <- createDataFrame(sqlContext, list(list(bytes)))
+ expect_equal(collect(df)[[1]][[1]], bytes)
})
test_that("create DataFrame with different data types", {