[SPARK-11781][SPARKR] SparkR has problem in inferring type of raw type.

Author: Sun Rui <rui.sun@intel.com> Closes #9769 from sun-rui/SPARK-11781.
author: Sun Rui <rui.sun@intel.com> 2015-11-29 11:08:26 -0800
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2015-11-29 11:08:26 -0800
commit: cc7a1bc9370b163f51230e5ca4be612d133a5086 (patch)
tree: c4220282a67132df0b93f843b9d4ce327a18e134 /R
parent: c793d2d9a1ccc203fc103eb0636958fe8d71f471 (diff)
download: spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.gz
spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.bz2
spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.zip
4 files changed, 47 insertions, 32 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index f89e2682d9..a82ded9c51 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -793,8 +793,8 @@ setMethod("dim",
 setMethod("collect",
           signature(x = "DataFrame"),
           function(x, stringsAsFactors = FALSE) {
-            names <- columns(x)
-            ncol <- length(names)
+            dtypes <- dtypes(x)
+            ncol <- length(dtypes)
             if (ncol <= 0) {
               # empty data.frame with 0 columns and 0 rows
               data.frame()
@@ -817,25 +817,29 @@ setMethod("collect",
                 # data of complex type can be held. But getting a cell from a column
                 # of list type returns a list instead of a vector. So for columns of
                 # non-complex type, append them as vector.
+                #
+                # For columns of complex type, be careful to access them.
+                # Get a column of complex type returns a list.
+                # Get a cell from a column of complex type returns a list instead of a vector.
                 col <- listCols[[colIndex]]
+                colName <- dtypes[[colIndex]][[1]]
                 if (length(col) <= 0) {
-                  df[[names[colIndex]]] <- col
+                  df[[colName]] <- col
                 } else {
-                  # TODO: more robust check on column of primitive types
-                  vec <- do.call(c, col)
-                  if (class(vec) != "list") {
-                    df[[names[colIndex]]] <- vec
+                  colType <- dtypes[[colIndex]][[2]]
+                  # Note that "binary" columns behave like complex types.
+                  if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
+                    vec <- do.call(c, col)
+                    stopifnot(class(vec) != "list")
+                    df[[colName]] <- vec
                   } else {
-                    # For columns of complex type, be careful to access them.
-                    # Get a column of complex type returns a list.
-                    # Get a cell from a column of complex type returns a list instead of a vector.
-                    df[[names[colIndex]]] <- col
-                 }
+                    df[[colName]] <- col
+                  }
+                }
               }
+              df
             }
-            df
-          }
-        })
+          })
 
 #' Limit
 #'
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a62b25fde9..85541c8e22 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -63,7 +63,7 @@ infer_type <- function(x) {
     })
     type <- Reduce(paste0, type)
     type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">")
-  } else if (length(x) > 1) {
+  } else if (length(x) > 1 && type != "binary") {
     paste0("array<", infer_type(x[[1]]), ">")
   } else {
     type
diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R
index dae4fe858b..1f06af7e90 100644
--- a/R/pkg/R/types.R
+++ b/R/pkg/R/types.R
@@ -19,25 +19,30 @@
 # values are equivalent R types. This is stored in an environment to allow for
 # more efficient look up (environments use hashmaps).
 PRIMITIVE_TYPES <- as.environment(list(
-  "byte"="integer",
-  "tinyint"="integer",
-  "smallint"="integer",
-  "integer"="integer",
-  "bigint"="numeric",
-  "float"="numeric",
-  "double"="numeric",
-  "decimal"="numeric",
-  "string"="character",
-  "binary"="raw",
-  "boolean"="logical",
-  "timestamp"="POSIXct",
-  "date"="Date"))
+  "tinyint" = "integer",
+  "smallint" = "integer",
+  "int" = "integer",
+  "bigint" = "numeric",
+  "float" = "numeric",
+  "double" = "numeric",
+  "decimal" = "numeric",
+  "string" = "character",
+  "binary" = "raw",
+  "boolean" = "logical",
+  "timestamp" = "POSIXct",
+  "date" = "Date",
+  # following types are not SQL types returned by dtypes(). They are listed here for usage
+  # by checkType() in schema.R.
+  # TODO: refactor checkType() in schema.R.
+  "byte" = "integer",
+  "integer" = "integer"
+  ))
 
 # The complex data types. These do not have any direct mapping to R's types.
 COMPLEX_TYPES <- list(
-  "map"=NA,
-  "array"=NA,
-  "struct"=NA)
+  "map" = NA,
+  "array" = NA,
+  "struct" = NA)
 
 # The full list of data types.
 DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index d3b2f20bf8..92ec82096c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -72,6 +72,8 @@ test_that("infer types and check types", {
   expect_equal(infer_type(e), "map<string,integer>")
 
   expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
+
+  expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
 })
 
 test_that("structType and structField", {
@@ -250,6 +252,10 @@ test_that("create DataFrame from list or data.frame", {
 
   mtcarsdf <- createDataFrame(sqlContext, mtcars)
   expect_equivalent(collect(mtcarsdf), mtcars)
+
+  bytes <- as.raw(c(1, 2, 3))
+  df <- createDataFrame(sqlContext, list(list(bytes)))
+  expect_equal(collect(df)[[1]][[1]], bytes)
 })
 
 test_that("create DataFrame with different data types", {
author	Sun Rui <rui.sun@intel.com>	2015-11-29 11:08:26 -0800
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2015-11-29 11:08:26 -0800
commit	cc7a1bc9370b163f51230e5ca4be612d133a5086 (patch)
tree	c4220282a67132df0b93f843b9d4ce327a18e134 /R
parent	c793d2d9a1ccc203fc103eb0636958fe8d71f471 (diff)
download	spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.gz spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.tar.bz2 spark-cc7a1bc9370b163f51230e5ca4be612d133a5086.zip