[SPARK-11086][SPARKR] Use dropFactors column-wise instead of nested loop when createDataFrame

Use `dropFactors` column-wise instead of nested loop when `createDataFrame` from a `data.frame` At this moment SparkR createDataFrame is using nested loop to convert factors to character when called on a local data.frame. It works but is incredibly slow especially with data.table (~ 2 orders of magnitude compared to PySpark / Pandas version on a DateFrame of size 1M rows x 2 columns). A simple improvement is to apply `dropFactor `column-wise and then reshape output list. It should at least partially address [SPARK-8277](https://issues.apache.org/jira/browse/SPARK-8277). Author: zero323 <matthew.szymkiewicz@gmail.com> Closes #9099 from zero323/SPARK-11086.
author: zero323 <matthew.szymkiewicz@gmail.com> 2015-11-15 19:15:27 -0800
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2015-11-15 19:15:27 -0800
commit: d7d9fa0b8750166f8b74f9bc321df26908683a8b (patch)
tree: cbd4e96432c4f54ae07b5417eb97a22db7875b9a /R/pkg/inst
parent: 72c1d68b4ab6acb3f85971e10947caabb4bd846d (diff)
download: spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.tar.gz
spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.tar.bz2
spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.zip
1 files changed, 16 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index af024e6183..8ff0627659 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -242,6 +242,14 @@ test_that("create DataFrame from list or data.frame", {
   expect_equal(count(df), 3)
   ldf2 <- collect(df)
   expect_equal(ldf$a, ldf2$a)
+
+  irisdf <- createDataFrame(sqlContext, iris)
+  iris_collected <- collect(irisdf)
+  expect_equivalent(iris_collected[,-5], iris[,-5])
+  expect_equal(iris_collected$Species, as.character(iris$Species))
+
+  mtcarsdf <- createDataFrame(sqlContext, mtcars)
+  expect_equivalent(collect(mtcarsdf), mtcars)
 })
 
 test_that("create DataFrame with different data types", {
@@ -283,6 +291,14 @@ test_that("create DataFrame with complex types", {
   expect_equal(s$b, 3L)
 })
 
+test_that("create DataFrame from a data.frame with complex types", {
+  ldf <- data.frame(row.names=1:2)
+  ldf$a_list <- list(list(1, 2), list(3, 4))
+  sdf <- createDataFrame(sqlContext, ldf)
+
+  expect_equivalent(ldf, collect(sdf))
+})
+
 # For test map type and struct type in DataFrame
 mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
                       "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
author	zero323 <matthew.szymkiewicz@gmail.com>	2015-11-15 19:15:27 -0800
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2015-11-15 19:15:27 -0800
commit	d7d9fa0b8750166f8b74f9bc321df26908683a8b (patch)
tree	cbd4e96432c4f54ae07b5417eb97a22db7875b9a /R/pkg/inst
parent	72c1d68b4ab6acb3f85971e10947caabb4bd846d (diff)
download	spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.tar.gz spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.tar.bz2 spark-d7d9fa0b8750166f8b74f9bc321df26908683a8b.zip