diff options
author | Hossein <hossein@databricks.com> | 2016-10-12 10:32:38 -0700 |
---|---|---|
committer | Felix Cheung <felixcheung@apache.org> | 2016-10-12 10:32:38 -0700 |
commit | 5cc503f4fe9737a4c7947a80eecac053780606df (patch) | |
tree | 02cfea5ff7007d7375b17786880d55a6867eedb7 /core/src/main | |
parent | d5580ebaa086b9feb72d5428f24c5b60cd7da745 (diff) | |
download | spark-5cc503f4fe9737a4c7947a80eecac053780606df.tar.gz spark-5cc503f4fe9737a4c7947a80eecac053780606df.tar.bz2 spark-5cc503f4fe9737a4c7947a80eecac053780606df.zip |
[SPARK-17790][SPARKR] Support for parallelizing R data.frame larger than 2GB
## What changes were proposed in this pull request?
If the R data structure that is being parallelized is larger than `INT_MAX` we use files to transfer data to JVM. The serialization protocol mimics Python pickling. This allows us to simply call `PythonRDD.readRDDFromFile` to create the RDD.
I tested this on my MacBook. Following code works with this patch:
```R
intMax <- .Machine$integer.max
largeVec <- 1:intMax
rdd <- SparkR:::parallelize(sc, largeVec, 2)
```
## How was this patch tested?
* [x] Unit tests
Author: Hossein <hossein@databricks.com>
Closes #15375 from falaki/SPARK-17790.
Diffstat (limited to 'core/src/main')
-rw-r--r-- | core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala | 2 | ||||
-rw-r--r-- | core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 13 |
2 files changed, 14 insertions, 1 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala index 7d5348266b..1422ef888f 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RBackendHandler.scala @@ -168,7 +168,7 @@ private[r] class RBackendHandler(server: RBackend) } } catch { case e: Exception => - logError(s"$methodName on $objId failed") + logError(s"$methodName on $objId failed", e) writeInt(dos, -1) // Writing the error message of the cause for the exception. This will be returned // to user in the R process. diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 59c8429c80..a1a5eb8cf5 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -24,6 +24,7 @@ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} +import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD @@ -140,4 +141,16 @@ private[r] object RRDD { def createRDDFromArray(jsc: JavaSparkContext, arr: Array[Array[Byte]]): JavaRDD[Array[Byte]] = { JavaRDD.fromRDD(jsc.sc.parallelize(arr, arr.length)) } + + /** + * Create an RRDD given a temporary file name. This is used to create RRDD when parallelize is + * called on large R objects. + * + * @param fileName name of temporary file on driver machine + * @param parallelism number of slices defaults to 4 + */ + def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): + JavaRDD[Array[Byte]] = { + PythonRDD.readRDDFromFile(jsc, fileName, parallelism) + } } |