author     Felix Cheung <felixcheung_m@hotmail.com>  2017-03-19 10:37:15 -0700
committer  Felix Cheung <felixcheung@apache.org>     2017-03-19 10:37:15 -0700
commit     422aa67d1bb84f913b06e6d94615adb6557e2870 (patch)
tree       bf1378ba979f1135dabb24817b9d5e1ca9a4a3fd /R
parent     60262bc951864a7a3874ab3570b723198e99d613 (diff)
[SPARK-18817][SPARKR][SQL] change derby log output to temp dir
## What changes were proposed in this pull request?

Passes R `tempdir()` (the R session temp dir, shared with other temp files/dirs) to the JVM, and sets the System property for the derby home dir so that derby.log is written there.

## How was this patch tested?

Manually, and with unit tests.

With this change, derby.log is relocated to under /tmp:

```
# ls /tmp/RtmpG2M0cB/
derby.log
```

It is removed automatically when the R session ends.

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #16330 from felixcheung/rderby.
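As a usage note: the session-level default can be overridden, since named Spark properties passed to `sparkR.session()` take priority over the built-in default added by this change. A minimal sketch, assuming a plain local session and a hypothetical directory path:

```r
library(SparkR)

# sparkR.session() now defaults spark.r.sql.derby.temp.dir to tempdir();
# passing the property explicitly overrides that default.
sparkSession <- sparkR.session(
  master = "local[2]",
  spark.r.sql.derby.temp.dir = "/tmp/my-derby-home"  # hypothetical path
)
```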
Diffstat (limited to 'R')
-rw-r--r--  R/pkg/R/sparkR.R                            15
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R   34
-rw-r--r--  R/pkg/tests/run-all.R                        6
3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index 61773ed3ee..d0a12b7ece 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -322,10 +322,19 @@ sparkRHive.init <- function(jsc = NULL) {
#' SparkSession or initializes a new SparkSession.
#' Additional Spark properties can be set in \code{...}, and these named parameters take priority
#' over values in \code{master}, \code{appName}, and named lists of \code{sparkConfig}.
-#' When called in an interactive session, this checks for the Spark installation, and, if not
+#'
+#' When called in an interactive session, this method checks for the Spark installation; if it is
+#' not found, Spark will be downloaded and cached automatically. Alternatively, \code{install.spark}
+#' can be called manually.
#'
+#' A default warehouse is created automatically in the current directory when a managed table is
+#' created via \code{sql} statement \code{CREATE TABLE}, for example. To change the location of the
+#' warehouse, set the named parameter \code{spark.sql.warehouse.dir} to the SparkSession. Along with
+#' the warehouse, an accompanying metastore may also be automatically created in the current
+#' directory when a new SparkSession is initialized with \code{enableHiveSupport} set to
+#' \code{TRUE}, which is the default. For more details, refer to Hive configuration at
+#' \url{http://spark.apache.org/docs/latest/sql-programming-guide.html#hive-tables}.
+#'
#' For details on how to initialize and use SparkR, refer to SparkR programming guide at
#' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}.
#'
@@ -381,6 +390,10 @@ sparkR.session <- function(
deployMode <- sparkConfigMap[["spark.submit.deployMode"]]
}
+ if (!exists("spark.r.sql.derby.temp.dir", envir = sparkConfigMap)) {
+ sparkConfigMap[["spark.r.sql.derby.temp.dir"]] <- tempdir()
+ }
+
if (!exists(".sparkRjsc", envir = .sparkREnv)) {
retHome <- sparkCheckInstall(sparkHome, master, deployMode)
if (!is.null(retHome)) sparkHome <- retHome
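The R side above only records `spark.r.sql.derby.temp.dir` in the session config; the JVM half of SPARK-18817 (outside this R-limited diff) is assumed to map it onto derby's home directory. A hedged verification sketch, using SparkR's internal `callJStatic` helper (the same helper the test file below uses) and assuming the JVM side sets the `derby.system.home` system property:

```r
library(SparkR)
sparkSession <- sparkR.session()

# Assumption: the Scala side translates spark.r.sql.derby.temp.dir into the
# "derby.system.home" JVM system property; callJStatic is SparkR-internal.
SparkR:::callJStatic("java.lang.System", "getProperty", "derby.system.home")
# expected: the R session temp dir, e.g. "/tmp/RtmpG2M0cB"
```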
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index f7081cb1d4..32856b399c 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -60,6 +60,7 @@ unsetHiveContext <- function() {
# Tests for SparkSQL functions in SparkR
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
sparkSession <- sparkR.session()
sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
@@ -2909,6 +2910,39 @@ test_that("Collect on DataFrame when NAs exists at the top of a timestamp column
expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt"))
})
+compare_list <- function(list1, list2) {
+ # get testthat to show the diff by first making the 2 lists equal in length
+ expect_equal(length(list1), length(list2))
+ l <- max(length(list1), length(list2))
+ length(list1) <- l
+ length(list2) <- l
+ expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE))
+}
+
+# This should always be the **very last test** in this test file.
+test_that("No extra files are created in SPARK_HOME by starting session and making calls", {
+ # Check that no extra files were created.
+ # Does not check the tempdir, which is cleaned up when the R session ends.
+ filesAfter <- list.files(path = sparkRDir, all.files = TRUE)
+
+ expect_true(length(sparkRFilesBefore) > 0)
+ # first, ensure derby.log is not there
+ expect_false("derby.log" %in% filesAfter)
+ # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F
+ # note: currently all other test files have enableHiveSupport = F, so we capture the list of files
+ # before creating a SparkSession with enableHiveSupport = T at the top of this test file
+ # (filesBefore). The test here is to compare that (filesBefore) against the list of files before
+ # any test is run in run-all.R (sparkRFilesBefore).
+ # sparkRWhitelistSQLDirs is defined in run-all.R and contains the only 2 whitelisted dirs.
+ # Here we allow the first value, spark-warehouse, in the diff; everything else should be
+ # exactly the same as before any test is run.
+ compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+ # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
+ # note: as in the note above, after running all tests in this file with enableHiveSupport = T, we
+ # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
+ compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+})
+
unlink(parquetPath)
unlink(orcPath)
unlink(jsonPath)
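To illustrate the padding trick in `compare_list` above: `expect_equal` on vectors of unequal length fails with an opaque length error, so the helper pads the shorter one with `NA` first, letting testthat's diff point at the exact leaked entry. A minimal standalone sketch with hypothetical file names:

```r
library(testthat)

before <- c("DESCRIPTION", "NAMESPACE")
after  <- c("DESCRIPTION", "NAMESPACE", "derby.log")

# Extending `before` to the longer length pads it with NA, so the sorted
# comparison reports "derby.log" vs NA instead of a bare length mismatch.
length(before) <- max(length(before), length(after))
expect_equal(sort(after, na.last = TRUE), sort(before, na.last = TRUE))
# fails, naming the leaked "derby.log" entry in the diff
```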
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index ab8d1ca019..cefaadda6e 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -22,6 +22,12 @@ library(SparkR)
options("warn" = 2)
# Setup global test environment
+sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
+sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
+invisible(lapply(sparkRWhitelistSQLDirs,
+ function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE) }))
+
install.spark()
test_package("SparkR")
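For reference, the setup above can be read as the following standalone sketch (assuming `SPARK_HOME` points at a Spark checkout): it snapshots the `R` directory before any test runs and clears leftover whitelisted SQL dirs so the final leak test in test_sparkSQL.R starts from a clean slate.

```r
sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")

# Snapshot everything (including dotfiles); the last test in
# test_sparkSQL.R diffs against this to catch stray files like derby.log.
sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)

# Remove whitelisted SQL dirs left over from earlier runs so the diff
# is meaningful; only these two dirs are allowed to appear afterwards.
sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
for (d in sparkRWhitelistSQLDirs) {
  unlink(file.path(sparkRDir, d), recursive = TRUE, force = TRUE)
}
```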