From a55fbe2a16aa0866ff8aca25bf9f772e6eb516a1 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Sat, 23 Apr 2016 00:20:27 -0700 Subject: [SPARK-12148][SPARKR] SparkR: rename DataFrame to SparkDataFrame ## What changes were proposed in this pull request? Changed class name defined in R from "DataFrame" to "SparkDataFrame". A popular package, S4Vector already defines "DataFrame" - this change is to avoid conflict. Aside from class name and API/roxygen2 references, SparkR APIs like `createDataFrame`, `as.DataFrame` are not changed (S4Vector does not define a "as.DataFrame"). Since in R, one would rarely reference type/class, this change should have minimal/almost-no impact to a SparkR user in terms of back compat. ## How was this patch tested? SparkR tests, manually loading S4Vector then SparkR package Author: felixcheung Closes #12621 from felixcheung/rdataframe. --- R/pkg/NAMESPACE | 2 +- R/pkg/R/DataFrame.R | 653 +++++++++++++++--------------- R/pkg/R/RDD.R | 4 +- R/pkg/R/SQLContext.R | 70 ++-- R/pkg/R/column.R | 6 +- R/pkg/R/deserialize.R | 2 +- R/pkg/R/functions.R | 2 +- R/pkg/R/generics.R | 2 +- R/pkg/R/group.R | 14 +- R/pkg/R/mllib.R | 38 +- R/pkg/R/schema.R | 6 +- R/pkg/R/stats.R | 38 +- R/pkg/R/utils.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 102 ++--- 14 files changed, 473 insertions(+), 468 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index b3aff10b7d..0f92b5e597 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -27,7 +27,7 @@ export("setJobGroup", # Export Utility methods export("setLogLevel") -exportClasses("DataFrame") +exportClasses("SparkDataFrame") exportMethods("arrange", "as.data.frame", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 95e2eb2be0..69feec735c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -15,21 +15,21 @@ # limitations under the License. # -# DataFrame.R - DataFrame class and methods implemented in S4 OO classes +# DataFrame.R - SparkDataFrame class and methods implemented in S4 OO classes #' @include generics.R jobj.R schema.R RDD.R pairRDD.R column.R group.R NULL setOldClass("jobj") -#' @title S4 class that represents a DataFrame +#' @title S4 class that represents a SparkDataFrame #' @description DataFrames can be created using functions like \link{createDataFrame}, #' \link{read.json}, \link{table} etc. 
-#' @family DataFrame functions -#' @rdname DataFrame +#' @family SparkDataFrame functions +#' @rdname SparkDataFrame #' @docType class #' -#' @slot env An R environment that stores bookkeeping states of the DataFrame +#' @slot env An R environment that stores bookkeeping states of the SparkDataFrame #' @slot sdf A Java object reference to the backing Scala DataFrame #' @seealso \link{createDataFrame}, \link{read.json}, \link{table} #' @seealso \url{https://spark.apache.org/docs/latest/sparkr.html#sparkr-dataframes} @@ -40,11 +40,11 @@ setOldClass("jobj") #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' df <- createDataFrame(sqlContext, faithful) #'} -setClass("DataFrame", +setClass("SparkDataFrame", slots = list(env = "environment", sdf = "jobj")) -setMethod("initialize", "DataFrame", function(.Object, sdf, isCached) { +setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { .Object@env <- new.env() .Object@env$isCached <- isCached @@ -52,23 +52,23 @@ setMethod("initialize", "DataFrame", function(.Object, sdf, isCached) { .Object }) -#' @rdname DataFrame +#' @rdname SparkDataFrame #' @export #' @param sdf A Java object reference to the backing Scala DataFrame -#' @param isCached TRUE if the dataFrame is cached +#' @param isCached TRUE if the SparkDataFrame is cached dataFrame <- function(sdf, isCached = FALSE) { - new("DataFrame", sdf, isCached) + new("SparkDataFrame", sdf, isCached) } -############################ DataFrame Methods ############################################## +############################ SparkDataFrame Methods ############################################## -#' Print Schema of a DataFrame +#' Print Schema of a SparkDataFrame #' #' Prints out the schema in tree format #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname printSchema #' @name printSchema #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' printSchema(df) #'} setMethod("printSchema", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { schemaString <- callJMethod(schema(x)$jobj, "treeString") cat(schemaString) @@ -89,11 +89,11 @@ setMethod("printSchema", #' Get schema object #' -#' Returns the schema of this DataFrame as a structType object. +#' Returns the schema of this SparkDataFrame as a structType object. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname schema #' @name schema #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' dfSchema <- schema(df) #'} setMethod("schema", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { structType(callJMethod(x@sdf, "schema")) }) @@ -115,9 +115,9 @@ setMethod("schema", #' #' Print the logical and physical Catalyst plans to the console for debugging. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param extended Logical. If extended is False, explain() only prints the physical plan.
-#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname explain #' @name explain #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' explain(df, TRUE) #'} setMethod("explain", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, extended = FALSE) { queryExec <- callJMethod(x@sdf, "queryExecution") if (extended) { @@ -146,9 +146,9 @@ setMethod("explain", #' Returns True if the `collect` and `take` methods can be run locally #' (without any Spark executors). #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname isLocal #' @name isLocal #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' isLocal(df) #'} setMethod("isLocal", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { callJMethod(x@sdf, "isLocal") }) #' showDF #' -#' Print the first numRows rows of a DataFrame +#' Print the first numRows rows of a SparkDataFrame #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname showDF #' @name showDF #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' showDF(df) #'} setMethod("showDF", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, numRows = 20, truncate = TRUE) { s <- callJMethod(x@sdf, "showString", numToInt(numRows), truncate) cat(s) @@ -194,11 +194,11 @@ setMethod("showDF", #' show #' -#' Print the DataFrame column names and types +#' Print the SparkDataFrame column names and types #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname show #' @name show #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' df #'} -setMethod("show", "DataFrame", +setMethod("show", "SparkDataFrame", function(object) { cols <- lapply(dtypes(object), function(l) { paste(l, collapse = ":") }) s <- paste(cols, collapse = ", ") - cat(paste("DataFrame[", s, "]\n", sep = "")) + cat(paste(class(object), "[", s, "]\n", sep = "")) }) #' DataTypes #' #' Return all column names and their data types as a list #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname dtypes #' @name dtypes #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) #' path <- "path/to/file.json" #' df <- read.json(sqlContext, path) #' dtypes(df) #'} setMethod("dtypes", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { lapply(schema(x)$fields(), function(f) { c(f$name(), f$dataType.simpleString()) @@ -249,9 +249,9 @@ setMethod("dtypes", #' #' Return all column names as a list #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname columns #' @name columns @@ -266,7 +266,7 @@ setMethod("dtypes", #' colnames(df) #'} setMethod("columns", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { sapply(schema(x)$fields(), function(f) { f$name() @@ -276,7 +276,7 @@ setMethod("columns", #' @rdname columns #' @name names setMethod("names", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { columns(x) }) @@ -284,7 +284,7 @@ setMethod("names", #' @rdname columns #' @name names<- setMethod("names<-", - signature(x = "DataFrame"), +
signature(x = "SparkDataFrame"), function(x, value) { if (!is.null(value)) { sdf <- callJMethod(x@sdf, "toDF", as.list(value)) @@ -295,7 +295,7 @@ setMethod("names<-", #' @rdname columns #' @name colnames setMethod("colnames", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { columns(x) }) @@ -303,7 +303,7 @@ setMethod("colnames", #' @rdname columns #' @name colnames<- setMethod("colnames<-", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, value) { # Check parameter integrity @@ -331,13 +331,13 @@ setMethod("colnames<-", #' coltypes #' -#' Get column types of a DataFrame +#' Get column types of a SparkDataFrame #' -#' @param x A SparkSQL DataFrame -#' @return value A character vector with the column types of the given DataFrame +#' @param x A SparkDataFrame +#' @return value A character vector with the column types of the given SparkDataFrame #' @rdname coltypes #' @name coltypes -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @export #' @examples #'\dontrun{ @@ -345,9 +345,9 @@ setMethod("colnames<-", #' coltypes(irisDF) #'} setMethod("coltypes", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { - # Get the data types of the DataFrame by invoking dtypes() function + # Get the data types of the SparkDataFrame by invoking dtypes() function types <- sapply(dtypes(x), function(x) {x[[2]]}) # Map Spark data types into R's data types using DATA_TYPES environment @@ -382,11 +382,11 @@ setMethod("coltypes", #' coltypes #' -#' Set the column types of a DataFrame. +#' Set the column types of a SparkDataFrame. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param value A character vector with the target column types for the given -#' DataFrame. Column types can be one of integer, numeric/double, character, logical, or NA +#' SparkDataFrame. Column types can be one of integer, numeric/double, character, logical, or NA #' to keep that column as-is. 
#' @rdname coltypes #' @name coltypes<- @@ -401,15 +401,15 @@ setMethod("coltypes", #' coltypes(df) <- c(NA, "numeric") #'} setMethod("coltypes<-", - signature(x = "DataFrame", value = "character"), + signature(x = "SparkDataFrame", value = "character"), function(x, value) { cols <- columns(x) ncols <- length(cols) if (length(value) == 0) { - stop("Cannot set types of an empty DataFrame with no Column") + stop("Cannot set types of an empty SparkDataFrame with no Column") } if (length(value) != ncols) { - stop("Length of type vector should match the number of columns for DataFrame") + stop("Length of type vector should match the number of columns for SparkDataFrame") } newCols <- lapply(seq_len(ncols), function(i) { col <- getColumn(x, cols[i]) @@ -429,12 +429,12 @@ setMethod("coltypes<-", #' Register Temporary Table #' -#' Registers a DataFrame as a Temporary Table in the SQLContext +#' Registers a SparkDataFrame as a Temporary Table in the SQLContext #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname registerTempTable #' @name registerTempTable #' @export @@ -448,21 +448,21 @@ setMethod("coltypes<-", #' new_df <- sql(sqlContext, "SELECT * FROM json_df") #'} setMethod("registerTempTable", - signature(x = "DataFrame", tableName = "character"), + signature(x = "SparkDataFrame", tableName = "character"), function(x, tableName) { invisible(callJMethod(x@sdf, "registerTempTable", tableName)) }) #' insertInto #' -#' Insert the contents of a DataFrame into a table registered in the current SQL Context. +#' Insert the contents of a SparkDataFrame into a table registered in the current SQL Context. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param tableName A character vector containing the name of the table #' @param overwrite A logical argument indicating whether or not to overwrite #' the existing rows in the table. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname insertInto #' @name insertInto #' @export @@ -476,7 +476,7 @@ setMethod("registerTempTable", #' insertInto(df2, "table1", overwrite = TRUE) #'} setMethod("insertInto", - signature(x = "DataFrame", tableName = "character"), + signature(x = "SparkDataFrame", tableName = "character"), function(x, tableName, overwrite = FALSE) { jmode <- convertToJSaveMode(ifelse(overwrite, "overwrite", "append")) write <- callJMethod(x@sdf, "write") @@ -488,9 +488,9 @@ setMethod("insertInto", #' #' Persist with the default storage level (MEMORY_ONLY). #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname cache #' @name cache #' @export @@ -503,7 +503,7 @@ setMethod("insertInto", #' cache(df) #'} setMethod("cache", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { cached <- callJMethod(x@sdf, "cache") x@env$isCached <- TRUE @@ -512,13 +512,13 @@ setMethod("cache", #' Persist #' -#' Persist this DataFrame with the specified storage level. For details of the +#' Persist this SparkDataFrame with the specified storage level. For details of the #' supported storage levels, refer to #' \url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}. 
#' -#' @param x The DataFrame to persist +#' @param x The SparkDataFrame to persist #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname persist #' @name persist #' @export @@ -531,7 +531,7 @@ setMethod("cache", #' persist(df, "MEMORY_AND_DISK") #'} setMethod("persist", - signature(x = "DataFrame", newLevel = "character"), + signature(x = "SparkDataFrame", newLevel = "character"), function(x, newLevel) { callJMethod(x@sdf, "persist", getStorageLevel(newLevel)) x@env$isCached <- TRUE @@ -540,13 +540,13 @@ setMethod("persist", #' Unpersist #' -#' Mark this DataFrame as non-persistent, and remove all blocks for it from memory and +#' Mark this SparkDataFrame as non-persistent, and remove all blocks for it from memory and #' disk. #' -#' @param x The DataFrame to unpersist +#' @param x The SparkDataFrame to unpersist #' @param blocking Whether to block until all blocks are deleted #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname unpersist-methods #' @name unpersist #' @export @@ -560,7 +560,7 @@ setMethod("persist", #' unpersist(df) #'} setMethod("unpersist", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, blocking = TRUE) { callJMethod(x@sdf, "unpersist", blocking) x@env$isCached <- FALSE @@ -569,12 +569,12 @@ setMethod("unpersist", #' Repartition #' -#' Return a new DataFrame that has exactly numPartitions partitions. +#' Return a new SparkDataFrame that has exactly numPartitions partitions. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param numPartitions The number of partitions to use. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname repartition #' @name repartition #' @export @@ -587,7 +587,7 @@ setMethod("unpersist", #' newDF <- repartition(df, 2L) #'} setMethod("repartition", - signature(x = "DataFrame", numPartitions = "numeric"), + signature(x = "SparkDataFrame", numPartitions = "numeric"), function(x, numPartitions) { sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions)) dataFrame(sdf) @@ -595,12 +595,12 @@ setMethod("repartition", #' toJSON #' -#' Convert the rows of a DataFrame into JSON objects and return an RDD where +#' Convert the rows of a SparkDataFrame into JSON objects and return an RDD where #' each element contains a JSON string. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname tojson #' @noRd #' @examples @@ -612,7 +612,7 @@ setMethod("repartition", #' newRDD <- toJSON(df) #'} setMethod("toJSON", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { rdd <- callJMethod(x@sdf, "toJSON") jrdd <- callJMethod(rdd, "toJavaRDD") @@ -621,13 +621,13 @@ setMethod("toJSON", #' write.json #' -#' Save the contents of a DataFrame as a JSON file (one object per line). Files written out -#' with this method can be read back in as a DataFrame using read.json(). +#' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out +#' with this method can be read back in as a SparkDataFrame using read.json(). 
#' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param path The directory where the file is saved #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname write.json #' @name write.json #' @export @@ -640,7 +640,7 @@ setMethod("toJSON", #' write.json(df, "/tmp/sparkr-tmp/") #'} setMethod("write.json", - signature(x = "DataFrame", path = "character"), + signature(x = "SparkDataFrame", path = "character"), function(x, path) { write <- callJMethod(x@sdf, "write") invisible(callJMethod(write, "json", path)) @@ -648,13 +648,13 @@ setMethod("write.json", #' write.parquet #' -#' Save the contents of a DataFrame as a Parquet file, preserving the schema. Files written out -#' with this method can be read back in as a DataFrame using read.parquet(). +#' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out +#' with this method can be read back in as a SparkDataFrame using read.parquet(). #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param path The directory where the file is saved #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname write.parquet #' @name write.parquet #' @export @@ -668,7 +668,7 @@ setMethod("write.json", #' saveAsParquetFile(df, "/tmp/sparkr-tmp2/") #'} setMethod("write.parquet", - signature(x = "DataFrame", path = "character"), + signature(x = "SparkDataFrame", path = "character"), function(x, path) { write <- callJMethod(x@sdf, "write") invisible(callJMethod(write, "parquet", path)) @@ -678,7 +678,7 @@ setMethod("write.parquet", #' @name saveAsParquetFile #' @export setMethod("saveAsParquetFile", - signature(x = "DataFrame", path = "character"), + signature(x = "SparkDataFrame", path = "character"), function(x, path) { .Deprecated("write.parquet") write.parquet(x, path) @@ -686,14 +686,14 @@ setMethod("saveAsParquetFile", #' write.text #' -#' Saves the content of the DataFrame in a text file at the specified path. -#' The DataFrame must have only one column of string type with the name "value". +#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param path The directory where the file is saved #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname write.text #' @name write.text #' @export @@ -706,7 +706,7 @@ setMethod("saveAsParquetFile", #' write.text(df, "/tmp/sparkr-tmp/") #'} setMethod("write.text", - signature(x = "DataFrame", path = "character"), + signature(x = "SparkDataFrame", path = "character"), function(x, path) { write <- callJMethod(x@sdf, "write") invisible(callJMethod(write, "text", path)) @@ -714,11 +714,11 @@ setMethod("write.text", #' Distinct #' -#' Return a new DataFrame containing the distinct rows in this DataFrame. +#' Return a new SparkDataFrame containing the distinct rows in this SparkDataFrame. 
#' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname distinct #' @name distinct #' @export @@ -731,7 +731,7 @@ setMethod("write.text", #' distinctDF <- distinct(df) #'} setMethod("distinct", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { sdf <- callJMethod(x@sdf, "distinct") dataFrame(sdf) @@ -740,21 +740,21 @@ setMethod("distinct", #' @rdname distinct #' @name unique setMethod("unique", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { distinct(x) }) #' Sample #' -#' Return a sampled subset of this DataFrame using a random seed. +#' Return a sampled subset of this SparkDataFrame using a random seed. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param withReplacement Sampling with replacement or not #' @param fraction The (rough) sample target fraction #' @param seed Randomness seed value #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname sample #' @name sample #' @export @@ -768,7 +768,7 @@ setMethod("unique", #' collect(sample(df, TRUE, 0.5)) #'} setMethod("sample", - signature(x = "DataFrame", withReplacement = "logical", + signature(x = "SparkDataFrame", withReplacement = "logical", fraction = "numeric"), function(x, withReplacement, fraction, seed) { if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) @@ -785,7 +785,7 @@ setMethod("sample", #' @rdname sample #' @name sample_frac setMethod("sample_frac", - signature(x = "DataFrame", withReplacement = "logical", + signature(x = "SparkDataFrame", withReplacement = "logical", fraction = "numeric"), function(x, withReplacement, fraction, seed) { sample(x, withReplacement, fraction, seed) @@ -793,11 +793,11 @@ setMethod("sample_frac", #' nrow #' -#' Returns the number of rows in a DataFrame +#' Returns the number of rows in a SparkDataFrame #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname nrow #' @name count #' @export @@ -810,7 +810,7 @@ setMethod("sample_frac", #' count(df) #' } setMethod("count", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { callJMethod(x@sdf, "count") }) @@ -818,16 +818,16 @@ setMethod("count", #' @name nrow #' @rdname nrow setMethod("nrow", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { count(x) }) -#' Returns the number of columns in a DataFrame +#' Returns the number of columns in a SparkDataFrame #' -#' @param x a SparkSQL DataFrame +#' @param x a SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname ncol #' @name ncol #' @export @@ -840,15 +840,15 @@ setMethod("nrow", #' ncol(df) #' } setMethod("ncol", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { length(columns(x)) }) -#' Returns the dimentions (number of rows and columns) of a DataFrame -#' @param x a SparkSQL DataFrame +#' Returns the dimentions (number of rows and columns) of a SparkDataFrame +#' @param x a SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname dim #' @name dim #' @export @@ -861,18 +861,18 @@ setMethod("ncol", #' dim(df) #' } setMethod("dim", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { c(count(x), ncol(x)) }) -#' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame. 
+#' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param stringsAsFactors (Optional) A logical indicating whether or not string columns #' should be converted to factors. FALSE by default. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname collect #' @name collect #' @export @@ -886,7 +886,7 @@ setMethod("dim", #' firstName <- collected[[1]]$name #' } setMethod("collect", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, stringsAsFactors = FALSE) { dtypes <- dtypes(x) ncol <- length(dtypes) @@ -938,13 +938,13 @@ setMethod("collect", #' Limit #' -#' Limit the resulting DataFrame to the number of rows specified. +#' Limit the resulting SparkDataFrame to the number of rows specified. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param num The number of rows to return -#' @return A new DataFrame containing the number of rows specified. +#' @return A new SparkDataFrame containing the number of rows specified. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname limit #' @name limit #' @export @@ -957,15 +957,15 @@ setMethod("collect", #' limitedDF <- limit(df, 10) #' } setMethod("limit", - signature(x = "DataFrame", num = "numeric"), + signature(x = "SparkDataFrame", num = "numeric"), function(x, num) { res <- callJMethod(x@sdf, "limit", as.integer(num)) dataFrame(res) }) -#' Take the first NUM rows of a DataFrame and return a the results as a data.frame +#' Take the first NUM rows of a SparkDataFrame and return a the results as a data.frame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname take #' @name take #' @export @@ -978,7 +978,7 @@ setMethod("limit", #' take(df, 2) #' } setMethod("take", - signature(x = "DataFrame", num = "numeric"), + signature(x = "SparkDataFrame", num = "numeric"), function(x, num) { limited <- limit(x, num) collect(limited) @@ -986,15 +986,15 @@ setMethod("take", #' Head #' -#' Return the first NUM rows of a DataFrame as a data.frame. If NUM is NULL, +#' Return the first NUM rows of a SparkDataFrame as a data.frame. If NUM is NULL, #' then head() returns the first 6 rows in keeping with the current data.frame #' convention in R. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param num The number of rows to return. Default is 6. #' @return A data.frame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname head #' @name head #' @export @@ -1007,17 +1007,17 @@ setMethod("take", #' head(df) #' } setMethod("head", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, num = 6L) { # Default num is 6L in keeping with R's data.frame convention take(x, num) }) -#' Return the first row of a DataFrame +#' Return the first row of a SparkDataFrame #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname first #' @name first #' @export @@ -1030,16 +1030,16 @@ setMethod("head", #' first(df) #' } setMethod("first", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { take(x, 1) }) #' toRDD #' -#' Converts a Spark DataFrame to an RDD while preserving column names. +#' Converts a SparkDataFrame to an RDD while preserving column names. 
#' -#' @param x A Spark DataFrame +#' @param x A SparkDataFrame #' #' @noRd #' @examples @@ -1051,7 +1051,7 @@ setMethod("first", #' rdd <- toRDD(df) #'} setMethod("toRDD", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { jrdd <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToRowRDD", x@sdf) colNames <- callJMethod(x@sdf, "columns") @@ -1064,12 +1064,12 @@ setMethod("toRDD", #' GroupBy #' -#' Groups the DataFrame using the specified columns, so we can run aggregation on them. +#' Groups the SparkDataFrame using the specified columns, so we can run aggregation on them. #' -#' @param x a DataFrame +#' @param x a SparkDataFrame #' @return a GroupedData #' @seealso GroupedData -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname groupBy #' @name groupBy #' @export @@ -1082,7 +1082,7 @@ setMethod("toRDD", #' agg(groupBy(df, "department", "gender"), salary="avg", "age" -> "max") #' } setMethod("groupBy", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, ...) { cols <- list(...) if (length(cols) >= 1 && class(cols[[1]]) == "character") { @@ -1097,7 +1097,7 @@ setMethod("groupBy", #' @rdname groupBy #' @name group_by setMethod("group_by", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, ...) { groupBy(x, ...) }) @@ -1106,13 +1106,13 @@ setMethod("group_by", #' #' Compute aggregates by specifying a list of columns #' -#' @param x a DataFrame -#' @family DataFrame functions +#' @param x a SparkDataFrame +#' @family SparkDataFrame functions #' @rdname agg #' @name agg #' @export setMethod("agg", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, ...) { agg(groupBy(x), ...) }) @@ -1120,7 +1120,7 @@ setMethod("agg", #' @rdname agg #' @name summarize setMethod("summarize", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, ...) { agg(x, ...) 
}) @@ -1135,7 +1135,7 @@ setMethod("summarize", #' @rdname lapply #' @noRd setMethod("lapply", - signature(X = "DataFrame", FUN = "function"), + signature(X = "SparkDataFrame", FUN = "function"), function(X, FUN) { rdd <- toRDD(X) lapply(rdd, FUN) @@ -1144,7 +1144,7 @@ setMethod("lapply", #' @rdname lapply #' @noRd setMethod("map", - signature(X = "DataFrame", FUN = "function"), + signature(X = "SparkDataFrame", FUN = "function"), function(X, FUN) { lapply(X, FUN) }) @@ -1152,7 +1152,7 @@ setMethod("map", #' @rdname flatMap #' @noRd setMethod("flatMap", - signature(X = "DataFrame", FUN = "function"), + signature(X = "SparkDataFrame", FUN = "function"), function(X, FUN) { rdd <- toRDD(X) flatMap(rdd, FUN) @@ -1161,7 +1161,7 @@ setMethod("flatMap", #' @rdname lapplyPartition #' @noRd setMethod("lapplyPartition", - signature(X = "DataFrame", FUN = "function"), + signature(X = "SparkDataFrame", FUN = "function"), function(X, FUN) { rdd <- toRDD(X) lapplyPartition(rdd, FUN) @@ -1170,7 +1170,7 @@ setMethod("lapplyPartition", #' @rdname lapplyPartition #' @noRd setMethod("mapPartitions", - signature(X = "DataFrame", FUN = "function"), + signature(X = "SparkDataFrame", FUN = "function"), function(X, FUN) { lapplyPartition(X, FUN) }) @@ -1178,7 +1178,7 @@ setMethod("mapPartitions", #' @rdname foreach #' @noRd setMethod("foreach", - signature(x = "DataFrame", func = "function"), + signature(x = "SparkDataFrame", func = "function"), function(x, func) { rdd <- toRDD(x) foreach(rdd, func) @@ -1187,7 +1187,7 @@ setMethod("foreach", #' @rdname foreach #' @noRd setMethod("foreachPartition", - signature(x = "DataFrame", func = "function"), + signature(x = "SparkDataFrame", func = "function"), function(x, func) { rdd <- toRDD(x) foreachPartition(rdd, func) @@ -1202,14 +1202,14 @@ getColumn <- function(x, c) { #' @rdname select #' @name $ -setMethod("$", signature(x = "DataFrame"), +setMethod("$", signature(x = "SparkDataFrame"), function(x, name) { getColumn(x, name) }) #' @rdname select #' @name $<- -setMethod("$<-", signature(x = "DataFrame"), +setMethod("$<-", signature(x = "SparkDataFrame"), function(x, name, value) { stopifnot(class(value) == "Column" || is.null(value)) @@ -1226,7 +1226,7 @@ setClassUnion("numericOrcharacter", c("numeric", "character")) #' @rdname subset #' @name [[ -setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), +setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"), function(x, i) { if (is.numeric(i)) { cols <- columns(x) @@ -1237,7 +1237,7 @@ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), #' @rdname subset #' @name [ -setMethod("[", signature(x = "DataFrame", i = "missing"), +setMethod("[", signature(x = "SparkDataFrame", i = "missing"), function(x, i, j, ...) { if (is.numeric(j)) { cols <- columns(x) @@ -1251,7 +1251,7 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' @rdname subset #' @name [ -setMethod("[", signature(x = "DataFrame", i = "Column"), +setMethod("[", signature(x = "SparkDataFrame", i = "Column"), function(x, i, j, ...) 
{ # It could handle i as "character" but it seems confusing and not required # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html @@ -1265,13 +1265,15 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' Subset #' -#' Return subsets of DataFrame according to given conditions -#' @param x A DataFrame +#' Return subsets of SparkDataFrame according to given conditions +#' @param x A SparkDataFrame #' @param subset (Optional) A logical expression to filter on rows -#' @param select expression for the single Column or a list of columns to select from the DataFrame -#' @return A new DataFrame containing only the rows that meet the condition with selected columns +#' @param select expression for the single Column or a list of columns to select from the +#' SparkDataFrame +#' @return A new SparkDataFrame containing only the rows that meet the condition with selected +#' columns #' @export -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname subset #' @name subset #' @family subsetting functions @@ -1283,14 +1285,14 @@ setMethod("[", signature(x = "DataFrame", i = "Column"), #' df[,c("name", "age")] #' # Or to filter rows #' df[df$age > 20,] -#' # DataFrame can be subset on both rows and Columns +#' # SparkDataFrame can be subset on both rows and Columns #' df[df$name == "Smith", c(1,2)] #' df[df$age %in% c(19, 30), 1:2] #' subset(df, df$age %in% c(19, 30), 1:2) #' subset(df, df$age %in% c(19), select = c(1,2)) #' subset(df, select = c(1,2)) #' } -setMethod("subset", signature(x = "DataFrame"), +setMethod("subset", signature(x = "SparkDataFrame"), function(x, subset, select, ...) { if (missing(subset)) { x[, select, ...] @@ -1302,11 +1304,11 @@ setMethod("subset", signature(x = "DataFrame"), #' Select #' #' Selects a set of columns with names or Column expressions. -#' @param x A DataFrame +#' @param x A SparkDataFrame #' @param col A list of columns or single Column or name -#' @return A new DataFrame with selected columns +#' @return A new SparkDataFrame with selected columns #' @export -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname select #' @name select #' @family subsetting functions @@ -1320,7 +1322,7 @@ setMethod("subset", signature(x = "DataFrame"), #' # Similar to R data frames columns can also be selected using `$` #' df[,df$age] #' } -setMethod("select", signature(x = "DataFrame", col = "character"), +setMethod("select", signature(x = "SparkDataFrame", col = "character"), function(x, col, ...) { if (length(col) > 1) { if (length(list(...)) > 0) { @@ -1334,10 +1336,10 @@ setMethod("select", signature(x = "DataFrame", col = "character"), } }) -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname select #' @export -setMethod("select", signature(x = "DataFrame", col = "Column"), +setMethod("select", signature(x = "SparkDataFrame", col = "Column"), function(x, col, ...) { jcols <- lapply(list(col, ...), function(c) { c@jc @@ -1346,11 +1348,11 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), dataFrame(sdf) }) -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname select #' @export setMethod("select", - signature(x = "DataFrame", col = "list"), + signature(x = "SparkDataFrame", col = "list"), function(x, col) { cols <- lapply(col, function(c) { if (class(c) == "Column") { @@ -1365,13 +1367,13 @@ setMethod("select", #' SelectExpr #' -#' Select from a DataFrame using a set of SQL expressions. 
+#' Select from a SparkDataFrame using a set of SQL expressions. #' -#' @param x A DataFrame to be selected from. +#' @param x A SparkDataFrame to be selected from. #' @param expr A string containing a SQL expression #' @param ... Additional expressions -#' @return A DataFrame -#' @family DataFrame functions +#' @return A SparkDataFrame +#' @family SparkDataFrame functions #' @rdname selectExpr #' @name selectExpr #' @export @@ -1384,7 +1386,7 @@ setMethod("select", #' selectExpr(df, "col1", "(col2 * 5) as newCol") #' } setMethod("selectExpr", - signature(x = "DataFrame", expr = "character"), + signature(x = "SparkDataFrame", expr = "character"), function(x, expr, ...) { exprList <- list(expr, ...) sdf <- callJMethod(x@sdf, "selectExpr", exprList) @@ -1393,14 +1395,14 @@ setMethod("selectExpr", #' WithColumn #' -#' Return a new DataFrame by adding a column or replacing the existing column +#' Return a new SparkDataFrame by adding a column or replacing the existing column #' that has the same name. #' -#' @param x A DataFrame +#' @param x A SparkDataFrame #' @param colName A column name. #' @param col A Column expression. -#' @return A DataFrame with the new column added or the existing column replaced. -#' @family DataFrame functions +#' @return A SparkDataFrame with the new column added or the existing column replaced. +#' @family SparkDataFrame functions #' @rdname withColumn #' @name withColumn #' @seealso \link{rename} \link{mutate} @@ -1416,7 +1418,7 @@ setMethod("selectExpr", #' newDF2 <- withColumn(newDF, "newCol", newDF$col1) #' } setMethod("withColumn", - signature(x = "DataFrame", colName = "character", col = "Column"), + signature(x = "SparkDataFrame", colName = "character", col = "Column"), function(x, colName, col) { sdf <- callJMethod(x@sdf, "withColumn", colName, col@jc) dataFrame(sdf) @@ -1424,12 +1426,12 @@ setMethod("withColumn", #' Mutate #' -#' Return a new DataFrame with the specified columns added. +#' Return a new SparkDataFrame with the specified columns added. #' -#' @param .data A DataFrame +#' @param .data A SparkDataFrame #' @param col a named argument of the form name = col -#' @return A new DataFrame with the new columns added. -#' @family DataFrame functions +#' @return A new SparkDataFrame with the new columns added. +#' @family SparkDataFrame functions #' @rdname mutate #' @name mutate #' @seealso \link{rename} \link{withColumn} @@ -1445,7 +1447,7 @@ setMethod("withColumn", #' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2) #' } setMethod("mutate", - signature(.data = "DataFrame"), + signature(.data = "SparkDataFrame"), function(.data, ...) { x <- .data cols <- list(...) @@ -1466,20 +1468,20 @@ setMethod("mutate", #' @rdname mutate #' @name transform setMethod("transform", - signature(`_data` = "DataFrame"), + signature(`_data` = "SparkDataFrame"), function(`_data`, ...) { mutate(`_data`, ...) }) #' rename #' -#' Rename an existing column in a DataFrame. +#' Rename an existing column in a SparkDataFrame. #' -#' @param x A DataFrame +#' @param x A SparkDataFrame #' @param existingCol The name of the column you want to change. #' @param newCol The new column name. -#' @return A DataFrame with the column name changed. -#' @family DataFrame functions +#' @return A SparkDataFrame with the column name changed. 
+#' @family SparkDataFrame functions #' @rdname rename #' @name withColumnRenamed #' @seealso \link{mutate} @@ -1493,7 +1495,7 @@ setMethod("transform", #' newDF <- withColumnRenamed(df, "col1", "newCol1") #' } setMethod("withColumnRenamed", - signature(x = "DataFrame", existingCol = "character", newCol = "character"), + signature(x = "SparkDataFrame", existingCol = "character", newCol = "character"), function(x, existingCol, newCol) { cols <- lapply(columns(x), function(c) { if (c == existingCol) { @@ -1518,7 +1520,7 @@ setMethod("withColumnRenamed", #' newDF <- rename(df, col1 = df$newCol1) #' } setMethod("rename", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, ...) { renameCols <- list(...) stopifnot(length(renameCols) > 0) @@ -1541,15 +1543,15 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' Arrange #' -#' Sort a DataFrame by the specified column(s). +#' Sort a SparkDataFrame by the specified column(s). #' -#' @param x A DataFrame to be sorted. +#' @param x A SparkDataFrame to be sorted. #' @param col A character or Column object vector indicating the fields to sort on #' @param ... Additional sorting fields #' @param decreasing A logical argument indicating sorting order for columns when #' a character vector is specified for col -#' @return A DataFrame where all elements are sorted. -#' @family DataFrame functions +#' @return A SparkDataFrame where all elements are sorted. +#' @family SparkDataFrame functions #' @rdname arrange #' @name arrange #' @export @@ -1565,7 +1567,7 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' arrange(df, "col1", "col2", decreasing = c(TRUE, FALSE)) #' } setMethod("arrange", - signature(x = "DataFrame", col = "Column"), + signature(x = "SparkDataFrame", col = "Column"), function(x, col, ...) { jcols <- lapply(list(col, ...), function(c) { c@jc @@ -1579,7 +1581,7 @@ setMethod("arrange", #' @name arrange #' @export setMethod("arrange", - signature(x = "DataFrame", col = "character"), + signature(x = "SparkDataFrame", col = "character"), function(x, col, ..., decreasing = FALSE) { # all sorting columns @@ -1611,20 +1613,20 @@ setMethod("arrange", #' @name orderBy #' @export setMethod("orderBy", - signature(x = "DataFrame", col = "characterOrColumn"), + signature(x = "SparkDataFrame", col = "characterOrColumn"), function(x, col) { arrange(x, col) }) #' Filter #' -#' Filter the rows of a DataFrame according to a given condition. +#' Filter the rows of a SparkDataFrame according to a given condition. #' -#' @param x A DataFrame to be sorted. +#' @param x A SparkDataFrame to be sorted. #' @param condition The condition to filter on. This may either be a Column expression #' or a string containing a SQL statement -#' @return A DataFrame containing only the rows that meet the condition. -#' @family DataFrame functions +#' @return A SparkDataFrame containing only the rows that meet the condition. 
+#' @family SparkDataFrame functions #' @rdname filter #' @name filter #' @family subsetting functions @@ -1639,7 +1641,7 @@ setMethod("orderBy", #' filter(df, df$col2 != "abcdefg") #' } setMethod("filter", - signature(x = "DataFrame", condition = "characterOrColumn"), + signature(x = "SparkDataFrame", condition = "characterOrColumn"), function(x, condition) { if (class(condition) == "Column") { condition <- condition@jc @@ -1648,24 +1650,24 @@ setMethod("filter", dataFrame(sdf) }) -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname filter #' @name where setMethod("where", - signature(x = "DataFrame", condition = "characterOrColumn"), + signature(x = "SparkDataFrame", condition = "characterOrColumn"), function(x, condition) { filter(x, condition) }) #' dropDuplicates #' -#' Returns a new DataFrame with duplicate rows removed, considering only +#' Returns a new SparkDataFrame with duplicate rows removed, considering only #' the subset of columns. #' -#' @param x A DataFrame. +#' @param x A SparkDataFrame. #' @param colnames A character vector of column names. -#' @return A DataFrame with duplicate rows removed. -#' @family DataFrame functions +#' @return A SparkDataFrame with duplicate rows removed. +#' @family SparkDataFrame functions #' @rdname dropduplicates #' @name dropDuplicates #' @export @@ -1679,7 +1681,7 @@ setMethod("where", #' dropDuplicates(df, c("col1", "col2")) #' } setMethod("dropDuplicates", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, colNames = columns(x)) { stopifnot(class(colNames) == "character") @@ -1689,17 +1691,17 @@ setMethod("dropDuplicates", #' Join #' -#' Join two DataFrames based on the given join expression. +#' Join two SparkDataFrames based on the given join expression. #' -#' @param x A Spark DataFrame -#' @param y A Spark DataFrame +#' @param x A SparkDataFrame +#' @param y A SparkDataFrame #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a #' Column expression. If joinExpr is omitted, join() will perform a Cartesian join #' @param joinType The type of join to perform. The following join types are available: #' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left', #' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". -#' @return A DataFrame containing the result of the join operation. -#' @family DataFrame functions +#' @return A SparkDataFrame containing the result of the join operation. +#' @family SparkDataFrame functions #' @rdname join #' @name join #' @seealso \link{merge} @@ -1715,7 +1717,7 @@ setMethod("dropDuplicates", #' join(df1, df2, df1$col1 == df2$col2, "right_outer") #' } setMethod("join", - signature(x = "DataFrame", y = "DataFrame"), + signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y, joinExpr = NULL, joinType = NULL) { if (is.null(joinExpr)) { sdf <- callJMethod(x@sdf, "join", y@sdf) @@ -1757,7 +1759,7 @@ setMethod("join", #' be returned. If all.x is set to FALSE and all.y is set to TRUE, a right #' outer join will be returned. If all.x and all.y are set to TRUE, a full #' outer join will be returned. 
-#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname merge #' @seealso \link{join} #' @export @@ -1776,7 +1778,7 @@ setMethod("join", #' merge(df1, df2, by = "col1", all = TRUE, suffixes = c("-X", "-Y")) #' } setMethod("merge", - signature(x = "DataFrame", y = "DataFrame"), + signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c("_x", "_y"), ... ) { @@ -1858,7 +1860,7 @@ setMethod("merge", #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. #' -#' @param x a DataFrame on which the +#' @param x a SparkDataFrame on which the #' @param intersectedColNames a list of intersected column names #' @param suffix a suffix for the column name #' @return list of columns @@ -1883,14 +1885,14 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' rbind #' -#' Return a new DataFrame containing the union of rows in this DataFrame -#' and another DataFrame. This is equivalent to `UNION ALL` in SQL. -#' Note that this does not remove duplicate rows across the two DataFrames. +#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame +#' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL. +#' Note that this does not remove duplicate rows across the two SparkDataFrames. #' -#' @param x A Spark DataFrame -#' @param y A Spark DataFrame -#' @return A DataFrame containing the result of the union. -#' @family DataFrame functions +#' @param x A SparkDataFrame +#' @param y A SparkDataFrame +#' @return A SparkDataFrame containing the result of the union. +#' @family SparkDataFrame functions #' @rdname rbind #' @name unionAll #' @export @@ -1903,20 +1905,20 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' unioned <- unionAll(df, df2) #' } setMethod("unionAll", - signature(x = "DataFrame", y = "DataFrame"), + signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { unioned <- callJMethod(x@sdf, "unionAll", y@sdf) dataFrame(unioned) }) -#' @title Union two or more DataFrames -#' @description Returns a new DataFrame containing rows of all parameters. +#' @title Union two or more SparkDataFrames +#' @description Returns a new SparkDataFrame containing rows of all parameters. #' #' @rdname rbind #' @name rbind #' @export setMethod("rbind", - signature(... = "DataFrame"), + signature(... = "SparkDataFrame"), function(x, ..., deparse.level = 1) { if (nargs() == 3) { unionAll(x, ...) @@ -1927,13 +1929,13 @@ setMethod("rbind", #' Intersect #' -#' Return a new DataFrame containing rows only in both this DataFrame -#' and another DataFrame. This is equivalent to `INTERSECT` in SQL. +#' Return a new SparkDataFrame containing rows only in both this SparkDataFrame +#' and another SparkDataFrame. This is equivalent to `INTERSECT` in SQL. #' -#' @param x A Spark DataFrame -#' @param y A Spark DataFrame -#' @return A DataFrame containing the result of the intersect. -#' @family DataFrame functions +#' @param x A SparkDataFrame +#' @param y A SparkDataFrame +#' @return A SparkDataFrame containing the result of the intersect. 
+#' @family SparkDataFrame functions #' @rdname intersect #' @name intersect #' @export @@ -1946,7 +1948,7 @@ setMethod("rbind", #' intersectDF <- intersect(df, df2) #' } setMethod("intersect", - signature(x = "DataFrame", y = "DataFrame"), + signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { intersected <- callJMethod(x@sdf, "intersect", y@sdf) dataFrame(intersected) @@ -1954,13 +1956,13 @@ setMethod("intersect", #' except #' -#' Return a new DataFrame containing rows in this DataFrame -#' but not in another DataFrame. This is equivalent to `EXCEPT` in SQL. +#' Return a new SparkDataFrame containing rows in this SparkDataFrame +#' but not in another SparkDataFrame. This is equivalent to `EXCEPT` in SQL. #' -#' @param x A Spark DataFrame -#' @param y A Spark DataFrame -#' @return A DataFrame containing the result of the except operation. -#' @family DataFrame functions +#' @param x A SparkDataFrame +#' @param y A SparkDataFrame +#' @return A SparkDataFrame containing the result of the except operation. +#' @family SparkDataFrame functions #' @rdname except #' @name except #' @export @@ -1975,13 +1977,13 @@ setMethod("intersect", #' @rdname except #' @export setMethod("except", - signature(x = "DataFrame", y = "DataFrame"), + signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { excepted <- callJMethod(x@sdf, "except", y@sdf) dataFrame(excepted) }) -#' Save the contents of the DataFrame to a data source +#' Save the contents of the SparkDataFrame to a data source #' #' The data source is specified by the `source` and a set of options (...). #' If `source` is not specified, the default data source configured by @@ -1989,18 +1991,19 @@ setMethod("except", #' #' Additionally, mode is used to specify the behavior of the save operation when #' data already exists in the data source. There are four modes: \cr -#' append: Contents of this DataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. \cr +#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr +#' overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. \cr #' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the DataFrame +#' ignore: The save operation is expected to not save the contents of the SparkDataFrame #' and to not change the existing data. 
\cr #' -#' @param df A SparkSQL DataFrame +#' @param df A SparkDataFrame #' @param path A name for the table #' @param source A name for external data source #' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default) #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname write.df #' @name write.df #' @export @@ -2014,7 +2017,7 @@ setMethod("except", #' saveDF(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = mergeSchema) #' } setMethod("write.df", - signature(df = "DataFrame", path = "character"), + signature(df = "SparkDataFrame", path = "character"), function(df, path, source = NULL, mode = "error", ...){ if (is.null(source)) { if (exists(".sparkRSQLsc", envir = .sparkREnv)) { @@ -2042,14 +2045,14 @@ setMethod("write.df", #' @name saveDF #' @export setMethod("saveDF", - signature(df = "DataFrame", path = "character"), + signature(df = "SparkDataFrame", path = "character"), function(df, path, source = NULL, mode = "error", ...){ write.df(df, path, source, mode, ...) }) #' saveAsTable #' -#' Save the contents of the DataFrame to a data source as a table +#' Save the contents of the SparkDataFrame to a data source as a table #' #' The data source is specified by the `source` and a set of options (...). #' If `source` is not specified, the default data source configured by @@ -2057,18 +2060,19 @@ setMethod("saveDF", #' #' Additionally, mode is used to specify the behavior of the save operation when #' data already exists in the data source. There are four modes: \cr -#' append: Contents of this DataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. \cr +#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr +#' overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. \cr #' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the DataFrame +#' ignore: The save operation is expected to not save the contents of the SparkDataFrame #' and to not change the existing data. \cr #' -#' @param df A SparkSQL DataFrame +#' @param df A SparkDataFrame #' @param tableName A name for the table #' @param source A name for external data source #' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default) #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname saveAsTable #' @name saveAsTable #' @export @@ -2081,7 +2085,7 @@ setMethod("saveDF", #' saveAsTable(df, "myfile") #' } setMethod("saveAsTable", - signature(df = "DataFrame", tableName = "character"), + signature(df = "SparkDataFrame", tableName = "character"), function(df, tableName, source = NULL, mode="error", ...){ if (is.null(source)) { if (exists(".sparkRSQLsc", envir = .sparkREnv)) { @@ -2109,11 +2113,11 @@ setMethod("saveAsTable", #' Computes statistics for numeric columns. #' If no columns are given, this function computes statistics for all numerical columns. #' -#' @param x A DataFrame to be computed. +#' @param x A SparkDataFrame to be computed. #' @param col A string of name #' @param ... 
Additional expressions -#' @return A DataFrame -#' @family DataFrame functions +#' @return A SparkDataFrame +#' @family SparkDataFrame functions #' @rdname summary #' @name describe #' @export @@ -2128,7 +2132,7 @@ setMethod("saveAsTable", #' describe(df, "col1", "col2") #' } setMethod("describe", - signature(x = "DataFrame", col = "character"), + signature(x = "SparkDataFrame", col = "character"), function(x, col, ...) { colList <- list(col, ...) sdf <- callJMethod(x@sdf, "describe", colList) @@ -2138,7 +2142,7 @@ setMethod("describe", #' @rdname summary #' @name describe setMethod("describe", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x) { colList <- as.list(c(columns(x))) sdf <- callJMethod(x@sdf, "describe", colList) @@ -2148,7 +2152,7 @@ setMethod("describe", #' @rdname summary #' @name summary setMethod("summary", - signature(object = "DataFrame"), + signature(object = "SparkDataFrame"), function(object, ...) { describe(object) }) @@ -2156,9 +2160,9 @@ setMethod("summary", #' dropna #' -#' Returns a new DataFrame omitting rows with null values. +#' Returns a new SparkDataFrame omitting rows with null values. #' -#' @param x A SparkSQL DataFrame. +#' @param x A SparkDataFrame. #' @param how "any" or "all". #' if "any", drop a row if it contains any nulls. #' if "all", drop a row only if all its values are null. @@ -2167,9 +2171,9 @@ setMethod("summary", #' minNonNulls non-null values. #' This overwrites the how parameter. #' @param cols Optional list of column names to consider. -#' @return A DataFrame +#' @return A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname nafunctions #' @name dropna #' @export @@ -2182,7 +2186,7 @@ setMethod("summary", #' dropna(df) #' } setMethod("dropna", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { how <- match.arg(how) if (is.null(cols)) { @@ -2202,7 +2206,7 @@ setMethod("dropna", #' @name na.omit #' @export setMethod("na.omit", - signature(object = "DataFrame"), + signature(object = "SparkDataFrame"), function(object, how = c("any", "all"), minNonNulls = NULL, cols = NULL) { dropna(object, how, minNonNulls, cols) }) @@ -2211,7 +2215,7 @@ setMethod("na.omit", #' #' Replace null values. #' -#' @param x A SparkSQL DataFrame. +#' @param x A SparkDataFrame. #' @param value Value to replace null values with. #' Should be an integer, numeric, character or named list. #' If the value is a named list, then cols is ignored and @@ -2237,7 +2241,7 @@ setMethod("na.omit", #' fillna(df, list("age" = 20, "name" = "unknown")) #' } setMethod("fillna", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, value, cols = NULL) { if (!(class(value) %in% c("integer", "numeric", "character", "list"))) { stop("value should be an integer, numeric, charactor or named list.") @@ -2280,14 +2284,14 @@ setMethod("fillna", dataFrame(sdf) }) -#' This function downloads the contents of a DataFrame into an R's data.frame. +#' This function downloads the contents of a SparkDataFrame into an R's data.frame. #' Since data.frames are held in memory, ensure that you have enough memory #' in your system to accommodate the contents. 
#' -#' @title Download data from a DataFrame into a data.frame -#' @param x a DataFrame +#' @title Download data from a SparkDataFrame into a data.frame +#' @param x a SparkDataFrame #' @return a data.frame -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname as.data.frame #' @examples \dontrun{ #' @@ -2295,24 +2299,24 @@ setMethod("fillna", #' df <- as.data.frame(irisDF[irisDF$Species == "setosa", ]) #' } setMethod("as.data.frame", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, row.names = NULL, optional = FALSE, ...) { as.data.frame(collect(x), row.names, optional, ...) }) -#' The specified DataFrame is attached to the R search path. This means that -#' the DataFrame is searched by R when evaluating a variable, so columns in -#' the DataFrame can be accessed by simply giving their names. +#' The specified SparkDataFrame is attached to the R search path. This means that +#' the SparkDataFrame is searched by R when evaluating a variable, so columns in +#' the SparkDataFrame can be accessed by simply giving their names. #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname attach -#' @title Attach DataFrame to R search path -#' @param what (DataFrame) The DataFrame to attach +#' @title Attach SparkDataFrame to R search path +#' @param what (SparkDataFrame) The SparkDataFrame to attach #' @param pos (integer) Specify position in search() where to attach. -#' @param name (character) Name to use for the attached DataFrame. Names +#' @param name (character) Name to use for the attached SparkDataFrame. Names #' starting with package: are reserved for library. #' @param warn.conflicts (logical) If TRUE, warnings are printed about conflicts -#' from attaching the database, unless that DataFrame contains an object +#' from attaching the database, unless that SparkDataFrame contains an object #' @examples #' \dontrun{ #' attach(irisDf) @@ -2320,21 +2324,21 @@ setMethod("as.data.frame", #' } #' @seealso \link{detach} setMethod("attach", - signature(what = "DataFrame"), + signature(what = "SparkDataFrame"), function(what, pos = 2, name = deparse(substitute(what)), warn.conflicts = TRUE) { newEnv <- assignNewEnv(what) attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) -#' Evaluate a R expression in an environment constructed from a DataFrame -#' with() allows access to columns of a DataFrame by simply referring to -#' their name. It appends every column of a DataFrame into a new +#' Evaluate a R expression in an environment constructed from a SparkDataFrame +#' with() allows access to columns of a SparkDataFrame by simply referring to +#' their name. It appends every column of a SparkDataFrame into a new #' environment. Then, the given expression is evaluated in this new #' environment. #' #' @rdname with -#' @title Evaluate a R expression in an environment constructed from a DataFrame -#' @param data (DataFrame) DataFrame to use for constructing an environment. +#' @title Evaluate a R expression in an environment constructed from a SparkDataFrame +#' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. #' @param ... arguments to be passed to future methods. #' @examples @@ -2343,28 +2347,28 @@ setMethod("attach", #' } #' @seealso \link{attach} setMethod("with", - signature(data = "DataFrame"), + signature(data = "SparkDataFrame"), function(data, expr, ...) 
{ newEnv <- assignNewEnv(data) eval(substitute(expr), envir = newEnv, enclos = newEnv) }) -#' Display the structure of a DataFrame, including column names, column types, as well as a +#' Display the structure of a SparkDataFrame, including column names, column types, as well as a #' a small sample of rows. #' @name str #' @title Compactly display the structure of a dataset #' @rdname str -#' @family DataFrame functions -#' @param object a DataFrame +#' @family SparkDataFrame functions +#' @param object a SparkDataFrame #' @examples \dontrun{ -#' # Create a DataFrame from the Iris dataset +#' # Create a SparkDataFrame from the Iris dataset #' irisDF <- createDataFrame(sqlContext, iris) #' -#' # Show the structure of the DataFrame +#' # Show the structure of the SparkDataFrame #' str(irisDF) #' } setMethod("str", - signature(object = "DataFrame"), + signature(object = "SparkDataFrame"), function(object) { # TODO: These could be made global parameters, though in R it's not the case @@ -2424,14 +2428,14 @@ setMethod("str", #' drop #' -#' Returns a new DataFrame with columns dropped. +#' Returns a new SparkDataFrame with columns dropped. #' This is a no-op if schema doesn't contain column name(s). -#' -#' @param x A SparkSQL DataFrame. +#' +#' @param x A SparkDataFrame. #' @param cols A character vector of column names or a Column. -#' @return A DataFrame +#' @return A SparkDataFrame #' -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname drop #' @name drop #' @export @@ -2446,7 +2450,7 @@ setMethod("str", #' drop(df, df$col1) #' } setMethod("drop", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, col) { stopifnot(class(col) == "character" || class(col) == "Column") @@ -2465,23 +2469,24 @@ setMethod("drop", base::drop(x) }) -#' Saves the content of the DataFrame to an external database table via JDBC +#' Saves the content of the SparkDataFrame to an external database table via JDBC #' #' Additional JDBC database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when #' data already exists in the data source. There are four modes: \cr -#' append: Contents of this DataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. \cr +#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr +#' overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. \cr #' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the DataFrame +#' ignore: The save operation is expected to not save the contents of the SparkDataFrame #' and to not change the existing data. 
\cr #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param url JDBC database url of the form `jdbc:subprotocol:subname` #' @param tableName The name of the table in the external database #' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default) -#' @family DataFrame functions +#' @family SparkDataFrame functions #' @rdname write.jdbc #' @name write.jdbc #' @export @@ -2493,7 +2498,7 @@ setMethod("drop", #' write.jdbc(df, jdbcUrl, "table", user = "username", password = "password") #' } setMethod("write.jdbc", - signature(x = "DataFrame", url = "character", tableName = "character"), + signature(x = "SparkDataFrame", url = "character", tableName = "character"), function(x, url, tableName, mode = "error", ...){ jmode <- convertToJSaveMode(mode) jprops <- varargsToJProperties(...) diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 35c4e6f1af..34d29ddbfd 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -46,7 +46,7 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode, # RDD has three serialization types: # byte: The RDD stores data serialized in R. # string: The RDD stores data as strings. - # row: The RDD stores the serialized rows of a DataFrame. + # row: The RDD stores the serialized rows of a SparkDataFrame. # We use an environment to store mutable states inside an RDD object. # Note that R's call-by-value semantics makes modifying slots inside an @@ -114,7 +114,7 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) #' @noRd #' @param jrdd Java object reference to the backing JavaRDD #' @param serializedMode Use "byte" if the RDD stores data serialized in R, "string" if the RDD -#' stores strings, and "row" if the RDD stores the rows of a DataFrame +#' stores strings, and "row" if the RDD stores the rows of a SparkDataFrame #' @param isCached TRUE if the RDD is cached #' @param isCheckpointed TRUE if the RDD has been checkpointed RDD <- function(jrdd, serializedMode = "byte", isCached = FALSE, diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index b726c1e1b9..3824e0a995 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -34,7 +34,7 @@ getInternalType <- function(x) { Date = "date", POSIXlt = "timestamp", POSIXct = "timestamp", - stop(paste("Unsupported type for DataFrame:", class(x)))) + stop(paste("Unsupported type for SparkDataFrame:", class(x)))) } #' infer the SQL type @@ -70,14 +70,14 @@ infer_type <- function(x) { } } -#' Create a DataFrame +#' Create a SparkDataFrame #' -#' Converts R data.frame or list into DataFrame. +#' Converts R data.frame or list into SparkDataFrame. #' #' @param sqlContext A SQLContext #' @param data An RDD or list or data.frame #' @param schema a list of column names or named list (StructType), optional -#' @return an DataFrame +#' @return a SparkDataFrame #' @rdname createDataFrame #' @export #' @examples @@ -173,11 +173,11 @@ as.DataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) { #' toDF #' -#' Converts an RDD to a DataFrame by infer the types. +#' Converts an RDD to a SparkDataFrame by infer the types. #' #' @param x An RDD #' -#' @rdname DataFrame +#' @rdname SparkDataFrame #' @noRd #' @examples #'\dontrun{ @@ -200,14 +200,14 @@ setMethod("toDF", signature(x = "RDD"), createDataFrame(sqlContext, x, ...) }) -#' Create a DataFrame from a JSON file. +#' Create a SparkDataFrame from a JSON file. 
#' -#' Loads a JSON file (one object per line), returning the result as a DataFrame +#' Loads a JSON file (one object per line), returning the result as a SparkDataFrame #' It goes through the entire dataset once to determine the schema. #' #' @param sqlContext SQLContext to use #' @param path Path of file to read. A vector of multiple paths is allowed. -#' @return DataFrame +#' @return SparkDataFrame #' @rdname read.json #' @name read.json #' @export @@ -238,13 +238,13 @@ jsonFile <- function(sqlContext, path) { #' JSON RDD #' -#' Loads an RDD storing one JSON object per string as a DataFrame. +#' Loads an RDD storing one JSON object per string as a SparkDataFrame. #' #' @param sqlContext SQLContext to use #' @param rdd An RDD of JSON string #' @param schema A StructType object to use as schema #' @param samplingRatio The ratio of simpling used to infer the schema -#' @return A DataFrame +#' @return A SparkDataFrame #' @noRd #' @examples #'\dontrun{ @@ -268,13 +268,13 @@ jsonRDD <- function(sqlContext, rdd, schema = NULL, samplingRatio = 1.0) { } } -#' Create a DataFrame from a Parquet file. +#' Create a SparkDataFrame from a Parquet file. #' -#' Loads a Parquet file, returning the result as a DataFrame. +#' Loads a Parquet file, returning the result as a SparkDataFrame. #' #' @param sqlContext SQLContext to use #' @param path Path of file to read. A vector of multiple paths is allowed. -#' @return DataFrame +#' @return SparkDataFrame #' @rdname read.parquet #' @name read.parquet #' @export @@ -295,14 +295,14 @@ parquetFile <- function(sqlContext, ...) { read.parquet(sqlContext, unlist(list(...))) } -#' Create a DataFrame from a text file. +#' Create a SparkDataFrame from a text file. #' -#' Loads a text file and returns a DataFrame with a single string column named "value". -#' Each line in the text file is a new row in the resulting DataFrame. +#' Loads a text file and returns a SparkDataFrame with a single string column named "value". +#' Each line in the text file is a new row in the resulting SparkDataFrame. #' #' @param sqlContext SQLContext to use #' @param path Path of file to read. A vector of multiple paths is allowed. -#' @return DataFrame +#' @return SparkDataFrame #' @rdname read.text #' @name read.text #' @export @@ -323,11 +323,11 @@ read.text <- function(sqlContext, path) { #' SQL Query #' -#' Executes a SQL query using Spark, returning the result as a DataFrame. +#' Executes a SQL query using Spark, returning the result as a SparkDataFrame. #' #' @param sqlContext SQLContext to use #' @param sqlQuery A character vector containing the SQL query -#' @return DataFrame +#' @return SparkDataFrame #' @export #' @examples #'\dontrun{ @@ -344,14 +344,14 @@ sql <- function(sqlContext, sqlQuery) { dataFrame(sdf) } -#' Create a DataFrame from a SparkSQL Table +#' Create a SparkDataFrame from a SparkSQL Table #' -#' Returns the specified Table as a DataFrame. The Table must have already been registered +#' Returns the specified Table as a SparkDataFrame. The Table must have already been registered #' in the SQLContext. #' #' @param sqlContext SQLContext to use -#' @param tableName The SparkSQL Table to convert to a DataFrame. -#' @return DataFrame +#' @param tableName The SparkSQL Table to convert to a SparkDataFrame. +#' @return SparkDataFrame #' @rdname tableToDF #' @name tableToDF #' @export @@ -372,11 +372,11 @@ tableToDF <- function(sqlContext, tableName) { #' Tables #' -#' Returns a DataFrame containing names of tables in the given database. 
+#' Returns a SparkDataFrame containing names of tables in the given database. #' #' @param sqlContext SQLContext to use #' @param databaseName name of the database -#' @return a DataFrame +#' @return a SparkDataFrame #' @export #' @examples #'\dontrun{ @@ -425,7 +425,7 @@ tableNames <- function(sqlContext, databaseName = NULL) { #' #' @param sqlContext SQLContext to use #' @param tableName The name of the table being cached -#' @return DataFrame +#' @return SparkDataFrame #' @export #' @examples #'\dontrun{ @@ -447,7 +447,7 @@ cacheTable <- function(sqlContext, tableName) { #' #' @param sqlContext SQLContext to use #' @param tableName The name of the table being uncached -#' @return DataFrame +#' @return SparkDataFrame #' @export #' @examples #'\dontrun{ @@ -500,9 +500,9 @@ dropTempTable <- function(sqlContext, tableName) { callJMethod(sqlContext, "dropTempTable", tableName) } -#' Load an DataFrame +#' Load a SparkDataFrame #' -#' Returns the dataset in a data source as a DataFrame +#' Returns the dataset in a data source as a SparkDataFrame #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by @@ -512,7 +512,7 @@ dropTempTable <- function(sqlContext, tableName) { #' @param path The path of files to load #' @param source The name of external data source #' @param schema The data schema defined in structType -#' @return DataFrame +#' @return SparkDataFrame #' @rdname read.df #' @name read.df #' @export @@ -556,7 +556,7 @@ loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) { #' Create an external table #' #' Creates an external table based on the dataset in a data source, -#' Returns the DataFrame associated with the external table. +#' Returns a SparkDataFrame associated with the external table. #' #' The data source is specified by the `source` and a set of options(...). #' If `source` is not specified, the default data source configured by @@ -566,7 +566,7 @@ loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) { #' @param tableName A name of the table #' @param path The path of files to load #' @param source the name of external data source -#' @return DataFrame +#' @return SparkDataFrame #' @export #' @examples #'\dontrun{ @@ -584,7 +584,7 @@ createExternalTable <- function(sqlContext, tableName, path = NULL, source = NUL dataFrame(sdf) } -#' Create a DataFrame representing the database table accessible via JDBC URL +#' Create a SparkDataFrame representing the database table accessible via JDBC URL #' #' Additional JDBC database connection properties can be set (...) #' @@ -605,7 +605,7 @@ createExternalTable <- function(sqlContext, tableName, path = NULL, source = NUL #' clause expressions used to split the column `partitionColumn` evenly. #' This defaults to SparkContext.defaultParallelism when unset. 
#' @param predicates a list of conditions in the where clause; each one defines one partition -#' @return DataFrame +#' @return SparkDataFrame #' @rdname read.jdbc #' @name read.jdbc #' @export diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 3ffd9a9890..a3e09372bb 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -22,11 +22,11 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a DataFrame column -#' @description The column class supports unary, binary operations on DataFrame columns +#' @title S4 class that represents a SparkDataFrame column +#' @description The column class supports unary, binary operations on SparkDataFrame columns #' @rdname column #' -#' @slot jc reference to JVM DataFrame column +#' @slot jc reference to JVM SparkDataFrame column #' @export setClass("Column", slots = list(jc = "jobj")) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index eefdf17873..ce071b1a84 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -139,7 +139,7 @@ readEnv <- function(con) { env } -# Read a field of StructType from DataFrame +# Read a field of StructType from SparkDataFrame # into a named list in R whose class is "struct" readStruct <- function(con) { names <- readObject(con) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 54234b0455..4a0bdf3315 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2097,7 +2097,7 @@ setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeri #' expr #' #' Parses the expression string into the column that it represents, similar to -#' DataFrame.selectExpr +#' SparkDataFrame.selectExpr #' #' @family normal_funcs #' @rdname expr diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 6b67258d77..04274a12bc 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -385,7 +385,7 @@ setGeneric("subtractByKey", setGeneric("value", function(bcast) { standardGeneric("value") }) -#################### DataFrame Methods ######################## +#################### SparkDataFrame Methods ######################## #' @rdname agg #' @export diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 23b49aebda..08f4a490c8 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -23,7 +23,7 @@ NULL setOldClass("jobj") #' @title S4 class that represents a GroupedData -#' @description GroupedDatas can be created using groupBy() on a DataFrame +#' @description GroupedDatas can be created using groupBy() on a SparkDataFrame #' @rdname GroupedData #' @seealso groupBy #' @@ -37,7 +37,7 @@ setMethod("initialize", "GroupedData", function(.Object, sgd) { .Object }) -#' @rdname DataFrame +#' @rdname GroupedData groupedData <- function(sgd) { new("GroupedData", sgd) } @@ -52,10 +52,10 @@ setMethod("show", "GroupedData", #' Count #' #' Count the number of rows for each group. -#' The resulting DataFrame will also contain the grouping columns. +#' The resulting SparkDataFrame will also contain the grouping columns. #' #' @param x a GroupedData -#' @return a DataFrame +#' @return a SparkDataFrame #' @rdname agg #' @export #' @examples @@ -70,14 +70,14 @@ setMethod("count", #' summarize #' -#' Aggregates on the entire DataFrame without groups. -#' The resulting DataFrame will also contain the grouping columns. +#' Aggregates on the entire SparkDataFrame without groups. +#' The resulting SparkDataFrame will also contain the grouping columns. 
#' #' df2 <- agg(df, = ) #' df2 <- agg(df, newColName = aggFunction(column)) #' #' @param x a GroupedData -#' @return a DataFrame +#' @return a SparkDataFrame #' @rdname summarize #' @name agg #' @family agg_funcs diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 922a9b13db..7dd82963a1 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -43,7 +43,7 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param data DataFrame for training. +#' @param data SparkDataFrame for training. #' @param family A description of the error distribution and link function to be used in the model. #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer R family at @@ -62,7 +62,7 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian") #' summary(model) #' } -setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"), +setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) { if (is.character(family)) { family <- get(family, mode = "function", envir = parent.frame()) @@ -155,8 +155,8 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { #' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict(). #' #' @param object A fitted generalized linear model -#' @param newData DataFrame for testing -#' @return DataFrame containing predicted labels in a column named "prediction" +#' @param newData SparkDataFrame for testing +#' @return SparkDataFrame containing predicted labels in a column named "prediction" #' @rdname predict #' @export #' @examples @@ -175,8 +175,8 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), #' Makes predictions from a model produced by naiveBayes(), similarly to R package e1071's predict. #' #' @param object A fitted naive Bayes model -#' @param newData DataFrame for testing -#' @return DataFrame containing predicted labels in a column named "prediction" +#' @param newData SparkDataFrame for testing +#' @return SparkDataFrame containing predicted labels in a column named "prediction" #' @rdname predict #' @export #' @examples @@ -223,7 +223,7 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' #' Fit a k-means model, similarly to R's kmeans(). #' -#' @param x DataFrame for training +#' @param x SparkDataFrame for training #' @param centers Number of centers #' @param iter.max Maximum iteration number #' @param algorithm Algorithm choosen to fit the model @@ -234,7 +234,7 @@ setMethod("summary", signature(object = "NaiveBayesModel"), #' \dontrun{ #' model <- kmeans(x, centers = 2, algorithm="random") #' } -setMethod("kmeans", signature(x = "DataFrame"), +setMethod("kmeans", signature(x = "SparkDataFrame"), function(x, centers, iter.max = 10, algorithm = c("random", "k-means||")) { columnNames <- as.array(colnames(x)) algorithm <- match.arg(algorithm) @@ -248,7 +248,7 @@ setMethod("kmeans", signature(x = "DataFrame"), #' Get fitted result from a k-means model, similarly to R's fitted(). 
#' #' @param object A fitted k-means model -#' @return DataFrame containing fitted values +#' @return SparkDataFrame containing fitted values #' @rdname fitted #' @export #' @examples @@ -296,8 +296,8 @@ setMethod("summary", signature(object = "KMeansModel"), #' Make predictions from a model produced by kmeans(). #' #' @param object A fitted k-means model -#' @param newData DataFrame for testing -#' @return DataFrame containing predicted labels in a column named "prediction" +#' @param newData SparkDataFrame for testing +#' @return SparkDataFrame containing predicted labels in a column named "prediction" #' @rdname predict #' @export #' @examples @@ -314,12 +314,12 @@ setMethod("predict", signature(object = "KMeansModel"), #' Fit a Bernoulli naive Bayes model #' #' Fit a Bernoulli naive Bayes model, similarly to R package e1071's naiveBayes() while only -#' categorical features are supported. The input should be a DataFrame of observations instead of a -#' contingency table. +#' categorical features are supported. The input should be a SparkDataFrame of observations instead +#' of a contingency table. #' #' @param object A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param data DataFrame for training +#' @param data SparkDataFrame for training #' @param laplace Smoothing parameter #' @return a fitted naive Bayes model #' @rdname naiveBayes @@ -330,7 +330,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' df <- createDataFrame(sqlContext, infert) #' model <- naiveBayes(education ~ ., df, laplace = 0) #'} -setMethod("naiveBayes", signature(formula = "formula", data = "DataFrame"), +setMethod("naiveBayes", signature(formula = "formula", data = "SparkDataFrame"), function(formula, data, laplace = 0, ...) { formula <- paste(deparse(formula), collapse = "") jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit", @@ -345,7 +345,7 @@ setMethod("naiveBayes", signature(formula = "formula", data = "DataFrame"), #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', ':', '+', and '-'. #' Note that operator '.' is not supported currently. -#' @param data DataFrame for training. +#' @param data SparkDataFrame for training. #' @return a fitted AFT survival regression model #' @rdname survreg #' @seealso survival: \url{https://cran.r-project.org/web/packages/survival/} @@ -355,7 +355,7 @@ setMethod("naiveBayes", signature(formula = "formula", data = "DataFrame"), #' df <- createDataFrame(sqlContext, ovarian) #' model <- survreg(Surv(futime, fustat) ~ ecog_ps + rx, df) #' } -setMethod("survreg", signature(formula = "formula", data = "DataFrame"), +setMethod("survreg", signature(formula = "formula", data = "SparkDataFrame"), function(formula, data, ...) { formula <- paste(deparse(formula), collapse = "") jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper", @@ -393,8 +393,8 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), #' Make predictions from a model produced by survreg(), similarly to R package survival's predict. 
#' #' @param object A fitted AFT survival regression model -#' @param newData DataFrame for testing -#' @return DataFrame containing predicted labels in a column named "prediction" +#' @param newData SparkDataFrame for testing +#' @return SparkDataFrame containing predicted labels in a column named "prediction" #' @rdname predict #' @export #' @examples diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index c6ddb56227..039aa008b3 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -16,11 +16,11 @@ # # A set of S3 classes and methods that support the SparkSQL `StructType` and `StructField -# datatypes. These are used to create and interact with DataFrame schemas. +# datatypes. These are used to create and interact with SparkDataFrame schemas. #' structType #' -#' Create a structType object that contains the metadata for a DataFrame. Intended for +#' Create a structType object that contains the metadata for a SparkDataFrame. Intended for #' use with createDataFrame and toDF. #' #' @param x a structField object (created with the field() function) @@ -171,7 +171,7 @@ checkType <- function(type) { }) } - stop(paste("Unsupported type for Dataframe:", type)) + stop(paste("Unsupported type for SparkDataframe:", type)) } structField.character <- function(x, type, nullable = TRUE) { diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index edf72937c6..879b664421 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -15,7 +15,7 @@ # limitations under the License. # -# stats.R - Statistic functions for DataFrames. +# stats.R - Statistic functions for SparkDataFrames. setOldClass("jobj") @@ -41,7 +41,7 @@ setOldClass("jobj") #' ct <- crosstab(df, "title", "gender") #' } setMethod("crosstab", - signature(x = "DataFrame", col1 = "character", col2 = "character"), + signature(x = "SparkDataFrame", col1 = "character", col2 = "character"), function(x, col1, col2) { statFunctions <- callJMethod(x@sdf, "stat") sct <- callJMethod(statFunctions, "crosstab", col1, col2) @@ -50,9 +50,9 @@ setMethod("crosstab", #' cov #' -#' Calculate the sample covariance of two numerical columns of a DataFrame. +#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column #' @return the covariance of the two columns. @@ -66,7 +66,7 @@ setMethod("crosstab", #' cov <- cov(df, "title", "gender") #' } setMethod("cov", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, col1, col2) { stopifnot(class(col1) == "character" && class(col2) == "character") statFunctions <- callJMethod(x@sdf, "stat") @@ -75,11 +75,11 @@ setMethod("cov", #' corr #' -#' Calculates the correlation of two columns of a DataFrame. +#' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column #' @param method Optional. A character specifying the method for calculating the correlation. 
@@ -96,7 +96,7 @@ setMethod("cov", #' corr <- corr(df, "title", "gender", method = "pearson") #' } setMethod("corr", - signature(x = "DataFrame"), + signature(x = "SparkDataFrame"), function(x, col1, col2, method = "pearson") { stopifnot(class(col1) == "character" && class(col2) == "character") statFunctions <- callJMethod(x@sdf, "stat") @@ -109,7 +109,7 @@ setMethod("corr", #' Using the frequent element count algorithm described in #' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. #' -#' @param x A SparkSQL DataFrame. +#' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. #' @param support (Optional) The minimum frequency for an item to be considered `frequent`. #' Should be greater than 1e-4. Default support = 0.01. @@ -123,7 +123,7 @@ setMethod("corr", #' df <- jsonFile(sqlContext, "/path/to/file.json") #' fi = freqItems(df, c("title", "gender")) #' } -setMethod("freqItems", signature(x = "DataFrame", cols = "character"), +setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), function(x, cols, support = 0.01) { statFunctions <- callJMethod(x@sdf, "stat") sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) @@ -132,18 +132,18 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"), #' approxQuantile #' -#' Calculates the approximate quantiles of a numerical column of a DataFrame. +#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' #' The result of this algorithm has the following deterministic bound: -#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error -#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank -#' of `x` is close to (p * N). More precisely, +#' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to +#' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the +#' *exact* rank of `x` is close to (p * N). More precisely, #' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). #' This method implements a variation of the Greenwald-Khanna algorithm (with some speed #' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 #' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. #' -#' @param x A SparkSQL DataFrame. +#' @param x A SparkDataFrame. #' @param col The name of the numerical column. #' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1]. #' For example 0 is the minimum, 0.5 is the median, 1 is the maximum. @@ -161,7 +161,7 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"), #' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) #' } setMethod("approxQuantile", - signature(x = "DataFrame", col = "character", + signature(x = "SparkDataFrame", col = "character", probabilities = "numeric", relativeError = "numeric"), function(x, col, probabilities, relativeError) { statFunctions <- callJMethod(x@sdf, "stat") @@ -173,12 +173,12 @@ setMethod("approxQuantile", #' #' Returns a stratified sample without replacement based on the fraction given on each stratum. #' -#' @param x A SparkSQL DataFrame +#' @param x A SparkDataFrame #' @param col column that defines strata #' @param fractions A named list giving sampling fraction for each stratum. If a stratum is #' not specified, we treat its fraction as zero. 
#' @param seed random seed -#' @return A new DataFrame that represents the stratified sample +#' @return A new SparkDataFrame that represents the stratified sample #' #' @rdname statfunctions #' @name sampleBy @@ -189,7 +189,7 @@ setMethod("approxQuantile", #' sample <- sampleBy(df, "key", fractions, 36) #' } setMethod("sampleBy", - signature(x = "DataFrame", col = "character", + signature(x = "SparkDataFrame", col = "character", fractions = "list", seed = "numeric"), function(x, col, fractions, seed) { fractionsEnv <- convertNamedListToEnv(fractions) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index b425ccf6e7..ba7a611e64 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -626,7 +626,7 @@ convertNamedListToEnv <- function(namedList) { # Assign a new environment for attach() and with() methods assignNewEnv <- function(data) { - stopifnot(class(data) == "DataFrame") + stopifnot(class(data) == "SparkDataFrame") cols <- columns(data) stopifnot(length(cols) > 0) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index b923ccf6bb..9bd3975405 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -101,8 +101,8 @@ test_that("create DataFrame from RDD", { rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) }) df <- createDataFrame(sqlContext, rdd, list("a", "b")) dfAsDF <- as.DataFrame(sqlContext, rdd, list("a", "b")) - expect_is(df, "DataFrame") - expect_is(dfAsDF, "DataFrame") + expect_is(df, "SparkDataFrame") + expect_is(dfAsDF, "SparkDataFrame") expect_equal(count(df), 10) expect_equal(count(dfAsDF), 10) expect_equal(nrow(df), 10) @@ -118,21 +118,21 @@ test_that("create DataFrame from RDD", { df <- createDataFrame(sqlContext, rdd) dfAsDF <- as.DataFrame(sqlContext, rdd) - expect_is(df, "DataFrame") - expect_is(dfAsDF, "DataFrame") + expect_is(df, "SparkDataFrame") + expect_is(dfAsDF, "SparkDataFrame") expect_equal(columns(df), c("_1", "_2")) expect_equal(columns(dfAsDF), c("_1", "_2")) schema <- structType(structField(x = "a", type = "integer", nullable = TRUE), structField(x = "b", type = "string", nullable = TRUE)) df <- createDataFrame(sqlContext, rdd, schema) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) }) df <- createDataFrame(sqlContext, rdd) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 10) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) @@ -155,7 +155,7 @@ test_that("create DataFrame from RDD", { age = c(19L, 23L, 18L), height = c(176.5, 181.4, 173.7)) df <- createDataFrame(sqlContext, localDF, schema) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) expect_equal(columns(df), c("name", "age", "height")) expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float"))) @@ -218,25 +218,25 @@ test_that("convert NAs to null type in DataFrames", { test_that("toDF", { rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) }) df <- toDF(rdd, list("a", "b")) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 10) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) df <- toDF(rdd) - expect_is(df, 
"DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(columns(df), c("_1", "_2")) schema <- structType(structField(x = "a", type = "integer", nullable = TRUE), structField(x = "b", type = "string", nullable = TRUE)) df <- toDF(rdd, schema) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) }) df <- toDF(rdd) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 10) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) @@ -377,7 +377,7 @@ test_that("Collect DataFrame with complex types", { test_that("read/write json files", { # Test read.df df <- read.df(sqlContext, jsonPath, "json") - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) # Test read.df with a user defined schema @@ -385,17 +385,17 @@ test_that("read/write json files", { structField("age", type = "double")) df1 <- read.df(sqlContext, jsonPath, "json", schema) - expect_is(df1, "DataFrame") + expect_is(df1, "SparkDataFrame") expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double"))) # Test loadDF df2 <- loadDF(sqlContext, jsonPath, "json", schema) - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double"))) # Test read.json df <- read.json(sqlContext, jsonPath) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) # Test write.df @@ -408,11 +408,11 @@ test_that("read/write json files", { # Test read.json()/jsonFile() works with multiple input paths jsonDF1 <- read.json(sqlContext, c(jsonPath2, jsonPath3)) - expect_is(jsonDF1, "DataFrame") + expect_is(jsonDF1, "SparkDataFrame") expect_equal(count(jsonDF1), 6) # Suppress warnings because jsonFile is deprecated jsonDF2 <- suppressWarnings(jsonFile(sqlContext, c(jsonPath2, jsonPath3))) - expect_is(jsonDF2, "DataFrame") + expect_is(jsonDF2, "SparkDataFrame") expect_equal(count(jsonDF2), 6) unlink(jsonPath2) @@ -423,12 +423,12 @@ test_that("jsonRDD() on a RDD with json string", { rdd <- parallelize(sc, mockLines) expect_equal(count(rdd), 3) df <- suppressWarnings(jsonRDD(sqlContext, rdd)) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) rdd2 <- flatMap(rdd, function(x) c(x, x)) df <- suppressWarnings(jsonRDD(sqlContext, rdd2)) - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 6) }) @@ -454,7 +454,7 @@ test_that("registerTempTable() results in a queryable table and sql() results in df <- read.json(sqlContext, jsonPath) registerTempTable(df, "table1") newdf <- sql(sqlContext, "SELECT * FROM table1 where name = 'Michael'") - expect_is(newdf, "DataFrame") + expect_is(newdf, "SparkDataFrame") expect_equal(count(newdf), 1) dropTempTable(sqlContext, "table1") }) @@ -493,7 +493,7 @@ test_that("tableToDF() returns a new DataFrame", { df <- read.json(sqlContext, jsonPath) registerTempTable(df, "table1") tabledf <- tableToDF(sqlContext, "table1") - expect_is(tabledf, "DataFrame") + expect_is(tabledf, "SparkDataFrame") expect_equal(count(tabledf), 3) tabledf2 <- tableToDF(sqlContext, "table1") expect_equal(count(tabledf2), 3) @@ -595,7 +595,7 @@ test_that("collect() returns a data.frame", { test_that("limit() returns DataFrame with the correct number of rows", { df <- 
read.json(sqlContext, jsonPath) dfLimited <- limit(df, 2) - expect_is(dfLimited, "DataFrame") + expect_is(dfLimited, "SparkDataFrame") expect_equal(count(dfLimited), 2) }) @@ -750,11 +750,11 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", { df <- read.json(sqlContext, jsonPathWithDup) uniques <- distinct(df) - expect_is(uniques, "DataFrame") + expect_is(uniques, "SparkDataFrame") expect_equal(count(uniques), 3) uniques2 <- unique(df) - expect_is(uniques2, "DataFrame") + expect_is(uniques2, "SparkDataFrame") expect_equal(count(uniques2), 3) # Test dropDuplicates() @@ -798,7 +798,7 @@ test_that("sample on a DataFrame", { df <- read.json(sqlContext, jsonPath) sampled <- sample(df, FALSE, 1.0) expect_equal(nrow(collect(sampled)), count(df)) - expect_is(sampled, "DataFrame") + expect_is(sampled, "SparkDataFrame") sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result expect_true(count(sampled2) < 3) @@ -822,11 +822,11 @@ test_that("select operators", { expect_is(df[[2]], "Column") expect_is(df[["age"]], "Column") - expect_is(df[, 1], "DataFrame") + expect_is(df[, 1], "SparkDataFrame") expect_equal(columns(df[, 1]), c("name")) expect_equal(columns(df[, "age"]), c("age")) df2 <- df[, c("age", "name")] - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(columns(df2), c("age", "name")) df$age2 <- df$age @@ -890,7 +890,7 @@ test_that("subsetting", { expect_equal(collect(filtered)$name, "Andy") df2 <- df[df$age == 19, 1] - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(count(df2), 1) expect_equal(columns(df2), c("name")) expect_equal(collect(df2)$name, "Justin") @@ -940,7 +940,7 @@ test_that("column calculation", { d <- collect(select(df, alias(df$age + 1, "age2"))) expect_equal(names(d), c("age2")) df2 <- select(df, lower(df$name), abs(df$age)) - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(count(df2), 3) }) @@ -953,30 +953,30 @@ test_that("test HiveContext", { skip("Hive is not build with SparkSQL, skipped") }) df <- createExternalTable(hiveCtx, "json", jsonPath, "json") - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) df2 <- sql(hiveCtx, "select * from json") - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(count(df2), 3) jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") invisible(saveAsTable(df, "json2", "json", "append", path = jsonPath2)) df3 <- sql(hiveCtx, "select * from json2") - expect_is(df3, "DataFrame") + expect_is(df3, "SparkDataFrame") expect_equal(count(df3), 3) unlink(jsonPath2) hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") invisible(saveAsTable(df, "hivetestbl", path = hivetestDataPath)) df4 <- sql(hiveCtx, "select * from hivetestbl") - expect_is(df4, "DataFrame") + expect_is(df4, "SparkDataFrame") expect_equal(count(df4), 3) unlink(hivetestDataPath) parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") invisible(saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)) df5 <- sql(hiveCtx, "select * from parquetest") - expect_is(df5, "DataFrame") + expect_is(df5, "SparkDataFrame") expect_equal(count(df5), 3) unlink(parquetDataPath) }) @@ -1272,28 +1272,28 @@ test_that("group by, agg functions", { gd <- groupBy(df, "name") expect_is(gd, "GroupedData") df2 <- count(gd) - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(3, count(df2)) # Also test group_by, summarize, mean gd1 <- 
group_by(df, "name") expect_is(gd1, "GroupedData") df_summarized <- summarize(gd, mean_age = mean(df$age)) - expect_is(df_summarized, "DataFrame") + expect_is(df_summarized, "SparkDataFrame") expect_equal(3, count(df_summarized)) df3 <- agg(gd, age = "stddev") - expect_is(df3, "DataFrame") + expect_is(df3, "SparkDataFrame") df3_local <- collect(df3) expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2])) df4 <- agg(gd, sumAge = sum(df$age)) - expect_is(df4, "DataFrame") + expect_is(df4, "SparkDataFrame") expect_equal(3, count(df4)) expect_equal(columns(df4), c("name", "sumAge")) df5 <- sum(gd, "age") - expect_is(df5, "DataFrame") + expect_is(df5, "SparkDataFrame") expect_equal(3, count(df5)) expect_equal(3, count(mean(gd))) @@ -1521,22 +1521,22 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", { df2 <- read.df(sqlContext, jsonPath2, "json") unioned <- arrange(unionAll(df, df2), df$age) - expect_is(unioned, "DataFrame") + expect_is(unioned, "SparkDataFrame") expect_equal(count(unioned), 6) expect_equal(first(unioned)$name, "Michael") unioned2 <- arrange(rbind(unioned, df, df2), df$age) - expect_is(unioned2, "DataFrame") + expect_is(unioned2, "SparkDataFrame") expect_equal(count(unioned2), 12) expect_equal(first(unioned2)$name, "Michael") excepted <- arrange(except(df, df2), desc(df$age)) - expect_is(unioned, "DataFrame") + expect_is(unioned, "SparkDataFrame") expect_equal(count(excepted), 2) expect_equal(first(excepted)$name, "Justin") intersected <- arrange(intersect(df, df2), df$age) - expect_is(unioned, "DataFrame") + expect_is(unioned, "SparkDataFrame") expect_equal(count(intersected), 1) expect_equal(first(intersected)$name, "Andy") @@ -1601,7 +1601,7 @@ test_that("read/write Parquet files", { # Test write.df and read.df write.df(df, parquetPath, "parquet", mode = "overwrite") df2 <- read.df(sqlContext, parquetPath, "parquet") - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(count(df2), 3) # Test write.parquet/saveAsParquetFile and read.parquet/parquetFile @@ -1610,10 +1610,10 @@ test_that("read/write Parquet files", { parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet") suppressWarnings(saveAsParquetFile(df, parquetPath3)) parquetDF <- read.parquet(sqlContext, c(parquetPath2, parquetPath3)) - expect_is(parquetDF, "DataFrame") + expect_is(parquetDF, "SparkDataFrame") expect_equal(count(parquetDF), count(df) * 2) parquetDF2 <- suppressWarnings(parquetFile(sqlContext, parquetPath2, parquetPath3)) - expect_is(parquetDF2, "DataFrame") + expect_is(parquetDF2, "SparkDataFrame") expect_equal(count(parquetDF2), count(df) * 2) # Test if varargs works with variables @@ -1630,7 +1630,7 @@ test_that("read/write Parquet files", { test_that("read/write text files", { # Test write.df and read.df df <- read.df(sqlContext, jsonPath, "text") - expect_is(df, "DataFrame") + expect_is(df, "SparkDataFrame") expect_equal(colnames(df), c("value")) expect_equal(count(df), 3) textPath <- tempfile(pattern = "textPath", fileext = ".txt") @@ -1640,7 +1640,7 @@ test_that("read/write text files", { textPath2 <- tempfile(pattern = "textPath2", fileext = ".txt") write.text(df, textPath2) df2 <- read.text(sqlContext, c(textPath, textPath2)) - expect_is(df2, "DataFrame") + expect_is(df2, "SparkDataFrame") expect_equal(colnames(df2), c("value")) expect_equal(count(df2), count(df) * 2) @@ -1877,7 +1877,7 @@ test_that("attach() on a DataFrame", { df <- read.json(sqlContext, jsonPath) expect_error(age) attach(df) - expect_is(age, 
"DataFrame") + expect_is(age, "SparkDataFrame") expected_age <- data.frame(age = c(NA, 30, 19)) expect_equal(head(age), expected_age) stat <- summary(age) @@ -1936,7 +1936,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { expect_equal(dtypes(df), list(c("name", "string"), c("age", "double"))) expect_error(coltypes(df) <- c("character"), - "Length of type vector should match the number of columns for DataFrame") + "Length of type vector should match the number of columns for SparkDataFrame") expect_error(coltypes(df) <- c("environment", "list"), "Only atomic type is supported for column types") }) @@ -1950,7 +1950,7 @@ test_that("Method str()", { out <- capture.output(str(irisDF2)) expect_equal(length(out), 7) - expect_equal(out[1], "'DataFrame': 6 variables:") + expect_equal(out[1], "'SparkDataFrame': 6 variables:") expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4") expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9") expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7") -- cgit v1.2.3