authorKai Jiang <jiangkai@gmail.com>2016-06-16 19:39:33 -0700
committerJoseph K. Bradley <joseph@databricks.com>2016-06-16 19:39:33 -0700
commit5fd20b66ffe18c05cf257af7f30d32464d2fe8e7 (patch)
tree4645446733acbc6cc5d31c7f62b52e8f031d2041 /R
parent63470afc997fb9d6b6f8a911c25964743556c9cc (diff)
[SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLlib changes
## What changes were proposed in this pull request?
R docs changes: fix typos, format, and layout.

## How was this patch tested?
Tested locally.

Author: Kai Jiang <jiangkai@gmail.com>

Closes #13394 from vectorijk/spark-15490.
Diffstat (limited to 'R')
-rw-r--r--  R/pkg/R/DataFrame.R    91
-rw-r--r--  R/pkg/R/RDD.R          14
-rw-r--r--  R/pkg/R/WindowSpec.R    7
-rw-r--r--  R/pkg/R/broadcast.R     8
-rw-r--r--  R/pkg/R/column.R        6
-rw-r--r--  R/pkg/R/context.R      41
-rw-r--r--  R/pkg/R/functions.R     2
-rw-r--r--  R/pkg/R/group.R         6
-rw-r--r--  R/pkg/R/mllib.R        34
-rw-r--r--  R/pkg/R/utils.R         2
10 files changed, 123 insertions, 88 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 9a9b3f7eca..d72cbbd79e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -23,9 +23,11 @@ NULL
setOldClass("jobj")
setOldClass("structType")
-#' @title S4 class that represents a SparkDataFrame
-#' @description DataFrames can be created using functions like \link{createDataFrame},
-#' \link{read.json}, \link{table} etc.
+#' S4 class that represents a SparkDataFrame
+#'
+#' DataFrames can be created using functions like \link{createDataFrame},
+#' \link{read.json}, \link{table} etc.
+#'
#' @family SparkDataFrame functions
#' @rdname SparkDataFrame
#' @docType class
@@ -629,8 +631,6 @@ setMethod("repartition",
#'
#' @param x A SparkDataFrame
#' @return A StringRRDD of JSON objects
-#' @family SparkDataFrame functions
-#' @rdname tojson
#' @noRd
#' @examples
#'\dontrun{
@@ -648,7 +648,7 @@ setMethod("toJSON",
RDD(jrdd, serializedMode = "string")
})
-#' write.json
+#' Save the contents of SparkDataFrame as a JSON file
#'
#' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out
#' with this method can be read back in as a SparkDataFrame using read.json().
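For illustration, a minimal round trip with the writer documented above (a sketch assuming a SparkR 2.0 session; the output directory is hypothetical):

df <- createDataFrame(faithful)         # any SparkDataFrame
write.json(df, "/tmp/faithful_json")    # one JSON object per line, one part file per partition
df2 <- read.json("/tmp/faithful_json")  # read the directory back as a SparkDataFrame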
@@ -675,7 +675,7 @@ setMethod("write.json",
invisible(callJMethod(write, "json", path))
})
-#' write.parquet
+#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema.
#'
#' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out
#' with this method can be read back in as a SparkDataFrame using read.parquet().
@@ -713,9 +713,9 @@ setMethod("saveAsParquetFile",
write.parquet(x, path)
})
-#' write.text
+#' Save the content of SparkDataFrame in a text file at the specified path.
#'
-#' Saves the content of the SparkDataFrame in a text file at the specified path.
+#' Save the content of the SparkDataFrame in a text file at the specified path.
#' The SparkDataFrame must have only one column of string type with the name "value".
#' Each row becomes a new line in the output file.
#'
@@ -820,8 +820,6 @@ setMethod("sample_frac",
sample(x, withReplacement, fraction, seed)
})
-#' nrow
-#'
#' Returns the number of rows in a SparkDataFrame
#'
#' @param x A SparkDataFrame
@@ -874,6 +872,8 @@ setMethod("ncol",
length(columns(x))
})
+#' Returns the dimensions of SparkDataFrame
+#'
#' Returns the dimensions (number of rows and columns) of a SparkDataFrame
#' @param x a SparkDataFrame
#'
@@ -2012,8 +2012,9 @@ setMethod("join",
dataFrame(sdf)
})
+#' Merges two data frames
+#'
#' @name merge
-#' @title Merges two data frames
#' @param x the first data frame to be joined
#' @param y the second data frame to be joined
#' @param by a character vector specifying the join columns. If by is not
@@ -2127,7 +2128,6 @@ setMethod("merge",
joinRes
})
-#'
#' Creates a list of columns by replacing the intersected ones with aliases.
#' The name of the alias column is formed by concatenating the original column name and a suffix.
#'
@@ -2182,8 +2182,9 @@ setMethod("unionAll",
dataFrame(unioned)
})
-#' @title Union two or more SparkDataFrames
-#' @description Returns a new SparkDataFrame containing rows of all parameters.
+#' Union two or more SparkDataFrames
+#'
+#' Returns a new SparkDataFrame containing rows of all parameters.
#'
#' @rdname rbind
#' @name rbind
@@ -2254,20 +2255,22 @@ setMethod("except",
dataFrame(excepted)
})
-#' Save the contents of the SparkDataFrame to a data source
+#' Save the contents of SparkDataFrame to a data source.
#'
#' The data source is specified by the `source` and a set of options (...).
#' If `source` is not specified, the default data source configured by
#' spark.sql.sources.default will be used.
#'
-#' Additionally, mode is used to specify the behavior of the save operation when
-#' data already exists in the data source. There are four modes: \cr
-#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr
-#' overwrite: Existing data is expected to be overwritten by the contents of this
-#' SparkDataFrame. \cr
-#' error: An exception is expected to be thrown. \cr
-#' ignore: The save operation is expected to not save the contents of the SparkDataFrame
-#' and to not change the existing data. \cr
+#' Additionally, mode is used to specify the behavior of the save operation when data already
+#' exists in the data source. There are four modes:
+#' \itemize{
+#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data.
+#' \item overwrite: Existing data is expected to be overwritten by the contents of this
+#' SparkDataFrame.
+#' \item error: An exception is expected to be thrown.
+#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame
+#' and to not change the existing data.
+#' }
#'
#' @param df A SparkDataFrame
#' @param path A name for the table
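A short sketch of the save modes listed above (the Parquet source, path, and data are illustrative assumptions):

df <- createDataFrame(faithful)
# The first call creates the data; the second appends the same rows instead of failing.
write.df(df, path = "/tmp/faithful_parquet", source = "parquet", mode = "error")
write.df(df, path = "/tmp/faithful_parquet", source = "parquet", mode = "append")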
@@ -2315,8 +2318,6 @@ setMethod("saveDF",
write.df(df, path, source, mode, ...)
})
-#' saveAsTable
-#'
#' Save the contents of the SparkDataFrame to a data source as a table
#'
#' The data source is specified by the `source` and a set of options (...).
@@ -2543,11 +2544,12 @@ setMethod("fillna",
dataFrame(sdf)
})
+#' Download data from a SparkDataFrame into a data.frame
+#'
#' This function downloads the contents of a SparkDataFrame into an R's data.frame.
#' Since data.frames are held in memory, ensure that you have enough memory
#' in your system to accommodate the contents.
#'
-#' @title Download data from a SparkDataFrame into a data.frame
#' @param x a SparkDataFrame
#' @return a data.frame
#' @family SparkDataFrame functions
@@ -2563,13 +2565,14 @@ setMethod("as.data.frame",
as.data.frame(collect(x), row.names, optional, ...)
})
+#' Attach SparkDataFrame to R search path
+#'
#' The specified SparkDataFrame is attached to the R search path. This means that
#' the SparkDataFrame is searched by R when evaluating a variable, so columns in
#' the SparkDataFrame can be accessed by simply giving their names.
#'
#' @family SparkDataFrame functions
#' @rdname attach
-#' @title Attach SparkDataFrame to R search path
#' @param what (SparkDataFrame) The SparkDataFrame to attach
#' @param pos (integer) Specify position in search() where to attach.
#' @param name (character) Name to use for the attached SparkDataFrame. Names
@@ -2590,13 +2593,15 @@ setMethod("attach",
})
#' Evaluate a R expression in an environment constructed from a SparkDataFrame
+#'
+#' Evaluate a R expression in an environment constructed from a SparkDataFrame
#' with() allows access to columns of a SparkDataFrame by simply referring to
#' their name. It appends every column of a SparkDataFrame into a new
#' environment. Then, the given expression is evaluated in this new
#' environment.
#'
#' @rdname with
-#' @title Evaluate a R expression in an environment constructed from a SparkDataFrame
+#' @family SparkDataFrame functions
#' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment.
#' @param expr (expression) Expression to evaluate.
#' @param ... arguments to be passed to future methods.
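As a usage sketch of with() as documented above (the column arithmetic is only an illustration):

df <- createDataFrame(faithful)
# Columns can be referenced by bare name inside with(); the expression yields a Column.
totalCol <- with(df, eruptions + waiting)
head(select(df, totalCol))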
@@ -2612,10 +2617,12 @@ setMethod("with",
eval(substitute(expr), envir = newEnv, enclos = newEnv)
})
+#' Compactly display the structure of a dataset
+#'
#' Display the structure of a SparkDataFrame, including column names, column types, as well as a
#' a small sample of rows.
+#'
#' @name str
-#' @title Compactly display the structure of a dataset
#' @rdname str
#' @family SparkDataFrame functions
#' @param object a SparkDataFrame
@@ -2728,10 +2735,11 @@ setMethod("drop",
base::drop(x)
})
+#' Compute histogram statistics for given column
+#'
#' This function computes a histogram for a given SparkR Column.
#'
#' @name histogram
-#' @title Histogram
#' @param nbins the number of bins (optional). Default value is 10.
#' @param df the SparkDataFrame containing the Column to build the histogram from.
#' @param colname the name of the column to build the histogram from.
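A minimal sketch of the histogram() helper described above (the column and bin count are illustrative):

df <- createDataFrame(faithful)
# The bins are computed on the cluster; the result is a small local data.frame.
histStats <- histogram(df, "waiting", nbins = 12)
head(histStats)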
@@ -2847,18 +2855,21 @@ setMethod("histogram",
return(histStats)
})
-#' Saves the content of the SparkDataFrame to an external database table via JDBC
+#' Save the content of SparkDataFrame to an external database table via JDBC.
#'
-#' Additional JDBC database connection properties can be set (...)
+#' Save the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC
+#' database connection properties can be set (...)
#'
#' Also, mode is used to specify the behavior of the save operation when
-#' data already exists in the data source. There are four modes: \cr
-#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr
-#' overwrite: Existing data is expected to be overwritten by the contents of this
-#' SparkDataFrame. \cr
-#' error: An exception is expected to be thrown. \cr
-#' ignore: The save operation is expected to not save the contents of the SparkDataFrame
-#' and to not change the existing data. \cr
+#' data already exists in the data source. There are four modes:
+#' \itemize{
+#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data.
+#' \item overwrite: Existing data is expected to be overwritten by the contents of this
+#' SparkDataFrame.
+#' \item error: An exception is expected to be thrown.
+#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame
+#' and to not change the existing data.
+#' }
#'
#' @param x A SparkDataFrame
#' @param url JDBC database url of the form `jdbc:subprotocol:subname`
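A hedged sketch of the JDBC writer documented above; the URL, table name, and credentials are placeholders, and extra connection properties are passed through `...`:

write.jdbc(df, url = "jdbc:postgresql://localhost/testdb",
           tableName = "people", mode = "overwrite",
           user = "spark", password = "secret")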
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index f1badf4364..72a8052565 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -19,9 +19,11 @@
setOldClass("jobj")
-#' @title S4 class that represents an RDD
-#' @description RDD can be created using functions like
+#' S4 class that represents an RDD
+#'
+#' RDD can be created using functions like
#' \code{parallelize}, \code{textFile} etc.
+#'
#' @rdname RDD
#' @seealso parallelize, textFile
#' @slot env An R environment that stores bookkeeping states of the RDD
@@ -497,9 +499,9 @@ setMethod("map",
lapply(X, FUN)
})
-#' Flatten results after apply a function to all elements
+#' Flatten results after applying a function to all elements
#'
-#' This function return a new RDD by first applying a function to all
+#' This function returns a new RDD by first applying a function to all
#' elements of this RDD, and then flattening the results.
#'
#' @param X The RDD to apply the transformation.
@@ -713,7 +715,7 @@ setMethod("sumRDD",
reduce(x, "+")
})
-#' Applies a function to all elements in an RDD, and force evaluation.
+#' Applies a function to all elements in an RDD, and forces evaluation.
#'
#' @param x The RDD to apply the function
#' @param func The function to be applied.
@@ -737,7 +739,7 @@ setMethod("foreach",
invisible(collect(mapPartitions(x, partition.func)))
})
-#' Applies a function to each partition in an RDD, and force evaluation.
+#' Applies a function to each partition in an RDD, and forces evaluation.
#'
#' @examples
#'\dontrun{
diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R
index 581176a6c0..d8405420d0 100644
--- a/R/pkg/R/WindowSpec.R
+++ b/R/pkg/R/WindowSpec.R
@@ -20,9 +20,10 @@
#' @include generics.R jobj.R column.R
NULL
-#' @title S4 class that represents a WindowSpec
-#' @description WindowSpec can be created by using window.partitionBy()
-#' or window.orderBy()
+#' S4 class that represents a WindowSpec
+#'
+#' WindowSpec can be created by using window.partitionBy() or window.orderBy()
+#'
#' @rdname WindowSpec
#' @seealso \link{window.partitionBy}, \link{window.orderBy}
#'
diff --git a/R/pkg/R/broadcast.R b/R/pkg/R/broadcast.R
index 38f0eed95e..398dffc4ab 100644
--- a/R/pkg/R/broadcast.R
+++ b/R/pkg/R/broadcast.R
@@ -23,9 +23,11 @@
.broadcastValues <- new.env()
.broadcastIdToName <- new.env()
-# @title S4 class that represents a Broadcast variable
-# @description Broadcast variables can be created using the broadcast
-# function from a \code{SparkContext}.
+# S4 class that represents a Broadcast variable
+#
+# Broadcast variables can be created using the broadcast
+# function from a \code{SparkContext}.
+#
# @rdname broadcast-class
# @seealso broadcast
#
diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
index 873e8b1665..cc2876ed94 100644
--- a/R/pkg/R/column.R
+++ b/R/pkg/R/column.R
@@ -22,8 +22,10 @@ NULL
setOldClass("jobj")
-#' @title S4 class that represents a SparkDataFrame column
-#' @description The column class supports unary, binary operations on SparkDataFrame columns
+#' S4 class that represents a SparkDataFrame column
+#'
+#' The column class supports unary, binary operations on SparkDataFrame columns
+#'
#' @rdname column
#'
#' @slot jc reference to JVM SparkDataFrame column
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 44bca877fd..5c886030ff 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -173,9 +173,8 @@ includePackage <- function(sc, pkg) {
.sparkREnv$.packages <- packages
}
-#' @title Broadcast a variable to all workers
+#' Broadcast a variable to all workers
#'
-#' @description
#' Broadcast a read-only variable to the cluster, returning a \code{Broadcast}
#' object for reading it in distributed functions.
#'
@@ -207,7 +206,7 @@ broadcast <- function(sc, object) {
Broadcast(id, object, jBroadcast, objName)
}
-#' @title Set the checkpoint directory
+#' Set the checkpoint directory
#'
#' Set the directory under which RDDs are going to be checkpointed. The
#' directory must be a HDFS path if running on a cluster.
@@ -226,30 +225,31 @@ setCheckpointDir <- function(sc, dirName) {
invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName))))
}
-#' @title Run a function over a list of elements, distributing the computations with Spark.
+#' Run a function over a list of elements, distributing the computations with Spark.
#'
-#' @description
#' Applies a function in a manner that is similar to doParallel or lapply to elements of a list.
#' The computations are distributed using Spark. It is conceptually the same as the following code:
#' lapply(list, func)
#'
#' Known limitations:
-#' - variable scoping and capture: compared to R's rich support for variable resolutions, the
-# distributed nature of SparkR limits how variables are resolved at runtime. All the variables
-# that are available through lexical scoping are embedded in the closure of the function and
-# available as read-only variables within the function. The environment variables should be
-# stored into temporary variables outside the function, and not directly accessed within the
-# function.
+#' \itemize{
+#' \item variable scoping and capture: compared to R's rich support for variable resolutions,
+#' the distributed nature of SparkR limits how variables are resolved at runtime. All the
+#' variables that are available through lexical scoping are embedded in the closure of the
+#' function and available as read-only variables within the function. The environment variables
+#' should be stored into temporary variables outside the function, and not directly accessed
+#' within the function.
#'
-#' - loading external packages: In order to use a package, you need to load it inside the
-#' closure. For example, if you rely on the MASS module, here is how you would use it:
-#'\dontrun{
-#' train <- function(hyperparam) {
-#' library(MASS)
-#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam)
-#' model
+#' \item loading external packages: In order to use a package, you need to load it inside the
+#' closure. For example, if you rely on the MASS module, here is how you would use it:
+#' \preformatted{
+#' train <- function(hyperparam) {
+#' library(MASS)
+#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam)
+#' model
+#' }
+#' }
#' }
-#'}
#'
#' @rdname spark.lapply
#' @param sc Spark Context to use
@@ -259,7 +259,8 @@ setCheckpointDir <- function(sc, dirName) {
#' @export
#' @examples
#'\dontrun{
-#' doubled <- spark.lapply(1:10, function(x){2 * x})
+#' sc <- sparkR.init()
+#' doubled <- spark.lapply(sc, 1:10, function(x){2 * x})
#'}
spark.lapply <- function(sc, list, func) {
rdd <- parallelize(sc, list, length(list))
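The package-loading rule from the itemized limitations above, as a runnable sketch (the ridge-regression call and hyperparameter values are illustrative):

sc <- sparkR.init()
models <- spark.lapply(sc, c(0.1, 1, 10), function(lambda) {
  library(MASS)                                    # load dependencies inside the closure
  lm.ridge(mpg ~ wt + hp, data = mtcars, lambda = lambda)
})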
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 2665d1d477..a779127b37 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2185,7 +2185,7 @@ setMethod("from_unixtime", signature(x = "Column"),
#' # 09:01:15-09:02:15...
#' window(df$time, "1 minute", startTime = "15 seconds")
#'
-#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ...
+#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ...
#' window(df$time, "30 seconds", "10 seconds")
#'}
setMethod("window", signature(x = "Column"),
diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
index b704776917..eba083fe4b 100644
--- a/R/pkg/R/group.R
+++ b/R/pkg/R/group.R
@@ -22,8 +22,10 @@ NULL
setOldClass("jobj")
-#' @title S4 class that represents a GroupedData
-#' @description GroupedDatas can be created using groupBy() on a SparkDataFrame
+#' S4 class that represents a GroupedData
+#'
+#' GroupedDatas can be created using groupBy() on a SparkDataFrame
+#'
#' @rdname GroupedData
#' @seealso groupBy
#'
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index d4152b43b6..ba2eee2fca 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -25,22 +25,26 @@
# - a set of methods that reflect the arguments of the other languages supported by Spark. These
# methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc.
-#' @title S4 class that represents a generalized linear model
+#' S4 class that represents a generalized linear model
+#'
#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
#' @export
setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
-#' @title S4 class that represents a NaiveBayesModel
+#' S4 class that represents a NaiveBayesModel
+#'
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
#' @export
setClass("NaiveBayesModel", representation(jobj = "jobj"))
-#' @title S4 class that represents a AFTSurvivalRegressionModel
+#' S4 class that represents a AFTSurvivalRegressionModel
+#'
#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper
#' @export
setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
-#' @title S4 class that represents a KMeansModel
+#' S4 class that represents a KMeansModel
+#'
#' @param jobj a Java object reference to the backing Scala KMeansModel
#' @export
setClass("KMeansModel", representation(jobj = "jobj"))
@@ -197,7 +201,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
invisible(x)
}
-#' Make predictions from a generalized linear model
+#' Predicted values based on model
#'
#' Makes predictions from a generalized linear model produced by glm() or spark.glm(),
#' similarly to R's predict().
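A short sketch pairing spark.glm() with the predict() method documented here (the dataset and family are assumptions):

df <- createDataFrame(iris)   # SparkR replaces '.' in column names with '_'
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
head(select(predict(model, df), "Sepal_Length", "prediction"))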
@@ -218,9 +222,9 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
})
-#' Make predictions from a naive Bayes model
+#' Predicted values based on model
#'
-#' Makes predictions from a model produced by spark.naiveBayes(),
+#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(),
#' similarly to R package e1071's predict.
#'
#' @param object A fitted naive Bayes model
@@ -357,9 +361,9 @@ setMethod("summary", signature(object = "KMeansModel"),
cluster = cluster, is.loaded = is.loaded))
})
-#' Make predictions from a k-means model
+#' Predicted values based on model
#'
-#' Make predictions from a model produced by spark.kmeans().
+#' Makes predictions from a k-means model or a model produced by spark.kmeans().
#'
#' @param object A fitted k-means model
#' @param newData SparkDataFrame for testing
@@ -402,6 +406,8 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form
return(new("NaiveBayesModel", jobj = jobj))
})
+#' Save fitted MLlib model to the input path
+#'
#' Save the Bernoulli naive Bayes model to the input path.
#'
#' @param object A fitted Bernoulli naive Bayes model
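As a sketch of the write.ml() persistence documented here (the data set, formula, and output path are illustrative assumptions):

df <- createDataFrame(as.data.frame(UCBAdmissions))
model <- spark.naiveBayes(df, Admit ~ Gender + Dept)
write.ml(model, "/tmp/nb_model")          # save the fitted model to the given path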
@@ -428,6 +434,8 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
invisible(callJMethod(writer, "save", path))
})
+#' Save fitted MLlib model to the input path
+#'
#' Save the AFT survival regression model to the input path.
#'
#' @param object A fitted AFT survival regression model
@@ -453,6 +461,8 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c
invisible(callJMethod(writer, "save", path))
})
+#' Save fitted MLlib model to the input path
+#'
#' Save the generalized linear model to the input path.
#'
#' @param object A fitted generalized linear model
@@ -478,6 +488,8 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat
invisible(callJMethod(writer, "save", path))
})
+#' Save fitted MLlib model to the input path
+#'
#' Save the k-means model to the input path.
#'
#' @param object A fitted k-means model
@@ -582,9 +594,9 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"),
return(list(coefficients = coefficients))
})
-#' Make predictions from an AFT survival regression model
+#' Predicted values based on model
#'
-#' Make predictions from a model produced by spark.survreg(),
+#' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(),
#' similarly to R package survival's predict.
#'
#' @param object A fitted AFT survival regression model
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 12e4f4f1ae..b1b8adaa66 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -110,9 +110,11 @@ isRDD <- function(name, env) {
#' @return the hash code as an integer
#' @export
#' @examples
+#'\dontrun{
#' hashCode(1L) # 1
#' hashCode(1.0) # 1072693248
#' hashCode("1") # 49
+#'}
hashCode <- function(key) {
if (class(key) == "integer") {
as.integer(key[[1]])