#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# stats.R - Statistical functions for SparkDataFrames.

setOldClass("jobj")

#' Computes a pair-wise frequency table of the given columns
#'
#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
#' non-zero pair frequencies will be returned.
#'
#' @param x a SparkDataFrame
#' @param col1 name of the first column. Its distinct values become the first item of each
#'             output row.
#' @param col2 name of the second column. Its distinct values become the column names of the
#'             output.
#' @return a local R data.frame representing the contingency table. The first column of each row
#'         will be the distinct values of \code{col1} and the column names will be the distinct values
#'         of \code{col2}. The name of the first column will be "\code{col1}_\code{col2}". Pairs
#'         that have no occurrences will have zero as their counts.
#'
#' @rdname crosstab
#' @name crosstab
#' @aliases crosstab,SparkDataFrame,character,character-method
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
#' ct <- crosstab(df, "title", "gender")
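#' # ct is a local R data.frame; with these inputs its first column would be
#' # named "title_gender" (a hypothetical inspection of the result):
#' head(ct)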
#' }
#' @note crosstab since 1.5.0
setMethod("crosstab",
          signature(x = "SparkDataFrame", col1 = "character", col2 = "character"),
          function(x, col1, col2) {
            statFunctions <- callJMethod(x@sdf, "stat")
            sct <- callJMethod(statFunctions, "crosstab", col1, col2)
            collect(dataFrame(sct))
          })

#' Calculate the sample covariance of two numerical columns of a SparkDataFrame.
#'
#' @param x a SparkDataFrame.
#' @param colName1 the name of the first column.
#' @param colName2 the name of the second column.
#' @return The covariance of the two columns.
#'
#' @rdname cov
#' @name cov
#' @aliases cov,SparkDataFrame-method
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
#' cov <- cov(df, "title", "gender")
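#' # covariance is only meaningful for numeric columns; a sketch assuming
#' # hypothetical numeric columns "rating" and "income":
#' cov2 <- cov(df, "rating", "income")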
#' }
#' @note cov since 1.6.0
setMethod("cov",
          signature(x = "SparkDataFrame"),
          function(x, colName1, colName2) {
            stopifnot(is.character(colName1), is.character(colName2))
            statFunctions <- callJMethod(x@sdf, "stat")
            callJMethod(statFunctions, "cov", colName1, colName2)
          })

#' Calculates the correlation of two columns of a SparkDataFrame.
#' Currently only supports the Pearson Correlation Coefficient.
#' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics.
#'
#' @param x a SparkDataFrame.
#' @param colName1 the name of the first column.
#' @param colName2 the name of the second column.
#' @param method an optional character string specifying the method for computing the
#'               correlation. Only "pearson" is currently supported.
#' @return The Pearson Correlation Coefficient as a Double.
#'
#' @rdname corr
#' @name corr
#' @aliases corr,SparkDataFrame-method
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
#' corr <- corr(df, "title", "gender")
#' corr <- corr(df, "title", "gender", method = "pearson")
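#' # For Spearman correlation on data small enough to collect, one could fall
#' # back to base R (a sketch, assuming hypothetical numeric columns "x1", "x2"):
#' localDF <- collect(select(df, "x1", "x2"))
#' spearman <- cor(localDF$x1, localDF$x2, method = "spearman")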
#' }
#' @note corr since 1.6.0
setMethod("corr",
          signature(x = "SparkDataFrame"),
          function(x, colName1, colName2, method = "pearson") {
            stopifnot(is.character(colName1), is.character(colName2))
            statFunctions <- callJMethod(x@sdf, "stat")
            callJMethod(statFunctions, "corr", colName1, colName2, method)
          })

#' Finding frequent items for columns, possibly with false positives
#'
#' Finds frequent items for columns, possibly with false positives, using the
#' frequent element count algorithm described in
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker,
#' and Papadimitriou.
#'
#' @param x A SparkDataFrame.
#' @param cols A vector of column names to search for frequent items in.
#' @param support (Optional) The minimum frequency for an item to be considered \code{frequent}.
#'                Should be greater than 1e-4. Defaults to 0.01.
#' @return a local R data.frame with the frequent items in each column
#'
#' @rdname freqItems
#' @name freqItems
#' @aliases freqItems,SparkDataFrame,character-method
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
#' fi <- freqItems(df, c("title", "gender"))
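#' # with an explicit minimum support threshold (a hypothetical value):
#' fi2 <- freqItems(df, c("title", "gender"), support = 0.04)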
#' }
#' @note freqItems since 1.6.0
setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
          function(x, cols, support = 0.01) {
            statFunctions <- callJMethod(x@sdf, "stat")
            sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support)
            collect(dataFrame(sct))
          })

#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame
#'
#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame.
#' The result of this algorithm has the following deterministic bound:
#' If the SparkDataFrame has N elements and if we request the quantile at probability p up to
#' error err, then the algorithm will return a sample x from the SparkDataFrame so that the
#' *exact* rank of x is close to (p * N). More precisely,
#'   floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
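#' For example, with N = 1000, p = 0.5 and err = 0.1, the returned sample is guaranteed to
#' have an exact rank between floor(0.4 * 1000) = 400 and ceil(0.6 * 1000) = 600.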
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
#' optimizations). The algorithm was first presented in "Space-efficient Online Computation of
#' Quantile Summaries" by Greenwald and Khanna (\url{http://dx.doi.org/10.1145/375663.375670}).
#' Note that rows containing any NA values will be removed before calculation.
#'
#' @param x A SparkDataFrame.
#' @param cols A single column name, or a list of names for multiple columns.
#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
#'                      For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
#'                      the exact quantiles are computed, which could be very expensive.
#'                      Note that values greater than 1 are accepted but give the same result as 1.
#' @return The approximate quantiles at the given probabilities. If the input is a single
#'         column name, the output is a list of approximate quantiles in that column; if the
#'         input is multiple column names, the output is a list of lists, where each inner
#'         list holds the approximate quantiles of the corresponding column.
#'
#' @rdname approxQuantile
#' @name approxQuantile
#' @aliases approxQuantile,SparkDataFrame,character,numeric,numeric-method
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
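#' # multiple columns return a list of lists (a sketch, assuming a second
#' # hypothetical numeric column "value"):
#' quantiles2 <- approxQuantile(df, c("key", "value"), c(0.5, 0.8), 0.0)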
#' }
#' @note approxQuantile since 2.0.0
setMethod("approxQuantile",
          signature(x = "SparkDataFrame", cols = "character",
                    probabilities = "numeric", relativeError = "numeric"),
          function(x, cols, probabilities, relativeError) {
            statFunctions <- callJMethod(x@sdf, "stat")
            quantiles <- callJMethod(statFunctions, "approxQuantile", as.list(cols),
                                     as.list(probabilities), relativeError)
            if (length(cols) == 1) {
              quantiles[[1]]
            } else {
              quantiles
            }
          })

#' Returns a stratified sample without replacement
#'
#' Returns a stratified sample without replacement based on the fraction given for each
#' stratum.
#'
#' @param x A SparkDataFrame
#' @param col column that defines strata
#' @param fractions A named list giving sampling fraction for each stratum. If a stratum is
#'                  not specified, we treat its fraction as zero.
#' @param seed random seed
#' @return A new SparkDataFrame that represents the stratified sample
#'
#' @rdname sampleBy
#' @aliases sampleBy,SparkDataFrame,character,list,numeric-method
#' @name sampleBy
#' @family stat functions
#' @export
#' @examples
#' \dontrun{
#' df <- read.json("/path/to/file.json")
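#' # a named list mapping (hypothetical) strata of "key" to sampling fractions:
#' fractions <- list("0" = 0.1, "1" = 0.2)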
#' sample <- sampleBy(df, "key", fractions, 36)
#' }
#' @note sampleBy since 1.6.0
setMethod("sampleBy",
          signature(x = "SparkDataFrame", col = "character",
                    fractions = "list", seed = "numeric"),
          function(x, col, fractions, seed) {
            fractionsEnv <- convertNamedListToEnv(fractions)

            statFunctions <- callJMethod(x@sdf, "stat")
            # The seed is expected to be a Long on the Scala side; convert it to an
            # integer here due to a current SerDe limitation.
            sdf <- callJMethod(statFunctions, "sampleBy", col, fractionsEnv, as.integer(seed))
            dataFrame(sdf)
          })