[SPARK-13734][SPARKR] Added histogram function

## What changes were proposed in this pull request? Added method histogram() to compute the histogram of a Column. Usage: ``` ## Create a DataFrame from the Iris dataset irisDF <- createDataFrame(sqlContext, iris) ## Render a histogram for the Sepal_Length column histogram(irisDF, "Sepal_Length", nbins=12) ``` ![histogram](https://cloud.githubusercontent.com/assets/13985649/13588486/e1e751c6-e484-11e5-85db-2fc2115c4bb2.png) Note: Usage will change once SPARK-9325 is figured out so that histogram() only takes a Column as a parameter, as opposed to a DataFrame and a name. ## How was this patch tested? All unit tests pass. I added specific unit cases for different scenarios. Author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com> Author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.attlocal.net> Closes #11569 from olarayej/SPARK-13734.
author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com> 2016-04-26 15:34:30 -0700
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2016-04-26 15:34:30 -0700
commit: 0c99c23b7d9f0c3538cd2b062d551411712a2bcc (patch)
tree: fc7d7cc02559756f50b3dd4a5f262e5aa822412e /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent: 75879ac3c07f3b1a708f4392429335feb06f271b (diff)
download: spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.tar.gz
spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.tar.bz2
spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.zip
1 files changed, 45 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 9244c5621b..336068035e 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1972,6 +1972,51 @@ test_that("Method str()", {
   expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
 })
 
+test_that("Histogram", {
+
+  # Column given by name, 8 bins: bins, counts and centroids must all match
+  expect_true(
+    all(histogram(irisDF, "Petal_Width", 8) ==
+          data.frame(bins = seq(0, 7),
+                     counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                     centroids = seq(0, 7) * 0.3 + 0.25)))
+
+  # Same expected histogram when the column is passed as a Column object
+  expect_true(
+    all(histogram(irisDF, irisDF$Petal_Width, 8) ==
+          data.frame(bins = seq(0, 7),
+                     counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                     centroids = seq(0, 7) * 0.3 + 0.25)))
+
+  # Derived column (Petal_Width + 1): same counts, centroids shifted by 1.
+  # The result is rounded to 2 decimals before the exact `==` comparison.
+  expect_true(
+    all(round(histogram(irisDF, irisDF$Petal_Width + 1, 8), 2) ==
+          data.frame(bins = seq(0, 7),
+                     counts = c(48, 2, 7, 21, 24, 19, 15, 14),
+                     centroids = seq(0, 7) * 0.3 + 1.25)))
+
+  # Missing nbins: the result is expected to contain 10 bins
+  expect_equal(length(histogram(irisDF, "Petal_Width")$counts), 10)
+
+  # Wrong colname
+  expect_error(histogram(irisDF, "xxx"),
+               "Specified colname does not belong to the given SparkDataFrame.")
+
+  # Invalid nbins
+  expect_error(histogram(irisDF, "Petal_Width", nbins = 0),
+               "The number of bins must be a positive integer number greater than 1.")
+
+  # Test against R's hist. Uses expect_true rather than comparing with the
+  # reassignable shorthand `T`.
+  expect_true(all(hist(iris$Sepal.Width)$counts ==
+                  histogram(irisDF, "Sepal_Width", 12)$counts))
+
+  # Test when there are zero counts
+  # (the outlier 100 leaves every bin between the first and the last empty)
+  df <- as.DataFrame(sqlContext, data.frame(x = c(1, 2, 3, 4, 100)))
+  expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1))
+})
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)
author	Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com>	2016-04-26 15:34:30 -0700
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2016-04-26 15:34:30 -0700
commit	0c99c23b7d9f0c3538cd2b062d551411712a2bcc (patch)
tree	fc7d7cc02559756f50b3dd4a5f262e5aa822412e /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent	75879ac3c07f3b1a708f4392429335feb06f271b (diff)
download	spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.tar.gz spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.tar.bz2 spark-0c99c23b7d9f0c3538cd2b062d551411712a2bcc.zip