about summary refs log tree commit diff
path: root/examples/src
diff options
context:
space:
mode:
author	Yanbo Liang <ybliang8@gmail.com>	2016-05-20 09:30:20 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-05-20 09:30:20 -0700
commit	9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2 (patch)
tree	9d8006c4641cc7a76b0b0c37b7b583c4813408ec /examples/src
parent	a3ceb875c64421ced8e52db6d8e51aec9b758e3e (diff)
download	spark-9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2.tar.gz
spark-9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2.tar.bz2
spark-9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2.zip
[SPARK-15222][SPARKR][ML] SparkR ML examples update in 2.0
## What changes were proposed in this pull request?

Update example code in examples/src/main/r/ml.R to reflect the new algorithms:
* spark.glm and glm
* spark.survreg
* spark.naiveBayes
* spark.kmeans

## How was this patch tested?

Offline test.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13000 from yanboliang/spark-15222.
Diffstat (limited to 'examples/src')
-rw-r--r--	examples/src/main/r/ml.R	129
1 file changed, 112 insertions, 17 deletions
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R
index a0c903939c..fd35936635 100644
--- a/examples/src/main/r/ml.R
+++ b/examples/src/main/r/ml.R
@@ -16,7 +16,7 @@
#
# To run this example use
-# ./bin/sparkR examples/src/main/r/ml.R
+# ./bin/spark-submit examples/src/main/r/ml.R
# Load SparkR library into your R session
library(SparkR)
@@ -25,30 +25,125 @@ library(SparkR)
sc <- sparkR.init(appName="SparkR-ML-example")
sqlContext <- sparkRSQL.init(sc)
-# Train GLM of family 'gaussian'
-training1 <- suppressWarnings(createDataFrame(sqlContext, iris))
-test1 <- training1
-model1 <- glm(Sepal_Length ~ Sepal_Width + Species, training1, family = "gaussian")
+############################ spark.glm and glm ##############################################
+
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
# Model summary
-summary(model1)
+summary(gaussianGLM)
# Prediction
-predictions1 <- predict(model1, test1)
-head(select(predictions1, "Sepal_Length", "prediction"))
+gaussianPredictions <- predict(gaussianGLM, gaussianTestDF)
+showDF(gaussianPredictions)
+
+# Fit a generalized linear model with glm (R-compliant)
+gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian")
+summary(gaussianGLM2)
+
+# Fit a generalized linear model of family "binomial" with spark.glm
+binomialDF <- filter(irisDF, irisDF$Species != "setosa")
+binomialTestDF <- binomialDF
+binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial")
+
+# Model summary
+summary(binomialGLM)
+
+# Prediction
+binomialPredictions <- predict(binomialGLM, binomialTestDF)
+showDF(binomialPredictions)
+
+############################ spark.survreg ##############################################
+
+# Use the ovarian dataset available in R survival package
+library(survival)
-# Train GLM of family 'binomial'
-training2 <- filter(training1, training1$Species != "setosa")
-test2 <- training2
-model2 <- glm(Species ~ Sepal_Length + Sepal_Width, data = training2, family = "binomial")
+# Fit an accelerated failure time (AFT) survival regression model with spark.survreg
+ovarianDF <- suppressWarnings(createDataFrame(sqlContext, ovarian))
+aftDF <- ovarianDF
+aftTestDF <- ovarianDF
+aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx)
# Model summary
-summary(model2)
+summary(aftModel)
+
+# Prediction
+aftPredictions <- predict(aftModel, aftTestDF)
+showDF(aftPredictions)
+
+############################ spark.naiveBayes ##############################################
+
+# Fit a Bernoulli naive Bayes model with spark.naiveBayes
+titanic <- as.data.frame(Titanic)
+titanicDF <- suppressWarnings(createDataFrame(sqlContext, titanic[titanic$Freq > 0, -5]))
+nbDF <- titanicDF
+nbTestDF <- titanicDF
+nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age)
+
+# Model summary
+summary(nbModel)
+
+# Prediction
+nbPredictions <- predict(nbModel, nbTestDF)
+showDF(nbPredictions)
+
+############################ spark.kmeans ##############################################
+
+# Fit a k-means model with spark.kmeans
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+kmeansDF <- irisDF
+kmeansTestDF <- irisDF
+kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+ k = 3)
+
+# Model summary
+summary(kmeansModel)
+
+# Get fitted result from the k-means model
+showDF(fitted(kmeansModel))
+
+# Prediction
+kmeansPredictions <- predict(kmeansModel, kmeansTestDF)
+showDF(kmeansPredictions)
+
+############################ model read/write ##############################################
+
+irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+# Fit a generalized linear model of family "gaussian" with spark.glm
+gaussianDF <- irisDF
+gaussianTestDF <- irisDF
+gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
+
+# Save and then load a fitted MLlib model
+modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
+write.ml(gaussianGLM, modelPath)
+gaussianGLM2 <- read.ml(modelPath)
+
+# Check model summary
+summary(gaussianGLM2)
+
+# Check model prediction
+gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF)
+showDF(gaussianPredictions)
+
+unlink(modelPath)
+
+############################ fit models with spark.lapply #####################################
+
+# Perform distributed training of multiple models with spark.lapply
+families <- c("gaussian", "poisson")
+train <- function(family) {
+ model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family)
+ summary(model)
+}
+model.summaries <- spark.lapply(sc, families, train)
+
+# Print the summary of each model
+print(model.summaries)
-# Prediction (Currently the output of prediction for binomial GLM is the indexed label,
-# we need to transform back to the original string label later)
-predictions2 <- predict(model2, test2)
-head(select(predictions2, "Species", "prediction"))
# Stop the SparkContext now
sparkR.stop()