From 9a9c6f5c22248c5a891e9d3b788ff12b6b4718b2 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 20 May 2016 09:30:20 -0700 Subject: [SPARK-15222][SPARKR][ML] SparkR ML examples update in 2.0 ## What changes were proposed in this pull request? Update example code in examples/src/main/r/ml.R to reflect the new algorithms. * spark.glm and glm * spark.survreg * spark.naiveBayes * spark.kmeans ## How was this patch tested? Offline test. Author: Yanbo Liang Closes #13000 from yanboliang/spark-15222. --- examples/src/main/r/ml.R | 129 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 112 insertions(+), 17 deletions(-) (limited to 'examples/src') diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R index a0c903939c..fd35936635 100644 --- a/examples/src/main/r/ml.R +++ b/examples/src/main/r/ml.R @@ -16,7 +16,7 @@ # # To run this example use -# ./bin/sparkR examples/src/main/r/ml.R +# ./bin/spark-submit examples/src/main/r/ml.R # Load SparkR library into your R session library(SparkR) @@ -25,30 +25,125 @@ library(SparkR) sc <- sparkR.init(appName="SparkR-ML-example") sqlContext <- sparkRSQL.init(sc) -# Train GLM of family 'gaussian' -training1 <- suppressWarnings(createDataFrame(sqlContext, iris)) -test1 <- training1 -model1 <- glm(Sepal_Length ~ Sepal_Width + Species, training1, family = "gaussian") +############################ spark.glm and glm ############################################## + +irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) +# Fit a generalized linear model of family "gaussian" with spark.glm +gaussianDF <- irisDF +gaussianTestDF <- irisDF +gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") # Model summary -summary(model1) +summary(gaussianGLM) # Prediction -predictions1 <- predict(model1, test1) -head(select(predictions1, "Sepal_Length", "prediction")) +gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) +showDF(gaussianPredictions) + +# Fit a generalized linear model with glm (R-compliant) +gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") +summary(gaussianGLM2) + +# Fit a generalized linear model of family "binomial" with spark.glm +binomialDF <- filter(irisDF, irisDF$Species != "setosa") +binomialTestDF <- binomialDF +binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") + +# Model summary +summary(binomialGLM) + +# Prediction +binomialPredictions <- predict(binomialGLM, binomialTestDF) +showDF(binomialPredictions) + +############################ spark.survreg ############################################## + +# Use the ovarian dataset available in R survival package +library(survival) -# Train GLM of family 'binomial' -training2 <- filter(training1, training1$Species != "setosa") -test2 <- training2 -model2 <- glm(Species ~ Sepal_Length + Sepal_Width, data = training2, family = "binomial") +# Fit an accelerated failure time (AFT) survival regression model with spark.survreg +ovarianDF <- suppressWarnings(createDataFrame(sqlContext, ovarian)) +aftDF <- ovarianDF +aftTestDF <- ovarianDF +aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx) # Model summary -summary(model2) +summary(aftModel) + +# Prediction +aftPredictions <- predict(aftModel, aftTestDF) +showDF(aftPredictions) + +############################ spark.naiveBayes ############################################## + +# Fit a Bernoulli naive Bayes model with spark.naiveBayes +titanic <- as.data.frame(Titanic) +titanicDF <- suppressWarnings(createDataFrame(sqlContext, titanic[titanic$Freq > 0, -5])) +nbDF <- titanicDF +nbTestDF <- titanicDF +nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age) + +# Model summary +summary(nbModel) + +# Prediction +nbPredictions <- predict(nbModel, nbTestDF) +showDF(nbPredictions) + +############################ spark.kmeans ############################################## + +# Fit a k-means model with spark.kmeans +irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) +kmeansDF <- irisDF +kmeansTestDF <- irisDF +kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, + k = 3) + +# Model summary +summary(kmeansModel) + +# Get fitted result from the k-means model +showDF(fitted(kmeansModel)) + +# Prediction +kmeansPredictions <- predict(kmeansModel, kmeansTestDF) +showDF(kmeansPredictions) + +############################ model read/write ############################################## + +irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) +# Fit a generalized linear model of family "gaussian" with spark.glm +gaussianDF <- irisDF +gaussianTestDF <- irisDF +gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") + +# Save and then load a fitted MLlib model +modelPath <- tempfile(pattern = "ml", fileext = ".tmp") +write.ml(gaussianGLM, modelPath) +gaussianGLM2 <- read.ml(modelPath) + +# Check model summary +summary(gaussianGLM2) + +# Check model prediction +gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF) +showDF(gaussianPredictions) + +unlink(modelPath) + +############################ fit models with spark.lapply ##################################### + +# Perform distributed training of multiple models with spark.lapply +families <- c("gaussian", "poisson") +train <- function(family) { + model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family) + summary(model) +} +model.summaries <- spark.lapply(sc, families, train) + +# Print the summary of each model +print(model.summaries) -# Prediction (Currently the output of prediction for binomial GLM is the indexed label, -# we need to transform back to the original string label later) -predictions2 <- predict(model2, test2) -head(select(predictions2, "Species", "prediction")) # Stop the SparkContext now sparkR.stop() -- cgit v1.2.3