diff options
Diffstat (limited to 'examples/src/main/r/ml.R')
-rw-r--r-- | examples/src/main/r/ml.R | 148 |
1 files changed, 0 insertions, 148 deletions
diff --git a/examples/src/main/r/ml.R b/examples/src/main/r/ml.R deleted file mode 100644 index a8a1274ac9..0000000000 --- a/examples/src/main/r/ml.R +++ /dev/null @@ -1,148 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# To run this example use -# ./bin/spark-submit examples/src/main/r/ml.R - -# Load SparkR library into your R session -library(SparkR) - -# Initialize SparkSession -sparkR.session(appName = "SparkR-ML-example") - -############################ spark.glm and glm ############################################## -# $example on:glm$ -irisDF <- suppressWarnings(createDataFrame(iris)) -# Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") - -# Model summary -summary(gaussianGLM) - -# Prediction -gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) -showDF(gaussianPredictions) - -# Fit a generalized linear model with glm (R-compliant) -gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") -summary(gaussianGLM2) - -# Fit a generalized linear model of family "binomial" with spark.glm -binomialDF <- filter(irisDF, irisDF$Species != "setosa") -binomialTestDF <- binomialDF -binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") - -# Model summary -summary(binomialGLM) - -# Prediction -binomialPredictions <- predict(binomialGLM, binomialTestDF) -showDF(binomialPredictions) -# $example off:glm$ -############################ spark.survreg ############################################## -# $example on:survreg$ -# Use the ovarian dataset available in R survival package -library(survival) - -# Fit an accelerated failure time (AFT) survival regression model with spark.survreg -ovarianDF <- suppressWarnings(createDataFrame(ovarian)) -aftDF <- ovarianDF -aftTestDF <- ovarianDF -aftModel <- spark.survreg(aftDF, Surv(futime, fustat) ~ ecog_ps + rx) - -# Model summary -summary(aftModel) - -# Prediction -aftPredictions <- predict(aftModel, aftTestDF) -showDF(aftPredictions) -# $example off:survreg$ -############################ spark.naiveBayes ############################################## -# $example on:naiveBayes$ -# Fit a Bernoulli naive Bayes model with spark.naiveBayes -titanic <- as.data.frame(Titanic) -titanicDF <- createDataFrame(titanic[titanic$Freq > 0, -5]) -nbDF <- titanicDF -nbTestDF <- titanicDF -nbModel <- spark.naiveBayes(nbDF, Survived ~ Class + Sex + Age) - -# Model summary -summary(nbModel) - -# Prediction -nbPredictions <- predict(nbModel, nbTestDF) -showDF(nbPredictions) -# $example off:naiveBayes$ -############################ spark.kmeans ############################################## -# $example on:kmeans$ -# Fit a k-means model with spark.kmeans -irisDF <- suppressWarnings(createDataFrame(iris)) -kmeansDF <- irisDF -kmeansTestDF <- irisDF -kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, - k = 3) - -# Model summary -summary(kmeansModel) - -# Get fitted result from the k-means model -showDF(fitted(kmeansModel)) - -# Prediction -kmeansPredictions <- predict(kmeansModel, kmeansTestDF) -showDF(kmeansPredictions) -# $example off:kmeans$ -############################ model read/write ############################################## -# $example on:read_write$ -irisDF <- suppressWarnings(createDataFrame(iris)) -# Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") - -# Save and then load a fitted MLlib model -modelPath <- tempfile(pattern = "ml", fileext = ".tmp") -write.ml(gaussianGLM, modelPath) -gaussianGLM2 <- read.ml(modelPath) - -# Check model summary -summary(gaussianGLM2) - -# Check model prediction -gaussianPredictions <- predict(gaussianGLM2, gaussianTestDF) -showDF(gaussianPredictions) - -unlink(modelPath) -# $example off:read_write$ -############################ fit models with spark.lapply ##################################### - -# Perform distributed training of multiple models with spark.lapply -families <- c("gaussian", "poisson") -train <- function(family) { - model <- glm(Sepal.Length ~ Sepal.Width + Species, iris, family = family) - summary(model) -} -model.summaries <- spark.lapply(families, train) - -# Print the summary of each model -print(model.summaries) - - -# Stop the SparkSession now -sparkR.session.stop() |