diff options
author | wm624@hotmail.com <wm624@hotmail.com> | 2017-02-28 22:31:35 -0800 |
---|---|---|
committer | Felix Cheung <felixcheung@apache.org> | 2017-02-28 22:31:35 -0800 |
commit | 89cd3845b6edb165236a6498dcade033975ee276 (patch) | |
tree | 1aae82ffb40b20e0cd0befa89d816d2ad3671368 /examples | |
parent | 7315880568fd07d4dfb9f76d538f220e9d320c6f (diff) | |
download | spark-89cd3845b6edb165236a6498dcade033975ee276.tar.gz spark-89cd3845b6edb165236a6498dcade033975ee276.tar.bz2 spark-89cd3845b6edb165236a6498dcade033975ee276.zip |
[SPARK-19460][SPARKR] Update dataset used in R documentation, examples to reduce warning noise and confusion
## What changes were proposed in this pull request?
Replace the `iris` dataset with `Titanic` or other datasets in the examples and documentation.
## How was this patch tested?
Manual testing and existing tests.
Author: wm624@hotmail.com <wm624@hotmail.com>
Closes #17032 from wangmiao1981/example.
Diffstat (limited to 'examples')
-rw-r--r-- | examples/src/main/r/ml/bisectingKmeans.R | 11 | ||||
-rw-r--r-- | examples/src/main/r/ml/glm.R | 20 | ||||
-rw-r--r-- | examples/src/main/r/ml/kmeans.R | 10 | ||||
-rw-r--r-- | examples/src/main/r/ml/ml.R | 9 |
4 files changed, 28 insertions, 22 deletions
diff --git a/examples/src/main/r/ml/bisectingKmeans.R b/examples/src/main/r/ml/bisectingKmeans.R index 5fb5bfb0fa..b3eaa6dd86 100644 --- a/examples/src/main/r/ml/bisectingKmeans.R +++ b/examples/src/main/r/ml/bisectingKmeans.R @@ -25,20 +25,21 @@ library(SparkR) sparkR.session(appName = "SparkR-ML-bisectingKmeans-example") # $example on$ -irisDF <- createDataFrame(iris) +t <- as.data.frame(Titanic) +training <- createDataFrame(t) # Fit bisecting k-means model with four centers -model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4) # get fitted result from a bisecting k-means model fitted.model <- fitted(model, "centers") # Model summary -summary(fitted.model) +head(summary(fitted.model)) # fitted values on training data -fitted <- predict(model, df) -head(select(fitted, "Sepal_Length", "prediction")) +fitted <- predict(model, training) +head(select(fitted, "Class", "prediction")) # $example off$ sparkR.session.stop() diff --git a/examples/src/main/r/ml/glm.R b/examples/src/main/r/ml/glm.R index e41af97751..ee13910382 100644 --- a/examples/src/main/r/ml/glm.R +++ b/examples/src/main/r/ml/glm.R @@ -25,11 +25,12 @@ library(SparkR) sparkR.session(appName = "SparkR-ML-glm-example") # $example on$ -irisDF <- suppressWarnings(createDataFrame(iris)) +training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") # Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") +df_list <- randomSplit(training, c(7,3), 2) +gaussianDF <- df_list[[1]] +gaussianTestDF <- df_list[[2]] +gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian") # Model summary summary(gaussianGLM) @@ -39,14 +40,15 @@ gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) head(gaussianPredictions) # Fit a generalized 
linear model with glm (R-compliant) -gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") +gaussianGLM2 <- glm(label ~ features, gaussianDF, family = "gaussian") summary(gaussianGLM2) # Fit a generalized linear model of family "binomial" with spark.glm -# Note: Filter out "setosa" from label column (two labels left) to match "binomial" family. -binomialDF <- filter(irisDF, irisDF$Species != "setosa") -binomialTestDF <- binomialDF -binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") +training2 <- read.df("data/mllib/sample_binary_classification_data.txt", source = "libsvm") +df_list2 <- randomSplit(training2, c(7,3), 2) +binomialDF <- df_list2[[1]] +binomialTestDF <- df_list2[[2]] +binomialGLM <- spark.glm(binomialDF, label ~ features, family = "binomial") # Model summary summary(binomialGLM) diff --git a/examples/src/main/r/ml/kmeans.R b/examples/src/main/r/ml/kmeans.R index 288e2f9724..824df20644 100644 --- a/examples/src/main/r/ml/kmeans.R +++ b/examples/src/main/r/ml/kmeans.R @@ -26,10 +26,12 @@ sparkR.session(appName = "SparkR-ML-kmeans-example") # $example on$ # Fit a k-means model with spark.kmeans -irisDF <- suppressWarnings(createDataFrame(iris)) -kmeansDF <- irisDF -kmeansTestDF <- irisDF -kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, +t <- as.data.frame(Titanic) +training <- createDataFrame(t) +df_list <- randomSplit(training, c(7,3), 2) +kmeansDF <- df_list[[1]] +kmeansTestDF <- df_list[[2]] +kmeansModel <- spark.kmeans(kmeansDF, ~ Class + Sex + Age + Freq, k = 3) # Model summary diff --git a/examples/src/main/r/ml/ml.R b/examples/src/main/r/ml/ml.R index b96819418b..41b7867f64 100644 --- a/examples/src/main/r/ml/ml.R +++ b/examples/src/main/r/ml/ml.R @@ -26,11 +26,12 @@ sparkR.session(appName = "SparkR-ML-example") ############################ model read/write ############################################## # $example 
on:read_write$ -irisDF <- suppressWarnings(createDataFrame(iris)) +training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") # Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") +df_list <- randomSplit(training, c(7,3), 2) +gaussianDF <- df_list[[1]] +gaussianTestDF <- df_list[[2]] +gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian") # Save and then load a fitted MLlib model modelPath <- tempfile(pattern = "ml", fileext = ".tmp") |