author     Joseph K. Bradley <joseph@databricks.com>  2015-02-20 02:31:32 -0800
committer  Xiangrui Meng <meng@databricks.com>        2015-02-20 02:31:32 -0800
commit     4a17eedb16343413e5b6f8bb58c6da8952ee7ab6
tree       ac17fd9eed4f42ba3095b148c68d4e78b6afc875
parent     d3dfebebce9f76e4433e16d4d6d29fb8fa4d4193
[SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release
For SPARK-5867:
* The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API.
* It should also include Python examples now.

For SPARK-5892:
* Fix Python docs
* Various other cleanups

BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch, which is based on spark/branch-1.3 and cherry-picks the commits from this PR: [https://github.com/jkbradley/spark/tree/doc-review-1.3-check]

CC: mengxr (ML), davies (Python docs)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits:

f191bb0 [Joseph K. Bradley] small cleanups
e786efa [Joseph K. Bradley] small doc corrections
6b1ab4a [Joseph K. Bradley] fixed python lint test
946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql()
da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3
629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python
b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation
34b067f [Joseph K. Bradley] small doc correction
da16aef [Joseph K. Bradley] Fixed python mllib docs
8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc
695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs
a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs
b05a80d [Joseph K. Bradley] organize imports. doc cleanups
e572827 [Joseph K. Bradley] updated programming guide for ml and mllib
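The core API change repeated across the Java examples below replaces the register-a-temp-table-then-query pattern with direct DataFrame operations. A minimal Scala sketch of the two styles, assuming model, test, and sqlContext are set up as in the surrounding examples:

    // Old style: register the transformed DataFrame as a temp table, then query via SQL.
    model.transform(test).registerTempTable("prediction")
    val viaSql = sqlContext.sql("SELECT id, text, probability, prediction FROM prediction")

    // New style (this commit): select the columns on the DataFrame directly.
    val viaApi = model.transform(test).select("id", "text", "probability", "prediction")

Both yield the same rows; the direct form avoids the side-effecting temp-table registration and keeps column names out of an opaque SQL string.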
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java            |  7
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java              |  8
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java |  7
-rw-r--r--  examples/src/main/python/ml/simple_text_classification_pipeline.py                            |  4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala              |  2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala                       | 32
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala                |  6
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala   |  2
8 files changed, 35 insertions(+), 33 deletions(-)
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
index 5d8c5d0a92..9bbc14ea40 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
@@ -34,8 +34,8 @@ import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.CrossValidatorModel;
import org.apache.spark.ml.tuning.ParamGridBuilder;
import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
/**
* A simple example demonstrating model selection using CrossValidator.
@@ -115,9 +115,8 @@ public class JavaCrossValidatorExample {
DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents. cvModel uses the best model found (lrModel).
- cvModel.transform(test).registerTempTable("prediction");
- DataFrame predictions = jsql.sql("SELECT id, text, probability, prediction FROM prediction");
- for (Row r: predictions.collect()) {
+ DataFrame predictions = cvModel.transform(test);
+ for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
index 4c4d532388..4e02acce69 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -29,8 +29,8 @@ import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
/**
* A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -100,10 +100,8 @@ public class JavaSimpleParamsExample {
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
- model2.transform(test).registerTempTable("results");
- DataFrame results =
- jsql.sql("SELECT features, label, myProbability, prediction FROM results");
- for (Row r: results.collect()) {
+ DataFrame results = model2.transform(test);
+ for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
index fdcfc888c2..ef1ec103a8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -30,8 +30,8 @@ import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
/**
* A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
@@ -82,9 +82,8 @@ public class JavaSimpleTextClassificationPipeline {
DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class);
// Make predictions on test documents.
- model.transform(test).registerTempTable("prediction");
- DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction");
- for (Row r: predictions.collect()) {
+ DataFrame predictions = model.transform(test);
+ for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
index b4d9355b68..d281f4fa44 100644
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -16,10 +16,10 @@
#
from pyspark import SparkContext
-from pyspark.sql import SQLContext, Row
from pyspark.ml import Pipeline
-from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.feature import HashingTF, Tokenizer
+from pyspark.sql import Row, SQLContext
"""
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
index 7ab892cd75..6c0af20461 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -100,7 +100,7 @@ object CrossValidatorExample {
Document(7L, "apache hadoop")))
// Make predictions on test documents. cvModel uses the best model found (lrModel).
- cvModel.transform(test.toDF)
+ cvModel.transform(test.toDF())
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
index 96b2dd463e..25f21113bf 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
@@ -93,8 +93,8 @@ object MovieLensALS {
| bin/spark-submit --class org.apache.spark.examples.ml.MovieLensALS \
| examples/target/scala-*/spark-examples-*.jar \
| --rank 10 --maxIter 15 --regParam 0.1 \
- | --movies path/to/movielens/movies.dat \
- | --ratings path/to/movielens/ratings.dat
+ | --movies data/mllib/als/sample_movielens_movies.txt \
+ | --ratings data/mllib/als/sample_movielens_ratings.txt
""".stripMargin)
}
@@ -157,17 +157,23 @@ object MovieLensALS {
println(s"Test RMSE = $rmse.")
// Inspect false positives.
- predictions.registerTempTable("prediction")
- sc.textFile(params.movies).map(Movie.parseMovie).toDF().registerTempTable("movie")
- sqlContext.sql(
- """
- |SELECT userId, prediction.movieId, title, rating, prediction
- | FROM prediction JOIN movie ON prediction.movieId = movie.movieId
- | WHERE rating <= 1 AND prediction >= 4
- | LIMIT 100
- """.stripMargin)
- .collect()
- .foreach(println)
+ // Note: We reference columns in 2 ways:
+ // (1) predictions("movieId") lets us specify the movieId column in the predictions
+ // DataFrame, rather than the movieId column in the movies DataFrame.
+ // (2) $"userId" specifies the userId column in the predictions DataFrame.
+ // We could also write predictions("userId") but do not have to since
+ // the movies DataFrame does not have a column "userId."
+ val movies = sc.textFile(params.movies).map(Movie.parseMovie).toDF()
+ val falsePositives = predictions.join(movies)
+ .where((predictions("movieId") === movies("movieId"))
+ && ($"rating" <= 1) && ($"prediction" >= 4))
+ .select($"userId", predictions("movieId"), $"title", $"rating", $"prediction")
+ val numFalsePositives = falsePositives.count()
+ println(s"Found $numFalsePositives false positives")
+ if (numFalsePositives > 0) {
+ println(s"Example false positives:")
+ falsePositives.limit(100).collect().foreach(println)
+ }
sc.stop()
}
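The comment block added in the hunk above distinguishes qualified column references such as predictions("movieId") from the $"userId" shorthand. A minimal sketch of why the qualified form matters when joined DataFrames share a column name (df1 and df2 are hypothetical; Spark 1.3-era API, with import sqlContext.implicits._ in scope):

    // Both DataFrames have an "id" column.
    val df1 = Seq((1, "a"), (2, "b")).toDF("id", "left")
    val df2 = Seq((1, "x"), (3, "y")).toDF("id", "right")

    // After the join, a bare $"id" would be ambiguous, so qualify it as df1("id").
    // $"left" and $"right" each occur in only one DataFrame and need no qualifier.
    val joined = df1.join(df2, df1("id") === df2("id"))
      .select(df1("id"), $"left", $"right")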
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index e8af5c1625..bf805149d0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -40,8 +40,8 @@ object SimpleParamsExample {
import sqlContext.implicits._
// Prepare training data.
- // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans
- // into DataFrames, where it uses the bean metadata to infer the schema.
+ // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
+ // into DataFrames, where it uses the case class metadata to infer the schema.
val training = sc.parallelize(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
@@ -94,7 +94,7 @@ object SimpleParamsExample {
.select("features", "label", "myProbability", "prediction")
.collect()
.foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
- println("($features, $label) -> prob=$prob, prediction=$prediction")
+ println(s"($features, $label) -> prob=$prob, prediction=$prediction")
}
sc.stop()
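The one-character fix in the hunk above is easy to miss: without the s prefix, Scala performs no interpolation and prints the $-placeholders verbatim. For example:

    val label = 1.0
    println("label = $label")   // prints: label = $label
    println(s"label = $label")  // prints: label = 1.0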
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index a11db6fd5c..6772efd2c5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -83,7 +83,7 @@ object SimpleTextClassificationPipeline {
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
- println("($id, $text) --> prob=$prob, prediction=$prediction")
+ println(s"($id, $text) --> prob=$prob, prediction=$prediction")
}
sc.stop()