author    Joseph K. Bradley <joseph@databricks.com>  2015-02-20 02:31:32 -0800
committer Xiangrui Meng <meng@databricks.com>        2015-02-20 02:31:32 -0800
commit    4a17eedb16343413e5b6f8bb58c6da8952ee7ab6 (patch)
tree      ac17fd9eed4f42ba3095b148c68d4e78b6afc875 /examples/src/main/scala
parent    d3dfebebce9f76e4433e16d4d6d29fb8fa4d4193 (diff)
[SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release
For SPARK-5867:
* The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API.
* It should also include Python examples now.

For SPARK-5892:
* Fix Python docs
* Various other cleanups

BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch, which is based on spark/branch-1.3 and cherry-picks the commits from this PR: [https://github.com/jkbradley/spark/tree/doc-review-1.3-check]

CC: mengxr (ML), davies (Python docs)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits:

f191bb0 [Joseph K. Bradley] small cleanups
e786efa [Joseph K. Bradley] small doc corrections
6b1ab4a [Joseph K. Bradley] fixed python lint test
946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql()
da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3
629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python
b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation
34b067f [Joseph K. Bradley] small doc correction
da16aef [Joseph K. Bradley] Fixed python mllib docs
8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc
695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs
a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs
b05a80d [Joseph K. Bradley] organize imports. doc cleanups
e572827 [Joseph K. Bradley] updated programming guide for ml and mllib
Diffstat (limited to 'examples/src/main/scala')
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala             |  2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala                      | 32
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala               |  6
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala  |  2
4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
index 7ab892cd75..6c0af20461 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -100,7 +100,7 @@ object CrossValidatorExample {
Document(7L, "apache hadoop")))
// Make predictions on test documents. cvModel uses the best model found (lrModel).
- cvModel.transform(test.toDF)
+ cvModel.transform(test.toDF())
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
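Note on the toDF change above: in the Spark 1.3 API these examples target, toDF() is added to RDDs of case classes by an implicit conversion, and the explicit parentheses follow Scala convention for a method that constructs a new object rather than reading a field. A minimal sketch, assuming an existing SparkContext named sc (the Document case class mirrors the one in the example):

    import org.apache.spark.sql.SQLContext

    case class Document(id: Long, text: String)

    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._  // brings the RDD-to-DataFrame conversion into scope

    val docs = sc.parallelize(Seq(Document(0L, "spark rocks")))
    val docsDF = docs.toDF()       // explicit (): toDF builds a new DataFrame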
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
index 96b2dd463e..25f21113bf 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala
@@ -93,8 +93,8 @@ object MovieLensALS {
| bin/spark-submit --class org.apache.spark.examples.ml.MovieLensALS \
| examples/target/scala-*/spark-examples-*.jar \
| --rank 10 --maxIter 15 --regParam 0.1 \
- | --movies path/to/movielens/movies.dat \
- | --ratings path/to/movielens/ratings.dat
+ | --movies data/mllib/als/sample_movielens_movies.txt \
+ | --ratings data/mllib/als/sample_movielens_ratings.txt
""".stripMargin)
}
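Note: the new paths are not hypothetical; per squashed commit 946affa above ("Added sample data for ml.MovieLensALS example"), the sample_movielens_*.txt files under data/mllib/als/ are added by this same patch, so the usage string now points at data that ships with the Spark source tree.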
@@ -157,17 +157,23 @@ object MovieLensALS {
println(s"Test RMSE = $rmse.")
// Inspect false positives.
- predictions.registerTempTable("prediction")
- sc.textFile(params.movies).map(Movie.parseMovie).toDF().registerTempTable("movie")
- sqlContext.sql(
- """
- |SELECT userId, prediction.movieId, title, rating, prediction
- | FROM prediction JOIN movie ON prediction.movieId = movie.movieId
- | WHERE rating <= 1 AND prediction >= 4
- | LIMIT 100
- """.stripMargin)
- .collect()
- .foreach(println)
+ // Note: We reference columns in 2 ways:
+ // (1) predictions("movieId") lets us specify the movieId column in the predictions
+ // DataFrame, rather than the movieId column in the movies DataFrame.
+ // (2) $"userId" specifies the userId column in the predictions DataFrame.
+ // We could also write predictions("userId") but do not have to since
+ // the movies DataFrame does not have a column "userId."
+ val movies = sc.textFile(params.movies).map(Movie.parseMovie).toDF()
+ val falsePositives = predictions.join(movies)
+ .where((predictions("movieId") === movies("movieId"))
+ && ($"rating" <= 1) && ($"prediction" >= 4))
+ .select($"userId", predictions("movieId"), $"title", $"rating", $"prediction")
+ val numFalsePositives = falsePositives.count()
+ println(s"Found $numFalsePositives false positives")
+ if (numFalsePositives > 0) {
+ println(s"Example false positives:")
+ falsePositives.limit(100).collect().foreach(println)
+ }
sc.stop()
}
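Note on the rewritten block above: it replaces a SQL string with the DataFrame DSL, and the inline comment distinguishes two column-reference styles. A minimal sketch of the difference, using toy data rather than the MovieLens schema (assumes sc, sqlContext, and sqlContext.implicits._ as in the example):

    val preds = sc.parallelize(Seq((1, 10, 5.0))).toDF("userId", "movieId", "prediction")
    val movs  = sc.parallelize(Seq((10, "Heat"))).toDF("movieId", "title")

    val joined = preds.join(movs)
      // movieId exists on both sides, so it must be qualified via preds(...)/movs(...)
      .where(preds("movieId") === movs("movieId"))
      // userId, title, and prediction each exist on only one side, so $"..." is safe
      .select($"userId", preds("movieId"), $"title", $"prediction")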
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
index e8af5c1625..bf805149d0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -40,8 +40,8 @@ object SimpleParamsExample {
import sqlContext.implicits._
// Prepare training data.
- // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans
- // into DataFrames, where it uses the bean metadata to infer the schema.
+ // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
+ // into DataFrames, where it uses the case class metadata to infer the schema.
val training = sc.parallelize(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
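Note on the corrected comment above: case classes, not Java Beans, are what Scala code hands to Spark SQL, and the schema is inferred from the case class's field names and types. A minimal sketch with a hypothetical Person class (not from the example):

    case class Person(name: String, age: Int)

    import sqlContext.implicits._  // assumes a SQLContext named sqlContext
    val people = sc.parallelize(Seq(Person("Ada", 36), Person("Alan", 41))).toDF()
    people.printSchema()
    // root
    //  |-- name: string (nullable = true)
    //  |-- age: integer (nullable = false)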
@@ -94,7 +94,7 @@ object SimpleParamsExample {
.select("features", "label", "myProbability", "prediction")
.collect()
.foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
- println("($features, $label) -> prob=$prob, prediction=$prediction")
+ println(s"($features, $label) -> prob=$prob, prediction=$prediction")
}
sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
index a11db6fd5c..6772efd2c5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -83,7 +83,7 @@ object SimpleTextClassificationPipeline {
.select("id", "text", "probability", "prediction")
.collect()
.foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
- println("($id, $text) --> prob=$prob, prediction=$prediction")
+ println(s"($id, $text) --> prob=$prob, prediction=$prediction")
}
sc.stop()
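Note: the last two hunks fix the same bug. Without the s prefix, Scala treats "$id" and friends as literal text instead of interpolated values, so the println compiled but printed the placeholders verbatim. A standalone sketch:

    val id = 4L
    val text = "spark i j k"
    println("($id, $text)")   // literal:      ($id, $text)
    println(s"($id, $text)")  // interpolated: (4, spark i j k)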