From fe89c1817d668e46adf70d0896c42c22a547c76a Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Sun, 22 Nov 2015 21:45:46 -0800
Subject: [SPARK-11895][ML] rename and refactor DatasetExample under mllib/examples

We used the name `Dataset` to refer to `SchemaRDD` in 1.2 in ML pipelines and created this example file. Since `Dataset` has a new meaning in Spark 1.6, we should rename it to avoid confusion. This PR also removes support for dense format to simplify the example code.

cc: yinxusen

Author: Xiangrui Meng

Closes #9873 from mengxr/SPARK-11895.
---
 .../spark/examples/ml/DataFrameExample.scala  | 104 +++++++++++++++++
 .../spark/examples/mllib/DatasetExample.scala | 123 ---------------------
 2 files changed, 104 insertions(+), 123 deletions(-)
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
 delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
new file mode 100644
index 0000000000..424f00158c
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+import java.io.File
+
+import com.google.common.io.Files
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.examples.mllib.AbstractParams
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+/**
+ * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
+ * {{{
+ *   ./bin/run-example ml.DataFrameExample [options]
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object DataFrameExample {
+
+  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
+    extends AbstractParams[Params]
+
+  def main(args: Array[String]) {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("DataFrameExample") {
+      head("DataFrameExample: an example app using DataFrame for ML.")
+      opt[String]("input")
+        .text(s"input path to dataset")
+        .action((x, c) => c.copy(input = x))
+      checkConfig { params =>
+        success
+      }
+    }
+
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    }.getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  def run(params: Params) {
+
+    val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // Load input data
+    println(s"Loading LIBSVM file with UDT from ${params.input}.")
+    val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache()
+    println("Schema from LIBSVM:")
+    df.printSchema()
+    println(s"Loaded training data as a DataFrame with ${df.count()} records.")
+
+    // Show statistical summary of labels.
+    val labelSummary = df.describe("label")
+    labelSummary.show()
+
+    // Convert features column to an RDD of vectors.
+    val features = df.select("features").map { case Row(v: Vector) => v }
+    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
+      (summary, feat) => summary.add(feat),
+      (sum1, sum2) => sum1.merge(sum2))
+    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")
+
+    // Save the records in a parquet file.
+    val tmpDir = Files.createTempDir()
+    tmpDir.deleteOnExit()
+    val outputDir = new File(tmpDir, "dataset").toString
+    println(s"Saving to $outputDir as Parquet file.")
+    df.write.parquet(outputDir)
+
+    // Load the records back.
+    println(s"Loading Parquet file with UDT from $outputDir.")
+    val newDF = sqlContext.read.parquet(outputDir)
+    println(s"Schema from Parquet:")
+    newDF.printSchema()
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
deleted file mode 100644
index dc13f82488..0000000000
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.mllib
-
-import java.io.File
-
-import com.google.common.io.Files
-import scopt.OptionParser
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, SQLContext, DataFrame}
-
-/**
- * An example of how to use [[org.apache.spark.sql.DataFrame]] as a Dataset for ML. Run with
- * {{{
- *   ./bin/run-example org.apache.spark.examples.mllib.DatasetExample [options]
- * }}}
- * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
- */
-object DatasetExample {
-
-  case class Params(
-      input: String = "data/mllib/sample_libsvm_data.txt",
-      dataFormat: String = "libsvm") extends AbstractParams[Params]
-
-  def main(args: Array[String]) {
-    val defaultParams = Params()
-
-    val parser = new OptionParser[Params]("DatasetExample") {
-      head("Dataset: an example app using DataFrame as a Dataset for ML.")
-      opt[String]("input")
-        .text(s"input path to dataset")
-        .action((x, c) => c.copy(input = x))
-      opt[String]("dataFormat")
-        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
-        .action((x, c) => c.copy(input = x))
-      checkConfig { params =>
-        success
-      }
-    }
-
-    parser.parse(args, defaultParams).map { params =>
-      run(params)
-    }.getOrElse {
-      sys.exit(1)
-    }
-  }
-
-  def run(params: Params) {
-
-    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
-    val sc = new SparkContext(conf)
-    val sqlContext = new SQLContext(sc)
-    import sqlContext.implicits._ // for implicit conversions
-
-    // Load input data
-    val origData: RDD[LabeledPoint] = params.dataFormat match {
-      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
-      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
-    }
-    println(s"Loaded ${origData.count()} instances from file: ${params.input}")
-
-    // Convert input data to DataFrame explicitly.
-    val df: DataFrame = origData.toDF()
-    println(s"Inferred schema:\n${df.schema.prettyJson}")
-    println(s"Converted to DataFrame with ${df.count()} records")
-
-    // Select columns
-    val labelsDf: DataFrame = df.select("label")
-    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
-    val numLabels = labels.count()
-    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
-    println(s"Selected label column with average value $meanLabel")
-
-    val featuresDf: DataFrame = df.select("features")
-    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
-    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
-      (summary, feat) => summary.add(feat),
-      (sum1, sum2) => sum1.merge(sum2))
-    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")
-
-    val tmpDir = Files.createTempDir()
-    tmpDir.deleteOnExit()
-    val outputDir = new File(tmpDir, "dataset").toString
-    println(s"Saving to $outputDir as Parquet file.")
-    df.write.parquet(outputDir)
-
-    println(s"Loading Parquet file with UDT from $outputDir.")
-    val newDataset = sqlContext.read.parquet(outputDir)
-
-    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")
-    val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
-    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
-      (summary, feat) => summary.add(feat),
-      (sum1, sum2) => sum1.merge(sum2))
-    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")
-
-    sc.stop()
-  }
-
-}
-// scalastyle:on println
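
The scaladoc of the new example tells readers to use `spark-submit` when using it as a template for their own app. A minimal sketch of such a standalone app, assuming the Spark 1.6-era SQLContext API used in the patch; the object name MyDataFrameApp, the jar name my-app.jar, and the input path are placeholders, not part of this commit:

  // Build into a jar (e.g. my-app.jar) and launch with:
  //   ./bin/spark-submit --class MyDataFrameApp my-app.jar
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.sql.SQLContext

  object MyDataFrameApp {
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("MyDataFrameApp")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)
      // The "libsvm" data source (used by the example above) yields a DataFrame
      // with "label" and "features" columns.
      val df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
      df.printSchema()
      df.describe("label").show()
      sc.stop()
    }
  }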