author     wm624@hotmail.com <wm624@hotmail.com>    2016-07-03 23:23:02 -0700
committer  Yanbo Liang <ybliang8@gmail.com>         2016-07-03 23:23:02 -0700
commit     a539b724c1d407083cb87abfa06d8bf213501057 (patch)
tree       7ebac5820069ac28e1859f8e868532202e057903 /examples/src
parent     26283339786f38c50722a7488d0bca8573b9c352 (diff)
[SPARK-16260][ML][EXAMPLE] PySpark ML Example Improvements and Cleanup
## What changes were proposed in this pull request?

1. Remove an unused import from a Scala example.
2. Move the SparkSession import outside the `$example on$`/`$example off$` markers.
3. Set the same parameters as the corresponding Scala examples.
4. Make comments consistent between the Scala and Python examples.
5. Ensure the Scala and Python examples use the same data sets.

I did one pass and fixed the issues above. Some examples are still missing from Python and may be added later. TODO: some examples include comments on how to run them, but many do not; these can also be added later.

## How was this patch tested?

Tested manually.

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #14021 from wangmiao1981/ann.
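The markers referenced in (2) delimit the snippet that the docs build extracts into the ML user guide, so only the code between them shows up on the site; the SparkSession import belongs outside them. A minimal sketch of that skeleton, assuming Spark 2.x PySpark (the app name here is illustrative):

```python
# Sketch of the shared example skeleton (assumes Spark 2.x PySpark).
# Only the code between the markers is pulled into the user guide.
from __future__ import print_function

# $example on$
from pyspark.ml.feature import ElementwiseProduct  # feature import: shown in the guide
# $example off$
from pyspark.sql import SparkSession               # session import: kept out of the guide

if __name__ == "__main__":
    spark = SparkSession.builder.appName("ExampleSkeleton").getOrCreate()

    # $example on$
    # ... snippet shown in the user guide goes here ...
    # $example off$

    spark.stop()
```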
Diffstat (limited to 'examples/src')
-rw-r--r--  examples/src/main/python/ml/elementwise_product_example.py          2
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py         2
-rw-r--r--  examples/src/main/python/ml/quantile_discretizer_example.py         2
-rw-r--r--  examples/src/main/python/ml/random_forest_classifier_example.py     2
-rw-r--r--  examples/src/main/python/ml/simple_text_classification_pipeline.py  2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala        1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala  2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala       2
8 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
index 598deae886..590053998b 100644
--- a/examples/src/main/python/ml/elementwise_product_example.py
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -30,10 +30,12 @@ if __name__ == "__main__":
         .getOrCreate()
 
     # $example on$
+    # Create some vector data; also works for sparse vectors
     data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
     df = spark.createDataFrame(data, ["vector"])
     transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                      inputCol="vector", outputCol="transformedVector")
+    # Batch transform the vectors to create new column:
     transformer.transform(df).show()
     # $example off$
 
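For reference, a self-contained sketch of what the transformer above computes, assuming Spark 2.x PySpark: ElementwiseProduct multiplies each input vector component-wise by scalingVec (a Hadamard product).

```python
# Minimal standalone sketch of the example above (assumes Spark 2.x PySpark).
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ElementwiseProductSketch").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),)], ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
# Component-wise product: [1,2,3] * [0,1,2] -> [0.0, 2.0, 6.0]
transformer.transform(df).show(truncate=False)
spark.stop()
```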
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index 9475e33218..b46c1ba2f4 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -35,7 +35,7 @@ if __name__ == "__main__":
                           (Vectors.dense([0.0, 0.0]),),
                           (Vectors.dense([0.6, -1.1]),)],
                          ["features"])
-    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+    px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
     polyDF = px.transform(df)
     for expanded in polyDF.select("polyFeatures").take(3):
         print(expanded)
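The degree bump aligns the Python example with Scala and changes the output size: for a 2-dimensional input, a degree-3 expansion produces 9 features (all monomials x1^i * x2^j with 1 <= i+j <= 3), versus 5 at degree 2. A quick sketch to see this, assuming Spark 2.x PySpark (the single toy row is illustrative):

```python
# Minimal sketch: degree-3 polynomial expansion of a 2-D vector (assumes Spark 2.x).
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PolynomialExpansionSketch").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]),)], ["features"])
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
# Expect a 9-dimensional output vector: C(2+3, 3) - 1 = 9 monomials.
px.transform(df).show(truncate=False)
spark.stop()
```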
diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 5444cacd95..6f422f840a 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -24,7 +24,7 @@ from pyspark.sql import SparkSession
 
 if __name__ == "__main__":
-    spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()
+    spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()
 
     # $example on$
     data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
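Only the app name changes here (the `Python` prefix was dropped to match the Scala example's naming). For context, a sketch of the discretizer this example exercises, assuming Spark 2.x PySpark; `numBuckets=3` is an illustrative choice, not taken from this diff:

```python
# Minimal sketch of QuantileDiscretizer on the example's toy data (assumes Spark 2.x).
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("QuantileDiscretizerSketch").getOrCreate()
data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
df = spark.createDataFrame(data, ["id", "hour"])
# numBuckets=3 is illustrative; fit() estimates quantile-based split points from the data.
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
discretizer.fit(df).transform(df).show()
spark.stop()
```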
diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py
index a7fc765318..eb9ded9af5 100644
--- a/examples/src/main/python/ml/random_forest_classifier_example.py
+++ b/examples/src/main/python/ml/random_forest_classifier_example.py
@@ -50,7 +50,7 @@ if __name__ == "__main__":
     (trainingData, testData) = data.randomSplit([0.7, 0.3])
 
     # Train a RandomForest model.
-    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
+    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
 
     # Chain indexers and forest in a Pipeline
     pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
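Setting `numTrees=10` makes the Python example match the Scala one; the ensemble size is the main lever trading runtime for variance reduction. A standalone sketch of the changed constructor, assuming Spark 2.x PySpark (the two-row toy DataFrame stands in for the indexed libsvm data the real example uses):

```python
# Minimal sketch of RandomForestClassifier with an explicit tree count (assumes Spark 2.x).
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RandomForestSketch").getOrCreate()
df = spark.createDataFrame([
    (0.0, Vectors.dense([0.0, 1.0])),
    (1.0, Vectors.dense([1.0, 0.0]))], ["label", "features"])
# numTrees=10 mirrors the Scala example's setting.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
model = rf.fit(df)
model.transform(df).select("features", "prediction").show()
spark.stop()
```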
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
index 886f43c0b0..b528b59be9 100644
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
 
     # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
-    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
+    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
     lr = LogisticRegression(maxIter=10, regParam=0.001)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
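Pinning `numFeatures=1000` matches the Scala pipeline and fixes the dimensionality of the hashed term-frequency vectors (the default is 2^18). A sketch of the tokenizer-plus-HashingTF stages in isolation, assuming Spark 2.x PySpark (the sample sentence is illustrative):

```python
# Minimal sketch: Tokenizer -> HashingTF with a fixed feature dimension (assumes Spark 2.x).
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HashingTFSketch").getOrCreate()
df = spark.createDataFrame([(0, "spark ml pipeline example")], ["id", "text"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# numFeatures=1000 caps the hashed vector size, trading memory for hash collisions.
hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
hashingTF.transform(tokenizer.transform(df)).select("features").show(truncate=False)
spark.stop()
```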
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index 11faa6192b..38c1c1c186 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
 
 import java.io.File
 
-import com.google.common.io.Files
 import scopt.OptionParser
 
 import org.apache.spark.examples.mllib.AbstractParams
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
index c484ee5556..2c2bf421bc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -21,8 +21,8 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.GaussianMixture
-import org.apache.spark.sql.SparkSession
 // $example off$
+import org.apache.spark.sql.SparkSession
 
 /**
  * An example demonstrating Gaussian Mixture Model (GMM).
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
index a59ba182fc..7089a4bc87 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -35,7 +35,7 @@ object NaiveBayesExample {
     val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
     // Split the data into training and test sets (30% held out for testing)
-    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
+    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)
 
     // Train a NaiveBayes model.
     val model = new NaiveBayes()
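Adding `seed = 1234L` makes the 70/30 split deterministic, so the accuracy the example prints is reproducible across runs. The same idea in PySpark, as a sketch assuming Spark 2.x (the range DataFrame stands in for the libsvm data):

```python
# Minimal sketch: a fixed seed makes randomSplit reproducible (assumes Spark 2.x).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SeededSplitSketch").getOrCreate()
df = spark.range(0, 100)
train_a, test_a = df.randomSplit([0.7, 0.3], seed=1234)
train_b, test_b = df.randomSplit([0.7, 0.3], seed=1234)
# Identical seed and input -> identical partitions on every run.
assert train_a.collect() == train_b.collect()
spark.stop()
```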