author     Bryan Cutler <cutlerb@gmail.com>    2016-08-05 20:57:46 +0100
committer  Sean Owen <sowen@cloudera.com>      2016-08-05 20:57:46 +0100
commit     180fd3e0a3426db200c97170926afb60751dfd0e (patch)
tree       4d10f86a901a0cfd52121f856f409a2b90ff5404 /examples/src/main/python
parent     2460f03ffe94154b73995e4f16dd799d1a0f56b8 (diff)
download   spark-180fd3e0a3426db200c97170926afb60751dfd0e.tar.gz
           spark-180fd3e0a3426db200c97170926afb60751dfd0e.tar.bz2
           spark-180fd3e0a3426db200c97170926afb60751dfd0e.zip
[SPARK-16421][EXAMPLES][ML] Improve ML Example Outputs
## What changes were proposed in this pull request?

Improve example outputs to better reflect the functionality being presented. This mostly consisted of modifying what is printed at the end of each example, such as calling show() with truncate=False, but sometimes required minor tweaks to the example data to get relevant output. Parameters are now set explicitly when they are used as part of the example. Fixed Java examples that failed to run because they used old-style MLlib Vectors or had schema problems. Synced examples between the different APIs.

## How was this patch tested?

Ran each example for Scala, Python, and Java and made sure the output was legible on a terminal of width 100.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14308 from BryanCutler/ml-examples-improve-output-SPARK-16260.
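The recurring pattern across the Python hunks below is to replace per-row print loops with DataFrame.show(truncate=False) and to echo explicitly set parameters before showing results. A minimal standalone sketch of the before/after, with data and column names mirroring the binarizer_example.py hunk below (the app name is hypothetical):

    from pyspark.ml.feature import Binarizer
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("BinarizerOutputSketch").getOrCreate()

    # Example data and columns mirror binarizer_example.py below.
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    # Before: print raw Row objects one per line, which is hard to read.
    #   for row in binarizedDataFrame.select("binarized_feature").collect():
    #       print(row)

    # After: echo the explicitly set parameter, then show the whole
    # DataFrame untruncated so the transform's effect is visible.
    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show(truncate=False)

    spark.stop()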
Diffstat (limited to 'examples/src/main/python')
-rw-r--r--  examples/src/main/python/ml/binarizer_example.py                 | 10
-rw-r--r--  examples/src/main/python/ml/bucketizer_example.py                |  4
-rw-r--r--  examples/src/main/python/ml/chisq_selector_example.py            |  2
-rw-r--r--  examples/src/main/python/ml/count_vectorizer_example.py          |  4
-rw-r--r--  examples/src/main/python/ml/dct_example.py                       |  3
-rw-r--r--  examples/src/main/python/ml/gaussian_mixture_example.py          |  6
-rw-r--r--  examples/src/main/python/ml/index_to_string_example.py           | 14
-rw-r--r--  examples/src/main/python/ml/isotonic_regression_example.py       |  4
-rw-r--r--  examples/src/main/python/ml/linear_regression_with_elastic_net.py | 12
-rw-r--r--  examples/src/main/python/ml/max_abs_scaler_example.py            | 10
-rw-r--r--  examples/src/main/python/ml/min_max_scaler_example.py            | 10
-rw-r--r--  examples/src/main/python/ml/multilayer_perceptron_classification.py |  2
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py                    |  9
-rw-r--r--  examples/src/main/python/ml/naive_bayes_example.py               | 12
-rw-r--r--  examples/src/main/python/ml/normalizer_example.py                |  9
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py            |  4
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py                  |  5
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py      | 11
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py         |  2
-rw-r--r--  examples/src/main/python/ml/tf_idf_example.py                    |  9
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py                 | 14
-rw-r--r--  examples/src/main/python/ml/train_validation_split.py            |  7
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py          |  3
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py            |  4
-rw-r--r--  examples/src/main/python/ml/word2vec_example.py                  |  5
-rwxr-xr-x  examples/src/main/python/pagerank.py                             |  7
26 files changed, 120 insertions(+), 62 deletions(-)
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
(0, 0.1),
(1, 0.8),
(2, 0.2)
- ], ["label", "feature"])
+ ], ["id", "feature"])
+
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
binarizedDataFrame = binarizer.transform(continuousDataFrame)
- binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- for binarized_feature, in binarizedFeatures.collect():
- print(binarized_feature)
+
+ print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+ binarizedDataFrame.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
- data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+ data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)
+
+ print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()
# $example off$
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
outputCol="selectedFeatures", labelCol="clicked")
result = selector.fit(df).transform(df)
+
+ print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()
# $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":
# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
model = cv.fit(df)
+
result = model.transform(df)
- result.show()
+ result.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":
dctDf = dct.transform(df)
- for dcts in dctDf.select("featuresDCT").take(3):
- print(dcts)
+ dctDf.select("featuresDCT").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
# loads data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- gmm = GaussianMixture().setK(2)
+ gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(dataset)
- print("Gaussians: ")
- model.gaussiansDF.show()
+ print("Gaussians shown as a DataFrame: ")
+ model.gaussiansDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
- stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
- model = stringIndexer.fit(df)
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = indexer.fit(df)
indexed = model.transform(df)
+ print("Transformed string column '%s' to indexed column '%s'"
+ % (indexer.getInputCol(), indexer.getOutputCol()))
+ indexed.show()
+
+ print("StringIndexer will store labels in output column metadata\n")
+
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+ print("Transformed indexed column '%s' back to original string column '%s' using "
+ "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+ converted.select("id", "categoryIndex", "originalCategory").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":
# Trains an isotonic regression model.
model = IsotonicRegression().fit(dataset)
- print("Boundaries in increasing order: " + str(model.boundaries))
- print("Predictions associated with the boundaries: " + str(model.predictions))
+ print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+ print("Predictions associated with the boundaries: %s\n" % str(model.predictions))
# Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
lrModel = lr.fit(training)
# Print the coefficients and intercept for linear regression
- print("Coefficients: " + str(lrModel.coefficients))
- print("Intercept: " + str(lrModel.intercept))
+ print("Coefficients: %s" % str(lrModel.coefficients))
+ print("Intercept: %s" % str(lrModel.intercept))
+
+ # Summarize the model over the training set and print out some metrics
+ trainingSummary = lrModel.summary
+ print("numIterations: %d" % trainingSummary.totalIterations)
+ print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+ trainingSummary.residuals.show()
+ print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+ print("r2: %f" % trainingSummary.r2)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -8.0]),),
+ (1, Vectors.dense([2.0, 1.0, -4.0]),),
+ (2, Vectors.dense([4.0, 10.0, 8.0]),)
+ ], ["id", "features"])
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -1.0]),),
+ (1, Vectors.dense([2.0, 1.1, 1.0]),),
+ (2, Vectors.dense([3.0, 10.1, 3.0]),)
+ ], ["id", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
- ], ["label", "words"])
+ ], ["id", "words"])
- ngram = NGram(inputCol="words", outputCol="ngrams")
- ngramDataFrame = ngram.transform(wordDataFrame)
+ ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
- for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
- print(ngrams_label)
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ ngramDataFrame.select("ngrams").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
+ # select example rows to display.
+ predictions = model.transform(test)
+ predictions.show()
+
# compute accuracy on the test set
- result = model.transform(test)
- predictionAndLabels = result.select("prediction", "label")
- evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+ metricName="accuracy")
+ accuracy = evaluator.evaluate(predictions)
+ print("Test set accuracy = " + str(accuracy))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,15 +30,21 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.5, -1.0]),),
+ (1, Vectors.dense([2.0, 1.0, 1.0]),),
+ (2, Vectors.dense([4.0, 10.0, 2.0]),)
+ ], ["id", "features"])
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
+ print("Normalized using L^1 norm")
l1NormData.show()
# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+ print("Normalized using L^inf norm")
lInfNormData.show()
# $example off$
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
model = stringIndexer.fit(df)
indexed = model.transform(df)
- encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
+ selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
- print(row)
+ rid, text, prob, prediction = row
+ print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":
# $example on$
df = spark.createDataFrame([
- (Vectors.dense([-2.0, 2.3]),),
+ (Vectors.dense([2.0, 1.0]),),
(Vectors.dense([0.0, 0.0]),),
- (Vectors.dense([0.6, -1.1]),)
+ (Vectors.dense([3.0, -1.0]),)
], ["features"])
- px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
- polyDF = px.transform(df)
+ polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+ polyDF = polyExpansion.transform(df)
- for expanded in polyDF.select("polyFeatures").take(3):
- print(expanded)
+ polyDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
- ], ["label", "raw"])
+ ], ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":
# $example on$
sentenceData = spark.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
- for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
+ rescaledData.select("label", "features").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
# $example off$
from pyspark.sql import SparkSession
@@ -33,20 +35,22 @@ if __name__ == "__main__":
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- ], ["label", "sentence"])
+ ], ["id", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)
+ countTokens = udf(lambda words: len(words), IntegerType())
+
tokenized = tokenizer.transform(sentenceDataFrame)
- for words_label in tokenized.select("words", "label").take(3):
- print(words_label)
+ tokenized.select("sentence", "words")\
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- for words_label in regexTokenized.select("words", "label").take(3):
- print(words_label)
+ regexTokenized.select("sentence", "words") \
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":
# Make predictions on test data. model is the model with combination of parameters
# that performed best.
- prediction = model.transform(test)
- for row in prediction.take(5):
- print(row)
+ model.transform(test)\
+ .select("features", "label", "prediction")\
+ .show()
+
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
outputCol="features")
output = assembler.transform(dataset)
- print(output.select("features", "clicked").first())
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
+ categoricalFeatures = indexerModel.categoryMaps
+ print("Chose %d categorical features: %s" %
+ (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
- for feature in result.select("result").take(3):
- print(feature)
+ for row in result.collect():
+ text, vector = row
+ print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
"""
This is an example implementation of PageRank. For more conventional use,
Please refer to PageRank implementation provided by graphx
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
@@ -46,8 +49,8 @@ if __name__ == "__main__":
print("Usage: pagerank <file> <iterations>", file=sys.stderr)
exit(-1)
- print("""WARN: This is a naive implementation of PageRank and is
- given as an example! Please refer to PageRank implementation provided by graphx""",
+ print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+ "Please refer to PageRank implementation provided by graphx",
file=sys.stderr)
# Initialize the spark context.