Diffstat (limited to 'examples/src/main/python')
-rw-r--r--  examples/src/main/python/ml/binarizer_example.py | 10
-rw-r--r--  examples/src/main/python/ml/bucketizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/chisq_selector_example.py | 2
-rw-r--r--  examples/src/main/python/ml/count_vectorizer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/dct_example.py | 3
-rw-r--r--  examples/src/main/python/ml/gaussian_mixture_example.py | 6
-rw-r--r--  examples/src/main/python/ml/index_to_string_example.py | 14
-rw-r--r--  examples/src/main/python/ml/isotonic_regression_example.py | 4
-rw-r--r--  examples/src/main/python/ml/linear_regression_with_elastic_net.py | 12
-rw-r--r--  examples/src/main/python/ml/max_abs_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/min_max_scaler_example.py | 10
-rw-r--r--  examples/src/main/python/ml/multilayer_perceptron_classification.py | 2
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py | 9
-rw-r--r--  examples/src/main/python/ml/naive_bayes_example.py | 12
-rw-r--r--  examples/src/main/python/ml/normalizer_example.py | 9
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py | 4
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py | 5
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py | 11
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py | 2
-rw-r--r--  examples/src/main/python/ml/tf_idf_example.py | 9
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py | 14
-rw-r--r--  examples/src/main/python/ml/train_validation_split.py | 7
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py | 3
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py | 4
-rw-r--r--  examples/src/main/python/ml/word2vec_example.py | 5
-rwxr-xr-x  examples/src/main/python/pagerank.py | 7
26 files changed, 120 insertions, 62 deletions
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
(0, 0.1),
(1, 0.8),
(2, 0.2)
- ], ["label", "feature"])
+ ], ["id", "feature"])
+
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
binarizedDataFrame = binarizer.transform(continuousDataFrame)
- binarizedFeatures = binarizedDataFrame.select("binarized_feature")
- for binarized_feature, in binarizedFeatures.collect():
- print(binarized_feature)
+
+ print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+ binarizedDataFrame.show()
# $example off$
spark.stop()
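
Note: a quick sanity check on this change: with threshold=0.5 the feature values 0.1, 0.8 and 0.2 binarize to 0.0, 1.0 and 0.0, so the new show() call should print approximately:

    Binarizer output with Threshold = 0.500000
    +---+-------+-----------------+
    | id|feature|binarized_feature|
    +---+-------+-----------------+
    |  0|    0.1|              0.0|
    |  1|    0.8|              1.0|
    |  2|    0.2|              0.0|
    +---+-------+-----------------+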
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
# $example on$
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
- data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+ data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)
+
+ print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()
# $example off$
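
Note: the sentinel values -999.9 and 999.9 added here illustrate why the outer splits are ±infinity: a value outside the finite split range would otherwise make transform() fail. With splits [-inf, -0.5, 0.0, 0.5, inf], each bucket covers [x, y) except the last, so the expected indices are:

    -999.9 -> 0.0    (bucket [-inf, -0.5))
    -0.5   -> 1.0    (bucket [-0.5, 0.0))
    -0.3   -> 1.0
     0.0   -> 2.0    (bucket [0.0, 0.5))
     0.2   -> 2.0
     999.9 -> 3.0    (bucket [0.5, inf])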
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
outputCol="selectedFeatures", labelCol="clicked")
result = selector.fit(df).transform(df)
+
+ print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()
# $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":
# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
model = cv.fit(df)
+
result = model.transform(df)
- result.show()
+ result.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":
dctDf = dct.transform(df)
- for dcts in dctDf.select("featuresDCT").take(3):
- print(dcts)
+ dctDf.select("featuresDCT").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
# loads data
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
- gmm = GaussianMixture().setK(2)
+ gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(dataset)
- print("Gaussians: ")
- model.gaussiansDF.show()
+ print("Gaussians shown as a DataFrame: ")
+ model.gaussiansDF.show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
- stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
- model = stringIndexer.fit(df)
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = indexer.fit(df)
indexed = model.transform(df)
+ print("Transformed string column '%s' to indexed column '%s'"
+ % (indexer.getInputCol(), indexer.getOutputCol()))
+ indexed.show()
+
+ print("StringIndexer will store labels in output column metadata\n")
+
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)
- converted.select("id", "originalCategory").show()
+ print("Transformed indexed column '%s' back to original string column '%s' using "
+ "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+ converted.select("id", "categoryIndex", "originalCategory").show()
# $example off$
spark.stop()
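
Note: StringIndexer orders labels by descending frequency, so in this data "a" (3 rows) maps to 0.0, "c" (2 rows) to 1.0 and "b" (1 row) to 2.0; IndexToString then inverts the mapping using the labels stored in the output column metadata. The final select should therefore show:

    +---+-------------+----------------+
    | id|categoryIndex|originalCategory|
    +---+-------------+----------------+
    |  0|          0.0|               a|
    |  1|          2.0|               b|
    |  2|          1.0|               c|
    |  3|          0.0|               a|
    |  4|          0.0|               a|
    |  5|          1.0|               c|
    +---+-------------+----------------+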
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":
# Trains an isotonic regression model.
model = IsotonicRegression().fit(dataset)
- print("Boundaries in increasing order: " + str(model.boundaries))
- print("Predictions associated with the boundaries: " + str(model.predictions))
+ print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+ print("Predictions associated with the boundaries: %s\n" % str(model.predictions))
# Makes predictions.
model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
lrModel = lr.fit(training)
# Print the coefficients and intercept for linear regression
- print("Coefficients: " + str(lrModel.coefficients))
- print("Intercept: " + str(lrModel.intercept))
+ print("Coefficients: %s" % str(lrModel.coefficients))
+ print("Intercept: %s" % str(lrModel.intercept))
+
+ # Summarize the model over the training set and print out some metrics
+ trainingSummary = lrModel.summary
+ print("numIterations: %d" % trainingSummary.totalIterations)
+ print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+ trainingSummary.residuals.show()
+ print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+ print("r2: %f" % trainingSummary.r2)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -8.0]),),
+ (1, Vectors.dense([2.0, 1.0, -4.0]),),
+ (2, Vectors.dense([4.0, 10.0, 8.0]),)
+ ], ["id", "features"])
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [-1, 1].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
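
Note: MaxAbsScaler divides each column by its maximum absolute value, here 4.0, 10.0 and 8.0, so the new inline data makes the result easy to verify by hand:

    [1.0,  0.1, -8.0] -> [0.25, 0.01, -1.0]
    [2.0,  1.0, -4.0] -> [0.5,  0.1,  -0.5]
    [4.0, 10.0,  8.0] -> [1.0,  1.0,   1.0]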
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,7 +30,11 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.1, -1.0]),),
+ (1, Vectors.dense([2.0, 1.1, 1.0]),),
+ (2, Vectors.dense([3.0, 10.1, 3.0]),)
+ ], ["id", "features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
@@ -38,7 +43,8 @@ if __name__ == "__main__":
# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
- scaledData.show()
+ print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+ scaledData.select("features", "scaledFeatures").show()
# $example off$
spark.stop()
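
Note: with the default output range [0, 1], MinMaxScaler rescales each column as (x - colMin) / (colMax - colMin); the column mins here are [1.0, 0.1, -1.0] and maxes [3.0, 10.1, 3.0], so the rows scale to:

    [1.0,  0.1, -1.0] -> [0.0, 0.0, 0.0]
    [2.0,  1.1,  1.0] -> [0.5, 0.1, 0.5]
    [3.0, 10.1,  3.0] -> [1.0, 1.0, 1.0]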
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
- ], ["label", "words"])
+ ], ["id", "words"])
- ngram = NGram(inputCol="words", outputCol="ngrams")
- ngramDataFrame = ngram.transform(wordDataFrame)
+ ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
- for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
- print(ngrams_label)
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ ngramDataFrame.select("ngrams").show(truncate=False)
# $example off$
spark.stop()
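
Note: with the now-explicit n=2, each row yields its consecutive word pairs; for example the first row ["Hi", "I", "heard", "about", "Spark"] produces:

    [Hi I, I heard, heard about, about Spark]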
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
+ # select example rows to display.
+ predictions = model.transform(test)
+ predictions.show()
+
# compute accuracy on the test set
- result = model.transform(test)
- predictionAndLabels = result.select("prediction", "label")
- evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
- print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+ evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+ metricName="accuracy")
+ accuracy = evaluator.evaluate(predictions)
+ print("Test set accuracy = " + str(accuracy))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession
@@ -29,15 +30,21 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ dataFrame = spark.createDataFrame([
+ (0, Vectors.dense([1.0, 0.5, -1.0]),),
+ (1, Vectors.dense([2.0, 1.0, 1.0]),),
+ (2, Vectors.dense([4.0, 10.0, 2.0]),)
+ ], ["id", "features"])
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
+ print("Normalized using L^1 norm")
l1NormData.show()
# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+ print("Normalized using L^inf norm")
lInfNormData.show()
# $example off$
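
Note: the numbers work out as follows: the L^1 pass divides each row by its sum of absolute values (2.5, 4.0 and 16.0), giving [0.4, 0.2, -0.4], [0.5, 0.25, 0.25] and [0.25, 0.625, 0.125]; the L^inf pass divides by the largest absolute entry, giving [1.0, 0.5, -1.0], [1.0, 0.5, 0.5] and [0.4, 1.0, 0.2].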
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
model = stringIndexer.fit(df)
indexed = model.transform(df)
- encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
- encoded.select("id", "categoryVec").show()
+ encoded.show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
+ selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
- print(row)
+ rid, text, prob, prediction = row
+ print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":
# $example on$
df = spark.createDataFrame([
- (Vectors.dense([-2.0, 2.3]),),
+ (Vectors.dense([2.0, 1.0]),),
(Vectors.dense([0.0, 0.0]),),
- (Vectors.dense([0.6, -1.1]),)
+ (Vectors.dense([3.0, -1.0]),)
], ["features"])
- px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
- polyDF = px.transform(df)
+ polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+ polyDF = polyExpansion.transform(df)
- for expanded in polyDF.select("polyFeatures").take(3):
- print(expanded)
+ polyDF.show(truncate=False)
# $example off$
spark.stop()
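
Note: for degree 3 on a 2-vector (x, y), PolynomialExpansion emits [x, x^2, x^3, y, x*y, x^2*y, y^2, x*y^2, y^3], so the new first row [2.0, 1.0] should expand to:

    [2.0, 4.0, 8.0, 1.0, 2.0, 4.0, 1.0, 2.0, 1.0]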
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
- ], ["label", "raw"])
+ ], ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)
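
Note: with the default English stop-word list, "I" and "the" are dropped from the first row and "had" and "a" from the second:

    [I, saw, the, red, balloon]  -> [saw, red, balloon]
    [Mary, had, a, little, lamb] -> [Mary, little, lamb]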
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":
# $example on$
sentenceData = spark.createDataFrame([
- (0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (0.0, "Hi I heard about Spark"),
+ (0.0, "I wish Java could use case classes"),
+ (1.0, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
- for features_label in rescaledData.select("features", "label").take(3):
- print(features_label)
+ rescaledData.select("label", "features").show()
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
# $example off$
from pyspark.sql import SparkSession
@@ -33,20 +35,22 @@ if __name__ == "__main__":
(0, "Hi I heard about Spark"),
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
- ], ["label", "sentence"])
+ ], ["id", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps=False
+ countTokens = udf(lambda words: len(words), IntegerType())
+
tokenized = tokenizer.transform(sentenceDataFrame)
- for words_label in tokenized.select("words", "label").take(3):
- print(words_label)
+ tokenized.select("sentence", "words")\
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
- for words_label in regexTokenized.select("words", "label").take(3):
- print(words_label)
+ regexTokenized.select("sentence", "words") \
+ .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
# $example off$
spark.stop()
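
Note: the countTokens udf is defined inline for illustration; an equivalent and slightly cheaper alternative is Spark's built-in size function on the array column, e.g. this minimal sketch:

    from pyspark.sql.functions import col, size

    tokenized.select("sentence", "words") \
        .withColumn("tokens", size(col("words"))).show(truncate=False)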
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":
# Make predictions on test data. model is the model with combination of parameters
# that performed best.
- prediction = model.transform(test)
- for row in prediction.take(5):
- print(row)
+ model.transform(test)\
+ .select("features", "label", "prediction")\
+ .show()
+
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
outputCol="features")
output = assembler.transform(dataset)
- print(output.select("features", "clicked").first())
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
+ categoricalFeatures = indexerModel.categoryMaps
+ print("Chose %d categorical features: %s" %
+ (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
- for feature in result.select("result").take(3):
- print(feature)
+ for row in result.collect():
+ text, vector = row
+ print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
"""
This is an example implementation of PageRank. For more conventional use,
please refer to the PageRank implementation provided by GraphX.
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
"""
from __future__ import print_function
@@ -46,8 +49,8 @@ if __name__ == "__main__":
print("Usage: pagerank <file> <iterations>", file=sys.stderr)
exit(-1)
- print("""WARN: This is a naive implementation of PageRank and is
- given as an example! Please refer to PageRank implementation provided by graphx""",
+ print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+ "Please refer to PageRank implementation provided by graphx",
file=sys.stderr)
# Initialize the spark context.
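
Note: the script expects an input file with one edge per line as a whitespace-separated pair of page identifiers; a hypothetical three-page input illustrating the format (the names are made up for this sketch):

    page1 page2
    page2 page1
    page3 page1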