Diffstat (limited to 'examples/src/main/python'): 26 files changed, 120 insertions, 62 deletions
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
index 4224a27dbe..669bb2aeab 100644
--- a/examples/src/main/python/ml/binarizer_example.py
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -33,12 +33,14 @@ if __name__ == "__main__":
         (0, 0.1),
         (1, 0.8),
         (2, 0.2)
-    ], ["label", "feature"])
+    ], ["id", "feature"])
+
     binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+
     binarizedDataFrame = binarizer.transform(continuousDataFrame)
-    binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-    for binarized_feature, in binarizedFeatures.collect():
-        print(binarized_feature)
+
+    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
+    binarizedDataFrame.show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
index 8177e560dd..742f35093b 100644
--- a/examples/src/main/python/ml/bucketizer_example.py
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -31,13 +31,15 @@ if __name__ == "__main__":
     # $example on$
     splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

-    data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
     dataFrame = spark.createDataFrame(data, ["features"])

     bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

     # Transform original data into its bucket index.
     bucketedData = bucketizer.transform(dataFrame)
+
+    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
     bucketedData.show()
     # $example off$
diff --git a/examples/src/main/python/ml/chisq_selector_example.py b/examples/src/main/python/ml/chisq_selector_example.py
index 5e19ef1624..028a9ea9d6 100644
--- a/examples/src/main/python/ml/chisq_selector_example.py
+++ b/examples/src/main/python/ml/chisq_selector_example.py
@@ -39,6 +39,8 @@ if __name__ == "__main__":
                              outputCol="selectedFeatures", labelCol="clicked")

     result = selector.fit(df).transform(df)
+
+    print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
     result.show()
     # $example off$
diff --git a/examples/src/main/python/ml/count_vectorizer_example.py b/examples/src/main/python/ml/count_vectorizer_example.py
index 38cfac82fb..f2e41db77d 100644
--- a/examples/src/main/python/ml/count_vectorizer_example.py
+++ b/examples/src/main/python/ml/count_vectorizer_example.py
@@ -37,9 +37,11 @@ if __name__ == "__main__":

     # fit a CountVectorizerModel from the corpus.
     cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
+
     model = cv.fit(df)
+
     result = model.transform(df)
-    result.show()
+    result.show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/dct_example.py b/examples/src/main/python/ml/dct_example.py
index a4f25df784..c0457f8d0f 100644
--- a/examples/src/main/python/ml/dct_example.py
+++ b/examples/src/main/python/ml/dct_example.py
@@ -39,8 +39,7 @@ if __name__ == "__main__":

     dctDf = dct.transform(df)

-    for dcts in dctDf.select("featuresDCT").take(3):
-        print(dcts)
+    dctDf.select("featuresDCT").show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index edc258de05..8ad450b669 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -38,11 +38,11 @@ if __name__ == "__main__":
     # loads data
     dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

-    gmm = GaussianMixture().setK(2)
+    gmm = GaussianMixture().setK(2).setSeed(538009335)
     model = gmm.fit(dataset)

-    print("Gaussians: ")
-    model.gaussiansDF.show()
+    print("Gaussians shown as a DataFrame: ")
+    model.gaussiansDF.show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/index_to_string_example.py b/examples/src/main/python/ml/index_to_string_example.py
index 523caac00c..33d104e8e3 100644
--- a/examples/src/main/python/ml/index_to_string_example.py
+++ b/examples/src/main/python/ml/index_to_string_example.py
@@ -33,14 +33,22 @@ if __name__ == "__main__":
         [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
         ["id", "category"])

-    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-    model = stringIndexer.fit(df)
+    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+    model = indexer.fit(df)
     indexed = model.transform(df)

+    print("Transformed string column '%s' to indexed column '%s'"
+          % (indexer.getInputCol(), indexer.getOutputCol()))
+    indexed.show()
+
+    print("StringIndexer will store labels in output column metadata\n")
+
     converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
     converted = converter.transform(indexed)

-    converted.select("id", "originalCategory").show()
+    print("Transformed indexed column '%s' back to original string column '%s' using "
+          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
+    converted.select("id", "categoryIndex", "originalCategory").show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index a41b8ffacb..6ae15f1b4b 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -44,8 +44,8 @@ if __name__ == "__main__":

     # Trains an isotonic regression model.
     model = IsotonicRegression().fit(dataset)
-    print("Boundaries in increasing order: " + str(model.boundaries))
-    print("Predictions associated with the boundaries: " + str(model.predictions))
+    print("Boundaries in increasing order: %s\n" % str(model.boundaries))
+    print("Predictions associated with the boundaries: %s\n" % str(model.predictions))

     # Makes predictions.
     model.transform(dataset).show()
diff --git a/examples/src/main/python/ml/linear_regression_with_elastic_net.py b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
index 620ab5b87e..6639e9160a 100644
--- a/examples/src/main/python/ml/linear_regression_with_elastic_net.py
+++ b/examples/src/main/python/ml/linear_regression_with_elastic_net.py
@@ -39,8 +39,16 @@ if __name__ == "__main__":
     lrModel = lr.fit(training)

     # Print the coefficients and intercept for linear regression
-    print("Coefficients: " + str(lrModel.coefficients))
-    print("Intercept: " + str(lrModel.intercept))
+    print("Coefficients: %s" % str(lrModel.coefficients))
+    print("Intercept: %s" % str(lrModel.intercept))
+
+    # Summarize the model over the training set and print out some metrics
+    trainingSummary = lrModel.summary
+    print("numIterations: %d" % trainingSummary.totalIterations)
+    print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
+    trainingSummary.residuals.show()
+    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
+    print("r2: %f" % trainingSummary.r2)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/max_abs_scaler_example.py b/examples/src/main/python/ml/max_abs_scaler_example.py
index ab91198b08..45eda3cdad 100644
--- a/examples/src/main/python/ml/max_abs_scaler_example.py
+++ b/examples/src/main/python/ml/max_abs_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

 # $example on$
 from pyspark.ml.feature import MaxAbsScaler
+from pyspark.ml.linalg import Vectors
 # $example off$
 from pyspark.sql import SparkSession

@@ -29,7 +30,11 @@ if __name__ == "__main__":
         .getOrCreate()

     # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.1, -8.0]),),
+        (1, Vectors.dense([2.0, 1.0, -4.0]),),
+        (2, Vectors.dense([4.0, 10.0, 8.0]),)
+    ], ["id", "features"])

     scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

@@ -38,7 +43,8 @@ if __name__ == "__main__":

     # rescale each feature to range [-1, 1].
     scaledData = scalerModel.transform(dataFrame)
-    scaledData.show()
+
+    scaledData.select("features", "scaledFeatures").show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/min_max_scaler_example.py b/examples/src/main/python/ml/min_max_scaler_example.py
index e3e7bc205b..b5f272e59b 100644
--- a/examples/src/main/python/ml/min_max_scaler_example.py
+++ b/examples/src/main/python/ml/min_max_scaler_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

 # $example on$
 from pyspark.ml.feature import MinMaxScaler
+from pyspark.ml.linalg import Vectors
 # $example off$
 from pyspark.sql import SparkSession

@@ -29,7 +30,11 @@ if __name__ == "__main__":
         .getOrCreate()

     # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.1, -1.0]),),
+        (1, Vectors.dense([2.0, 1.1, 1.0]),),
+        (2, Vectors.dense([3.0, 10.1, 3.0]),)
+    ], ["id", "features"])

     scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

@@ -38,7 +43,8 @@ if __name__ == "__main__":

     # rescale each feature to range [min, max].
     scaledData = scalerModel.transform(dataFrame)
-    scaledData.show()
+    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
+    scaledData.select("features", "scaledFeatures").show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index 2cc38c2855..88fc69f753 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -52,7 +52,7 @@ if __name__ == "__main__":
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
-    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 55263adb46..31676e076a 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -33,13 +33,12 @@ if __name__ == "__main__":
         (0, ["Hi", "I", "heard", "about", "Spark"]),
         (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
         (2, ["Logistic", "regression", "models", "are", "neat"])
-    ], ["label", "words"])
+    ], ["id", "words"])

-    ngram = NGram(inputCol="words", outputCol="ngrams")
-    ngramDataFrame = ngram.transform(wordDataFrame)
+    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

-    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
-        print(ngrams_label)
+    ngramDataFrame = ngram.transform(wordDataFrame)
+    ngramDataFrame.select("ngrams").show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index aa23f298c8..7290ab81cd 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -45,11 +45,15 @@ if __name__ == "__main__":
     # train the model
     model = nb.fit(train)

+    # select example rows to display.
+    predictions = model.transform(test)
+    predictions.show()
+
     # compute accuracy on the test set
-    result = model.transform(test)
-    predictionAndLabels = result.select("prediction", "label")
-    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
-    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
+    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
+                                                  metricName="accuracy")
+    accuracy = evaluator.evaluate(predictions)
+    print("Test set accuracy = " + str(accuracy))
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
index 19012f51f4..510bd825fd 100644
--- a/examples/src/main/python/ml/normalizer_example.py
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -19,6 +19,7 @@ from __future__ import print_function

 # $example on$
 from pyspark.ml.feature import Normalizer
+from pyspark.ml.linalg import Vectors
 # $example off$
 from pyspark.sql import SparkSession

@@ -29,15 +30,21 @@ if __name__ == "__main__":
         .getOrCreate()

     # $example on$
-    dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    dataFrame = spark.createDataFrame([
+        (0, Vectors.dense([1.0, 0.5, -1.0]),),
+        (1, Vectors.dense([2.0, 1.0, 1.0]),),
+        (2, Vectors.dense([4.0, 10.0, 2.0]),)
+    ], ["id", "features"])

     # Normalize each Vector using $L^1$ norm.
     normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
     l1NormData = normalizer.transform(dataFrame)
+    print("Normalized using L^1 norm")
     l1NormData.show()

     # Normalize each Vector using $L^\infty$ norm.
     lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+    print("Normalized using L^inf norm")
     lInfNormData.show()
     # $example off$
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index 47faf8d202..e1996c7f0a 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -42,9 +42,9 @@ if __name__ == "__main__":
     model = stringIndexer.fit(df)
     indexed = model.transform(df)

-    encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+    encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
     encoded = encoder.transform(indexed)
-    encoded.select("id", "categoryVec").show()
+    encoded.show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index 2d0865578a..f63e4db434 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -60,9 +60,10 @@ if __name__ == "__main__":

     # Make predictions on test documents and print columns of interest.
     prediction = model.transform(test)
-    selected = prediction.select("id", "text", "prediction")
+    selected = prediction.select("id", "text", "probability", "prediction")
     for row in selected.collect():
-        print(row)
+        rid, text, prob, prediction = row
+        print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b464ee86b6..40bcb7b13a 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -31,16 +31,15 @@ if __name__ == "__main__":

     # $example on$
     df = spark.createDataFrame([
-        (Vectors.dense([-2.0, 2.3]),),
+        (Vectors.dense([2.0, 1.0]),),
         (Vectors.dense([0.0, 0.0]),),
-        (Vectors.dense([0.6, -1.1]),)
+        (Vectors.dense([3.0, -1.0]),)
     ], ["features"])

-    px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
-    polyDF = px.transform(df)
+    polyExpansion = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
+    polyDF = polyExpansion.transform(df)

-    for expanded in polyDF.select("polyFeatures").take(3):
-        print(expanded)
+    polyDF.show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 8a8392cc1f..3b8e7855e3 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -32,7 +32,7 @@ if __name__ == "__main__":
     sentenceData = spark.createDataFrame([
         (0, ["I", "saw", "the", "red", "balloon"]),
         (1, ["Mary", "had", "a", "little", "lamb"])
-    ], ["label", "raw"])
+    ], ["id", "raw"])

     remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
     remover.transform(sentenceData).show(truncate=False)
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index 4ab7eb6964..d43244fa68 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -30,9 +30,9 @@ if __name__ == "__main__":

     # $example on$
     sentenceData = spark.createDataFrame([
-        (0, "Hi I heard about Spark"),
-        (0, "I wish Java could use case classes"),
-        (1, "Logistic regression models are neat")
+        (0.0, "Hi I heard about Spark"),
+        (0.0, "I wish Java could use case classes"),
+        (1.0, "Logistic regression models are neat")
     ], ["label", "sentence"])

     tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
@@ -46,8 +46,7 @@ if __name__ == "__main__":
     idfModel = idf.fit(featurizedData)
     rescaledData = idfModel.transform(featurizedData)

-    for features_label in rescaledData.select("features", "label").take(3):
-        print(features_label)
+    rescaledData.select("label", "features").show()
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index 89f5060705..5c65c5c9f8 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -19,6 +19,8 @@ from __future__ import print_function

 # $example on$
 from pyspark.ml.feature import Tokenizer, RegexTokenizer
+from pyspark.sql.functions import col, udf
+from pyspark.sql.types import IntegerType
 # $example off$
 from pyspark.sql import SparkSession

@@ -33,20 +35,22 @@ if __name__ == "__main__":
         (0, "Hi I heard about Spark"),
         (1, "I wish Java could use case classes"),
         (2, "Logistic,regression,models,are,neat")
-    ], ["label", "sentence"])
+    ], ["id", "sentence"])

     tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

     regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
     # alternatively, pattern="\\w+", gaps(False)

+    countTokens = udf(lambda words: len(words), IntegerType())
+
     tokenized = tokenizer.transform(sentenceDataFrame)
-    for words_label in tokenized.select("words", "label").take(3):
-        print(words_label)
+    tokenized.select("sentence", "words")\
+        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

     regexTokenized = regexTokenizer.transform(sentenceDataFrame)
-    for words_label in regexTokenized.select("words", "label").take(3):
-        print(words_label)
+    regexTokenized.select("sentence", "words") \
+        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index a92b861f83..d104f7d30a 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -66,8 +66,9 @@ if __name__ == "__main__":
     # Make predictions on test data. model is the model with combination of parameters
     # that performed best.
-    prediction = model.transform(test)
-    for row in prediction.take(5):
-        print(row)
+    model.transform(test)\
+        .select("features", "label", "prediction")\
+        .show()
+
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index eac33711ad..98de1d5ea7 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -39,7 +39,8 @@ if __name__ == "__main__":
         outputCol="features")

     output = assembler.transform(dataset)
-    print(output.select("features", "clicked").first())
+    print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+    output.select("features", "clicked").show(truncate=False)
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 3912c135be..5c2956077d 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
     indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
     indexerModel = indexer.fit(data)

+    categoricalFeatures = indexerModel.categoryMaps
+    print("Chose %d categorical features: %s" %
+          (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))
+
     # Create new column "indexed" with categorical values transformed to indices
     indexedData = indexerModel.transform(data)
     indexedData.show()
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 78a91c92fc..77f8951df0 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -41,8 +41,9 @@ if __name__ == "__main__":
     model = word2Vec.fit(documentDF)

     result = model.transform(documentDF)
-    for feature in result.select("result").take(3):
-        print(feature)
+    for row in result.collect():
+        text, vector = row
+        print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
     # $example off$

     spark.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index a399a9c37c..0d6c253d39 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -18,6 +18,9 @@
 """
 This is an example implementation of PageRank. For more conventional use,
 Please refer to PageRank implementation provided by graphx
+
+Example Usage:
+bin/spark-submit examples/src/main/python/pagerank.py data/mllib/pagerank_data.txt 10
 """
 from __future__ import print_function

@@ -46,8 +49,8 @@ if __name__ == "__main__":
         print("Usage: pagerank <file> <iterations>", file=sys.stderr)
         exit(-1)

-    print("""WARN: This is a naive implementation of PageRank and is
-          given as an example! Please refer to PageRank implementation provided by graphx""",
+    print("WARN: This is a naive implementation of PageRank and is given as an example!\n" +
+          "Please refer to PageRank implementation provided by graphx",
           file=sys.stderr)

     # Initialize the spark context.