From be88383e15a86d094963de5f7e8792510bc990de Mon Sep 17 00:00:00 2001
From: GayathriMurali
Date: Fri, 24 Jun 2016 13:25:40 +0200
Subject: [SPARK-15997][DOC][ML] Update user guide for HashingTF,
 QuantileDiscretizer and CountVectorizer

## What changes were proposed in this pull request?

Updated the user guide and examples for HashingTF, QuantileDiscretizer and CountVectorizer.

Author: GayathriMurali

Closes #13745 from GayathriMurali/SPARK-15997.
---
 examples/src/main/python/ml/quantile_discretizer_example.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 6ae7bb18f8..5444cacd95 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -28,11 +28,16 @@ if __name__ == "__main__":
 
     # $example on$
     data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
-    dataFrame = spark.createDataFrame(data, ["id", "hour"])
-
+    df = spark.createDataFrame(data, ["id", "hour"])
+    # $example off$
+    # Output of QuantileDiscretizer for such small datasets can depend on the number of
+    # partitions. Here we force a single partition to ensure consistent results.
+    # Note this is not necessary for normal use cases.
+    df = df.repartition(1)
+    # $example on$
     discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
 
-    result = discretizer.fit(dataFrame).transform(dataFrame)
+    result = discretizer.fit(df).transform(df)
     result.show()
     # $example off$
 
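
For context, the snippet below is a self-contained sketch of what the patched example does end to end. It is not part of the commit: the SparkSession setup, the app name "QuantileDiscretizerExample", and the final spark.stop() call are reconstructed assumptions, while the QuantileDiscretizer usage and the repartition(1) workaround mirror the diff above.

# Minimal sketch of the patched example; assumes a local PySpark installation.
# SparkSession boilerplate is reconstructed and not part of the diff above.
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()

    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = spark.createDataFrame(data, ["id", "hour"])

    # On a dataset this small, the computed quantile splits can vary with the number
    # of partitions, so the example forces a single partition for deterministic buckets.
    # This step is not needed for normal-sized data.
    df = df.repartition(1)

    # Bin the continuous "hour" column into 3 quantile-based buckets.
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()

    spark.stop()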