From be88383e15a86d094963de5f7e8792510bc990de Mon Sep 17 00:00:00 2001
From: GayathriMurali
Date: Fri, 24 Jun 2016 13:25:40 +0200
Subject: [SPARK-15997][DOC][ML] Update user guide for HashingTF,
 QuantileDiscretizer and CountVectorizer

## What changes were proposed in this pull request?

Updated the user guide and examples for HashingTF, QuantileDiscretizer and CountVectorizer.

Author: GayathriMurali

Closes #13745 from GayathriMurali/SPARK-15997.
---
 examples/src/main/python/ml/quantile_discretizer_example.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 6ae7bb18f8..5444cacd95 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -28,11 +28,16 @@ if __name__ == "__main__":
 
     # $example on$
     data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
-    dataFrame = spark.createDataFrame(data, ["id", "hour"])
-
+    df = spark.createDataFrame(data, ["id", "hour"])
+    # $example off$
+    # Output of QuantileDiscretizer for such small datasets can depend on the number of
+    # partitions. Here we force a single partition to ensure consistent results.
+    # Note this is not necessary for normal use cases.
+    df = df.repartition(1)
+    # $example on$
     discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
 
-    result = discretizer.fit(dataFrame).transform(dataFrame)
+    result = discretizer.fit(df).transform(df)
     result.show()
     # $example off$
 
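
For context, the snippet below is a self-contained sketch of what the patched example does end to end. It is not part of the commit: the SparkSession setup, the app name "QuantileDiscretizerExample", and the final spark.stop() call are reconstructed assumptions, while the QuantileDiscretizer usage and the repartition(1) workaround mirror the diff above.

# Minimal sketch of the patched example; assumes a local PySpark installation.
# SparkSession boilerplate is reconstructed and not part of the diff above.
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()

    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = spark.createDataFrame(data, ["id", "hour"])

    # On a dataset this small, the computed quantile splits can vary with the number
    # of partitions, so the example forces a single partition for deterministic buckets.
    # This step is not needed for normal-sized data.
    df = df.repartition(1)

    # Bin the continuous "hour" column into 3 quantile-based buckets.
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()

    spark.stop()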