about | summary | refs | log | tree | commit | diff
path: root/examples/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'examples/src/main')
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java | 7
-rw-r--r-- examples/src/main/python/ml/quantile_discretizer_example.py | 11
-rw-r--r-- examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala | 9
3 files changed, 21 insertions, 6 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
index 16f58a852d..dd20cac621 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java
@@ -54,7 +54,12 @@ public class JavaQuantileDiscretizerExample {
});
Dataset<Row> df = spark.createDataFrame(data, schema);
-
+ // $example off$
+ // Output of QuantileDiscretizer for such small datasets can depend on the number of
+ // partitions. Here we force a single partition to ensure consistent results.
+ // Note this is not necessary for normal use cases
+ df = df.repartition(1);
+ // $example on$
QuantileDiscretizer discretizer = new QuantileDiscretizer()
.setInputCol("hour")
.setOutputCol("result")
diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 6ae7bb18f8..5444cacd95 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -28,11 +28,16 @@ if __name__ == "__main__":
# $example on$
data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
- dataFrame = spark.createDataFrame(data, ["id", "hour"])
-
+ df = spark.createDataFrame(data, ["id", "hour"])
+ # $example off$
+ # Output of QuantileDiscretizer for such small datasets can depend on the number of
+ # partitions. Here we force a single partition to ensure consistent results.
+ # Note this is not necessary for normal use cases
+ df = df.repartition(1)
+ # $example on$
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
- result = discretizer.fit(dataFrame).transform(dataFrame)
+ result = discretizer.fit(df).transform(df)
result.show()
# $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
index 1a16515594..2f7e217b8f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
@@ -32,8 +32,13 @@ object QuantileDiscretizerExample {
// $example on$
val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
- val df = spark.createDataFrame(data).toDF("id", "hour")
-
+ var df = spark.createDataFrame(data).toDF("id", "hour")
+ // $example off$
+ // Output of QuantileDiscretizer for such small datasets can depend on the number of
+ // partitions. Here we force a single partition to ensure consistent results.
+ // Note this is not necessary for normal use cases
+ .repartition(1)
+ // $example on$
val discretizer = new QuantileDiscretizer()
.setInputCol("hour")
.setOutputCol("result")