aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main
diff options
context:
space:
mode:
author: Bryan Cutler <cutlerb@gmail.com>, 2016-06-27 12:58:39 -0700
committer: Xiangrui Meng <meng@databricks.com>, 2016-06-27 12:58:39 -0700
commit: 1aa191e58e905f470f73663fc1c35f36e05e929a (patch)
tree: 00785ffc958467598b6bb9e0d166f70591f78098 /examples/src/main
parent: c17b1abff8f8c6d24cb0cf4ff4f8c14a780c64b0 (diff)
download: spark-1aa191e58e905f470f73663fc1c35f36e05e929a.tar.gz
spark-1aa191e58e905f470f73663fc1c35f36e05e929a.tar.bz2
spark-1aa191e58e905f470f73663fc1c35f36e05e929a.zip
[SPARK-16231][PYSPARK][ML][EXAMPLES] dataframe_example.py fails to convert ML style vectors
## What changes were proposed in this pull request?

ML-style Vectors need to be converted to the old MLlib style before doing Statistics.colStats operations on the DataFrame.

## How was this patch tested?

Ran the example and local tests.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #13928 from BryanCutler/pyspark-ml-example-vector-conv-SPARK-16231.
Diffstat (limited to 'examples/src/main')
-rw-r--r-- examples/src/main/python/ml/dataframe_example.py | 4
1 file changed, 3 insertions, 1 deletion
diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py
index a7d8b9056d..c1818d72fe 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -28,6 +28,7 @@ import shutil
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
+from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
if len(sys.argv) > 2:
@@ -55,7 +56,8 @@ if __name__ == "__main__":
labelSummary.show()
# Convert features column to an RDD of vectors.
- features = df.select("features").rdd.map(lambda r: r.features)
+ features = MLUtils.convertVectorColumnsFromML(df, "features") \
+ .select("features").rdd.map(lambda r: r.features)
summary = Statistics.colStats(features)
print("Selected features column with average values:\n" +
str(summary.mean()))