author     Davies Liu <davies@databricks.com>     2015-04-16 16:20:57 -0700
committer  Josh Rosen <joshrosen@databricks.com>  2015-04-16 16:20:57 -0700
commit     04e44b37cc04f62fbf9e08c7076349e0a4d12ea8 (patch)
tree       b6429253955210445ddc37faa4d5166ea25a91e2 /examples/src/main/python/mllib
parent     55f553a979db925aa0c3559f7e80b99d2bf3feb4 (diff)
[SPARK-4897] [PySpark] Python 3 support
This PR updates PySpark to support Python 3 (tested with 3.4).

Known issue: unpickling arrays from Pyrolite is broken in Python 3, so those tests are skipped.

TODO: ec2/spark-ec2.py is not fully tested with Python 3.

Author: Davies Liu <davies@databricks.com>
Author: twneale <twneale@gmail.com>
Author: Josh Rosen <joshrosen@databricks.com>

Closes #5173 from davies/python3 and squashes the following commits:

d7d6323 [Davies Liu] fix tests
6c52a98 [Davies Liu] fix mllib test
99e334f [Davies Liu] update timeout
b716610 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
cafd5ec [Davies Liu] adddress comments from @mengxr
bf225d7 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
179fc8d [Davies Liu] tuning flaky tests
8c8b957 [Davies Liu] fix ResourceWarning in Python 3
5c57c95 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
4006829 [Davies Liu] fix test
2fc0066 [Davies Liu] add python3 path
71535e9 [Davies Liu] fix xrange and divide
5a55ab4 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
125f12c [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
ed498c8 [Davies Liu] fix compatibility with python 3
820e649 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
e8ce8c9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
ad7c374 [Davies Liu] fix mllib test and warning
ef1fc2f [Davies Liu] fix tests
4eee14a [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
20112ff [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
59bb492 [Davies Liu] fix tests
1da268c [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
ca0fdd3 [Davies Liu] fix code style
9563a15 [Davies Liu] add imap back for python 2
0b1ec04 [Davies Liu] make python examples work with Python 3
d2fd566 [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
a716d34 [Davies Liu] test with python 3.4
f1700e8 [Davies Liu] fix test in python3
671b1db [Davies Liu] fix test in python3
692ff47 [Davies Liu] fix flaky test
7b9699f [Davies Liu] invalidate import cache for Python 3.3+
9c58497 [Davies Liu] fix kill worker
309bfbf [Davies Liu] keep compatibility
5707476 [Davies Liu] cleanup, fix hash of string in 3.3+
8662d5b [Davies Liu] Merge branch 'master' of github.com:apache/spark into python3
f53e1f0 [Davies Liu] fix tests
70b6b73 [Davies Liu] compile ec2/spark_ec2.py in python 3
a39167e [Davies Liu] support customize class in __main__
814c77b [Davies Liu] run unittests with python 3
7f4476e [Davies Liu] mllib tests passed
d737924 [Davies Liu] pass ml tests
375ea17 [Davies Liu] SQL tests pass
6cc42a9 [Davies Liu] rename
431a8de [Davies Liu] streaming tests pass
78901a7 [Davies Liu] fix hash of serializer in Python 3
24b2f2e [Davies Liu] pass all RDD tests
35f48fe [Davies Liu] run future again
1eebac2 [Davies Liu] fix conflict in ec2/spark_ec2.py
6e3c21d [Davies Liu] make cloudpickle work with Python3
2fb2db3 [Josh Rosen] Guard more changes behind sys.version; still doesn't run
1aa5e8f [twneale] Turned out `pickle.DictionaryType is dict` == True, so swapped it out
7354371 [twneale] buffer --> memoryview I'm not super sure if this a valid change, but the 2.7 docs recommend using memoryview over buffer where possible, so hoping it'll work.
b69ccdf [twneale] Uses the pure python pickle._Pickler instead of c-extension _pickle.Pickler. It appears pyspark 2.7 uses the pure python pickler as well, so this shouldn't degrade pickling performance (?).
f40d925 [twneale] xrange --> range
e104215 [twneale] Replaces 2.7 types.InstsanceType with 3.4 `object`....could be horribly wrong depending on how types.InstanceType is used elsewhere in the package--see http://bugs.python.org/issue8206
79de9d0 [twneale] Replaces python2.7 `file` with 3.4 _io.TextIOWrapper
2adb42d [Josh Rosen] Fix up some import differences between Python 2 and 3
854be27 [Josh Rosen] Run `futurize` on Python code
7c5b4ce [Josh Rosen] Remove Python 3 check in shell.py
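
Most of the changes under examples/src/main/python/mllib follow one mechanical pattern: each example gains `from __future__ import print_function` and its Python 2 print statements become print() calls, so the same file runs on both Python 2.7 and Python 3.4. A minimal sketch of that pattern (the usage string and counts below are illustrative, not taken from the diff):

from __future__ import print_function  # must precede the other imports

import sys

# Python 2 only:   print >> sys.stderr, "Usage: example <file>"
# Python 2 and 3:
print("Usage: example <file>", file=sys.stderr)

# Python 2 only:   print "loaded %d points" % 42
# Python 2 and 3:
print("loaded %d points" % 42)

# A bare `print` statement becomes an empty call; both emit a single newline.
print()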
Diffstat (limited to 'examples/src/main/python/mllib')
-rwxr-xr-x  examples/src/main/python/mllib/correlations.py             19
-rw-r--r--  examples/src/main/python/mllib/dataset_example.py          13
-rwxr-xr-x  examples/src/main/python/mllib/decision_tree_runner.py     29
-rw-r--r--  examples/src/main/python/mllib/gaussian_mixture_model.py    9
-rw-r--r--  examples/src/main/python/mllib/gradient_boosted_trees.py    7
-rwxr-xr-x  examples/src/main/python/mllib/kmeans.py                    5
-rwxr-xr-x  examples/src/main/python/mllib/logistic_regression.py       9
-rwxr-xr-x  examples/src/main/python/mllib/random_forest_example.py     9
-rwxr-xr-x  examples/src/main/python/mllib/random_rdd_generation.py    21
-rwxr-xr-x  examples/src/main/python/mllib/sampled_rdds.py             29
-rw-r--r--  examples/src/main/python/mllib/word2vec.py                   5
11 files changed, 81 insertions, 74 deletions
diff --git a/examples/src/main/python/mllib/correlations.py b/examples/src/main/python/mllib/correlations.py
index 4218eca822..0e13546b88 100755
--- a/examples/src/main/python/mllib/correlations.py
+++ b/examples/src/main/python/mllib/correlations.py
@@ -18,6 +18,7 @@
"""
Correlations using MLlib.
"""
+from __future__ import print_function
import sys
@@ -29,7 +30,7 @@ from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
- print >> sys.stderr, "Usage: correlations (<file>)"
+ print("Usage: correlations (<file>)", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="PythonCorrelations")
if len(sys.argv) == 2:
@@ -41,20 +42,20 @@ if __name__ == "__main__":
points = MLUtils.loadLibSVMFile(sc, filepath)\
.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
- print
- print 'Summary of data file: ' + filepath
- print '%d data points' % points.count()
+ print()
+ print('Summary of data file: ' + filepath)
+ print('%d data points' % points.count())
# Statistics (correlations)
- print
- print 'Correlation (%s) between label and each feature' % corrType
- print 'Feature\tCorrelation'
+ print()
+ print('Correlation (%s) between label and each feature' % corrType)
+ print('Feature\tCorrelation')
numFeatures = points.take(1)[0].features.size
labelRDD = points.map(lambda lp: lp.label)
for i in range(numFeatures):
featureRDD = points.map(lambda lp: lp.features[i])
corr = Statistics.corr(labelRDD, featureRDD, corrType)
- print '%d\t%g' % (i, corr)
- print
+ print('%d\t%g' % (i, corr))
+ print()
sc.stop()
diff --git a/examples/src/main/python/mllib/dataset_example.py b/examples/src/main/python/mllib/dataset_example.py
index fcbf56cbf0..e23ecc0c5d 100644
--- a/examples/src/main/python/mllib/dataset_example.py
+++ b/examples/src/main/python/mllib/dataset_example.py
@@ -19,6 +19,7 @@
An example of how to use DataFrame as a dataset for ML. Run with::
bin/spark-submit examples/src/main/python/mllib/dataset_example.py
"""
+from __future__ import print_function
import os
import sys
@@ -32,16 +33,16 @@ from pyspark.mllib.stat import Statistics
def summarize(dataset):
- print "schema: %s" % dataset.schema().json()
+ print("schema: %s" % dataset.schema().json())
labels = dataset.map(lambda r: r.label)
- print "label average: %f" % labels.mean()
+ print("label average: %f" % labels.mean())
features = dataset.map(lambda r: r.features)
summary = Statistics.colStats(features)
- print "features average: %r" % summary.mean()
+ print("features average: %r" % summary.mean())
if __name__ == "__main__":
if len(sys.argv) > 2:
- print >> sys.stderr, "Usage: dataset_example.py <libsvm file>"
+ print("Usage: dataset_example.py <libsvm file>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="DatasetExample")
sqlContext = SQLContext(sc)
@@ -54,9 +55,9 @@ if __name__ == "__main__":
summarize(dataset0)
tempdir = tempfile.NamedTemporaryFile(delete=False).name
os.unlink(tempdir)
- print "Save dataset as a Parquet file to %s." % tempdir
+ print("Save dataset as a Parquet file to %s." % tempdir)
dataset0.saveAsParquetFile(tempdir)
- print "Load it back and summarize it again."
+ print("Load it back and summarize it again.")
dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
summarize(dataset1)
shutil.rmtree(tempdir)
diff --git a/examples/src/main/python/mllib/decision_tree_runner.py b/examples/src/main/python/mllib/decision_tree_runner.py
index fccabd841b..513ed8fd51 100755
--- a/examples/src/main/python/mllib/decision_tree_runner.py
+++ b/examples/src/main/python/mllib/decision_tree_runner.py
@@ -20,6 +20,7 @@ Decision tree classification and regression using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
+from __future__ import print_function
import numpy
import os
@@ -83,18 +84,17 @@ def reindexClassLabels(data):
numClasses = len(classCounts)
# origToNewLabels: class --> index in 0,...,numClasses-1
if (numClasses < 2):
- print >> sys.stderr, \
- "Dataset for classification should have at least 2 classes." + \
- " The given dataset had only %d classes." % numClasses
+ print("Dataset for classification should have at least 2 classes."
+ " The given dataset had only %d classes." % numClasses, file=sys.stderr)
exit(1)
origToNewLabels = dict([(sortedClasses[i], i) for i in range(0, numClasses)])
- print "numClasses = %d" % numClasses
- print "Per-class example fractions, counts:"
- print "Class\tFrac\tCount"
+ print("numClasses = %d" % numClasses)
+ print("Per-class example fractions, counts:")
+ print("Class\tFrac\tCount")
for c in sortedClasses:
frac = classCounts[c] / (numExamples + 0.0)
- print "%g\t%g\t%d" % (c, frac, classCounts[c])
+ print("%g\t%g\t%d" % (c, frac, classCounts[c]))
if (sortedClasses[0] == 0 and sortedClasses[-1] == numClasses - 1):
return (data, origToNewLabels)
@@ -105,8 +105,7 @@ def reindexClassLabels(data):
def usage():
- print >> sys.stderr, \
- "Usage: decision_tree_runner [libsvm format data filepath]"
+ print("Usage: decision_tree_runner [libsvm format data filepath]", file=sys.stderr)
exit(1)
@@ -133,13 +132,13 @@ if __name__ == "__main__":
model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
categoricalFeaturesInfo=categoricalFeaturesInfo)
# Print learned tree and stats.
- print "Trained DecisionTree for classification:"
- print " Model numNodes: %d" % model.numNodes()
- print " Model depth: %d" % model.depth()
- print " Training accuracy: %g" % getAccuracy(model, reindexedData)
+ print("Trained DecisionTree for classification:")
+ print(" Model numNodes: %d" % model.numNodes())
+ print(" Model depth: %d" % model.depth())
+ print(" Training accuracy: %g" % getAccuracy(model, reindexedData))
if model.numNodes() < 20:
- print model.toDebugString()
+ print(model.toDebugString())
else:
- print model
+ print(model)
sc.stop()
diff --git a/examples/src/main/python/mllib/gaussian_mixture_model.py b/examples/src/main/python/mllib/gaussian_mixture_model.py
index a2cd626c9f..2cb8010cdc 100644
--- a/examples/src/main/python/mllib/gaussian_mixture_model.py
+++ b/examples/src/main/python/mllib/gaussian_mixture_model.py
@@ -18,7 +18,8 @@
"""
A Gaussian Mixture Model clustering program using MLlib.
"""
-import sys
+from __future__ import print_function
+
import random
import argparse
import numpy as np
@@ -59,7 +60,7 @@ if __name__ == "__main__":
model = GaussianMixture.train(data, args.k, args.convergenceTol,
args.maxIterations, args.seed)
for i in range(args.k):
- print ("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
- "sigma = ", model.gaussians[i].sigma.toArray())
- print ("Cluster labels (first 100): ", model.predict(data).take(100))
+ print(("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
+ "sigma = ", model.gaussians[i].sigma.toArray()))
+ print(("Cluster labels (first 100): ", model.predict(data).take(100)))
sc.stop()
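
Note that the converted gaussian_mixture_model.py keeps the doubled parentheses: in Python 2, `print (a, b, c)` is a print statement applied to a parenthesized tuple and prints the tuple's repr, so the mechanical rewrite to `print((a, b, c))` preserves that exact output on both versions. A single-parenthesis call would instead print space-separated fields. A small illustration (the values here are made up, not from the example):

from __future__ import print_function

weight, mu = 0.5, [1.0, 2.0]   # illustrative values only

print(("weight = ", weight, "mu = ", mu))   # ('weight = ', 0.5, 'mu = ', [1.0, 2.0])
print("weight = ", weight, "mu = ", mu)     # weight =  0.5 mu =  [1.0, 2.0]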
diff --git a/examples/src/main/python/mllib/gradient_boosted_trees.py b/examples/src/main/python/mllib/gradient_boosted_trees.py
index e647773ad9..781bd61c9d 100644
--- a/examples/src/main/python/mllib/gradient_boosted_trees.py
+++ b/examples/src/main/python/mllib/gradient_boosted_trees.py
@@ -18,6 +18,7 @@
"""
Gradient boosted Trees classification and regression using MLlib.
"""
+from __future__ import print_function
import sys
@@ -34,7 +35,7 @@ def testClassification(trainingData, testData):
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
- testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() \
+ testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count() \
/ float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification ensemble model:')
@@ -49,7 +50,7 @@ def testRegression(trainingData, testData):
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
- testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() \
+ testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() \
/ float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression ensemble model:')
@@ -58,7 +59,7 @@ def testRegression(trainingData, testData):
if __name__ == "__main__":
if len(sys.argv) > 1:
- print >> sys.stderr, "Usage: gradient_boosted_trees"
+ print("Usage: gradient_boosted_trees", file=sys.stderr)
exit(1)
sc = SparkContext(appName="PythonGradientBoostedTrees")
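
The lambda rewrites in this file (and in random_forest_example.py below) are forced by PEP 3113: Python 3 removed tuple parameter unpacking, so `lambda (v, p): v != p` is a syntax error there. The pair has to be taken as a single argument and indexed, or unpacked inside the body. A sketch of the same predicate with a plain list standing in for the RDD (the data is illustrative):

# Python 2 only (SyntaxError on Python 3):
#   testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count()

# Works on Python 2 and 3: take the pair as one argument and index into it.
pairs = [(1.0, 1.0), (0.0, 1.0), (1.0, 0.0)]          # illustrative (label, prediction) pairs
errors = [v_p for v_p in pairs if v_p[0] != v_p[1]]   # same predicate as the rewritten lambda
test_err = len(errors) / float(len(pairs))
print(test_err)                                       # 0.666...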
diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py
index 2eeb1abeeb..f901a87fa6 100755
--- a/examples/src/main/python/mllib/kmeans.py
+++ b/examples/src/main/python/mllib/kmeans.py
@@ -20,6 +20,7 @@ A K-means clustering program using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
+from __future__ import print_function
import sys
@@ -34,12 +35,12 @@ def parseVector(line):
if __name__ == "__main__":
if len(sys.argv) != 3:
- print >> sys.stderr, "Usage: kmeans <file> <k>"
+ print("Usage: kmeans <file> <k>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="KMeans")
lines = sc.textFile(sys.argv[1])
data = lines.map(parseVector)
k = int(sys.argv[2])
model = KMeans.train(data, k)
- print "Final centers: " + str(model.clusterCenters)
+ print("Final centers: " + str(model.clusterCenters))
sc.stop()
diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py
index 8cae27fc4a..d4f1d34e2d 100755
--- a/examples/src/main/python/mllib/logistic_regression.py
+++ b/examples/src/main/python/mllib/logistic_regression.py
@@ -20,11 +20,10 @@ Logistic regression using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""
+from __future__ import print_function
-from math import exp
import sys
-import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
@@ -42,12 +41,12 @@ def parsePoint(line):
if __name__ == "__main__":
if len(sys.argv) != 3:
- print >> sys.stderr, "Usage: logistic_regression <file> <iterations>"
+ print("Usage: logistic_regression <file> <iterations>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="PythonLR")
points = sc.textFile(sys.argv[1]).map(parsePoint)
iterations = int(sys.argv[2])
model = LogisticRegressionWithSGD.train(points, iterations)
- print "Final weights: " + str(model.weights)
- print "Final intercept: " + str(model.intercept)
+ print("Final weights: " + str(model.weights))
+ print("Final intercept: " + str(model.intercept))
sc.stop()
diff --git a/examples/src/main/python/mllib/random_forest_example.py b/examples/src/main/python/mllib/random_forest_example.py
index d3c24f7664..4cfdad868c 100755
--- a/examples/src/main/python/mllib/random_forest_example.py
+++ b/examples/src/main/python/mllib/random_forest_example.py
@@ -22,6 +22,7 @@ Note: This example illustrates binary classification.
For information on multiclass classification, please refer to the decision_tree_runner.py
example.
"""
+from __future__ import print_function
import sys
@@ -43,7 +44,7 @@ def testClassification(trainingData, testData):
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
- testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count()\
+ testErr = labelsAndPredictions.filter(lambda v_p: v_p[0] != v_p[1]).count()\
/ float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
@@ -62,8 +63,8 @@ def testRegression(trainingData, testData):
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
- testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum()\
- / float(testData.count())
+ testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
+ .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())
@@ -71,7 +72,7 @@ def testRegression(trainingData, testData):
if __name__ == "__main__":
if len(sys.argv) > 1:
- print >> sys.stderr, "Usage: random_forest_example"
+ print("Usage: random_forest_example", file=sys.stderr)
exit(1)
sc = SparkContext(appName="PythonRandomForestExample")
diff --git a/examples/src/main/python/mllib/random_rdd_generation.py b/examples/src/main/python/mllib/random_rdd_generation.py
index 1e8892741e..729bae30b1 100755
--- a/examples/src/main/python/mllib/random_rdd_generation.py
+++ b/examples/src/main/python/mllib/random_rdd_generation.py
@@ -18,6 +18,7 @@
"""
Randomly generated RDDs.
"""
+from __future__ import print_function
import sys
@@ -27,7 +28,7 @@ from pyspark.mllib.random import RandomRDDs
if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
- print >> sys.stderr, "Usage: random_rdd_generation"
+ print("Usage: random_rdd_generation", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="PythonRandomRDDGeneration")
@@ -37,19 +38,19 @@ if __name__ == "__main__":
# Example: RandomRDDs.normalRDD
normalRDD = RandomRDDs.normalRDD(sc, numExamples)
- print 'Generated RDD of %d examples sampled from the standard normal distribution'\
- % normalRDD.count()
- print ' First 5 samples:'
+ print('Generated RDD of %d examples sampled from the standard normal distribution'
+ % normalRDD.count())
+ print(' First 5 samples:')
for sample in normalRDD.take(5):
- print ' ' + str(sample)
- print
+ print(' ' + str(sample))
+ print()
# Example: RandomRDDs.normalVectorRDD
normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows=numExamples, numCols=2)
- print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
- print ' First 5 samples:'
+ print('Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count())
+ print(' First 5 samples:')
for sample in normalVectorRDD.take(5):
- print ' ' + str(sample)
- print
+ print(' ' + str(sample))
+ print()
sc.stop()
diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py
index 92af3af5eb..b7033ab7da 100755
--- a/examples/src/main/python/mllib/sampled_rdds.py
+++ b/examples/src/main/python/mllib/sampled_rdds.py
@@ -18,6 +18,7 @@
"""
Randomly sampled RDDs.
"""
+from __future__ import print_function
import sys
@@ -27,7 +28,7 @@ from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
- print >> sys.stderr, "Usage: sampled_rdds <libsvm data file>"
+ print("Usage: sampled_rdds <libsvm data file>", file=sys.stderr)
exit(-1)
if len(sys.argv) == 2:
datapath = sys.argv[1]
@@ -41,24 +42,24 @@ if __name__ == "__main__":
examples = MLUtils.loadLibSVMFile(sc, datapath)
numExamples = examples.count()
if numExamples == 0:
- print >> sys.stderr, "Error: Data file had no samples to load."
+ print("Error: Data file had no samples to load.", file=sys.stderr)
exit(1)
- print 'Loaded data with %d examples from file: %s' % (numExamples, datapath)
+ print('Loaded data with %d examples from file: %s' % (numExamples, datapath))
# Example: RDD.sample() and RDD.takeSample()
expectedSampleSize = int(numExamples * fraction)
- print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
- % (fraction, expectedSampleSize)
+ print('Sampling RDD using fraction %g. Expected sample size = %d.'
+ % (fraction, expectedSampleSize))
sampledRDD = examples.sample(withReplacement=True, fraction=fraction)
- print ' RDD.sample(): sample has %d examples' % sampledRDD.count()
+ print(' RDD.sample(): sample has %d examples' % sampledRDD.count())
sampledArray = examples.takeSample(withReplacement=True, num=expectedSampleSize)
- print ' RDD.takeSample(): sample has %d examples' % len(sampledArray)
+ print(' RDD.takeSample(): sample has %d examples' % len(sampledArray))
- print
+ print()
# Example: RDD.sampleByKey()
keyedRDD = examples.map(lambda lp: (int(lp.label), lp.features))
- print ' Keyed data using label (Int) as key ==> Orig'
+ print(' Keyed data using label (Int) as key ==> Orig')
# Count examples per label in original data.
keyCountsA = keyedRDD.countByKey()
@@ -69,18 +70,18 @@ if __name__ == "__main__":
sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement=True, fractions=fractions)
keyCountsB = sampledByKeyRDD.countByKey()
sizeB = sum(keyCountsB.values())
- print ' Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
- % sizeB
+ print(' Sampled %d examples using approximate stratified sampling (by label). ==> Sample'
+ % sizeB)
# Compare samples
- print ' \tFractions of examples with key'
- print 'Key\tOrig\tSample'
+ print(' \tFractions of examples with key')
+ print('Key\tOrig\tSample')
for k in sorted(keyCountsA.keys()):
fracA = keyCountsA[k] / float(numExamples)
if sizeB != 0:
fracB = keyCountsB.get(k, 0) / float(sizeB)
else:
fracB = 0
- print '%d\t%g\t%g' % (k, fracA, fracB)
+ print('%d\t%g\t%g' % (k, fracA, fracB))
sc.stop()
diff --git a/examples/src/main/python/mllib/word2vec.py b/examples/src/main/python/mllib/word2vec.py
index 99fef4276a..40d1b88792 100644
--- a/examples/src/main/python/mllib/word2vec.py
+++ b/examples/src/main/python/mllib/word2vec.py
@@ -23,6 +23,7 @@
# grep -o -E '\w+(\W+\w+){0,15}' text8 > text8_lines
# This was done so that the example can be run in local mode
+from __future__ import print_function
import sys
@@ -34,7 +35,7 @@ USAGE = ("bin/spark-submit --driver-memory 4g "
if __name__ == "__main__":
if len(sys.argv) < 2:
- print USAGE
+ print(USAGE)
sys.exit("Argument for file not provided")
file_path = sys.argv[1]
sc = SparkContext(appName='Word2Vec')
@@ -46,5 +47,5 @@ if __name__ == "__main__":
synonyms = model.findSynonyms('china', 40)
for word, cosine_distance in synonyms:
- print "{}: {}".format(word, cosine_distance)
+ print("{}: {}".format(word, cosine_distance))
sc.stop()