about summary refs log tree commit diff
path: root/examples/src/main/python/mllib/sampled_rdds.py
diff options
context:
space:
mode:
Diffstat (limited to 'examples/src/main/python/mllib/sampled_rdds.py')
-rwxr-xr-x examples/src/main/python/mllib/sampled_rdds.py 29
1 files changed, 15 insertions, 14 deletions
diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py
index 92af3af5eb..b7033ab7da 100755
--- a/examples/src/main/python/mllib/sampled_rdds.py
+++ b/examples/src/main/python/mllib/sampled_rdds.py
@@ -18,6 +18,7 @@
"""
Randomly sampled RDDs.
"""
+from __future__ import print_function
import sys
@@ -27,7 +28,7 @@ from pyspark.mllib.util import MLUtils
if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
- print >> sys.stderr, "Usage: sampled_rdds <libsvm data file>"
+ print("Usage: sampled_rdds <libsvm data file>", file=sys.stderr)
exit(-1)
if len(sys.argv) == 2:
datapath = sys.argv[1]
@@ -41,24 +42,24 @@ if __name__ == "__main__":
examples = MLUtils.loadLibSVMFile(sc, datapath)
numExamples = examples.count()
if numExamples == 0:
- print >> sys.stderr, "Error: Data file had no samples to load."
+ print("Error: Data file had no samples to load.", file=sys.stderr)
exit(1)
- print 'Loaded data with %d examples from file: %s' % (numExamples, datapath)
+ print('Loaded data with %d examples from file: %s' % (numExamples, datapath))
# Example: RDD.sample() and RDD.takeSample()
expectedSampleSize = int(numExamples * fraction)
- print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
- % (fraction, expectedSampleSize)
+ print('Sampling RDD using fraction %g. Expected sample size = %d.'
+ % (fraction, expectedSampleSize))
sampledRDD = examples.sample(withReplacement=True, fraction=fraction)
- print ' RDD.sample(): sample has %d examples' % sampledRDD.count()
+ print(' RDD.sample(): sample has %d examples' % sampledRDD.count())
sampledArray = examples.takeSample(withReplacement=True, num=expectedSampleSize)
- print ' RDD.takeSample(): sample has %d examples' % len(sampledArray)
+ print(' RDD.takeSample(): sample has %d examples' % len(sampledArray))
- print
+ print()
# Example: RDD.sampleByKey()
keyedRDD = examples.map(lambda lp: (int(lp.label), lp.features))
- print ' Keyed data using label (Int) as key ==> Orig'
+ print(' Keyed data using label (Int) as key ==> Orig')
# Count examples per label in original data.
keyCountsA = keyedRDD.countByKey()
@@ -69,18 +70,18 @@ if __name__ == "__main__":
sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement=True, fractions=fractions)
keyCountsB = sampledByKeyRDD.countByKey()
sizeB = sum(keyCountsB.values())
- print ' Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
- % sizeB
+ print(' Sampled %d examples using approximate stratified sampling (by label). ==> Sample'
+ % sizeB)
# Compare samples
- print ' \tFractions of examples with key'
- print 'Key\tOrig\tSample'
+ print(' \tFractions of examples with key')
+ print('Key\tOrig\tSample')
for k in sorted(keyCountsA.keys()):
fracA = keyCountsA[k] / float(numExamples)
if sizeB != 0:
fracB = keyCountsB.get(k, 0) / float(sizeB)
else:
fracB = 0
- print '%d\t%g\t%g' % (k, fracA, fracB)
+ print('%d\t%g\t%g' % (k, fracA, fracB))
sc.stop()