diff options
author | MechCoder <manojkumarsivaraj334@gmail.com> | 2015-06-23 12:43:32 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-06-23 12:43:32 -0700 |
commit | f2022fa0d375c804eca7803e172543b23ecbb9b7 (patch) | |
tree | 1c4c51b7950cfb4a78a6d1ae3fb944275546492a /python/pyspark/mllib/util.py | |
parent | 2b1111dd0b8deb9ad8d43fec792e60e3d0c4de75 (diff) | |
download | spark-f2022fa0d375c804eca7803e172543b23ecbb9b7.tar.gz spark-f2022fa0d375c804eca7803e172543b23ecbb9b7.tar.bz2 spark-f2022fa0d375c804eca7803e172543b23ecbb9b7.zip |
[SPARK-8265] [MLLIB] [PYSPARK] Add LinearDataGenerator to pyspark.mllib.utils
It is useful to generate linear data for easy testing of linear models and in general. Scala already has it. This is just a wrapper around the Scala code.
Author: MechCoder <manojkumarsivaraj334@gmail.com>
Closes #6715 from MechCoder/generate_linear_input and squashes the following commits:
6182884 [MechCoder] Minor changes
8bda047 [MechCoder] Minor style fixes
0f1053c [MechCoder] [SPARK-8265] Add LinearDataGenerator to pyspark.mllib.utils
Diffstat (limited to 'python/pyspark/mllib/util.py')
-rw-r--r-- | python/pyspark/mllib/util.py | 35 |
1 files changed, 35 insertions, 0 deletions
```diff
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 16a90db146..348238319e 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -257,6 +257,41 @@ class JavaLoader(Loader):
         return cls(java_model)
 
 
+class LinearDataGenerator(object):
+    """Utils for generating linear data"""
+
+    @staticmethod
+    def generateLinearInput(intercept, weights, xMean, xVariance,
+                            nPoints, seed, eps):
+        """
+        :param: intercept bias factor, the term c in X'w + c
+        :param: weights   feature vector, the term w in X'w + c
+        :param: xMean     Point around which the data X is centered.
+        :param: xVariance Variance of the given data
+        :param: nPoints   Number of points to be generated
+        :param: seed      Random Seed
+        :param: eps       Used to scale the noise. If eps is set high,
+                          the amount of gaussian noise added is more.
+        Returns a list of LabeledPoints of length nPoints
+        """
+        weights = [float(weight) for weight in weights]
+        xMean = [float(mean) for mean in xMean]
+        xVariance = [float(var) for var in xVariance]
+        return list(callMLlibFunc(
+            "generateLinearInputWrapper", float(intercept), weights, xMean,
+            xVariance, int(nPoints), int(seed), float(eps)))
+
+    @staticmethod
+    def generateLinearRDD(sc, nexamples, nfeatures, eps,
+                          nParts=2, intercept=0.0):
+        """
+        Generate a RDD of LabeledPoints.
+        """
+        return callMLlibFunc(
+            "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures),
+            float(eps), int(nParts), float(intercept))
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext
```