aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/random.py
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2014-11-10 22:26:16 -0800
committerXiangrui Meng <meng@databricks.com>2014-11-10 22:26:16 -0800
commit65083e93ddd552b7d3e4eb09f87c091ef2ae83a2 (patch)
tree38c82101a74f8dd099f062b574e6b7781b4801d8 /python/pyspark/mllib/random.py
parent3c07b8f08240bafcdff5d174989fb433f4bc80b6 (diff)
downloadspark-65083e93ddd552b7d3e4eb09f87c091ef2ae83a2.tar.gz
spark-65083e93ddd552b7d3e4eb09f87c091ef2ae83a2.tar.bz2
spark-65083e93ddd552b7d3e4eb09f87c091ef2ae83a2.zip
[SPARK-4324] [PySpark] [MLlib] support numpy.array for all MLlib API
This PR check all of the existing Python MLlib API to make sure that numpy.array is supported as Vector (also RDD of numpy.array). It also improve some docstring and doctest. cc mateiz mengxr Author: Davies Liu <davies@databricks.com> Closes #3189 from davies/numpy and squashes the following commits: d5057c4 [Davies Liu] fix tests 6987611 [Davies Liu] support numpy.array for all MLlib API
Diffstat (limited to 'python/pyspark/mllib/random.py')
-rw-r--r--python/pyspark/mllib/random.py45
1 files changed, 43 insertions, 2 deletions
diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py
index 7eebfc6bcd..cb4304f921 100644
--- a/python/pyspark/mllib/random.py
+++ b/python/pyspark/mllib/random.py
@@ -52,6 +52,12 @@ class RandomRDDs(object):
C{RandomRDDs.uniformRDD(sc, n, p, seed)\
.map(lambda v: a + (b - a) * v)}
+ :param sc: SparkContext used to create the RDD.
+ :param size: Size of the RDD.
+ :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
+ :param seed: Random seed (default: a random long integer).
+ :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.
+
>>> x = RandomRDDs.uniformRDD(sc, 100).collect()
>>> len(x)
100
@@ -76,6 +82,12 @@ class RandomRDDs(object):
C{RandomRDDs.normal(sc, n, p, seed)\
.map(lambda v: mean + sigma * v)}
+ :param sc: SparkContext used to create the RDD.
+ :param size: Size of the RDD.
+ :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
+ :param seed: Random seed (default: a random long integer).
+ :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).
+
>>> x = RandomRDDs.normalRDD(sc, 1000, seed=1L)
>>> stats = x.stats()
>>> stats.count()
@@ -93,6 +105,13 @@ class RandomRDDs(object):
Generates an RDD comprised of i.i.d. samples from the Poisson
distribution with the input mean.
+ :param sc: SparkContext used to create the RDD.
+ :param mean: Mean, or lambda, for the Poisson distribution.
+ :param size: Size of the RDD.
+ :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
+ :param seed: Random seed (default: a random long integer).
+ :return: RDD of float comprised of i.i.d. samples ~ Pois(mean).
+
>>> mean = 100.0
>>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=2L)
>>> stats = x.stats()
@@ -104,7 +123,7 @@ class RandomRDDs(object):
>>> abs(stats.stdev() - sqrt(mean)) < 0.5
True
"""
- return callMLlibFunc("poissonRDD", sc._jsc, mean, size, numPartitions, seed)
+ return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
@staticmethod
@toArray
@@ -113,6 +132,13 @@ class RandomRDDs(object):
Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the uniform distribution U(0.0, 1.0).
+ :param sc: SparkContext used to create the RDD.
+ :param numRows: Number of Vectors in the RDD.
+ :param numCols: Number of elements in each Vector.
+ :param numPartitions: Number of partitions in the RDD.
+ :param seed: Seed for the RNG that generates the seed for the generator in each partition.
+ :return: RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`.
+
>>> import numpy as np
>>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
>>> mat.shape
@@ -131,6 +157,13 @@ class RandomRDDs(object):
Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the standard normal distribution.
+ :param sc: SparkContext used to create the RDD.
+ :param numRows: Number of Vectors in the RDD.
+ :param numCols: Number of elements in each Vector.
+ :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).
+ :param seed: Random seed (default: a random long integer).
+ :return: RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`.
+
>>> import numpy as np
>>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
>>> mat.shape
@@ -149,6 +182,14 @@ class RandomRDDs(object):
Generates an RDD comprised of vectors containing i.i.d. samples drawn
from the Poisson distribution with the input mean.
+ :param sc: SparkContext used to create the RDD.
+ :param mean: Mean, or lambda, for the Poisson distribution.
+ :param numRows: Number of Vectors in the RDD.
+ :param numCols: Number of elements in each Vector.
+ :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`)
+ :param seed: Random seed (default: a random long integer).
+ :return: RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean).
+
>>> import numpy as np
>>> mean = 100.0
>>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
@@ -161,7 +202,7 @@ class RandomRDDs(object):
>>> abs(mat.std() - sqrt(mean)) < 0.5
True
"""
- return callMLlibFunc("poissonVectorRDD", sc._jsc, mean, numRows, numCols,
+ return callMLlibFunc("poissonVectorRDD", sc._jsc, float(mean), numRows, numCols,
numPartitions, seed)