author    noelsmith <mail@noelsmith.com>  2015-09-14 21:58:52 -0700
committer Xiangrui Meng <meng@databricks.com>  2015-09-14 21:58:52 -0700
commit    610971ecfe858b1a48ce69b25614afe52bcbe77f (patch)
tree      b2ccf437ac0ebbac0ec5ec58a43f3405df770318 /python
parent    4ae4d54794778042b2cc983e52757edac02412ab (diff)
[SPARK-10273] Add @since annotation to pyspark.mllib.feature
Duplicated the since decorator from pyspark.sql into pyspark (also tweaked it to handle functions without docstrings). Added @since to methods and ".. versionadded::" directives to classes (versions derived from the git file history in pyspark).

Author: noelsmith <mail@noelsmith.com>

Closes #8633 from noel-smith/SPARK-10273-since-mllib-feature.
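The since decorator itself lands in python/pyspark/__init__.py and is outside this diff. For orientation, a minimal sketch of what the duplicated decorator might look like, modeled on the pyspark.sql version the message refers to; the `or ""` guard is the "functions without docstrings" tweak, and the exact details here are assumptions, not taken from this patch:

import re

def since(version):
    """Append a Sphinx ``.. versionadded::`` note to a function's docstring."""
    indent_p = re.compile(r'\n( +)')

    def deco(f):
        # Tweak noted in the commit message: handle functions without docstrings.
        doc = f.__doc__ or ""
        indents = indent_p.findall(doc)
        indent = ' ' * (min(len(i) for i in indents) if indents else 0)
        f.__doc__ = doc.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
        return f
    return deco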
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/mllib/feature.py | 58
1 file changed, 57 insertions(+), 1 deletion(-)
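The versions used below were, per the commit message, derived from the git file history. A hypothetical way to reproduce that lookup (the helper name and default path are illustrative, not part of the patch):

import subprocess

def file_history(path="python/pyspark/mllib/feature.py"):
    # Oldest-first list of commits touching the file, e.g. to see in which
    # release window a class or method first appeared.
    out = subprocess.check_output(
        ["git", "log", "--follow", "--reverse",
         "--format=%ad %h %s", "--date=short", "--", path])
    return out.decode("utf-8").splitlines()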
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index f921e3ad1a..7b077b058c 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -30,7 +30,7 @@ if sys.version >= '3':
 
 from py4j.protocol import Py4JJavaError
 
-from pyspark import SparkContext
+from pyspark import SparkContext, since
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import (
@@ -84,11 +84,14 @@ class Normalizer(VectorTransformer):
     >>> nor2 = Normalizer(float("inf"))
     >>> nor2.transform(v)
     DenseVector([0.0, 0.5, 1.0])
+
+    .. versionadded:: 1.2.0
     """
     def __init__(self, p=2.0):
         assert p >= 1.0, "p should be greater than or equal to 1.0"
         self.p = float(p)
 
+    @since('1.2.0')
     def transform(self, vector):
         """
         Applies unit length normalization on a vector.
@@ -133,7 +136,11 @@ class StandardScalerModel(JavaVectorTransformer):
     .. note:: Experimental
 
     Represents a StandardScaler model that can transform vectors.
+
+    .. versionadded:: 1.2.0
     """
+
+    @since('1.2.0')
     def transform(self, vector):
         """
         Applies standardization transformation on a vector.
@@ -149,6 +156,7 @@ class StandardScalerModel(JavaVectorTransformer):
         """
         return JavaVectorTransformer.transform(self, vector)
 
+    @since('1.4.0')
     def setWithMean(self, withMean):
         """
         Setter of the boolean which decides
@@ -157,6 +165,7 @@ class StandardScalerModel(JavaVectorTransformer):
         self.call("setWithMean", withMean)
         return self
 
+    @since('1.4.0')
     def setWithStd(self, withStd):
         """
         Setter of the boolean which decides
@@ -189,6 +198,8 @@ class StandardScaler(object):
     >>> for r in result.collect(): r
     DenseVector([-0.7071, 0.7071, -0.7071])
     DenseVector([0.7071, -0.7071, 0.7071])
+
+    .. versionadded:: 1.2.0
     """
     def __init__(self, withMean=False, withStd=True):
         if not (withMean or withStd):
@@ -196,6 +207,7 @@ class StandardScaler(object):
         self.withMean = withMean
         self.withStd = withStd
 
+    @since('1.2.0')
     def fit(self, dataset):
         """
         Computes the mean and variance and stores as a model to be used
@@ -215,7 +227,11 @@ class ChiSqSelectorModel(JavaVectorTransformer):
     .. note:: Experimental
 
     Represents a Chi Squared selector model.
+
+    .. versionadded:: 1.4.0
     """
+
+    @since('1.4.0')
     def transform(self, vector):
         """
         Applies transformation on a vector.
@@ -245,10 +261,13 @@ class ChiSqSelector(object):
     SparseVector(1, {0: 6.0})
     >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
     DenseVector([5.0])
+
+    .. versionadded:: 1.4.0
     """
     def __init__(self, numTopFeatures):
         self.numTopFeatures = int(numTopFeatures)
 
+    @since('1.4.0')
     def fit(self, data):
         """
         Returns a ChiSquared feature selector.
@@ -265,6 +284,8 @@
 class PCAModel(JavaVectorTransformer):
     """
     Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA.
+
+    .. versionadded:: 1.5.0
     """
 
 
@@ -281,6 +302,8 @@ class PCA(object):
     1.648...
     >>> pcArray[1]
     -4.013...
+
+    .. versionadded:: 1.5.0
     """
     def __init__(self, k):
         """
@@ -288,6 +311,7 @@ class PCA(object):
         """
         self.k = int(k)
 
+    @since('1.5.0')
     def fit(self, data):
         """
         Computes a [[PCAModel]] that contains the principal components of the input vectors.
@@ -312,14 +336,18 @@ class HashingTF(object):
     >>> doc = "a a b b c d".split(" ")
     >>> htf.transform(doc)
     SparseVector(100, {...})
+
+    .. versionadded:: 1.2.0
     """
     def __init__(self, numFeatures=1 << 20):
         self.numFeatures = numFeatures
 
+    @since('1.2.0')
     def indexOf(self, term):
         """ Returns the index of the input term. """
         return hash(term) % self.numFeatures
 
+    @since('1.2.0')
     def transform(self, document):
         """
         Transforms the input document (list of terms) to term frequency
@@ -339,7 +367,10 @@ class HashingTF(object):
 class IDFModel(JavaVectorTransformer):
     """
     Represents an IDF model that can transform term frequency vectors.
+
+    .. versionadded:: 1.2.0
     """
+    @since('1.2.0')
     def transform(self, x):
         """
         Transforms term frequency (TF) vectors to TF-IDF vectors.
@@ -358,6 +389,7 @@ class IDFModel(JavaVectorTransformer):
         """
         return JavaVectorTransformer.transform(self, x)
 
+    @since('1.4.0')
     def idf(self):
         """
         Returns the current IDF vector.
@@ -401,10 +433,13 @@ class IDF(object):
     DenseVector([0.0, 0.0, 1.3863, 0.863])
     >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0)))
     SparseVector(4, {1: 0.0, 3: 0.5754})
+
+    .. versionadded:: 1.2.0
     """
     def __init__(self, minDocFreq=0):
         self.minDocFreq = minDocFreq
 
+    @since('1.2.0')
     def fit(self, dataset):
         """
         Computes the inverse document frequency.
@@ -420,7 +455,10 @@ class IDF(object):
 class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
     """
     class for Word2Vec model
+
+    .. versionadded:: 1.2.0
     """
+    @since('1.2.0')
     def transform(self, word):
         """
         Transforms a word to its vector representation
@@ -435,6 +473,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
         except Py4JJavaError:
             raise ValueError("%s not found" % word)
 
+    @since('1.2.0')
     def findSynonyms(self, word, num):
         """
         Find synonyms of a word
@@ -450,6 +489,7 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
         words, similarity = self.call("findSynonyms", word, num)
         return zip(words, similarity)
 
+    @since('1.4.0')
     def getVectors(self):
         """
         Returns a map of words to their vector representations.
@@ -457,7 +497,11 @@ class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
         return self.call("getVectors")
 
     @classmethod
+    @since('1.5.0')
     def load(cls, sc, path):
+        """
+        Load a model from the given path.
+        """
         jmodel = sc._jvm.org.apache.spark.mllib.feature \
             .Word2VecModel.load(sc._jsc.sc(), path)
         return Word2VecModel(jmodel)
@@ -507,6 +551,8 @@ class Word2Vec(object):
     ...     rmtree(path)
     ... except OSError:
     ...     pass
+
+    .. versionadded:: 1.2.0
     """
     def __init__(self):
         """
@@ -519,6 +565,7 @@ class Word2Vec(object):
         self.seed = random.randint(0, sys.maxsize)
         self.minCount = 5
 
+    @since('1.2.0')
     def setVectorSize(self, vectorSize):
         """
         Sets vector size (default: 100).
@@ -526,6 +573,7 @@ class Word2Vec(object):
         self.vectorSize = vectorSize
         return self
 
+    @since('1.2.0')
     def setLearningRate(self, learningRate):
         """
         Sets initial learning rate (default: 0.025).
@@ -533,6 +581,7 @@ class Word2Vec(object):
         self.learningRate = learningRate
         return self
 
+    @since('1.2.0')
     def setNumPartitions(self, numPartitions):
         """
         Sets number of partitions (default: 1). Use a small number for
@@ -541,6 +590,7 @@ class Word2Vec(object):
         self.numPartitions = numPartitions
         return self
 
+    @since('1.2.0')
     def setNumIterations(self, numIterations):
         """
         Sets number of iterations (default: 1), which should be smaller
@@ -549,6 +599,7 @@ class Word2Vec(object):
         self.numIterations = numIterations
         return self
 
+    @since('1.2.0')
     def setSeed(self, seed):
         """
         Sets random seed.
@@ -556,6 +607,7 @@ class Word2Vec(object):
         self.seed = seed
         return self
 
+    @since('1.4.0')
     def setMinCount(self, minCount):
         """
         Sets minCount, the minimum number of times a token must appear
@@ -564,6 +616,7 @@ class Word2Vec(object):
         self.minCount = minCount
         return self
 
+    @since('1.2.0')
     def fit(self, data):
         """
         Computes the vector representation of each word in vocabulary.
@@ -596,10 +649,13 @@ class ElementwiseProduct(VectorTransformer):
     >>> rdd = sc.parallelize([a, b])
     >>> eprod.transform(rdd).collect()
     [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]
+
+    .. versionadded:: 1.5.0
     """
     def __init__(self, scalingVector):
         self.scalingVector = _convert_to_vector(scalingVector)
 
+    @since('1.5.0')
     def transform(self, vector):
         """
         Computes the Hadamard product of the vector.
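Once decorated, the version note becomes visible in generated docs and interactive help. A doctest-style sketch of the expected effect, assuming the @since('1.2.0') annotation on Normalizer.transform shown above and the decorator semantics sketched earlier:

>>> from pyspark.mllib.feature import Normalizer
>>> Normalizer.transform.__doc__.rstrip().endswith(".. versionadded:: 1.2.0")
True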