diff options
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rw-r--r-- | python/pyspark/ml/feature.py | 95 |
1 file changed, 55 insertions, 40 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 16cb9d1db3..86b53285b5 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -83,7 +83,8 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java """ threshold = Param(Params._dummy(), "threshold", - "threshold in binary classification prediction, in range [0, 1]") + "threshold in binary classification prediction, in range [0, 1]", + typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, threshold=0.0, inputCol=None, outputCol=None): @@ -159,7 +160,8 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav "range [x,y) except the last bucket, which also includes y. The splits " + "should be strictly increasing. Values at -inf, inf must be explicitly " + "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.") + "specified will be treated as errors.", + typeConverter=TypeConverters.toListFloat) @keyword_only def __init__(self, splits=None, inputCol=None, outputCol=None): @@ -243,15 +245,17 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + " times the term must appear in the document); if this is a double in [0,1), then this " + "specifies a fraction (out of the document's token count). Note that the parameter is " + - "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0") + "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0", + typeConverter=TypeConverters.toFloat) minDF = Param( Params._dummy(), "minDF", "Specifies the minimum number of" + " different documents a term must appear in to be included in the vocabulary." 
+ " If this is an integer >= 1, this specifies the number of documents the term must" + " appear in; if this is a double in [0,1), then this specifies the fraction of documents." + - " Default 1.0") + " Default 1.0", typeConverter=TypeConverters.toFloat) vocabSize = Param( - Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.") + Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): @@ -375,7 +379,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit """ inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + - "default False.") + "default False.", typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, inverse=False, inputCol=None, outputCol=None): @@ -441,8 +445,8 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada .. versionadded:: 1.5.0 """ - scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + - "it must be MLlib Vector type.") + scalingVec = Param(Params._dummy(), "scalingVec", "Vector for hadamard product.", + typeConverter=TypeConverters.toVector) @keyword_only def __init__(self, scalingVec=None, inputCol=None, outputCol=None): @@ -564,7 +568,8 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab """ minDocFreq = Param(Params._dummy(), "minDocFreq", - "minimum of documents in which a term should appear for filtering") + "minimum of documents in which a term should appear for filtering", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): @@ -746,8 +751,10 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav .. 
versionadded:: 1.6.0 """ - min = Param(Params._dummy(), "min", "Lower bound of the output feature range") - max = Param(Params._dummy(), "max", "Upper bound of the output feature range") + min = Param(Params._dummy(), "min", "Lower bound of the output feature range", + typeConverter=TypeConverters.toFloat) + max = Param(Params._dummy(), "max", "Upper bound of the output feature range", + typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): @@ -870,7 +877,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr .. versionadded:: 1.5.0 """ - n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)") + n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, n=2, inputCol=None, outputCol=None): @@ -936,7 +944,8 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav .. versionadded:: 1.4.0 """ - p = Param(Params._dummy(), "p", "the p norm value.") + p = Param(Params._dummy(), "p", "the p norm value.", + typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, p=2.0, inputCol=None, outputCol=None): @@ -1018,7 +1027,8 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, .. versionadded:: 1.4.0 """ - dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category") + dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", + typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, dropLast=True, inputCol=None, outputCol=None): @@ -1085,7 +1095,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead .. 
versionadded:: 1.4.0 """ - degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") + degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, degree=2, inputCol=None, outputCol=None): @@ -1163,7 +1174,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav # a placeholder to make it appear in the generated doc numBuckets = Param(Params._dummy(), "numBuckets", "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2. Default 2.") + "categories) into which data points are grouped. Must be >= 2. Default 2.", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None): @@ -1255,11 +1267,13 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, .. versionadded:: 1.4.0 """ - minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") + minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)", + typeConverter=TypeConverters.toInt) gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") - pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing") + pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing", + TypeConverters.toString) toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing") + "lowercase before tokenizing", TypeConverters.toBoolean) @keyword_only def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, @@ -1370,7 +1384,7 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): .. 
versionadded:: 1.6.0 """ - statement = Param(Params._dummy(), "statement", "SQL statement") + statement = Param(Params._dummy(), "statement", "SQL statement", TypeConverters.toString) @keyword_only def __init__(self, statement=None): @@ -1444,8 +1458,9 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J .. versionadded:: 1.4.0 """ - withMean = Param(Params._dummy(), "withMean", "Center data with mean") - withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") + withMean = Param(Params._dummy(), "withMean", "Center data with mean", TypeConverters.toBoolean) + withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", + TypeConverters.toBoolean) @keyword_only def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): @@ -1628,7 +1643,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, labels = Param(Params._dummy(), "labels", "Optional array of labels specifying index-string mapping." + - " If not provided or if empty, then metadata from inputCol is used instead.") + " If not provided or if empty, then metadata from inputCol is used instead.", + typeConverter=TypeConverters.toListString) @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=None): @@ -1689,9 +1705,10 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl .. 
versionadded:: 1.6.0 """ - stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out", + typeConverter=TypeConverters.toListString) caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + - "comparison over the stop words") + "comparison over the stop words", TypeConverters.toBoolean) @keyword_only def __init__(self, inputCol=None, outputCol=None, stopWords=None, @@ -1930,7 +1947,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja maxCategories = Param(Params._dummy(), "maxCategories", "Threshold for the number of values a categorical feature can take " + "(>= 2). If a feature is found to have > maxCategories values, then " + - "it is declared continuous.") + "it is declared continuous.", typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, maxCategories=20, inputCol=None, outputCol=None): @@ -2035,11 +2052,12 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J """ indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.") + "a vector column. There can be no overlap with names.", + typeConverter=TypeConverters.toListInt) names = Param(Params._dummy(), "names", "An array of feature names to select features from " + "a vector column. These names must be specified by ML " + "org.apache.spark.ml.attribute.Attribute. 
There can be no overlap with " + - "indices.") + "indices.", typeConverter=TypeConverters.toListString) @keyword_only def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): @@ -2147,12 +2165,14 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ vectorSize = Param(Params._dummy(), "vectorSize", - "the dimension of codes after transforming from words") + "the dimension of codes after transforming from words", + typeConverter=TypeConverters.toInt) numPartitions = Param(Params._dummy(), "numPartitions", - "number of partitions for sentences of words") + "number of partitions for sentences of words", + typeConverter=TypeConverters.toInt) minCount = Param(Params._dummy(), "minCount", "the minimum number of times a token must appear to be included in the " + - "word2vec model's vocabulary") + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, @@ -2293,7 +2313,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab .. versionadded:: 1.5.0 """ - k = Param(Params._dummy(), "k", "the number of principal components") + k = Param(Params._dummy(), "k", "the number of principal components", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, k=None, inputCol=None, outputCol=None): @@ -2425,7 +2446,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM .. versionadded:: 1.5.0 """ - formula = Param(Params._dummy(), "formula", "R model formula") + formula = Param(Params._dummy(), "formula", "R model formula", TypeConverters.toString) @keyword_only def __init__(self, formula=None, featuresCol="features", labelCol="label"): @@ -2511,12 +2532,11 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja .. 
versionadded:: 2.0.0 """ - # a placeholder to make it appear in the generated doc numTopFeatures = \ Param(Params._dummy(), "numTopFeatures", "Number of features that selector will select, ordered by statistics value " + "descending. If the number of features is < numTopFeatures, then this will select " + - "all features.") + "all features.", typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"): @@ -2525,11 +2545,6 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self.numTopFeatures = \ - Param(self, "numTopFeatures", - "Number of features that selector will select, ordered by statistics value " + - "descending. If the number of features is < numTopFeatures, then this will " + - "select all features.") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) |