Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/ml/classification.py | 14
-rw-r--r-- | python/pyspark/ml/clustering.py | 8
-rw-r--r-- | python/pyspark/ml/evaluation.py | 2
-rwxr-xr-x | python/pyspark/ml/feature.py | 26
-rw-r--r-- | python/pyspark/ml/param/__init__.py | 2
-rw-r--r-- | python/pyspark/ml/regression.py | 14
-rwxr-xr-x | python/pyspark/ml/tests.py | 136
-rw-r--r-- | python/pyspark/ml/tuning.py | 4
8 files changed, 94 insertions, 112 deletions
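
The diff below mechanically migrates the pyspark.ml doctests and tests from the old RDD-based pyspark.mllib.linalg package to the new pyspark.ml.linalg package. As a minimal sketch of what the swap looks like in user code (the vector class names and constructors are the same in both packages; only the import path moves):

    # Old (RDD-based MLlib linear algebra types):
    #   from pyspark.mllib.linalg import Vectors, DenseVector, SparseVector
    # New (DataFrame-based spark.ml linear algebra types):
    from pyspark.ml.linalg import Vectors, DenseVector, SparseVector

    dense = Vectors.dense([1.0, 2.0, 3.0])        # DenseVector([1.0, 2.0, 3.0])
    sparse = Vectors.sparse(3, {0: 1.0, 2: 3.0})  # SparseVector(3, {0: 1.0, 2: 3.0})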
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 5c11aa71b4..a1c3f72984 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -53,7 +53,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     Currently, this class only supports binary classification.

     >>> from pyspark.sql import Row
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sc.parallelize([
     ...     Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)),
     ...     Row(label=0.0, weight=2.0, features=Vectors.sparse(1, [], []))]).toDF()
@@ -496,7 +496,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
     It supports both binary and multiclass labels, as well as both continuous and
     categorical features.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
@@ -625,7 +625,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred

     >>> import numpy
     >>> from numpy import allclose
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
@@ -752,7 +752,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_

     >>> from numpy import allclose
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> from pyspark.ml.feature import StringIndexer
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
@@ -884,7 +884,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
     The input feature values must be nonnegative.

     >>> from pyspark.sql import Row
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
     ...     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
@@ -1028,7 +1028,7 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
     Number of inputs has to be equal to the size of feature vectors.
     Number of outputs has to be equal to the total number of labels.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (0.0, Vectors.dense([0.0, 0.0])),
     ...     (1.0, Vectors.dense([0.0, 1.0])),
@@ -1193,7 +1193,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
     is picked to label the example.

     >>> from pyspark.sql import Row
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sc.parallelize([
     ...     Row(label=0.0, features=Vectors.dense(1.0, 0.8)),
     ...     Row(label=1.0, features=Vectors.sparse(2, [], [])),
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 921633164b..ac7183d2ef 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -65,7 +65,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte

     GaussianMixture clustering.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> data = [(Vectors.dense([-0.1, -0.05 ]),),
     ...         (Vectors.dense([-0.01, -0.1]),),
@@ -194,7 +194,7 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     K-means clustering with a k-means++ like initialization mode
     (the k-means|| algorithm by Bahmani et al).

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
     ...         (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     >>> df = sqlContext.createDataFrame(data, ["features"])
@@ -347,7 +347,7 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     If bisecting all divisible clusters on the bottom level would result more than `k` leaf
     clusters, larger clusters get higher priority.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
     ...         (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     >>> df = sqlContext.createDataFrame(data, ["features"])
@@ -625,7 +625,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
     :py:class:`pyspark.ml.feature.Tokenizer` and :py:class:`pyspark.ml.feature.CountVectorizer`
     can be useful for converting text to word count vectors.

-    >>> from pyspark.mllib.linalg import Vectors, SparseVector
+    >>> from pyspark.ml.linalg import Vectors, SparseVector
     >>> from pyspark.ml.clustering import LDA
     >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])],
     ...      [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 719c0c7d79..fc9099b7ec 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -111,7 +111,7 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
     1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
     ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
     >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"])
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 606a6e7c22..983b6a5301 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -23,11 +23,11 @@ from py4j.java_collections import JavaArray

 from pyspark import since, keyword_only
 from pyspark.rdd import ignore_unicode_prefix
+from pyspark.ml.linalg import _convert_to_vector
 from pyspark.ml.param.shared import *
 from pyspark.ml.util import JavaMLReadable, JavaMLWritable
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm
 from pyspark.mllib.common import inherit_doc
-from pyspark.mllib.linalg import _convert_to_vector

 __all__ = ['Binarizer',
            'Bucketizer',
@@ -380,7 +380,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
     .. seealso:: `More information on Wikipedia \
         <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
     >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
     >>> df2 = dct.transform(df1)
@@ -447,7 +447,7 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
     with a provided "weight" vector. In other words, it scales each column of the dataset
     by a scalar multiplier.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
     >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
     ...     inputCol="values", outputCol="eprod")
@@ -582,7 +582,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab

     Compute the Inverse Document Frequency (IDF) given a collection of documents.

-    >>> from pyspark.mllib.linalg import DenseVector
+    >>> from pyspark.ml.linalg import DenseVector
     >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),
     ...     (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
     >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
@@ -670,7 +670,7 @@ class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
     absolute value in each feature. It does not shift/center the data, and thus does not destroy
     any sparsity.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
     >>> model = maScaler.fit(df)
@@ -757,7 +757,7 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
     Note that since zero values will probably be transformed to non-zero values, output of the
     transformer will be DenseVector even for sparse input.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
     >>> model = mmScaler.fit(df)
@@ -961,7 +961,7 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav

     Normalize a vector to have unit norm using the given p-norm.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
     >>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
     >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features")
@@ -1114,7 +1114,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
     multiplication distributes over addition". Take a 2-variable feature vector as an example:
     `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
     >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
     >>> px.transform(df).head().expanded
@@ -1459,7 +1459,7 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
     Standardizes features by removing the mean and scaling to unit variance using column summary
     statistics on the samples in the training set.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
     >>> model = standardScaler.fit(df)
@@ -1942,7 +1942,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
       - Add warning if a categorical feature has only 1 category.
       - Add option for allowing unknown categories.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
     ...     (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
     >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
@@ -2062,7 +2062,7 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J
     The output vector will order features with the selected indices first (in the order given),
     followed by the selected names (in the order given).

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
     ...     (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
@@ -2329,7 +2329,7 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab

     PCA trains a model to project vectors to a low-dimensional space using PCA.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
     ...     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
     ...     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
@@ -2547,7 +2547,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
     Chi-Squared feature selection, which selects categorical features to use for
     predicting a categorical label.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame(
     ...    [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
     ...     (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index d9513ca5b2..ade4864e1d 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -29,8 +29,8 @@ import warnings

 from py4j.java_gateway import JavaObject

 from pyspark import since
+from pyspark.ml.linalg import DenseVector, Vector
 from pyspark.ml.util import Identifiable
-from pyspark.mllib.linalg import DenseVector, Vector

 __all__ = ['Param', 'Params', 'TypeConverters']
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index fcdc29e69b..cfcbbfc98e 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -54,7 +54,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
      - L1 (Lasso)
      - L2 + L1 (elastic net)

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, 2.0, Vectors.dense(1.0)),
     ...     (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"])
@@ -412,7 +412,7 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     Currently implemented using parallelized pool adjacent violators algorithm.
     Only univariate (single feature) algorithm supported.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
@@ -642,7 +642,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     learning algorithm for regression.
     It supports both continuous and categorical features.

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
@@ -808,7 +808,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     It supports both continuous and categorical features.

     >>> from numpy import allclose
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
@@ -920,7 +920,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
     It supports both continuous and categorical features.

     >>> from numpy import allclose
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0)),
     ...     (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
@@ -1055,7 +1055,7 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi

     .. seealso:: `AFT Model <https://en.wikipedia.org/wiki/Accelerated_failure_time_model>`_

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(1.0), 1.0),
     ...     (0.0, Vectors.sparse(1, [], []), 0.0)], ["label", "features", "censor"])
@@ -1252,7 +1252,7 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha

     .. seealso:: `GLM <https://en.wikipedia.org/wiki/Generalized_linear_model>`_

-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> df = sqlContext.createDataFrame([
     ...     (1.0, Vectors.dense(0.0, 0.0)),
     ...     (1.0, Vectors.dense(1.0, 2.0)),
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index c567905759..e3511120bd 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -62,10 +62,6 @@ from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, \
 from pyspark.ml.tuning import *
 from pyspark.ml.wrapper import JavaParams
 from pyspark.mllib.common import _java2py
-from pyspark.mllib.linalg import SparseVector as OldSparseVector, DenseVector as OldDenseVector,\
-    DenseMatrix as OldDenseMatrix, MatrixUDT as OldMatrixUDT, SparseMatrix as OldSparseMatrix,\
-    Vectors as OldVectors, VectorUDT as OldVectorUDT
-from pyspark.mllib.regression import LabeledPoint
 from pyspark.serializers import PickleSerializer
 from pyspark.sql import DataFrame, Row, SparkSession
 from pyspark.sql.functions import rand
@@ -162,22 +158,22 @@ class ParamTypeConversionTests(PySparkTestCase):
     def test_vector(self):
         ewp = ElementwiseProduct(scalingVec=[1, 3])
-        self.assertEqual(ewp.getScalingVec(), OldDenseVector([1.0, 3.0]))
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
         ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
-        self.assertEqual(ewp.getScalingVec(), OldDenseVector([1.2, 3.4]))
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
         self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))

     def test_list(self):
         l = [0, 1]
-        for lst_like in [l, np.array(l), OldDenseVector(l), OldSparseVector(len(l),
+        for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l),
                          range(len(l)), l), pyarray.array('l', l), xrange(2), tuple(l)]:
             converted = TypeConverters.toList(lst_like)
             self.assertEqual(type(converted), list)
             self.assertListEqual(converted, l)

     def test_list_int(self):
-        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), OldDenseVector([1.0, 2.0]),
-                        OldSparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
+        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
+                        SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
                         pyarray.array('d', [1.0, 2.0])]:
             vs = VectorSlicer(indices=indices)
             self.assertListEqual(vs.getIndices(), [1, 2])
@@ -410,9 +406,9 @@ class FeatureTests(SparkSessionTestCase):
     def test_idf(self):
         dataset = self.spark.createDataFrame([
-            (OldDenseVector([1.0, 2.0]),),
-            (OldDenseVector([0.0, 1.0]),),
-            (OldDenseVector([3.0, 0.2]),)], ["tf"])
+            (DenseVector([1.0, 2.0]),),
+            (DenseVector([0.0, 1.0]),),
+            (DenseVector([3.0, 0.2]),)], ["tf"])
         idf0 = IDF(inputCol="tf")
         self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
         idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
@@ -457,10 +453,10 @@ class FeatureTests(SparkSessionTestCase):
     def test_count_vectorizer_with_binary(self):
         dataset = self.spark.createDataFrame([
-            (0, "a a a b b c".split(' '), OldSparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
-            (1, "a a".split(' '), OldSparseVector(3, {0: 1.0}),),
-            (2, "a b".split(' '), OldSparseVector(3, {0: 1.0, 1: 1.0}),),
-            (3, "c".split(' '), OldSparseVector(3, {2: 1.0}),)], ["id", "words", "expected"])
+            (0, "a a a b b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
+            (1, "a a".split(' '), SparseVector(3, {0: 1.0}),),
+            (2, "a b".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
+            (3, "c".split(' '), SparseVector(3, {2: 1.0}),)], ["id", "words", "expected"])
         cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
         model = cv.fit(dataset)
@@ -581,11 +577,11 @@ class CrossValidatorTests(SparkSessionTestCase):
         # Save/load for CrossValidator will be added later: SPARK-13786
         temp_path = tempfile.mkdtemp()
         dataset = self.spark.createDataFrame(
-            [(OldVectors.dense([0.0]), 0.0),
-             (OldVectors.dense([0.4]), 1.0),
-             (OldVectors.dense([0.5]), 0.0),
-             (OldVectors.dense([0.6]), 1.0),
-             (OldVectors.dense([1.0]), 1.0)] * 10,
+            [(Vectors.dense([0.0]), 0.0),
+             (Vectors.dense([0.4]), 1.0),
+             (Vectors.dense([0.5]), 0.0),
+             (Vectors.dense([0.6]), 1.0),
+             (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
@@ -654,11 +650,11 @@ class TrainValidationSplitTests(SparkSessionTestCase):
         # Save/load for TrainValidationSplit will be added later: SPARK-13786
         temp_path = tempfile.mkdtemp()
         dataset = self.spark.createDataFrame(
-            [(OldVectors.dense([0.0]), 0.0),
-             (OldVectors.dense([0.4]), 1.0),
-             (OldVectors.dense([0.5]), 0.0),
-             (OldVectors.dense([0.6]), 1.0),
-             (OldVectors.dense([1.0]), 1.0)] * 10,
+            [(Vectors.dense([0.0]), 0.0),
+             (Vectors.dense([0.4]), 1.0),
+             (Vectors.dense([0.5]), 0.0),
+             (Vectors.dense([0.6]), 1.0),
+             (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
@@ -857,8 +853,8 @@ class LDATest(SparkSessionTestCase):
     def test_persistence(self):
         # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
         df = self.spark.createDataFrame([
-            [1, OldVectors.dense([0.0, 1.0])],
-            [2, OldVectors.sparse(2, {0: 1.0})],
+            [1, Vectors.dense([0.0, 1.0])],
+            [2, Vectors.sparse(2, {0: 1.0})],
         ], ["id", "features"])
         # Fit model
         lda = LDA(k=2, seed=1, optimizer="em")
@@ -893,8 +889,8 @@ class TrainingSummaryTest(SparkSessionTestCase):
     def test_linear_regression_summary(self):
-        df = self.spark.createDataFrame([(1.0, 2.0, OldVectors.dense(1.0)),
-                                         (0.0, 2.0, OldVectors.sparse(1, [], []))],
+        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
+                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                         ["label", "weight", "features"])
         lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                               fitIntercept=False)
@@ -930,7 +926,7 @@ class TrainingSummaryTest(SparkSessionTestCase):
         self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)

     def test_glr_summary(self):
-        from pyspark.mllib.linalg import Vectors
+        from pyspark.ml.linalg import Vectors
         df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                          (0.0, 2.0, Vectors.sparse(1, [], []))],
                                         ["label", "weight", "features"])
@@ -966,8 +962,8 @@ class TrainingSummaryTest(SparkSessionTestCase):
         self.assertAlmostEqual(sameSummary.deviance, s.deviance)

     def test_logistic_regression_summary(self):
-        df = self.spark.createDataFrame([(1.0, 2.0, OldVectors.dense(1.0)),
-                                         (0.0, 2.0, OldVectors.sparse(1, [], []))],
+        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
+                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                         ["label", "weight", "features"])
         lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
         model = lr.fit(df)
@@ -996,9 +992,9 @@ class OneVsRestTests(SparkSessionTestCase):
     def test_copy(self):
-        df = self.spark.createDataFrame([(0.0, OldVectors.dense(1.0, 0.8)),
-                                         (1.0, OldVectors.sparse(2, [], [])),
-                                         (2.0, OldVectors.dense(0.5, 0.5))],
+        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+                                         (1.0, Vectors.sparse(2, [], [])),
+                                         (2.0, Vectors.dense(0.5, 0.5))],
                                         ["label", "features"])
         lr = LogisticRegression(maxIter=5, regParam=0.01)
         ovr = OneVsRest(classifier=lr)
@@ -1010,9 +1006,9 @@ class OneVsRestTests(SparkSessionTestCase):
         self.assertEqual(model1.getPredictionCol(), "indexed")

     def test_output_columns(self):
-        df = self.spark.createDataFrame([(0.0, OldVectors.dense(1.0, 0.8)),
-                                         (1.0, OldVectors.sparse(2, [], [])),
-                                         (2.0, OldVectors.dense(0.5, 0.5))],
+        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+                                         (1.0, Vectors.sparse(2, [], [])),
+                                         (2.0, Vectors.dense(0.5, 0.5))],
                                         ["label", "features"])
         lr = LogisticRegression(maxIter=5, regParam=0.01)
         ovr = OneVsRest(classifier=lr)
@@ -1022,9 +1018,9 @@ class OneVsRestTests(SparkSessionTestCase):
     def test_save_load(self):
         temp_path = tempfile.mkdtemp()
-        df = self.spark.createDataFrame([(0.0, OldVectors.dense(1.0, 0.8)),
-                                         (1.0, OldVectors.sparse(2, [], [])),
-                                         (2.0, OldVectors.dense(0.5, 0.5))],
+        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+                                         (1.0, Vectors.sparse(2, [], [])),
+                                         (2.0, Vectors.dense(0.5, 0.5))],
                                         ["label", "features"])
         lr = LogisticRegression(maxIter=5, regParam=0.01)
         ovr = OneVsRest(classifier=lr)
@@ -1052,7 +1048,7 @@ class HashingTFTest(SparkSessionTestCase):
         hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
         output = hashingTF.transform(df)
         features = output.select("features").first().features.toArray()
-        expected = OldVectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
+        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
         for i in range(0, n):
             self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                    ": expected " + str(expected[i]) + ", got " + str(features[i]))
@@ -1147,15 +1143,13 @@ class VectorTests(MLlibTestCase):
         self.assertEqual(vs, nvs)

     def test_serialize(self):
-        # Because pickle path still uses old vector/matrix
-        # TODO: Change this to new vector/matrix when pickle for new vector/matrix is ready.
-        self._test_serialize(OldDenseVector(range(10)))
-        self._test_serialize(OldDenseVector(array([1., 2., 3., 4.])))
-        self._test_serialize(OldDenseVector(pyarray.array('d', range(10))))
-        self._test_serialize(OldSparseVector(4, {1: 1, 3: 2}))
-        self._test_serialize(OldSparseVector(3, {}))
-        self._test_serialize(OldDenseMatrix(2, 3, range(6)))
-        sm1 = OldSparseMatrix(
+        self._test_serialize(DenseVector(range(10)))
+        self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
+        self._test_serialize(DenseVector(pyarray.array('d', range(10))))
+        self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
+        self._test_serialize(SparseVector(3, {}))
+        self._test_serialize(DenseMatrix(2, 3, range(6)))
+        sm1 = SparseMatrix(
             3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
         self._test_serialize(sm1)
@@ -1407,12 +1401,6 @@ class VectorUDTTests(MLlibTestCase):
     sv1 = SparseVector(2, [1], [2.0])
     udt = VectorUDT()

-    old_dv0 = OldDenseVector([])
-    old_dv1 = OldDenseVector([1.0, 2.0])
-    old_sv0 = OldSparseVector(2, [], [])
-    old_sv1 = OldSparseVector(2, [1], [2.0])
-    old_udt = OldVectorUDT()
-
     def test_json_schema(self):
         self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
@@ -1421,19 +1409,19 @@ class VectorUDTTests(MLlibTestCase):
         self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

     def test_infer_schema(self):
-        rdd = self.sc.parallelize([LabeledPoint(1.0, self.old_dv1),
-                                   LabeledPoint(0.0, self.old_sv1)])
+        rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1),
+                                   Row(label=0.0, features=self.sv1)])
         df = rdd.toDF()
         schema = df.schema
         field = [f for f in schema.fields if f.name == "features"][0]
-        self.assertEqual(field.dataType, self.old_udt)
+        self.assertEqual(field.dataType, self.udt)
         vectors = df.rdd.map(lambda p: p.features).collect()
         self.assertEqual(len(vectors), 2)
         for v in vectors:
-            if isinstance(v, OldSparseVector):
-                self.assertEqual(v, self.old_sv1)
-            elif isinstance(v, OldDenseVector):
-                self.assertEqual(v, self.old_dv1)
+            if isinstance(v, SparseVector):
+                self.assertEqual(v, self.sv1)
+            elif isinstance(v, DenseVector):
+                self.assertEqual(v, self.dv1)
             else:
                 raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
@@ -1446,12 +1434,6 @@ class MatrixUDTTests(MLlibTestCase):
     sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
     udt = MatrixUDT()

-    old_dm1 = OldDenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
-    old_dm2 = OldDenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
-    old_sm1 = OldSparseMatrix(1, 1, [0, 1], [0], [2.0])
-    old_sm2 = OldSparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
-    old_udt = OldMatrixUDT()
-
     def test_json_schema(self):
         self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)
@@ -1460,17 +1442,17 @@ class MatrixUDTTests(MLlibTestCase):
         self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

     def test_infer_schema(self):
-        rdd = self.sc.parallelize([("dense", self.old_dm1), ("sparse", self.old_sm1)])
+        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
         df = rdd.toDF()
         schema = df.schema
-        self.assertTrue(schema.fields[1].dataType, self.old_udt)
+        self.assertTrue(schema.fields[1].dataType, self.udt)
         matrices = df.rdd.map(lambda x: x._2).collect()
         self.assertEqual(len(matrices), 2)
         for m in matrices:
-            if isinstance(m, OldDenseMatrix):
-                self.assertTrue(m, self.old_dm1)
-            elif isinstance(m, OldSparseMatrix):
-                self.assertTrue(m, self.old_sm1)
+            if isinstance(m, DenseMatrix):
+                self.assertTrue(m, self.dm1)
+            elif isinstance(m, SparseMatrix):
+                self.assertTrue(m, self.sm1)
             else:
                 raise ValueError("Expected a matrix but got type %r" % type(m))
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 0920ae6ea1..75789c4d09 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -151,7 +151,7 @@ class CrossValidator(Estimator, ValidatorParams):

     >>> from pyspark.ml.classification import LogisticRegression
     >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> dataset = sqlContext.createDataFrame(
     ...     [(Vectors.dense([0.0]), 0.0),
     ...      (Vectors.dense([0.4]), 1.0),
@@ -310,7 +310,7 @@ class TrainValidationSplit(Estimator, ValidatorParams):

     >>> from pyspark.ml.classification import LogisticRegression
     >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
-    >>> from pyspark.mllib.linalg import Vectors
+    >>> from pyspark.ml.linalg import Vectors
     >>> dataset = sqlContext.createDataFrame(
     ...     [(Vectors.dense([0.0]), 0.0),
     ...      (Vectors.dense([0.4]), 1.0),
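
For context, a minimal sketch of the pattern the updated doctests and tests follow after this change: build a DataFrame of new-style vectors and fit an ml estimator on it. This assumes a live SparkSession bound to `spark`; the column names and parameter values are illustrative, not prescribed by the commit.

    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.linalg import Vectors

    # One dense and one sparse feature vector, as in the updated examples above.
    df = spark.createDataFrame([
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
    model = LogisticRegression(maxIter=5, regParam=0.01).fit(df)
    print(model.coefficients)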