author     WeichenXu <WeichenXu123@outlook.com>       2016-05-23 18:14:48 -0700
committer  Andrew Or <andrew@databricks.com>          2016-05-23 18:14:48 -0700
commit     a15ca5533db91fefaf3248255a59c4d94eeda1a9 (patch)
tree       80867e08b17b01d96611a9a695cbb1f01c0198f6 /python/pyspark/ml/feature.py
parent     5afd927a47aa7ede3039234f2f7262e2247aa2ae (diff)
[SPARK-15464][ML][MLLIB][SQL][TESTS] Replace SQLContext and SparkContext with SparkSession using builder pattern in python test code
## What changes were proposed in this pull request?
Replace SQLContext and SparkContext with SparkSession using builder pattern in python test code.
## How was this patch tested?
Existing tests.
Author: WeichenXu <WeichenXu123@outlook.com>
Closes #13242 from WeichenXu123/python_doctest_update_sparksession.
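
For context, the change follows the standard Spark 2.0 migration: a single SparkSession built via its builder replaces the explicit SparkContext/SQLContext pair. A minimal before/after sketch of that pattern (the master, app name, and sample row mirror the diff below; this sketch is illustrative and not part of the commit):

    # Before (Spark 1.x style): explicit SparkContext plus SQLContext
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    sc = SparkContext("local[2]", "ml.feature tests")
    sqlContext = SQLContext(sc)
    df = sqlContext.createDataFrame([(0.5,)], ["values"])

    # After (Spark 2.x style): one SparkSession from the builder pattern
    from pyspark.sql import SparkSession
    spark = SparkSession.builder \
        .master("local[2]") \
        .appName("ml.feature tests") \
        .getOrCreate()
    sc = spark.sparkContext  # the underlying SparkContext is still reachable
    df = spark.createDataFrame([(0.5,)], ["values"])
    spark.stop()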
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rwxr-xr-x  python/pyspark/ml/feature.py  66
1 file changed, 34 insertions, 32 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 497f2ad68e..93745c70c4 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -66,7 +66,7 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java
  Binarize a column of continuous features given a threshold.
- >>> df = sqlContext.createDataFrame([(0.5,)], ["values"])
+ >>> df = spark.createDataFrame([(0.5,)], ["values"])
  >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
  >>> binarizer.transform(df).head().features
  0.0
@@ -131,7 +131,7 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
  Maps a column of continuous features to a column of feature buckets.
- >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
+ >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
  >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
  ... inputCol="values", outputCol="buckets")
  >>> bucketed = bucketizer.transform(df).collect()
@@ -206,7 +206,7 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable,
  Extracts a vocabulary from document collections and generates a :py:attr:`CountVectorizerModel`.
- >>> df = sqlContext.createDataFrame(
+ >>> df = spark.createDataFrame(
  ... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
  ... ["label", "raw"])
  >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
@@ -381,7 +381,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
  <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia>`_.
  >>> from pyspark.ml.linalg import Vectors
- >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
+ >>> df1 = spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
  >>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
  >>> df2 = dct.transform(df1)
  >>> df2.head().resultVec
@@ -448,7 +448,7 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
  by a scalar multiplier.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
+ >>> df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
  >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
  ... inputCol="values", outputCol="eprod")
  >>> ep.transform(df).head().eprod
@@ -516,7 +516,7 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
  it is advisable to use a power of two as the numFeatures parameter; otherwise the features will not be mapped evenly to the columns.
- >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])
+ >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])
  >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
  >>> hashingTF.transform(df).head().features
  SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
@@ -583,7 +583,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
  Compute the Inverse Document Frequency (IDF) given a collection of documents.
  >>> from pyspark.ml.linalg import DenseVector
- >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),
+ >>> df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),
  ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
  >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
  >>> model = idf.fit(df)
@@ -671,7 +671,7 @@ class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
  any sparsity.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
+ >>> df = spark.createDataFrame([(Vectors.dense([1.0]),), (Vectors.dense([2.0]),)], ["a"])
  >>> maScaler = MaxAbsScaler(inputCol="a", outputCol="scaled")
  >>> model = maScaler.fit(df)
  >>> model.transform(df).show()
@@ -758,7 +758,7 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
  transformer will be DenseVector even for sparse input.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
+ >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
  >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
  >>> model = mmScaler.fit(df)
  >>> model.originalMin
@@ -889,7 +889,7 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr
  When the input array length is less than n (number of elements per n-gram), no n-grams are returned.
- >>> df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
+ >>> df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
  >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams")
  >>> ngram.transform(df).head()
  Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e'])
@@ -963,7 +963,7 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
  >>> from pyspark.ml.linalg import Vectors
  >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
- >>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
+ >>> df = spark.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])
  >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features")
  >>> normalizer.transform(df).head().features
  DenseVector([0.6, -0.8])
@@ -1115,7 +1115,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
  `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
+ >>> df = spark.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])
  >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")
  >>> px.transform(df).head().expanded
  DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])
@@ -1182,7 +1182,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
  covering all real values. This attempts to find numBuckets partitions based on a sample of data, but it may find fewer depending on the data sample values.
- >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
+ >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
  >>> qds = QuantileDiscretizer(numBuckets=2,
  ... inputCol="values", outputCol="buckets", seed=123)
  >>> qds.getSeed()
@@ -1272,7 +1272,7 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
  length. It returns an array of strings that can be empty.
- >>> df = sqlContext.createDataFrame([("A B c",)], ["text"])
+ >>> df = spark.createDataFrame([("A B c",)], ["text"])
  >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
  >>> reTokenizer.transform(df).head()
  Row(text=u'A B c', words=[u'a', u'b', u'c'])
@@ -1400,7 +1400,7 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable):
  Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
  where '__THIS__' represents the underlying table of the input dataset.
- >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
+ >>> df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
  >>> sqlTrans = SQLTransformer(
  ... statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
  >>> sqlTrans.transform(df).head()
@@ -1461,7 +1461,7 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
  statistics on the samples in the training set.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
+ >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
  >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
  >>> model = standardScaler.fit(df)
  >>> model.mean
@@ -1718,7 +1718,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl
  A feature transformer that filters out stop words from input.
  Note: null values from input array are preserved unless adding null to stopWords explicitly.
- >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["text"])
+ >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"])
  >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])
  >>> remover.transform(df).head().words == ['a', 'c']
  True
@@ -1810,7 +1810,7 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java
  A tokenizer that converts the input string to lowercase and then splits it by white spaces.
- >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+ >>> df = spark.createDataFrame([("a b c",)], ["text"])
  >>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
  >>> tokenizer.transform(df).head()
  Row(text=u'a b c', words=[u'a', u'b', u'c'])
@@ -1864,7 +1864,7 @@ class VectorAssembler(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadabl
  A feature transformer that merges multiple columns into a vector column.
- >>> df = sqlContext.createDataFrame([(1, 0, 3)], ["a", "b", "c"])
+ >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"])
  >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
  >>> vecAssembler.transform(df).head().features
  DenseVector([1.0, 0.0, 3.0])
@@ -1944,7 +1944,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
  - Add option for allowing unknown categories.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
+ >>> df = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
  ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
  >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")
  >>> model = indexer.fit(df)
@@ -2074,7 +2074,7 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J
  followed by the selected names (in the order given).
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame([
+ >>> df = spark.createDataFrame([
  ... (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
  ... (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
  ... (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])
@@ -2161,7 +2161,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
  natural language processing or machine learning process.
  >>> sent = ("a b " * 100 + "a c " * 10).split(" ")
- >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"])
+ >>> doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])
  >>> word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model")
  >>> model = word2Vec.fit(doc)
  >>> model.getVectors().show()
@@ -2345,7 +2345,7 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
  >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
  ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
  ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
- >>> df = sqlContext.createDataFrame(data,["features"])
+ >>> df = spark.createDataFrame(data,["features"])
  >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
  >>> model = pca.fit(df)
  >>> model.transform(df).collect()[0].pca_features
@@ -2447,7 +2447,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
  operators, including '~', '.', ':', '+', and '-'. Also see the `R formula docs
  <http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html>`_.
- >>> df = sqlContext.createDataFrame([
+ >>> df = spark.createDataFrame([
  ... (1.0, 1.0, "a"),
  ... (0.0, 2.0, "b"),
  ... (0.0, 0.0, "a")
@@ -2561,7 +2561,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
  categorical label.
  >>> from pyspark.ml.linalg import Vectors
- >>> df = sqlContext.createDataFrame(
+ >>> df = spark.createDataFrame(
  ... [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
  ...  (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
  ...  (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
@@ -2656,8 +2656,7 @@ if __name__ == "__main__":
  import tempfile
  import pyspark.ml.feature
- from pyspark.context import SparkContext
- from pyspark.sql import Row, SQLContext
+ from pyspark.sql import Row, SparkSession
  globs = globals().copy()
  features = pyspark.ml.feature.__dict__.copy()
@@ -2665,19 +2664,22 @@
  # The small batch size here ensures that we see multiple batches,
  # even in these small test examples:
- sc = SparkContext("local[2]", "ml.feature tests")
- sqlContext = SQLContext(sc)
+ spark = SparkSession.builder\
+     .master("local[2]")\
+     .appName("ml.feature tests")\
+     .getOrCreate()
+ sc = spark.sparkContext
  globs['sc'] = sc
- globs['sqlContext'] = sqlContext
+ globs['spark'] = spark
  testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"),
                             Row(id=2, label="c"), Row(id=3, label="a"),
                             Row(id=4, label="a"), Row(id=5, label="c")], 2)
- globs['stringIndDf'] = sqlContext.createDataFrame(testData)
+ globs['stringIndDf'] = spark.createDataFrame(testData)
  temp_path = tempfile.mkdtemp()
  globs['temp_path'] = temp_path
  try:
      (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
- sc.stop()
+ spark.stop()
  finally:
      from shutil import rmtree
      try: