diff options
author | Feynman Liang <fliang@databricks.com> | 2015-06-29 18:40:30 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-06-29 18:40:30 -0700 |
commit | 620605a4a1123afaab2674e38251f1231dea17ce (patch) | |
tree | 2fec235613a66fb012193e8fae90902c6657b63d /python/pyspark/ml/tests.py | |
parent | 4c1808be4d3aaa37a5a878892e91ca73ea405ffa (diff) | |
download | spark-620605a4a1123afaab2674e38251f1231dea17ce.tar.gz spark-620605a4a1123afaab2674e38251f1231dea17ce.tar.bz2 spark-620605a4a1123afaab2674e38251f1231dea17ce.zip |
[SPARK-8456] [ML] Ngram featurizer python
Python API for N-gram feature transformer
Author: Feynman Liang <fliang@databricks.com>
Closes #6960 from feynmanliang/ngram-featurizer-python and squashes the following commits:
f9e37c9 [Feynman Liang] Remove debugging code
4dd81f4 [Feynman Liang] Fix typo and doctest
06c79ac [Feynman Liang] Style guide
26c1175 [Feynman Liang] Add python NGram API
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rw-r--r-- | python/pyspark/ml/tests.py | 11 |
1 files changed, 11 insertions, 0 deletions
```diff
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 6adbf166f3..c151d21fd6 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -252,6 +252,17 @@ class FeatureTests(PySparkTestCase):
         output = idf0m.transform(dataset)
         self.assertIsNotNone(output.head().idf)
 
+    def test_ngram(self):
+        sqlContext = SQLContext(self.sc)
+        dataset = sqlContext.createDataFrame([
+            ([["a", "b", "c", "d", "e"]])], ["input"])
+        ngram0 = NGram(n=4, inputCol="input", outputCol="output")
+        self.assertEqual(ngram0.getN(), 4)
+        self.assertEqual(ngram0.getInputCol(), "input")
+        self.assertEqual(ngram0.getOutputCol(), "output")
+        transformedDF = ngram0.transform(dataset)
+        self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"])
+
 
 if __name__ == "__main__":
     unittest.main()
```