diff options
author | Jason Lee <cjlee@us.ibm.com> | 2016-04-18 12:47:14 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-04-18 12:47:14 -0700 |
commit | 3d66a2ce9bfc19096e07181f9e970372d32bbc0b (patch) | |
tree | d2e5205d84bd63a764801ff106f098897e507c41 /python/pyspark/mllib | |
parent | d280d1da1aec925687a0bfb496f3a6e0979e896f (diff) | |
download | spark-3d66a2ce9bfc19096e07181f9e970372d32bbc0b.tar.gz spark-3d66a2ce9bfc19096e07181f9e970372d32bbc0b.tar.bz2 spark-3d66a2ce9bfc19096e07181f9e970372d32bbc0b.zip |
[SPARK-14564][ML][MLLIB][PYSPARK] Python Word2Vec missing setWindowSize method
## What changes were proposed in this pull request?
Added windowSize getter/setter to ML/MLlib
## How was this patch tested?
Added test cases in tests.py under both ML and MLlib
Author: Jason Lee <cjlee@us.ibm.com>
Closes #12428 from jasoncl/SPARK-14564.
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r-- | python/pyspark/mllib/feature.py | 11 | ||||
-rw-r--r-- | python/pyspark/mllib/tests.py | 4 |
2 files changed, 13 insertions, 2 deletions
```diff
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index b3dd2f63a5..90559f6cfb 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -617,6 +617,7 @@ class Word2Vec(object):
         self.numIterations = 1
         self.seed = random.randint(0, sys.maxsize)
         self.minCount = 5
+        self.windowSize = 5
 
     @since('1.2.0')
     def setVectorSize(self, vectorSize):
@@ -669,6 +670,14 @@ class Word2Vec(object):
         self.minCount = minCount
         return self
 
+    @since('2.0.0')
+    def setWindowSize(self, windowSize):
+        """
+        Sets window size (default: 5).
+        """
+        self.windowSize = windowSize
+        return self
+
     @since('1.2.0')
     def fit(self, data):
         """
@@ -682,7 +691,7 @@ class Word2Vec(object):
         jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize),
                                float(self.learningRate), int(self.numPartitions),
                                int(self.numIterations), int(self.seed),
-                               int(self.minCount))
+                               int(self.minCount), int(self.windowSize))
         return Word2VecModel(jmodel)
 
 
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index ac55fbf798..f272da56d1 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -1027,13 +1027,15 @@ class Word2VecTests(MLlibTestCase):
             .setNumPartitions(2) \
             .setNumIterations(10) \
             .setSeed(1024) \
-            .setMinCount(3)
+            .setMinCount(3) \
+            .setWindowSize(6)
         self.assertEqual(model.vectorSize, 2)
         self.assertTrue(model.learningRate < 0.02)
         self.assertEqual(model.numPartitions, 2)
         self.assertEqual(model.numIterations, 10)
         self.assertEqual(model.seed, 1024)
         self.assertEqual(model.minCount, 3)
+        self.assertEqual(model.windowSize, 6)
 
     def test_word2vec_get_vectors(self):
         data = [
```