From 425f6916462ca5d0595c61101d52686006ed6b8b Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Mon, 25 Apr 2016 12:08:43 -0700
Subject: [SPARK-10574][ML][MLLIB] HashingTF supports MurmurHash3

## What changes were proposed in this pull request?
As discussed at [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574), ```HashingTF``` should support MurmurHash3 and make it the default hash algorithm. We should also expose a set/get API for ```hashAlgorithm``` so that users can choose the hash method.

Note: The problem that ```mllib.feature.HashingTF``` behaves differently between Scala/Java and Python will be resolved in follow-up work.

## How was this patch tested?
Unit tests.

cc jkbradley MLnick

Author: Yanbo Liang
Author: Joseph K. Bradley

Closes #12498 from yanboliang/spark-10574.
---
 python/pyspark/ml/feature.py | 39 +++++++++++++++++++++++++++++++--------
 python/pyspark/ml/tests.py   |  3 ++-
 2 files changed, 33 insertions(+), 9 deletions(-)
(limited to 'python/pyspark')

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 1b298e639d..0e578d48ca 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -523,12 +523,12 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
     >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])
     >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
     >>> hashingTF.transform(df).head().features
-    SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})
+    SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
     >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs
-    SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})
+    SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
     >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}
     >>> hashingTF.transform(df, params).head().vector
-    SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0})
+    SparseVector(5, {0: 1.0, 1: 1.0, 2: 1.0})
     >>> hashingTFPath = temp_path + "/hashing-tf"
     >>> hashingTF.save(hashingTFPath)
     >>> loadedHashingTF = HashingTF.load(hashingTFPath)
@@ -543,22 +543,30 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
                    "rather than integer counts. Default False.",
                    typeConverter=TypeConverters.toBoolean)
 
+    hashAlgorithm = Param(Params._dummy(), "hashAlgorithm", "The hash algorithm used when " +
+                          "mapping term to integer. Supported options: murmur3(default) " +
+                          "and native.", typeConverter=TypeConverters.toString)
+
     @keyword_only
-    def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
+    def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None,
+                 hashAlgorithm="murmur3"):
         """
-        __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
+        __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None, \
+                 hashAlgorithm="murmur3")
         """
         super(HashingTF, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid)
-        self._setDefault(numFeatures=1 << 18, binary=False)
+        self._setDefault(numFeatures=1 << 18, binary=False, hashAlgorithm="murmur3")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.3.0")
-    def setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None):
+    def setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None,
+                  hashAlgorithm="murmur3"):
         """
-        setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None)
+        setParams(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None, \
+                  hashAlgorithm="murmur3")
         Sets params for this HashingTF.
         """
         kwargs = self.setParams._input_kwargs
@@ -579,6 +587,21 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
         """
         return self.getOrDefault(self.binary)
 
+    @since("2.0.0")
+    def setHashAlgorithm(self, value):
+        """
+        Sets the value of :py:attr:`hashAlgorithm`.
+        """
+        self._set(hashAlgorithm=value)
+        return self
+
+    @since("2.0.0")
+    def getHashAlgorithm(self):
+        """
+        Gets the value of hashAlgorithm or its default value.
+        """
+        return self.getOrDefault(self.hashAlgorithm)
+
 
 @inherit_doc
 class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index e95458699d..8954e96df9 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -918,7 +918,8 @@ class HashingTFTest(PySparkTestCase):
         df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
         n = 100
         hashingTF = HashingTF()
-        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
+        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n)\
+            .setBinary(True).setHashAlgorithm("native")
         output = hashingTF.transform(df)
         features = output.select("features").first().features.toArray()
         expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
--
cgit v1.2.3
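
For context, below is a minimal usage sketch (not part of the patch) of the `hashAlgorithm` param this commit adds. The constructor arguments and the `setHashAlgorithm`/`getHashAlgorithm` calls come from the diff above; the `spark` session, the sample data, and the variable names are illustrative assumptions.

```python
# Minimal sketch (not part of the patch): exercising the new hashAlgorithm
# param from PySpark. Assumes an already-created Spark session bound to
# `spark`; data and variable names are illustrative.
from pyspark.ml.feature import HashingTF

df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])

# MurmurHash3 ("murmur3") is the default algorithm after this patch.
hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
print(hashingTF.getHashAlgorithm())            # 'murmur3'
print(hashingTF.transform(df).head().features)

# Switch to the previous JVM hashing behavior ("native") if indices
# produced by earlier releases need to be reproduced.
hashingTF.setHashAlgorithm("native")
print(hashingTF.transform(df).head().features)
```

Note that, as the updated doctests show, switching the default to MurmurHash3 changes which buckets terms hash into, so feature indices produced by earlier releases will generally differ.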