author     Yanbo Liang <ybliang8@gmail.com>            2016-04-27 14:08:26 -0700
committer  Joseph K. Bradley <joseph@databricks.com>   2016-04-27 14:08:26 -0700
commit     4672e9838b130d006965efeba2665676aa995ebc (patch)
tree       1c9461c5596c76eb10059d90c351b4f2ded1bcb7 /mllib/src/main
parent     24bea000476cdd0b43be5160a76bc5b170ef0b42 (diff)
[SPARK-14899][ML][PYSPARK] Remove spark.ml HashingTF hashingAlg option
## What changes were proposed in this pull request?
Since [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574) already breaks the behavior of `HashingTF` (switching the default hash algorithm to murmur3), we should enforce good practice by removing the legacy "native" `hashAlgorithm` option from spark.ml and pyspark.ml. We can leave spark.mllib and pyspark.mllib alone.
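(Editor's illustration, not part of the original commit message.) A minimal sketch of how a spark.ml `HashingTF` is configured after this patch: the `hashAlgorithm` param and `setHashAlgorithm` setter are gone, and murmur3 is always used. The input/output column names below are hypothetical.

```scala
import org.apache.spark.ml.feature.HashingTF

// Sketch of spark.ml HashingTF usage after this change: no hash-algorithm
// setter exists; murmur3 is the only supported algorithm.
val hashingTF = new HashingTF()
  .setInputCol("words")      // hypothetical column of Seq[String]
  .setOutputCol("features")  // output term-frequency vectors
  .setNumFeatures(1 << 18)   // a power of two, per the updated Scaladoc
  .setBinary(false)          // raw counts rather than binary indicators
```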
## How was this patch tested?
Unit tests.
cc jkbradley
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #12702 from yanboliang/spark-14899.
Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala     36
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala   4
2 files changed, 9 insertions, 31 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 6fc08aee13..66ae91cfc0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
 /**
  * :: Experimental ::
  * Maps a sequence of terms to their term frequencies using the hashing trick.
- * Currently we support two hash algorithms: "murmur3" (default) and "native".
- * "murmur3" calculates a hash code value for the term object using
- * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32);
- * "native" calculates the hash code value using the native Scala implementation.
- * In Spark 1.6 and earlier, "native" is the default hash algorithm;
- * after Spark 2.0, we use "murmur3" as the default one.
+ * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+ * to calculate the hash code value for the term object.
+ * Since a simple modulo is used to transform the hash function to a column index,
+ * it is advisable to use a power of two as the numFeatures parameter;
+ * otherwise the features will not be mapped evenly to the columns.
  */
 @Experimental
 class HashingTF(override val uid: String)
@@ -69,20 +68,7 @@ class HashingTF(override val uid: String)
     "This is useful for discrete probabilistic models that model binary events rather " +
     "than integer counts")
 
-  /**
-   * The hash algorithm used when mapping term to integer.
-   * Supported options: "murmur3" and "native". We use "native" as default hash algorithm
-   * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one.
-   * (Default = "murmur3")
-   * @group expertParam
-   */
-  val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " +
-    "mapping term to integer. Supported options: " +
-    s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.",
-    ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms))
-
-  setDefault(numFeatures -> (1 << 18), binary -> false,
-    hashAlgorithm -> feature.HashingTF.Murmur3)
+  setDefault(numFeatures -> (1 << 18), binary -> false)
 
   /** @group getParam */
   def getNumFeatures: Int = $(numFeatures)
@@ -96,18 +82,10 @@ class HashingTF(override val uid: String)
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
-  /** @group expertGetParam */
-  def getHashAlgorithm: String = $(hashAlgorithm)
-
-  /** @group expertSetParam */
-  def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value)
-
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val outputSchema = transformSchema(dataset.schema)
-    val hashingTF = new feature.HashingTF($(numFeatures))
-      .setBinary($(binary))
-      .setHashAlgorithm($(hashAlgorithm))
+    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
     val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
     val metadata = outputSchema($(outputCol)).metadata
     dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index 321f11d9f9..bc26655104 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -135,18 +135,18 @@ object HashingTF {
 
   private[spark] val Murmur3: String = "murmur3"
 
-  private[spark] val supportedHashAlgorithms: Array[String] = Array(Native, Murmur3)
-
   private val seed = 42
 
   /**
    * Calculate a hash code value for the term object using the native Scala implementation.
+   * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
   private[spark] def nativeHash(term: Any): Int = term.##
 
   /**
    * Calculate a hash code value for the term object using
    * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32).
+   * This is the default hash algorithm used from Spark 2.0 onwards.
   */
   private[spark] def murmur3Hash(term: Any): Int = {
     term match {