author     Yanbo Liang <ybliang8@gmail.com>           2016-04-27 14:08:26 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2016-04-27 14:08:26 -0700
commit     4672e9838b130d006965efeba2665676aa995ebc (patch)
tree       1c9461c5596c76eb10059d90c351b4f2ded1bcb7 /mllib/src/main
parent     24bea000476cdd0b43be5160a76bc5b170ef0b42 (diff)
[SPARK-14899][ML][PYSPARK] Remove spark.ml HashingTF hashingAlg option
## What changes were proposed in this pull request?

Since [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574) breaks the behavior of `HashingTF`, we should try to enforce good practice by removing the "native" hashAlgorithm option from spark.ml and pyspark.ml. We can leave spark.mllib and pyspark.mllib alone.

## How was this patch tested?

Unit tests.

cc jkbradley

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #12702 from yanboliang/spark-14899.
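For context, a minimal sketch of spark.ml usage after this patch, e.g. pasted into spark-shell. The app name, column names, and sample data are illustrative assumptions, not part of the change; the point is that `setHashAlgorithm` no longer exists and MurmurHash 3 is always used:

```scala
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

// Illustrative setup; in spark-shell a SparkSession already exists.
val spark = SparkSession.builder()
  .appName("HashingTFExample")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

val df = Seq(
  (0, Seq("spark", "ml", "hashing")),
  (1, Seq("hashing", "trick", "hashing"))
).toDF("id", "words")

// After this patch there is no setHashAlgorithm setter;
// MurmurHash 3 is the only algorithm in spark.ml.
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 18) // the default; a power of two is advisable

hashingTF.transform(df).show(truncate = false)
```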
Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala     36
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala   4
2 files changed, 9 insertions(+), 31 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 6fc08aee13..66ae91cfc0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
/**
* :: Experimental ::
* Maps a sequence of terms to their term frequencies using the hashing trick.
- * Currently we support two hash algorithms: "murmur3" (default) and "native".
- * "murmur3" calculates a hash code value for the term object using
- * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32);
- * "native" calculates the hash code value using the native Scala implementation.
- * In Spark 1.6 and earlier, "native" is the default hash algorithm;
- * after Spark 2.0, we use "murmur3" as the default one.
+ * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+ * to calculate the hash code value for the term object.
+ * Since a simple modulo is used to transform the hash function to a column index,
+ * it is advisable to use a power of two as the numFeatures parameter;
+ * otherwise the features will not be mapped evenly to the columns.
*/
@Experimental
class HashingTF(override val uid: String)
@@ -69,20 +68,7 @@ class HashingTF(override val uid: String)
"This is useful for discrete probabilistic models that model binary events rather " +
"than integer counts")
- /**
- * The hash algorithm used when mapping term to integer.
- * Supported options: "murmur3" and "native". We use "native" as default hash algorithm
- * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one.
- * (Default = "murmur3")
- * @group expertParam
- */
- val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " +
- "mapping term to integer. Supported options: " +
- s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.",
- ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms))
-
- setDefault(numFeatures -> (1 << 18), binary -> false,
- hashAlgorithm -> feature.HashingTF.Murmur3)
+ setDefault(numFeatures -> (1 << 18), binary -> false)
/** @group getParam */
def getNumFeatures: Int = $(numFeatures)
@@ -96,18 +82,10 @@ class HashingTF(override val uid: String)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- /** @group expertGetParam */
- def getHashAlgorithm: String = $(hashAlgorithm)
-
- /** @group expertSetParam */
- def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value)
-
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema)
- val hashingTF = new feature.HashingTF($(numFeatures))
- .setBinary($(binary))
- .setHashAlgorithm($(hashAlgorithm))
+ val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
val metadata = outputSchema($(outputCol)).metadata
dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
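The new Scaladoc above advises a power-of-two `numFeatures` because a simple modulo turns the hash into a column index. A standalone sketch of that mapping follows; the `termIndex` helper is hypothetical, and Spark's own murmur3 hashes a string's UTF-8 bytes, so exact indices may differ:

```scala
import scala.util.hashing.MurmurHash3

// Hypothetical helper mirroring the hash-then-modulo mapping described
// in the Scaladoc; not Spark's internal code. Seed 42 matches the seed
// in org.apache.spark.mllib.feature.HashingTF.
def termIndex(term: String, numFeatures: Int, seed: Int = 42): Int = {
  val hash = MurmurHash3.stringHash(term, seed)
  // Non-negative modulo: the raw hash may be negative.
  ((hash % numFeatures) + numFeatures) % numFeatures
}

termIndex("spark", 1 << 18) // some index in [0, 262144)
```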
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index 321f11d9f9..bc26655104 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -135,18 +135,18 @@ object HashingTF {
private[spark] val Murmur3: String = "murmur3"
- private[spark] val supportedHashAlgorithms: Array[String] = Array(Native, Murmur3)
-
private val seed = 42
/**
* Calculate a hash code value for the term object using the native Scala implementation.
+ * This is the default hash algorithm used in Spark 1.6 and earlier.
*/
private[spark] def nativeHash(term: Any): Int = term.##
/**
* Calculate a hash code value for the term object using
* Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32).
+ * This is the default hash algorithm used from Spark 2.0 onwards.
*/
private[spark] def murmur3Hash(term: Any): Int = {
term match {
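(The hunk above is cut off here.) To make the contrast between the two algorithms kept in spark.mllib concrete, a standalone sketch under stated assumptions: `nativeHash` is just Scala's `##`, while the murmur3 variant below uses `scala.util.hashing.MurmurHash3` seeded with 42 for illustration. Spark's real `murmur3Hash` pattern-matches on the term's type and hashes strings via their UTF-8 bytes, so these values are illustrative only:

```scala
import scala.util.hashing.MurmurHash3

// Default in Spark 1.6 and earlier: Scala's built-in hash of the term.
def nativeHash(term: Any): Int = term.##

// Default family from Spark 2.0 onwards: MurmurHash 3 (sketch only).
def murmur3StringHash(term: String, seed: Int = 42): Int =
  MurmurHash3.stringHash(term, seed)

println(nativeHash("spark"))        // Scala hashCode-based value
println(murmur3StringHash("spark")) // MurmurHash 3-based value
```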