author     Yanbo Liang <ybliang8@gmail.com>           2016-04-27 14:08:26 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2016-04-27 14:08:26 -0700
commit     4672e9838b130d006965efeba2665676aa995ebc (patch)
tree       1c9461c5596c76eb10059d90c351b4f2ded1bcb7 /mllib/src/main
parent     24bea000476cdd0b43be5160a76bc5b170ef0b42 (diff)
[SPARK-14899][ML][PYSPARK] Remove spark.ml HashingTF hashingAlg option
## What changes were proposed in this pull request?

Since [SPARK-10574](https://issues.apache.org/jira/browse/SPARK-10574) breaks the behavior of `HashingTF`, we should try to enforce good practice by removing the "native" hashAlgorithm option from spark.ml and pyspark.ml. We can leave spark.mllib and pyspark.mllib alone.

## How was this patch tested?

Unit tests.

cc jkbradley

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #12702 from yanboliang/spark-14899.
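For context, a minimal sketch of spark.ml usage after this patch, e.g. pasted into spark-shell. The app name, column names, and sample data are illustrative assumptions, not part of the change; the point is that `setHashAlgorithm` no longer exists and MurmurHash 3 is always used:

```scala
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

// Illustrative setup; in spark-shell a SparkSession already exists.
val spark = SparkSession.builder()
  .appName("HashingTFExample")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

val df = Seq(
  (0, Seq("spark", "ml", "hashing")),
  (1, Seq("hashing", "trick", "hashing"))
).toDF("id", "words")

// After this patch there is no setHashAlgorithm setter;
// MurmurHash 3 is the only algorithm in spark.ml.
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 18) // the default; a power of two is advisable

hashingTF.transform(df).show(truncate = false)
```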
Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala     36
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala   4
2 files changed, 9 insertions(+), 31 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 6fc08aee13..66ae91cfc0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
/**
* :: Experimental ::
* Maps a sequence of terms to their term frequencies using the hashing trick.
- * Currently we support two hash algorithms: "murmur3" (default) and "native".
- * "murmur3" calculates a hash code value for the term object using
- * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32);
- * "native" calculates the hash code value using the native Scala implementation.
- * In Spark 1.6 and earlier, "native" is the default hash algorithm;
- * after Spark 2.0, we use "murmur3" as the default one.
+ * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+ * to calculate the hash code value for the term object.
+ * Since a simple modulo is used to transform the hash function to a column index,
+ * it is advisable to use a power of two as the numFeatures parameter;
+ * otherwise the features will not be mapped evenly to the columns.
*/
@Experimental
class HashingTF(override val uid: String)
@@ -69,20 +68,7 @@ class HashingTF(override val uid: String)
"This is useful for discrete probabilistic models that model binary events rather " +
"than integer counts")
- /**
- * The hash algorithm used when mapping term to integer.
- * Supported options: "murmur3" and "native". We use "native" as default hash algorithm
- * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one.
- * (Default = "murmur3")
- * @group expertParam
- */
- val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " +
- "mapping term to integer. Supported options: " +
- s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.",
- ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms))
-
- setDefault(numFeatures -> (1 << 18), binary -> false,
- hashAlgorithm -> feature.HashingTF.Murmur3)
+ setDefault(numFeatures -> (1 << 18), binary -> false)
/** @group getParam */
def getNumFeatures: Int = $(numFeatures)
@@ -96,18 +82,10 @@ class HashingTF(override val uid: String)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- /** @group expertGetParam */
- def getHashAlgorithm: String = $(hashAlgorithm)
-
- /** @group expertSetParam */
- def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value)
-
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema)
- val hashingTF = new feature.HashingTF($(numFeatures))
- .setBinary($(binary))
- .setHashAlgorithm($(hashAlgorithm))
+ val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
val metadata = outputSchema($(outputCol)).metadata
dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
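The new Scaladoc above advises a power-of-two `numFeatures` because a simple modulo turns the hash into a column index. A standalone sketch of that mapping follows; the `termIndex` helper is hypothetical, and Spark's own murmur3 hashes a string's UTF-8 bytes, so exact indices may differ:

```scala
import scala.util.hashing.MurmurHash3

// Hypothetical helper mirroring the hash-then-modulo mapping described
// in the Scaladoc; not Spark's internal code. Seed 42 matches the seed
// in org.apache.spark.mllib.feature.HashingTF.
def termIndex(term: String, numFeatures: Int, seed: Int = 42): Int = {
  val hash = MurmurHash3.stringHash(term, seed)
  // Non-negative modulo: the raw hash may be negative.
  ((hash % numFeatures) + numFeatures) % numFeatures
}

termIndex("spark", 1 << 18) // some index in [0, 262144)
```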
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index 321f11d9f9..bc26655104 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -135,18 +135,18 @@ object HashingTF {
private[spark] val Murmur3: String = "murmur3"
- private[spark] val supportedHashAlgorithms: Array[String] = Array(Native, Murmur3)
-
private val seed = 42
/**
* Calculate a hash code value for the term object using the native Scala implementation.
+ * This is the default hash algorithm used in Spark 1.6 and earlier.
*/
private[spark] def nativeHash(term: Any): Int = term.##
/**
* Calculate a hash code value for the term object using
* Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32).
+ * This is the default hash algorithm used from Spark 2.0 onwards.
*/
private[spark] def murmur3Hash(term: Any): Int = {
term match {
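(The hunk above is cut off here.) To make the contrast between the two algorithms kept in spark.mllib concrete, a standalone sketch under stated assumptions: `nativeHash` is just Scala's `##`, while the murmur3 variant below uses `scala.util.hashing.MurmurHash3` seeded with 42 for illustration. Spark's real `murmur3Hash` pattern-matches on the term's type and hashes strings via their UTF-8 bytes, so these values are illustrative only:

```scala
import scala.util.hashing.MurmurHash3

// Default in Spark 1.6 and earlier: Scala's built-in hash of the term.
def nativeHash(term: Any): Int = term.##

// Default family from Spark 2.0 onwards: MurmurHash 3 (sketch only).
def murmur3StringHash(term: String, seed: Int = 42): Int =
  MurmurHash3.stringHash(term, seed)

println(nativeHash("spark"))        // Scala hashCode-based value
println(murmur3StringHash("spark")) // MurmurHash 3-based value
```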