aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
diff options
context:
space:
mode:
Diffstat (limited to 'mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala36
1 files changed, 7 insertions, 29 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 6fc08aee13..66ae91cfc0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
/**
* :: Experimental ::
* Maps a sequence of terms to their term frequencies using the hashing trick.
- * Currently we support two hash algorithms: "murmur3" (default) and "native".
- * "murmur3" calculates a hash code value for the term object using
- * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32);
- * "native" calculates the hash code value using the native Scala implementation.
- * In Spark 1.6 and earlier, "native" is the default hash algorithm;
- * after Spark 2.0, we use "murmur3" as the default one.
+ * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32)
+ * to calculate the hash code value for the term object.
+ * Since a simple modulo is used to transform the hash value to a column index,
+ * it is advisable to use a power of two as the numFeatures parameter;
+ * otherwise the features will not be mapped evenly to the columns.
*/
@Experimental
class HashingTF(override val uid: String)
@@ -69,20 +68,7 @@ class HashingTF(override val uid: String)
"This is useful for discrete probabilistic models that model binary events rather " +
"than integer counts")
- /**
- * The hash algorithm used when mapping term to integer.
- * Supported options: "murmur3" and "native". We use "native" as default hash algorithm
- * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one.
- * (Default = "murmur3")
- * @group expertParam
- */
- val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " +
- "mapping term to integer. Supported options: " +
- s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.",
- ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms))
-
- setDefault(numFeatures -> (1 << 18), binary -> false,
- hashAlgorithm -> feature.HashingTF.Murmur3)
+ setDefault(numFeatures -> (1 << 18), binary -> false)
/** @group getParam */
def getNumFeatures: Int = $(numFeatures)
@@ -96,18 +82,10 @@ class HashingTF(override val uid: String)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- /** @group expertGetParam */
- def getHashAlgorithm: String = $(hashAlgorithm)
-
- /** @group expertSetParam */
- def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value)
-
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema)
- val hashingTF = new feature.HashingTF($(numFeatures))
- .setBinary($(binary))
- .setHashAlgorithm($(hashAlgorithm))
+ val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
val metadata = outputSchema($(outputCol)).metadata
dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))