diff options
Diffstat (limited to 'mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala | 36 |
1 files changed, 7 insertions, 29 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 6fc08aee13..66ae91cfc0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -31,12 +31,11 @@ import org.apache.spark.sql.types.{ArrayType, StructType} /** * :: Experimental :: * Maps a sequence of terms to their term frequencies using the hashing trick. - * Currently we support two hash algorithms: "murmur3" (default) and "native". - * "murmur3" calculates a hash code value for the term object using - * Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32); - * "native" calculates the hash code value using the native Scala implementation. - * In Spark 1.6 and earlier, "native" is the default hash algorithm; - * after Spark 2.0, we use "murmur3" as the default one. + * Currently we use Austin Appleby's MurmurHash 3 algorithm (MurmurHash3_x86_32) + * to calculate the hash code value for the term object. + * Since a simple modulo is used to transform the hash function to a column index, + * it is advisable to use a power of two as the numFeatures parameter; + * otherwise the features will not be mapped evenly to the columns. */ @Experimental class HashingTF(override val uid: String) @@ -69,20 +68,7 @@ class HashingTF(override val uid: String) "This is useful for discrete probabilistic models that model binary events rather " + "than integer counts") - /** - * The hash algorithm used when mapping term to integer. - * Supported options: "murmur3" and "native". We use "native" as default hash algorithm - * in Spark 1.6 and earlier. After Spark 2.0, we use "murmur3" as default one. - * (Default = "murmur3") - * @group expertParam - */ - val hashAlgorithm = new Param[String](this, "hashAlgorithm", "The hash algorithm used when " + - "mapping term to integer. Supported options: " + - s"${feature.HashingTF.supportedHashAlgorithms.mkString(",")}.", - ParamValidators.inArray[String](feature.HashingTF.supportedHashAlgorithms)) - - setDefault(numFeatures -> (1 << 18), binary -> false, - hashAlgorithm -> feature.HashingTF.Murmur3) + setDefault(numFeatures -> (1 << 18), binary -> false) /** @group getParam */ def getNumFeatures: Int = $(numFeatures) @@ -96,18 +82,10 @@ class HashingTF(override val uid: String) /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) - /** @group expertGetParam */ - def getHashAlgorithm: String = $(hashAlgorithm) - - /** @group expertSetParam */ - def setHashAlgorithm(value: String): this.type = set(hashAlgorithm, value) - @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) - val hashingTF = new feature.HashingTF($(numFeatures)) - .setBinary($(binary)) - .setHashAlgorithm($(hashAlgorithm)) + val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) |