diff options
author | sethah <seth.hendrickson16@gmail.com> | 2016-03-10 13:17:41 +0200 |
---|---|---|
committer | Nick Pentreath <nick.pentreath@gmail.com> | 2016-03-10 13:17:41 +0200 |
commit | 9fe38aba1f70a4cb19ec1f9df4814fce0b267b54 (patch) | |
tree | fa03aa4a7c692c2b3dc13f706a98215c88a07914 /mllib | |
parent | 9525c563de9c446e108c1e9535238d99cc34cab9 (diff) | |
download | spark-9fe38aba1f70a4cb19ec1f9df4814fce0b267b54.tar.gz spark-9fe38aba1f70a4cb19ec1f9df4814fce0b267b54.tar.bz2 spark-9fe38aba1f70a4cb19ec1f9df4814fce0b267b54.zip |
[SPARK-11108][ML] OneHotEncoder should support other numeric types
Adding support for other numeric types:
* Integer
* Short
* Long
* Float
* Decimal
Author: sethah <seth.hendrickson16@gmail.com>
Closes #9777 from sethah/SPARK-11108.
Diffstat (limited to 'mllib')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala | 9 | ||||
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala | 29 |
2 files changed, 35 insertions, 3 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index e9df161c00..fa5013d3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -26,7 +26,7 @@ import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} /** * :: Experimental :: @@ -70,7 +70,8 @@ class OneHotEncoder(override val uid: String) extends Transformer val inputColName = $(inputCol) val outputColName = $(outputCol) - SchemaUtils.checkColumnType(schema, inputColName, DoubleType) + require(schema(inputColName).dataType.isInstanceOf[NumericType], + s"Input column must be of type NumericType but got ${schema(inputColName).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == outputColName), s"Output column $outputColName already exists.") @@ -133,7 +134,9 @@ class OneHotEncoder(override val uid: String) extends Transformer val numAttrs = dataset.select(col(inputColName).cast(DoubleType)).rdd.map(_.getDouble(0)) .aggregate(0.0)( (m, x) => { - assert(x >=0.0 && x == x.toInt, + assert(x <= Int.MaxValue, + s"OneHotEncoder only supports up to ${Int.MaxValue} indices, but got $x") + assert(x >= 0.0 && x == x.toInt, s"Values from column $inputColName must be indices, but got $x.") math.max(m, x) }, diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala index e238b33ed8..49803aef71 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types._ class OneHotEncoderSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -111,4 +112,32 @@ class OneHotEncoderSuite .setDropLast(false) testDefaultReadWrite(t) } + + test("OneHotEncoder with varying types") { + val df = stringIndexed() + val dfWithTypes = df + .withColumn("shortLabel", df("labelIndex").cast(ShortType)) + .withColumn("longLabel", df("labelIndex").cast(LongType)) + .withColumn("intLabel", df("labelIndex").cast(IntegerType)) + .withColumn("floatLabel", df("labelIndex").cast(FloatType)) + .withColumn("decimalLabel", df("labelIndex").cast(DecimalType(10, 0))) + val cols = Array("labelIndex", "shortLabel", "longLabel", "intLabel", + "floatLabel", "decimalLabel") + for (col <- cols) { + val encoder = new OneHotEncoder() + .setInputCol(col) + .setOutputCol("labelVec") + .setDropLast(false) + val encoded = encoder.transform(dfWithTypes) + + val output = encoded.select("id", "labelVec").rdd.map { r => + val vec = r.getAs[Vector](1) + (r.getInt(0), vec(0), vec(1), vec(2)) + }.collect().toSet + // a -> 0, b -> 2, c -> 1 + val expected = Set((0, 1.0, 0.0, 0.0), (1, 0.0, 0.0, 1.0), (2, 0.0, 1.0, 0.0), + (3, 1.0, 0.0, 0.0), (4, 1.0, 0.0, 0.0), (5, 0.0, 1.0, 0.0)) + assert(output === expected) + } + } } |