aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYuhao Yang <hhbyyh@gmail.com>2015-07-17 13:43:19 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-07-17 13:43:19 -0700
commit806c579f43ce66ac1398200cbc773fa3b69b5cb6 (patch)
tree38f13d9412a91bca70c160fdd951cc65aed01a76
parentf9a82a884e7cb2a466a33ab64912924ce7ee30c1 (diff)
downloadspark-806c579f43ce66ac1398200cbc773fa3b69b5cb6.tar.gz
spark-806c579f43ce66ac1398200cbc773fa3b69b5cb6.tar.bz2
spark-806c579f43ce66ac1398200cbc773fa3b69b5cb6.zip
[SPARK-9062] [ML] Change output type of Tokenizer to Array(String, true)
jira: https://issues.apache.org/jira/browse/SPARK-9062 Currently the output type of Tokenizer is Array(String, false), which is not compatible with Word2Vec and other transformers, since their input type is Array(String, true). A Seq[String] returned from a udf is treated as Array(String, true) by default. I'm not sure what the recommended way is for Tokenizer to handle a null value in the input. Any suggestion will be welcome. Author: Yuhao Yang <hhbyyh@gmail.com> Closes #7414 from hhbyyh/tokenizer and squashes the following commits: c01bd7a [Yuhao Yang] change output type of tokenizer
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala4
1 file changed, 2 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 5f9f57a2eb..0b3af4747e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -42,7 +42,7 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
require(inputType == StringType, s"Input type must be string type but got $inputType.")
}
- override protected def outputDataType: DataType = new ArrayType(StringType, false)
+ override protected def outputDataType: DataType = new ArrayType(StringType, true)
override def copy(extra: ParamMap): Tokenizer = defaultCopy(extra)
}
@@ -113,7 +113,7 @@ class RegexTokenizer(override val uid: String)
require(inputType == StringType, s"Input type must be string type but got $inputType.")
}
- override protected def outputDataType: DataType = new ArrayType(StringType, false)
+ override protected def outputDataType: DataType = new ArrayType(StringType, true)
override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}