author    | Volodymyr Lyubinets <vlyubin@gmail.com>   | 2015-04-10 16:27:56 -0700
committer | Michael Armbrust <michael@databricks.com> | 2015-04-10 16:27:56 -0700
commit    | 67d06880e47e0324409cf7e5b21db1dcb0107b82 (patch)
tree      | 4e59b5974a5f9b51919fec87f60a9f71d6f02234 /mllib/src
parent    | 23d5f8864f7d665a74b1d38118700139854dbb1c (diff)
download  | spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.tar.gz, spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.tar.bz2, spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.zip
[SQL] [SPARK-6620] Speed up toDF() and rdd() functions by constructing converters in ScalaReflection
cc marmbrus
Author: Volodymyr Lyubinets <vlyubin@gmail.com>
Closes #5279 from vlyubin/speedup and squashes the following commits:
e75a387 [Volodymyr Lyubinets] Changes to ScalaUDF
11a20ec [Volodymyr Lyubinets] Avoid creating a tuple
c327bc9 [Volodymyr Lyubinets] Moved the only remaining function from DataTypeConversions to DateUtils
dec6802 [Volodymyr Lyubinets] Addressed review feedback
74301fa [Volodymyr Lyubinets] Addressed review comments
afa3aa5 [Volodymyr Lyubinets] Minor refactoring, added license, removed debug output
881dc60 [Volodymyr Lyubinets] Moved to a separate module; addressed review comments; one extra place of usage; changed behaviour for Java
8cad6e2 [Volodymyr Lyubinets] Addressed review comments
41b2aa9 [Volodymyr Lyubinets] Creating converters for ScalaReflection stuff, and more
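The core idea named in the title is to build a converter for a schema once (alongside the schema derived in ScalaReflection) and reuse it for every row, instead of re-inspecting types on each call to toDF()/rdd(). The sketch below is an illustrative simplification, not Spark's actual code; the type names and the createConverter helper are hypothetical.

```scala
// Minimal sketch (assumed names, not Spark's implementation) of
// "construct converters up front": a converter function is built once from
// the data type, then applied per value, so per-row work avoids repeated
// type inspection.
sealed trait SimpleType
case object StringType extends SimpleType
case class ArrayType(element: SimpleType) extends SimpleType

object ConverterSketch {
  type Converter = Any => Any

  // Built once per field of the schema, reused for every row.
  def createConverter(dt: SimpleType): Converter = dt match {
    case StringType => (v: Any) => v
    case ArrayType(elem) =>
      val convertElem = createConverter(elem)  // constructed once, not per row
      (v: Any) => v.asInstanceOf[Seq[Any]].map(convertElem)
  }

  def main(args: Array[String]): Unit = {
    val convert = createConverter(ArrayType(StringType))
    println(convert(Seq("Test", "for", "tokenization")))  // List(Test, for, tokenization)
  }
}
```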
Diffstat (limited to 'mllib/src')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala | 17
1 file changed, 7 insertions, 10 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index bf862b912d..d186ead8f5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -25,10 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 
 @BeanInfo
-case class TokenizerTestData(rawText: String, wantedTokens: Seq[String]) {
-  /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */
-  def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq)
-}
+case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
 
 class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
@@ -46,14 +43,14 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setOutputCol("tokens")
 
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")),
-      TokenizerTestData("Te,st. punct", Seq("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset0)
 
     val dataset1 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization")),
-      TokenizerTestData("Te,st. punct", Seq("punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
+      TokenizerTestData("Te,st. punct", Array("punct"))
     ))
 
     tokenizer.setMinTokenLength(3)
@@ -64,8 +61,8 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setGaps(true)
       .setMinTokenLength(0)
     val dataset2 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization.")),
-      TokenizerTestData("Te,st. punct", Seq("Te,st.", "", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
+      TokenizerTestData("Te,st. punct", Array("Te,st.", "", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset2)
   }
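For context on the test change above: with the new converter path, the Java-friendly secondary constructor is dropped and the test data holds tokens as Array[String], which Spark SQL's reflection maps to an array<string> column. Below is a minimal usage sketch, assuming a Spark 1.x local context; the object name and app name are illustrative, not from the patch.

```scala
import scala.beans.BeanInfo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

// Illustrative driver: builds a DataFrame from the test case class and
// prints its schema; the Array[String] field appears as array<string>.
object TokenizerTestDataExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("tokenizer-test-data").setMaster("local[1]"))
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
    ))
    df.printSchema()  // rawText: string, wantedTokens: array<string>
    sc.stop()
  }
}
```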