author    | Volodymyr Lyubinets <vlyubin@gmail.com>   | 2015-04-10 16:27:56 -0700
committer | Michael Armbrust <michael@databricks.com> | 2015-04-10 16:27:56 -0700
commit    | 67d06880e47e0324409cf7e5b21db1dcb0107b82 (patch)
tree      | 4e59b5974a5f9b51919fec87f60a9f71d6f02234 /mllib/src
parent    | 23d5f8864f7d665a74b1d38118700139854dbb1c (diff)
download  | spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.tar.gz, spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.tar.bz2, spark-67d06880e47e0324409cf7e5b21db1dcb0107b82.zip
[SQL] [SPARK-6620] Speed up toDF() and rdd() functions by constructing converters in ScalaReflection
cc marmbrus
Author: Volodymyr Lyubinets <vlyubin@gmail.com>
Closes #5279 from vlyubin/speedup and squashes the following commits:
e75a387 [Volodymyr Lyubinets] Changes to ScalaUDF
11a20ec [Volodymyr Lyubinets] Avoid creating a tuple
c327bc9 [Volodymyr Lyubinets] Moved the only remaining function from DataTypeConversions to DateUtils
dec6802 [Volodymyr Lyubinets] Addressed review feedback
74301fa [Volodymyr Lyubinets] Addressed review comments
afa3aa5 [Volodymyr Lyubinets] Minor refactoring, added license, removed debug output
881dc60 [Volodymyr Lyubinets] Moved to a separate module; addressed review comments; one extra place of usage; changed behaviour for Java
8cad6e2 [Volodymyr Lyubinets] Addressed review comments
41b2aa9 [Volodymyr Lyubinets] Creating converters for ScalaReflection stuff, and more
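The core idea named in the title is to build a converter for a schema once (alongside the schema derived in ScalaReflection) and reuse it for every row, instead of re-inspecting types on each call to toDF()/rdd(). The sketch below is an illustrative simplification, not Spark's actual code; the type names and the createConverter helper are hypothetical.

```scala
// Minimal sketch (assumed names, not Spark's implementation) of
// "construct converters up front": a converter function is built once from
// the data type, then applied per value, so per-row work avoids repeated
// type inspection.
sealed trait SimpleType
case object StringType extends SimpleType
case class ArrayType(element: SimpleType) extends SimpleType

object ConverterSketch {
  type Converter = Any => Any

  // Built once per field of the schema, reused for every row.
  def createConverter(dt: SimpleType): Converter = dt match {
    case StringType => (v: Any) => v
    case ArrayType(elem) =>
      val convertElem = createConverter(elem)  // constructed once, not per row
      (v: Any) => v.asInstanceOf[Seq[Any]].map(convertElem)
  }

  def main(args: Array[String]): Unit = {
    val convert = createConverter(ArrayType(StringType))
    println(convert(Seq("Test", "for", "tokenization")))  // List(Test, for, tokenization)
  }
}
```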
Diffstat (limited to 'mllib/src')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala | 17
1 file changed, 7 insertions, 10 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index bf862b912d..d186ead8f5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -25,10 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 
 @BeanInfo
-case class TokenizerTestData(rawText: String, wantedTokens: Seq[String]) {
-  /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */
-  def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq)
-}
+case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
 
 class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
@@ -46,14 +43,14 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setOutputCol("tokens")
 
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")),
-      TokenizerTestData("Te,st. punct", Seq("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset0)
 
     val dataset1 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization")),
-      TokenizerTestData("Te,st. punct", Seq("punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
+      TokenizerTestData("Te,st. punct", Array("punct"))
     ))
 
     tokenizer.setMinTokenLength(3)
@@ -64,8 +61,8 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setGaps(true)
       .setMinTokenLength(0)
     val dataset2 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization.")),
-      TokenizerTestData("Te,st. punct", Seq("Te,st.", "", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
+      TokenizerTestData("Te,st. punct", Array("Te,st.", "", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset2)
   }
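For context on the test change above: with the new converter path, the Java-friendly secondary constructor is dropped and the test data holds tokens as Array[String], which Spark SQL's reflection maps to an array<string> column. Below is a minimal usage sketch, assuming a Spark 1.x local context; the object name and app name are illustrative, not from the patch.

```scala
import scala.beans.BeanInfo
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

// Illustrative driver: builds a DataFrame from the test case class and
// prints its schema; the Array[String] field appears as array<string>.
object TokenizerTestDataExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("tokenizer-test-data").setMaster("local[1]"))
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
    ))
    df.printSchema()  // rawText: string, wantedTokens: array<string>
    sc.stop()
  }
}
```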