path: root/mllib/src/test
author     Volodymyr Lyubinets <vlyubin@gmail.com>    2015-04-10 16:27:56 -0700
committer  Michael Armbrust <michael@databricks.com>  2015-04-10 16:27:56 -0700
commit     67d06880e47e0324409cf7e5b21db1dcb0107b82 (patch)
tree       4e59b5974a5f9b51919fec87f60a9f71d6f02234 /mllib/src/test
parent     23d5f8864f7d665a74b1d38118700139854dbb1c (diff)
[SQL] [SPARK-6620] Speed up toDF() and rdd() functions by constructing converters in ScalaReflection
cc marmbrus

Author: Volodymyr Lyubinets <vlyubin@gmail.com>

Closes #5279 from vlyubin/speedup and squashes the following commits:

e75a387 [Volodymyr Lyubinets] Changes to ScalaUDF
11a20ec [Volodymyr Lyubinets] Avoid creating a tuple
c327bc9 [Volodymyr Lyubinets] Moved the only remaining function from DataTypeConversions to DateUtils
dec6802 [Volodymyr Lyubinets] Addressed review feedback
74301fa [Volodymyr Lyubinets] Addressed review comments
afa3aa5 [Volodymyr Lyubinets] Minor refactoring, added license, removed debug output
881dc60 [Volodymyr Lyubinets] Moved to a separate module; addressed review comments; one extra place of usage; changed behaviour for Java
8cad6e2 [Volodymyr Lyubinets] Addressed review comments
41b2aa9 [Volodymyr Lyubinets] Creating converters for ScalaReflection stuff, and more
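The core idea of the patch, as a hedged sketch (illustrative names only, not the actual ScalaReflection or converter code in Spark): instead of re-inspecting the type of every value on every row during toDF()/rdd(), a converter function is constructed once per column and then applied to each row.

// Hypothetical, simplified sketch of "construct converters up front".
// The type dispatch happens once per column when the converter is built;
// each row then only invokes the prebuilt function. Names here are
// illustrative and are not the real Spark internals.
object ConverterSketch {
  sealed trait SimpleType
  case object StringType extends SimpleType
  case object IntType extends SimpleType

  type Converter = Any => Any

  // Resolve the conversion strategy once, outside the per-row loop.
  def createConverter(dt: SimpleType): Converter = dt match {
    case StringType => v => v.toString
    case IntType    => v => v.asInstanceOf[Number].intValue()
  }

  // Convert a batch of rows using converters built once per schema, which is
  // what keeps per-value type dispatch off the hot path.
  def convertRows(rows: Seq[Seq[Any]], schema: Seq[SimpleType]): Seq[Seq[Any]] = {
    val converters = schema.map(createConverter)
    rows.map(_.zip(converters).map { case (value, convert) => convert(value) })
  }
}

Moving all type dispatch out of the inner loop in this way is the essence of the speedup described in the commit title.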
Diffstat (limited to 'mllib/src/test')
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala | 17
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index bf862b912d..d186ead8f5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -25,10 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
@BeanInfo
-case class TokenizerTestData(rawText: String, wantedTokens: Seq[String]) {
- /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */
- def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq)
-}
+case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
import org.apache.spark.ml.feature.RegexTokenizerSuite._
@@ -46,14 +43,14 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
.setOutputCol("tokens")
val dataset0 = sqlContext.createDataFrame(Seq(
- TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")),
- TokenizerTestData("Te,st. punct", Seq("Te", ",", "st", ".", "punct"))
+ TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
+ TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
))
testRegexTokenizer(tokenizer, dataset0)
val dataset1 = sqlContext.createDataFrame(Seq(
- TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization")),
- TokenizerTestData("Te,st. punct", Seq("punct"))
+ TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
+ TokenizerTestData("Te,st. punct", Array("punct"))
))
tokenizer.setMinTokenLength(3)
@@ -64,8 +61,8 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
.setGaps(true)
.setMinTokenLength(0)
val dataset2 = sqlContext.createDataFrame(Seq(
- TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization.")),
- TokenizerTestData("Te,st. punct", Seq("Te,st.", "", "punct"))
+ TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
+ TokenizerTestData("Te,st. punct", Array("Te,st.", "", "punct"))
))
testRegexTokenizer(tokenizer, dataset2)
}
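As a usage note on the TokenizerTestData change in the diff above: with wantedTokens typed as Array[String], the single primary constructor is directly callable from Java test code, so the auxiliary constructor previously kept for JavaTokenizerSuite is no longer needed. A minimal sketch of the assumed call sites (illustrative, not copied from the suites):

// Illustrative only: the Array-based case class from the diff above.
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

object TokenizerTestDataUsage {
  // Scala call site, matching the updated RegexTokenizerSuite.
  val fromScala = TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))

  // From Java, a String[] maps directly onto Array[String], so the removed
  // auxiliary constructor `this(rawText, wantedTokens.toSeq)` is unnecessary:
  //   new TokenizerTestData("Te,st. punct", new String[]{"Te", ",", "st", ".", "punct"});
}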