about | summary | refs | log | tree | commit | diff
path: root/mllib/src/test
diff options
context:
space:
mode:
author: Eric Liang <ekl@databricks.com> 2015-07-28 14:16:57 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-07-28 14:16:57 -0700
commit: 8d5bb5283c3cc9180ef34b05be4a715d83073b1e (patch)
tree: c91d0261b5212032a129bcad4d772f1b183a7ea8 /mllib/src/test
parent: 6cdcc21fe654ac0a2d0d72783eb10005fc513af6 (diff)
download: spark-8d5bb5283c3cc9180ef34b05be4a715d83073b1e.tar.gz
spark-8d5bb5283c3cc9180ef34b05be4a715d83073b1e.tar.bz2
spark-8d5bb5283c3cc9180ef34b05be4a715d83073b1e.zip
[SPARK-9391] [ML] Support minus, dot, and intercept operators in SparkR RFormula
Adds '.', '-', and intercept parsing to RFormula. Also splits RFormulaParser into a separate file.

Umbrella design doc here: https://docs.google.com/document/d/10NZNSEurN2EdWM31uFYsgayIPfCFHiuIu3pCWrUmP_c/edit?usp=sharing

mengxr

Author: Eric Liang <ekl@databricks.com>

Closes #7707 from ericl/string-features-2 and squashes the following commits:

8588625 [Eric Liang] exclude complex types for .
8106ffe [Eric Liang] comments
a9350bb [Eric Liang] s/var/val
9c50d4d [Eric Liang] Merge branch 'string-features' into string-features-2
581afb2 [Eric Liang] Merge branch 'master' into string-features
08ae539 [Eric Liang] Merge branch 'string-features' into string-features-2
f99131a [Eric Liang] comments
cecec43 [Eric Liang] Merge branch 'string-features' into string-features-2
0bf3c26 [Eric Liang] update docs
4592df2 [Eric Liang] intercept supports
7412a2e [Eric Liang] Fri Jul 24 14:56:51 PDT 2015
3cf848e [Eric Liang] fix the parser
0556c2b [Eric Liang] Merge branch 'string-features' into string-features-2
c302a2c [Eric Liang] fix tests
9d1ac82 [Eric Liang] Merge remote-tracking branch 'upstream/master' into string-features
e713da3 [Eric Liang] comments
cd231a9 [Eric Liang] Wed Jul 22 17:18:44 PDT 2015
4d79193 [Eric Liang] revert to seq + distinct
169a085 [Eric Liang] tweak functional test
a230a47 [Eric Liang] Merge branch 'master' into string-features
72bd6f3 [Eric Liang] fix merge
d841cec [Eric Liang] Merge branch 'master' into string-features
5b2c4a2 [Eric Liang] Mon Jul 20 18:45:33 PDT 2015
b01c7c5 [Eric Liang] add test
8a637db [Eric Liang] encoder wip
a1d03f4 [Eric Liang] refactor into estimator
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala | 55
1 files changed, 51 insertions, 4 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
index c4b45aee06..436e66bab0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -18,12 +18,17 @@
package org.apache.spark.ml.feature
import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
class RFormulaParserSuite extends SparkFunSuite {
- private def checkParse(formula: String, label: String, terms: Seq[String]) {
- val parsed = RFormulaParser.parse(formula)
- assert(parsed.label == label)
- assert(parsed.terms == terms)
+ private def checkParse(
+ formula: String,
+ label: String,
+ terms: Seq[String],
+ schema: StructType = null) {
+ val resolved = RFormulaParser.parse(formula).resolve(schema)
+ assert(resolved.label == label)
+ assert(resolved.terms == terms)
}
test("parse simple formulas") {
@@ -32,4 +37,46 @@ class RFormulaParserSuite extends SparkFunSuite {
checkParse("y ~ ._foo ", "y", Seq("._foo"))
checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
}
+
+ test("parse dot") {
+ val schema = (new StructType)
+ .add("a", "int", true)
+ .add("b", "long", false)
+ .add("c", "string", true)
+ checkParse("a ~ .", "a", Seq("b", "c"), schema)
+ }
+
+ test("parse deletion") {
+ val schema = (new StructType)
+ .add("a", "int", true)
+ .add("b", "long", false)
+ .add("c", "string", true)
+ checkParse("a ~ c - b", "a", Seq("c"), schema)
+ }
+
+ test("parse additions and deletions in order") {
+ val schema = (new StructType)
+ .add("a", "int", true)
+ .add("b", "long", false)
+ .add("c", "string", true)
+ checkParse("a ~ . - b + . - c", "a", Seq("b"), schema)
+ }
+
+ test("dot ignores complex column types") {
+ val schema = (new StructType)
+ .add("a", "int", true)
+ .add("b", "tinyint", false)
+ .add("c", "map<string, string>", true)
+ checkParse("a ~ .", "a", Seq("b"), schema)
+ }
+
+ test("parse intercept") {
+ assert(RFormulaParser.parse("a ~ b").hasIntercept)
+ assert(RFormulaParser.parse("a ~ b + 1").hasIntercept)
+ assert(RFormulaParser.parse("a ~ b - 0").hasIntercept)
+ assert(RFormulaParser.parse("a ~ b - 1 + 1").hasIntercept)
+ assert(!RFormulaParser.parse("a ~ b + 0").hasIntercept)
+ assert(!RFormulaParser.parse("a ~ b - 1").hasIntercept)
+ assert(!RFormulaParser.parse("a ~ b + 1 - 1").hasIntercept)
+ }
}