aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2015-05-21 17:59:03 -0700
committerXiangrui Meng <meng@databricks.com>2015-05-21 17:59:03 -0700
commitf5db4b416c922db7a8f1b0c098b4f08647106231 (patch)
tree97ddc7467b77b169324bba8b0ec93c5d76005ca9 /mllib
parent17791a58159b3e4619d0367f54a4c5332342658b (diff)
downloadspark-f5db4b416c922db7a8f1b0c098b4f08647106231.tar.gz
spark-f5db4b416c922db7a8f1b0c098b4f08647106231.tar.bz2
spark-f5db4b416c922db7a8f1b0c098b4f08647106231.zip
[SPARK-7794] [MLLIB] update RegexTokenizer default settings
The previous default is `{gaps: false, pattern: "\\p{L}+|[^\\p{L}\\s]+"}`. The default pattern is hard to understand. This PR changes the default to `{gaps: true, pattern: "\\s+"}`. jkbradley Author: Xiangrui Meng <meng@databricks.com> Closes #6330 from mengxr/SPARK-7794 and squashes the following commits: 5ee7cde [Xiangrui Meng] update RegexTokenizer default settings
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala18
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala32
2 files changed, 25 insertions, 25 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 3f7f4f96fc..31f3a1aa4c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -26,6 +26,8 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
/**
* :: AlphaComponent ::
* A tokenizer that converts the input string to lowercase and then splits it by white spaces.
+ *
+ * @see [[RegexTokenizer]]
*/
@AlphaComponent
class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] {
@@ -45,9 +47,9 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S
/**
* :: AlphaComponent ::
- * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
- * or using it to split the text (set matching to false). Optional parameters also allow filtering
- * tokens using a minimal length.
+ * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split
+ * the text (default) or repeatedly matching the regex (if `gaps` is false).
+ * Optional parameters also allow filtering tokens using a minimal length.
* It returns an array of strings that can be empty.
*/
@AlphaComponent
@@ -71,8 +73,8 @@ class RegexTokenizer(override val uid: String)
def getMinTokenLength: Int = $(minTokenLength)
/**
- * Indicates whether regex splits on gaps (true) or matching tokens (false).
- * Default: false
+ * Indicates whether regex splits on gaps (true) or matches tokens (false).
+ * Default: true
* @group param
*/
val gaps: BooleanParam = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens")
@@ -84,8 +86,8 @@ class RegexTokenizer(override val uid: String)
def getGaps: Boolean = $(gaps)
/**
- * Regex pattern used by tokenizer.
- * Default: `"\\p{L}+|[^\\p{L}\\s]+"`
+ * Regex pattern used to match delimiters if [[gaps]] is true or tokens if [[gaps]] is false.
+ * Default: `"\\s+"`
* @group param
*/
val pattern: Param[String] = new Param(this, "pattern", "regex pattern used for tokenizing")
@@ -96,7 +98,7 @@ class RegexTokenizer(override val uid: String)
/** @group getParam */
def getPattern: String = $(pattern)
- setDefault(minTokenLength -> 1, gaps -> false, pattern -> "\\p{L}+|[^\\p{L}\\s]+")
+ setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")
override protected def createTransformFunc: String => Seq[String] = { str =>
val re = $(pattern).r
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index a46d08d651..eabda089d0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -29,35 +29,34 @@ case class TokenizerTestData(rawText: String, wantedTokens: Array[String])
class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
import org.apache.spark.ml.feature.RegexTokenizerSuite._
-
+
test("RegexTokenizer") {
- val tokenizer = new RegexTokenizer()
+ val tokenizer0 = new RegexTokenizer()
+ .setGaps(false)
+ .setPattern("\\w+|\\p{Punct}")
.setInputCol("rawText")
.setOutputCol("tokens")
-
val dataset0 = sqlContext.createDataFrame(Seq(
TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
))
- testRegexTokenizer(tokenizer, dataset0)
+ testRegexTokenizer(tokenizer0, dataset0)
val dataset1 = sqlContext.createDataFrame(Seq(
TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
TokenizerTestData("Te,st. punct", Array("punct"))
))
+ tokenizer0.setMinTokenLength(3)
+ testRegexTokenizer(tokenizer0, dataset1)
- tokenizer.setMinTokenLength(3)
- testRegexTokenizer(tokenizer, dataset1)
-
- tokenizer
- .setPattern("\\s")
- .setGaps(true)
- .setMinTokenLength(0)
+ val tokenizer2 = new RegexTokenizer()
+ .setInputCol("rawText")
+ .setOutputCol("tokens")
val dataset2 = sqlContext.createDataFrame(Seq(
TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
- TokenizerTestData("Te,st. punct", Array("Te,st.", "", "punct"))
+ TokenizerTestData("Te,st. punct", Array("Te,st.", "punct"))
))
- testRegexTokenizer(tokenizer, dataset2)
+ testRegexTokenizer(tokenizer2, dataset2)
}
}
@@ -67,9 +66,8 @@ object RegexTokenizerSuite extends FunSuite {
t.transform(dataset)
.select("tokens", "wantedTokens")
.collect()
- .foreach {
- case Row(tokens, wantedTokens) =>
- assert(tokens === wantedTokens)
- }
+ .foreach { case Row(tokens, wantedTokens) =>
+ assert(tokens === wantedTokens)
+ }
}
}