author     Yuhao Yang <hhbyyh@gmail.com>        2015-08-12 09:35:32 -0700
committer  Xiangrui Meng <meng@databricks.com>  2015-08-12 09:35:41 -0700
commit     2d86faddd87b6e61565cbdf18dadaf4aeb2b223e (patch)
tree       7e5a90f88c30123388dc44bef32a47539ccbcd59
parent     bc4ac65d4c0fed93c70582fc74574c5b70aa842d (diff)
download   spark-2d86faddd87b6e61565cbdf18dadaf4aeb2b223e.tar.gz
           spark-2d86faddd87b6e61565cbdf18dadaf4aeb2b223e.tar.bz2
           spark-2d86faddd87b6e61565cbdf18dadaf4aeb2b223e.zip
[SPARK-7583] [MLLIB] User guide update for RegexTokenizer
jira: https://issues.apache.org/jira/browse/SPARK-7583
User guide update for RegexTokenizer
Author: Yuhao Yang <hhbyyh@gmail.com>
Closes #7828 from hhbyyh/regexTokenizerDoc.
(cherry picked from commit 66d87c1d76bea2b81993156ac1fa7dad6c312ebf)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
-rw-r--r--  docs/ml-features.md | 41
1 file changed, 30 insertions(+), 11 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index fa0ad1f00a..cec2cbe673 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -217,21 +217,32 @@ for feature in result.select("result").take(3):
 
 [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words).  A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality.  The example below shows how to split sentences into sequences of words.
 
-Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer).
+[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more
+ advanced tokenization based on regular expression (regex) matching.
+ By default, the parameter "pattern" (regex, default: \\s+) is used as delimiters to split the input text.
+ Alternatively, users can set parameter "gaps" to false indicating the regex "pattern" denotes
+ "tokens" rather than splitting gaps, and find all matching occurrences as the tokenization result.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 {% highlight scala %}
-import org.apache.spark.ml.feature.Tokenizer
+import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
 
 val sentenceDataFrame = sqlContext.createDataFrame(Seq(
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
 )).toDF("label", "sentence")
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-wordsDataFrame.select("words", "label").take(3).foreach(println)
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
 {% endhighlight %}
 </div>
 
@@ -240,6 +251,7 @@ wordsDataFrame.select("words", "label").take(3).foreach(println)
 import com.google.common.collect.Lists;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
 import org.apache.spark.ml.feature.Tokenizer;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.sql.DataFrame;
@@ -252,8 +264,8 @@ import org.apache.spark.sql.types.StructType;
 
 JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
   RowFactory.create(0, "Hi I heard about Spark"),
-  RowFactory.create(0, "I wish Java could use case classes"),
-  RowFactory.create(1, "Logistic regression models are neat")
+  RowFactory.create(1, "I wish Java could use case classes"),
+  RowFactory.create(2, "Logistic,regression,models,are,neat")
 ));
 StructType schema = new StructType(new StructField[]{
   new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -267,22 +279,29 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
   for (String word : words) System.out.print(word + " ");
   System.out.println();
 }
+
+RegexTokenizer regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
 {% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
 {% highlight python %}
-from pyspark.ml.feature import Tokenizer
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
 
 sentenceDataFrame = sqlContext.createDataFrame([
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
 ], ["label", "sentence"])
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
 for words_label in wordsDataFrame.select("words", "label").take(3):
   print(words_label)
+regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+# alternatively, pattern="\\w+", gaps(False)
 {% endhighlight %}
 </div>
 </div>
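For reference, here is a minimal Scala sketch (not part of the patch) of the two RegexTokenizer configurations the updated guide text describes. It assumes a `sqlContext` is already in scope, as in the guide's own examples, and reuses the comma-separated sample sentence from the patch; with the default gaps=true the pattern is used as a split delimiter, while gaps=false treats the pattern as a description of the tokens themselves.

{% highlight scala %}
import org.apache.spark.ml.feature.RegexTokenizer

// Sample input, reusing the sentence added by the patch.
val sentenceDataFrame = sqlContext.createDataFrame(Seq(
  (2, "Logistic,regression,models,are,neat")
)).toDF("label", "sentence")

// gaps = true (default): the pattern "\\W" is the delimiter the text is split on.
val splitOnNonWord = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W")

// gaps = false: the pattern "\\w+" matches the tokens themselves; every match is kept.
val matchWordChars = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)

// Both configurations should yield the same five word tokens for this input
// (depending on the Spark version, RegexTokenizer may also lowercase them).
splitOnNonWord.transform(sentenceDataFrame).select("words", "label").take(1).foreach(println)
matchWordChars.transform(sentenceDataFrame).select("words", "label").take(1).foreach(println)
{% endhighlight %}

In the Python API the same switch is passed as the `gaps=False` keyword argument to the `RegexTokenizer` constructor, which is what the patch's `# alternatively, pattern="\\w+", gaps(False)` comment refers to.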