author    Yuhao Yang <hhbyyh@gmail.com>    2015-08-12 09:35:32 -0700
committer Xiangrui Meng <meng@databricks.com>    2015-08-12 09:35:32 -0700
commit    66d87c1d76bea2b81993156ac1fa7dad6c312ebf (patch)
tree      7392766d367ccd7101d9971909531ab01c129c28 /docs
parent    be5d1912076c2ffd21ec88611e53d3b3c59b7ecc (diff)
[SPARK-7583] [MLLIB] User guide update for RegexTokenizer
jira: https://issues.apache.org/jira/browse/SPARK-7583

User guide update for RegexTokenizer.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #7828 from hhbyyh/regexTokenizerDoc.
Diffstat (limited to 'docs')
-rw-r--r--    docs/ml-features.md | 41
1 file changed, 30 insertions(+), 11 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index fa0ad1f00a..cec2cbe673 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -217,21 +217,32 @@ for feature in result.select("result").take(3):
[Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words.
-Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer).
+[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more
+ advanced tokenization based on regular expression (regex) matching.
+ By default, the parameter "pattern" (regex, default: "\\s+") is used as the delimiter to split the input text.
+ Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes
+ "tokens" rather than splitting gaps; all matching occurrences then form the tokenization result.
<div class="codetabs">
<div data-lang="scala" markdown="1">
{% highlight scala %}
-import org.apache.spark.ml.feature.Tokenizer
+import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
val sentenceDataFrame = sqlContext.createDataFrame(Seq(
(0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
)).toDF("label", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-wordsDataFrame.select("words", "label").take(3).foreach(println)
+val regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
{% endhighlight %}
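The gaps=false mode described above appears only in a comment; a minimal sketch of it, reusing `sentenceDataFrame` from the snippet above (illustrative only, not part of this patch):

{% highlight scala %}
// Match tokens directly instead of splitting on gaps: with gaps=false,
// the pattern "\\w+" matches the words themselves, so the comma-separated
// sentence "Logistic,regression,models,are,neat" still yields five tokens.
val matchingTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setGaps(false)
  .setPattern("\\w+")

matchingTokenizer.transform(sentenceDataFrame)
  .select("words", "label")
  .take(3)
  .foreach(println)
{% endhighlight %}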
</div>
@@ -240,6 +251,7 @@ wordsDataFrame.select("words", "label").take(3).foreach(println)
import com.google.common.collect.Lists;
import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
@@ -252,8 +264,8 @@ import org.apache.spark.sql.types.StructType;
JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
RowFactory.create(0, "Hi I heard about Spark"),
- RowFactory.create(0, "I wish Java could use case classes"),
- RowFactory.create(1, "Logistic regression models are neat")
+ RowFactory.create(1, "I wish Java could use case classes"),
+ RowFactory.create(2, "Logistic,regression,models,are,neat")
));
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -267,22 +279,29 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
for (String word : words) System.out.print(word + " ");
System.out.println();
}
+
+RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
{% endhighlight %}
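The Java tab defines the RegexTokenizer but, unlike the Scala tab, never applies it. A possible continuation, assuming the sentence DataFrame built from `jrdd` and `schema` is named `sentenceDataFrame` (that name is elided from this hunk), could look like the sketch below; it is not part of the patch:

{% highlight java %}
// Apply the RegexTokenizer defined above and print the resulting tokens,
// mirroring the Tokenizer loop earlier in this example.
DataFrame regexTokenized = regexTokenizer.transform(sentenceDataFrame);
for (Row r : regexTokenized.select("words", "label").take(3)) {
  java.util.List<String> words = r.getList(0);
  for (String word : words) System.out.print(word + " ");
  System.out.println();
}
{% endhighlight %}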
</div>
<div data-lang="python" markdown="1">
{% highlight python %}
-from pyspark.ml.feature import Tokenizer
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
sentenceDataFrame = sqlContext.createDataFrame([
(0, "Hi I heard about Spark"),
- (0, "I wish Java could use case classes"),
- (1, "Logistic regression models are neat")
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsDataFrame = tokenizer.transform(sentenceDataFrame)
for words_label in wordsDataFrame.select("words", "label").take(3):
print(words_label)
+regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+# alternatively, pattern="\\w+", gaps=False
{% endhighlight %}
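As in the other tabs, the Python snippet stops after constructing the RegexTokenizer; a minimal sketch of applying it, reusing `sentenceDataFrame` from above (illustrative, not part of the patch):

{% highlight python %}
# Apply the RegexTokenizer and print the tokenized rows, mirroring the
# Tokenizer loop above.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
for words_label in regexTokenized.select("words", "label").take(3):
    print(words_label)
{% endhighlight %}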
</div>
</div>