path: root/mllib/src
diff options
Diffstat (limited to 'mllib/src')
17 files changed, 2561 insertions, 71 deletions
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README
new file mode 100755
index 0000000000..ec08a50807
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README
@@ -0,0 +1,12 @@
+Stopwords Corpus
+This corpus contains lists of stop words for several languages. These
+are high-frequency grammatical words which are usually ignored in text
+retrieval applications.
+They were obtained from:
+http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
+The English list has been augmented
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt
new file mode 100644
index 0000000000..ea9e2c4abe
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt
@@ -0,0 +1,94 @@
+sådan \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt
new file mode 100644
index 0000000000..023cc2c939
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt
@@ -0,0 +1,101 @@
+andere \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt
new file mode 100644
index 0000000000..d075cc0bab
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt
@@ -0,0 +1,153 @@
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt
new file mode 100644
index 0000000000..5b0eb10777
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt
@@ -0,0 +1,235 @@
+itse \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt
new file mode 100644
index 0000000000..94b8f8f39a
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt
@@ -0,0 +1,155 @@
+eussent \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt
new file mode 100644
index 0000000000..7e65190f8b
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt
@@ -0,0 +1,231 @@
+zwischen \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt
new file mode 100644
index 0000000000..8d4543a096
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt
@@ -0,0 +1,199 @@
+volna \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt
new file mode 100644
index 0000000000..783b2e0cbf
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt
@@ -0,0 +1,279 @@
+stando \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt
new file mode 100644
index 0000000000..cb91702c5e
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt
@@ -0,0 +1,176 @@
+vart \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt
new file mode 100644
index 0000000000..98b4fdcdf7
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt
@@ -0,0 +1,203 @@
+teriam \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt
new file mode 100644
index 0000000000..8a800b7449
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt
@@ -0,0 +1,151 @@
+между \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt
new file mode 100644
index 0000000000..94f493a8d1
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt
@@ -0,0 +1,313 @@
+tened \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt
new file mode 100644
index 0000000000..9fae31c185
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt
@@ -0,0 +1,114 @@
+vilkas \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt
new file mode 100644
index 0000000000..4e9708d9d2
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt
@@ -0,0 +1,53 @@
+yani \ No newline at end of file
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index b96bc48566..11864cb8f4 100644..100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -27,58 +27,6 @@ import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
- * stop words list
- */
-private[spark] object StopWords {
- /**
- * Use the same default stopwords list as scikit-learn.
- * The original list can be found from "Glasgow Information Retrieval Group"
- * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]]
- */
- val English = Array( "a", "about", "above", "across", "after", "afterwards", "again",
- "against", "all", "almost", "alone", "along", "already", "also", "although", "always",
- "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
- "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
- "around", "as", "at", "back", "be", "became", "because", "become",
- "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
- "below", "beside", "besides", "between", "beyond", "bill", "both",
- "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
- "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
- "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
- "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
- "everything", "everywhere", "except", "few", "fifteen", "fify", "fill",
- "find", "fire", "first", "five", "for", "former", "formerly", "forty",
- "found", "four", "from", "front", "full", "further", "get", "give", "go",
- "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
- "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
- "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
- "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
- "latterly", "least", "less", "ltd", "made", "many", "may", "me",
- "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
- "move", "much", "must", "my", "myself", "name", "namely", "neither",
- "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
- "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
- "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
- "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
- "please", "put", "rather", "re", "same", "see", "seem", "seemed",
- "seeming", "seems", "serious", "several", "she", "should", "show", "side",
- "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
- "something", "sometime", "sometimes", "somewhere", "still", "such",
- "system", "take", "ten", "than", "that", "the", "their", "them",
- "themselves", "then", "thence", "there", "thereafter", "thereby",
- "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
- "third", "this", "those", "though", "three", "through", "throughout",
- "thru", "thus", "to", "together", "too", "top", "toward", "towards",
- "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
- "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
- "whence", "whenever", "where", "whereafter", "whereas", "whereby",
- "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
- "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
- "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves")
* :: Experimental ::
* A feature transformer that filters out stop words from input.
* Note: null values from input array are preserved unless adding null to stopWords explicitly.
@@ -97,11 +45,13 @@ class StopWordsRemover(override val uid: String)
def setOutputCol(value: String): this.type = set(outputCol, value)
- * the stop words set to be filtered out
- * Default: [[StopWords.English]]
+ * The words to be filtered out.
+ * Default: English stop words
+ * @see [[StopWordsRemover.loadDefaultStopWords()]]
* @group param
- val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words")
+ val stopWords: StringArrayParam =
+ new StringArrayParam(this, "stopWords", "the words to be filtered out")
/** @group setParam */
def setStopWords(value: Array[String]): this.type = set(stopWords, value)
@@ -110,12 +60,12 @@ class StopWordsRemover(override val uid: String)
def getStopWords: Array[String] = $(stopWords)
- * whether to do a case sensitive comparison over the stop words
+ * Whether to do a case sensitive comparison over the stop words.
* Default: false
* @group param
val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive",
- "whether to do case-sensitive comparison during filtering")
+ "whether to do a case-sensitive comparison over the stop words")
/** @group setParam */
def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value)
@@ -123,24 +73,24 @@ class StopWordsRemover(override val uid: String)
/** @group getParam */
def getCaseSensitive: Boolean = $(caseSensitive)
- setDefault(stopWords -> StopWords.English, caseSensitive -> false)
+ setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), caseSensitive -> false)
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema)
val t = if ($(caseSensitive)) {
- val stopWordsSet = $(stopWords).toSet
- udf { terms: Seq[String] =>
- terms.filter(s => !stopWordsSet.contains(s))
- }
- } else {
- val toLower = (s: String) => if (s != null) s.toLowerCase else s
- val lowerStopWords = $(stopWords).map(toLower(_)).toSet
- udf { terms: Seq[String] =>
- terms.filter(s => !lowerStopWords.contains(toLower(s)))
- }
+ val stopWordsSet = $(stopWords).toSet
+ udf { terms: Seq[String] =>
+ terms.filter(s => !stopWordsSet.contains(s))
+ }
+ } else {
+ // TODO: support user locale (SPARK-15064)
+ val toLower = (s: String) => if (s != null) s.toLowerCase else s
+ val lowerStopWords = $(stopWords).map(toLower(_)).toSet
+ udf { terms: Seq[String] =>
+ terms.filter(s => !lowerStopWords.contains(toLower(s)))
+ }
val metadata = outputSchema($(outputCol)).metadata
dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
@@ -158,6 +108,24 @@ class StopWordsRemover(override val uid: String)
object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] {
+  /** Names of the languages for which a default stop words list can be loaded. */
+  private[feature] val supportedLanguages = Set(
+    "danish", "dutch", "english", "finnish", "french", "german", "hungarian",
+    "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish")
override def load(path: String): StopWordsRemover = super.load(path)
+  /**
+   * Loads the default stop words for the given language.
+   * Supported languages: danish, dutch, english, finnish, french, german, hungarian,
+   * italian, norwegian, portuguese, russian, spanish, swedish, turkish
+   *
+   * The words are read as UTF-8 from the classpath resource
+   * `/org/apache/spark/ml/feature/stopwords/[language].txt`, one array element per line.
+   *
+   * @param language name of a supported language, spelled exactly as listed above
+   * @return the lines of the language's stop words resource, in file order
+   * @throws IllegalArgumentException if `language` is not in the supported language list
+   * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]]
+   */
+  @Since("2.0.0")
+  def loadDefaultStopWords(language: String): Array[String] = {
+    require(supportedLanguages.contains(language),
+      s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.")
+    val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt")
+    try {
+      // getLines() is lazy; toArray reads the whole resource before the stream is closed.
+      scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray
+    } finally {
+      // Release the resource handle even if reading fails; it was previously never closed.
+      is.close()
+    }
+  }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 3505befdf8..8e7e000fbc 100644..100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -44,6 +44,24 @@ class StopWordsRemoverSuite
val dataSet = sqlContext.createDataFrame(Seq(
(Seq("test", "test"), Seq("test", "test")),
+ (Seq("a", "b", "c", "d"), Seq("b", "c")),
+ (Seq("a", "the", "an"), Seq()),
+ (Seq("A", "The", "AN"), Seq()),
+ (Seq(null), Seq(null)),
+ (Seq(), Seq())
+ )).toDF("raw", "expected")
+ testStopWordsRemover(remover, dataSet)
+ }
+ test("StopWordsRemover with particular stop words list") {
+ val stopWords = Array("test", "a", "an", "the")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords)
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("test", "test"), Seq()),
(Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
(Seq("a", "the", "an"), Seq()),
(Seq("A", "The", "AN"), Seq()),
@@ -67,13 +85,48 @@ class StopWordsRemoverSuite
testStopWordsRemover(remover, dataSet)
- test("StopWordsRemover with additional words") {
- val stopWords = StopWords.English ++ Array("python", "scala")
+  test("default stop words of supported languages are not empty") {
+    // Every language advertised as supported must load at least one stop word.
+    for (lang <- StopWordsRemover.supportedLanguages) {
+      val defaults = StopWordsRemover.loadDefaultStopWords(lang)
+      assert(defaults.nonEmpty, s"The default stop words of $lang cannot be empty.")
+    }
+  }
+ test("StopWordsRemover with language selection") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("turkish")
val remover = new StopWordsRemover()
val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("acaba", "ama", "biri"), Seq()),
+ (Seq("hep", "her", "scala"), Seq("scala"))
+ )).toDF("raw", "expected")
+ testStopWordsRemover(remover, dataSet)
+ }
+  test("StopWordsRemover with ignored words") {
+    // Take "a" out of the English defaults so that it is expected to survive filtering.
+    val keptStopWords = (StopWordsRemover.loadDefaultStopWords("english").toSet - "a").toArray
+    val remover = new StopWordsRemover()
+      .setInputCol("raw")
+      .setOutputCol("filtered")
+      .setStopWords(keptStopWords)
+    // Each row pairs the raw tokens with the tokens expected after filtering.
+    val rows = Seq(
+      (Seq("python", "scala", "a"), Seq("python", "scala", "a")),
+      (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")))
+    val dataSet = sqlContext.createDataFrame(rows).toDF("raw", "expected")
+    testStopWordsRemover(remover, dataSet)
+  }
+ test("StopWordsRemover with additional words") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords.toArray)
+ val dataSet = sqlContext.createDataFrame(Seq(
(Seq("python", "scala", "a"), Seq()),
(Seq("Python", "Scala", "swift"), Seq("swift"))
)).toDF("raw", "expected")