diff options
Diffstat (limited to 'mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README')
-rwxr-xr-x | mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README | 12 |
1 files changed, 12 insertions, 0 deletions
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README new file mode 100755 index 0000000000..ec08a50807 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README @@ -0,0 +1,12 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. + +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + +The English list has been augmented as described in: +https://github.com/nltk/nltk_data/issues/22 + |