aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Burak Köse <burakks41@gmail.com>, 2016-05-06 13:58:12 -0700
committer: Xiangrui Meng <meng@databricks.com>, 2016-05-06 13:58:12 -0700
commit: e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6 (patch)
tree: ea5578c886cae4b083ca2ad6bdd9ca2008fa2bf9
parent: 5c8fad7b9bfd6677111a8e27e2574f82b04ec479 (diff)
download: spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.gz
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.tar.bz2
spark-e20cd9f4ce977739ce80a2c39f8ebae5e53f72f6.zip
[SPARK-14050][ML] Add multiple languages support and additional methods for Stop Words Remover
## What changes were proposed in this pull request? This PR continues the work from #11871 with the following changes: * load English stopwords as default * convert stopwords to list in Python * update some tests and doc ## How was this patch tested? Unit tests. Closes #11871 cc: burakkose srowen Author: Burak Köse <burakks41@gmail.com> Author: Xiangrui Meng <meng@databricks.com> Author: Burak KOSE <burakks41@gmail.com> Closes #12843 from mengxr/SPARK-14050.
-rw-r--r-- licenses/LICENSE-postgresql.txt 24
-rwxr-xr-x mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README 12
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt 94
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt 101
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt 153
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt 235
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt 155
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt 231
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt 199
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt 279
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt 176
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt 203
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt 151
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt 313
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt 114
-rw-r--r-- mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt 53
-rwxr-xr-x [-rw-r--r--] mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala 106
-rwxr-xr-x [-rw-r--r--] mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala 57
-rwxr-xr-x [-rw-r--r--] python/pyspark/ml/feature.py 38
-rwxr-xr-x [-rw-r--r--] python/pyspark/ml/tests.py 7
20 files changed, 2614 insertions, 87 deletions
diff --git a/licenses/LICENSE-postgresql.txt b/licenses/LICENSE-postgresql.txt
new file mode 100644
index 0000000000..515bf9af4d
--- /dev/null
+++ b/licenses/LICENSE-postgresql.txt
@@ -0,0 +1,24 @@
+PostgreSQL Database Management System
+(formerly known as Postgres, then as Postgres95)
+
+Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this
+paragraph and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
+DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README
new file mode 100755
index 0000000000..ec08a50807
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README
@@ -0,0 +1,12 @@
+Stopwords Corpus
+
+This corpus contains lists of stop words for several languages. These
+are high-frequency grammatical words which are usually ignored in text
+retrieval applications.
+
+They were obtained from:
+http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/
+
+The English list has been augmented; see:
+https://github.com/nltk/nltk_data/issues/22
+
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt
new file mode 100644
index 0000000000..ea9e2c4abe
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt
@@ -0,0 +1,94 @@
+og
+i
+jeg
+det
+at
+en
+den
+til
+er
+som
+på
+de
+med
+han
+af
+for
+ikke
+der
+var
+mig
+sig
+men
+et
+har
+om
+vi
+min
+havde
+ham
+hun
+nu
+over
+da
+fra
+du
+ud
+sin
+dem
+os
+op
+man
+hans
+hvor
+eller
+hvad
+skal
+selv
+her
+alle
+vil
+blev
+kunne
+ind
+når
+være
+dog
+noget
+ville
+jo
+deres
+efter
+ned
+skulle
+denne
+end
+dette
+mit
+også
+under
+have
+dig
+anden
+hende
+mine
+alt
+meget
+sit
+sine
+vor
+mod
+disse
+hvis
+din
+nogle
+hos
+blive
+mange
+ad
+bliver
+hendes
+været
+thi
+jer
+sådan \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt
new file mode 100644
index 0000000000..023cc2c939
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt
@@ -0,0 +1,101 @@
+de
+en
+van
+ik
+te
+dat
+die
+in
+een
+hij
+het
+niet
+zijn
+is
+was
+op
+aan
+met
+als
+voor
+had
+er
+maar
+om
+hem
+dan
+zou
+of
+wat
+mijn
+men
+dit
+zo
+door
+over
+ze
+zich
+bij
+ook
+tot
+je
+mij
+uit
+der
+daar
+haar
+naar
+heb
+hoe
+heeft
+hebben
+deze
+u
+want
+nog
+zal
+me
+zij
+nu
+ge
+geen
+omdat
+iets
+worden
+toch
+al
+waren
+veel
+meer
+doen
+toen
+moet
+ben
+zonder
+kan
+hun
+dus
+alles
+onder
+ja
+eens
+hier
+wie
+werd
+altijd
+doch
+wordt
+wezen
+kunnen
+ons
+zelf
+tegen
+na
+reeds
+wil
+kon
+niets
+uw
+iemand
+geweest
+andere \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt
new file mode 100644
index 0000000000..d075cc0bab
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt
@@ -0,0 +1,153 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+d
+ll
+m
+o
+re
+ve
+y
+ain
+aren
+couldn
+didn
+doesn
+hadn
+hasn
+haven
+isn
+ma
+mightn
+mustn
+needn
+shan
+shouldn
+wasn
+weren
+won
+wouldn
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt
new file mode 100644
index 0000000000..5b0eb10777
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt
@@ -0,0 +1,235 @@
+olla
+olen
+olet
+on
+olemme
+olette
+ovat
+ole
+oli
+olisi
+olisit
+olisin
+olisimme
+olisitte
+olisivat
+olit
+olin
+olimme
+olitte
+olivat
+ollut
+olleet
+en
+et
+ei
+emme
+ette
+eivät
+minä
+minun
+minut
+minua
+minussa
+minusta
+minuun
+minulla
+minulta
+minulle
+sinä
+sinun
+sinut
+sinua
+sinussa
+sinusta
+sinuun
+sinulla
+sinulta
+sinulle
+hän
+hänen
+hänet
+häntä
+hänessä
+hänestä
+häneen
+hänellä
+häneltä
+hänelle
+me
+meidän
+meidät
+meitä
+meissä
+meistä
+meihin
+meillä
+meiltä
+meille
+te
+teidän
+teidät
+teitä
+teissä
+teistä
+teihin
+teillä
+teiltä
+teille
+he
+heidän
+heidät
+heitä
+heissä
+heistä
+heihin
+heillä
+heiltä
+heille
+tämä
+tämän
+tätä
+tässä
+tästä
+tähän
+tällä
+tältä
+tälle
+tänä
+täksi
+tuo
+tuon
+tuota
+tuossa
+tuosta
+tuohon
+tuolla
+tuolta
+tuolle
+tuona
+tuoksi
+se
+sen
+sitä
+siinä
+siitä
+siihen
+sillä
+siltä
+sille
+sinä
+siksi
+nämä
+näiden
+näitä
+näissä
+näistä
+näihin
+näillä
+näiltä
+näille
+näinä
+näiksi
+nuo
+noiden
+noita
+noissa
+noista
+noihin
+noilla
+noilta
+noille
+noina
+noiksi
+ne
+niiden
+niitä
+niissä
+niistä
+niihin
+niillä
+niiltä
+niille
+niinä
+niiksi
+kuka
+kenen
+kenet
+ketä
+kenessä
+kenestä
+keneen
+kenellä
+keneltä
+kenelle
+kenenä
+keneksi
+ketkä
+keiden
+ketkä
+keitä
+keissä
+keistä
+keihin
+keillä
+keiltä
+keille
+keinä
+keiksi
+mikä
+minkä
+minkä
+mitä
+missä
+mistä
+mihin
+millä
+miltä
+mille
+minä
+miksi
+mitkä
+joka
+jonka
+jota
+jossa
+josta
+johon
+jolla
+jolta
+jolle
+jona
+joksi
+jotka
+joiden
+joita
+joissa
+joista
+joihin
+joilla
+joilta
+joille
+joina
+joiksi
+että
+ja
+jos
+koska
+kuin
+mutta
+niin
+sekä
+sillä
+tai
+vaan
+vai
+vaikka
+kanssa
+mukaan
+noin
+poikki
+yli
+kun
+niin
+nyt
+itse \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt
new file mode 100644
index 0000000000..94b8f8f39a
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt
@@ -0,0 +1,155 @@
+au
+aux
+avec
+ce
+ces
+dans
+de
+des
+du
+elle
+en
+et
+eux
+il
+je
+la
+le
+leur
+lui
+ma
+mais
+me
+même
+mes
+moi
+mon
+ne
+nos
+notre
+nous
+on
+ou
+par
+pas
+pour
+qu
+que
+qui
+sa
+se
+ses
+son
+sur
+ta
+te
+tes
+toi
+ton
+tu
+un
+une
+vos
+votre
+vous
+c
+d
+j
+l
+m
+n
+s
+t
+y
+été
+étée
+étées
+étés
+étant
+étante
+étants
+étantes
+suis
+es
+est
+sommes
+êtes
+sont
+serai
+seras
+sera
+serons
+serez
+seront
+serais
+serait
+serions
+seriez
+seraient
+étais
+était
+étions
+étiez
+étaient
+fus
+fut
+fûmes
+fûtes
+furent
+sois
+soit
+soyons
+soyez
+soient
+fusse
+fusses
+fût
+fussions
+fussiez
+fussent
+ayant
+ayante
+ayantes
+ayants
+eu
+eue
+eues
+eus
+ai
+as
+avons
+avez
+ont
+aurai
+auras
+aura
+aurons
+aurez
+auront
+aurais
+aurait
+aurions
+auriez
+auraient
+avais
+avait
+avions
+aviez
+avaient
+eut
+eûmes
+eûtes
+eurent
+aie
+aies
+ait
+ayons
+ayez
+aient
+eusse
+eusses
+eût
+eussions
+eussiez
+eussent \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt
new file mode 100644
index 0000000000..7e65190f8b
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt
@@ -0,0 +1,231 @@
+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unse
+unsem
+unsen
+unser
+unses
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+weg
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt
new file mode 100644
index 0000000000..8d4543a096
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt
@@ -0,0 +1,199 @@
+a
+ahogy
+ahol
+aki
+akik
+akkor
+alatt
+által
+általában
+amely
+amelyek
+amelyekben
+amelyeket
+amelyet
+amelynek
+ami
+amit
+amolyan
+amíg
+amikor
+át
+abban
+ahhoz
+annak
+arra
+arról
+az
+azok
+azon
+azt
+azzal
+azért
+aztán
+azután
+azonban
+bár
+be
+belül
+benne
+cikk
+cikkek
+cikkeket
+csak
+de
+e
+eddig
+egész
+egy
+egyes
+egyetlen
+egyéb
+egyik
+egyre
+ekkor
+el
+elég
+ellen
+elő
+először
+előtt
+első
+én
+éppen
+ebben
+ehhez
+emilyen
+ennek
+erre
+ez
+ezt
+ezek
+ezen
+ezzel
+ezért
+és
+fel
+felé
+hanem
+hiszen
+hogy
+hogyan
+igen
+így
+illetve
+ill.
+ill
+ilyen
+ilyenkor
+ison
+ismét
+itt
+jó
+jól
+jobban
+kell
+kellett
+keresztül
+keressünk
+ki
+kívül
+között
+közül
+legalább
+lehet
+lehetett
+legyen
+lenne
+lenni
+lesz
+lett
+maga
+magát
+majd
+majd
+már
+más
+másik
+meg
+még
+mellett
+mert
+mely
+melyek
+mi
+mit
+míg
+miért
+milyen
+mikor
+minden
+mindent
+mindenki
+mindig
+mint
+mintha
+mivel
+most
+nagy
+nagyobb
+nagyon
+ne
+néha
+nekem
+neki
+nem
+néhány
+nélkül
+nincs
+olyan
+ott
+össze
+ők
+őket
+pedig
+persze
+rá
+s
+saját
+sem
+semmi
+sok
+sokat
+sokkal
+számára
+szemben
+szerint
+szinte
+talán
+tehát
+teljes
+tovább
+továbbá
+több
+úgy
+ugyanis
+új
+újabb
+újra
+után
+utána
+utolsó
+vagy
+vagyis
+valaki
+valami
+valamint
+való
+vagyok
+van
+vannak
+volt
+voltam
+voltak
+voltunk
+vissza
+vele
+viszont
+volna \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt
new file mode 100644
index 0000000000..783b2e0cbf
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt
@@ -0,0 +1,279 @@
+ad
+al
+allo
+ai
+agli
+all
+agl
+alla
+alle
+con
+col
+coi
+da
+dal
+dallo
+dai
+dagli
+dall
+dagl
+dalla
+dalle
+di
+del
+dello
+dei
+degli
+dell
+degl
+della
+delle
+in
+nel
+nello
+nei
+negli
+nell
+negl
+nella
+nelle
+su
+sul
+sullo
+sui
+sugli
+sull
+sugl
+sulla
+sulle
+per
+tra
+contro
+io
+tu
+lui
+lei
+noi
+voi
+loro
+mio
+mia
+miei
+mie
+tuo
+tua
+tuoi
+tue
+suo
+sua
+suoi
+sue
+nostro
+nostra
+nostri
+nostre
+vostro
+vostra
+vostri
+vostre
+mi
+ti
+ci
+vi
+lo
+la
+li
+le
+gli
+ne
+il
+un
+uno
+una
+ma
+ed
+se
+perché
+anche
+come
+dov
+dove
+che
+chi
+cui
+non
+più
+quale
+quanto
+quanti
+quanta
+quante
+quello
+quelli
+quella
+quelle
+questo
+questi
+questa
+queste
+si
+tutto
+tutti
+a
+c
+e
+i
+l
+o
+ho
+hai
+ha
+abbiamo
+avete
+hanno
+abbia
+abbiate
+abbiano
+avrò
+avrai
+avrà
+avremo
+avrete
+avranno
+avrei
+avresti
+avrebbe
+avremmo
+avreste
+avrebbero
+avevo
+avevi
+aveva
+avevamo
+avevate
+avevano
+ebbi
+avesti
+ebbe
+avemmo
+aveste
+ebbero
+avessi
+avesse
+avessimo
+avessero
+avendo
+avuto
+avuta
+avuti
+avute
+sono
+sei
+siamo
+siete
+sia
+siate
+siano
+sarò
+sarai
+sarà
+saremo
+sarete
+saranno
+sarei
+saresti
+sarebbe
+saremmo
+sareste
+sarebbero
+ero
+eri
+era
+eravamo
+eravate
+erano
+fui
+fosti
+fu
+fummo
+foste
+furono
+fossi
+fosse
+fossimo
+fossero
+essendo
+faccio
+fai
+facciamo
+fanno
+faccia
+facciate
+facciano
+farò
+farai
+farà
+faremo
+farete
+faranno
+farei
+faresti
+farebbe
+faremmo
+fareste
+farebbero
+facevo
+facevi
+faceva
+facevamo
+facevate
+facevano
+feci
+facesti
+fece
+facemmo
+faceste
+fecero
+facessi
+facesse
+facessimo
+facessero
+facendo
+sto
+stai
+sta
+stiamo
+stanno
+stia
+stiate
+stiano
+starò
+starai
+starà
+staremo
+starete
+staranno
+starei
+staresti
+starebbe
+staremmo
+stareste
+starebbero
+stavo
+stavi
+stava
+stavamo
+stavate
+stavano
+stetti
+stesti
+stette
+stemmo
+steste
+stettero
+stessi
+stesse
+stessimo
+stessero
+stando \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt
new file mode 100644
index 0000000000..cb91702c5e
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt
@@ -0,0 +1,176 @@
+og
+i
+jeg
+det
+at
+en
+et
+den
+til
+er
+som
+på
+de
+med
+han
+av
+ikke
+ikkje
+der
+så
+var
+meg
+seg
+men
+ett
+har
+om
+vi
+min
+mitt
+ha
+hadde
+hun
+nå
+over
+da
+ved
+fra
+du
+ut
+sin
+dem
+oss
+opp
+man
+kan
+hans
+hvor
+eller
+hva
+skal
+selv
+sjøl
+her
+alle
+vil
+bli
+ble
+blei
+blitt
+kunne
+inn
+når
+være
+kom
+noen
+noe
+ville
+dere
+som
+deres
+kun
+ja
+etter
+ned
+skulle
+denne
+for
+deg
+si
+sine
+sitt
+mot
+meget
+hvorfor
+dette
+disse
+uten
+hvordan
+ingen
+din
+ditt
+blir
+samme
+hvilken
+hvilke
+sånn
+inni
+mellom
+vår
+hver
+hvem
+vors
+hvis
+både
+bare
+enn
+fordi
+før
+mange
+også
+slik
+vært
+være
+båe
+begge
+siden
+dykk
+dykkar
+dei
+deira
+deires
+deim
+di
+då
+eg
+ein
+eit
+eitt
+elles
+honom
+hjå
+ho
+hoe
+henne
+hennar
+hennes
+hoss
+hossen
+ikkje
+ingi
+inkje
+korleis
+korso
+kva
+kvar
+kvarhelst
+kven
+kvi
+kvifor
+me
+medan
+mi
+mine
+mykje
+no
+nokon
+noka
+nokor
+noko
+nokre
+si
+sia
+sidan
+so
+somt
+somme
+um
+upp
+vere
+vore
+verte
+vort
+varte
+vart \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt
new file mode 100644
index 0000000000..98b4fdcdf7
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt
@@ -0,0 +1,203 @@
+de
+a
+o
+que
+e
+do
+da
+em
+um
+para
+com
+não
+uma
+os
+no
+se
+na
+por
+mais
+as
+dos
+como
+mas
+ao
+ele
+das
+seu
+sua
+ou
+quando
+muito
+nos
+já
+eu
+também
+só
+pelo
+pela
+até
+isso
+ela
+entre
+depois
+sem
+mesmo
+aos
+seus
+quem
+nas
+me
+esse
+eles
+você
+essa
+num
+nem
+suas
+meu
+às
+minha
+numa
+pelos
+elas
+qual
+nós
+lhe
+deles
+essas
+esses
+pelas
+este
+dele
+tu
+te
+vocês
+vos
+lhes
+meus
+minhas
+teu
+tua
+teus
+tuas
+nosso
+nossa
+nossos
+nossas
+dela
+delas
+esta
+estes
+estas
+aquele
+aquela
+aqueles
+aquelas
+isto
+aquilo
+estou
+está
+estamos
+estão
+estive
+esteve
+estivemos
+estiveram
+estava
+estávamos
+estavam
+estivera
+estivéramos
+esteja
+estejamos
+estejam
+estivesse
+estivéssemos
+estivessem
+estiver
+estivermos
+estiverem
+hei
+há
+havemos
+hão
+houve
+houvemos
+houveram
+houvera
+houvéramos
+haja
+hajamos
+hajam
+houvesse
+houvéssemos
+houvessem
+houver
+houvermos
+houverem
+houverei
+houverá
+houveremos
+houverão
+houveria
+houveríamos
+houveriam
+sou
+somos
+são
+era
+éramos
+eram
+fui
+foi
+fomos
+foram
+fora
+fôramos
+seja
+sejamos
+sejam
+fosse
+fôssemos
+fossem
+for
+formos
+forem
+serei
+será
+seremos
+serão
+seria
+seríamos
+seriam
+tenho
+tem
+temos
+têm
+tinha
+tínhamos
+tinham
+tive
+teve
+tivemos
+tiveram
+tivera
+tivéramos
+tenha
+tenhamos
+tenham
+tivesse
+tivéssemos
+tivessem
+tiver
+tivermos
+tiverem
+terei
+terá
+teremos
+terão
+teria
+teríamos
+teriam \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt
new file mode 100644
index 0000000000..8a800b7449
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt
@@ -0,0 +1,151 @@
+во
+не
+что
+он
+на
+со
+как
+то
+все
+она
+так
+его
+но
+да
+ты
+же
+вы
+за
+бы
+по
+только
+ее
+мне
+было
+вот
+от
+меня
+еще
+нет
+из
+ему
+теперь
+когда
+даже
+ну
+вдруг
+ли
+если
+уже
+или
+ни
+быть
+был
+него
+до
+вас
+нибудь
+опять
+уж
+вам
+ведь
+там
+потом
+себя
+ничего
+ей
+может
+они
+тут
+где
+есть
+надо
+ней
+для
+мы
+тебя
+их
+чем
+была
+сам
+чтоб
+без
+будто
+чего
+раз
+тоже
+себе
+под
+будет
+тогда
+кто
+этот
+того
+потому
+этого
+какой
+совсем
+ним
+здесь
+этом
+один
+почти
+мой
+тем
+чтобы
+нее
+сейчас
+были
+куда
+зачем
+всех
+никогда
+можно
+при
+наконец
+два
+об
+другой
+хоть
+после
+над
+больше
+тот
+через
+эти
+нас
+про
+всего
+них
+какая
+много
+разве
+три
+эту
+моя
+впрочем
+хорошо
+свою
+этой
+перед
+иногда
+лучше
+чуть
+том
+нельзя
+такой
+им
+более
+всегда
+конечно
+всю
+между \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt
new file mode 100644
index 0000000000..94f493a8d1
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt
@@ -0,0 +1,313 @@
+de
+la
+que
+el
+en
+y
+a
+los
+del
+se
+las
+por
+un
+para
+con
+no
+una
+su
+al
+lo
+como
+más
+pero
+sus
+le
+ya
+o
+este
+sí
+porque
+esta
+entre
+cuando
+muy
+sin
+sobre
+también
+me
+hasta
+hay
+donde
+quien
+desde
+todo
+nos
+durante
+todos
+uno
+les
+ni
+contra
+otros
+ese
+eso
+ante
+ellos
+e
+esto
+mí
+antes
+algunos
+qué
+unos
+yo
+otro
+otras
+otra
+él
+tanto
+esa
+estos
+mucho
+quienes
+nada
+muchos
+cual
+poco
+ella
+estar
+estas
+algunas
+algo
+nosotros
+mi
+mis
+tú
+te
+ti
+tu
+tus
+ellas
+nosotras
+vosotros
+vosotras
+os
+mío
+mía
+míos
+mías
+tuyo
+tuya
+tuyos
+tuyas
+suyo
+suya
+suyos
+suyas
+nuestro
+nuestra
+nuestros
+nuestras
+vuestro
+vuestra
+vuestros
+vuestras
+esos
+esas
+estoy
+estás
+está
+estamos
+estáis
+están
+esté
+estés
+estemos
+estéis
+estén
+estaré
+estarás
+estará
+estaremos
+estaréis
+estarán
+estaría
+estarías
+estaríamos
+estaríais
+estarían
+estaba
+estabas
+estábamos
+estabais
+estaban
+estuve
+estuviste
+estuvo
+estuvimos
+estuvisteis
+estuvieron
+estuviera
+estuvieras
+estuviéramos
+estuvierais
+estuvieran
+estuviese
+estuvieses
+estuviésemos
+estuvieseis
+estuviesen
+estando
+estado
+estada
+estados
+estadas
+estad
+he
+has
+ha
+hemos
+habéis
+han
+haya
+hayas
+hayamos
+hayáis
+hayan
+habré
+habrás
+habrá
+habremos
+habréis
+habrán
+habría
+habrías
+habríamos
+habríais
+habrían
+había
+habías
+habíamos
+habíais
+habían
+hube
+hubiste
+hubo
+hubimos
+hubisteis
+hubieron
+hubiera
+hubieras
+hubiéramos
+hubierais
+hubieran
+hubiese
+hubieses
+hubiésemos
+hubieseis
+hubiesen
+habiendo
+habido
+habida
+habidos
+habidas
+soy
+eres
+es
+somos
+sois
+son
+sea
+seas
+seamos
+seáis
+sean
+seré
+serás
+será
+seremos
+seréis
+serán
+sería
+serías
+seríamos
+seríais
+serían
+era
+eras
+éramos
+erais
+eran
+fui
+fuiste
+fue
+fuimos
+fuisteis
+fueron
+fuera
+fueras
+fuéramos
+fuerais
+fueran
+fuese
+fueses
+fuésemos
+fueseis
+fuesen
+sintiendo
+sentido
+sentida
+sentidos
+sentidas
+siente
+sentid
+tengo
+tienes
+tiene
+tenemos
+tenéis
+tienen
+tenga
+tengas
+tengamos
+tengáis
+tengan
+tendré
+tendrás
+tendrá
+tendremos
+tendréis
+tendrán
+tendría
+tendrías
+tendríamos
+tendríais
+tendrían
+tenía
+tenías
+teníamos
+teníais
+tenían
+tuve
+tuviste
+tuvo
+tuvimos
+tuvisteis
+tuvieron
+tuviera
+tuvieras
+tuviéramos
+tuvierais
+tuvieran
+tuviese
+tuvieses
+tuviésemos
+tuvieseis
+tuviesen
+teniendo
+tenido
+tenida
+tenidos
+tenidas
+tened \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt
new file mode 100644
index 0000000000..9fae31c185
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt
@@ -0,0 +1,114 @@
+och
+det
+att
+i
+en
+jag
+hon
+som
+han
+på
+den
+med
+var
+sig
+för
+så
+till
+är
+men
+ett
+om
+hade
+de
+av
+icke
+mig
+du
+henne
+då
+sin
+nu
+har
+inte
+hans
+honom
+skulle
+hennes
+där
+min
+man
+ej
+vid
+kunde
+något
+från
+ut
+när
+efter
+upp
+vi
+dem
+vara
+vad
+över
+än
+dig
+kan
+sina
+här
+ha
+mot
+alla
+under
+någon
+eller
+allt
+mycket
+sedan
+ju
+denna
+själv
+detta
+åt
+utan
+varit
+hur
+ingen
+mitt
+ni
+bli
+blev
+oss
+din
+dessa
+några
+deras
+blir
+mina
+samma
+vilken
+er
+sådan
+vår
+blivit
+dess
+inom
+mellan
+sådant
+varför
+varje
+vilka
+ditt
+vem
+vilket
+sitta
+sådana
+vart
+dina
+vars
+vårt
+våra
+ert
+era
+vilkas \ No newline at end of file
diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt
new file mode 100644
index 0000000000..4e9708d9d2
--- /dev/null
+++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt
@@ -0,0 +1,53 @@
+acaba
+ama
+aslında
+az
+bazı
+belki
+biri
+birkaç
+birşey
+biz
+bu
+çok
+çünkü
+da
+daha
+de
+defa
+diye
+eğer
+en
+gibi
+hem
+hep
+hepsi
+her
+hiç
+için
+ile
+ise
+kez
+ki
+kim
+mı
+mu
+mü
+nasıl
+ne
+neden
+nerde
+nerede
+nereye
+niçin
+niye
+o
+sanki
+şey
+siz
+şu
+tüm
+ve
+veya
+ya
+yani \ No newline at end of file
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index b96bc48566..11864cb8f4 100644..100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -27,58 +27,6 @@ import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
/**
- * stop words list
- */
-private[spark] object StopWords {
-
- /**
- * Use the same default stopwords list as scikit-learn.
- * The original list can be found from "Glasgow Information Retrieval Group"
- * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]]
- */
- val English = Array( "a", "about", "above", "across", "after", "afterwards", "again",
- "against", "all", "almost", "alone", "along", "already", "also", "although", "always",
- "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
- "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
- "around", "as", "at", "back", "be", "became", "because", "become",
- "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
- "below", "beside", "besides", "between", "beyond", "bill", "both",
- "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
- "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
- "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
- "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
- "everything", "everywhere", "except", "few", "fifteen", "fify", "fill",
- "find", "fire", "first", "five", "for", "former", "formerly", "forty",
- "found", "four", "from", "front", "full", "further", "get", "give", "go",
- "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
- "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
- "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
- "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
- "latterly", "least", "less", "ltd", "made", "many", "may", "me",
- "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
- "move", "much", "must", "my", "myself", "name", "namely", "neither",
- "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
- "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
- "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
- "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
- "please", "put", "rather", "re", "same", "see", "seem", "seemed",
- "seeming", "seems", "serious", "several", "she", "should", "show", "side",
- "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
- "something", "sometime", "sometimes", "somewhere", "still", "such",
- "system", "take", "ten", "than", "that", "the", "their", "them",
- "themselves", "then", "thence", "there", "thereafter", "thereby",
- "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
- "third", "this", "those", "though", "three", "through", "throughout",
- "thru", "thus", "to", "together", "too", "top", "toward", "towards",
- "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
- "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
- "whence", "whenever", "where", "whereafter", "whereas", "whereby",
- "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
- "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
- "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves")
-}
-
-/**
* :: Experimental ::
* A feature transformer that filters out stop words from input.
* Note: null values from input array are preserved unless adding null to stopWords explicitly.
@@ -97,11 +45,13 @@ class StopWordsRemover(override val uid: String)
def setOutputCol(value: String): this.type = set(outputCol, value)
/**
- * the stop words set to be filtered out
- * Default: [[StopWords.English]]
+ * The words to be filtered out.
+ * Default: English stop words
+ * @see [[StopWordsRemover.loadDefaultStopWords()]]
* @group param
*/
- val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words")
+ val stopWords: StringArrayParam =
+ new StringArrayParam(this, "stopWords", "the words to be filtered out")
/** @group setParam */
def setStopWords(value: Array[String]): this.type = set(stopWords, value)
@@ -110,12 +60,12 @@ class StopWordsRemover(override val uid: String)
def getStopWords: Array[String] = $(stopWords)
/**
- * whether to do a case sensitive comparison over the stop words
+ * Whether to do a case sensitive comparison over the stop words.
* Default: false
* @group param
*/
val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive",
- "whether to do case-sensitive comparison during filtering")
+ "whether to do a case-sensitive comparison over the stop words")
/** @group setParam */
def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value)
@@ -123,24 +73,24 @@ class StopWordsRemover(override val uid: String)
/** @group getParam */
def getCaseSensitive: Boolean = $(caseSensitive)
- setDefault(stopWords -> StopWords.English, caseSensitive -> false)
+ setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), caseSensitive -> false)
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
val outputSchema = transformSchema(dataset.schema)
val t = if ($(caseSensitive)) {
- val stopWordsSet = $(stopWords).toSet
- udf { terms: Seq[String] =>
- terms.filter(s => !stopWordsSet.contains(s))
- }
- } else {
- val toLower = (s: String) => if (s != null) s.toLowerCase else s
- val lowerStopWords = $(stopWords).map(toLower(_)).toSet
- udf { terms: Seq[String] =>
- terms.filter(s => !lowerStopWords.contains(toLower(s)))
- }
+ val stopWordsSet = $(stopWords).toSet
+ udf { terms: Seq[String] =>
+ terms.filter(s => !stopWordsSet.contains(s))
+ }
+ } else {
+ // TODO: support user locale (SPARK-15064)
+ val toLower = (s: String) => if (s != null) s.toLowerCase else s
+ val lowerStopWords = $(stopWords).map(toLower(_)).toSet
+ udf { terms: Seq[String] =>
+ terms.filter(s => !lowerStopWords.contains(toLower(s)))
+ }
}
-
val metadata = outputSchema($(outputCol)).metadata
dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}
@@ -158,6 +108,24 @@ class StopWordsRemover(override val uid: String)
@Since("1.6.0")
object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] {
+ private[feature]
+ val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german",
+ "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish")
+
@Since("1.6.0")
override def load(path: String): StopWordsRemover = super.load(path)
+
+ /**
+ * Loads the default stop words for the given language.
+ * Supported languages: danish, dutch, english, finnish, french, german, hungarian,
+ * italian, norwegian, portuguese, russian, spanish, swedish, turkish
+ * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]]
+ */
+ @Since("2.0.0")
+ def loadDefaultStopWords(language: String): Array[String] = {
+ require(supportedLanguages.contains(language),
+ s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.")
+ val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt")
+ scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray
+ }
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
index 3505befdf8..8e7e000fbc 100644..100755
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala
@@ -44,6 +44,24 @@ class StopWordsRemoverSuite
.setOutputCol("filtered")
val dataSet = sqlContext.createDataFrame(Seq(
(Seq("test", "test"), Seq("test", "test")),
+ (Seq("a", "b", "c", "d"), Seq("b", "c")),
+ (Seq("a", "the", "an"), Seq()),
+ (Seq("A", "The", "AN"), Seq()),
+ (Seq(null), Seq(null)),
+ (Seq(), Seq())
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with particular stop words list") {
+ val stopWords = Array("test", "a", "an", "the")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords)
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("test", "test"), Seq()),
(Seq("a", "b", "c", "d"), Seq("b", "c", "d")),
(Seq("a", "the", "an"), Seq()),
(Seq("A", "The", "AN"), Seq()),
@@ -67,13 +85,48 @@ class StopWordsRemoverSuite
testStopWordsRemover(remover, dataSet)
}
- test("StopWordsRemover with additional words") {
- val stopWords = StopWords.English ++ Array("python", "scala")
+ test("default stop words of supported languages are not empty") {
+ StopWordsRemover.supportedLanguages.foreach { lang =>
+ assert(StopWordsRemover.loadDefaultStopWords(lang).nonEmpty,
+ s"The default stop words of $lang cannot be empty.")
+ }
+ }
+
+ test("StopWordsRemover with language selection") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("turkish")
val remover = new StopWordsRemover()
.setInputCol("raw")
.setOutputCol("filtered")
.setStopWords(stopWords)
val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("acaba", "ama", "biri"), Seq()),
+ (Seq("hep", "her", "scala"), Seq("scala"))
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with ignored words") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet -- Set("a")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords.toArray)
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (Seq("python", "scala", "a"), Seq("python", "scala", "a")),
+ (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift"))
+ )).toDF("raw", "expected")
+
+ testStopWordsRemover(remover, dataSet)
+ }
+
+ test("StopWordsRemover with additional words") {
+ val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala")
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ .setStopWords(stopWords.toArray)
+ val dataSet = sqlContext.createDataFrame(Seq(
(Seq("python", "scala", "a"), Seq()),
(Seq("Python", "Scala", "swift"), Seq("swift"))
)).toDF("raw", "expected")
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index f21e3062ef..d2989fa4cd 100644..100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1738,28 +1738,23 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl
"comparison over the stop words", typeConverter=TypeConverters.toBoolean)
@keyword_only
- def __init__(self, inputCol=None, outputCol=None, stopWords=None,
- caseSensitive=False):
+ def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False):
"""
- __init__(self, inputCol=None, outputCol=None, stopWords=None,\
- caseSensitive=false)
+ __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)
"""
super(StopWordsRemover, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover",
self.uid)
- stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
- defaultStopWords = list(stopWordsObj.English())
- self._setDefault(stopWords=defaultStopWords, caseSensitive=False)
+ self._setDefault(stopWords=StopWordsRemover.loadDefaultStopWords("english"),
+ caseSensitive=False)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("1.6.0")
- def setParams(self, inputCol=None, outputCol=None, stopWords=None,
- caseSensitive=False):
+ def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False):
"""
- setParams(self, inputCol="input", outputCol="output", stopWords=None,\
- caseSensitive=false)
+ setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)
Sets params for this StopWordRemover.
"""
kwargs = self.setParams._input_kwargs
@@ -1768,31 +1763,42 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl
@since("1.6.0")
def setStopWords(self, value):
"""
- Specify the stopwords to be filtered.
+ Sets the value of :py:attr:`stopWords`.
"""
return self._set(stopWords=value)
@since("1.6.0")
def getStopWords(self):
"""
- Get the stopwords.
+ Gets the value of :py:attr:`stopWords` or its default value.
"""
return self.getOrDefault(self.stopWords)
@since("1.6.0")
def setCaseSensitive(self, value):
"""
- Set whether to do a case sensitive comparison over the stop words
+ Sets the value of :py:attr:`caseSensitive`.
"""
return self._set(caseSensitive=value)
@since("1.6.0")
def getCaseSensitive(self):
"""
- Get whether to do a case sensitive comparison over the stop words.
+ Gets the value of :py:attr:`caseSensitive` or its default value.
"""
return self.getOrDefault(self.caseSensitive)
+ @staticmethod
+ @since("2.0.0")
+ def loadDefaultStopWords(language):
+ """
+ Loads the default stop words for the given language.
+ Supported languages: danish, dutch, english, finnish, french, german, hungarian,
+ italian, norwegian, portuguese, russian, spanish, swedish, turkish
+ """
+ stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover
+ return list(stopWordsObj.loadDefaultStopWords(language))
+
@inherit_doc
@ignore_unicode_prefix
@@ -1843,7 +1849,7 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java
@since("1.3.0")
def setParams(self, inputCol=None, outputCol=None):
"""
- setParams(self, inputCol="input", outputCol="output")
+ setParams(self, inputCol=None, outputCol=None)
Sets params for this Tokenizer.
"""
kwargs = self.setParams._input_kwargs
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 78ec96af8a..ad1631fb5b 100644..100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -417,6 +417,13 @@ class FeatureTests(PySparkTestCase):
self.assertEqual(stopWordRemover.getStopWords(), stopwords)
transformedDF = stopWordRemover.transform(dataset)
self.assertEqual(transformedDF.head().output, ["a"])
+ # with language selection
+ stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
+ dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])])
+ stopWordRemover.setStopWords(stopwords)
+ self.assertEqual(stopWordRemover.getStopWords(), stopwords)
+ transformedDF = stopWordRemover.transform(dataset)
+ self.assertEqual(transformedDF.head().output, [])
def test_count_vectorizer_with_binary(self):
sqlContext = SQLContext(self.sc)