aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
Diffstat (limited to 'mllib')
-rw-r--r--mllib/data/als/test.data16
-rwxr-xr-xmllib/data/lr-data/random.data1000
-rw-r--r--mllib/data/ridge-data/lpsa.data67
-rw-r--r--mllib/src/main/scala/spark/mllib/clustering/KMeans.scala334
-rw-r--r--mllib/src/main/scala/spark/mllib/clustering/KMeansModel.scala44
-rw-r--r--mllib/src/main/scala/spark/mllib/clustering/LocalKMeans.scala105
-rw-r--r--mllib/src/main/scala/spark/mllib/optimization/Gradient.scala50
-rw-r--r--mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala79
-rw-r--r--mllib/src/main/scala/spark/mllib/optimization/Updater.scala44
-rw-r--r--mllib/src/main/scala/spark/mllib/recommendation/ALS.scala436
-rw-r--r--mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala40
-rw-r--r--mllib/src/main/scala/spark/mllib/regression/LogisticRegression.scala175
-rw-r--r--mllib/src/main/scala/spark/mllib/regression/LogisticRegressionGenerator.scala58
-rw-r--r--mllib/src/main/scala/spark/mllib/regression/Regression.scala38
-rw-r--r--mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala211
-rw-r--r--mllib/src/main/scala/spark/mllib/regression/RidgeRegressionGenerator.scala72
-rw-r--r--mllib/src/main/scala/spark/mllib/util/MLUtils.scala112
-rw-r--r--mllib/src/test/resources/log4j.properties28
-rw-r--r--mllib/src/test/scala/spark/mllib/clustering/KMeansSuite.scala170
-rw-r--r--mllib/src/test/scala/spark/mllib/recommendation/ALSSuite.scala97
-rw-r--r--mllib/src/test/scala/spark/mllib/regression/LogisticRegressionSuite.scala74
-rw-r--r--mllib/src/test/scala/spark/mllib/regression/RidgeRegressionSuite.scala64
22 files changed, 3314 insertions, 0 deletions
diff --git a/mllib/data/als/test.data b/mllib/data/als/test.data
new file mode 100644
index 0000000000..e476cc23e0
--- /dev/null
+++ b/mllib/data/als/test.data
@@ -0,0 +1,16 @@
+1,1,5.0
+1,2,1.0
+1,3,5.0
+1,4,1.0
+2,1,5.0
+2,2,1.0
+2,3,5.0
+2,4,1.0
+3,1,1.0
+3,2,5.0
+3,3,1.0
+3,4,5.0
+4,1,1.0
+4,2,5.0
+4,3,1.0
+4,4,5.0
diff --git a/mllib/data/lr-data/random.data b/mllib/data/lr-data/random.data
new file mode 100755
index 0000000000..29bcb8acba
--- /dev/null
+++ b/mllib/data/lr-data/random.data
@@ -0,0 +1,1000 @@
+0.0,-0.19138793197590276 0.7834675900121327
+1.0,3.712420417753061 3.55967640829891
+0.0,-0.3173743619974614 0.9034702789806682
+1.0,4.759494447180777 3.407011867344781
+0.0,-0.7078607074437426 -0.7866705652344417
+1.0,2.6708084832010215 2.5322909406378016
+0.0,-0.07553885038446313 -0.1297104483563081
+1.0,2.759487072285262 2.474689814713741
+0.0,-2.2199161547238107 0.7543109438660762
+1.0,1.922617509832946 1.9412373902594937
+0.0,0.8140942462004225 1.883920822277784
+1.0,1.7649295902120172 3.8195077526061363
+0.0,-1.1173052428096684 -1.468964723960145
+1.0,1.8733449544967458 2.913026590975709
+0.0,-0.11212965215910947 1.068087981775071
+1.0,2.3368459971730227 5.453870208593922
+0.0,-1.2802488543364463 -0.47218504171867676
+1.0,4.1917343620336895 3.5602286778418355
+0.0,0.5995976502137177 -0.797374550890321
+1.0,3.721592294428238 4.824418090974808
+0.0,-0.0721649164244053 -1.3952880192542576
+1.0,3.609764030146346 3.4730043476891277
+0.0,-1.5078269860498976 -2.6460421495665987
+1.0,1.8510254911824193 1.6748364225650059
+0.0,1.021485727769095 -0.14476425336866738
+1.0,4.10105000223134 2.3772502437548493
+0.0,2.6132710211418675 -1.061646527586342
+1.0,2.6444875273854653 4.043302750329545
+0.0,1.115723715938777 0.38401588153403887
+1.0,2.045759949164019 3.156447533448806
+0.0,-1.0543022640565405 -0.6820337845705753
+1.0,3.535337069948117 3.8121122972294965
+0.0,0.9427529503486505 -0.25123516319259886
+1.0,3.9611643301316795 3.3144121016644443
+0.0,-0.15013188927817916 0.8178862482229886
+1.0,3.200504584029051 2.3088398886136057
+0.0,0.819731993393585 -0.47386644109886344
+1.0,3.283317566020217 3.4828146842654513
+0.0,-2.3283941193793303 -0.6148925379529
+1.0,3.901670215294089 3.6356776610143324
+0.0,-0.28635769830042973 0.049586437072917544
+1.0,3.1114746381043927 3.6314805300338775
+0.0,-1.3085536069757229 0.11172767926766304
+1.0,3.3676979357140744 4.689661419564771
+0.0,-1.5820787210442733 1.3226576351191428
+1.0,2.5957586701668207 3.0648240201825923
+0.0,-2.116823743560968 0.272822309954307
+1.0,3.31672509500716 3.870172182480263
+0.0,0.09751166932653511 0.6469052579904877
+1.0,2.0609623373451305 3.9496181906908694
+0.0,0.5238217321419351 -1.2424816480725946
+1.0,3.5731384504449717 5.293293512805712
+0.0,-0.8507917425723299 -1.2243124053200718
+1.0,3.3060954421001867 3.1337045819604565
+0.0,1.5066706426420082 0.04176666807070882
+1.0,4.197316426430547 2.327643377792433
+0.0,-1.8068158696573955 -1.6380836149377855
+1.0,3.568239793850545 3.561688791420822
+0.0,0.4705756905309871 1.1991675114038487
+1.0,4.85003762884306 4.253420553408024
+0.0,0.7595792932847568 0.014062431397674205
+1.0,1.6984862661221896 1.7746925013882613
+0.0,0.1132294255888917 -0.09228036942051128
+1.0,3.766092539171029 2.765647342841482
+0.0,1.053401788561791 -1.0588667339849278
+1.0,2.780021685872393 3.239478188786074
+0.0,0.4042022490052266 1.0982210323828034
+1.0,2.4939569547402063 2.4615506964861273
+0.0,0.4469359967563411 0.3880418183993791
+1.0,2.7943749030887486 3.742182807141721
+0.0,-0.4418685162293727 0.802180923066725
+1.0,3.711213212127241 4.620177703831104
+0.0,0.10737314976605918 -1.5716142960765325
+1.0,4.0522289913808365 3.77562942835957
+0.0,1.4798827061781141 1.1638601205648005
+1.0,3.6758023575825547 3.115500589955362
+0.0,-1.803338141681238 -0.639996207387159
+1.0,2.044667029270621 3.04922768663927
+0.0,-0.06067427095346295 1.394611410740688
+1.0,4.626495834477846 2.995800202291488
+0.0,-0.2770274350630315 0.4521526506693692
+1.0,3.130857841268635 3.76858860814448
+0.0,2.163400739017478 -1.303601716798734
+1.0,2.9131896969824367 3.4288919990054167
+0.0,-0.7145108501670207 1.4189762494365543
+1.0,3.535768896041034 1.4894011726406373
+0.0,1.605614523747256 0.29974289519139824
+1.0,2.413678734728178 2.1826316767457183
+0.0,-0.8821932593373774 0.26432786248412726
+1.0,2.0878695933047116 3.5277388966365177
+0.0,-1.107001191509183 0.38421647065699477
+1.0,2.6462094774496454 2.273786785429519
+0.0,1.0712046043765102 -1.1889735666835115
+1.0,3.7458483094910666 1.3868020542832566
+0.0,-0.8403883736429167 -0.7163969561320671
+1.0,3.3359151000342195 3.2382001552279576
+0.0,0.13309387098922537 0.938761191821517
+1.0,2.083439571838502 3.2204948086228944
+0.0,1.3030219848568272 0.5976630914634896
+1.0,2.7602376200551317 2.200505791897739
+0.0,-0.9458633178207942 0.0490955863627428
+1.0,3.7998466026531883 1.9291683955712686
+0.0,-1.327236501803235 0.06915643957270164
+1.0,3.4740573335685925 2.1080735512507114
+0.0,0.8627688253416859 -1.961802291046532
+1.0,3.5108780392869776 3.9854745964798326
+0.0,-0.69537574439301 0.2436269580373554
+1.0,2.920286302932126 4.704192389485899
+0.0,-2.031190954684878 -0.7843052045579578
+1.0,1.6768848711259499 1.345658047606076
+0.0,0.9234894202027507 -0.38179572928866495
+1.0,3.1710339307651334 4.129874876536583
+0.0,-2.5086697007630376 -0.2638692986795807
+1.0,2.079400422215581 3.124756711992435
+0.0,-0.1388012859869782 0.3698243463601514
+1.0,2.665728164475424 4.574860576068532
+0.0,0.11967116650891912 -0.8792117975750646
+1.0,3.042630437105455 2.7245525508413677
+0.0,0.6078023848042808 -0.7977233104047035
+1.0,3.3340709038589638 4.962729210819017
+0.0,0.6373101353982795 1.1335021278327686
+1.0,3.3821397455119446 4.349379573895378
+0.0,-0.9140176931412027 -0.03428220013900756
+1.0,4.579963977595727 3.8322809335521484
+0.0,-0.43958506434874983 0.21259366700539037
+1.0,2.644701808902675 3.945416465403505
+0.0,-1.119921743746522 -0.2089105317801997
+1.0,2.5480553203091922 3.123344220515146
+0.0,0.8723990414181355 1.11150972420879
+1.0,4.479600967837827 2.8645066949820057
+0.0,-0.003869320481891422 0.24756134775982133
+1.0,3.237294368758498 4.642548547098718
+0.0,0.34643329685515545 0.029869480691029456
+1.0,2.6324740490008893 1.2577448307260846
+0.0,-0.4416403319035849 -1.4597062027342758
+1.0,1.764049052224297 3.649850384544675
+0.0,0.6779287737716254 -1.9489876700506967
+1.0,1.4286669812409405 2.4906452014102416
+0.0,-1.2271599940693638 0.9869686407012563
+1.0,3.6244117441765993 2.36879554315985
+0.0,-0.11422653411940642 0.4741905017884626
+1.0,3.6192153991840694 2.149436181779614
+0.0,0.45425900443207484 -1.357987041493406
+1.0,4.312295702128074 3.7596991900930252
+0.0,-0.35153502234686884 -0.6297451691082592
+1.0,3.4901363450669476 2.0630236379093243
+0.0,-1.5343533005821828 -0.23745688647461852
+1.0,4.775056734905926 5.291243824646301
+0.0,-1.032123659747431 0.8458711875294105
+1.0,2.3091889606097844 3.3688150059111215
+0.0,0.7854236849909306 0.6742463927844289
+1.0,3.284779531346899 2.855746734955609
+0.0,0.380579394855332 -1.2378905330462027
+1.0,2.540193014555953 3.245568950444961
+0.0,-0.5491810448400926 -2.3179482776107894
+1.0,3.481785462949587 1.8870182253717969
+0.0,-0.06833732101790825 2.178923334945784
+1.0,1.1663083809702222 1.8919272314310458
+0.0,-0.7801536433937879 -1.4185984368350903
+1.0,1.457713814592066 3.0323739348144048
+0.0,-0.16377716798970973 0.09678021896691058
+1.0,2.2294515799173094 1.6179126855486068
+0.0,-0.5845552895984718 -0.8095679531228397
+1.0,2.024328902209618 2.4660315284543888
+0.0,0.2037503424802764 1.5767438723426828
+1.0,3.5058983262252643 3.292836693091364
+0.0,-1.4004772080893082 0.6150928060180622
+1.0,4.610936499146778 3.3674445809820313
+0.0,-0.7325641160695897 -3.0469742419403225
+1.0,2.6778956983269926 4.049681967443553
+0.0,-0.3375932473421461 -0.32976087151423067
+1.0,3.975838378562512 1.2032482992228626
+0.0,-1.6622711226380826 -0.6954676646542216
+1.0,3.1601568512397256 2.7472491112914357
+0.0,0.6739969973916968 1.3608866192945286
+1.0,3.097978499063888 3.88429576456391
+0.0,-0.16445244300279913 0.631410854999902
+1.0,4.244875698991619 3.0464568222900477
+0.0,0.1749522197766453 -0.3295077792829936
+1.0,4.158913950688044 1.1836177376726964
+0.0,-1.8286320279969996 -0.6355826362111864
+1.0,2.4795264391445326 0.8073937061906746
+0.0,-0.5095499320702017 -0.8451757050184052
+1.0,3.6489546081475206 2.7405880916534957
+0.0,-0.11733097334574003 0.020300758125140466
+1.0,1.9034123919197892 4.036941742254072
+0.0,-0.4678304671259669 -0.7653895561277071
+1.0,2.555027220737054 4.205906511993216
+0.0,0.1952150967011765 1.2402178923240337
+1.0,3.532371144429582 2.395018092924601
+0.0,1.4682834110821084 2.2292327929025078
+1.0,2.1160331256749663 3.7157102308564824
+0.0,1.3973790173654674 -1.1902799121683607
+1.0,3.4775573554170616 3.0459058509488557
+0.0,-2.215337088722839 0.7693588032777773
+1.0,2.3298220860458976 1.5924630285528396
+0.0,1.260641664088144 1.5474089692944746
+1.0,4.460878990061944 2.595950219349794
+0.0,-1.8214944389802914 -1.9733205363211535
+1.0,4.41874870213851 2.4975116019313264
+0.0,1.2037921250123007 -0.7057578432831773
+1.0,3.042628088030598 3.7366256492570136
+0.0,-0.02609770715133313 -0.01975791007372346
+1.0,1.123824442324706 3.5115607224884466
+0.0,0.3466005704292144 -1.206858960323042
+1.0,3.044152779557358 2.4308738719304266
+0.0,-0.8292396838183249 -0.5768591341562801
+1.0,2.9898679252543325 3.3291086316901484
+0.0,0.6033357093153775 0.18738779274832332
+1.0,3.2777482224094916 2.2676548172839714
+0.0,-0.7104360487845565 -1.0365712508175688
+1.0,2.617802272534323 1.887796671556582
+0.0,-0.21008998836798706 -2.4424443035468957
+1.0,3.9387085143031317 2.368798316318223
+0.0,-0.65027380204969 0.4757828709083824
+1.0,1.6786020855223545 1.62019388696364
+0.0,0.40325101156361803 0.26629562725726075
+1.0,2.4614637796912167 2.778406744842399
+0.0,-0.4327374795655596 0.5643009301153851
+1.0,2.6419358755663103 2.1911675067034206
+0.0,-0.06058610052148417 0.6118154934715632
+1.0,4.134485645832481 4.214482766162727
+0.0,-2.091472947105952 -0.21279450874188077
+1.0,3.7664041746453503 0.5848083052756543
+0.0,0.20187441248519114 0.7310035835212488
+1.0,3.6821251396696817 1.2016937526237272
+0.0,0.16248871053987612 -0.8547163523143474
+1.0,3.1725037691095834 3.051265058839004
+0.0,-1.7466975308858639 -0.048497170816597705
+1.0,4.296665913992498 4.432036327276331
+0.0,-0.49371042139965376 -1.3162216335880739
+1.0,3.0767376272412292 2.4082404056282467
+0.0,0.6517145281009619 -0.15229289422910688
+1.0,3.8556129079007406 4.932746403550176
+0.0,2.467072616559744 -0.6570760874457315
+1.0,3.8722558954619446 2.398547361219584
+0.0,-0.996362973160808 -0.24663573264285635
+1.0,2.058960472055059 0.09020868936476445
+0.0,1.1921444033047794 -1.2205820383864918
+1.0,3.499255855340612 4.26015377680707
+0.0,0.46495431359796363 -0.3535071804767937
+1.0,3.2772715993311534 1.8496849599545144
+0.0,0.9200766227075026 1.0153595739730128
+1.0,3.7395665378166516 4.161859093428991
+0.0,-1.3445731221950805 0.3711182438638966
+1.0,1.974184816991473 2.3758202020218637
+0.0,0.25747673028745044 1.4898729695115611
+1.0,3.643667737073963 2.5171980898063024
+0.0,-0.7491175934837044 1.807998586131331
+1.0,3.024294668483263 2.745713910567566
+0.0,-2.9902104324990075 0.48847563269083094
+1.0,2.693457241550706 4.067192099378729
+0.0,1.0010822910854564 1.065617155304199
+1.0,2.6231328305267576 3.2530925652040796
+0.0,-1.569524799794976 0.10080365850268516
+1.0,5.543177898986999 3.149276748958176
+0.0,-0.2697035609845456 -0.3834981890675749
+1.0,5.5737716796876935 3.134627621089238
+0.0,0.16848836970122472 1.7680681560270155
+1.0,2.984578320659214 3.8081853301923743
+0.0,2.00864307305994 -1.1769936806590435
+1.0,2.4301644281026538 1.5357007015355957
+0.0,-1.251515087462618 -1.0023388301407077
+1.0,2.7783106123714036 3.4753675099443138
+0.0,1.2067779830446301 -1.1138369735803868
+1.0,2.660559526103853 0.9246419639107195
+0.0,-0.2120078291751072 0.553871125085326
+1.0,3.2961674182984613 4.1840551114889655
+0.0,-1.7407002661640898 -0.13494920714243758
+1.0,2.61652747199719 2.606431158365525
+0.0,0.1810536358726569 -0.7041543708042312
+1.0,0.6618977487425206 4.43976232230529
+0.0,-1.1056190552516114 -0.26273698119076755
+1.0,3.245745718364984 0.9585399121419127
+0.0,0.451245033031027 0.3966692171364385
+1.0,0.7000962854359294 2.5787278270774685
+0.0,-0.20657738352563298 -0.3054434424581368
+1.0,2.194893094322135 1.2265276851138993
+0.0,1.6478689673866447 -1.2217538409516264
+1.0,2.6520153534620268 4.253943157694819
+0.0,-1.091459682813003 -1.5933476790183565
+1.0,2.381978388803204 2.5725801073346375
+0.0,-1.7089448316753346 -0.40058783295112843
+1.0,4.692976595302646 2.293610804758882
+0.0,-0.8154594160076379 0.9100123432125261
+1.0,1.8893957859271135 2.365552941116367
+0.0,1.4750445045587657 -0.5730495722105764
+1.0,4.627946484342315 4.01023129091373
+0.0,-0.5740578222548407 -0.9010801407945085
+1.0,1.1844352711236998 1.0077910117111921
+0.0,-1.1904557430938465 -0.972229300373332
+1.0,1.9514043869587852 2.6603232743467817
+0.0,-0.11744191317950421 1.8160954524210857
+1.0,2.796337014232012 3.45131164191957
+0.0,1.1908754571951825 1.37388641966138
+1.0,3.1347230127964805 3.4874636513372774
+0.0,1.4279445191621287 0.4142573535049987
+1.0,3.2845746999649457 2.942571828876143
+0.0,1.0418078095097314 -0.515727237947711
+1.0,3.0672407807876674 3.593602465858237
+0.0,0.1070041194341431 0.013584199138111364
+1.0,2.831124413123504 2.5083468687281196
+0.0,1.9088191143015583 1.1943157723052062
+1.0,2.888463730373365 3.8588231186101716
+0.0,0.3344825700647222 1.4902421889158837
+1.0,5.1805240354926285 2.347000348613805
+0.0,-0.14736761539184529 -1.3764336595247777
+1.0,4.945788020165247 4.520764535128319
+0.0,0.48089579766964224 -1.0406729486881927
+1.0,3.115699146536788 3.0271206455481905
+0.0,0.8816867514268375 -0.7885530518936628
+1.0,3.293642905051253 4.129500570671647
+0.0,0.021019117419869213 -1.0983625263034136
+1.0,3.4712873315273884 2.8896550248710255
+0.0,1.336463967380889 0.1782538924176004
+1.0,2.9674559623039674 2.1702990000666977
+0.0,-0.9137873001694705 -1.6488427315604255
+1.0,2.425720985355789 3.336546225859983
+0.0,-2.3622279944776245 0.33443034793657744
+1.0,3.557057454549674 0.9654984504665607
+0.0,0.4924227412613347 0.8572441753897001
+1.0,2.903599258175698 1.9821387894597133
+0.0,-0.562864152759892 -1.41025535274598
+1.0,2.621542267864135 3.0896861639721602
+0.0,-0.9659016052287058 1.8601390770202668
+1.0,2.73394050343452 1.5908844566159697
+0.0,0.316736908826005 0.2857224419323005
+1.0,2.3312567009140532 5.596694984859762
+0.0,0.3137619371424862 -0.1840942808000176
+1.0,3.857644883242267 1.7425846536145542
+0.0,-0.10204795362718587 3.253153279848385
+1.0,1.991635750012152 3.0091345292604816
+0.0,0.6187841242310289 0.9589700354301842
+1.0,2.9773010080735895 3.723750625441197
+0.0,-0.8890787476930039 0.6057780620635984
+1.0,3.2341068438464773 4.238588226643048
+0.0,-0.6100941277292691 -1.5125630779121992
+1.0,3.378840902739636 2.0705801293719017
+0.0,1.9736225258875286 1.725383750563661
+1.0,1.8874237286900284 3.9061132751393997
+0.0,-0.0823939289302894 1.8958431169469556
+1.0,1.5927855001333566 4.6310125064091965
+0.0,0.3112044157520983 -1.7878471816057036
+1.0,4.34881513764263 3.4693940014863784
+0.0,1.052103622850019 -0.16912252356217902
+1.0,3.167179956507673 2.8792495587252507
+0.0,0.16791453003538387 -0.8546142448164881
+1.0,3.0538805073215953 3.4494667407676842
+0.0,-0.9500475678227512 0.06998146933806365
+1.0,3.8909913837847467 2.6813428719208763
+0.0,-0.09976816220585052 -1.4875944011133129
+1.0,3.1791447205478742 4.424991854067018
+0.0,1.0999643223476656 -1.1200747827607145
+1.0,5.222367041159025 1.2015274537211948
+0.0,-0.2848179798736651 0.401703345435371
+1.0,3.92690552314874 0.5307127426832543
+0.0,-0.6771410319499919 -0.5806616553853885
+1.0,3.611779415106116 3.3322298911093533
+0.0,-1.359189339369671 -0.03773529290863042
+1.0,4.696002594470123 1.4346348756461187
+0.0,-1.0094856636150293 0.19687532044013809
+1.0,3.2169383066148383 3.2307201581236473
+0.0,0.7836015359045666 0.2941037782687062
+1.0,3.7317041306588012 3.7985843457251107
+0.0,-0.3693168101963429 1.4513472421644549
+1.0,4.398703283685875 2.654636797434109
+0.0,0.02043081741683321 0.20805199015337653
+1.0,2.324187503797731 3.8819865944906566
+0.0,1.671377007435211 1.3731572027338659
+1.0,4.534630721644852 1.1543799480085444
+0.0,-0.3253127279932509 -0.8285225286171498
+1.0,3.993821155042294 0.7056403589045206
+0.0,1.194500226045371 0.638917136862092
+1.0,2.72148063695256 3.858678264350294
+0.0,-0.1905653672336637 0.8969404368665279
+1.0,1.9587911397509248 3.937696894952624
+0.0,-1.1358853052995896 1.4443151501322575
+1.0,3.7551091652428026 2.475478572543473
+0.0,-0.9167034706173607 -1.7549316646340103
+1.0,1.4669571532496661 3.2025879996118567
+0.0,-0.9673112226998997 0.13104324478779786
+1.0,5.129589009385082 2.962228456981596
+0.0,-1.038791699676283 0.3394661925580474
+1.0,4.0067362767396055 3.7808733451013863
+0.0,0.4607763000001474 0.3165842402170894
+1.0,3.470781763864157 3.1917117382789906
+0.0,-1.0759836593672722 2.1677955321765423
+1.0,1.8061608083541592 2.1368201192592524
+0.0,0.18913968729195288 -0.6832055159990379
+1.0,2.222086435460701 2.462434683952491
+0.0,1.1697195016246194 -0.6482703204844716
+1.0,0.9469729137532825 2.564223951962673
+0.0,-0.2596612587018774 1.3675954564898984
+1.0,3.3498722540414603 2.8411678301395655
+0.0,0.15549061976540607 -0.8795816620250406
+1.0,3.2166810907529517 3.3909740833940147
+0.0,-0.27777898312342497 1.5708467895548373
+1.0,3.5590852623593734 3.022687446035052
+0.0,0.8854804450462548 -0.1674059547432505
+1.0,5.592380230543062 2.046846128948299
+0.0,-0.38403645419139704 -0.6879614453050698
+1.0,1.2059037878354082 3.1373448113023263
+0.0,-0.9332349591768346 0.3271191223126651
+1.0,2.6941262027196444 2.0016455336591275
+0.0,1.985628476449888 -1.720937514961405
+1.0,1.52678578836386 3.6524268651279113
+0.0,0.14930924959259012 0.3549736192569231
+1.0,2.5081810800507904 4.502494324423253
+0.0,1.3659157029970181 -1.4064298168920828
+1.0,2.8947698041280185 3.871692848909248
+0.0,-0.19002791703482588 0.8099829390725909
+1.0,3.0481549176670555 4.05245395484312
+0.0,-0.014729952199541938 0.43445426055411474
+1.0,3.0874888030440486 3.89317889717026
+0.0,0.9521743475193137 0.16292125350371375
+1.0,3.0564028575123805 3.150394468127784
+0.0,-2.5565867181635724 1.1693524400747453
+1.0,3.963399476624186 2.655863627219969
+0.0,2.0594134768376584 1.4326082874689938
+1.0,3.9415985004601524 4.816989711315565
+0.0,0.4986273362656531 -0.30506819506279537
+1.0,2.7697598834307633 2.0292290332215512
+0.0,-0.4716043983943112 1.4692631198715722
+1.0,3.4127279940145883 3.078218915501194
+0.0,-0.28649487641740207 -0.8009455078808752
+1.0,2.645854233845017 4.028461076417125
+0.0,-1.2333241385253426 -0.2850384355482007
+1.0,2.4938754741404976 1.3466482769013481
+0.0,0.6872021385233428 -0.5159203960430369
+1.0,3.136974388668967 1.69291587793452
+0.0,0.9532239280401443 2.619265789851879
+1.0,2.570576389986536 2.548658346643033
+0.0,-1.030037965987706 0.2814883160676786
+1.0,2.510605023939257 2.3227098241155213
+0.0,2.4171507836629256 1.245606490445435
+1.0,3.5520681299250985 0.7442734445298673
+0.0,1.1940577980770877 1.6319950123919318
+1.0,2.708933998825159 2.118496371335553
+0.0,0.26808250222082186 2.5727974909556437
+1.0,3.221534693193204 3.073316472650363
+0.0,-0.6915734756410544 0.25168141600713434
+1.0,1.839319878312068 1.765565689559382
+0.0,1.708990562782385 1.1196517028520787
+1.0,2.1942131633492643 3.733776318231434
+0.0,1.4884941762679373 -0.5221400677305167
+1.0,2.425026062564176 4.814343944240822
+0.0,-1.3572570451352999 0.04542725800519613
+1.0,3.211869589232063 0.01498355271713292
+0.0,1.6170759581287553 0.7420944718274473
+1.0,1.8096883146020295 1.2063063122336204
+0.0,0.8326608996906895 -0.9760063002065638
+1.0,3.60415819299222 3.905143144181063
+0.0,0.9709971797789466 -1.0644382680658016
+1.0,2.8104103693138778 3.5792951568581017
+0.0,-1.021059644329913 -0.25967578007654707
+1.0,2.4020556940935216 3.8705560506781826
+0.0,-2.704107564850001 -0.14300257306795375
+1.0,3.7681081908063643 2.5433599278958297
+0.0,-0.537043950598385 0.8892208622861
+1.0,3.894301374710518 2.76168141850308
+0.0,-0.8416385593366815 1.3377079857054535
+1.0,1.4560861866861152 1.9464951398785584
+0.0,0.8974462212548237 -0.9027814165394935
+1.0,2.848274393366227 4.089266410865265
+0.0,-1.9874388443190703 -2.0515326123686
+1.0,1.7443330286532606 5.182730816947559
+0.0,1.9345124573698136 0.15482916596109797
+1.0,3.730890742221753 3.4571088485293173
+0.0,-0.7591467032951466 0.7817400181511722
+1.0,1.9612060838774241 1.7874104906670758
+0.0,0.04241602781710118 1.7624663777014242
+1.0,2.983106574446788 2.057794179835603
+0.0,-2.2675373876565272 0.1810247094230928
+1.0,1.8242036739605434 3.2897838599534053
+0.0,0.42135250345103276 0.9201551657148959
+1.0,2.3324158301116547 3.2735600739611406
+0.0,-2.503382611181759 -0.604428052499623
+1.0,2.1068571110070753 1.3987709205712464
+0.0,-0.25006447102137164 1.1597904649452788
+1.0,3.6610503210650105 2.389802330720335
+0.0,0.6655774387829471 -0.7657689612002381
+1.0,3.85820287126228 5.653287382126853
+0.0,0.08244241317513575 0.4755361735454262
+1.0,3.6029514045048234 3.0483730792265247
+0.0,1.0276000901424318 -0.569237094330588
+1.0,2.484863163042475 3.4464671311141046
+0.0,0.24588867824456415 -0.7355421671684942
+1.0,2.8757627634577396 1.3730139621444188
+0.0,0.911649033206053 -1.0562220913143838
+1.0,0.6701966948829261 3.8815519088585195
+0.0,1.0649444423673609 0.5738944212075908
+1.0,3.1272553354329955 5.18450239514651
+0.0,-1.8305691156390467 -1.2811179644895232
+1.0,4.326027257587544 1.9589219729995737
+0.0,-0.2278417247639679 -0.6436775444106994
+1.0,3.9854139754166136 2.8662622299102947
+0.0,-0.33177487577648573 0.7122237484053809
+1.0,2.7631237758865255 2.490470927953921
+0.0,-0.2989203275224733 -0.9063254275476191
+1.0,2.7739570950234254 3.333596743208583
+0.0,-0.12025132003053318 -1.2251715775331837
+1.0,3.9028268386113307 2.580334438085556
+0.0,0.3114518803226873 0.35489645702286177
+1.0,2.8765994073916112 4.251640702192294
+0.0,-3.0895947568085367 -1.0526550179589378
+1.0,3.5182345295490216 2.764855512391279
+0.0,0.5749621254042305 0.7148834016467635
+1.0,4.039448299164001 2.377396087740471
+0.0,1.7077800661629936 -0.23711282974122355
+1.0,2.883211311171089 3.5259606315833287
+0.0,-1.0304518163976537 -0.16271910447066004
+1.0,3.8284470175501504 1.0841759781704199
+0.0,-1.3620621426919217 0.8678141368192274
+1.0,3.831976508070298 2.3592788803510505
+0.0,0.8398199934902235 0.8458121179021545
+1.0,2.166979759191688 4.408250411844058
+0.0,-1.2009412161006234 -0.04486968047943732
+1.0,3.0041897020427517 1.67577082931885
+0.0,-1.0550850035108499 2.6114061208535673
+1.0,1.46399823823424 3.6863318429400627
+0.0,-0.439942118867861 0.8107733517611471
+1.0,2.799907981207793 3.1021389011201244
+0.0,0.40512996190803663 -0.2720769110918539
+1.0,2.936414720731187 2.6121553148876706
+0.0,0.7864503163458285 0.879685137879171
+1.0,3.497848931993103 3.93953696354328
+0.0,1.0898800025299487 -0.3780987477521812
+1.0,3.0737866861658834 3.8281246288654067
+0.0,1.0100369320198321 -0.36412797089680377
+1.0,4.977156552398557 1.9361263628969327
+0.0,1.1948682006514484 -1.0421380659408503
+1.0,2.3707352395183743 3.319087891488442
+0.0,0.14662871945444525 -1.125277513770441
+1.0,4.18636170602371 5.079790109963499
+0.0,0.5213830491310841 2.5489667538554355
+1.0,3.456121838657517 2.9777488007628823
+0.0,1.3942157902546204 -0.7392170745991694
+1.0,4.027857416272539 2.5520251242493615
+0.0,0.6677437543225546 -0.7054702957392922
+1.0,2.419993627501343 3.147115729790262
+0.0,-1.1891285195785104 0.7121837556662985
+1.0,2.6768950566988114 2.746092902448666
+0.0,-0.5581632736462642 -0.8475377022167101
+1.0,2.2877649074222144 3.360822129377224
+0.0,0.12427410923130733 -0.029877611579596446
+1.0,2.1363649823278976 2.040672619624904
+0.0,0.164296403698455 -0.7853340225962958
+1.0,2.2867454265483063 2.920796736914219
+0.0,0.030938689766481568 0.02840531713718885
+1.0,4.935402862397514 4.984097800264938
+0.0,-0.49323021214001667 -0.009344009957387383
+1.0,2.2590589178865788 2.784700488476081
+0.0,-1.7996451721642797 -0.08927843209025701
+1.0,2.7189425454136047 3.366984002518318
+0.0,-0.4732503966611213 2.41667617281343
+1.0,1.914172722581019 2.723688261246487
+0.0,0.6854209215843875 -0.6321377274037409
+1.0,4.7025333481932705 2.6561807763401646
+0.0,0.016511529980536163 -0.4064291762993186
+1.0,1.3841179371371182 3.367159685928979
+0.0,-0.525665902025766 0.3189849885462113
+1.0,2.1237941386456276 3.4141040859263914
+0.0,-1.3977733609952327 1.6180332199555512
+1.0,3.3282228318571496 2.9879449742002184
+0.0,-1.3911999737510374 -0.47876736354905697
+1.0,3.071461319022103 3.902142645231827
+0.0,-1.4616870328596612 0.4234223737141411
+1.0,3.3069543201402576 1.3522887907099401
+0.0,0.1771175002160632 0.7092577154896049
+1.0,2.561517669553921 3.2663130772229185
+0.0,0.8635080818806004 1.7578935533355913
+1.0,3.3054989034355793 3.4205399612822633
+0.0,-0.5525474134214131 -0.008874526853035592
+1.0,5.024607965706471 3.377256085775693
+0.0,0.6499316691799448 0.7636813929956143
+1.0,1.7211648540475015 3.7290596058136307
+0.0,-0.4312096678787339 0.4723353140241522
+1.0,1.6269397815780402 1.9613109767814954
+0.0,0.06589250830042476 0.5659627954925366
+1.0,1.4141705667382305 2.9411215895612255
+0.0,-0.30655047441372724 1.134312621267185
+1.0,4.079371134159225 3.7127217011979767
+0.0,-0.11148410319718746 1.504423362990177
+1.0,3.21908765035085 1.5284527951297098
+0.0,0.38879874604519066 -0.7718569898512835
+1.0,3.0387686435299197 1.9571679686339727
+0.0,0.0432538958325193 -0.609046739618082
+1.0,3.858513576900389 2.3343789318227595
+0.0,-1.594606569379673 2.0291869081775498
+1.0,4.418575803606943 3.634284954659144
+0.0,-1.5657043498774568 0.48528442006547645
+1.0,3.7474369990653518 2.417108621170513
+0.0,-0.4087178618516316 -0.5585629524971241
+1.0,2.8830052178069345 2.714807180476644
+0.0,1.0200529614238536 1.633454495011907
+1.0,2.161101444560085 2.722233198993495
+0.0,0.8905571055499505 0.3531260808046299
+1.0,1.5770402091220281 2.5197577954902615
+0.0,0.19603489193696402 0.4391781215510938
+1.0,3.285302297900197 2.5981032583297274
+0.0,-1.7728311957227578 2.226646036588897
+1.0,2.212402423781055 2.994783519362575
+0.0,-0.26351331835428804 0.6197161896115081
+1.0,2.5101464936050144 2.747453537535198
+0.0,1.083443472210967 -0.7471502465676395
+1.0,2.618022142084275 3.201094589808021
+0.0,-0.10243507468644107 -1.5307780048431203
+1.0,2.0479014235932986 2.7174445598757764
+0.0,-0.2530316183327909 1.5105959457792464
+1.0,2.616239369128394 3.1011058356715644
+0.0,2.0703487677159997 -1.23039689097027
+1.0,2.00559575849234 3.088170264353322
+0.0,0.751453701775929 -0.34079600956200146
+1.0,2.6436129383324625 0.6934715851263205
+0.0,0.4735774669250165 0.24981500600111478
+1.0,3.614102521076285 3.297655445774221
+0.0,-0.8397190394129946 2.0791729859494583
+1.0,2.5800847823336372 2.312770726398467
+0.0,0.9528690775719402 -4.054641847252764
+1.0,1.6631425491523402 4.465488566725185
+0.0,-0.40442215938144854 2.1662912065078923
+1.0,3.2025444402071472 0.954639816329502
+0.0,0.8484611241529962 -0.6531501762867838
+1.0,2.907155165379039 4.494838051538261
+0.0,1.1473298350419248 -0.7604213061923158
+1.0,4.406872541176625 2.616395889868952
+0.0,-1.0643453307576694 0.32269083514118757
+1.0,3.4229771635424653 5.404174358063928
+0.0,0.8223012341648268 -2.0705983787489455
+1.0,0.6519219290294926 3.317297519573949
+0.0,0.6661739745821234 0.21368601256080724
+1.0,2.8092516816651187 2.9407143882873363
+0.0,-2.0396349059310626 0.6660958962860263
+1.0,1.621401319049101 2.120514741629026
+0.0,-0.6673242389540511 -1.033336539766657
+1.0,2.4729967381312257 2.0622671692969314
+0.0,0.318696287733599 0.7696143248064906
+1.0,-0.3310542190127661 2.503572170101248
+0.0,-0.024545405442632163 1.2826535279165514
+1.0,2.08361065329982 1.7709137020843035
+0.0,-0.03325908838419148 2.127731976717063
+1.0,0.8920712229737089 2.267227052639782
+0.0,2.4226620796703706 -1.5422597801969735
+1.0,2.6125707261695665 4.136941962252239
+0.0,0.710000430684373 -0.2365544035810329
+1.0,3.587983407259662 2.371118916918134
+0.0,1.548716105657387 2.6039797648647527
+1.0,2.288647833469394 2.8514285941696564
+0.0,0.5407956769257948 -1.4250712589214616
+1.0,3.9999271279969157 4.647262641336589
+0.0,0.46916438504363506 -0.16114805677977867
+1.0,3.9351714928555133 3.017851089635014
+0.0,-0.24683125971847 0.8686956304798523
+1.0,2.445900548419883 2.601998949302925
+0.0,0.9708272515136681 0.9540365110832763
+1.0,2.0889493306284472 1.670700190658552
+0.0,0.7573519355244429 -0.6731075400854291
+1.0,2.9938559890272676 0.5796453404844417
+0.0,-0.42350233780111274 0.1072223004754211
+1.0,3.22502989165533 3.2744724666391045
+0.0,-0.051171179793716125 0.035749085667007977
+1.0,4.256076524642883 3.956646576238979
+0.0,0.44715068158575316 -0.10904823199444005
+1.0,3.754239074295241 2.4862504435534283
+0.0,-0.12025734941101636 0.6682754649328633
+1.0,2.9673795614648815 3.6207880514009263
+0.0,-2.250093626462795 -0.49148713538228506
+1.0,1.7335315087131171 4.234455598757855
+0.0,-0.5145677322324603 -1.8872464244504652
+1.0,3.1524408905920547 2.534903833671654
+0.0,1.4188237424906527 -1.987300018397619
+1.0,3.025903676999244 2.1652631630581847
+0.0,0.5008343534015861 0.28011601768758965
+1.0,2.0039218613662197 2.3639397631018015
+0.0,1.342528231824729 1.0036076495884643
+1.0,3.3281244751369985 2.4251038991267277
+0.0,-0.38845861664115766 -1.5147629282596704
+1.0,2.613448357242925 4.463712912575443
+0.0,-0.19439583983218703 0.676381234314577
+1.0,1.0400516553104269 2.3981508685333424
+0.0,0.9469554018478826 -0.08144910777086176
+1.0,3.179705969662961 3.768848690124549
+0.0,0.39855441813668835 -1.6301847736954416
+1.0,2.1915941615815226 2.7947789889097763
+0.0,1.6023287643577222 0.05432794979410767
+1.0,1.5758610206949497 3.8709473262823777
+0.0,-1.3109119301269387 -0.8645189055395048
+1.0,3.715865055565244 1.9360512196442488
+0.0,-0.2073998491467907 -1.178882579876182
+1.0,2.565062666629786 2.3121370465462494
+0.0,-0.41397768670851737 -0.6674761320605563
+1.0,2.941938460212705 3.537877403937825
+0.0,0.5954231185191001 1.6839554319972647
+1.0,4.591360208911688 1.4381368838271187
+0.0,-1.3221878199013057 0.786799353955043
+1.0,0.6498018470693379 2.2143413646510095
+0.0,0.5346452265922554 0.45599002729248733
+1.0,2.668100742914233 2.679883986650412
+0.0,-0.22428284967184606 -1.0003823373608314
+1.0,4.233871998643562 3.3423521548333897
+0.0,0.7800144346305873 1.6512542456242612
+1.0,3.3192955924982677 4.664828345688715
+0.0,-0.9059493298933676 -0.42207747354389447
+1.0,3.1776956110847916 1.1393123509452483
+0.0,-0.5246202787832872 1.0246845701853746
+1.0,4.732113325540828 1.29018271893586
+0.0,0.9863596225434407 0.7506968948666005
+1.0,2.911409852038849 2.626474556246977
+0.0,0.8545346747310709 -2.1711133879380955
+1.0,2.476689592134109 4.03136160709651
+0.0,0.43108249592457043 0.4589971218864913
+1.0,3.2333287857145825 2.188137362144206
+0.0,1.4405649581445525 0.4131214094941824
+1.0,2.0631468420251093 3.807898318807702
+0.0,0.43964401099781425 0.6669437158150616
+1.0,2.165843657939062 4.109647016182597
+0.0,-0.9735452695016392 -0.6172105570335473
+1.0,3.169794653766589 3.2721053734106
+0.0,1.3129166037688875 -1.2040138532590103
+1.0,2.211361701514339 1.025981622029549
+0.0,0.3653350359702278 0.5229315457444437
+1.0,3.372206428302252 4.163685355869495
+0.0,-0.8690030167652726 0.3226849491596335
+1.0,4.188509026227427 2.1137749377457076
+0.0,2.2174789916979933 0.8249932442083762
+1.0,3.9224824525785706 2.9436443006575925
+0.0,0.1370905200148926 -0.043320354739616776
+1.0,3.1118662077850807 1.4983207834379917
+0.0,-0.5304073850344787 -0.4219778391981189
+1.0,1.2153552376808336 3.4749521622043438
+0.0,-2.545970043914331 -0.5480647959096547
+1.0,1.8097968872175412 4.733523163055134
+0.0,-0.5599306916727819 0.4648015112295201
+1.0,3.0242901796172204 4.354893518146392
+0.0,-0.49175893973189483 1.8635231981223406
+1.0,3.923889822736733 4.199324033436554
+0.0,0.32931083529824645 -1.2038529291812745
+1.0,2.8430570026355904 3.2581768028655214
+0.0,0.08015643729775149 -0.5281238499521005
+1.0,1.0251176552841985 2.452443183841665
+0.0,-1.4000614002792062 -0.4723026702712555
+1.0,4.642753244692533 3.5777684251625153
+0.0,-0.9732069449126244 -0.7507666182081589
+1.0,2.284811103731081 2.6226837934175817
+0.0,1.4938320459354653 1.2271703303402608
+1.0,2.5217907633717935 1.9804499278889345
+0.0,0.9177851256816916 -1.196945923903535
+1.0,2.650515007788954 0.9818159554114416
+0.0,-0.4172435945582116 0.11930551874205601
+1.0,1.8203127944592765 3.3069324017397594
+0.0,0.08195935202288789 -0.2585763476071969
+1.0,2.14910426585678 4.146147361847687
+0.0,1.578290774885182 0.16149960053586573
+1.0,1.2607405323635168 2.940350340912184
+0.0,1.6722138822230346 -0.5454073192477626
+1.0,0.3769561517619793 4.029314828130509
+0.0,-0.012008811772440746 0.2577932550827986
+1.0,2.330909580388283 3.1650439747088024
+0.0,-1.4224384024201595 -0.6369918128076046
+1.0,3.451178380794735 2.7553545272536746
+0.0,-0.7913135079702314 -0.012217405089490006
+1.0,3.7918310740082424 3.3927876820084033
+0.0,0.41016650792928255 0.3521369094279198
+1.0,2.380867149491576 3.7533007228820754
+0.0,-0.2787273586680994 1.3553543015884186
+1.0,2.8933236071325226 1.7975563396445144
+0.0,-0.4868680345968448 0.058461169788172784
+1.0,3.484434144626577 3.5622013162506683
+0.0,1.171904838026115 0.1162839888503951
+1.0,1.8132727587691455 2.238018140780368
+0.0,0.8114997821213137 -1.712768034302675
+1.0,2.977061410695451 2.802894970831404
+0.0,1.7141760742336318 0.5672102391229309
+1.0,3.2929421353515185 3.3754831695793945
+0.0,-2.280170614413754 -0.4912881923146271
+1.0,4.182771547422101 3.5331418354105812
+0.0,-0.2544453921577854 0.4682744998445509
+1.0,1.9236524545763007 2.628837510538455
+0.0,0.6645491524745186 -2.398604366119661
+1.0,3.50840713613987 3.7182332137428955
+0.0,-1.4532823239751684 -0.9916580822162051
+1.0,2.769613688635247 4.72661442603805
+0.0,-1.090104082054257 0.486265921887567
+1.0,3.4900626627065003 3.03025323652533
+0.0,1.4518716691137106 -0.10218738652959546
+1.0,2.745034544461333 4.366809709694589
+0.0,-0.17197050309086373 0.13673125942508174
+1.0,2.4934379443680985 2.954734256628178
+0.0,0.14078971520128297 -0.5401300324197861
+1.0,3.640563349517043 5.163454382169049
+0.0,1.0264020194022627 -0.8738489740165843
+1.0,3.791458514669831 2.2038333093620834
+0.0,-3.075231830613813 2.04054404065675
+1.0,4.647422323558612 3.5220753128741427
+0.0,-0.6423734479152313 0.5403500050100541
+1.0,1.5985339514690007 2.73447434771563
+0.0,-0.04474684215568748 -0.21477212224970194
+1.0,2.6701891009654792 3.9776885659794505
+0.0,-0.4714276238216119 1.4235807729101415
+1.0,3.5551789183755806 2.7057825768035104
+0.0,1.108254774651522 0.8596053056731966
+1.0,3.0623366138774983 2.718494058918926
+0.0,-1.375827910513567 0.011994162356159788
+1.0,3.841407434840553 2.8434319292302304
+0.0,-0.7149712282755271 0.1811986378283469
+1.0,5.155524316715826 2.1468464150279747
+0.0,-0.06822014690491127 -0.15801546435311806
+1.0,3.4838423066641173 4.211572262022802
+0.0,1.455177312877137 -0.9388697017811595
+1.0,3.917344840727481 3.569507254920478
+0.0,-2.080636526173827 -1.2489913979804321
+1.0,4.904327940183608 3.4289745068714295
+0.0,-1.4744723958060084 0.2930577753686633
+1.0,2.810346752831796 2.4062885063635333
+0.0,-0.17365054648101302 -2.26263747840141
+1.0,4.077713960215311 3.841309768575811
+0.0,1.581178479362914 -0.9672846912018417
+1.0,4.516244757634386 2.9078781629204054
+0.0,-1.5890391289381882 -0.4092245513024253
+1.0,3.359480708344044 3.7375262649030123
+0.0,1.5675385032786122 0.9010632060589036
+1.0,3.8564874267647644 3.060660915266198
+0.0,-0.2482500870678099 0.29655946916337894
+1.0,3.1672692968701397 1.1973226392521306
+0.0,-1.4471523637168304 0.5370395414503478
+1.0,4.814859889188941 2.229750617440331
+0.0,0.2812295731325761 0.6044036116090106
+1.0,2.4884527354338903 1.4171627784171204
+0.0,1.173099753717184 0.7948729712563257
+1.0,1.5092479631180256 4.1412277875509105
+0.0,-1.1453508695714685 -0.15567849492271865
+1.0,1.9397046305500465 3.430755367623314
+0.0,-1.6689604208958047 -1.161942047896626
+1.0,4.287905082572467 2.643797664646416
+0.0,0.5691715436318573 -0.6013793142266736
+1.0,2.622904412483301 1.769830678112635
+0.0,-1.0627706066421603 -1.2962746926911266
+1.0,2.5818494635089886 2.9547836545958663
+0.0,-1.555832778500785 0.6050365213516793
+1.0,0.6877755924513469 3.0627330470806617
+0.0,-0.6945984937358738 -0.5355659085722678
+1.0,3.631758943383 2.6990914911890194
+0.0,-0.10204034384758799 1.2650405538373874
+1.0,2.8618200471403488 2.7676923144816237
+0.0,-1.2337428464512885 -0.7151041760567872
+1.0,3.5209869997316807 3.280763138579491
+0.0,0.3700095159793621 -0.8614396246939711
+1.0,2.698616090611572 3.2205340189872795
+0.0,-0.8069663812258417 -0.07956402748767083
+1.0,2.929873320056276 4.030067053746698
+0.0,-1.2316919288622938 1.245687935224532
+1.0,2.9285679560367055 2.9682906465530783
+0.0,-0.3965578686363537 1.1748126835359254
+1.0,4.002714110052464 4.370338584188975
+0.0,-0.6084107635744659 -0.6092872315132073
+1.0,3.293912876563504 3.5843332356258464
+0.0,-0.8145032742370918 1.4050967895930515
+1.0,1.991600071099763 2.343264260750465
+0.0,-0.9433799779882722 1.5943129187456013
+1.0,2.369037146473894 1.9827898318071764
+0.0,-0.26885731570182714 0.47421918725401946
+1.0,3.263006333756187 3.0441051541001443
+0.0,0.21785408377528742 0.5754303556190559
+1.0,2.941128899266118 1.240818619804987
+0.0,0.736142634408259 -1.3173589352849961
+1.0,3.2027184783050644 2.9218716893221766
+0.0,1.9216539101612737 -2.2400666381338694
+1.0,2.4823406743823426 3.429705681271458
+0.0,0.0666674809216063 -0.976496437708073
+1.0,3.206108328915537 2.0828009180110976
+0.0,-0.11582094814525531 2.5093876016868366
+1.0,2.5373176496966328 2.32926952602907
+0.0,-0.9237765727032562 0.9342845305943139
+1.0,2.5300867778672123 3.2754703213122753
+0.0,0.13837351460348038 0.2533025702882705
+1.0,4.556185356940701 0.7629684714626066
+0.0,-1.8251759895063635 0.6966019254550819
+1.0,4.905392053322123 4.111245902434462
+0.0,0.09886105139472441 1.4093224263552915
+1.0,2.0484713074013223 4.874632770975326
+0.0,-0.040609033066195156 -1.3446008307073973
+1.0,3.678642687565624 4.156505531118834
+0.0,0.052003196801406706 1.2239229001362555
+1.0,3.4376496474012876 2.417529764306501
+0.0,-0.09054032070414311 -1.7571173217955876
+1.0,3.230032966809188 3.5965216835420546
+0.0,0.9100014718072797 0.5615698517199065
+1.0,3.938728443662248 3.2945250621813273
+0.0,-0.9205165004286314 -0.01425448590777016
+1.0,1.907285344344031 3.8629943281683987
+0.0,-0.8160057252300347 -0.2757475590440447
+1.0,2.3076630082503926 3.2283118851645476
+0.0,1.3000520665928303 0.581203895654615
+1.0,3.8425274250736887 3.6133028383400414
+0.0,0.13694776598217193 -1.1659103408047182
+1.0,2.688548985689179 1.5486856086329917
+0.0,-0.14378057635986438 -1.4649914115754739
+1.0,3.923705106138171 3.8281415874634783
+0.0,1.3334544187579878 -0.048721556115349604
+1.0,3.320777445436592 2.947489296620178
+0.0,-0.36251547004650103 -0.2886015741883188
+1.0,3.2163584307843567 2.9285953038088373
+0.0,0.5437339741631225 -0.23459273264636704
+1.0,2.820666118654177 4.0305429519659395
+0.0,0.04808393980018175 0.42285718084497675
+1.0,1.4686721107589078 2.6605885841423067
+0.0,1.1873828480862414 0.5487600196906772
+1.0,3.425690422789916 4.252827757634791
+0.0,-0.7323210179394448 -0.9818194354330615
+1.0,3.018263609974841 2.914037267945018
+0.0,1.005159548514262 -0.5055899932767433
+1.0,4.566046579419102 5.545663797862058
+0.0,-0.7129346827436536 2.2938920919917742
+1.0,2.869336979055624 2.5688122980246684
+0.0,1.5201806096451054 -0.7414084378784415
+1.0,1.71558426191034 2.4576286538624794
+0.0,0.8090326808020629 0.26208059965589425
+1.0,3.0163716479573077 2.4747608384001056
+0.0,0.47627288733283857 1.3085076289292734
+1.0,3.3891272567835684 3.20832981462489
+0.0,1.0488767400026389 1.2318533170755142
+1.0,3.3428160616141853 2.5497426855885075
+0.0,-0.6411040361810151 -0.4290410178863531
+1.0,2.219119637941564 2.6621113083439254
+0.0,1.5621125506487947 0.7273124535333745
+1.0,3.1459765929197636 1.3663869759433418
+0.0,-0.05263982623034547 0.43675636434345644
+1.0,1.890191705836878 3.435071392429276
+0.0,0.28718983621307775 -2.438042507707637
+1.0,5.717207001359904 2.2303522388797035
+0.0,0.17636841934036573 -0.2202348356695646
+1.0,2.7426941364254294 3.9506423829670734
+0.0,-1.118995077703066 0.6062681312772151
+1.0,4.510963440028501 2.4497214672006575
+0.0,0.07601426739661686 1.4712413920907517
+1.0,2.472822799411239 4.045939967967948
+0.0,-2.2061186560242603 0.32560701091997957
+1.0,3.250675248798315 3.268273446922124
+0.0,-0.024542349115316425 1.5505593308513355
+1.0,2.5654508852779654 2.9476923150082874
+0.0,0.8070230851041806 1.0614288963806608
+1.0,4.0121013342203655 1.7608333223695753
+0.0,-0.6895596222836047 0.035498410809669464
+1.0,1.697905057706837 4.053746875797327
+0.0,-0.3311042917990167 -0.09180266122060314
+1.0,3.720796880080382 4.467214289132983
+0.0,-0.318673057944378 -3.1474317710285202
+1.0,4.809204233917482 4.55250051737848
+0.0,0.596445093094233 0.41780789823963405
+1.0,4.432965399675368 3.4638105151117617
+0.0,-0.10285141484897965 1.747950423830727
+1.0,2.1513849154027014 3.9020766404442933
+0.0,1.5988780419195843 -0.08753929889987294
+1.0,0.9867334105272594 3.017081919852008
+0.0,-1.4952194834476749 1.0187701527429442
+1.0,2.2468599817570376 2.5883807516977395
+0.0,-1.804930212071194 0.3519094744696904
+1.0,4.1524048686549975 2.39387437993355
+0.0,0.7077190974093445 0.5703893640810606
+1.0,3.551726989450847 2.4786821848615985
+0.0,1.866022101379231 0.23733176192158173
+1.0,2.636453843734601 3.2607059005922467
+0.0,1.0052825898444602 0.5988275134415102
+1.0,2.643754787324359 3.72363185525656
+0.0,-0.9925822461102075 0.060644514219670244
+1.0,3.8994350969658136 1.9246001662480055
+0.0,0.6513177047637154 0.04450296971216735
+1.0,2.4564101844841106 3.6785165656991596
+0.0,0.2606556093620563 -0.6172755504020078
+1.0,2.4170362032345674 0.8639272362396189
+0.0,-0.6416537078444019 1.8622433251026849
+1.0,2.0247632881021267 2.538336421666863
+0.0,-1.0177991501405648 -0.8522549981552515
+1.0,3.3426117902650185 3.1635532244875586
+0.0,-0.08963512689480763 1.4555128614393191
+1.0,3.7470117779591092 3.414476280017385
+0.0,0.7721815837750134 -0.17297061945116646
+1.0,3.823597567639877 4.2427688079492665
+0.0,-0.6905817293226868 0.5838402640342898
+1.0,3.005258204213709 2.7252310853631125
+0.0,0.963732273262942 -1.3950688358262504
+1.0,3.2803836447761934 3.448945851174787
+0.0,-0.11576488451784747 1.8796627145034757
+1.0,3.905782244273501 3.3853014175990412
+0.0,0.3786078767939069 0.4054987293824608
+1.0,4.251338642737948 3.2212804055347375
+0.0,1.785664685579919 -0.4528337660796719
+1.0,0.9522164714530392 4.648272724469027
+0.0,2.06805484281029 0.3211833348167774
+1.0,3.2063266406360875 3.20907719820361
+0.0,-0.18542396323311192 -0.4721814985954186
+1.0,1.2468417100913183 2.988063666542869
+0.0,-0.9089767150726245 0.049627884005341995
+1.0,3.570670591235201 1.812766580123238
+0.0,1.9973417232460495 -0.17709723581574177
+1.0,2.810527831677345 2.0292239826226717
+0.0,0.06390562956663569 0.9110683296487658
+1.0,4.449308253046676 2.5895593413305997
+0.0,-0.18596846882351442 1.2495641818989083
+1.0,2.1189215966743986 3.7928094437779283
diff --git a/mllib/data/ridge-data/lpsa.data b/mllib/data/ridge-data/lpsa.data
new file mode 100644
index 0000000000..fdd16e36b4
--- /dev/null
+++ b/mllib/data/ridge-data/lpsa.data
@@ -0,0 +1,67 @@
+-0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+-0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+-0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541
+-0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341
+1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865
+1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341
+1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306
+1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.6956156,-1.60758252338831 -0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799
+1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887
+1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975
+1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306
+1.9242487,-0.903451690500615 1.07659722048274 0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306
+2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306
+2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 -0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983
+2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
+2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783
+2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865
+2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 1.97200710658975
+2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306
+2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.155348103855541
+2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
+2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152
+2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106
+2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
+2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+2.8063861,-0.685277652430997 1.28214038482516 0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541
+2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224
+2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108
+2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 1.07357183940747 0.342627053981254 1.61744790484887
+2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424
+2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865
+3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799
+3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149
+3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541
+3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106
+3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799
+3.4355988,1.00765532502814 1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424
+3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887
+3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224
+3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799
+3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341
+3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341
+3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799
+3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341
+3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799
+3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931
+4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
+4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983
+4.3851468,1.25591974271076 0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799
+4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224
+5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975
diff --git a/mllib/src/main/scala/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/spark/mllib/clustering/KMeans.scala
new file mode 100644
index 0000000000..d875d6de50
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/clustering/KMeans.scala
@@ -0,0 +1,334 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.clustering
+
+import scala.collection.mutable.ArrayBuffer
+import scala.util.Random
+
+import spark.{SparkContext, RDD}
+import spark.SparkContext._
+import spark.Logging
+import spark.mllib.util.MLUtils
+
+import org.jblas.DoubleMatrix
+
+
+/**
+ * K-means clustering with support for multiple parallel runs and a k-means++ like initialization
+ * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
+ * they are executed together with joint passes over the data for efficiency.
+ *
+ * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
+ * to it should be cached by the user.
+ *
+ * Instances are configured through the chained `set*` methods, each of which returns `this`.
+ */
+class KMeans private (
+    var k: Int,
+    var maxIterations: Int,
+    var runs: Int,
+    var initializationMode: String,
+    var initializationSteps: Int,
+    var epsilon: Double)
+  extends Serializable with Logging
+{
+  // The set of cluster centers belonging to a single run (one Array[Double] per center)
+  private type ClusterCenters = Array[Array[Double]]
+
+  /** Create a KMeans instance with the default parameters documented on each setter. */
+  def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4)
+
+  /**
+   * Set the number of clusters to create (k). Default: 2.
+   *
+   * @throws IllegalArgumentException if `k` is not positive
+   */
+  def setK(k: Int): KMeans = {
+    // Fail fast here: a non-positive k would otherwise only surface as an obscure error inside
+    // a task once training starts
+    if (k <= 0) {
+      throw new IllegalArgumentException("Number of clusters must be positive")
+    }
+    this.k = k
+    this
+  }
+
+  /**
+   * Set maximum number of iterations to run. Default: 20.
+   *
+   * @throws IllegalArgumentException if `maxIterations` is negative
+   */
+  def setMaxIterations(maxIterations: Int): KMeans = {
+    if (maxIterations < 0) {
+      throw new IllegalArgumentException("Maximum number of iterations must be nonnegative")
+    }
+    this.maxIterations = maxIterations
+    this
+  }
+
+  /**
+   * Set the initialization algorithm. This can be either "random" to choose random points as
+   * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
+   * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||.
+   *
+   * @throws IllegalArgumentException if the mode is neither of the two supported names
+   */
+  def setInitializationMode(initializationMode: String): KMeans = {
+    if (initializationMode != KMeans.RANDOM && initializationMode != KMeans.K_MEANS_PARALLEL) {
+      throw new IllegalArgumentException("Invalid initialization mode: " + initializationMode)
+    }
+    this.initializationMode = initializationMode
+    this
+  }
+
+  /**
+   * Set the number of runs of the algorithm to execute in parallel. We initialize the algorithm
+   * this many times with random starting conditions (configured by the initialization mode), then
+   * return the best clustering found over any run. Default: 1.
+   *
+   * @throws IllegalArgumentException if `runs` is not positive
+   */
+  def setRuns(runs: Int): KMeans = {
+    if (runs <= 0) {
+      throw new IllegalArgumentException("Number of runs must be positive")
+    }
+    this.runs = runs
+    this
+  }
+
+  /**
+   * Set the number of steps for the k-means|| initialization mode. This is an advanced
+   * setting -- the default of 5 is almost always enough. Default: 5.
+   *
+   * @throws IllegalArgumentException if `initializationSteps` is not positive
+   */
+  def setInitializationSteps(initializationSteps: Int): KMeans = {
+    if (initializationSteps <= 0) {
+      throw new IllegalArgumentException("Number of initialization steps must be positive")
+    }
+    this.initializationSteps = initializationSteps
+    this
+  }
+
+  /**
+   * Set the distance threshold within which we consider centers to have converged.
+   * If all centers of a run move less than this Euclidean distance, we stop iterating that run.
+   * Default: 1e-4.
+   */
+  def setEpsilon(epsilon: Double): KMeans = {
+    this.epsilon = epsilon
+    this
+  }
+
+  /**
+   * Train a K-means model on the given set of points; `data` should be cached for high
+   * performance, because this is an iterative algorithm.
+   *
+   * All runs are advanced together, one pass over `data` per iteration, until every run has
+   * converged or `maxIterations` is reached. The centers of the run with the lowest recorded
+   * cost are returned.
+   */
+  def train(data: RDD[Array[Double]]): KMeansModel = {
+    // TODO: check whether data is persistent; this needs RDD.storageLevel to be publicly readable
+
+    val sc = data.sparkContext
+
+    val centers = if (initializationMode == KMeans.RANDOM) {
+      initRandom(data)
+    } else {
+      initKMeansParallel(data)
+    }
+
+    // Per-run state, indexed by the original run number
+    val active = Array.fill(runs)(true)
+    val costs = Array.fill(runs)(0.0)
+
+    // Original run numbers of the runs that have not converged yet
+    var activeRuns = new ArrayBuffer[Int] ++ (0 until runs)
+    var iteration = 0
+
+    // Execute iterations of Lloyd's algorithm until all runs have converged
+    while (iteration < maxIterations && !activeRuns.isEmpty) {
+      type WeightedPoint = (DoubleMatrix, Long)
+      // Combine two partial (vector sum, point count) pairs; the sum is accumulated in place
+      // into p1's matrix
+      def mergeContribs(p1: WeightedPoint, p2: WeightedPoint): WeightedPoint = {
+        (p1._1.addi(p2._1), p1._2 + p2._2)
+      }
+
+      val activeCenters = activeRuns.map(r => centers(r)).toArray
+      // One cost accumulator per active run, in the same order as activeRuns
+      val costAccums = activeRuns.map(_ => sc.accumulator(0.0))
+
+      // Find the sum and count of points mapping to each center
+      val totalContribs = data.mapPartitions { points =>
+        // These locals deliberately shadow the class fields: here `runs` counts only the
+        // still-active runs, and `k` is read from the current centers
+        val runs = activeCenters.length
+        val k = activeCenters(0).length
+        val dims = activeCenters(0)(0).length
+
+        val sums = Array.fill(runs, k)(new DoubleMatrix(dims))
+        val counts = Array.fill(runs, k)(0L)
+
+        for (point <- points; (centers, runIndex) <- activeCenters.zipWithIndex) {
+          val (bestCenter, cost) = KMeans.findClosest(centers, point)
+          costAccums(runIndex) += cost
+          sums(runIndex)(bestCenter).addi(new DoubleMatrix(point))
+          counts(runIndex)(bestCenter) += 1
+        }
+
+        // Emit one ((active run position, center index), (sum, count)) record per center
+        val contribs = for (i <- 0 until runs; j <- 0 until k) yield {
+          ((i, j), (sums(i)(j), counts(i)(j)))
+        }
+        contribs.iterator
+      }.reduceByKey(mergeContribs).collectAsMap()
+
+      // Update the cluster centers and costs for each active run. `run` is the original run
+      // number, while `i` is its position among the active runs (the key used in totalContribs)
+      for ((run, i) <- activeRuns.zipWithIndex) {
+        var changed = false
+        for (j <- 0 until k) {
+          val (sum, count) = totalContribs((i, j))
+          // A center that attracted no points is left where it is
+          if (count != 0) {
+            val newCenter = sum.divi(count).data
+            if (MLUtils.squaredDistance(newCenter, centers(run)(j)) > epsilon * epsilon) {
+              changed = true
+            }
+            centers(run)(j) = newCenter
+          }
+        }
+        if (!changed) {
+          active(run) = false
+          logInfo("Run " + run + " finished in " + (iteration + 1) + " iterations")
+        }
+        // The accumulated cost was measured against the centers used during this pass, i.e.
+        // before the update above
+        costs(run) = costAccums(i).value
+      }
+
+      activeRuns = activeRuns.filter(active(_))
+      iteration += 1
+    }
+
+    // Tuples compare by cost first, so this picks the index of the cheapest run
+    val bestRun = costs.zipWithIndex.min._2
+    new KMeansModel(centers(bestRun))
+  }
+
+  /**
+   * Initialize `runs` sets of cluster centers at random.
+   */
+  private def initRandom(data: RDD[Array[Double]]): Array[ClusterCenters] = {
+    // Sample all the cluster centers in one pass to avoid repeated scans
+    val sample = data.takeSample(true, runs * k, new Random().nextInt())
+    // Run r receives the r-th consecutive group of k sampled points
+    Array.tabulate(runs)(r => sample.slice(r * k, (r + 1) * k))
+  }
+
+  /**
+   * Initialize `runs` sets of cluster centers using the k-means|| algorithm by Bahmani et al.
+   * (Bahmani et al., Scalable K-Means++, VLDB 2012). This is a variant of k-means++ that tries
+   * to find dissimilar cluster centers by starting with a random center and then doing
+   * passes where more centers are chosen with probability proportional to their squared distance
+   * to the current cluster set. It results in a provable approximation to an optimal clustering.
+   *
+   * The original paper can be found at http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf.
+   */
+  private def initKMeansParallel(data: RDD[Array[Double]]): Array[ClusterCenters] = {
+    // Initialize each run's center to a random point
+    val seed = new Random().nextInt()
+    val sample = data.takeSample(true, runs, seed)
+    val centers = Array.tabulate(runs)(r => ArrayBuffer(sample(r)))
+
+    // On each step, sample 2 * k points on average for each run with probability proportional
+    // to their squared distance from that run's current centers
+    for (step <- 0 until initializationSteps) {
+      val centerArrays = centers.map(_.toArray)
+      // Total cost of the dataset against each run's current candidate centers
+      val sumCosts = data.flatMap { point =>
+        for (r <- 0 until runs) yield (r, KMeans.pointCost(centerArrays(r), point))
+      }.reduceByKey(_ + _).collectAsMap()
+      val chosen = data.mapPartitionsWithIndex { (index, points) =>
+        // Mix the step and partition index into the seed so each gets its own random stream
+        val rand = new Random(seed ^ (step << 16) ^ index)
+        for {
+          p <- points
+          r <- 0 until runs
+          if rand.nextDouble() < KMeans.pointCost(centerArrays(r), p) * 2 * k / sumCosts(r)
+        } yield (r, p)
+      }.collect()
+      for ((r, p) <- chosen) {
+        centers(r) += p
+      }
+    }
+
+    // Finally, we might have a set of more than k candidate centers for each run; weigh each
+    // candidate by the number of points in the dataset mapping to it and run a local k-means++
+    // on the weighted centers to pick just k of them
+    val centerArrays = centers.map(_.toArray)
+    val weightMap = data.flatMap { p =>
+      for (r <- 0 until runs) yield ((r, KMeans.findClosest(centerArrays(r), p)._1), 1.0)
+    }.reduceByKey(_ + _).collectAsMap()
+    val finalCenters = (0 until runs).map { r =>
+      val myCenters = centers(r).toArray
+      // Candidates that no point maps to are absent from weightMap and get weight 0
+      val myWeights = (0 until myCenters.length).map(i => weightMap.getOrElse((r, i), 0.0)).toArray
+      // NOTE(review): the trailing 30 is presumably an iteration limit for the local
+      // algorithm -- confirm against LocalKMeans.kMeansPlusPlus
+      LocalKMeans.kMeansPlusPlus(r, myCenters, myWeights, k, 30)
+    }
+
+    finalCenters.toArray
+  }
+}
+
+
+/**
+ * Top-level methods for calling K-means clustering, plus the nearest-center helpers shared
+ * within the mllib package and a small command-line entry point.
+ */
+object KMeans {
+  // Initialization mode names
+  val RANDOM = "random"
+  val K_MEANS_PARALLEL = "k-means||"
+
+  /**
+   * Train a K-means model with the given number of clusters, iteration limit, number of
+   * parallel runs and initialization mode (`RANDOM` or `K_MEANS_PARALLEL`). All other
+   * parameters keep the defaults of a freshly constructed `KMeans`.
+   */
+  def train(
+      data: RDD[Array[Double]],
+      k: Int,
+      maxIterations: Int,
+      runs: Int,
+      initializationMode: String)
+    : KMeansModel =
+  {
+    new KMeans().setK(k)
+                .setMaxIterations(maxIterations)
+                .setRuns(runs)
+                .setInitializationMode(initializationMode)
+                .train(data)
+  }
+
+  /** Train a K-means model using the k-means|| initialization mode. */
+  def train(data: RDD[Array[Double]], k: Int, maxIterations: Int, runs: Int): KMeansModel = {
+    train(data, k, maxIterations, runs, K_MEANS_PARALLEL)
+  }
+
+  /** Train a K-means model using a single run and the k-means|| initialization mode. */
+  def train(data: RDD[Array[Double]], k: Int, maxIterations: Int): KMeansModel = {
+    train(data, k, maxIterations, 1, K_MEANS_PARALLEL)
+  }
+
+  /**
+   * Return the index of the closest point in `centers` to `point`, as well as the squared
+   * distance to it (as computed by `MLUtils.squaredDistance`).
+   *
+   * If `centers` is empty the result is `(0, Double.PositiveInfinity)`.
+   */
+  private[mllib] def findClosest(centers: Array[Array[Double]], point: Array[Double])
+    : (Int, Double) =
+  {
+    var bestDistance = Double.PositiveInfinity
+    var bestIndex = 0
+    for (i <- 0 until centers.length) {
+      val distance = MLUtils.squaredDistance(point, centers(i))
+      // Strict comparison: on ties the earliest center wins
+      if (distance < bestDistance) {
+        bestDistance = distance
+        bestIndex = i
+      }
+    }
+    (bestIndex, bestDistance)
+  }
+
+  /**
+   * Return the K-means cost of a given point against the given cluster centers, i.e. the
+   * squared distance from the point to its closest center.
+   */
+  private[mllib] def pointCost(centers: Array[Array[Double]], point: Array[Double]): Double = {
+    // Same scan as findClosest; reuse it instead of keeping a second copy of the loop
+    findClosest(centers, point)._2
+  }
+
+  /**
+   * Command-line entry point: reads space-separated numeric points (one per line) from the
+   * input file, trains a model and prints the cluster centers and the resulting cost.
+   *
+   * Usage: KMeans <master> <input_file> <k> <max_iterations>
+   */
+  def main(args: Array[String]): Unit = {
+    if (args.length != 4) {
+      println("Usage: KMeans <master> <input_file> <k> <max_iterations>")
+      System.exit(1)
+    }
+    val (master, inputFile, k, iters) = (args(0), args(1), args(2).toInt, args(3).toInt)
+    val sc = new SparkContext(master, "KMeans")
+    // Training and the cost computation make repeated passes over the data, so cache the parsed
+    // points rather than re-reading and re-parsing the text file on every pass
+    val data = sc.textFile(inputFile).map(line => line.split(' ').map(_.toDouble)).cache()
+    val model = KMeans.train(data, k, iters)
+    val cost = model.computeCost(data)
+    println("Cluster centers:")
+    for (c <- model.clusterCenters) {
+      println(" " + c.mkString(" "))
+    }
+    println("Cost: " + cost)
+    System.exit(0)
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/spark/mllib/clustering/KMeansModel.scala
new file mode 100644
index 0000000000..b8f80e80cd
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/clustering/KMeansModel.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.clustering
+
+import spark.RDD
+import spark.SparkContext._
+import spark.mllib.util.MLUtils
+
+
+/**
+ * Model produced by K-means clustering: a point is assigned to whichever center is closest.
+ *
+ * @param clusterCenters one coordinate array per cluster
+ */
+class KMeansModel(val clusterCenters: Array[Array[Double]]) extends Serializable {
+  /** Number of clusters, i.e. the number of entries in `clusterCenters`. */
+  def k: Int = clusterCenters.length
+
+  /** Index into `clusterCenters` of the center closest to `point`. */
+  def predict(point: Array[Double]): Int = {
+    val (closestIndex, _) = KMeans.findClosest(clusterCenters, point)
+    closestIndex
+  }
+
+  /**
+   * K-means cost of this model on `data`: the per-point costs from KMeans.pointCost, summed
+   * over the whole RDD.
+   */
+  def computeCost(data: RDD[Array[Double]]): Double = {
+    val perPointCost = data.map(p => KMeans.pointCost(clusterCenters, p))
+    perPointCost.sum
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/clustering/LocalKMeans.scala b/mllib/src/main/scala/spark/mllib/clustering/LocalKMeans.scala
new file mode 100644
index 0000000000..89fe7d7e85
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/clustering/LocalKMeans.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.clustering
+
+import scala.util.Random
+
+import org.jblas.{DoubleMatrix, SimpleBlas}
+
+/**
+ * A utility object to run K-means locally. This is private to the ML package because it's used
+ * in the initialization of KMeans but not meant to be publicly exposed.
+ */
+private[mllib] object LocalKMeans {
+  /**
+   * Run K-means++ on the weighted point set `points`. This first does the K-means++
+   * initialization procedure and then rounds of Lloyd's algorithm.
+   *
+   * @param seed seed of the random generator used for every sampling step
+   * @param points points to cluster; must be non-empty because points(0) fixes the dimension
+   * @param weights weight of each point, indexed like `points`
+   * @param k number of centers to produce
+   * @param maxIterations upper bound on the number of Lloyd iterations
+   * @return an array of `k` centers
+   */
+  def kMeansPlusPlus(
+      seed: Int,
+      points: Array[Array[Double]],
+      weights: Array[Double],
+      k: Int,
+      maxIterations: Int)
+    : Array[Array[Double]] =
+  {
+    val rand = new Random(seed)
+    val dimensions = points(0).length
+    val centers = new Array[Array[Double]](k)
+
+    // Initialize centers by sampling using the k-means++ procedure
+    centers(0) = pickWeighted(rand, points, weights)
+    for (i <- 1 until k) {
+      // Pick the next center with a probability proportional to cost under current centers.
+      // The weighted costs are computed once and reused for the total and for the scan.
+      val curCenters = centers.slice(0, i)
+      val costs = points.zip(weights).map { case (p, w) =>
+        w * KMeans.pointCost(curCenters, p)
+      }
+      val r = rand.nextDouble() * costs.sum
+      var cumulativeScore = 0.0
+      var j = 0
+      while (j < points.length && cumulativeScore < r) {
+        cumulativeScore += costs(j)
+        j += 1
+      }
+      // j is still 0 when r == 0.0 (every point already coincides with a chosen center, or
+      // nextDouble() returned 0.0). Fall back to the first point rather than reading
+      // points(-1).
+      centers(i) = points(math.max(j - 1, 0))
+    }
+
+    // Run up to maxIterations iterations of Lloyd's algorithm
+    val oldClosest = Array.fill(points.length)(-1)
+    var iteration = 0
+    var moved = true
+    while (moved && iteration < maxIterations) {
+      moved = false
+      val sums = Array.fill(k)(new DoubleMatrix(dimensions))
+      val counts = Array.fill(k)(0.0)
+      for ((p, i) <- points.zipWithIndex) {
+        val index = KMeans.findClosest(centers, p)._1
+        // sums(index) += weights(i) * p
+        SimpleBlas.axpy(weights(i), new DoubleMatrix(p), sums(index))
+        counts(index) += weights(i)
+        if (index != oldClosest(i)) {
+          moved = true
+          oldClosest(i) = index
+        }
+      }
+      // Update centers
+      for (i <- 0 until k) {
+        if (counts(i) == 0.0) {
+          // Assign center to a random point
+          centers(i) = points(rand.nextInt(points.length))
+        } else {
+          centers(i) = sums(i).divi(counts(i)).data
+        }
+      }
+      iteration += 1
+    }
+
+    centers
+  }
+
+  /**
+   * Pick one element of `data`, walking the cumulative weights until they reach a random
+   * threshold drawn in [0, weights.sum).
+   */
+  private def pickWeighted[T](rand: Random, data: Array[T], weights: Array[Double]): T = {
+    val r = rand.nextDouble() * weights.sum
+    var i = 0
+    var curWeight = 0.0
+    while (i < data.length && curWeight < r) {
+      curWeight += weights(i)
+      i += 1
+    }
+    // A threshold of exactly 0.0 skips the loop entirely; clamp so data(-1) is never read.
+    data(math.max(i - 1, 0))
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
new file mode 100644
index 0000000000..2fb0c8136f
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.optimization
+
+import org.jblas.DoubleMatrix
+
+/**
+ * Base class for per-example gradient computations; LogisticGradient is one implementation.
+ */
+abstract class Gradient extends Serializable {
+ /**
+ * Compute the gradient for a given row of data.
+ *
+ * @param data - One row of data. Row matrix of size 1xn where n is the number of features.
+ * NOTE(review): GradientDescent.runMiniBatchSGD builds this argument as an n x 1 matrix;
+ * confirm which shape is intended.
+ * @param label - Label for this data item.
+ * @param weights - Column matrix containing weights for every feature.
+ *
+ * @return a pair: the gradient for this data item, and its loss.
+ */
+ def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
+ (DoubleMatrix, Double)
+}
+
+/**
+ * Gradient and loss of the logistic (cross-entropy) loss for one example.
+ * The formulas treat `label` as 0 or 1.
+ */
+class LogisticGradient extends Gradient {
+  /**
+   * @param data feature vector of one example
+   * @param label label of the example (0 or 1)
+   * @param weights current weight vector
+   * @return the gradient with respect to the weights, paired with the loss of this example
+   */
+  override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
+      (DoubleMatrix, Double) = {
+    // margin = -(w . x)
+    val margin: Double = -1.0 * data.dot(weights)
+    // sigmoid(w . x) - label
+    val gradientMultiplier = (1.0 / (1.0 + math.exp(margin))) - label
+
+    val gradient = data.mul(gradientMultiplier)
+    // log(1 + exp(w . x)), arranged so that exp() only ever sees a non-positive argument.
+    val logOnePlusExp =
+      if (margin > 0) {
+        math.log(1 + math.exp(0 - margin))
+      } else {
+        math.log(1 + math.exp(margin)) - margin
+      }
+    // Cross-entropy for y in {0, 1} is log(1 + exp(w . x)) - y * (w . x). Without the label
+    // term the value is the loss for label 0 regardless of the actual label.
+    val loss = logOnePlusExp + label * margin
+
+    (gradient, loss)
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
new file mode 100644
index 0000000000..e1b73bc25e
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.optimization
+
+import spark.{Logging, RDD, SparkContext}
+import spark.SparkContext._
+
+import org.jblas.DoubleMatrix
+
+import scala.collection.mutable.ArrayBuffer
+
+
+/** Mini-batch stochastic gradient descent over an RDD of labelled examples. */
+object GradientDescent {
+
+  /**
+   * Run gradient descent in parallel using mini batches.
+   * Based on Matlab code written by John Duchi.
+   *
+   * Weights start as a vector of ones sized from the first example. Iteration i (1-based)
+   * draws `data.sample(false, miniBatchFraction, 42 + i)`, sums the per-example gradients and
+   * losses, divides both by the nominal batch size (count * miniBatchFraction) and passes the
+   * scaled gradient to `updater`.
+   *
+   * @param data - Input data for SGD. RDD of form (label, [feature values]).
+   * @param gradient - Gradient object that will be used to compute the gradient.
+   * @param updater - Updater object that will be used to update the model.
+   * @param stepSize - stepSize to be used during update.
+   * @param numIters - number of iterations that SGD should be run.
+   * @param miniBatchFraction - fraction of the input data set that should be used for
+   *                            one iteration of SGD. Default value 1.0.
+   *
+   * @return a pair (weights, lossHistory): the final weights, and the loss recorded at every
+   *         iteration.
+   */
+  def runMiniBatchSGD(
+      data: RDD[(Double, Array[Double])],
+      gradient: Gradient,
+      updater: Updater,
+      stepSize: Double,
+      numIters: Int,
+      miniBatchFraction: Double=1.0) : (DoubleMatrix, Array[Double]) = {
+
+    val losses = new ArrayBuffer[Double](numIters)
+
+    val numFeatures: Int = data.take(1)(0)._2.length
+    val numExamples: Long = data.count()
+    val batchSize = numExamples * miniBatchFraction
+
+    // Weights are a column vector of ones; the regularization value starts at zero.
+    var weights = DoubleMatrix.ones(numFeatures)
+    var regVal = 0.0
+
+    for (iter <- 1 to numIters) {
+      val currentWeights = weights
+      val batch = data.sample(false, miniBatchFraction, 42 + iter)
+      val perExample = batch.map { case (label, features) =>
+        val featureVec = new DoubleMatrix(features.length, 1, features:_*)
+        gradient.compute(featureVec, label, currentWeights)
+      }
+      // addi accumulates in place into the left operand.
+      val (gradSum, lossSum) = perExample.reduce { (left, right) =>
+        (left._1.addi(right._1), left._2 + right._2)
+      }
+
+      losses += lossSum / batchSize + regVal
+      val (nextWeights, nextRegVal) =
+        updater.compute(weights, gradSum.div(batchSize), stepSize, iter)
+      weights = nextWeights
+      regVal = nextRegVal
+    }
+
+    (weights, losses.toArray)
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/spark/mllib/optimization/Updater.scala
new file mode 100644
index 0000000000..b864fd4634
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/optimization/Updater.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.optimization
+
+import org.jblas.DoubleMatrix
+
+/**
+ * Base class for weight-update rules; SimpleUpdater is one implementation.
+ */
+abstract class Updater extends Serializable {
+ /**
+ * Compute an updated value for weights given the gradient, stepSize and iteration number.
+ *
+ * @param weightsOlds - Column matrix of size nx1 where n is the number of features.
+ * NOTE(review): spelled `weightsOlds` here but `weightsOld` in SimpleUpdater; presumably a
+ * typo, left as-is because renaming would affect callers using named arguments.
+ * @param gradient - Column matrix of size nx1 where n is the number of features.
+ * @param stepSize - step size across iterations
+ * @param iter - Iteration number
+ *
+ * @return a pair (weightsNew, reg_val): weightsNew is a column matrix containing the updated
+ * weights and reg_val is the regularization value.
+ */
+ def compute(weightsOlds: DoubleMatrix, gradient: DoubleMatrix, stepSize: Double, iter: Int):
+ (DoubleMatrix, Double)
+}
+
+/**
+ * Plain gradient step whose step size decays as stepSize / sqrt(iter); the regularization
+ * value it reports is always zero.
+ */
+class SimpleUpdater extends Updater {
+  override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
+      stepSize: Double, iter: Int): (DoubleMatrix, Double) = {
+    val effectiveStep = stepSize / math.sqrt(iter)
+    val steppedWeights = weightsOld.sub(gradient.mul(effectiveStep))
+    (steppedWeights, 0.0)
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/spark/mllib/recommendation/ALS.scala
new file mode 100644
index 0000000000..7da96397a6
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/recommendation/ALS.scala
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.recommendation
+
+import scala.collection.mutable.{ArrayBuffer, BitSet}
+import scala.util.Random
+import scala.util.Sorting
+
+import spark.{HashPartitioner, Partitioner, SparkContext, RDD}
+import spark.storage.StorageLevel
+import spark.KryoRegistrator
+import spark.SparkContext._
+
+import com.esotericsoftware.kryo.Kryo
+import org.jblas.{DoubleMatrix, SimpleBlas, Solve}
+
+
+/**
+ * Out-link information for a user or product block. This includes the original user/product IDs
+ * of the elements within this block, and the list of destination blocks that each user or
+ * product will need to send its feature vector to.
+ *
+ * @param elementIds the distinct IDs in this block, in ascending order (as built by
+ * makeOutLinkBlock)
+ * @param shouldSend shouldSend(i)(b) is set when element i has at least one rating whose
+ * other-side ID maps to block b
+ */
+private[recommendation] case class OutLinkBlock(
+ elementIds: Array[Int], shouldSend: Array[BitSet])
+
+
+/**
+ * In-link information for a user (or product) block. This includes the original user/product IDs
+ * of the elements within this block, as well as an array of indices and ratings that specify
+ * which user in the block will be rated by which products from each product block (or vice-versa).
+ * Specifically, if this InLinkBlock is for users, ratingsForBlock(b)(i) will contain two arrays,
+ * indices and ratings, for the i'th product that will be sent to us by product block b (call this
+ * P). These arrays represent the users that product P had ratings for (by their index in this
+ * block), as well as the corresponding rating for each one. We can thus use this information when
+ * we get product block b's message to update the corresponding users.
+ *
+ * @param elementIds the distinct IDs in this block, in ascending order (as built by
+ * makeInLinkBlock); the indices stored in ratingsForBlock are positions in this array
+ * @param ratingsForBlock one entry per source block; within a block, entries are ordered by
+ * ascending ID of the sending element
+ */
+private[recommendation] case class InLinkBlock(
+ elementIds: Array[Int], ratingsForBlock: Array[Array[(Array[Int], Array[Double])]])
+
+
+/**
+ * A more compact class to represent a rating than Tuple3[Int, Int, Double].
+ *
+ * Note that ALS.train also builds instances with the roles swapped (Rating(p, u, r)) when it
+ * groups ratings by product block, so `user` then holds a product ID and `product` a user ID.
+ */
+private[recommendation] case class Rating(user: Int, product: Int, rating: Double)
+
+
+/**
+ * Alternating Least Squares matrix factorization.
+ *
+ * This is a blocked implementation of the ALS factorization algorithm that groups the two sets
+ * of factors (referred to as "users" and "products") into blocks and reduces communication by only
+ * sending one copy of each user vector to each product block on each iteration, and only for the
+ * product blocks that need that user's feature vector. This is achieved by precomputing some
+ * information about the ratings matrix to determine the "out-links" of each user (which blocks of
+ * products it will contribute to) and "in-link" information for each product (which of the feature
+ * vectors it receives from each user block it will depend on). This allows us to send only an
+ * array of feature vectors between each user block and product block, and have the product block
+ * find the users' ratings and update the products based on these messages.
+ */
+class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var lambda: Double)
+ extends Serializable
+{
+ /** Create an ALS instance with numBlocks = -1, rank = 10, iterations = 10, lambda = 0.01. */
+ def this() = this(-1, 10, 10, 0.01)
+
+ /**
+ * Set the number of blocks to parallelize the computation into; pass -1 for an auto-configured
+ * number of blocks. Default: -1.
+ *
+ * @return this ALS instance, so that setter calls can be chained.
+ */
+ def setBlocks(numBlocks: Int): ALS = {
+ this.numBlocks = numBlocks
+ this
+ }
+
+ /**
+ * Set the rank of the feature matrices computed (number of features). Default: 10.
+ *
+ * @return this ALS instance, so that setter calls can be chained.
+ */
+ def setRank(rank: Int): ALS = {
+ this.rank = rank
+ this
+ }
+
+ /**
+ * Set the number of iterations to run. Default: 10.
+ *
+ * @return this ALS instance, so that setter calls can be chained.
+ */
+ def setIterations(iterations: Int): ALS = {
+ this.iterations = iterations
+ this
+ }
+
+ /**
+ * Set the regularization parameter, lambda. Default: 0.01.
+ *
+ * @return this ALS instance, so that setter calls can be chained.
+ */
+ def setLambda(lambda: Double): ALS = {
+ this.lambda = lambda
+ this
+ }
+
+  /**
+   * Run ALS with the configured parameters on an input RDD of (user, product, rating) triples.
+   * Returns a MatrixFactorizationModel with feature vectors for each user and product.
+   */
+  def train(ratings: RDD[(Int, Int, Double)]): MatrixFactorizationModel = {
+    // -1 means "auto": derive the block count from the context and the input partitioning.
+    val numBlocks =
+      if (this.numBlocks != -1) this.numBlocks
+      else math.max(ratings.context.defaultParallelism, ratings.partitions.size / 2)
+
+    val partitioner = new HashPartitioner(numBlocks)
+
+    // Key every rating by block twice: by user, and by product with the two IDs swapped.
+    val byUserBlock = ratings.map { case (user, product, rating) =>
+      (user % numBlocks, Rating(user, product, rating))
+    }
+    val byProductBlock = ratings.map { case (user, product, rating) =>
+      (product % numBlocks, Rating(product, user, rating))
+    }
+
+    val (userInLinks, userOutLinks) = makeLinkRDDs(numBlocks, byUserBlock)
+    val (productInLinks, productOutLinks) = makeLinkRDDs(numBlocks, byProductBlock)
+
+    // Random initial factors, seeded per element so each ID gets its own stream.
+    val seed = new Random().nextInt()
+    var users = userOutLinks.mapValues { block =>
+      block.elementIds.map(u => randomFactor(rank, seed ^ u))
+    }
+    var products = productOutLinks.mapValues { block =>
+      block.elementIds.map(p => randomFactor(rank, seed ^ ~p))
+    }
+
+    // Alternate: recompute products from users, then users from the new products.
+    var iter = 0
+    while (iter < iterations) {
+      products = updateFeatures(users, userOutLinks, productInLinks, partitioner, rank, lambda)
+      users = updateFeatures(products, productOutLinks, userInLinks, partitioner, rank, lambda)
+      iter += 1
+    }
+
+    // Turn per-block factor arrays back into (elementId, factor) pairs.
+    def unblock(
+        factorsByBlock: RDD[(Int, Array[Array[Double]])],
+        outLinks: RDD[(Int, OutLinkBlock)]): RDD[(Int, Array[Double])] = {
+      factorsByBlock.join(outLinks).flatMap { case (_, (factors, links)) =>
+        factors.indices.map(i => (links.elementIds(i), factors(i)))
+      }
+    }
+
+    val usersOut = unblock(users, userOutLinks)
+    val productsOut = unblock(products, productOutLinks)
+
+    usersOut.persist()
+    productsOut.persist()
+
+    new MatrixFactorizationModel(rank, usersOut, productsOut)
+  }
+
+  /**
+   * Make the out-links table for a block of the users (or products) dataset given the list of
+   * (user, product, rating) values for the users in that block (or the opposite for products).
+   *
+   * @param numBlocks number of destination blocks
+   * @param ratings all ratings whose `user` field belongs to this block
+   * @return the sorted distinct `user` IDs plus, for each of them, the set of destination
+   *         blocks its ratings touch
+   */
+  private def makeOutLinkBlock(numBlocks: Int, ratings: Array[Rating]): OutLinkBlock = {
+    val userIds = ratings.map(_.user).distinct.sorted
+    val numUsers = userIds.length
+    val userIdToPos = userIds.zipWithIndex.toMap
+    val shouldSend = Array.fill(numUsers)(new BitSet(numBlocks))
+    for (r <- ratings) {
+      // `%` yields a negative result for negative IDs, which is not a valid bit index; fold
+      // it back into [0, numBlocks). Presumably this matches the block the ID is hashed to by
+      // HashPartitioner - verify against its getPartition.
+      val destBlock = ((r.product % numBlocks) + numBlocks) % numBlocks
+      shouldSend(userIdToPos(r.user))(destBlock) = true
+    }
+    OutLinkBlock(userIds, shouldSend)
+  }
+
+  /**
+   * Make the in-links table for a block of the users (or products) dataset given a list of
+   * (user, product, rating) values for the users in that block (or the opposite for products).
+   *
+   * @param numBlocks number of source blocks
+   * @param ratings all ratings whose `user` field belongs to this block
+   * @return the sorted distinct `user` IDs plus, per source block and per `product` (ascending
+   *         by ID), the positions of the rated users and the matching ratings
+   */
+  private def makeInLinkBlock(numBlocks: Int, ratings: Array[Rating]): InLinkBlock = {
+    val userIds = ratings.map(_.user).distinct.sorted
+    val userIdToPos = userIds.zipWithIndex.toMap
+    // Split out our ratings by product block
+    val blockRatings = Array.fill(numBlocks)(new ArrayBuffer[Rating])
+    for (r <- ratings) {
+      // `%` yields a negative result for negative IDs, which is not a valid array index; fold
+      // it back into [0, numBlocks).
+      val productBlock = ((r.product % numBlocks) + numBlocks) % numBlocks
+      blockRatings(productBlock) += r
+    }
+    // Compare explicitly: a subtraction-based comparator overflows when the two IDs are far
+    // apart (for example a large positive and a large negative ID).
+    val ordering = new Ordering[(Int, ArrayBuffer[Rating])] {
+      def compare(a: (Int, ArrayBuffer[Rating]), b: (Int, ArrayBuffer[Rating])): Int =
+        if (a._1 < b._1) -1 else if (a._1 > b._1) 1 else 0
+    }
+    val ratingsForBlock = new Array[Array[(Array[Int], Array[Double])]](numBlocks)
+    for (productBlock <- 0 until numBlocks) {
+      // Create an array of (product, Seq(Rating)) ratings
+      val groupedRatings = blockRatings(productBlock).groupBy(_.product).toArray
+      // Sort them by product ID
+      Sorting.quickSort(groupedRatings)(ordering)
+      // Translate the user IDs to indices based on userIdToPos
+      ratingsForBlock(productBlock) = groupedRatings.map { case (p, rs) =>
+        (rs.view.map(r => userIdToPos(r.user)).toArray, rs.view.map(_.rating).toArray)
+      }
+    }
+    InLinkBlock(userIds, ratingsForBlock)
+  }
+
+  /**
+   * Build the InLinkBlock and OutLinkBlock RDDs from an RDD of (blockId, Rating) pairs.
+   * Both are derived from one partitioned pass over the ratings, so the input is shuffled
+   * only once and does not need to be cached itself.
+   */
+  private def makeLinkRDDs(numBlocks: Int, ratings: RDD[(Int, Rating)])
+    : (RDD[(Int, InLinkBlock)], RDD[(Int, OutLinkBlock)]) =
+  {
+    val partitioned = ratings.partitionBy(new HashPartitioner(numBlocks))
+    // Each partition becomes a single record keyed by the partition index.
+    val links = partitioned.mapPartitionsWithIndex({ (blockId, elements) =>
+      val blockRatings = elements.map(_._2).toArray
+      val inLinks = makeInLinkBlock(numBlocks, blockRatings)
+      val outLinks = makeOutLinkBlock(numBlocks, blockRatings)
+      Iterator.single((blockId, (inLinks, outLinks)))
+    }, true)
+    links.persist(StorageLevel.MEMORY_AND_DISK)
+
+    val inLinkRDD = links.mapValues(_._1)
+    val outLinkRDD = links.mapValues(_._2)
+    (inLinkRDD, outLinkRDD)
+  }
+
+  /**
+   * Build a factor vector of `rank` values drawn from a generator seeded with `seed`.
+   * TODO: Initialize things using mapPartitionsWithIndex to make it faster?
+   */
+  private def randomFactor(rank: Int, seed: Int): Array[Double] = {
+    val generator = new Random(seed)
+    Iterator.continually(generator.nextDouble()).take(rank).toArray
+  }
+
+  /**
+   * Recompute one side's feature vectors from the other side's current ones. With the
+   * parameter names used here: each product block is joined with its out-links to build one
+   * message per destination user block (the factors of the products that block needs), the
+   * messages are grouped by destination, joined with the user in-links and handed to
+   * updateBlock. The result holds the new feature vectors of every user block.
+   */
+  private def updateFeatures(
+      products: RDD[(Int, Array[Array[Double]])],
+      productOutLinks: RDD[(Int, OutLinkBlock)],
+      userInLinks: RDD[(Int, InLinkBlock)],
+      partitioner: Partitioner,
+      rank: Int,
+      lambda: Double)
+    : RDD[(Int, Array[Array[Double]])] =
+  {
+    val blockCount = products.partitions.size
+    val messages = productOutLinks.join(products).flatMap { case (srcBlock, (links, vectors)) =>
+      // One outgoing buffer per destination block; empty buffers are sent as well.
+      val perDestination = Array.fill(blockCount)(new ArrayBuffer[Array[Double]])
+      var p = 0
+      while (p < links.elementIds.length) {
+        var dest = 0
+        while (dest < blockCount) {
+          if (links.shouldSend(p)(dest)) {
+            perDestination(dest) += vectors(p)
+          }
+          dest += 1
+        }
+        p += 1
+      }
+      perDestination.zipWithIndex.map { case (buf, dest) => (dest, (srcBlock, buf.toArray)) }
+    }
+    val received = messages.groupByKey(partitioner).join(userInLinks)
+    received.mapValues { case (blockMessages, inLinkBlock) =>
+      updateBlock(blockMessages, inLinkBlock, rank, lambda)
+    }
+  }
+
+ /**
+ * Compute the new feature vectors for a block of the users matrix given the list of factors
+ * it received from each product and its InLinkBlock.
+ *
+ * @param messages (source block ID, factors sent by that block) pairs; they are sorted by
+ * block ID and then addressed by position
+ * @param inLinkBlock in-link information of the block being updated
+ * @param rank length of each feature vector
+ * @param lambda value added to the diagonal of each XtX matrix before solving
+ * @return one new feature vector per entry of inLinkBlock.elementIds, in the same order
+ */
+ def updateBlock(messages: Seq[(Int, Array[Array[Double]])], inLinkBlock: InLinkBlock,
+ rank: Int, lambda: Double)
+ : Array[Array[Double]] =
+ {
+ // Sort the incoming block factor messages by block ID and make them an array
+ val blockFactors = messages.sortBy(_._1).map(_._2).toArray // Array[Array[Array[Double]]]
+ val numBlocks = blockFactors.length
+ val numUsers = inLinkBlock.elementIds.length
+
+ // We'll sum up the XtXes using vectors that represent only the lower-triangular part, since
+ // the matrices are symmetric
+ val triangleSize = rank * (rank + 1) / 2
+ val userXtX = Array.fill(numUsers)(DoubleMatrix.zeros(triangleSize))
+ val userXy = Array.fill(numUsers)(DoubleMatrix.zeros(rank))
+
+ // Some temp variables to avoid memory allocation
+ val tempXtX = DoubleMatrix.zeros(triangleSize)
+ val fullXtX = DoubleMatrix.zeros(rank, rank)
+
+ // Compute the XtX and Xy values for each user by adding products it rated in each product block
+ for (productBlock <- 0 until numBlocks) {
+ for (p <- 0 until blockFactors(productBlock).length) {
+ val x = new DoubleMatrix(blockFactors(productBlock)(p))
+ // tempXtX is overwritten for every product, then added to each user that rated it
+ fillXtX(x, tempXtX)
+ val (us, rs) = inLinkBlock.ratingsForBlock(productBlock)(p)
+ for (i <- 0 until us.length) {
+ userXtX(us(i)).addi(tempXtX)
+ // userXy(us(i)) += rs(i) * x
+ SimpleBlas.axpy(rs(i), x, userXy(us(i)))
+ }
+ }
+ }
+
+ // Solve the least-squares problem for each user and return the new feature vectors
+ userXtX.zipWithIndex.map{ case (triangularXtX, index) =>
+ // Compute the full XtX matrix from the lower-triangular part we got above
+ // (fullXtX is shared across users; fillFullMatrix rewrites every entry each time)
+ fillFullMatrix(triangularXtX, fullXtX)
+ // Add regularization
+ (0 until rank).foreach(i => fullXtX.data(i*rank + i) += lambda)
+ // Solve the resulting matrix, which is symmetric and positive-definite
+ Solve.solvePositive(fullXtX, userXy(index)).data
+ }
+ }
+
+  /**
+   * Write the lower triangle of the outer product of `x` with itself into `xtxDest`.
+   * Only rank * (rank+1) / 2 values are stored, row by row: (0,0), (1,0), (1,1), (2,0),
+   * (2,1), (2,2), and so on. The packed form keeps the summing of these matrices cheap.
+   */
+  private def fillXtX(x: DoubleMatrix, xtxDest: DoubleMatrix) {
+    val values = x.data
+    val packed = xtxDest.data
+    var pos = 0
+    var row = 0
+    while (row < x.length) {
+      var col = 0
+      while (col <= row) {
+        packed(pos) = values(row) * values(col)
+        pos += 1
+        col += 1
+      }
+      row += 1
+    }
+  }
+
+  /**
+   * Expand a packed lower triangle (in the order produced by fillXtX) into the full symmetric
+   * square matrix `destMatrix`. Every entry of `destMatrix` is overwritten.
+   */
+  private def fillFullMatrix(triangularMatrix: DoubleMatrix, destMatrix: DoubleMatrix) {
+    val size = destMatrix.rows
+    val packed = triangularMatrix.data
+    val full = destMatrix.data
+    var pos = 0
+    var row = 0
+    while (row < size) {
+      var col = 0
+      while (col <= row) {
+        // Mirror each packed value across the diagonal.
+        full(row * size + col) = packed(pos)
+        full(col * size + row) = packed(pos)
+        pos += 1
+        col += 1
+      }
+      row += 1
+    }
+  }
+}
+
+
+/**
+ * Top-level methods for calling Alternating Least Squares (ALS) matrix factorization.
+ */
+object ALS {
+  /**
+   * Train a matrix factorization model from an RDD of (userID, productID, rating) triples.
+   * The ratings matrix is approximated as the product of two lower-rank matrices of the given
+   * rank (number of features), found by running the given number of ALS iterations with the
+   * level of parallelism given by `blocks`.
+   *
+   * @param ratings    RDD of (userID, productID, rating) pairs
+   * @param rank       number of features to use
+   * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @param lambda     regularization factor (recommended: 0.01)
+   * @param blocks     level of parallelism to split computation into
+   */
+  def train(
+      ratings: RDD[(Int, Int, Double)],
+      rank: Int,
+      iterations: Int,
+      lambda: Double,
+      blocks: Int)
+    : MatrixFactorizationModel =
+  {
+    val als = new ALS(blocks, rank, iterations, lambda)
+    als.train(ratings)
+  }
+
+  /**
+   * Train a matrix factorization model from an RDD of (userID, productID, rating) triples,
+   * letting ALS choose the level of parallelism itself (the block count is passed as -1).
+   *
+   * @param ratings    RDD of (userID, productID, rating) pairs
+   * @param rank       number of features to use
+   * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @param lambda     regularization factor (recommended: 0.01)
+   */
+  def train(ratings: RDD[(Int, Int, Double)], rank: Int, iterations: Int, lambda: Double)
+    : MatrixFactorizationModel =
+    train(ratings, rank, iterations, lambda, -1)
+
+  /**
+   * Train a matrix factorization model from an RDD of (userID, productID, rating) triples,
+   * using a regularization factor of 0.01 and letting ALS choose the level of parallelism
+   * itself (the block count is passed as -1).
+   *
+   * @param ratings    RDD of (userID, productID, rating) pairs
+   * @param rank       number of features to use
+   * @param iterations number of iterations of ALS (recommended: 10-20)
+   */
+  def train(ratings: RDD[(Int, Int, Double)], rank: Int, iterations: Int)
+    : MatrixFactorizationModel =
+    train(ratings, rank, iterations, 0.01, -1)
+
+ /**
+ * Kryo registrator that registers the Rating class. main() passes this class's name through
+ * the spark.kryo.registrator system property.
+ */
+ private class ALSRegistrator extends KryoRegistrator {
+ override def registerClasses(kryo: Kryo) {
+ kryo.register(classOf[Rating])
+ }
+ }
+
+  /**
+   * Command-line entry point: reads comma-separated "user,product,rating" lines, trains a
+   * model with lambda = 0.01 and writes the user and product features below the output
+   * directory.
+   */
+  def main(args: Array[String]) {
+    if (args.length < 5 || args.length > 6) {
+      println("Usage: ALS <master> <ratings_file> <rank> <iterations> <output_dir> [<blocks>]")
+      System.exit(1)
+    }
+    val master = args(0)
+    val ratingsFile = args(1)
+    val rank = args(2).toInt
+    val iters = args(3).toInt
+    val outputDir = args(4)
+    // The block count is optional; -1 lets ALS pick it.
+    val blocks = if (args.length == 6) args(5).toInt else -1
+
+    // These properties are set before the SparkContext is created.
+    Seq(
+      "spark.serializer" -> "spark.KryoSerializer",
+      "spark.kryo.registrator" -> classOf[ALSRegistrator].getName,
+      "spark.kryo.referenceTracking" -> "false",
+      "spark.locality.wait" -> "10000"
+    ).foreach { case (key, value) => System.setProperty(key, value) }
+
+    val sc = new SparkContext(master, "ALS")
+    val ratings = sc.textFile(ratingsFile).map { line =>
+      val fields = line.split(',')
+      (fields(0).toInt, fields(1).toInt, fields(2).toDouble)
+    }
+    val model = ALS.train(ratings, rank, iters, 0.01, blocks)
+
+    val userLines = model.userFeatures.map { case (id, vec) => id + "," + vec.mkString(" ") }
+    userLines.saveAsTextFile(outputDir + "/userFeatures")
+    val productLines =
+      model.productFeatures.map { case (id, vec) => id + "," + vec.mkString(" ") }
+    productLines.saveAsTextFile(outputDir + "/productFeatures")
+
+    println("Final user/product features written to " + outputDir)
+    System.exit(0)
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
new file mode 100644
index 0000000000..38637b3dd1
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.recommendation
+
+import spark.RDD
+import spark.SparkContext._
+
+import org.jblas._
+
+/**
+ * Result of a matrix factorization: one feature vector per user and per product.
+ *
+ * @param rank the rank value this model was constructed with
+ * @param userFeatures (user ID, feature vector) pairs
+ * @param productFeatures (product ID, feature vector) pairs
+ */
+class MatrixFactorizationModel(
+    val rank: Int,
+    val userFeatures: RDD[(Int, Array[Double])],
+    val productFeatures: RDD[(Int, Array[Double])])
+  extends Serializable
+{
+  /**
+   * Predict the rating of one user for one product as the dot product of their feature
+   * vectors. Each vector is the first result of an RDD lookup, so `.head` throws when the ID
+   * has no entry.
+   */
+  def predict(user: Int, product: Int): Double = {
+    val userFactor = userFeatures.lookup(user).head
+    val productFactor = productFeatures.lookup(product).head
+    new DoubleMatrix(userFactor).dot(new DoubleMatrix(productFactor))
+  }
+
+  // TODO: Figure out what good bulk prediction methods would look like.
+  // Probably want a way to get the top users for a product or vice-versa.
+}
diff --git a/mllib/src/main/scala/spark/mllib/regression/LogisticRegression.scala b/mllib/src/main/scala/spark/mllib/regression/LogisticRegression.scala
new file mode 100644
index 0000000000..bb294c2257
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/regression/LogisticRegression.scala
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import spark.{Logging, RDD, SparkContext}
+import spark.mllib.optimization._
+import spark.mllib.util.MLUtils
+
+import org.jblas.DoubleMatrix
+
+/**
+ * Logistic Regression using Stochastic Gradient Descent.
+ * Based on Matlab code written by John Duchi.
+ *
+ * @param weights column of feature weights, multiplied with each data point as a row vector
+ * @param intercept constant added to the weighted sum before the logistic function is applied
+ * @param losses loss values handed over by the trainer
+ */
+class LogisticRegressionModel(
+    val weights: DoubleMatrix,
+    val intercept: Double,
+    val losses: Array[Double]) extends RegressionModel {
+
+  /**
+   * Predict a value in (0, 1) for every data point of an RDD.
+   *
+   * @param testData RDD of feature arrays, one per data point
+   * @return RDD holding 1 / (1 + exp(-(x . weights + intercept))) for each point x
+   */
+  override def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
+    // Copy the two fields the closure needs into local vals. Referring to this.weights inside
+    // the closure would capture the whole model (including `losses`) and require the model
+    // itself to be serializable.
+    val localWeights = weights
+    val localIntercept = intercept
+    testData.map { x =>
+      val margin = new DoubleMatrix(1, x.length, x:_*).mmul(localWeights).get(0) + localIntercept
+      1.0 / (1.0 + math.exp(margin * -1))
+    }
+  }
+
+  /**
+   * Predict a value in (0, 1) for a single data point.
+   *
+   * @param testData feature array of the data point
+   * @return 1 / (1 + exp(-(testData . weights + intercept)))
+   */
+  override def predict(testData: Array[Double]): Double = {
+    val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
+    val margin = dataMat.mmul(this.weights).get(0) + this.intercept
+    1.0 / (1.0 + math.exp(margin * -1))
+  }
+}
+
+/**
+ * Trainer for logistic regression models. Build one with the no-argument constructor, adjust
+ * it through the chained setters, then call `train`.
+ */
+class LogisticRegression private (
+    var stepSize: Double,
+    var miniBatchFraction: Double,
+    var numIters: Int)
+  extends Logging {
+
+  /**
+   * Construct a LogisticRegression object with default parameters
+   * (step size 1.0, mini-batch fraction 1.0, 100 iterations).
+   */
+  def this() = this(1.0, 1.0, 100)
+
+  /**
+   * Set the step size per-iteration of SGD. Default 1.0.
+   *
+   * @return this trainer, so calls can be chained
+   */
+  def setStepSize(step: Double): LogisticRegression = {
+    stepSize = step
+    this
+  }
+
+  /**
+   * Set fraction of data to be used for each SGD iteration. Default 1.0.
+   *
+   * @return this trainer, so calls can be chained
+   */
+  def setMiniBatchFraction(fraction: Double): LogisticRegression = {
+    miniBatchFraction = fraction
+    this
+  }
+
+  /**
+   * Set the number of iterations for SGD. Default 100.
+   *
+   * @return this trainer, so calls can be chained
+   */
+  def setNumIterations(iters: Int): LogisticRegression = {
+    numIters = iters
+    this
+  }
+
+  /**
+   * Fit a model on (label, features) pairs using the configured step size, iteration count
+   * and mini-batch fraction.
+   *
+   * @param input RDD of (label, array of features) pairs
+   * @return model whose intercept is the first optimized weight and whose weights are the rest
+   */
+  def train(input: RDD[(Double, Array[Double])]): LogisticRegressionModel = {
+    // Prepend a constant 1.0 feature so that the first optimized weight acts as the intercept.
+    val withBias = input.map { case (label, features) =>
+      (label, Array(1.0) ++ features)
+    }
+
+    val (allWeights, lossHistory) = GradientDescent.runMiniBatchSGD(
+      withBias, new LogisticGradient(), new SimpleUpdater(), stepSize, numIters,
+      miniBatchFraction)
+
+    // Position 0 belongs to the constant feature; every later position is a real feature.
+    val model = new LogisticRegressionModel(
+      allWeights.getRange(1, allWeights.length), allWeights.get(0), lossHistory)
+
+    logInfo("Final model weights " + model.weights)
+    logInfo("Final model intercept " + model.intercept)
+    logInfo("Last 10 losses " + model.losses.takeRight(10).mkString(", "))
+    model
+  }
+}
+
+/**
+ * Top-level methods for calling Logistic Regression.
+ */
+object LogisticRegression {
+
+  /**
+   * Train a logistic regression model given an RDD of (label, features) pairs. A fixed number
+   * of gradient descent iterations is run with the given step size, and each iteration uses a
+   * `miniBatchFraction` fraction of the data to calculate the gradient.
+   *
+   * @param input RDD of (label, array of features) pairs.
+   * @param numIterations Number of iterations of gradient descent to run.
+   * @param stepSize Step size to be used for each iteration of gradient descent.
+   * @param miniBatchFraction Fraction of data to be used per iteration.
+   * @return a LogisticRegressionModel which has the weights and offset from training.
+   */
+  def train(
+      input: RDD[(Double, Array[Double])],
+      numIterations: Int,
+      stepSize: Double,
+      miniBatchFraction: Double)
+    : LogisticRegressionModel =
+  {
+    val trainer = new LogisticRegression(stepSize, miniBatchFraction, numIterations)
+    trainer.train(input)
+  }
+
+  /**
+   * Train a logistic regression model given an RDD of (label, features) pairs. A fixed number
+   * of gradient descent iterations is run with the given step size, and the entire data set is
+   * used to update the gradient in each iteration.
+   *
+   * @param input RDD of (label, array of features) pairs.
+   * @param numIterations Number of iterations of gradient descent to run.
+   * @param stepSize Step size to be used for each iteration of Gradient Descent.
+   * @return a LogisticRegressionModel which has the weights and offset from training.
+   */
+  def train(
+      input: RDD[(Double, Array[Double])],
+      numIterations: Int,
+      stepSize: Double)
+    : LogisticRegressionModel =
+  {
+    new LogisticRegression(stepSize, 1.0, numIterations).train(input)
+  }
+
+  /**
+   * Train a logistic regression model given an RDD of (label, features) pairs. A fixed number
+   * of gradient descent iterations is run with a step size of 1.0, and the entire data set is
+   * used to update the gradient in each iteration.
+   *
+   * @param input RDD of (label, array of features) pairs.
+   * @param numIterations Number of iterations of gradient descent to run.
+   * @return a LogisticRegressionModel which has the weights and offset from training.
+   */
+  def train(
+      input: RDD[(Double, Array[Double])],
+      numIterations: Int)
+    : LogisticRegressionModel =
+  {
+    train(input, numIterations, 1.0)
+  }
+
+  /**
+   * Command-line entry point: loads labeled data from <input_dir>, trains on the full data set
+   * with the given step size and iteration count, then stops the SparkContext.
+   */
+  def main(args: Array[String]) {
+    if (args.length != 4) {
+      println("Usage: LogisticRegression <master> <input_dir> <step_size> <niters>")
+      System.exit(1)
+    }
+    val Array(master, inputDir, stepSize, numIters) = args
+    val sc = new SparkContext(master, "LogisticRegression")
+    val data = MLUtils.loadLabeledData(sc, inputDir)
+    // The numeric arguments are parsed only here, after the data set has been defined.
+    LogisticRegression.train(data, numIters.toInt, stepSize.toDouble)
+
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/regression/LogisticRegressionGenerator.scala b/mllib/src/main/scala/spark/mllib/regression/LogisticRegressionGenerator.scala
new file mode 100644
index 0000000000..8094d22405
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/regression/LogisticRegressionGenerator.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import scala.util.Random
+
+import org.jblas.DoubleMatrix
+
+import spark.{RDD, SparkContext}
+import spark.mllib.util.MLUtils
+
+/**
+ * Command-line tool that writes a synthetic two-class data set in the labeled-data text
+ * format produced by MLUtils.saveLabeledData.
+ */
+object LogisticRegressionGenerator {
+
+  /**
+   * Generate and save the data set.
+   *
+   * Arguments: <master> <output_dir> [num_examples] [num_features] [num_partitions].
+   * The last three are optional and default to 1000, 2 and 2 respectively.
+   */
+  def main(args: Array[String]) {
+    // Only the first two arguments are mandatory. Requiring exactly five made the defaults
+    // below unreachable.
+    if (args.length < 2 || args.length > 5) {
+      println("Usage: LogisticRegressionGenerator " +
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
+      System.exit(1)
+    }
+
+    val sparkMaster: String = args(0)
+    val outputPath: String = args(1)
+    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
+    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
+    val parts: Int = if (args.length > 4) args(4).toInt else 2
+    // Offset added to every feature of a label-1 example, separating the two classes.
+    val eps = 3
+
+    val sc = new SparkContext(sparkMaster, "LogisticRegressionGenerator")
+
+    val data: RDD[(Double, Array[Double])] = sc.parallelize(0 until nexamples, parts).map { idx =>
+      // Seed per example so the output does not depend on how examples are partitioned.
+      val rnd = new Random(42 + idx)
+
+      // Even indices get label 0, odd indices get label 1.
+      val y = if (idx % 2 == 0) 0.0 else 1.0
+      val x = Array.fill[Double](nfeatures) {
+        rnd.nextGaussian() + (y * eps)
+      }
+      (y, x)
+    }
+
+    MLUtils.saveLabeledData(data, outputPath)
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/regression/Regression.scala b/mllib/src/main/scala/spark/mllib/regression/Regression.scala
new file mode 100644
index 0000000000..645204ddf3
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/regression/Regression.scala
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import spark.RDD
+
+/**
+ * Common prediction interface of trained regression models.
+ *
+ * The trait is Serializable because implementations use model fields inside RDD closures,
+ * and such closures may capture the model instance itself.
+ */
+trait RegressionModel extends Serializable {
+  /**
+   * Predict values for the given data set using the model trained.
+   *
+   * @param testData RDD representing data points to be predicted
+   * @return RDD[Double] where each entry contains the corresponding prediction
+   */
+  def predict(testData: RDD[Array[Double]]): RDD[Double]
+
+  /**
+   * Predict values for a single data point using the model trained.
+   *
+   * @param testData array representing a single data point
+   * @return Double prediction from the trained model
+   */
+  def predict(testData: Array[Double]): Double
+}
diff --git a/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
new file mode 100644
index 0000000000..7c7f912b43
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import spark.{Logging, RDD, SparkContext}
+import spark.mllib.util.MLUtils
+
+import org.jblas.DoubleMatrix
+import org.jblas.Solve
+
+import scala.annotation.tailrec
+import scala.collection.mutable
+
+/**
+ * Ridge Regression from Joseph Gonzalez's implementation in MLBase
+ *
+ * @param weights column of feature weights, multiplied with each data point as a row vector
+ * @param intercept constant added to the weighted sum
+ * @param lambdaOpt the lambda value this model was built with
+ * @param lambdas (lambda, cross-validation error, weights) triples handed over by the trainer
+ */
+class RidgeRegressionModel(
+    val weights: DoubleMatrix,
+    val intercept: Double,
+    val lambdaOpt: Double,
+    val lambdas: Seq[(Double, Double, DoubleMatrix)])
+  extends RegressionModel {
+
+  /**
+   * Predict a value for every data point of an RDD.
+   *
+   * @param testData RDD of feature arrays, one per data point
+   * @return RDD holding x . weights + intercept for each point x
+   */
+  override def predict(testData: RDD[Array[Double]]): RDD[Double] = {
+    // Copy the two fields the closure needs into local vals. Referring to this.weights inside
+    // the closure would capture the whole model, including the `lambdas` sequence.
+    val localWeights = weights
+    val localIntercept = intercept
+    testData.map { x =>
+      (new DoubleMatrix(1, x.length, x:_*).mmul(localWeights)).get(0) + localIntercept
+    }
+  }
+
+  /**
+   * Predict a value for a single data point.
+   *
+   * @param testData feature array of the data point
+   * @return testData . weights + intercept
+   */
+  override def predict(testData: Array[Double]): Double = {
+    (new DoubleMatrix(1, testData.length, testData:_*).mmul(this.weights)).get(0) + this.intercept
+  }
+}
+
+class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)
+ extends Logging {
+
+ def this() = this(0.0, 100.0)
+
+ /**
+ * Set the lower bound of the interval searched for lambda. Default is 0.0.
+ */
+ def setLowLambda(low: Double) = {
+ this.lambdaLow = low
+ this
+ }
+
+ /**
+ * Set the upper bound of the interval searched for lambda. Default is 100.0.
+ */
+ def setHighLambda(hi: Double) = {
+ this.lambdaHigh = hi
+ this
+ }
+
+ def train(input: RDD[(Double, Array[Double])]): RidgeRegressionModel = {
+ val nfeatures: Int = input.take(1)(0)._2.length
+ val nexamples: Long = input.count()
+
+ val (yMean, xColMean, xColSd) = MLUtils.computeStats(input, nfeatures, nexamples)
+
+ val data = input.map { case(y, features) =>
+ val yNormalized = y - yMean
+ val featuresMat = new DoubleMatrix(nfeatures, 1, features:_*)
+ val featuresNormalized = featuresMat.sub(xColMean).divi(xColSd)
+ (yNormalized, featuresNormalized.toArray)
+ }
+
+ // Compute XtX as a sum of per-example outer products - size is nfeatures by nfeatures
+ val XtX: DoubleMatrix = data.map { case (y, features) =>
+ val x = new DoubleMatrix(1, features.length, features:_*)
+ x.transpose().mmul(x)
+ }.reduce(_.addi(_))
+
+ // Compute Xt*y as a sum of per-example y-scaled feature columns - size is nfeatures by 1
+ val Xty: DoubleMatrix = data.map { case (y, features) =>
+ new DoubleMatrix(features.length, 1, features:_*).mul(y)
+ }.reduce(_.addi(_))
+
+ // Define a function that fits the ridge weights for one lambda and scores them with the
+ // mean over all examples of the squared residual (y - yhat) / (1 - H_ii)
+ def crossValidate(lambda: Double): (Double, Double, DoubleMatrix) = {
+ // Compute the MLE ridge regression parameter value
+
+ // Ridge Regression parameter = inv(XtX + \lambda*I) * Xty
+ val XtXlambda = DoubleMatrix.eye(nfeatures).muli(lambda).addi(XtX)
+ val w = Solve.solveSymmetric(XtXlambda, Xty)
+
+ val invXtX = Solve.solveSymmetric(XtXlambda, DoubleMatrix.eye(nfeatures))
+
+ // compute the cross validation score; H_ii = x' * inv(XtX + lambda*I) * x for each example
+ val cverror = data.map {
+ case (y, features) =>
+ val x = new DoubleMatrix(features.length, 1, features:_*)
+ val yhat = w.transpose().mmul(x).get(0)
+ val H_ii = x.transpose().mmul(invXtX).mmul(x).get(0)
+ val residual = (y - yhat) / (1.0 - H_ii)
+ residual * residual
+ }.reduce(_ + _) / nexamples
+
+ (lambda, cverror, w)
+ }
+
+ // Interval-narrowing search for lambda: score both quarter points, keep 3/4 of the range.
+ def binSearch(low: Double, high: Double): Seq[(Double, Double, DoubleMatrix)] = {
+ val buffer = mutable.ListBuffer.empty[(Double, Double, DoubleMatrix)]
+
+ @tailrec
+ def loop(low: Double, high: Double): Seq[(Double, Double, DoubleMatrix)] = {
+ val mid = (high - low) / 2 + low
+ val lowValue = crossValidate((mid - low) / 2 + low)
+ val highValue = crossValidate((high - mid) / 2 + mid)
+ val (newLow, newHigh) = if (lowValue._2 < highValue._2) {
+ (low, mid + (high-low)/4)
+ } else {
+ (mid - (high-low)/4, high)
+ }
+ if (newHigh - newLow > 1.0E-7) {
+ buffer += lowValue += highValue
+ loop(newLow, newHigh)
+ } else {
+ buffer += lowValue += highValue
+ buffer.result()
+ }
+ }
+
+ loop(low, high)
+ }
+
+ // Run the search and order every evaluated (lambda, cverror, weights) triple by lambda
+ val lambdas = binSearch(lambdaLow, lambdaHigh).sortBy(_._1)
+
+ // Find the best parameter set by taking the lowest cverror.
+ val (lambdaOpt, cverror, weights) = lambdas.reduce((a, b) => if (a._2 < b._2) a else b)
+
+ // Map the weights and intercept back from the normalized features to the original scale
+ val weightsScaled = weights.div(xColSd)
+ val intercept = yMean - (weights.transpose().mmul(xColMean.div(xColSd)).get(0))
+ val model = new RidgeRegressionModel(weightsScaled, intercept, lambdaOpt, lambdas)
+
+ logInfo("RidgeRegression: optimal lambda " + model.lambdaOpt)
+ logInfo("RidgeRegression: optimal weights " + model.weights)
+ logInfo("RidgeRegression: optimal intercept " + model.intercept)
+ logInfo("RidgeRegression: cross-validation error " + cverror)
+
+ model
+ }
+}
+
+/**
+ * Top-level methods for calling Ridge Regression.
+ */
+object RidgeRegression {
+
+  /**
+   * Train a ridge regression model given an RDD of (response, features) pairs.
+   * The cross-validation score for a given lambda comes from the closed form solution,
+   * and the best lambda is searched for between the provided bounds.
+   *
+   * @param input RDD of (response, array of features) pairs.
+   * @param lambdaLow lower bound used in the search for lambda
+   * @param lambdaHigh upper bound used in the search for lambda
+   * @return the model built from the lambda with the lowest cross-validation error
+   */
+  def train(
+      input: RDD[(Double, Array[Double])],
+      lambdaLow: Double,
+      lambdaHigh: Double)
+    : RidgeRegressionModel =
+  {
+    val trainer = new RidgeRegression(lambdaLow, lambdaHigh)
+    trainer.train(input)
+  }
+
+  /**
+   * Train a ridge regression model given an RDD of (response, features) pairs,
+   * searching for the best lambda between 0 and 100.
+   *
+   * @param input RDD of (response, array of features) pairs.
+   * @return the model built from the lambda with the lowest cross-validation error
+   */
+  def train(input: RDD[(Double, Array[Double])]): RidgeRegressionModel = {
+    new RidgeRegression(0.0, 100.0).train(input)
+  }
+
+  /**
+   * Command-line entry point: loads labeled data from <input_dir>, trains with lambda
+   * searched between 0 and 1000, then stops the SparkContext.
+   */
+  def main(args: Array[String]) {
+    if (args.length != 2) {
+      println("Usage: RidgeRegression <master> <input_dir>")
+      System.exit(1)
+    }
+    val Array(master, inputDir) = args
+    val sc = new SparkContext(master, "RidgeRegression")
+    val data = MLUtils.loadLabeledData(sc, inputDir)
+    RidgeRegression.train(data, 0.0, 1000.0)
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/regression/RidgeRegressionGenerator.scala b/mllib/src/main/scala/spark/mllib/regression/RidgeRegressionGenerator.scala
new file mode 100644
index 0000000000..c2260ae286
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/regression/RidgeRegressionGenerator.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import scala.util.Random
+
+import org.jblas.DoubleMatrix
+
+import spark.{RDD, SparkContext}
+import spark.mllib.util.MLUtils
+
+
+/**
+ * Command-line tool that writes a synthetic linear-regression data set in the labeled-data
+ * text format produced by MLUtils.saveLabeledData.
+ */
+object RidgeRegressionGenerator {
+
+  /**
+   * Generate and save the data set.
+   *
+   * Arguments: <master> <output_dir> [num_examples] [num_features] [num_partitions].
+   * The last three are optional and default to 1000, 100 and 2 respectively.
+   */
+  def main(args: Array[String]) {
+    // Only the first two arguments are mandatory. Requiring exactly five made the defaults
+    // below unreachable.
+    if (args.length < 2 || args.length > 5) {
+      println("Usage: RidgeRegressionGenerator " +
+        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
+      System.exit(1)
+    }
+
+    val sparkMaster: String = args(0)
+    val outputPath: String = args(1)
+    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
+    val nfeatures: Int = if (args.length > 3) args(3).toInt else 100
+    val parts: Int = if (args.length > 4) args(4).toInt else 2
+    // Scale applied to the Gaussian noise added to every response.
+    val eps = 10
+
+    org.jblas.util.Random.seed(42)
+    val sc = new SparkContext(sparkMaster, "RidgeRegressionGenerator")
+
+    // Random values distributed uniformly in [-0.5, 0.5]
+    val w = DoubleMatrix.rand(nfeatures, 1).subi(0.5)
+    // Give the first two features a dominant weight. The bound keeps this in range when
+    // fewer than two features were requested.
+    for (i <- 0 until math.min(2, nfeatures)) {
+      w.put(i, 0, 10)
+    }
+
+    val data: RDD[(Double, Array[Double])] = sc.parallelize(0 until parts, parts).flatMap { p =>
+      org.jblas.util.Random.seed(42 + p)
+      // Spread the division remainder over the first partitions so that exactly nexamples
+      // rows are generated even when nexamples is not a multiple of parts.
+      val examplesInPartition = nexamples / parts + (if (p < nexamples % parts) 1 else 0)
+
+      val X = DoubleMatrix.rand(examplesInPartition, nfeatures)
+      val y = X.mmul(w)
+
+      val rnd = new Random(42 + p)
+
+      val normalValues = Array.fill[Double](examplesInPartition)(rnd.nextGaussian() * eps)
+      val yObs = new DoubleMatrix(normalValues).addi(y)
+
+      Iterator.tabulate(examplesInPartition) { i =>
+        (yObs.get(i, 0), X.getRow(i).toArray)
+      }
+    }
+
+    MLUtils.saveLabeledData(data, outputPath)
+    sc.stop()
+  }
+}
diff --git a/mllib/src/main/scala/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/spark/mllib/util/MLUtils.scala
new file mode 100644
index 0000000000..b5e564df6d
--- /dev/null
+++ b/mllib/src/main/scala/spark/mllib/util/MLUtils.scala
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.util
+
+import spark.{RDD, SparkContext}
+import spark.SparkContext._
+
+import org.jblas.DoubleMatrix
+
+/**
+ * Helper methods to load and save data
+ * Data format:
+ * <l>, <f1> <f2> ...
+ * where <f1>, <f2> are feature values in Double and <l> is the corresponding label as Double.
+ */
+object MLUtils {
+
+  /**
+   * Load labeled data from text files with one example per line.
+   *
+   * @param sc SparkContext
+   * @param dir Directory to the input data files.
+   * @return An RDD of tuples. For each tuple, the first element is the label, and the second
+   *         element represents the feature values (an array of Double).
+   */
+  def loadLabeledData(sc: SparkContext, dir: String): RDD[(Double, Array[Double])] = {
+    sc.textFile(dir).map { line =>
+      val parts = line.split(",")
+      val label = parts(0).toDouble
+      // Split on runs of whitespace so repeated spaces or tabs between feature values do not
+      // yield empty tokens, which toDouble would reject.
+      val features = parts(1).trim().split("\\s+").map(_.toDouble)
+      (label, features)
+    }
+  }
+
+  /**
+   * Save labeled data as text, one example per line: the label, a comma, then the feature
+   * values separated by single spaces.
+   *
+   * @param data RDD of (label, features) tuples to write
+   * @param dir Directory the text output is saved to
+   */
+  def saveLabeledData(data: RDD[(Double, Array[Double])], dir: String) {
+    val dataStr = data.map(x => x._1 + "," + x._2.mkString(" "))
+    dataStr.saveAsTextFile(dir)
+  }
+
+  /**
+   * Utility function to compute mean and standard deviation on a given dataset.
+   *
+   * @param data - input data set whose statistics are computed
+   * @param nfeatures - number of features
+   * @param nexamples - number of examples in input dataset
+   *
+   * @return (yMean, xColMean, xColSd) - Tuple consisting of
+   *     yMean - mean of the labels
+   *     xColMean - nfeatures x 1 column vector with the mean of every feature
+   *     xColSd - nfeatures x 1 column vector with the population standard deviation
+   *              (variance divided by nexamples, not nexamples - 1) of every feature
+   */
+  def computeStats(data: RDD[(Double, Array[Double])], nfeatures: Int, nexamples: Long):
+      (Double, DoubleMatrix, DoubleMatrix) = {
+    val yMean: Double = data.map { case (y, features) => y }.reduce(_ + _) / nexamples
+
+    // NOTE: We shuffle X by column here to compute column sum and sum of squares.
+    val xColSumSq: RDD[(Int, (Double, Double))] = data.flatMap { case(y, features) =>
+      val nCols = features.length
+      // Traverse over every column and emit (col, value, value^2)
+      Iterator.tabulate(nCols) { i =>
+        (i, (features(i), features(i)*features(i)))
+      }
+    }.reduceByKey { case(x1, x2) =>
+      (x1._1 + x2._1, x1._2 + x2._2)
+    }
+    val xColSumsMap = xColSumSq.collectAsMap()
+
+    val xColMean = DoubleMatrix.zeros(nfeatures, 1)
+    val xColSd = DoubleMatrix.zeros(nfeatures, 1)
+
+    // Compute mean and population variance using column sums
+    var col = 0
+    while (col < nfeatures) {
+      val (sum, sumSq) = xColSumsMap(col)
+      xColMean.put(col, sum / nexamples)
+      // Rounding can push the variance of a (near-)constant column marginally below zero,
+      // which would make the square root NaN, so it is clamped at zero.
+      val variance = math.max(0.0, (sumSq - (math.pow(sum, 2) / nexamples)) / nexamples)
+      xColSd.put(col, math.sqrt(variance))
+      col += 1
+    }
+
+    (yMean, xColMean, xColSd)
+  }
+
+  /**
+   * Return the squared Euclidean distance between two vectors.
+   *
+   * @throws IllegalArgumentException if the two vectors differ in length
+   */
+  def squaredDistance(v1: Array[Double], v2: Array[Double]): Double = {
+    if (v1.length != v2.length) {
+      throw new IllegalArgumentException("Vector sizes don't match")
+    }
+    var i = 0
+    var sum = 0.0
+    while (i < v1.length) {
+      val diff = v1(i) - v2(i)
+      sum += diff * diff
+      i += 1
+    }
+    sum
+  }
+}
diff --git a/mllib/src/test/resources/log4j.properties b/mllib/src/test/resources/log4j.properties
new file mode 100644
index 0000000000..a112e0b506
--- /dev/null
+++ b/mllib/src/test/resources/log4j.properties
@@ -0,0 +1,28 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the file mllib/target/unit-tests.log
+# (the path is relative to the working directory the tests are started from)
+log4j.rootCategory=INFO, file
+log4j.appender.file=org.apache.log4j.FileAppender
+log4j.appender.file.append=false
+log4j.appender.file.file=mllib/target/unit-tests.log
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
+
+# Ignore messages below warning level from Jetty, because it's a bit verbose
+log4j.logger.org.eclipse.jetty=WARN
+
diff --git a/mllib/src/test/scala/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/spark/mllib/clustering/KMeansSuite.scala
new file mode 100644
index 0000000000..bebade9afb
--- /dev/null
+++ b/mllib/src/test/scala/spark/mllib/clustering/KMeansSuite.scala
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.clustering
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+
+import spark.SparkContext
+import spark.SparkContext._
+
+import org.jblas._
+
+
+class KMeansSuite extends FunSuite with BeforeAndAfterAll {
+ val sc = new SparkContext("local", "test")
+
+ override def afterAll() {
+ sc.stop()
+ System.clearProperty("spark.driver.port")
+ }
+
+ val EPSILON = 1e-4
+
+ import KMeans.{RANDOM, K_MEANS_PARALLEL}
+
+ def prettyPrint(point: Array[Double]): String = point.mkString("(", ", ", ")")
+
+ def prettyPrint(points: Array[Array[Double]]): String = {
+ points.map(prettyPrint).mkString("(", "; ", ")")
+ }
+
+ // L-infinity (Chebyshev) distance between two points: the largest per-coordinate difference
+ def distance1(v1: Array[Double], v2: Array[Double]): Double = {
+ v1.zip(v2).map{ case (a, b) => math.abs(a-b) }.max
+ }
+
+ // Assert that two vectors have equal length and no coordinate differs by more than EPSILON
+ def assertEqual(v1: Array[Double], v2: Array[Double]) {
+ def errorMessage = prettyPrint(v1) + " did not equal " + prettyPrint(v2)
+ assert(v1.length == v2.length, errorMessage)
+ assert(distance1(v1, v2) <= EPSILON, errorMessage)
+ }
+
+ // Assert that every point of each set has a point within EPSILON of it in the other set
+ def assertSetsEqual(set1: Array[Array[Double]], set2: Array[Array[Double]]) {
+ def errorMessage = prettyPrint(set1) + " did not equal " + prettyPrint(set2)
+ assert(set1.length == set2.length, errorMessage)
+ for (v <- set1) {
+ val closestDistance = set2.map(w => distance1(v, w)).min
+ if (closestDistance > EPSILON) {
+ fail(errorMessage)
+ }
+ }
+ for (v <- set2) {
+ val closestDistance = set1.map(w => distance1(v, w)).min
+ if (closestDistance > EPSILON) {
+ fail(errorMessage)
+ }
+ }
+ }
+
+ test("single cluster") {
+ val data = sc.parallelize(Array(
+ Array(1.0, 2.0, 6.0),
+ Array(1.0, 3.0, 0.0),
+ Array(1.0, 4.0, 6.0)
+ ))
+
+ // No matter how many runs or iterations we use, we should get one cluster,
+ // centered at the mean of the points
+
+ var model = KMeans.train(data, k=1, maxIterations=1)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=2)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=1, initializationMode=RANDOM)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(
+ data, k=1, maxIterations=1, runs=1, initializationMode=K_MEANS_PARALLEL)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+ }
+
+ test("single cluster with big dataset") {
+ val smallData = Array(
+ Array(1.0, 2.0, 6.0),
+ Array(1.0, 3.0, 0.0),
+ Array(1.0, 4.0, 6.0)
+ )
+ val data = sc.parallelize((1 to 100).flatMap(_ => smallData), 4)
+
+ // No matter how many runs or iterations we use, we should get one cluster,
+ // centered at the mean of the points
+
+ var model = KMeans.train(data, k=1, maxIterations=1)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=2)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=5)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=1, initializationMode=RANDOM)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+
+ model = KMeans.train(data, k=1, maxIterations=1, runs=1, initializationMode=K_MEANS_PARALLEL)
+ assertSetsEqual(model.clusterCenters, Array(Array(1.0, 3.0, 4.0)))
+ }
+
+ test("k-means|| initialization") {
+ val points = Array(
+ Array(1.0, 2.0, 6.0),
+ Array(1.0, 3.0, 0.0),
+ Array(1.0, 4.0, 6.0),
+ Array(1.0, 0.0, 1.0),
+ Array(1.0, 1.0, 1.0)
+ )
+ val rdd = sc.parallelize(points)
+
+ // K-means|| initialization should place all clusters into distinct centers because
+ // it will make at least five passes, and it will give non-zero probability to each
+ // unselected point as long as it hasn't yet selected all of them
+
+ var model = KMeans.train(rdd, k=5, maxIterations=1)
+ assertSetsEqual(model.clusterCenters, points)
+
+ // Iterations of Lloyd's should not change the answer either
+ model = KMeans.train(rdd, k=5, maxIterations=10)
+ assertSetsEqual(model.clusterCenters, points)
+
+ // Neither should more runs
+ model = KMeans.train(rdd, k=5, maxIterations=10, runs=5)
+ assertSetsEqual(model.clusterCenters, points)
+ }
+}
diff --git a/mllib/src/test/scala/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/spark/mllib/recommendation/ALSSuite.scala
new file mode 100644
index 0000000000..f98590b8d9
--- /dev/null
+++ b/mllib/src/test/scala/spark/mllib/recommendation/ALSSuite.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.recommendation
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+
+import spark.SparkContext
+import spark.SparkContext._
+
+import org.jblas._
+
+
+/** Tests that ALS recovers ratings generated from random low-rank factor matrices. */
+class ALSSuite extends FunSuite with BeforeAndAfterAll {
+  // Local-mode context shared by every test in this suite
+  val sc = new SparkContext("local", "test")
+
+  /** Stops the shared context and clears the `spark.driver.port` system property. */
+  override def afterAll(): Unit = {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  test("rank-1 matrices") {
+    testALS(10, 20, 1, 15, 0.7, 0.3)
+  }
+
+  test("rank-2 matrices") {
+    testALS(20, 30, 2, 15, 0.7, 0.3)
+  }
+
+  /**
+   * Checks that ALS can factorize R = U * P, where U and P are random matrices of known rank.
+   * The test fails on the first (user, product) entry whose prediction is off by more than
+   * `matchThreshold`.
+   *
+   * @param users          number of users
+   * @param products       number of products
+   * @param features       number of features (rank of the problem)
+   * @param iterations     number of iterations to run
+   * @param samplingRate   fraction of the user-product pairs that are revealed to ALS
+   * @param matchThreshold largest absolute error for which a predicted rating counts as correct
+   */
+  def testALS(users: Int, products: Int, features: Int, iterations: Int,
+      samplingRate: Double, matchThreshold: Double): Unit = {
+    // One fixed-seed generator drives the matrices and the sampling, in that order
+    val rng = new Random(42)
+
+    // Builds an m-by-n matrix with entries drawn uniformly from [-1, 1)
+    def randomMatrix(m: Int, n: Int) = {
+      val entries = Array.fill(m * n)(rng.nextDouble() * 2 - 1)
+      new DoubleMatrix(m, n, entries: _*)
+    }
+
+    val userMatrix = randomMatrix(users, features)
+    val productMatrix = randomMatrix(features, products)
+    val trueRatings = userMatrix.mmul(productMatrix)
+
+    // Keep each (user, product, rating) triple when the next random draw is below samplingRate
+    val sampledRatings = (0 until users).flatMap { u =>
+      (0 until products)
+        .withFilter(_ => rng.nextDouble() < samplingRate)
+        .map(p => (u, p, trueRatings.get(u, p)))
+    }
+
+    val model = ALS.train(sc.parallelize(sampledRatings), features, iterations)
+
+    // Copy the learned user factors into a dense users-by-features matrix
+    val predictedU = new DoubleMatrix(users, features)
+    model.userFeatures.collect().foreach { case (u, vec) =>
+      (0 until features).foreach(i => predictedU.put(u, i, vec(i)))
+    }
+
+    // Copy the learned product factors into a dense products-by-features matrix
+    val predictedP = new DoubleMatrix(products, features)
+    model.productFeatures.collect().foreach { case (p, vec) =>
+      (0 until features).foreach(i => predictedP.put(p, i, vec(i)))
+    }
+
+    val predictedRatings = predictedU.mmul(predictedP.transpose)
+
+    // Compare every entry, including the ones that were never sampled
+    val message = "Model failed to predict (%d, %d): %f vs %f\ncorr: %s\npred: %s\nU: %s\n P: %s"
+    (0 until users).foreach { u =>
+      (0 until products).foreach { p =>
+        val predicted = predictedRatings.get(u, p)
+        val expected = trueRatings.get(u, p)
+        val tooFarOff = math.abs(predicted - expected) > matchThreshold
+        if (tooFarOff) {
+          fail(message.format(
+            u, p, expected, predicted, trueRatings, predictedRatings, predictedU, predictedP))
+        }
+      }
+    }
+  }
+}
+
diff --git a/mllib/src/test/scala/spark/mllib/regression/LogisticRegressionSuite.scala b/mllib/src/test/scala/spark/mllib/regression/LogisticRegressionSuite.scala
new file mode 100644
index 0000000000..bc9bfd054f
--- /dev/null
+++ b/mllib/src/test/scala/spark/mllib/regression/LogisticRegressionSuite.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+
+import spark.SparkContext
+import spark.SparkContext._
+
+
+/** Tests that logistic regression recovers known coefficients from synthetic data. */
+class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll {
+  // Local-mode context shared by every test in this suite
+  val sc = new SparkContext("local", "test")
+
+  /** Stops the shared context and clears the `spark.driver.port` system property. */
+  override def afterAll(): Unit = {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  // Test if we can correctly learn A, B where Y = logistic(A + B*X)
+  test("logistic regression") {
+    val nPoints = 10000
+    val A = 2.0
+    val B = -1.5
+
+    // Single Gaussian feature, generated with a fixed seed
+    val featureRand = new Random(42)
+    val x1 = Array.fill[Double](nPoints)(featureRand.nextGaussian())
+
+    // Logistic(0,1) noise: if U is uniform[0, 1] then ln(U) - ln(1-U) is Logistic(0,1)
+    val noiseRand = new Random(45)
+    val rLogis = Array.fill(nPoints) {
+      val u = noiseRand.nextDouble()
+      math.log(u) - math.log(1.0 - u)
+    }
+
+    // Label is 1.0 when the noisy linear score is positive, otherwise 0.0.
+    // In R notation: y <- as.numeric(A + B*x + rlogis(nPoints) > 0)
+    val y = Array.tabulate(nPoints) { i =>
+      val score = A + B * x1(i) + rLogis(i)
+      if (score > 0) 1.0 else 0.0
+    }
+
+    // (label, features) pairs in the shape the trainer consumes
+    val testData = Array.tabulate(nPoints)(i => (y(i), Array(x1(i))))
+
+    val testRDD = sc.parallelize(testData, 2)
+    testRDD.cache()
+
+    val lr = new LogisticRegression()
+      .setStepSize(10.0)
+      .setNumIterations(20)
+    val model = lr.train(testRDD)
+
+    // The learned slope and intercept must land within 0.1 of B and A respectively
+    val weight0 = model.weights.get(0)
+    assert(weight0 >= -1.60 && weight0 <= -1.40, weight0 + " not in [-1.6, -1.4]")
+    val intercept = model.intercept
+    assert(intercept >= 1.9 && intercept <= 2.1, intercept + " not in [1.9, 2.1]")
+  }
+}
diff --git a/mllib/src/test/scala/spark/mllib/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/spark/mllib/regression/RidgeRegressionSuite.scala
new file mode 100644
index 0000000000..3c588c6162
--- /dev/null
+++ b/mllib/src/test/scala/spark/mllib/regression/RidgeRegressionSuite.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package spark.mllib.regression
+
+import scala.util.Random
+
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.FunSuite
+
+import spark.SparkContext
+import spark.SparkContext._
+
+
+/** Tests that ridge regression recovers known coefficients from a small synthetic data set. */
+class RidgeRegressionSuite extends FunSuite with BeforeAndAfterAll {
+  // Local-mode context shared by every test in this suite
+  val sc = new SparkContext("local", "test")
+
+  /** Stops the shared context and clears the `spark.driver.port` system property. */
+  override def afterAll(): Unit = {
+    sc.stop()
+    System.clearProperty("spark.driver.port")
+  }
+
+  // Test if we can correctly learn Y = 3 + X1 + X2 when
+  // X1 and X2 are collinear.
+  test("multi-collinear variables") {
+    val nPoints = 20
+
+    // First feature: standard Gaussian draws with a fixed seed
+    val x1Rand = new Random(43)
+    val x1 = Array.fill[Double](nPoints)(x1Rand.nextGaussian())
+
+    // Second feature: Gaussian draws with mean 0.1 and standard deviation 0.01,
+    // i.e. a mean chosen to be close to the mean of x1
+    val x2Rand = new Random(42)
+    val x2 = Array.fill[Double](nPoints)(0.1 + x2Rand.nextGaussian() * 0.01)
+
+    // One row per observation: Array(x1, x2)
+    val xMat = x1.zip(x2).map { case (a, b) => Array(a, b) }
+
+    // Noise-free response and the (label, features) pairs handed to the trainer
+    val y = xMat.map(row => 3 + row(0) + row(1))
+    val testData = y.zip(xMat)
+
+    val testRDD = sc.parallelize(testData, 2)
+    testRDD.cache()
+
+    val ridgeReg = new RidgeRegression()
+      .setLowLambda(0)
+      .setHighLambda(10)
+    val model = ridgeReg.train(testRDD)
+
+    // Intercept and both weights must be within 0.1 of the generating coefficients
+    assert(model.intercept >= 2.9 && model.intercept <= 3.1)
+    assert(model.weights.length === 2)
+    val w0 = model.weights.get(0)
+    assert(w0 >= 0.9 && w0 <= 1.1)
+    val w1 = model.weights.get(1)
+    assert(w1 >= 0.9 && w1 <= 1.1)
+  }
+}