1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
library(SparkR)
# Logistic regression in Spark.
# Note: unlike the example in Scala, a point here is represented as a vector of
# doubles.
parseVectors <- function(lines) {
lines <- strsplit(as.character(lines) , " ", fixed = TRUE)
list(matrix(as.numeric(unlist(lines)), ncol = length(lines[[1]])))
}
dist.fun <- function(P, C) {
apply(
C,
1,
function(x) {
colSums((t(P) - x)^2)
}
)
}
closestPoint <- function(P, C) {
max.col(-dist.fun(P, C))
}
# Main program
args <- commandArgs(trailing = TRUE)
if (length(args) != 3) {
print("Usage: kmeans <file> <K> <convergeDist>")
q("no")
}
sc <- sparkR.init(appName = "RKMeans")
K <- as.integer(args[[2]])
convergeDist <- as.double(args[[3]])
lines <- textFile(sc, args[[1]])
points <- cache(lapplyPartition(lines, parseVectors))
# kPoints <- take(points, K)
kPoints <- do.call(rbind, takeSample(points, FALSE, K, 16189L))
tempDist <- 1.0
while (tempDist > convergeDist) {
closest <- lapplyPartition(
lapply(points,
function(p) {
cp <- closestPoint(p, kPoints);
mapply(list, unique(cp), split.data.frame(cbind(1, p), cp), SIMPLIFY=FALSE)
}),
function(x) {do.call(c, x)
})
pointStats <- reduceByKey(closest,
function(p1, p2) {
t(colSums(rbind(p1, p2)))
},
2L)
newPoints <- do.call(
rbind,
collect(lapply(pointStats,
function(tup) {
point.sum <- tup[[2]][, -1]
point.count <- tup[[2]][, 1]
point.sum/point.count
})))
D <- dist.fun(kPoints, newPoints)
tempDist <- sum(D[cbind(1:3, max.col(-D))])
kPoints <- newPoints
cat("Finished iteration (delta = ", tempDist, ")\n")
}
cat("Final centers:\n")
writeLines(unlist(lapply(kPoints, paste, collapse = " ")))
|