From 7a4bb863c7c11e22332763081793e4989af8c526 Mon Sep 17 00:00:00 2001 From: Ankur Dave Date: Sun, 12 Jan 2014 16:58:18 -0800 Subject: Add connected components example to doc --- docs/graphx-programming-guide.md | 20 +++++++++++++++++++- graphx/data/followers.txt | 6 +----- graphx/data/users.txt | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 52668b07c8..22feccb7ad 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -475,6 +475,7 @@ GraphX includes a set of graph algorithms in to simplify analytics. The algorith [Algorithms]: api/graphx/index.html#org.apache.spark.graphx.lib.Algorithms ## PageRank + PageRank measures the importance of each vertex in a graph, assuming an edge from *u* to *v* represents an endorsement of *v*'s importance by *u*. For example, if a Twitter user is followed by many others, the user will be ranked highly. @@ -503,9 +504,26 @@ val ranksByUsername = users.leftOuterJoin(ranks).map { println(ranksByUsername.collect().mkString("\n")) {% endhighlight %} - ## Connected Components +The connected components algorithm labels each connected component of the graph with the ID of its lowest-numbered vertex. For example, in a social network, connected components can approximate clusters. We can compute the connected components of the example social network dataset from the [PageRank section](#pagerank) as follows: + +{% highlight scala %} +// Load the implicit conversion and graph as in the PageRank example +import org.apache.spark.graphx.lib._ +val users = ... +val followers = ... +val graph = Graph(users, followers) +// Find the connected components +val cc = graph.connectedComponents().vertices +// Join the connected components with the usernames +val ccByUsername = graph.vertices.innerJoin(cc) { (id, username, cc) => + (username, cc) +} +// Print the result +println(ccByUsername.collect().mkString("\n")) +{% endhighlight %} + ## Shortest Path ## Triangle Counting diff --git a/graphx/data/followers.txt b/graphx/data/followers.txt index 0f46d80806..7bb8e900e2 100644 --- a/graphx/data/followers.txt +++ b/graphx/data/followers.txt @@ -1,10 +1,6 @@ 2 1 -3 1 4 1 -6 1 -3 2 -6 2 -7 2 +1 2 6 3 7 3 7 6 diff --git a/graphx/data/users.txt b/graphx/data/users.txt index ce3d06c600..26e3b3bb4d 100644 --- a/graphx/data/users.txt +++ b/graphx/data/users.txt @@ -1,5 +1,5 @@ 1 BarackObama -2 ericschmidt +2 ladygaga 3 jeresig 4 justinbieber 6 matei_zaharia -- cgit v1.2.3