about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAnkur Dave <ankurdave@gmail.com>2014-01-10 12:37:20 -0800
committerAnkur Dave <ankurdave@gmail.com>2014-01-10 12:37:20 -0800
commit9454fa1f6c714a720197a6db04a6c369e425e507 (patch)
tree12b2a8683fc76d97287e73c8a565ad403497d39f
parent37611e57f6ca1768529e4038198de56f85f90665 (diff)
downloadspark-9454fa1f6c714a720197a6db04a6c369e425e507.tar.gz
spark-9454fa1f6c714a720197a6db04a6c369e425e507.tar.bz2
spark-9454fa1f6c714a720197a6db04a6c369e425e507.zip
Remove duplicate method in GraphLoader and improve docs
-rw-r--r--  graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala  63
1 file changed, 13 insertions, 50 deletions
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala
index 473cfb18cf..3c06a403ea 100644
--- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala
+++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala
@@ -8,51 +8,20 @@ import org.apache.spark.{Logging, SparkContext}
import org.apache.spark.graphx.impl.{EdgePartition, GraphImpl}
import org.apache.spark.util.collection.PrimitiveVector
-
+/**
+ * Provides utilities for loading [[Graph]]s from files.
+ */
object GraphLoader extends Logging {
/**
- * Load an edge list from file initializing the Graph
- *
- * @tparam ED the type of the edge data of the resulting Graph
- *
- * @param sc the SparkContext used to construct RDDs
- * @param path the path to the text file containing the edge list
- * @param edgeParser a function that takes an array of strings and
- * returns an ED object
- * @param minEdgePartitions the number of partitions for the
- * the Edge RDD
+ * Loads a graph from an edge list formatted file where each line contains two integers: a source
+ * id and a target id. Skips lines that begin with `#`.
*
- */
- def textFile[ED: ClassTag](
- sc: SparkContext,
- path: String,
- edgeParser: Array[String] => ED,
- minEdgePartitions: Int = 1):
- Graph[Int, ED] = {
- // Parse the edge data table
- val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter =>
- iter.filter(line => !line.isEmpty && line(0) != '#').map { line =>
- val lineArray = line.split("\\s+")
- if(lineArray.length < 2) {
- println("Invalid line: " + line)
- assert(false)
- }
- val source = lineArray(0).trim.toLong
- val target = lineArray(1).trim.toLong
- val tail = lineArray.drop(2)
- val edata = edgeParser(tail)
- Edge(source, target, edata)
- })
- val defaultVertexAttr = 1
- Graph.fromEdges(edges, defaultVertexAttr)
- }
-
- /**
- * Load a graph from an edge list formatted file with each line containing
- * two integers: a source Id and a target Id.
+ * If desired the edges can be automatically oriented in the positive
+ * direction (source Id < target Id) by setting `canonicalOrientation` to
+ * true.
*
- * @example A file in the following format:
+ * @example Loads a file in the following format:
* {{{
* # Comment Line
* # Source Id <\t> Target Id
@@ -62,25 +31,19 @@ object GraphLoader extends Logging {
* 1 8
* }}}
*
- * If desired the edges can be automatically oriented in the positive
- * direction (source Id < target Id) by setting `canonicalOrientation` to
- * true
- *
* @param sc
- * @param path the path to the file (e.g., /Home/data/file or hdfs://file)
+ * @param path the path to the file (e.g., /home/data/file or hdfs://file)
* @param canonicalOrientation whether to orient edges in the positive
- * direction.
+ * direction
* @param minEdgePartitions the number of partitions for the
- * the Edge RDD
+ * the edge RDD
* @tparam ED
- * @return
*/
def edgeListFile(
sc: SparkContext,
path: String,
canonicalOrientation: Boolean = false,
- minEdgePartitions: Int = 1):
- Graph[Int, Int] = {
+ minEdgePartitions: Int = 1): Graph[Int, Int] = {
val startTime = System.currentTimeMillis
// Parse the edge data table directly into edge partitions