From 9454fa1f6c714a720197a6db04a6c369e425e507 Mon Sep 17 00:00:00 2001 From: Ankur Dave Date: Fri, 10 Jan 2014 12:37:20 -0800 Subject: Remove duplicate method in GraphLoader and improve docs --- .../org/apache/spark/graphx/GraphLoader.scala | 63 +++++----------------- 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala index 473cfb18cf..3c06a403ea 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala @@ -8,51 +8,20 @@ import org.apache.spark.{Logging, SparkContext} import org.apache.spark.graphx.impl.{EdgePartition, GraphImpl} import org.apache.spark.util.collection.PrimitiveVector - +/** + * Provides utilities for loading [[Graph]]s from files. + */ object GraphLoader extends Logging { /** - * Load an edge list from file initializing the Graph - * - * @tparam ED the type of the edge data of the resulting Graph - * - * @param sc the SparkContext used to construct RDDs - * @param path the path to the text file containing the edge list - * @param edgeParser a function that takes an array of strings and - * returns an ED object - * @param minEdgePartitions the number of partitions for the - * the Edge RDD + * Loads a graph from an edge list formatted file where each line contains two integers: a source + * id and a target id. Skips lines that begin with `#`. * - */ - def textFile[ED: ClassTag]( - sc: SparkContext, - path: String, - edgeParser: Array[String] => ED, - minEdgePartitions: Int = 1): - Graph[Int, ED] = { - // Parse the edge data table - val edges = sc.textFile(path, minEdgePartitions).mapPartitions( iter => - iter.filter(line => !line.isEmpty && line(0) != '#').map { line => - val lineArray = line.split("\\s+") - if(lineArray.length < 2) { - println("Invalid line: " + line) - assert(false) - } - val source = lineArray(0).trim.toLong - val target = lineArray(1).trim.toLong - val tail = lineArray.drop(2) - val edata = edgeParser(tail) - Edge(source, target, edata) - }) - val defaultVertexAttr = 1 - Graph.fromEdges(edges, defaultVertexAttr) - } - - /** - * Load a graph from an edge list formatted file with each line containing - * two integers: a source Id and a target Id. + * If desired the edges can be automatically oriented in the positive + * direction (source Id < target Id) by setting `canonicalOrientation` to + * true. * - * @example A file in the following format: + * @example Loads a file in the following format: * {{{ * # Comment Line * # Source Id <\t> Target Id @@ -62,25 +31,19 @@ object GraphLoader extends Logging { * 1 8 * }}} * - * If desired the edges can be automatically oriented in the positive - * direction (source Id < target Id) by setting `canonicalOrientation` to - * true - * * @param sc - * @param path the path to the file (e.g., /Home/data/file or hdfs://file) + * @param path the path to the file (e.g., /home/data/file or hdfs://file) * @param canonicalOrientation whether to orient edges in the positive - * direction. + * direction * @param minEdgePartitions the number of partitions for the - * the Edge RDD + * the edge RDD * @tparam ED - * @return */ def edgeListFile( sc: SparkContext, path: String, canonicalOrientation: Boolean = false, - minEdgePartitions: Int = 1): - Graph[Int, Int] = { + minEdgePartitions: Int = 1): Graph[Int, Int] = { val startTime = System.currentTimeMillis // Parse the edge data table directly into edge partitions -- cgit v1.2.3