From 5fcd2a61b4f10d039336e2ca3716e99c0d96ded2 Mon Sep 17 00:00:00 2001 From: Ankur Dave Date: Fri, 10 Jan 2014 12:17:04 -0800 Subject: Finish cleaning up Graph docs --- .../main/scala/org/apache/spark/graphx/Graph.scala | 180 ++++++++++----------- 1 file changed, 82 insertions(+), 98 deletions(-) (limited to 'graphx/src') diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala index aeeaddb320..a188de5537 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala @@ -15,8 +15,8 @@ import org.apache.spark.storage.StorageLevel * RDDs, the graph is a functional data-structure in which mutating * operations return new graphs. * - * @note [[org.apache.spark.graphx.GraphOps]] contains additional convenience operations. - * [[org.apache.spark.graphx.algorithms.Algorithms]] contains graph algorithms; to access these, + * @note [[GraphOps]] contains additional convenience operations. + * [[algorithms.Algorithms]] contains graph algorithms; to access these, * import `org.apache.spark.graphx.algorithms._`. * * @tparam VD the vertex attribute type @@ -25,32 +25,31 @@ import org.apache.spark.storage.StorageLevel abstract class Graph[VD: ClassTag, ED: ClassTag] { /** - * Get the vertices and their data. + * An RDD containing the vertices and their associated attributes. * * @note vertex ids are unique. - * @return An RDD containing the vertices in this graph + * @return an RDD containing the vertices in this graph */ val vertices: VertexRDD[VD] /** - * Get the Edges and their data as an RDD. The entries in the RDD - * contain just the source id and target id along with the edge - * data. + * An RDD containing the edges and their associated attributes. The entries in the RDD contain + * just the source id and target id along with the edge data. * - * @return An RDD containing the edges in this graph + * @return an RDD containing the edges in this graph * - * @see [[org.apache.spark.graphx.Edge]] for the edge type. - * @see `triplets` to get an RDD which contains all the edges + * @see [[Edge]] for the edge type. + * @see [[triplets]] to get an RDD which contains all the edges * along with their vertex data. * */ val edges: EdgeRDD[ED] /** - * Get the edges with the vertex data associated with the adjacent - * pair of vertices. + * An RDD containing the edge triplets, which are edges along with the vertex data associated with + * the adjacent vertices. * - * @return An RDD containing edge triplets. + * @return an RDD containing edge triplets * * @example This operation might be used to evaluate a graph * coloring where we would like to check that both vertices are a @@ -68,7 +67,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { val triplets: RDD[EdgeTriplet[VD, ED]] /** - * Cache the vertices and edges associated with this graph. + * Caches the vertices and edges associated with this graph at the specified storage level. * * @param newLevel the level at which to cache the graph. @@ -78,14 +77,14 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] /** - * Return a graph that is cached when first created. This is used to + * Caches the vertices and edges associated with this graph. This is used to * pin a graph in memory enabling multiple queries to reuse the same * construction process. */ def cache(): Graph[VD, ED] /** - * Uncache only the vertices of this graph, leaving the edges alone. This is useful in iterative + * Uncaches only the vertices of this graph, leaving the edges alone. This is useful in iterative * algorithms that modify the vertex attributes but reuse the edges. This method can be used to * uncache the vertex attributes of previous iterations once they are no longer needed, improving * GC performance. @@ -93,23 +92,22 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] /** - * Repartition the edges in the graph according to `partitionStrategy`. + * Repartitions the edges in the graph according to `partitionStrategy`. */ def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] /** - * Compute statistics describing the graph representation. + * Computes statistics describing the graph representation. */ def statistics: Map[String, Any] /** - * Construct a new graph where each vertex value has been - * transformed by the map function. + * Transforms each vertex attribute in the graph using the map function. * * @note The new graph has the same structure. As a consequence the underlying index structures * can be reused. * - * @param map the function from a vertex object to a new vertex value. + * @param map the function from a vertex object to a new vertex value * * @tparam VD2 the new vertex data type * @@ -125,10 +123,9 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { def mapVertices[VD2: ClassTag](map: (VertexID, VD) => VD2): Graph[VD2, ED] /** - * Construct a new graph where the value of each edge is - * transformed by the map operation. This function is not passed - * the vertex value for the vertices adjacent to the edge. If - * vertex values are desired, use the `mapTriplets` method. + * Transforms each edge attribute in the graph using the map function. The map function is not + * passed the vertex value for the vertices adjacent to the edge. If vertex values are desired, + * use [[mapTriplets]]. * * @note This graph is not changed and that the new graph has the * same structure. As a consequence the underlying index structures @@ -147,19 +144,19 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { } /** - * Construct a new graph transforming the value of each edge using the user defined iterator - * transform. The iterator transform is given an iterator over edges within a logical partition - * as well as the partition's ID, and it should return a new iterator over the new values of each - * edge. The new iterator's elements must correspond one-to-one with the old iterator's - * elements. If adjacent vertex values are desired, use the `mapTriplets` method. + * Transforms each edge attribute using the map function, passing it a whole partition at a + * time. The map function is given an iterator over edges within a logical partition as well as + * the partition's ID, and it should return a new iterator over the new values of each edge. The + * new iterator's elements must correspond one-to-one with the old iterator's elements. If + * adjacent vertex values are desired, use [[mapTriplets]]. * * @note This does not change the structure of the * graph or modify the values of this graph. As a consequence * the underlying index structures can be reused. * - * @param map the function which takes a partition id and an iterator - * over all the edges in the partition and must return an iterator over - * the new values for each edge in the order of the input iterator. + * @param map a function that takes a partition id and an iterator + * over all the edges in the partition, and must return an iterator over + * the new values for each edge in the order of the input iterator * * @tparam ED2 the new edge data type * @@ -168,11 +165,8 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] /** - * Construct a new graph where the value of each edge is - * transformed by the map operation. This function passes vertex - * values for the adjacent vertices to the map function. If - * adjacent vertex values are not required, consider using the - * `mapEdges` method instead. + * Transforms each edge attribute using the map function, passing it the adjacent vertex attributes + * as well. If adjacent vertex values are not required, consider using [[mapEdges]] instead. * * @note This does not change the structure of the * graph or modify the values of this graph. As a consequence @@ -196,21 +190,17 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { } /** - * Construct a new graph transforming the value of each edge using - * the user defined iterator transform. The iterator transform is - * given an iterator over edge triplets within a logical partition - * and should yield a new iterator over the new values of each edge - * in the order in which they are provided to the iterator transform - * If adjacent vertex values are not required, consider using the - * mapEdges function instead. - * - * @note This that this does not change the structure of the + * Transforms each edge attribute a partition at a time using the map function, passing it the + * adjacent vertex attributes as well. The map function is given an iterator over edge triplets + * within a logical partition and should yield a new iterator over the new values of each edge in + * the order in which they are provided. If adjacent vertex values are not required, consider + * using [[mapEdges]] instead. + * + * @note This does not change the structure of the * graph or modify the values of this graph. As a consequence * the underlying index structures can be reused. * - * @param map the function which takes a partition id and an iterator - * over all the edges in the partition and must return an iterator over - * the new values for each edge in the order of the input iterator. + * @param map the iterator transform * * @tparam ED2 the new edge data type * @@ -220,86 +210,82 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { Graph[VD, ED2] /** - * Construct a new graph with all the edges reversed. If this graph - * contains an edge from a to b then the returned graph contains an - * edge from b to a. + * Reverses all edges in the graph. If this graph contains an edge from a to b then the returned + * graph contains an edge from b to a. */ def reverse: Graph[VD, ED] /** - * This function takes a vertex and edge predicate and constructs - * the subgraph that consists of vertices and edges that satisfy the - * predict. The resulting graph contains the vertices and edges - * that satisfy: + * Restricts the graph to only the vertices and edges satisfying the predicates. The resulting + * subgraph satisifies * * {{{ * V' = {v : for all v in V where vpred(v)} * E' = {(u,v): for all (u,v) in E where epred((u,v)) && vpred(u) && vpred(v)} * }}} * - * @param epred the edge predicate which takes a triplet and + * @param epred the edge predicate, which takes a triplet and * evaluates to true if the edge is to remain in the subgraph. Note - * that only edges in which both vertices satisfy the vertex + * that only edges where both vertices satisfy the vertex * predicate are considered. * - * @param vpred the vertex predicate which takes a vertex object and + * @param vpred the vertex predicate, which takes a vertex object and * evaluates to true if the vertex is to be included in the subgraph * * @return the subgraph containing only the vertices and edges that - * satisfy the predicates. + * satisfy the predicates */ def subgraph(epred: EdgeTriplet[VD,ED] => Boolean = (x => true), vpred: (VertexID, VD) => Boolean = ((v,d) => true) ): Graph[VD, ED] /** - * Subgraph of this graph with only vertices and edges from the other graph. + * Restricts the graph to only the vertices and edges that are also in `other`, but keeps the + * attributes from this graph. * @param other the graph to project this graph onto - * @return a graph with vertices and edges that exists in both the current graph and other, - * with vertex and edge data from the current graph. + * @return a graph with vertices and edges that exist in both the current graph and `other`, + * with vertex and edge data from the current graph */ def mask[VD2: ClassTag, ED2: ClassTag](other: Graph[VD2, ED2]): Graph[VD, ED] /** - * This function merges multiple edges between two vertices into a single Edge. For correct - * results, the graph must have been partitioned using partitionBy. + * Merges multiple edges between two vertices into a single edge. For correct results, the graph + * must have been partitioned using [[partitionBy]]. * * @tparam ED2 the type of the resulting edge data after grouping. * - * @param f the user supplied commutative associative function to merge edge attributes for + * @param f the user-supplied commutative associative function to merge edge attributes for * duplicate edges. * - * @return Graph[VD,ED2] The resulting graph with a single Edge for each source, dest vertex pair. + * @return The resulting graph with a single edge for each (source, dest) vertex pair. */ def groupEdges(merge: (ED, ED) => ED): Graph[VD,ED] /** - * The mapReduceTriplets function is used to compute statistics - * about the neighboring edges and vertices of each vertex. The - * user supplied `mapFunc` function is invoked on each edge of the - * graph generating 0 or more "messages" to be "sent" to either - * vertex in the edge. The `reduceFunc` is then used to combine the - * output of the map phase destined to each vertex. + * Computes statistics about the neighboring edges and vertices of each vertex. The user supplied + * `mapFunc` function is invoked on each edge of the graph, generating 0 or more "messages" to be + * "sent" to either vertex in the edge. The `reduceFunc` is then used to combine the output of + * the map phase destined to each vertex. * * @tparam A the type of "message" to be sent to each vertex * * @param mapFunc the user defined map function which returns 0 or - * more messages to neighboring vertices. + * more messages to neighboring vertices * * @param reduceFunc the user defined reduce function which should * be commutative and assosciative and is used to combine the output - * of the map phase. + * of the map phase * - * @param activeSet optionally, a set of "active" vertices and a direction of edges to consider + * @param activeSetOpt optionally, a set of "active" vertices and a direction of edges to consider * when running `mapFunc`. For example, if the direction is Out, `mapFunc` will only be run on - * edges originating from vertices in the active set. `activeSet` must have the same index as the - * graph's vertices. + * edges originating from vertices in the active set. The active set must have the same index as + * the graph's vertices. * - * @example We can use this function to compute the inDegree of each + * @example We can use this function to compute the in-degree of each * vertex * {{{ * val rawGraph: Graph[(),()] = Graph.textFile("twittergraph") * val inDeg: RDD[(VertexID, Int)] = - * mapReduceTriplets[Int](et => Array((et.dst.id, 1)), _ + _) + * mapReduceTriplets[Int](et => Iterator((et.dst.id, 1)), _ + _) * }}} * * @note By expressing computation at the edge level we achieve @@ -316,10 +302,9 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { : VertexRDD[A] /** - * Join the vertices with an RDD and then apply a function from the - * the vertex and RDD entry to a new vertex value and type. The - * input table should contain at most one entry for each vertex. If - * no entry is provided the map function is invoked passing none. + * Joins the vertices with entries in the `table` RDD and merges the results using `mapFunc`. The + * input table should contain at most one entry for each vertex. If no entry in `table` is + * provided for a particular vertex in the graph, the map function receives `None`. * * @tparam U the type of entry in the table of updates * @tparam VD2 the new vertex value type @@ -331,12 +316,11 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] { * values. The map function is invoked for all vertices, even those * that do not have a corresponding entry in the table. * - * @example This function is used to update the vertices with new - * values based on external data. For example we could add the out - * degree to each vertex record + * @example This function is used to update the vertices with new values based on external data. + * For example we could add the out-degree to each vertex record: * * {{{ - * val rawGraph: Graph[(),()] = Graph.textFile("webgraph") + * val rawGraph: Graph[_, _] = Graph.textFile("webgraph") * val outDeg: RDD[(VertexID, Int)] = rawGraph.outDegrees() * val graph = rawGraph.outerJoinVertices(outDeg) { * (vid, data, optDeg) => optDeg.getOrElse(0) @@ -364,13 +348,13 @@ object Graph { /** * Construct a graph from a collection of edges encoded as vertex id pairs. * - * @param rawEdges a collection of edges in (src,dst) form. + * @param rawEdges a collection of edges in (src, dst) form * @param uniqueEdges if multiple identical edges are found they are combined and the edge * attribute is set to the sum. Otherwise duplicate edges are treated as separate. To enable - * uniqueEdges, a [[PartitionStrategy]] must be provided. + * `uniqueEdges`, a [[PartitionStrategy]] must be provided. * * @return a graph with edge attributes containing either the count of duplicate edges or 1 - * (if `uniqueEdges=None`) and vertex attributes containing the total degree of each vertex. + * (if `uniqueEdges` is `None`) and vertex attributes containing the total degree of each vertex. */ def fromEdgeTuples[VD: ClassTag]( rawEdges: RDD[(VertexID, VertexID)], @@ -400,10 +384,10 @@ object Graph { } /** - * Construct a graph from a collection attributed vertices and - * edges. Duplicate vertices are picked arbitrarily and + * Construct a graph from a collection of vertices and + * edges with attributes. Duplicate vertices are picked arbitrarily and * vertices found in the edge collection but not in the input - * vertices are the default attribute. + * vertices are assigned the default attribute. * * @tparam VD the vertex attribute type * @tparam ED the edge attribute type @@ -412,7 +396,7 @@ object Graph { * @param defaultVertexAttr the default vertex attribute to use for * vertices that are mentioned in edges but not in vertices * @param partitionStrategy the partition strategy to use when - * partitioning the edges. + * partitioning the edges */ def apply[VD: ClassTag, ED: ClassTag]( vertices: RDD[(VertexID, VD)], @@ -422,10 +406,10 @@ object Graph { } /** - * The implicit graphToGraphOPs function extracts the GraphOps member from a graph. + * Implicitly extracts the [[GraphOps]] member from a graph. * * To improve modularity the Graph type only contains a small set of basic operations. All the - * convenience operations are defined in the GraphOps class which may be shared across multiple + * convenience operations are defined in the [[GraphOps]] class which may be shared across multiple * graph implementations. */ implicit def graphToGraphOps[VD: ClassTag, ED: ClassTag](g: Graph[VD, ED]) = g.ops -- cgit v1.2.3