A Spark GraphX Graph-Processing Programming Example


 

The graph to be constructed has four user vertices (rxin, jgonzal, franklin, and istoica) connected by collab, advisor, colleague, and pi edges.

The Scala program that builds and queries this graph is as follows:

import org.apache.spark._
import org.apache.spark.graphx._
// To make some of the examples work we will also need RDD
import org.apache.spark.rdd.RDD
object Test {
  def main(args: Array[String]): Unit = {
    // Initialize the SparkContext
    val sc: SparkContext = new SparkContext("local[2]", "Spark Graphx")
    // Create an RDD for the vertices
    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for the edges, carrying the relationships
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user, used whenever an edge refers to a vertex
    // that is missing from the vertex RDD
    val defaultUser = ("John Doe", "Missing")
    // Build the graph
    val graph = Graph(users, relationships, defaultUser)
    // Print the graph's vertices and triplets
    graph.vertices.collect().foreach(println(_))
    graph.triplets.map(triplet => triplet.srcAttr + "----->" + triplet.dstAttr + "    attr:" + triplet.attr)
      .collect().foreach(println(_))
    // Count all the users that are postdocs
    val cnt1 = graph.vertices.filter { case (id, (name, pos)) => pos == "postdoc" }.count
    println("Number of postdoc users: " + cnt1)
    // Count all the edges where the source vertex ID is greater than the destination vertex ID
    val cnt2 = graph.edges.filter(e => e.srcId > e.dstId).count
    println("Number of edges with srcId > dstId: " + cnt2)
    // Compute the in-degree of each vertex
    val inDegrees: VertexRDD[Int] = graph.inDegrees
    inDegrees.collect().foreach(println(_))
    sc.stop()
  }
}
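
On this small data set the results are deterministic (only the order in which collect() returns elements may vary): there is one postdoc (jgonzal), exactly one edge whose source ID is greater than its destination ID (the 5L -> 3L advisor edge), and the in-degrees are 1 for vertices 3 and 5 and 2 for vertex 7; vertex 2 has no incoming edges and therefore does not appear in inDegrees.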

The relevant built-in graph operators are summarized below:

/** Summary of the functionality in the property graph */
class Graph[VD, ED] {
  // Information about the Graph ===================================================================
  val numEdges: Long
  val numVertices: Long
  val inDegrees: VertexRDD[Int]
  val outDegrees: VertexRDD[Int]
  val degrees: VertexRDD[Int]
  // Views of the graph as collections =============================================================
  val vertices: VertexRDD[VD]
  val edges: EdgeRDD[ED]
  val triplets: RDD[EdgeTriplet[VD, ED]]
  // Functions for caching graphs ==================================================================
  def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED]
  def cache(): Graph[VD, ED]
  def unpersistVertices(blocking: Boolean = true): Graph[VD, ED]
  // Change the partitioning heuristic  ============================================================
  def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED]
  // Transform vertex and edge attributes ==========================================================
  def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED]
  def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2]
  def mapEdges[ED2](map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2]
  def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
  def mapTriplets[ED2](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2])
    : Graph[VD, ED2]
  // Modify the graph structure ====================================================================
  def reverse: Graph[VD, ED]
  def subgraph(
      epred: EdgeTriplet[VD,ED] => Boolean = (x => true),
      vpred: (VertexID, VD) => Boolean = ((v, d) => true))
    : Graph[VD, ED]
  def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED]
  def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED]
  // Join RDDs with the graph ======================================================================
  def joinVertices[U](table: RDD[(VertexID, U)])(mapFunc: (VertexID, VD, U) => VD): Graph[VD, ED]
  def outerJoinVertices[U, VD2](other: RDD[(VertexID, U)])
      (mapFunc: (VertexID, VD, Option[U]) => VD2)
    : Graph[VD2, ED]
  // Aggregate information about adjacent triplets =================================================
  def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]]
  def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexID, VD)]]
  def aggregateMessages[Msg: ClassTag](
      sendMsg: EdgeContext[VD, ED, Msg] => Unit,
      mergeMsg: (Msg, Msg) => Msg,
      tripletFields: TripletFields = TripletFields.All)
    : VertexRDD[Msg]
  // Iterative graph-parallel computation ==========================================================
  def pregel[A](initialMsg: A, maxIterations: Int, activeDirection: EdgeDirection)(
      vprog: (VertexID, VD, A) => VD,
      sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID,A)],
      mergeMsg: (A, A) => A)
    : Graph[VD, ED]
  // Basic graph algorithms ========================================================================
  def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double]
  def connectedComponents(): Graph[VertexID, ED]
  def triangleCount(): Graph[Int, ED]
  def stronglyConnectedComponents(numIter: Int): Graph[VertexID, ED]
}
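
A minimal sketch of a few of these operators, assuming the graph value from the example above is already defined (for example, in spark-shell with import org.apache.spark.graphx._ in scope):

// mapVertices: keep only the user name as the vertex attribute
val names: Graph[String, String] = graph.mapVertices((id, attr) => attr._1)
// subgraph: keep only the "advisor" and "pi" edges
val advising = graph.subgraph(epred = triplet => triplet.attr == "advisor" || triplet.attr == "pi")
// aggregateMessages: count incoming edges per vertex (same result as graph.inDegrees)
val inDeg: VertexRDD[Int] = graph.aggregateMessages[Int](
  ctx => ctx.sendToDst(1),   // send a 1 along each edge to its destination vertex
  (a, b) => a + b            // sum the messages received at each vertex
)
// pageRank: iterate until the per-vertex change falls below the tolerance
val ranks = graph.pageRank(0.0001).vertices
ranks.collect().foreach(println(_))

Here 0.0001 is the convergence tolerance (tol); resetProb keeps its default value of 0.15.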

 

Reference:

http://spark.apache.org/docs/latest/graphx-programming-guide.html

