diff options
-rw-r--r-- | core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala | 32 | ||||
-rw-r--r-- | core/src/main/scala/org/apache/spark/broadcast/package.scala | 25 |
2 files changed, 54 insertions, 3 deletions
diff --git a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala index 6bfe2cb4a2..cf0904fc85 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala @@ -17,13 +17,39 @@ package org.apache.spark.broadcast -import java.io._ import java.util.concurrent.atomic.AtomicLong import org.apache.spark._ -private[spark] -abstract class Broadcast[T](private[spark] val id: Long) extends Serializable { +/** + * A broadcast variable. Broadcast variables allow the programmer to keep a read-only variable + * cached on each machine rather than shipping a copy of it with tasks. They can be used, for + * example, to give every node a copy of a large input dataset in an efficient manner. Spark also + * attempts to distribute broadcast variables using efficient broadcast algorithms to reduce + * communication cost. + * + * Broadcast variables are created from a variable `v` by calling [[SparkContext#broadcast]]. + * The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the + * `value` method. The interpreter session below shows this: + * + * {{{ + * scala> val broadcastVar = sc.broadcast(Array(1, 2, 3)) + * broadcastVar: spark.Broadcast[Array[Int]] = spark.Broadcast(b5c40191-a864-4c7d-b9bf-d87e1a4e787c) + * + * scala> broadcastVar.value + * res0: Array[Int] = Array(1, 2, 3) + * }}} + * + * After the broadcast variable is created, it should be used instead of the value `v` in any + * functions run on the cluster so that `v` is not shipped to the nodes more than once. + * In addition, the object `v` should not be modified after it is broadcast in order to ensure + * that all nodes get the same value of the broadcast variable (e.g. if the variable is shipped + * to a new node later). + * + * @param id A unique identifier for the broadcast variable. + * @tparam T Type of the data contained in the broadcast variable. + */ +abstract class Broadcast[T](val id: Long) extends Serializable { def value: T // We cannot have an abstract readObject here due to some weird issues with diff --git a/core/src/main/scala/org/apache/spark/broadcast/package.scala b/core/src/main/scala/org/apache/spark/broadcast/package.scala new file mode 100644 index 0000000000..01bf88629a --- /dev/null +++ b/core/src/main/scala/org/apache/spark/broadcast/package.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +/** + * Package for broadcast variables. See [[broadcast.Broadcast]] for details. + */ +package object broadcast { + // For package docs only +} |