author    Stephen Haberman <stephen@exigencecorp.com>  2013-01-08 09:10:10 -0600
committer Stephen Haberman <stephen@exigencecorp.com>  2013-01-08 09:10:10 -0600
commit    4ee6b227757a0670b548bb101310e68aa2741d75 (patch)
tree      efebc6af4ef3439139325aac7160fb8fb2eaffec /core
parent    8dc06069fe2330c3ee0fcaaeb0ae6e627a5887c3 (diff)
parent    a37adfa67bac51b2630c6e1673f8607a87273402 (diff)
Merge branch 'master' into tupleBy
Conflicts:
    core/src/test/scala/spark/RDDSuite.scala
Diffstat (limited to 'core')
-rw-r--r--  core/src/main/scala/spark/Accumulators.scala                  |  38
-rw-r--r--  core/src/main/scala/spark/BoundedMemoryCache.scala            | 118
-rw-r--r--  core/src/main/scala/spark/PairRDDFunctions.scala              |  10
-rw-r--r--  core/src/main/scala/spark/RDD.scala                           |   7
-rw-r--r--  core/src/main/scala/spark/SequenceFileRDDFunctions.scala      |   8
-rw-r--r--  core/src/main/scala/spark/SizeEstimator.scala                 |  13
-rw-r--r--  core/src/main/scala/spark/SparkContext.scala                  |   7
-rw-r--r--  core/src/main/scala/spark/api/java/JavaPairRDD.scala          |  10
-rw-r--r--  core/src/main/scala/spark/api/java/JavaSparkContext.scala     |  36
-rw-r--r--  core/src/main/scala/spark/deploy/worker/WorkerArguments.scala |  22
-rw-r--r--  core/src/test/scala/spark/BoundedMemoryCacheSuite.scala       |  58
-rw-r--r--  core/src/test/scala/spark/JavaAPISuite.java                   |  48
-rw-r--r--  core/src/test/scala/spark/RDDSuite.scala                      |   1
-rw-r--r--  core/src/test/scala/spark/ShuffleSuite.scala                  |   7
-rw-r--r--  core/src/test/scala/spark/SizeEstimatorSuite.scala            |  48
15 files changed, 208 insertions(+), 223 deletions(-)
diff --git a/core/src/main/scala/spark/Accumulators.scala b/core/src/main/scala/spark/Accumulators.scala
index bacd0ace37..b644aba5f8 100644
--- a/core/src/main/scala/spark/Accumulators.scala
+++ b/core/src/main/scala/spark/Accumulators.scala
@@ -39,19 +39,36 @@ class Accumulable[R, T] (
def += (term: T) { value_ = param.addAccumulator(value_, term) }
/**
+ * Add more data to this accumulator / accumulable
+ * @param term the data to add
+ */
+ def add(term: T) { value_ = param.addAccumulator(value_, term) }
+
+ /**
* Merge two accumulable objects together
- *
+ *
* Normally, a user will not want to use this version, but will instead call `+=`.
- * @param term the other Accumulable that will get merged with this
+ * @param term the other `R` that will get merged with this
*/
def ++= (term: R) { value_ = param.addInPlace(value_, term)}
/**
+ * Merge two accumulable objects together
+ *
+ * Normally, a user will not want to use this version, but will instead call `add`.
+ * @param term the other `R` that will get merged with this
+ */
+ def merge(term: R) { value_ = param.addInPlace(value_, term)}
+
+ /**
* Access the accumulator's current value; only allowed on master.
*/
- def value = {
- if (!deserialized) value_
- else throw new UnsupportedOperationException("Can't read accumulator value in task")
+ def value: R = {
+ if (!deserialized) {
+ value_
+ } else {
+ throw new UnsupportedOperationException("Can't read accumulator value in task")
+ }
}
/**
@@ -68,10 +85,17 @@ class Accumulable[R, T] (
/**
* Set the accumulator's value; only allowed on master.
*/
- def value_= (r: R) {
- if (!deserialized) value_ = r
+ def value_= (newValue: R) {
+ if (!deserialized) value_ = newValue
else throw new UnsupportedOperationException("Can't assign accumulator value in task")
}
+
+ /**
+ * Set the accumulator's value; only allowed on master
+ */
+ def setValue(newValue: R) {
+ this.value = newValue
+ }
// Called by Java when deserializing an object
private def readObject(in: ObjectInputStream) {
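
A minimal usage sketch, not part of the patch, showing how the new method-style API lines up with the existing operators: `add` mirrors `+=`, `merge` mirrors `++=`, and `setValue` mirrors the `value_=` setter. It assumes a local SparkContext like the one the test suites build.

    import spark.SparkContext
    import spark.SparkContext._  // brings IntAccumulatorParam into implicit scope

    val sc = new SparkContext("local", "accumulator-sketch")
    val acc = sc.accumulator(0)                      // Accumulator[Int], an Accumulable
    sc.parallelize(1 to 4).foreach(x => acc.add(x))  // same effect as acc += x inside a task
    assert(acc.value == 10)                          // value is only readable on the master
    acc.merge(5)                                     // same effect as acc ++= 5 on the master
    acc.setValue(0)                                  // equivalent to acc.value = 0 on the master
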
diff --git a/core/src/main/scala/spark/BoundedMemoryCache.scala b/core/src/main/scala/spark/BoundedMemoryCache.scala
deleted file mode 100644
index e8392a194f..0000000000
--- a/core/src/main/scala/spark/BoundedMemoryCache.scala
+++ /dev/null
@@ -1,118 +0,0 @@
-package spark
-
-import java.util.LinkedHashMap
-
-/**
- * An implementation of Cache that estimates the sizes of its entries and attempts to limit its
- * total memory usage to a fraction of the JVM heap. Objects' sizes are estimated using
- * SizeEstimator, which has limitations; most notably, we will overestimate total memory used if
- * some cache entries have pointers to a shared object. Nonetheless, this Cache should work well
- * when most of the space is used by arrays of primitives or of simple classes.
- */
-private[spark] class BoundedMemoryCache(maxBytes: Long) extends Cache with Logging {
- logInfo("BoundedMemoryCache.maxBytes = " + maxBytes)
-
- def this() {
- this(BoundedMemoryCache.getMaxBytes)
- }
-
- private var currentBytes = 0L
- private val map = new LinkedHashMap[(Any, Int), Entry](32, 0.75f, true)
-
- override def get(datasetId: Any, partition: Int): Any = {
- synchronized {
- val entry = map.get((datasetId, partition))
- if (entry != null) {
- entry.value
- } else {
- null
- }
- }
- }
-
- override def put(datasetId: Any, partition: Int, value: Any): CachePutResponse = {
- val key = (datasetId, partition)
- logInfo("Asked to add key " + key)
- val size = estimateValueSize(key, value)
- synchronized {
- if (size > getCapacity) {
- return CachePutFailure()
- } else if (ensureFreeSpace(datasetId, size)) {
- logInfo("Adding key " + key)
- map.put(key, new Entry(value, size))
- currentBytes += size
- logInfo("Number of entries is now " + map.size)
- return CachePutSuccess(size)
- } else {
- logInfo("Didn't add key " + key + " because we would have evicted part of same dataset")
- return CachePutFailure()
- }
- }
- }
-
- override def getCapacity: Long = maxBytes
-
- /**
- * Estimate sizeOf 'value'
- */
- private def estimateValueSize(key: (Any, Int), value: Any) = {
- val startTime = System.currentTimeMillis
- val size = SizeEstimator.estimate(value.asInstanceOf[AnyRef])
- val timeTaken = System.currentTimeMillis - startTime
- logInfo("Estimated size for key %s is %d".format(key, size))
- logInfo("Size estimation for key %s took %d ms".format(key, timeTaken))
- size
- }
-
- /**
- * Remove least recently used entries from the map until at least space bytes are free, in order
- * to make space for a partition from the given dataset ID. If this cannot be done without
- * evicting other data from the same dataset, returns false; otherwise, returns true. Assumes
- * that a lock is held on the BoundedMemoryCache.
- */
- private def ensureFreeSpace(datasetId: Any, space: Long): Boolean = {
- logInfo("ensureFreeSpace(%s, %d) called with curBytes=%d, maxBytes=%d".format(
- datasetId, space, currentBytes, maxBytes))
- val iter = map.entrySet.iterator // Will give entries in LRU order
- while (maxBytes - currentBytes < space && iter.hasNext) {
- val mapEntry = iter.next()
- val (entryDatasetId, entryPartition) = mapEntry.getKey
- if (entryDatasetId == datasetId) {
- // Cannot make space without removing part of the same dataset, or a more recently used one
- return false
- }
- reportEntryDropped(entryDatasetId, entryPartition, mapEntry.getValue)
- currentBytes -= mapEntry.getValue.size
- iter.remove()
- }
- return true
- }
-
- protected def reportEntryDropped(datasetId: Any, partition: Int, entry: Entry) {
- logInfo("Dropping key (%s, %d) of size %d to make space".format(datasetId, partition, entry.size))
- // TODO: remove BoundedMemoryCache
-
- val (keySpaceId, innerDatasetId) = datasetId.asInstanceOf[(Any, Any)]
- innerDatasetId match {
- case rddId: Int =>
- SparkEnv.get.cacheTracker.dropEntry(rddId, partition)
- case broadcastUUID: java.util.UUID =>
- // TODO: Maybe something should be done if the broadcasted variable falls out of cache
- case _ =>
- }
- }
-}
-
-// An entry in our map; stores a cached object and its size in bytes
-private[spark] case class Entry(value: Any, size: Long)
-
-private[spark] object BoundedMemoryCache {
- /**
- * Get maximum cache capacity from system configuration
- */
- def getMaxBytes: Long = {
- val memoryFractionToUse = System.getProperty("spark.boundedMemoryCache.memoryFraction", "0.66").toDouble
- (Runtime.getRuntime.maxMemory * memoryFractionToUse).toLong
- }
-}
-
diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala
index 413c944a66..ce48cea903 100644
--- a/core/src/main/scala/spark/PairRDDFunctions.scala
+++ b/core/src/main/scala/spark/PairRDDFunctions.scala
@@ -615,6 +615,16 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest](
writer.cleanup()
}
+ /**
+ * Return an RDD with the keys of each tuple.
+ */
+ def keys: RDD[K] = self.map(_._1)
+
+ /**
+ * Return an RDD with the values of each tuple.
+ */
+ def values: RDD[V] = self.map(_._2)
+
private[spark] def getKeyClass() = implicitly[ClassManifest[K]].erasure
private[spark] def getValueClass() = implicitly[ClassManifest[V]].erasure
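
A short sketch of the new `keys` and `values` projections; only the local SparkContext setup is assumed, the methods themselves are the ones added above.

    import spark.SparkContext
    import spark.SparkContext._  // implicit conversion from RDD[(K, V)] to PairRDDFunctions

    val sc = new SparkContext("local", "keys-values-sketch")
    val pairs = sc.parallelize(Seq((1, "a"), (2, "b")))
    assert(pairs.keys.collect().toList == List(1, 2))        // RDD[Int]
    assert(pairs.values.collect().toList == List("a", "b"))  // RDD[String]
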
diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala
index 5ce524c0e7..3b9ced1946 100644
--- a/core/src/main/scala/spark/RDD.scala
+++ b/core/src/main/scala/spark/RDD.scala
@@ -330,6 +330,13 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
def toArray(): Array[T] = collect()
/**
+ * Return an RDD that contains all matching values by applying `f`.
+ */
+ def collect[U: ClassManifest](f: PartialFunction[T, U]): RDD[U] = {
+ filter(f.isDefinedAt).map(f)
+ }
+
+ /**
* Reduces the elements of this RDD using the specified associative binary operator.
*/
def reduce(f: (T, T) => T): T = {
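
A small sketch of the new partial-function overload of `collect`, which filters and maps in one call; the setup mirrors RDDSuite and is assumed, not prescribed.

    import spark.SparkContext

    val sc = new SparkContext("local", "collect-sketch")
    val nums = sc.parallelize(List(1, 2, 3, 4))
    // Keep only the elements the partial function is defined at, then apply it.
    val bigAsStrings = nums.collect({ case i if i >= 3 => i.toString })
    assert(bigAsStrings.collect().toList == List("3", "4"))  // the no-arg collect() still gathers results
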
diff --git a/core/src/main/scala/spark/SequenceFileRDDFunctions.scala b/core/src/main/scala/spark/SequenceFileRDDFunctions.scala
index a34aee69c1..6b4a11d6d3 100644
--- a/core/src/main/scala/spark/SequenceFileRDDFunctions.scala
+++ b/core/src/main/scala/spark/SequenceFileRDDFunctions.scala
@@ -42,7 +42,13 @@ class SequenceFileRDDFunctions[K <% Writable: ClassManifest, V <% Writable : Cla
if (classOf[Writable].isAssignableFrom(classManifest[T].erasure)) {
classManifest[T].erasure
} else {
- implicitly[T => Writable].getClass.getMethods()(0).getReturnType
+ // We get the type of the Writable class by looking at the apply method which converts
+ // from T to Writable. Since we have two apply methods we filter out the one which
+ // is of the form "java.lang.Object apply(java.lang.Object)"
+ implicitly[T => Writable].getClass.getDeclaredMethods().filter(
+ m => m.getReturnType().toString != "java.lang.Object" &&
+ m.getName() == "apply")(0).getReturnType
+
}
// TODO: use something like WritableConverter to avoid reflection
}
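
The two `apply` methods exist because erasure gives a Scala Function1 subclass a synthetic bridge `apply(Object): Object` next to the specific method. A hedged, standalone sketch of that effect; the conversion below is hypothetical, and the behaviour shown is that of the Scala 2.9-era compiler this codebase uses.

    import org.apache.hadoop.io.Text

    val toWritable: String => Text = (s: String) => new Text(s)
    toWritable.getClass.getDeclaredMethods
      .filter(_.getName == "apply")
      .foreach(m => println(m.getReturnType.getName))
    // Expect both java.lang.Object (the erased bridge) and org.apache.hadoop.io.Text;
    // the patch keeps only the apply whose return type is not java.lang.Object.
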
diff --git a/core/src/main/scala/spark/SizeEstimator.scala b/core/src/main/scala/spark/SizeEstimator.scala
index 7c3e8640e9..d4e1157250 100644
--- a/core/src/main/scala/spark/SizeEstimator.scala
+++ b/core/src/main/scala/spark/SizeEstimator.scala
@@ -9,7 +9,6 @@ import java.util.Random
import javax.management.MBeanServer
import java.lang.management.ManagementFactory
-import com.sun.management.HotSpotDiagnosticMXBean
import scala.collection.mutable.ArrayBuffer
@@ -76,12 +75,20 @@ private[spark] object SizeEstimator extends Logging {
if (System.getProperty("spark.test.useCompressedOops") != null) {
return System.getProperty("spark.test.useCompressedOops").toBoolean
}
+
try {
val hotSpotMBeanName = "com.sun.management:type=HotSpotDiagnostic"
val server = ManagementFactory.getPlatformMBeanServer()
+
+ // NOTE: This should throw an exception in non-Sun JVMs
+ val hotSpotMBeanClass = Class.forName("com.sun.management.HotSpotDiagnosticMXBean")
+ val getVMMethod = hotSpotMBeanClass.getDeclaredMethod("getVMOption",
+ Class.forName("java.lang.String"))
+
val bean = ManagementFactory.newPlatformMXBeanProxy(server,
- hotSpotMBeanName, classOf[HotSpotDiagnosticMXBean])
- return bean.getVMOption("UseCompressedOops").getValue.toBoolean
+ hotSpotMBeanName, hotSpotMBeanClass)
+ // TODO: We could use reflection on the VMOption returned ?
+ return getVMMethod.invoke(bean, "UseCompressedOops").toString.contains("true")
} catch {
case e: Exception => {
// Guess whether they've enabled UseCompressedOops based on whether maxMemory < 32 GB
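
A standalone sketch of the same reflective probe, which avoids a compile-time dependency on the HotSpot-only com.sun.management classes; names outside the JDK calls are illustrative.

    import java.lang.management.ManagementFactory

    val compressedOops: Option[Boolean] =
      try {
        val server = ManagementFactory.getPlatformMBeanServer()
        val beanClass = Class.forName("com.sun.management.HotSpotDiagnosticMXBean")
        val getVMOption = beanClass.getDeclaredMethod("getVMOption", classOf[String])
        val bean = ManagementFactory.newPlatformMXBeanProxy(
          server, "com.sun.management:type=HotSpotDiagnostic", beanClass)
        Some(getVMOption.invoke(bean, "UseCompressedOops").toString.contains("true"))
      } catch {
        // Non-HotSpot JVMs won't have this bean; fall back to a heuristic, as the patch does.
        case e: Exception => None
      }
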
diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala
index 4fd81bc63b..bbf8272eb3 100644
--- a/core/src/main/scala/spark/SparkContext.scala
+++ b/core/src/main/scala/spark/SparkContext.scala
@@ -382,11 +382,12 @@ class SparkContext(
new Accumulator(initialValue, param)
/**
- * Create an [[spark.Accumulable]] shared variable, with a `+=` method
+ * Create an [[spark.Accumulable]] shared variable, to which tasks can add values with `+=`.
+ * Only the master can access the accumulable's `value`.
* @tparam T accumulator type
* @tparam R type that can be added to the accumulator
*/
- def accumulable[T,R](initialValue: T)(implicit param: AccumulableParam[T,R]) =
+ def accumulable[T, R](initialValue: T)(implicit param: AccumulableParam[T, R]) =
new Accumulable(initialValue, param)
/**
@@ -404,7 +405,7 @@ class SparkContext(
* Broadcast a read-only variable to the cluster, returning a [[spark.Broadcast]] object for
* reading it in distributed functions. The variable will be sent to each cluster only once.
*/
- def broadcast[T](value: T) = env.broadcastManager.newBroadcast[T] (value, isLocal)
+ def broadcast[T](value: T) = env.broadcastManager.newBroadcast[T](value, isLocal)
/**
* Add a file to be downloaded into the working directory of this Spark job on every node.
diff --git a/core/src/main/scala/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/spark/api/java/JavaPairRDD.scala
index 5c2be534ff..8ce32e0e2f 100644
--- a/core/src/main/scala/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/spark/api/java/JavaPairRDD.scala
@@ -471,6 +471,16 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif
implicit def toOrdered(x: K): Ordered[K] = new KeyOrdering(x)
fromRDD(new OrderedRDDFunctions(rdd).sortByKey(ascending))
}
+
+ /**
+ * Return an RDD with the keys of each tuple.
+ */
+ def keys(): JavaRDD[K] = JavaRDD.fromRDD[K](rdd.map(_._1))
+
+ /**
+ * Return an RDD with the values of each tuple.
+ */
+ def values(): JavaRDD[V] = JavaRDD.fromRDD[V](rdd.map(_._2))
}
object JavaPairRDD {
diff --git a/core/src/main/scala/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/spark/api/java/JavaSparkContext.scala
index b7725313c4..88ab2846be 100644
--- a/core/src/main/scala/spark/api/java/JavaSparkContext.scala
+++ b/core/src/main/scala/spark/api/java/JavaSparkContext.scala
@@ -10,7 +10,7 @@ import org.apache.hadoop.mapred.InputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
-import spark.{Accumulator, AccumulatorParam, RDD, SparkContext}
+import spark.{Accumulable, AccumulableParam, Accumulator, AccumulatorParam, RDD, SparkContext}
import spark.SparkContext.IntAccumulatorParam
import spark.SparkContext.DoubleAccumulatorParam
import spark.broadcast.Broadcast
@@ -265,26 +265,46 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork
/**
* Create an [[spark.Accumulator]] integer variable, which tasks can "add" values
- * to using the `+=` method. Only the master can access the accumulator's `value`.
+ * to using the `add` method. Only the master can access the accumulator's `value`.
*/
- def intAccumulator(initialValue: Int): Accumulator[Int] =
- sc.accumulator(initialValue)(IntAccumulatorParam)
+ def intAccumulator(initialValue: Int): Accumulator[java.lang.Integer] =
+ sc.accumulator(initialValue)(IntAccumulatorParam).asInstanceOf[Accumulator[java.lang.Integer]]
/**
* Create an [[spark.Accumulator]] double variable, which tasks can "add" values
- * to using the `+=` method. Only the master can access the accumulator's `value`.
+ * to using the `add` method. Only the master can access the accumulator's `value`.
*/
- def doubleAccumulator(initialValue: Double): Accumulator[Double] =
- sc.accumulator(initialValue)(DoubleAccumulatorParam)
+ def doubleAccumulator(initialValue: Double): Accumulator[java.lang.Double] =
+ sc.accumulator(initialValue)(DoubleAccumulatorParam).asInstanceOf[Accumulator[java.lang.Double]]
+
+ /**
+ * Create an [[spark.Accumulator]] integer variable, which tasks can "add" values
+ * to using the `add` method. Only the master can access the accumulator's `value`.
+ */
+ def accumulator(initialValue: Int): Accumulator[java.lang.Integer] = intAccumulator(initialValue)
+
+ /**
+ * Create an [[spark.Accumulator]] double variable, which tasks can "add" values
+ * to using the `add` method. Only the master can access the accumulator's `value`.
+ */
+ def accumulator(initialValue: Double): Accumulator[java.lang.Double] =
+ doubleAccumulator(initialValue)
/**
* Create an [[spark.Accumulator]] variable of a given type, which tasks can "add" values
- * to using the `+=` method. Only the master can access the accumulator's `value`.
+ * to using the `add` method. Only the master can access the accumulator's `value`.
*/
def accumulator[T](initialValue: T, accumulatorParam: AccumulatorParam[T]): Accumulator[T] =
sc.accumulator(initialValue)(accumulatorParam)
/**
+ * Create an [[spark.Accumulable]] shared variable of the given type, to which tasks can
+ * "add" values with `add`. Only the master can access the accumuable's `value`.
+ */
+ def accumulable[T, R](initialValue: T, param: AccumulableParam[T, R]): Accumulable[T, R] =
+ sc.accumulable(initialValue)(param)
+
+ /**
* Broadcast a read-only variable to the cluster, returning a [[spark.Broadcast]] object for
* reading it in distributed functions. The variable will be sent to each cluster only once.
*/
diff --git a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
index 340920025b..37524a7c82 100644
--- a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
+++ b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala
@@ -104,9 +104,25 @@ private[spark] class WorkerArguments(args: Array[String]) {
}
def inferDefaultMemory(): Int = {
- val bean = ManagementFactory.getOperatingSystemMXBean
- .asInstanceOf[com.sun.management.OperatingSystemMXBean]
- val totalMb = (bean.getTotalPhysicalMemorySize / 1024 / 1024).toInt
+ val ibmVendor = System.getProperty("java.vendor").contains("IBM")
+ var totalMb = 0
+ try {
+ val bean = ManagementFactory.getOperatingSystemMXBean()
+ if (ibmVendor) {
+ val beanClass = Class.forName("com.ibm.lang.management.OperatingSystemMXBean")
+ val method = beanClass.getDeclaredMethod("getTotalPhysicalMemory")
+ totalMb = (method.invoke(bean).asInstanceOf[Long] / 1024 / 1024).toInt
+ } else {
+ val beanClass = Class.forName("com.sun.management.OperatingSystemMXBean")
+ val method = beanClass.getDeclaredMethod("getTotalPhysicalMemorySize")
+ totalMb = (method.invoke(bean).asInstanceOf[Long] / 1024 / 1024).toInt
+ }
+ } catch {
+ case e: Exception => {
+ totalMb = 2*1024
+ System.out.println("Failed to get total physical memory. Using " + totalMb + " MB")
+ }
+ }
// Leave out 1 GB for the operating system, but don't return a negative memory size
math.max(totalMb - 1024, 512)
}
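
A standalone sketch of the vendor-aware memory lookup the patch adds; the helper name is hypothetical and only the JDK calls and vendor bean classes are taken from the patch.

    import java.lang.management.ManagementFactory

    def totalPhysicalMemoryMb(): Int = {
      val bean = ManagementFactory.getOperatingSystemMXBean()
      val ibmVendor = System.getProperty("java.vendor").contains("IBM")
      try {
        val (className, methodName) =
          if (ibmVendor) ("com.ibm.lang.management.OperatingSystemMXBean", "getTotalPhysicalMemory")
          else ("com.sun.management.OperatingSystemMXBean", "getTotalPhysicalMemorySize")
        val method = Class.forName(className).getDeclaredMethod(methodName)
        (method.invoke(bean).asInstanceOf[Long] / 1024 / 1024).toInt
      } catch {
        // Any other JVM: assume 2 GB rather than failing worker startup.
        case e: Exception => 2 * 1024
      }
    }
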
diff --git a/core/src/test/scala/spark/BoundedMemoryCacheSuite.scala b/core/src/test/scala/spark/BoundedMemoryCacheSuite.scala
deleted file mode 100644
index 37cafd1e8e..0000000000
--- a/core/src/test/scala/spark/BoundedMemoryCacheSuite.scala
+++ /dev/null
@@ -1,58 +0,0 @@
-package spark
-
-import org.scalatest.FunSuite
-import org.scalatest.PrivateMethodTester
-import org.scalatest.matchers.ShouldMatchers
-
-// TODO: Replace this with a test of MemoryStore
-class BoundedMemoryCacheSuite extends FunSuite with PrivateMethodTester with ShouldMatchers {
- test("constructor test") {
- val cache = new BoundedMemoryCache(60)
- expect(60)(cache.getCapacity)
- }
-
- test("caching") {
- // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case
- val oldArch = System.setProperty("os.arch", "amd64")
- val oldOops = System.setProperty("spark.test.useCompressedOops", "true")
- val initialize = PrivateMethod[Unit]('initialize)
- SizeEstimator invokePrivate initialize()
-
- val cache = new BoundedMemoryCache(60) {
- //TODO sorry about this, but there is not better way how to skip 'cacheTracker.dropEntry'
- override protected def reportEntryDropped(datasetId: Any, partition: Int, entry: Entry) {
- logInfo("Dropping key (%s, %d) of size %d to make space".format(datasetId, partition, entry.size))
- }
- }
-
- // NOTE: The String class definition changed in JDK 7 to exclude the int fields count and length
- // This means that the size of strings will be lesser by 8 bytes in JDK 7 compared to JDK 6.
- // http://mail.openjdk.java.net/pipermail/core-libs-dev/2012-May/010257.html
- // Work around to check for either.
-
- //should be OK
- cache.put("1", 0, "Meh") should (equal (CachePutSuccess(56)) or equal (CachePutSuccess(48)))
-
- //we cannot add this to cache (there is not enough space in cache) & we cannot evict the only value from
- //cache because it's from the same dataset
- expect(CachePutFailure())(cache.put("1", 1, "Meh"))
-
- //should be OK, dataset '1' can be evicted from cache
- cache.put("2", 0, "Meh") should (equal (CachePutSuccess(56)) or equal (CachePutSuccess(48)))
-
- //should fail, cache should obey it's capacity
- expect(CachePutFailure())(cache.put("3", 0, "Very_long_and_useless_string"))
-
- if (oldArch != null) {
- System.setProperty("os.arch", oldArch)
- } else {
- System.clearProperty("os.arch")
- }
-
- if (oldOops != null) {
- System.setProperty("spark.test.useCompressedOops", oldOops)
- } else {
- System.clearProperty("spark.test.useCompressedOops")
- }
- }
-}
diff --git a/core/src/test/scala/spark/JavaAPISuite.java b/core/src/test/scala/spark/JavaAPISuite.java
index 33d5fc2d89..0817d1146c 100644
--- a/core/src/test/scala/spark/JavaAPISuite.java
+++ b/core/src/test/scala/spark/JavaAPISuite.java
@@ -581,4 +581,52 @@ public class JavaAPISuite implements Serializable {
JavaPairRDD<Integer, Double> zipped = rdd.zip(doubles);
zipped.count();
}
+
+ @Test
+ public void accumulators() {
+ JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
+
+ final Accumulator<Integer> intAccum = sc.accumulator(10);
+ rdd.foreach(new VoidFunction<Integer>() {
+ public void call(Integer x) {
+ intAccum.add(x);
+ }
+ });
+ Assert.assertEquals((Integer) 25, intAccum.value());
+
+ final Accumulator<Double> doubleAccum = sc.accumulator(10.0);
+ rdd.foreach(new VoidFunction<Integer>() {
+ public void call(Integer x) {
+ doubleAccum.add((double) x);
+ }
+ });
+ Assert.assertEquals((Double) 25.0, doubleAccum.value());
+
+ // Try a custom accumulator type
+ AccumulatorParam<Float> floatAccumulatorParam = new AccumulatorParam<Float>() {
+ public Float addInPlace(Float r, Float t) {
+ return r + t;
+ }
+
+ public Float addAccumulator(Float r, Float t) {
+ return r + t;
+ }
+
+ public Float zero(Float initialValue) {
+ return 0.0f;
+ }
+ };
+
+ final Accumulator<Float> floatAccum = sc.accumulator((Float) 10.0f, floatAccumulatorParam);
+ rdd.foreach(new VoidFunction<Integer>() {
+ public void call(Integer x) {
+ floatAccum.add((float) x);
+ }
+ });
+ Assert.assertEquals((Float) 25.0f, floatAccum.value());
+
+ // Test the setValue method
+ floatAccum.setValue(5.0f);
+ Assert.assertEquals((Float) 5.0f, floatAccum.value());
+ }
}
diff --git a/core/src/test/scala/spark/RDDSuite.scala b/core/src/test/scala/spark/RDDSuite.scala
index 77bff8aba1..d74e9786c3 100644
--- a/core/src/test/scala/spark/RDDSuite.scala
+++ b/core/src/test/scala/spark/RDDSuite.scala
@@ -35,6 +35,7 @@ class RDDSuite extends FunSuite with BeforeAndAfter {
assert(nums.flatMap(x => 1 to x).collect().toList === List(1, 1, 2, 1, 2, 3, 1, 2, 3, 4))
assert(nums.union(nums).collect().toList === List(1, 2, 3, 4, 1, 2, 3, 4))
assert(nums.glom().map(_.toList).collect().toList === List(List(1, 2), List(3, 4)))
+ assert(nums.collect({ case i if i >= 3 => i.toString }).collect().toList === List("3", "4"))
assert(nums.keyBy(_.toString).collect().toList === List(("1", 1), ("2", 2), ("3", 3), ("4", 4)))
val partitionSums = nums.mapPartitions(iter => Iterator(iter.reduceLeft(_ + _)))
assert(partitionSums.collect().toList === List(3, 7))
diff --git a/core/src/test/scala/spark/ShuffleSuite.scala b/core/src/test/scala/spark/ShuffleSuite.scala
index 8170100f1d..bebb8ebe86 100644
--- a/core/src/test/scala/spark/ShuffleSuite.scala
+++ b/core/src/test/scala/spark/ShuffleSuite.scala
@@ -216,6 +216,13 @@ class ShuffleSuite extends FunSuite with ShouldMatchers with BeforeAndAfter {
// Test that a shuffle on the file works, because this used to be a bug
assert(file.map(line => (line, 1)).reduceByKey(_ + _).collect().toList === Nil)
}
+
+ test("keys and values") {
+ sc = new SparkContext("local", "test")
+ val rdd = sc.parallelize(Array((1, "a"), (2, "b")))
+ assert(rdd.keys.collect().toList === List(1, 2))
+ assert(rdd.values.collect().toList === List("a", "b"))
+ }
}
object ShuffleSuite {
diff --git a/core/src/test/scala/spark/SizeEstimatorSuite.scala b/core/src/test/scala/spark/SizeEstimatorSuite.scala
index 17f366212b..e235ef2f67 100644
--- a/core/src/test/scala/spark/SizeEstimatorSuite.scala
+++ b/core/src/test/scala/spark/SizeEstimatorSuite.scala
@@ -3,7 +3,6 @@ package spark
import org.scalatest.FunSuite
import org.scalatest.BeforeAndAfterAll
import org.scalatest.PrivateMethodTester
-import org.scalatest.matchers.ShouldMatchers
class DummyClass1 {}
@@ -20,8 +19,17 @@ class DummyClass4(val d: DummyClass3) {
val x: Int = 0
}
+object DummyString {
+ def apply(str: String) : DummyString = new DummyString(str.toArray)
+}
+class DummyString(val arr: Array[Char]) {
+ override val hashCode: Int = 0
+ // JDK-7 has an extra hash32 field http://hg.openjdk.java.net/jdk7u/jdk7u6/jdk/rev/11987e85555f
+ @transient val hash32: Int = 0
+}
+
class SizeEstimatorSuite
- extends FunSuite with BeforeAndAfterAll with PrivateMethodTester with ShouldMatchers {
+ extends FunSuite with BeforeAndAfterAll with PrivateMethodTester {
var oldArch: String = _
var oldOops: String = _
@@ -45,15 +53,13 @@ class SizeEstimatorSuite
expect(48)(SizeEstimator.estimate(new DummyClass4(new DummyClass3)))
}
- // NOTE: The String class definition changed in JDK 7 to exclude the int fields count and length.
- // This means that the size of strings will be lesser by 8 bytes in JDK 7 compared to JDK 6.
- // http://mail.openjdk.java.net/pipermail/core-libs-dev/2012-May/010257.html
- // Work around to check for either.
+ // NOTE: The String class definition varies across JDK versions (1.6 vs. 1.7) and vendors
+ // (Sun vs IBM). Use a DummyString class to make tests deterministic.
test("strings") {
- SizeEstimator.estimate("") should (equal (48) or equal (40))
- SizeEstimator.estimate("a") should (equal (56) or equal (48))
- SizeEstimator.estimate("ab") should (equal (56) or equal (48))
- SizeEstimator.estimate("abcdefgh") should (equal(64) or equal(56))
+ expect(40)(SizeEstimator.estimate(DummyString("")))
+ expect(48)(SizeEstimator.estimate(DummyString("a")))
+ expect(48)(SizeEstimator.estimate(DummyString("ab")))
+ expect(56)(SizeEstimator.estimate(DummyString("abcdefgh")))
}
test("primitive arrays") {
@@ -105,18 +111,16 @@ class SizeEstimatorSuite
val initialize = PrivateMethod[Unit]('initialize)
SizeEstimator invokePrivate initialize()
- expect(40)(SizeEstimator.estimate(""))
- expect(48)(SizeEstimator.estimate("a"))
- expect(48)(SizeEstimator.estimate("ab"))
- expect(56)(SizeEstimator.estimate("abcdefgh"))
+ expect(40)(SizeEstimator.estimate(DummyString("")))
+ expect(48)(SizeEstimator.estimate(DummyString("a")))
+ expect(48)(SizeEstimator.estimate(DummyString("ab")))
+ expect(56)(SizeEstimator.estimate(DummyString("abcdefgh")))
resetOrClear("os.arch", arch)
}
- // NOTE: The String class definition changed in JDK 7 to exclude the int fields count and length.
- // This means that the size of strings will be lesser by 8 bytes in JDK 7 compared to JDK 6.
- // http://mail.openjdk.java.net/pipermail/core-libs-dev/2012-May/010257.html
- // Work around to check for either.
+ // NOTE: The String class definition varies across JDK versions (1.6 vs. 1.7) and vendors
+ // (Sun vs IBM). Use a DummyString class to make tests deterministic.
test("64-bit arch with no compressed oops") {
val arch = System.setProperty("os.arch", "amd64")
val oops = System.setProperty("spark.test.useCompressedOops", "false")
@@ -124,10 +128,10 @@ class SizeEstimatorSuite
val initialize = PrivateMethod[Unit]('initialize)
SizeEstimator invokePrivate initialize()
- SizeEstimator.estimate("") should (equal (64) or equal (56))
- SizeEstimator.estimate("a") should (equal (72) or equal (64))
- SizeEstimator.estimate("ab") should (equal (72) or equal (64))
- SizeEstimator.estimate("abcdefgh") should (equal (80) or equal (72))
+ expect(56)(SizeEstimator.estimate(DummyString("")))
+ expect(64)(SizeEstimator.estimate(DummyString("a")))
+ expect(64)(SizeEstimator.estimate(DummyString("ab")))
+ expect(72)(SizeEstimator.estimate(DummyString("abcdefgh")))
resetOrClear("os.arch", arch)
resetOrClear("spark.test.useCompressedOops", oops)