aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
authortmnd1991 <antonio.murgia2@studio.unibo.it>2016-07-06 12:56:26 -0700
committerJoseph K. Bradley <joseph@databricks.com>2016-07-06 12:56:26 -0700
commit040f6f9f468f153e4c4db78c26ced0299245fb6f (patch)
tree14ac29891850ecd4d254ddb82c76dc17883dae66 /mllib/src/test
parent4f8ceed59367319300e4bfa5b957c387be81ffa3 (diff)
downloadspark-040f6f9f468f153e4c4db78c26ced0299245fb6f.tar.gz
spark-040f6f9f468f153e4c4db78c26ced0299245fb6f.tar.bz2
spark-040f6f9f468f153e4c4db78c26ced0299245fb6f.zip
[SPARK-15740][MLLIB] Word2VecSuite "big model load / save" caused OOM in maven jenkins builds
## What changes were proposed in this pull request? The "big model load / save" test in Word2VecSuite has lately been failing with OOM. We therefore made the partitioning adaptive (not based on the Spark default for the "spark.kryoserializer.buffer.max" conf) and now test it with a small buffer size, in order to trigger partitioning without allocating too much memory for the test. ## How was this patch tested? It was tested by running the following unit test: org.apache.spark.mllib.feature.Word2VecSuite Author: tmnd1991 <antonio.murgia2@studio.unibo.it> Closes #13509 from tmnd1991/SPARK-15740.
Diffstat (limited to 'mllib/src/test')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala25
1 files changed, 22 insertions, 3 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index c9fb9768c1..22de4c4ac4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -91,11 +91,23 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
}
- ignore("big model load / save") {
- // create a model bigger than 32MB since 9000 * 1000 * 4 > 2^25
- val word2VecMap = Map((0 to 9000).map(i => s"$i" -> Array.fill(1000)(0.1f)): _*)
+ test("big model load / save") {
+ // back up the old values
+ val oldBufferConfValue = spark.conf.get("spark.kryoserializer.buffer.max", "64m")
+ val oldBufferMaxConfValue = spark.conf.get("spark.kryoserializer.buffer", "64k")
+
+ // setting test values to trigger partitioning
+ spark.conf.set("spark.kryoserializer.buffer", "50b")
+ spark.conf.set("spark.kryoserializer.buffer.max", "50b")
+
+ // create a model bigger than 50 Bytes
+ val word2VecMap = Map((0 to 10).map(i => s"$i" -> Array.fill(10)(0.1f)): _*)
val model = new Word2VecModel(word2VecMap)
+ // est. size of this model, given the formula:
+ // (floatSize * vectorSize + 15) * numWords
+ // (4 * 10 + 15) * 11 = 605 (0 to 10 is inclusive, so there are 11 words)
+ // therefore it should generate multiple partitions
val tempDir = Utils.createTempDir()
val path = tempDir.toURI.toString
@@ -103,9 +115,16 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext {
model.save(sc, path)
val sameModel = Word2VecModel.load(sc, path)
assert(sameModel.getVectors.mapValues(_.toSeq) === model.getVectors.mapValues(_.toSeq))
+ }
+ catch {
+ case t: Throwable => fail("exception thrown persisting a model " +
+ "that spans over multiple partitions", t)
} finally {
Utils.deleteRecursively(tempDir)
+ spark.conf.set("spark.kryoserializer.buffer", oldBufferConfValue)
+ spark.conf.set("spark.kryoserializer.buffer.max", oldBufferMaxConfValue)
}
+
}
test("test similarity for word vectors with large values is not Infinity or NaN") {