aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test/scala
diff options
context:
space:
mode:
authorWenchen Fan <wenchen@databricks.com>2016-04-29 23:04:51 -0700
committerXiangrui Meng <meng@databricks.com>2016-04-29 23:04:51 -0700
commit43b149fb885a27f9467aab28e5195f6f03aadcf0 (patch)
treec8620d5d0f42e9f3238020e3bce8f8ea527182eb /mllib/src/test/scala
parent4bac703eb9dcc286d6b89630cf433f95b63a4a1f (diff)
downloadspark-43b149fb885a27f9467aab28e5195f6f03aadcf0.tar.gz
spark-43b149fb885a27f9467aab28e5195f6f03aadcf0.tar.bz2
spark-43b149fb885a27f9467aab28e5195f6f03aadcf0.zip
[SPARK-14850][ML] convert primitive array from/to unsafe array directly in VectorUDT/MatrixUDT
## What changes were proposed in this pull request?

This PR adds `fromPrimitiveArray` and `toPrimitiveArray` in `UnsafeArrayData`, so that we can do the conversion much faster in VectorUDT/MatrixUDT.

## How was this patch tested?

existing tests and new test suite `UnsafeArraySuite`

Author: Wenchen Fan <wenchen@databricks.com>

Closes #12640 from cloud-fan/ml.
Diffstat (limited to 'mllib/src/test/scala')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala70
1 files changed, 70 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala
new file mode 100644
index 0000000000..be7110ad6b
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.util.Benchmark
+
+/**
+ * Serialization benchmark for VectorUDT: registers a "serialize" case (encoder.toRow) and a "deserialize" case (encoder.fromRow) over dense vectors.
+ */
+object UDTSerializationBenchmark {
+
+ def main(args: Array[String]): Unit = {
+ val iters = 1e2.toInt // 100; third Benchmark constructor argument (presumably the iteration count -- confirm)
+ val numRows = 1e3.toInt // 1000; number of vectors built and the loop bound inside each case
+
+ val encoder = ExpressionEncoder[Vector].defaultBinding // one encoder instance shared by both cases
+
+ val vectors = (1 to numRows).map { i => // one dense vector for each i in 1..numRows
+ Vectors.dense(Array.fill(1e5.toInt)(1.0 * i)) // 100000 entries, every entry equal to i
+ }.toArray
+ val rows = vectors.map(encoder.toRow) // encoded once, outside the cases, as input for "deserialize"
+
+ val benchmark = new Benchmark("VectorUDT de/serialization", numRows, iters)
+
+ benchmark.addCase("serialize") { _ => // the argument handed to the case body is ignored
+ var sum = 0 // written but never read afterwards; presumably just keeps the call result in use
+ var i = 0
+ while (i < numRows) { // index-based loop over every input vector
+ sum += encoder.toRow(vectors(i)).numFields // operation under test: Vector -> row
+ i += 1
+ }
+ }
+
+ benchmark.addCase("deserialize") { _ => // the argument handed to the case body is ignored
+ var sum = 0 // written but never read afterwards; presumably just keeps the call result in use
+ var i = 0
+ while (i < numRows) { // index-based loop over every pre-encoded row
+ sum += encoder.fromRow(rows(i)).numActives // operation under test: row -> Vector
+ i += 1
+ }
+ }
+
+ /*
+ Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11.4
+ Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
+
+ VectorUDT de/serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+ -------------------------------------------------------------------------------------------
+ serialize 380 / 392 0.0 379730.0 1.0X
+ deserialize 138 / 142 0.0 137816.6 2.8X
+ */
+ benchmark.run() // runs the two cases added above (presumably reporting a table like the recorded one -- confirm)
+ }
+}