diff options
author | Wenchen Fan <wenchen@databricks.com> | 2016-04-29 23:04:51 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-04-29 23:04:51 -0700 |
commit | 43b149fb885a27f9467aab28e5195f6f03aadcf0 (patch) | |
tree | c8620d5d0f42e9f3238020e3bce8f8ea527182eb /mllib/src/test/scala | |
parent | 4bac703eb9dcc286d6b89630cf433f95b63a4a1f (diff) | |
download | spark-43b149fb885a27f9467aab28e5195f6f03aadcf0.tar.gz spark-43b149fb885a27f9467aab28e5195f6f03aadcf0.tar.bz2 spark-43b149fb885a27f9467aab28e5195f6f03aadcf0.zip |
[SPARK-14850][ML] convert primitive array from/to unsafe array directly in VectorUDT/MatrixUDT
## What changes were proposed in this pull request?
This PR adds `fromPrimitiveArray` and `toPrimitiveArray` to `UnsafeArrayData`, so that VectorUDT/MatrixUDT can convert primitive arrays from/to unsafe arrays directly, which makes the conversion much faster.
## How was this patch tested?
Existing tests and the new test suite `UnsafeArraySuite`.
Author: Wenchen Fan <wenchen@databricks.com>
Closes #12640 from cloud-fan/ml.
Diffstat (limited to 'mllib/src/test/scala')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala | 70 |
1 file changed, 70 insertions, 0 deletions
```diff
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala
new file mode 100644
index 0000000000..be7110ad6b
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/UDTSerializationBenchmark.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.util.Benchmark
+
+/**
+ * Serialization benchmark for VectorUDT.
+ */
+object UDTSerializationBenchmark {
+
+  def main(args: Array[String]): Unit = {
+    val iters = 1e2.toInt
+    val numRows = 1e3.toInt
+
+    val encoder = ExpressionEncoder[Vector].defaultBinding
+
+    val vectors = (1 to numRows).map { i =>
+      Vectors.dense(Array.fill(1e5.toInt)(1.0 * i))
+    }.toArray
+    val rows = vectors.map(encoder.toRow)
+
+    val benchmark = new Benchmark("VectorUDT de/serialization", numRows, iters)
+
+    benchmark.addCase("serialize") { _ =>
+      var sum = 0
+      var i = 0
+      while (i < numRows) {
+        sum += encoder.toRow(vectors(i)).numFields
+        i += 1
+      }
+    }
+
+    benchmark.addCase("deserialize") { _ =>
+      var sum = 0
+      var i = 0
+      while (i < numRows) {
+        sum += encoder.fromRow(rows(i)).numActives
+        i += 1
+      }
+    }
+
+    /*
+    Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11.4
+    Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
+
+    VectorUDT de/serialization:         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+    -------------------------------------------------------------------------------------------
+    serialize                                 380 / 392           0.0      379730.0       1.0X
+    deserialize                               138 / 142           0.0      137816.6       2.8X
+    */
+    benchmark.run()
+  }
+}
```