aboutsummaryrefslogtreecommitdiff
path: root/unsafe
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2015-11-04 14:45:02 -0800
committerJosh Rosen <joshrosen@databricks.com>2015-11-04 14:45:02 -0800
commit1b6a5d4af9691c3f7f3ebee3146dc13d12a0e047 (patch)
tree387ed802d0acb7508cbf1d6ffaf59e40b2a41cff /unsafe
parent701fb5052080fa8c0a79ad7c1e65693ccf444787 (diff)
downloadspark-1b6a5d4af9691c3f7f3ebee3146dc13d12a0e047.tar.gz
spark-1b6a5d4af9691c3f7f3ebee3146dc13d12a0e047.tar.bz2
spark-1b6a5d4af9691c3f7f3ebee3146dc13d12a0e047.zip
[SPARK-11493] remove bitset from BytesToBytesMap
Since we have 4 bytes as number of records in the beginning of a page, the address can not be zero, so we do not need the bitset. For performance concerns, the bitset could help speed up false lookup if the slot is empty (because bitset is smaller than longArray, cache hit rate will be higher). In practice, the map is filled with 35% - 70% (use 50% as average), so only half of the false lookups can benefit of it, all others will pay the cost of load the bitset (still need to access the longArray anyway). For aggregation, we always need to access the longArray (insert a new key after false lookup), also confirmed by a benchmark. For broadcast hash join, there could be a regression, but a simple benchmark showed that it may not (most of lookup are false): ``` sqlContext.range(1<<20).write.parquet("small") df = sqlContext.read.parquet('small') for i in range(3): t = time.time() df2 = sqlContext.range(1<<26).selectExpr("id * 1111111111 % 987654321 as id2") df2.join(df, df.id == df2.id2).count() print time.time() -t ``` Having bitset (used time in seconds): ``` 17.5404241085 10.2758829594 10.5786800385 ``` After removing bitset (used time in seconds): ``` 21.8939979076 12.4132959843 9.97224712372 ``` cc rxin nongli Author: Davies Liu <davies@databricks.com> Closes #9452 from davies/remove_bitset.
Diffstat (limited to 'unsafe')
-rw-r--r--unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java113
-rw-r--r--unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java88
2 files changed, 0 insertions, 201 deletions
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java b/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java
deleted file mode 100644
index 7c124173b0..0000000000
--- a/unsafe/src/main/java/org/apache/spark/unsafe/bitset/BitSet.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.unsafe.bitset;
-
-import org.apache.spark.unsafe.array.LongArray;
-import org.apache.spark.unsafe.memory.MemoryBlock;
-
-/**
- * A fixed size uncompressed bit set backed by a {@link LongArray}.
- *
- * Each bit occupies exactly one bit of storage.
- */
-public final class BitSet {
-
- /** A long array for the bits. */
- private final LongArray words;
-
- /** Length of the long array. */
- private final int numWords;
-
- private final Object baseObject;
- private final long baseOffset;
-
- /**
- * Creates a new {@link BitSet} using the specified memory block. Size of the memory block must be
- * multiple of 8 bytes (i.e. 64 bits).
- */
- public BitSet(MemoryBlock memory) {
- words = new LongArray(memory);
- assert (words.size() <= Integer.MAX_VALUE);
- numWords = (int) words.size();
- baseObject = words.memoryBlock().getBaseObject();
- baseOffset = words.memoryBlock().getBaseOffset();
- }
-
- public MemoryBlock memoryBlock() {
- return words.memoryBlock();
- }
-
- /**
- * Returns the number of bits in this {@code BitSet}.
- */
- public long capacity() {
- return numWords * 64;
- }
-
- /**
- * Sets the bit at the specified index to {@code true}.
- */
- public void set(int index) {
- assert index < numWords * 64 : "index (" + index + ") should < length (" + numWords * 64 + ")";
- BitSetMethods.set(baseObject, baseOffset, index);
- }
-
- /**
- * Sets the bit at the specified index to {@code false}.
- */
- public void unset(int index) {
- assert index < numWords * 64 : "index (" + index + ") should < length (" + numWords * 64 + ")";
- BitSetMethods.unset(baseObject, baseOffset, index);
- }
-
- /**
- * Returns {@code true} if the bit is set at the specified index.
- */
- public boolean isSet(int index) {
- assert index < numWords * 64 : "index (" + index + ") should < length (" + numWords * 64 + ")";
- return BitSetMethods.isSet(baseObject, baseOffset, index);
- }
-
- /**
- * Returns the index of the first bit that is set to true that occurs on or after the
- * specified starting index. If no such bit exists then {@code -1} is returned.
- * <p>
- * To iterate over the true bits in a BitSet, use the following loop:
- * <pre>
- * <code>
- * for (long i = bs.nextSetBit(0); i &gt;= 0; i = bs.nextSetBit(i + 1)) {
- * // operate on index i here
- * }
- * </code>
- * </pre>
- *
- * @param fromIndex the index to start checking from (inclusive)
- * @return the index of the next set bit, or -1 if there is no such bit
- */
- public int nextSetBit(int fromIndex) {
- return BitSetMethods.nextSetBit(baseObject, baseOffset, fromIndex, numWords);
- }
-
- /**
- * Returns {@code true} if any bit is set.
- */
- public boolean anySet() {
- return BitSetMethods.anySet(baseObject, baseOffset, numWords);
- }
-
-}
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
deleted file mode 100644
index 14e38683df..0000000000
--- a/unsafe/src/test/java/org/apache/spark/unsafe/bitset/BitSetSuite.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.unsafe.bitset;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import org.apache.spark.unsafe.memory.MemoryBlock;
-
-public class BitSetSuite {
-
- private static BitSet createBitSet(int capacity) {
- Assert.assertEquals(0, capacity % 64);
- return new BitSet(MemoryBlock.fromLongArray(new long[capacity / 64]));
- }
-
- @Test
- public void basicOps() {
- BitSet bs = createBitSet(64);
- Assert.assertEquals(64, bs.capacity());
-
- // Make sure the bit set starts empty.
- for (int i = 0; i < bs.capacity(); i++) {
- Assert.assertFalse(bs.isSet(i));
- }
- // another form of asserting that the bit set is empty
- Assert.assertFalse(bs.anySet());
-
- // Set every bit and check it.
- for (int i = 0; i < bs.capacity(); i++) {
- bs.set(i);
- Assert.assertTrue(bs.isSet(i));
- }
-
- // Unset every bit and check it.
- for (int i = 0; i < bs.capacity(); i++) {
- Assert.assertTrue(bs.isSet(i));
- bs.unset(i);
- Assert.assertFalse(bs.isSet(i));
- }
-
- // Make sure anySet() can detect any set bit
- bs = createBitSet(256);
- bs.set(64);
- Assert.assertTrue(bs.anySet());
- }
-
- @Test
- public void traversal() {
- BitSet bs = createBitSet(256);
-
- Assert.assertEquals(-1, bs.nextSetBit(0));
- Assert.assertEquals(-1, bs.nextSetBit(10));
- Assert.assertEquals(-1, bs.nextSetBit(64));
-
- bs.set(10);
- Assert.assertEquals(10, bs.nextSetBit(0));
- Assert.assertEquals(10, bs.nextSetBit(1));
- Assert.assertEquals(10, bs.nextSetBit(10));
- Assert.assertEquals(-1, bs.nextSetBit(11));
-
- bs.set(11);
- Assert.assertEquals(10, bs.nextSetBit(10));
- Assert.assertEquals(11, bs.nextSetBit(11));
-
- // Skip a whole word and find it
- bs.set(190);
- Assert.assertEquals(190, bs.nextSetBit(12));
-
- Assert.assertEquals(-1, bs.nextSetBit(191));
- Assert.assertEquals(-1, bs.nextSetBit(256));
- }
-}