Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 69 additions & 1 deletion crates/geo_filters/evaluation/performance.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use std::hash::BuildHasher;
use std::hint::black_box;

use criterion::{criterion_group, criterion_main, Criterion};
use geo_filters::build_hasher::UnstableDefaultBuildHasher;
use geo_filters::config::VariableConfig;
use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13};
use geo_filters::diff_count::{
GeoDiffConfig13, GeoDiffCount, GeoDiffCount13, GeoDiffCount7, GeoDiffCountBuilder,
};
use geo_filters::distinct_count::GeoDistinctCount13;
use geo_filters::evaluation::hll::Hll14;
use geo_filters::Count;
Expand Down Expand Up @@ -130,6 +133,71 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});
}

// Construction benchmarks: compare building a diff filter from a precomputed slice of hashes
// one by one (`push_hash`) versus via the incremental `GeoDiffCountBuilder` (per-hash, and the
// batched `extend_by_hashes`). The hashes are precomputed so that only construction cost is
// measured. Each input size gets its own benchmark group named `construct:{size}`.
for size in [1000usize, 10000, 100000, 1000000] {
let mut group = c.benchmark_group(format!("construct:{size}"));
let build_hasher = UnstableDefaultBuildHasher::default();
// Input data: the hashes of the integers `0..size`, computed once per group and shared by all
// benchmarks below.
let hashes: Vec<u64> = (0..size).map(|i| build_hasher.hash_one(i)).collect();

// Baseline: push every hash directly into a default-constructed `GeoDiffCount7`.
group.bench_function("geo_diff_count_7_push", |b| {
b.iter(|| {
let mut gc = GeoDiffCount7::default();
for &hash in &hashes {
gc.push_hash(hash);
}
// Keep the finished filter observable so the optimizer cannot discard the work.
black_box(&gc);
})
});
// Same direct per-hash construction, for `GeoDiffCount13`.
group.bench_function("geo_diff_count_13_push", |b| {
b.iter(|| {
let mut gc = GeoDiffCount13::default();
for &hash in &hashes {
gc.push_hash(hash);
}
black_box(&gc);
})
});
// Builder fed with all hashes in a single batched `extend_by_hashes` call.
// NOTE(review): this variant passes `0` as the capacity argument while the per-hash builder
// benchmark below passes `size`; whether `extend_by_hashes` sizes the builder itself is not
// visible here — confirm the two variants are meant to be compared this way.
group.bench_function("geo_diff_count_13_builder_extend", |b| {
b.iter(|| {
let mut builder = GeoDiffCountBuilder::with_capacity(
GeoDiffConfig13::<UnstableDefaultBuildHasher>::default(),
0,
);
builder.extend_by_hashes(hashes.iter().copied());
black_box(builder.build());
})
});
// Builder created with `size` as its capacity argument and fed one hash at a time.
group.bench_function("geo_diff_count_13_builder", |b| {
b.iter(|| {
let mut builder = GeoDiffCountBuilder::with_capacity(
GeoDiffConfig13::<UnstableDefaultBuildHasher>::default(),
size,
);
for &hash in &hashes {
builder.push_hash(hash);
}
black_box(builder.build());
})
});
// Reserve nothing so the split starts at 0 and every bucket initially lands in `numbers`,
// forcing the buffer to fill and compact (lazily flush) repeatedly as the split ramps up.
// This isolates the cost of the lazy-flush path versus a well-positioned builder.
group.bench_function("geo_diff_count_13_builder_unreserved", |b| {
b.iter(|| {
let mut builder = GeoDiffCountBuilder::with_capacity(
GeoDiffConfig13::<UnstableDefaultBuildHasher>::default(),
0,
);
for &hash in &hashes {
builder.push_hash(hash);
}
black_box(builder.build());
})
});
}
}

criterion_group!(benches, criterion_benchmark);
Expand Down
28 changes: 25 additions & 3 deletions crates/geo_filters/src/config/bitchunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ impl BitChunk {
}
}

/// Merges a descending stream of distinct one-bit positions (`leading`) with a descending stream
/// of `BitChunk`s (`trailing`) into a single descending `BitChunk` stream. All leading positions
/// must be more significant than all trailing bits, except that the least-significant leading block
/// may overlap the most-significant trailing block (the two are or-ed). Leading positions must be
/// distinct.
pub(crate) fn iter_bit_chunks(
leading: impl Iterator<Item = usize>,
trailing: impl Iterator<Item = BitChunk>,
Expand Down Expand Up @@ -55,8 +60,7 @@ impl<I: Iterator<Item = BitChunk>, J: Iterator<Item = usize>> Iterator for BitCh
_ => break,
}
}
// All leading bits were consumed, test whether it can be merged with
// trailing bits.
// All leading bits were consumed, test whether it can be merged with trailing bits.
match self.trailing.peek() {
Some(BitChunk {
index: other_index,
Expand Down Expand Up @@ -314,7 +318,7 @@ impl<T: IsBucketType, I: Iterator<Item = BitChunk>> Iterator for BitChunksOnes<T
mod tests {
use itertools::Itertools;

use super::{iter_ones, BitChunk};
use super::{iter_bit_chunks, iter_ones, BitChunk};

#[test]
fn test_iter_ones() {
Expand All @@ -338,4 +342,22 @@ mod tests {
iter_ones::<usize, _>(chunks.into_iter().peekable()).collect_vec()
);
}

#[test]
fn test_iter_bit_chunks() {
    // Leading one-bit positions in descending order: 70 and 67 end up in block 1, while 5 ends
    // up in block 0, the same block as the single trailing chunk.
    let leading_bits = vec![70, 67, 5];
    // One trailing chunk in block 0 with only bit 2 set.
    let trailing_chunks = vec![BitChunk::new(0, 1 << 2)];

    let expected = vec![
        // Positions 70 and 67 are or-ed together into a single chunk.
        BitChunk::new(1, (1 << 6) | (1 << 3)),
        // Leading position 5 is merged with trailing bit 2 at the boundary block index.
        BitChunk::new(0, (1 << 5) | (1 << 2)),
    ];

    let merged =
        iter_bit_chunks(leading_bits.into_iter(), trailing_chunks.into_iter()).collect_vec();
    assert_eq!(merged, expected);
}
}
16 changes: 11 additions & 5 deletions crates/geo_filters/src/config/lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,24 @@ use crate::config::phi_f64;

/// Precomputed table for translating a 32-bit hash value into a bucket index.
pub(crate) struct HashToBucketLookup {
/// Size parameter of the table: it covers `1 << b` buckets per level and holds `2 << b`
/// entries.
b: usize,
/// One `(bucket, lower_hash_limit)` pair per table slot. A slot is selected by shifting the
/// 32-bit hash right by `32 - b - 1`; the resulting bucket index is `bucket`, plus one when the
/// hash is below `lower_hash_limit`.
buckets: Vec<(usize, usize)>,
buckets: Vec<(u32, u32)>,
}

impl HashToBucketLookup {
pub(crate) fn new(b: usize) -> Self {
let mut buckets = vec![(0, 0); 2 << b];
let mut buckets = vec![(0u32, 0u32); 2 << b];
let mut last_filled_bucket = buckets.len();
let phi = phi_f64(b);
for bucket in 0..(1 << b) {
let lower_bucket_limit = phi.powf((bucket + 1) as f64);
// `lower_hash_limit` is a 32-bit hash threshold: `lower_bucket_limit` lies in
// `[0.5, 1)`, so this value is always in `[0, 2^32)` and fits losslessly into a `u32`.
let lower_hash_limit = ((lower_bucket_limit - 0.5) * 2.0f64.powf(33.0)) as usize;
let lower_hash_bucket = lower_hash_limit >> (32 - b - 1);
assert!(lower_hash_bucket < last_filled_bucket);
while last_filled_bucket > lower_hash_bucket {
last_filled_bucket -= 1;
buckets[last_filled_bucket] = (bucket, lower_hash_limit);
buckets[last_filled_bucket] = (bucket as u32, lower_hash_limit as u32);
}
}
assert_eq!(last_filled_bucket, 0);
Expand All @@ -38,8 +40,12 @@ impl HashToBucketLookup {
} & 0xFFFFFFFF) as usize;
// From those, the first B bits determine the bucket index in our lookup table.
let idx = hash >> (32 - self.b - 1);
let offset = (hash < self.buckets[idx].1) as usize;
offset + self.buckets[idx].0 + (1 << self.b) * levels
// SAFETY: `hash` was masked to 32 bits, so `idx = hash >> (31 - b)` holds at most `b + 1`
// significant bits and is therefore always `< 2^(b+1) == 2 << b == self.buckets.len()`.
debug_assert!(idx < self.buckets.len());
let (base, threshold) = *unsafe { self.buckets.get_unchecked(idx) };
let offset = (hash < threshold as usize) as usize;
Comment on lines 42 to +47
offset + base as usize + (1 << self.b) * levels
}
}

Expand Down
Loading