0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
BloomFilter.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 3
9  * of the License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
29 #ifndef HYPERTABLE_BLOOM_FILTER_H
30 #define HYPERTABLE_BLOOM_FILTER_H
31 
32 #include <cmath>
33 #include <limits.h>
34 #include "Common/StaticBuffer.h"
35 #include "Common/MurmurHash.h"
36 #include "Common/Logger.h"
37 #include "Common/StringExt.h"
38 
39 namespace Hypertable {
40 
49 template <class HasherT = MurmurHash2>
51 public:
58  BasicBloomFilter(size_t items_estimate, float false_positive_prob) {
59  m_items_actual = 0;
60  m_items_estimate = items_estimate;
61  m_false_positive_prob = false_positive_prob;
62  double num_hashes = -std::log(m_false_positive_prob) / std::log(2);
63  m_num_hash_functions = (size_t)num_hashes;
64  m_num_bits = (size_t)(m_items_estimate * num_hashes / std::log(2));
65  if (m_num_bits == 0) {
67  "Num elements=%lu false_positive_prob=%.3f",
68  (Lu)items_estimate, false_positive_prob);
69  }
70  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
71  m_bloom_bits = new uint8_t[m_num_bytes];
72 
73  memset(m_bloom_bits, 0, m_num_bytes);
74 
75  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions
76  << " num bits=" << m_num_bits << " num bytes=" << m_num_bytes
77  << " bits per element=" << double(m_num_bits) / items_estimate
78  << HT_END;
79  }
80 
88  BasicBloomFilter(size_t items_estimate, float bits_per_item,
89  size_t num_hashes) {
90  m_items_actual = 0;
91  m_items_estimate = items_estimate;
93  m_num_hash_functions = num_hashes;
94  m_num_bits = (size_t)((double)items_estimate * (double)bits_per_item);
95  if (m_num_bits == 0) {
96  HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu bits_per_item=%.3f",
97  (Lu)items_estimate, bits_per_item);
98  }
99  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
100  m_bloom_bits = new uint8_t[m_num_bytes];
101 
102  memset(m_bloom_bits, 0, m_num_bytes);
103 
104  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions << " num bits="
105  << m_num_bits << " num bytes=" << m_num_bytes << " bits per element="
106  << double(m_num_bits) / items_estimate << HT_END;
107  }
108 
117  BasicBloomFilter(size_t items_estimate, size_t items_actual,
118  int64_t length, size_t num_hashes) {
119  m_items_actual = items_actual;
120  m_items_estimate = items_estimate;
121  m_false_positive_prob = 0.0;
122  m_num_hash_functions = num_hashes;
123  m_num_bits = (size_t)length;
124  if (m_num_bits == 0) {
126  "Estimated items=%lu actual items=%lu length=%lld num hashes=%lu",
127  (Lu)items_estimate, (Lu)items_actual, (Lld)length,
128  (Lu)num_hashes);
129  }
130  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
131  m_bloom_bits = new uint8_t[m_num_bytes];
132 
133  memset(m_bloom_bits, 0, m_num_bytes);
134 
135  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions << " num bits="
136  << m_num_bits << " num bytes=" << m_num_bytes << " bits per element="
137  << double(m_num_bits) / items_estimate << HT_END;
138  }
139 
142  delete[] m_bloom_bits;
143  }
144 
145  /* XXX/review static functions to expose the bloom filter parameters, given
146  1) probablility and # keys
147  2) #keys and fix the total size (m), to calculate the optimal
148  probability - # hash functions to use
149  */
150 
159  void insert(const void *key, size_t len) {
160  uint32_t hash = len;
161 
162  for (size_t i = 0; i < m_num_hash_functions; ++i) {
163  hash = m_hasher(key, len, hash) % m_num_bits;
164  m_bloom_bits[hash / CHAR_BIT] |= (1 << (hash % CHAR_BIT));
165  }
166  m_items_actual++;
167  }
168 
173  void insert(const String& key) {
174  insert(key.c_str(), key.length());
175  }
176 
188  bool may_contain(const void *key, size_t len) const {
189  uint32_t hash = len;
190  uint8_t byte_mask;
191  uint8_t byte;
192 
193  for (size_t i = 0; i < m_num_hash_functions; ++i) {
194  hash = m_hasher(key, len, hash) % m_num_bits;
195  byte = m_bloom_bits[hash / CHAR_BIT];
196  byte_mask = (1 << (hash % CHAR_BIT));
197 
198  if ((byte & byte_mask) == 0) {
199  return false;
200  }
201  }
202  return true;
203  }
204 
210  bool may_contain(const String& key) const {
211  return may_contain(key.c_str(), key.length());
212  }
213 
218  void serialize(StaticBuffer& buf) {
219  buf.set(m_bloom_bits, m_num_bytes, false);
220  }
221 
226  uint8_t *ptr() { return m_bloom_bits; }
227 
232  size_t size() { return m_num_bytes; }
233 
239 
244  size_t get_length_bits() { return m_num_bits; }
245 
251 
256  size_t get_items_actual() { return m_items_actual; }
257 
258 private:
260  HasherT m_hasher;
261 
264 
267 
270 
273 
275  size_t m_num_bits;
276 
278  size_t m_num_bytes;
279 
281  uint8_t *m_bloom_bits;
282 };
283 
285 
288 } //namespace Hypertable
289 
290 #endif // HYPERTABLE_BLOOM_FILTER_H
size_t get_num_hashes()
Getter for the number of hash functions.
Definition: BloomFilter.h:238
A memory buffer of static size.
Definition: StaticBuffer.h:45
size_t get_items_estimate()
Getter for the estimated number of items.
Definition: BloomFilter.h:250
void set(uint8_t *data, uint32_t len, bool take_ownership=true)
Sets data pointer; the existing buffer is discarded and deleted.
Definition: StaticBuffer.h:175
uint8_t * m_bloom_bits
The actual bloom filter bit-array.
Definition: BloomFilter.h:281
MurmurHash2 digest routine.
size_t get_items_actual()
Getter for the actual number of items.
Definition: BloomFilter.h:256
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
~BasicBloomFilter()
Destructor; releases resources.
Definition: BloomFilter.h:141
size_t get_length_bits()
Getter for the number of bits.
Definition: BloomFilter.h:244
uint8_t * ptr()
Getter for the bloom filter data.
Definition: BloomFilter.h:226
size_t m_num_bytes
Number of bytes (approx.
Definition: BloomFilter.h:278
void insert(const String &key)
Overloaded insert function for Strings.
Definition: BloomFilter.h:173
size_t m_items_estimate
Estimated number of items.
Definition: BloomFilter.h:263
float m_false_positive_prob
Probability of returning a false positive.
Definition: BloomFilter.h:269
size_t m_num_bits
Number of bits.
Definition: BloomFilter.h:275
BasicBloomFilter(size_t items_estimate, size_t items_actual, int64_t length, size_t num_hashes)
Alternative constructor.
Definition: BloomFilter.h:117
size_t size()
Getter for the bloom filter size.
Definition: BloomFilter.h:232
BasicBloomFilter(size_t items_estimate, float false_positive_prob)
Constructor.
Definition: BloomFilter.h:58
Logging routines and macros.
BasicBloomFilter BloomFilter
Definition: BloomFilter.h:284
#define HT_END
Definition: Logger.h:220
A memory buffer of static size.
bool may_contain(const void *key, size_t len) const
Checks if the data set "may" contain the key.
Definition: BloomFilter.h:188
Hypertable definitions
long long int Lld
Shortcut for printf formats.
Definition: String.h:53
size_t m_num_hash_functions
Number of hash functions.
Definition: BloomFilter.h:272
BasicBloomFilter(size_t items_estimate, float bits_per_item, size_t num_hashes)
Alternative constructor.
Definition: BloomFilter.h:88
#define HT_THROWF(_code_, _fmt_,...)
Definition: Error.h:490
void serialize(StaticBuffer &buf)
Serializes the BloomFilter into a static memory buffer.
Definition: BloomFilter.h:218
size_t m_items_actual
Actual number of items.
Definition: BloomFilter.h:266
long unsigned int Lu
Shortcut for printf formats.
Definition: String.h:47
void insert(const void *key, size_t len)
Inserts a new blob into the hash.
Definition: BloomFilter.h:159
A space-efficent probabilistic set for membership test, false postives are possible, but false negatives are not.
Definition: BloomFilter.h:50
String extensions and helpers: sets, maps, append operators etc.
HasherT m_hasher
The hash function implementation.
Definition: BloomFilter.h:260
bool may_contain(const String &key) const
Overloaded may_contain function for Strings.
Definition: BloomFilter.h:210
#define HT_DEBUG_OUT
Definition: Logger.h:261