簡單的python實現
pip install mmh3
對於安裝時報錯(C++ 編譯錯誤)的問題:可以安裝 Microsoft Visual C++ Build Tools。
例子轉載(https://www.cnblogs.com/naive/p/5815433.html)
from bitarray import bitarray  # 3rd party
import mmh3  # 3rd party


class BloomFilter(set):
    """Fixed-size Bloom filter backed by a ``bitarray``.

    Items are hashed ``hash_count`` times with MurmurHash3 (seeds
    ``0 .. hash_count - 1``); each hash selects one bit of ``bit_array``.
    Membership tests can produce false positives but never false negatives.

    The class subclasses ``set`` only for interface compatibility: the
    inherited set storage is never written to. ``add``, ``__contains__``,
    ``__len__`` and ``__iter__`` are all overridden to work on the bit array.
    """

    def __init__(self, size, hash_count):
        """Create an empty filter.

        Args:
            size: Number of bits in the filter; must be at least 1.
            hash_count: Number of hash functions applied to every item;
                must be at least 1.

        Raises:
            ValueError: If ``size`` or ``hash_count`` is smaller than 1.
        """
        # size == 0 would only fail later with ZeroDivisionError in the
        # modulo, and hash_count == 0 would report every item as present,
        # so reject both up front.
        if size < 1:
            raise ValueError('size must be a positive integer')
        if hash_count < 1:
            raise ValueError('hash_count must be a positive integer')
        super(BloomFilter, self).__init__()
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)
        self.size = size
        self.hash_count = hash_count

    def __len__(self):
        """Return the number of bits in the filter (not the item count)."""
        return self.size

    def __iter__(self):
        """Iterate over the individual bits of the underlying bit array."""
        return iter(self.bit_array)

    def _indexes(self, item):
        """Yield the bit positions that ``item`` maps to, one per seed."""
        for seed in range(self.hash_count):
            # Python's % returns a non-negative result for a positive
            # modulus, so a negative hash value still yields a valid index.
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Set every bit that ``item`` hashes to and return the filter."""
        for index in self._indexes(item):
            self.bit_array[index] = 1
        return self

    def __contains__(self, item):
        """Return True if ``item`` may have been added, False if it was not.

        Stops at the first unset bit instead of evaluating every hash.
        """
        return all(self.bit_array[index] for index in self._indexes(item))


def main():
    """Demonstrate the filter: no false negatives, occasional false positives."""
    bloom = BloomFilter(10000, 10)
    animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle',
               'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear',
               'chicken', 'dolphin', 'donkey', 'crow', 'crocodile']

    # First insertion of animals into the bloom filter
    for animal in animals:
        bloom.add(animal)

    # Membership existence for already inserted animals
    # There should not be any false negatives
    for animal in animals:
        if animal in bloom:
            print('{} is in bloom filter as expected'.format(animal))
        else:
            print('Something is terribly went wrong for {}'.format(animal))
            print('FALSE NEGATIVE!')

    # Membership existence for not inserted animals
    # There could be false positives
    other_animals = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox',
                     'whale', 'shark', 'fish', 'turkey', 'duck', 'dove',
                     'deer', 'elephant', 'frog', 'falcon', 'goat', 'gorilla',
                     'hawk']
    for other_animal in other_animals:
        if other_animal in bloom:
            print('{} is not in the bloom, but a false positive'.format(other_animal))
        else:
            print('{} is not in the bloom filter as expected'.format(other_animal))


if __name__ == '__main__':
    main()
運行結果
dog is in bloom filter as expected cat is in bloom filter as expected giraffe is in bloom filter as expected fly is in bloom filter as expected mosquito is in bloom filter as expected horse is in bloom filter as expected eagle is in bloom filter as expected bird is in bloom filter as expected bison is in bloom filter as expected boar is in bloom filter as expected butterfly is in bloom filter as expected ant is in bloom filter as expected anaconda is in bloom filter as expected bear is in bloom filter as expected chicken is in bloom filter as expected dolphin is in bloom filter as expected donkey is in bloom filter as expected crow is in bloom filter as expected crocodile is in bloom filter as expected badger is not in the bloom filter as expected cow is not in the bloom filter as expected pig is not in the bloom filter as expected sheep is not in the bloom, but a false positive bee is not in the bloom filter as expected wolf is not in the bloom filter as expected fox is not in the bloom filter as expected whale is not in the bloom filter as expected shark is not in the bloom, but a false positive fish is not in the bloom, but a false positive turkey is not in the bloom filter as expected duck is not in the bloom filter as expected dove is not in the bloom filter as expected deer is not in the bloom filter as expected elephant is not in the bloom, but a false positive frog is not in the bloom filter as expected falcon is not in the bloom filter as expected goat is not in the bloom filter as expected gorilla is not in the bloom filter as expected hawk is not in the bloom filter as expected
從輸出結果可以發現,存在不少誤報樣本,但是並不存在假陰性。
不同於這段布隆過濾器的實現代碼,其它語言的多個實現版本並不提供哈希函數個數這個參數。這是因為在實際應用中,誤報比例這個指標比哈希函數的個數更重要,用戶可以根據誤報比例的需求來調整哈希函數的個數。通常來說,size 和 error_rate 才是布隆過濾器真正關鍵的參數。如果你在初始化階段減小了 error_rate,這些實現會在內部相應地調整哈希函數的數量。
誤報
布隆過濾器能夠拍着胸脯說某個元素“肯定不存在”,但是對於一些元素它們會說“可能存在”。針對不同的應用場景,這有可能會是一個巨大的缺陷,亦或是無關緊要的問題。如果在檢索元素是否存在時不介意引入誤報情況,那么你就應當考慮用布隆過濾器。
另外,如果隨意地減小了誤報比率,哈希函數的數量相應地就要增加,在插入和查詢時的延時也會相應地增加。本節的另一個要點是,如果哈希函數是相互獨立的,並且輸入元素在空間中均勻的分布,那么理論上真實誤報率就不會超過理論值。否則,由於哈希函數的相關性和更頻繁的哈希沖突,布隆過濾器的真實誤報比例會高於理論值。
在使用布隆過濾器時,需要考慮誤報的潛在影響。
確定性
當你使用相同的過濾器大小和相同數量的哈希函數時,某個元素通過布隆過濾器得到的是正反饋還是負反饋的結果是確定的。對於某個元素 x,如果它現在是「可能存在」,那五分鍾之后、一小時之后、一天之后、甚至一周之后的狀態都是「可能存在」。當我得知這一特性時有一點點驚訝。因為布隆過濾器是「概率性」的,那其結果顯然應該存在某種隨機因素,難道不是嗎?確實不是。它的「概率性」體現在我們無法判斷究竟哪些元素的狀態是「可能存在」。
換句話說,過濾器一旦做出「可能存在」的結論后,結論不會發生變化。
python 基於redis實現的bloomfilter(布隆過濾器),BloomFilter_imooc
BloomFilter_imooc下載
下載地址:https://github.com/liyaopinner/BloomFilter_imooc
py_bloomfilter.py(布隆過濾器)源碼:
import mmh3
import redis
import math
import time


class PyBloomFilter():
    """Bloom filter whose bit array lives in Redis strings.

    The number of bits (``m``) and hash functions (``k``) are derived from
    the expected item count and the target false-positive rate. Bits are
    written with SETBIT / read with GETBIT on keys named
    ``<key>_<block index>``, where the block index is
    ``ord(value[0]) % blocknum``.
    """

    # 100 built-in seeds for MurmurHash3; the first ``k`` of them are used.
    SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846,
             521, 913, 636, 878, 735, 414, 372, 344, 324, 223, 180, 327, 891,
             798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648,
             338, 465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384,
             680, 574, 480, 307, 580, 71, 535, 300, 53, 481, 519, 644, 219,
             686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926,
             250, 507, 739, 371, 63, 584, 154, 7, 284, 617, 332, 472, 140,
             605, 262, 355, 526, 647, 923, 199, 518]

    def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
        """Compute the filter parameters and remember the Redis client.

        Args:
            capacity: Estimated number of items to de-duplicate.
            error_rate: Target false-positive rate.
            conn: Redis client used for SETBIT / GETBIT.
            key: Prefix of the Redis key names used by this filter.
        """
        # Total number of bits required: m = n * log2(e) * log2(1 / p).
        self.m = math.ceil(capacity * math.log2(math.e) * math.log2(1 / error_rate))
        # Optimal number of hash functions: k = (m / n) * ln 2.
        # (math.log1p(2) is ln 3, which over-estimated k by about 58%.)
        self.k = math.ceil(math.log(2) * self.m / capacity)
        # Memory required, in MiB.
        self.mem = math.ceil(self.m / 8 / 1024 / 1024)
        # Number of 512 MiB blocks required. The original author notes that
        # the first character of a value must be ASCII, so at most 256
        # blocks can be addressed.
        self.blocknum = math.ceil(self.mem / 512)
        # Slicing silently caps the seed list at the 100 seeds available.
        self.seeds = self.SEEDS[0:self.k]
        self.key = key
        self.N = 2 ** 31 - 1
        self.redis = conn
        # print(self.mem)
        # print(self.k)

    def _block_name(self, value):
        """Return the Redis key of the block that ``value`` belongs to."""
        return self.key + "_" + str(ord(value[0]) % self.blocknum)

    def add(self, value):
        """Set every bit that ``value`` hashes to in its Redis block."""
        name = self._block_name(value)
        for offset in self.get_hashs(value):
            self.redis.setbit(name, offset, 1)

    def is_exist(self, value):
        """Return 1 if ``value`` may have been added, 0 if it was not.

        Stops at the first unset bit so no further Redis round trips are
        made once the answer is known.
        """
        name = self._block_name(value)
        for offset in self.get_hashs(value):
            if not self.redis.getbit(name, offset):
                return 0
        return 1

    def get_hashs(self, value):
        """Return one non-negative bit offset per seed for ``value``.

        Non-negative hash values are used as they are; a negative value
        ``h`` is mapped to ``self.N - h``. Assuming ``mmh3.hash`` returns a
        signed 32-bit integer (its default), offsets lie in
        ``[0, 2**32 - 1]``.
        """
        offsets = []
        for seed in self.seeds:
            digest = mmh3.hash(value, seed)
            offsets.append(digest if digest >= 0 else self.N - digest)
        return offsets


pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
conn = redis.StrictRedis(connection_pool=pool)

# Usage:
# if __name__ == "__main__":
#     bf = PyBloomFilter(conn=conn)  # connect to Redis through the connection pool
#     bf.add('www.jobbole.com')  # add a domain name under the default Redis key
#     bf.add('www.luyin.org')  # add a domain name under the default Redis key
#     print(bf.is_exist('www.zhihu.com'))  # prints 1 if the domain is present, 0 otherwise
#     print(bf.is_exist('www.luyin.org'))  # prints 1 if the domain is present, 0 otherwise