-
Notifications
You must be signed in to change notification settings - Fork 8
/
bloom filter.jl
158 lines (105 loc) · 3.81 KB
/
bloom filter.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# coding: utf-8
# In[1]:
#bloom filter is a space-efficient probabilistic data structure
#it is commonly used for query in large dataset
#the query can possibly return false positive matches but never false negative matches
#the workflow is very straight forward
#for each string in the database, it has to go through k number of hash functions
#k hash value would be allocated to the hashtable accordingly
#to make any query,the query string has to go through k number of hash functions as well
#if every underlying cell has been occupied in the hashtable
#the query returns positive matches
#fowler–noll–vo hash function
#pseudo code can be found in the link below
# https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1_hash
function fnv1(text,
offset_basis=0xcbf29ce484222325,
prime=0x100000001b3)
hash_value=offset_basis
for letter in text
hash_value*=prime
#for some unknown reasons,large int xor in julia is different from python
hash_value=hash_value ⊻ codepoint(letter)
end
return hash_value
end
# In[2]:
#jenkins one at a time hash function
#pseudo code can be found in the link below
# https://en.wikipedia.org/wiki/Jenkins_hash_function#one_at_a_time
#actual code can be found in the link below
# https://stackoverflow.com/a/51730837
function one_at_a_time(text)
hash_value=0
for letter in text
hash_value+=codepoint(letter)
hash_value=hash_value & 0xFFFFFFFF
hash_value+=hash_value<<10
hash_value=hash_value & 0xFFFFFFFF
hash_value=hash_value ⊻ hash_value>>6
hash_value=hash_value & 0xFFFFFFFF
end
hash_value+=hash_value<<3
hash_value=hash_value & 0xFFFFFFFF
hash_value=hash_value ⊻ hash_value>>11
hash_value=hash_value & 0xFFFFFFFF
hash_value+=hash_value<<15
hash_value=hash_value & 0xFFFFFFFF
return hash_value
end
# In[3]:
#compute hash value from two different functions
#update the underlying cells in hashtable
function update_hashtable(text,hashtable,verbose=true)
#use extra computation to shrink hashtable size
fnv_hash=Int64(round(fnv1(text)^0.0625))
jenkins_hash=Int64(round(one_at_a_time(text)^0.25))
#prevent overflow
if max(fnv_hash,jenkins_hash)<=hashtable_size
#update hashtable
hashtable[fnv_hash]=1
hashtable[jenkins_hash]=1
if verbose
println("Hashtable updated.")
end
else
if verbose
println("Hash value exceeds hashtable size!")
end
end
return hashtable
end
# In[4]:
#compute hash value from two different functions
#check the underlying cells in hashtable
function search_hashtable(text,hashtable)
#use extra computation to shrink hashtable size
fnv_hash=Int64(round(fnv1(text)^0.0625))
jenkins_hash=Int64(round(one_at_a_time(text)^0.25))
#check hashtable
if hashtable[fnv_hash]==1 && hashtable[jenkins_hash]==1
return true
else
return false
end
end
# In[5]:
#initialize
hashtable_size=500
hashtable=[0 for _ in 1:hashtable_size];
update_list=["raison d'être","porte-voix","prêt-à-gouverner",
"proxénétisme","hydroxychloroquine"];
# In[6]:
#update hashtable
for text in update_list
hashtable=update_hashtable(text,hashtable)
end
# In[7]:
#true negative
println(search_hashtable("raison d’être",hashtable))
println(search_hashtable("raison d être",hashtable))
println(search_hashtable("raison d\"être",hashtable))
#true positive
println(search_hashtable("raison d'être",hashtable))
#false positive
println(search_hashtable("rajson d'être",hashtable))