-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcommon_substring.py
More file actions
executable file
·74 lines (63 loc) · 2.53 KB
/
common_substring.py
File metadata and controls
executable file
·74 lines (63 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Longest Common Substring
import sys
from random import randint
from collections import namedtuple
Answer = namedtuple('answer_type', 'i j len')
# Find a string w of maximal length that is a substring of both s and t.
# Special case of the edit distance problem (only insertions and deletions).
# Can be solved in time O(|s| * |t|) using dynamic programming.
# Goal is to use hashing to solve it in almost linear time.
def binary_search(s, t, prime1, rand1, prime2, rand2):
# Use binary search to find the length of the longest common substring.
answer = Answer(0, 0, 0)
left, right = 0, min(len(s), len(t))
while left <= right:
mid = (left + right)//2
# Precompute hash values of all substrings of length k of s and t.
hash_s = precompute_hashes(s, mid, prime1, rand1, prime2, rand2)
hash_t = precompute_hashes(t, mid, prime1, rand1, prime2, rand2)
# Store hash values of all substrings of length k of the string s.
dict_s = {h: i for i, h in enumerate(hash_s)}
dict_t = {h: i for i, h in enumerate(hash_t)}
# Go through all substrings of length k of the string t and check
# whether the hash values of this substring are present.
matching_hashes = dict_s.keys() & dict_t.keys()
if matching_hashes and mid > 0:
h = list(matching_hashes)[0]
i, j = dict_s[h], dict_t[h]
answer = Answer(i, j, mid)
left = mid + 1
else:
right = mid - 1
return answer
def precompute_hashes(string, length, prime1, rand1, prime2, rand2):
# Use two hash functions to reduce the probability of a collision
n = len(string)
m = n - length
substr = string[m:n]
hashes = [(0, 0)]*(m + 1)
hashes[m] = poly_hash(substr, prime1, rand1, prime2, rand2)
num1, num2 = 1, 1
for _ in range(1, length + 1):
num1, num2 = (num1*rand1) % prime1, (num2*rand2) % prime2
for i in range(m - 1, -1, -1):
h1, h2 = hashes[i + 1]
c1, c2 = ord(string[i]), ord(string[i + length])
x1, x2 = c1 - num1*c2, c1 - num2*c2
hashes[i] = (rand1*h1 + x1) % prime1, (rand2*h2 + x2) % prime2
return hashes
def poly_hash(substr, prime1, rand1, prime2, rand2):
hash_value1, hash_value2 = 0, 0
for c in reversed(substr):
hash_value1 = (hash_value1*rand1 + ord(c)) % prime1
hash_value2 = (hash_value2*rand2 + ord(c)) % prime2
return hash_value1, hash_value2
def solve(s, t):
prime1, prime2 = 1000000007, 1000000009
rand1, rand2 = randint(31, prime1 - 1), randint(31, prime2 - 1)
return binary_search(s, t, prime1, rand1, prime2, rand2)
if __name__ == '__main__':
for line in sys.stdin.readlines():
a, b = line.split()
ans = solve(a, b)
print(ans.i, ans.j, ans.len)