-
Notifications
You must be signed in to change notification settings - Fork 33
/
addRedirects.py
121 lines (80 loc) · 3.46 KB
/
addRedirects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python
'''
Copyright (C) 2010 Cagatay Calli <[email protected]>
Adds redirections from Wikiprep output to target Wikipedia articles.
USAGE: addRedirects.py <redir file from Wikiprep> <any writeable folder>
The folder is used by the script to create data files that are loaded into database.
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
'''
import sys
import re
import MySQLdb
# Wikiprep flavour that produced the redirect file.
# Supported values: 'Gabrilovich', 'Zemanta-legacy', 'Zemanta-modern'.
FORMAT = 'Gabrilovich'

# Maximum number of redirect rows written to a single partition file.
PARTITION_SIZE = 100000

# Size argument passed to f.read() for each chunk of the redirect file (10 MB).
RSIZE = 10000000

# Both patterns capture the redirecting title as group 1 / 'text' and the
# numeric id of the redirect target as group 2 / 'target'. They differ only in
# the element that carries the title (<name> vs. <title>); the legacy pattern
# additionally tolerates extra content between </title> and </from>.
# Raw strings are used so that '\d' is not an invalid string escape; the regex
# engine itself interprets '\n' as a newline, so the patterns are unchanged.
reModernREDIR = re.compile(
    r'<redirect>\n<from>\n<id>.+?</id>\n<name>(?P<text>.+?)</name>\n</from>\n'
    r'<to>\n<id>(?P<target>\d+)</id>\n<name>.+?</name>\n</to>\n</redirect>',
    re.DOTALL | re.MULTILINE)
reLegacyREDIR = re.compile(
    r'<redirect>\n<from>\n<id>.+?</id>\n<title>(?P<text>.+?)</title>.*?</from>\n'
    r'<to>\n<id>(?P<target>\d+)</id>\n<title>.+?</title>\n</to>\n</redirect>',
    re.DOTALL | re.MULTILINE)

# Select the pattern from FORMAT. The previous test was the bare string literal
# 'Zemanta-modern', which is always truthy, so the modern pattern was chosen
# regardless of FORMAT. Every value other than 'Zemanta-modern' now falls back
# to the legacy pattern.
if FORMAT == 'Zemanta-modern':
    reREDIR = reModernREDIR
else:
    reREDIR = reLegacyREDIR
# Command line: <redirect file from Wikiprep> <writeable output folder>.
args = sys.argv[1:]
if len(args) < 2:
    # Tell the user what is missing instead of exiting silently.
    sys.stderr.write('USAGE: addRedirects.py <redir file from Wikiprep> <any writeable folder>\n')
    sys.exit(1)

# Redirect file to scan.
f = open(args[0], 'r')

# Normalise the folder so that it ends with exactly one '/', then derive the
# common prefix of the partition files (<folder>/zredir0, <folder>/zredir1, ...).
outFolder = args[1].rstrip('/') + '/'
outPrefix = outFolder + 'zredir'

# First partition file; further partitions are opened as each one fills up.
out = open(outPrefix + '0', 'w')

# NOTE(review): connection settings are hard-coded; adjust them to the local
# MySQL installation before running.
try:
    conn = MySQLdb.connect(
        host='localhost',
        user='root',
        passwd='123456',
        db='wiki',
        charset="utf8",
        use_unicode=True)
except MySQLdb.Error as e:
    print("Error %d: %s" % (e.args[0], e.args[1]))
    # Release the files opened above before giving up.
    out.close()
    f.close()
    sys.exit(1)
# Scan the redirect file chunk by chunk and write one "<target id>\t<title>"
# line per redirect into partition files of at most PARTITION_SIZE rows.
# After this section, outk is the number of partition files that contain rows.
lc = 0    # rows written to the partition file that is currently open
outk = 0  # index of the partition file that is currently open

# Drop everything up to and including the opening <redirects> tag when it
# occurs in the first 10000 characters. If the tag is absent, keep the text
# untouched (find() returns -1, which previously skipped 10 characters).
firstRead = f.read(10000)
documentStart = firstRead.find('<redirects>')
if documentStart < 0:
    documentStart = 0
else:
    documentStart += len('<redirects>')
prevText = firstRead[documentStart:]

while True:
    newText = f.read(RSIZE)

    # Prepend the unconsumed tail of the previous chunk so that records which
    # straddle a chunk boundary are matched as a whole.
    text = prevText + newText

    # Offset just past the last complete record found in this buffer. Starting
    # at 0 carries the whole buffer over when nothing matched; the former
    # start value of -1 kept only the final character in that case.
    endIndex = 0
    for page in reREDIR.finditer(text):
        out.write(page.group('target') + '\t' + page.group('text') + '\n')
        lc += 1
        if lc >= PARTITION_SIZE:
            # Current partition is full: switch to the next file.
            lc = 0
            outk += 1
            out.close()
            out = open(outPrefix + str(outk), 'w')
        endIndex = page.end()
    prevText = text[endIndex:]

    # Stop only after the buffered text has been scanned, so that input which
    # fits entirely into the first read is still processed.
    if not newText:
        break

f.close()

# Always release the last partition file, but count it only if it holds rows
# (it is empty when no redirect was found or right after a rotation).
out.close()
if lc > 0:
    outk += 1
# Load every partition file into MySQL and prepend the collected redirect
# titles to the text of the article they point to.
try:
    cursor = conn.cursor()
    for i in range(outk):
        si = str(i)
        zredir = "zredir" + si        # raw (target_id, redir) rows of partition i
        redirList = "redirList" + si  # redirects of partition i grouped per target

        # Stage the partition file in a scratch table.
        cursor.execute("DROP TABLE IF EXISTS " + zredir)
        cursor.execute("CREATE TABLE " + zredir + " (target_id int(10) unsigned, redir varbinary(255));")
        # The file path is passed as a query parameter so that the driver
        # quotes it; splicing it into the statement breaks on paths that
        # contain a quote or a backslash.
        cursor.execute("LOAD DATA LOCAL INFILE %s INTO TABLE " + zredir, (outPrefix + si,))
        cursor.execute("CREATE INDEX idx_target_id ON " + zredir + " (target_id);")

        # Collapse all redirects of one target into a single text value.
        # NOTE(review): GROUP_CONCAT output is capped by the server's
        # group_concat_max_len setting - confirm it is large enough for
        # targets with many redirects.
        cursor.execute("DROP TABLE IF EXISTS " + redirList)
        cursor.execute("CREATE TABLE " + redirList + " SELECT a.target_id,GROUP_CONCAT(a.redir SEPARATOR ' \n') AS redir_text FROM " + zredir + " a WHERE a.redir IS NOT NULL GROUP BY a.target_id;")
        cursor.execute("DROP TABLE " + zredir)

        # add redirects after creating each partition
        cursor.execute("CREATE INDEX idx_target_id ON " + redirList + " (target_id);")
        cursor.execute("UPDATE text t, " + redirList + " a SET t.old_text = CONCAT(a.redir_text,' \n',t.old_text) WHERE t.old_id = a.target_id AND a.redir_text IS NOT NULL;")
        cursor.execute("DROP TABLE " + redirList)

    cursor.close()
    conn.close()
except MySQLdb.Error as e:
    print("Error %d: %s" % (e.args[0], e.args[1]))
    sys.exit(1)