Skip to content

Commit 33c1e7f

Browse files
committed
upload script for conflict
1 parent 98184f9 commit 33c1e7f

File tree

5 files changed

+620
-0
lines changed

5 files changed

+620
-0
lines changed

s00_function.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
# Project: ISCA 2021 Script
2+
# Filename: s00_function.py
3+
# Date: March 16, 2021
4+
# Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
5+
# Title: Python Function File for ISCA 2021 Script
6+
# Description:
7+
## This script contains callable functions used by other Python scripts.
8+
## You don't need to run this script since it will be called by other Python scripts.
9+
10+
#%% Import some libraries that are needed
11+
import urllib.request
12+
import json
13+
import unidecode
14+
import os
15+
from fuzzywuzzy import fuzz
16+
import xmltodict
17+
import hashlib
18+
19+
#%% Function to Retrieve DBLP Person ID
20+
## This function is used to retrieve DBLP Person ID using DBLP API based on Person Name
21+
## Since several people may share the same name, the function will return
22+
## a JSON document that contains all candidate people.
23+
def request_author_key(firstname, lastname, retry_num=2, outputtype='json'):
    """Query the DBLP author-search API for a person name, caching the result.

    The query is built from the first word of ``firstname`` and the whole
    ``lastname``.  The decoded response is stored in
    ``.cache/person_id/<sha256 of the query>.json`` (relative to the current
    working directory) and served from there on later calls.

    Args:
        firstname: Given name(s); only the first whitespace-separated word is
            used.  An empty or whitespace-only value contributes no search
            term instead of raising ``IndexError``.
        lastname: Family name, used as is.
        retry_num: Number of extra attempts made when the request fails with
            ``urllib.error.URLError``.  Retries are immediate (no back-off).
        outputtype: Value of the ``format`` URL parameter.  The response is
            always parsed with ``json.loads``, so only ``'json'`` is usable.

    Returns:
        dict: The DBLP response (fresh or cached) as a Python dictionary.

    Raises:
        urllib.error.URLError: If every attempt to reach DBLP fails.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Function-scope imports: the module header only imports urllib.request,
    # and this block should not rely on it pulling in sibling submodules.
    import urllib.error
    import urllib.parse

    cache_dir = '.cache/person_id'
    # exist_ok creates '.cache' as well and avoids a check-then-create race
    os.makedirs(cache_dir, exist_ok=True)

    # Define the DBLP API URL to retrieve the author
    api_url = 'https://dblp.org/search/author/api?'
    # Define the format, currently it is json
    format_url = 'format=%s' % (outputtype)
    # If firstname consists of multiple words, then use only the first word
    first_words = firstname.split()
    req_firstname = first_words[0] if first_words else ''
    # Construct the query, please refer to https://dblp.org/faq/1474589.html
    # Empty terms are left out so the query never contains a bare '$$'.
    search_terms = ['$%s$' % urllib.parse.quote(term)
                    for term in (req_firstname, lastname) if term]
    req_query = 'q=' + '+'.join(search_terms)
    req_url = '%s%s&%s' % (api_url, req_query, format_url)
    # The cache key is derived from the query string only
    req_hash = hashlib.sha256(req_query.encode('utf-8')).hexdigest()
    cache_path = cache_dir + '/' + req_hash + '.json'

    # Serve the request from the cache when possible
    if os.path.isfile(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    # Fetch author data using the DBLP API, retrying on network errors
    attempts = max(int(retry_num), 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the HTTP response deterministically
            with urllib.request.urlopen(req_url) as resource:
                raw_str = resource.read()
            break
        except urllib.error.URLError:
            if attempt == attempts:
                raise

    # Convert JSON to Python Dictionary
    json_dict = json.loads(raw_str.decode('utf-8'))
    # Only a successfully parsed response is written to the cache
    with open(cache_path, 'w', encoding='utf-8') as fp:
        json.dump(json_dict, fp)
    return json_dict
62+
63+
#%% Function to merge affiliation
64+
# DBLP may return multiple affiliations. This function merges all affiliations into a list of strings.
65+
def merge_affiliation(pc_json, entrynum):
    """Collect the affiliation texts DBLP lists for one author-search hit.

    DBLP stores notes under ``info -> notes -> note``; the value is a single
    dict when there is one note and a list of dicts when there are several.
    Only notes whose ``@type`` is ``'affiliation'`` are returned.

    Args:
        pc_json: Parsed DBLP author-search response.  It is not modified.
        entrynum: Index of the hit inside ``pc_json['result']['hits']['hit']``.

    Returns:
        list: The ``text`` of every affiliation note, in DBLP order; empty
        when the hit carries no notes or no affiliation.

    Raises:
        KeyError, IndexError: If the hit itself is missing from ``pc_json``.
    """
    info = pc_json['result']['hits']['hit'][entrynum]['info']

    notes = info.get('notes')
    if not isinstance(notes, dict):
        return []

    note_entries = notes.get('note')
    if isinstance(note_entries, dict):
        # Normalise the single-note shape locally instead of rewriting the
        # caller's JSON in place.
        note_entries = [note_entries]
    elif not isinstance(note_entries, list):
        return []

    # Notes lacking '@type' or 'text' are skipped rather than raising KeyError
    return [
        note['text']
        for note in note_entries
        if isinstance(note, dict)
        and note.get('@type') == 'affiliation'
        and 'text' in note
    ]
77+
78+
#%% Function to Convert JSON returned by DBLP to Python Dictionary
79+
# This function converts each possible person with a given name returned by DBLP to Python Dictionary
80+
def convert_to_dict(pc_member, pc_json):
    """Turn a DBLP author-search response into one record per candidate.

    Args:
        pc_member: Mapping with the keys ``full``, ``first``, ``last``,
            ``affiliation`` and ``email`` describing the PC member.
        pc_json: Parsed DBLP author-search response for that member.

    Returns:
        list: One dictionary per hit reported in ``hits['@sent']``.  When
        DBLP sent no hit, a single placeholder record with ``isError`` set
        to 1 and empty DBLP fields is returned instead.
    """
    hits = pc_json['result']['hits']
    hit_count = int(hits['@sent'])

    def hotcrp_fields():
        # A fresh dict per record; it also fixes the leading key order.
        return {
            "full_name": pc_member['full'],
            "first_name": pc_member['first'],
            "last_name": pc_member['last'],
            "affiliation": pc_member['affiliation'],
            "email": pc_member['email'],
        }

    if hit_count == 0:
        # Nothing found on DBLP: flag the member as an error entry
        placeholder = hotcrp_fields()
        placeholder.update(
            isUnique=0,
            isError=1,
            entrynum=0,
            name_confidence=0,
            affl_confidence=0,
            name_dblp='',
            url_dblp='',
            affiliation_dblp=[],
        )
        return [placeholder]

    # A member is unique only when DBLP sent exactly one candidate
    unique_flag = 1 if hit_count == 1 else 0

    records = []
    # iterate over each entry in the JSON response
    for entrynum in range(hit_count):
        # merge multiple affiliations (if any) into one list
        affiliations = merge_affiliation(pc_json, entrynum)
        record = hotcrp_fields()
        hit_info = hits['hit'][entrynum]['info']
        record.update(
            isUnique=unique_flag,
            isError=0,
            entrynum=entrynum,
            name_confidence=0,
            affl_confidence=0,
            name_dblp=hit_info['author'],
            url_dblp=hit_info['url'],
            affiliation_dblp=affiliations,
        )
        records.append(record)
    return records
129+
130+
#%% Function to Filter the returned list of people based on the name.
131+
# This filter is not perfect. It uses fuzzy match to the string.
132+
def filter_name(pc_member_dblp_list, confidence_threshold=90):
    """Keep the DBLP candidates whose name fuzzily matches the member's name.

    Args:
        pc_member_dblp_list: Candidate records carrying ``full_name`` and
            ``name_dblp``.
        confidence_threshold: Minimum ``fuzz.ratio`` score (computed on the
            lower-cased names) a candidate needs to be kept.

    Returns:
        list: The input list itself when it holds at most one record
        (nothing to disambiguate); otherwise a new list with the surviving
        records, each updated in place with its ``name_confidence`` score.
    """
    # only filter if multiple entries are found
    if len(pc_member_dblp_list) <= 1:
        return pc_member_dblp_list

    survivors = []
    for candidate in pc_member_dblp_list:
        hotcrp_name = candidate['full_name'].lower()
        dblp_name = candidate['name_dblp'].lower()
        score = fuzz.ratio(hotcrp_name, dblp_name)
        if score < confidence_threshold:
            continue
        candidate['name_confidence'] = score
        survivors.append(candidate)
    return survivors
148+
149+
#%% Function to Filter the returned list of people based on the affiliation.
150+
# This filter is not perfect. It uses fuzzy match to the string.
151+
# Use this filter after filtering based on the name
152+
def filter_affiliation(pc_member_dblp_list, confidence_threshold=80):
    """Keep the DBLP candidates whose affiliation fuzzily matches the member's.

    Use this filter after filtering on the name.  For every candidate the
    HotCRP affiliation is compared (lower-cased, ``fuzz.partial_ratio``)
    against each DBLP affiliation and the best score is used.

    Args:
        pc_member_dblp_list: Candidate records carrying ``affiliation`` and
            ``affiliation_dblp`` (a list of strings).
        confidence_threshold: Minimum best score a candidate needs to be kept.

    Returns:
        list: The input list itself when it holds at most one record;
        otherwise a new list holding
          * every candidate without DBLP affiliations (the comparison is
            inconclusive, so it is kept exactly once), and
          * every candidate whose best score reaches the threshold, updated
            in place with that score in ``affl_confidence``.
    """
    # only filter if multiple entries are found
    if len(pc_member_dblp_list) <= 1:
        return pc_member_dblp_list

    kept = []
    for candidate in pc_member_dblp_list:
        dblp_affiliations = candidate['affiliation_dblp']
        if not dblp_affiliations:
            # inconclusive filtering since affiliation information is not
            # available on DBLP; handled first so the record cannot be added
            # twice when the threshold is zero or negative
            kept.append(candidate)
            continue

        hotcrp_affiliation = candidate['affiliation']
        if not isinstance(hotcrp_affiliation, str):
            # A missing CSV cell arrives as NaN (a float); compare against an
            # empty string instead of failing on .lower()
            hotcrp_affiliation = ''
        hotcrp_affiliation = hotcrp_affiliation.lower()

        best_score = max(
            (fuzz.partial_ratio(hotcrp_affiliation, dblp_affiliation.lower())
             for dblp_affiliation in dblp_affiliations),
            default=0,
        )
        if best_score >= confidence_threshold:
            candidate['affl_confidence'] = best_score
            kept.append(candidate)
    return kept
174+
175+
176+
#%% Function to retrieve all of a PC member's publications from DBLP
177+
def request_publication_list(dblp_link, retry_num=2, outputtype='xml'):
    """Download (or load from cache) the DBLP person record behind a link.

    ``<dblp_link>.xml`` is fetched, parsed with ``xmltodict`` and cached as
    ``.cache/pub_id/<sha256 of the URL>`` relative to the current working
    directory.

    Args:
        dblp_link: DBLP person URL without the ``.xml`` suffix.
        retry_num: Number of extra attempts made when the request fails with
            ``urllib.error.URLError``.  Retries are immediate (no back-off).
        outputtype: Accepted for interface compatibility; the request always
            uses the ``.xml`` endpoint.

    Returns:
        dict: The parsed XML document.

    Raises:
        urllib.error.URLError: If every attempt to reach DBLP fails.
        ValueError: If ``dblp_link`` does not form a valid URL.
    """
    # Function-scope import: the module header only imports urllib.request
    import urllib.error

    cache_dir = '.cache/pub_id'
    # exist_ok creates '.cache' as well and avoids a check-then-create race
    os.makedirs(cache_dir, exist_ok=True)

    req_url = '%s.xml' % (dblp_link)
    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
    cache_path = cache_dir + '/' + req_hash

    # Cache: read explicitly as UTF-8, the encoding the file is written in,
    # instead of the platform's locale encoding.
    cached_xml = None
    if os.path.isfile(cache_path):
        try:
            with open(cache_path, 'r', encoding='utf-8') as fp:
                cached_xml = fp.read()
        except UnicodeDecodeError:
            # Entry written under another locale encoding: fetch it again
            cached_xml = None
    if cached_xml is not None:
        return xmltodict.parse(cached_xml, dict_constructor=dict)

    # Fetch the person record from DBLP, retrying on network errors
    attempts = max(int(retry_num), 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the HTTP response deterministically
            with urllib.request.urlopen(req_url) as resource:
                raw_str = resource.read()
            break
        except urllib.error.URLError:
            if attempt == attempts:
                raise

    # Convert XML to Python Dictionary
    xml_dict = xmltodict.parse(raw_str.decode('utf-8'), dict_constructor=dict)
    with open(cache_path, 'w', encoding='utf-8') as fp:
        fp.write(xmltodict.unparse(xml_dict))
    return xml_dict
201+
202+
#%% Function to retrieve a PC member's affiliation(s) from DBLP
203+
def _extract_affiliations(xml_dict):
    """Return the text of every affiliation note in a parsed DBLP person record."""
    notes = xml_dict['dblpperson']['person'].get('note', [])
    if not isinstance(notes, list):
        # a single <note> element is not wrapped in a list
        notes = [notes]
    # Notes that are not dicts, or lack '@type' / '#text', are skipped so one
    # odd note does not discard the affiliations found in the others.
    return [
        note['#text']
        for note in notes
        if isinstance(note, dict)
        and note.get('@type') == 'affiliation'
        and note.get('#text')
    ]


def request_affiliation(dblp_link, retry_num=2, outputtype='xml'):
    """Return the affiliation(s) DBLP records for a person, as one string.

    ``<dblp_link>.xml`` is fetched (or read from ``.cache/pub_id``, keyed by
    the SHA-256 of the URL) and its affiliation notes are joined with commas.
    The look-up is best effort: any failure to download or interpret the
    record yields the ``'NONE <DBLP>'`` sentinel instead of an exception.

    Args:
        dblp_link: DBLP person URL without the ``.xml`` suffix.
        retry_num: Number of extra attempts made when the request fails with
            ``urllib.error.URLError``.  Retries are immediate (no back-off).
        outputtype: Accepted for interface compatibility; the request always
            uses the ``.xml`` endpoint.

    Returns:
        str: ``'<affiliation>[,<affiliation>...] <DBLP>'``, or
        ``'NONE <DBLP>'`` when no affiliation could be determined.
    """
    # Function-scope import: the module header only imports urllib.request
    import urllib.error

    cache_dir = '.cache/pub_id'
    # exist_ok creates '.cache' as well and avoids a check-then-create race
    os.makedirs(cache_dir, exist_ok=True)

    req_url = '%s.xml' % (dblp_link)
    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
    cache_path = cache_dir + '/' + req_hash

    try:
        # Cache: read explicitly as UTF-8, the encoding the file is written in
        xml_text = None
        if os.path.isfile(cache_path):
            try:
                with open(cache_path, 'r', encoding='utf-8') as fp:
                    xml_text = fp.read()
            except UnicodeDecodeError:
                # Entry written under another locale encoding: fetch it again
                xml_text = None

        if xml_text is not None:
            xml_dict = xmltodict.parse(xml_text, dict_constructor=dict)
        else:
            attempts = max(int(retry_num), 0) + 1
            for attempt in range(1, attempts + 1):
                try:
                    # The context manager closes the HTTP response
                    with urllib.request.urlopen(req_url) as resource:
                        raw_str = resource.read()
                    break
                except urllib.error.URLError:
                    if attempt == attempts:
                        raise
            # Convert XML to Python Dictionary
            xml_dict = xmltodict.parse(raw_str.decode('utf-8'), dict_constructor=dict)
            with open(cache_path, 'w', encoding='utf-8') as fp:
                fp.write(xmltodict.unparse(xml_dict))

        affiliation_list = _extract_affiliations(xml_dict)
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` it
        # lets KeyboardInterrupt and SystemExit propagate.
        return 'NONE <DBLP>'

    if not affiliation_list:
        # Notes exist but none is an affiliation: report the sentinel rather
        # than a bare ' <DBLP>'
        return 'NONE <DBLP>'
    return ','.join(affiliation_list) + ' <DBLP>'

s01_pcname_to_dblp_person_id.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Project: ISCA 2021 Script
2+
# Filename: s01_pcname_to_dblp_person_id.py
3+
# Date: March 16, 2021
4+
# Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
5+
# Title: PC Member Name to DBLP Person ID Lookup Script
6+
# Description:
7+
## This script is used to look-up appropriate DBLP Person ID that match
8+
## with a given name (first name and last name). The script uses fuzzy match
9+
## to match the given person name with the DBLP person name. Because of homonym
10+
## (i.e., multiple persons with the same name), the script may output multiple
11+
## people with the same name and requires a bit of manual work to choose the
12+
## correct person.
13+
14+
#%% Import some libraries that are needed
15+
import pandas as pd
16+
import numpy as np
17+
import tqdm
18+
import os
19+
from fuzzywuzzy import fuzz
20+
from s00_function import request_author_key
21+
from s00_function import merge_affiliation
22+
from s00_function import convert_to_dict
23+
from s00_function import filter_name
24+
from s00_function import filter_affiliation
25+
26+
#%% Define the input and output CSV filename
# Input CSV filename
## This is the CSV file obtained from HotCRP by going to Users and selecting
## 'Program Committee' in the filter. At the bottom of the page, click
## Select All, then Download 'PC info' and click 'Go'.
pc_info_hotcrp_filename = 'sample-data/input/isca2021-pcinfo.csv'

# Output CSV filename
pc_to_dblp_filename = 'sample-data/output/isca2021-pc-to-dblp.csv'

# %% Load Input CSV to Pandas Dataframe
# Load the PC Info from HotCRP
pc_info_hotcrp_df = pd.read_csv(pc_info_hotcrp_filename)

# %% Sanitize
# Construct and Sanitize First name and Last name
pc_members_name = (pc_info_hotcrp_df['first'] + " " + pc_info_hotcrp_df['last']).to_frame(name='full')
name_parts = pc_members_name['full'].str.split()
# The last word of the full name is treated as the last name
pc_members_name['last'] = name_parts.str[-1]
# Everything before the last word is the first name. Joining the leading
# words (instead of deleting " <last>" from the full name) keeps names such
# as "Xiao Ling Li" intact, where " Li" is also the start of " Ling".
leading_words = name_parts.str[:-1].str.join(' ')
# A one-word name has no leading words; fall back to the full name so the
# DBLP query still gets a non-empty first name.
pc_members_name['first'] = leading_words.where(leading_words != '', pc_members_name['full'])
pc_members_name['affiliation'] = pc_info_hotcrp_df['affiliation']
pc_members_name['email'] = pc_info_hotcrp_df['email']

# %% Request DBLP Person ID for each PC Members
pc_members_list = []

for index, pc_member in tqdm.tqdm(pc_members_name.iterrows(), total=pc_members_name.shape[0]):
    pc_json = request_author_key(pc_member["first"], pc_member["last"])
    pc_member_dblp_list = convert_to_dict(pc_member, pc_json)
    confidence_level = 100
    pc_member_dblp_list_filtered = []

    # Filtering by name with the highest confidence level,
    # then gradually reducing the confidence level until at least one item comes up.
    # This is not 100% perfect.
    # The lower bound guarantees termination even if no candidate is ever returned.
    while not pc_member_dblp_list_filtered and confidence_level >= 0:
        pc_member_dblp_list_filtered = filter_name(pc_member_dblp_list, confidence_level)
        confidence_level = confidence_level - 1
    pc_member_dblp_list = pc_member_dblp_list_filtered

    # Filtering by Affiliation; keep the name-filtered list when the
    # affiliation filter rejects every candidate
    pc_member_dblp_list_filtered = filter_affiliation(pc_member_dblp_list)
    if pc_member_dblp_list_filtered:
        pc_member_dblp_list = pc_member_dblp_list_filtered

    # Add to the Final List (in place, no re-copy of the accumulated list)
    pc_members_list.extend(pc_member_dblp_list)

# %% Write Output to CSV
# This CSV needs to be inspected manually.
## NOTE: To avoid overwriting, rename the final CSV file to something else.
output_dir = os.path.dirname(pc_to_dblp_filename)
if output_dir:
    # to_csv does not create missing parent directories
    os.makedirs(output_dir, exist_ok=True)
pc_members_df = pd.DataFrame(pc_members_list)
pc_members_df.to_csv(pc_to_dblp_filename, index=False)
80+
81+
# %%

0 commit comments

Comments
 (0)