1+ # Project: ISCA 2021 Script
2+ # Filename: s00_function.py
3+ # Date: March 16, 2021
4+ # Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
5+ # Title: Python Function File for ISCA 2021 Script
6+ # Description:
7+ ## This script contains callable functions used by other Python scripts.
8+ ## You don't need to run this script since it will be called by other Python scripts.
9+
10+ #%% Import some libraries that are needed
11+ import urllib .request
12+ import json
13+ import unidecode
14+ import os
15+ from fuzzywuzzy import fuzz
16+ import xmltodict
17+ import hashlib
18+
19+ #%% Function to Retrieve DBLP Person ID
## This function retrieves candidate DBLP person entries for a given person name using the DBLP API.
## Since several people may share the same name, the function returns the parsed
## JSON response, which contains every matching person.
def request_author_key(firstname, lastname, retry_num=2, outputtype='json'):
    """Query the DBLP author-search API for a person and return the parsed reply.

    Replies are cached on disk under ``.cache/person_id/<sha256>.json``, where
    the hash is taken over the query string, so repeated look-ups of the same
    name do not hit the network again.

    Args:
        firstname: Given name(s). Only the first whitespace-separated word is
            used in the query. If it is empty or blank, the query falls back
            to the last name alone instead of raising ``IndexError``.
        lastname: Family name, used verbatim (URL-quoted).
        retry_num: Accepted for interface compatibility; not used here.
        outputtype: Value sent as the ``format=`` URL parameter. The reply is
            always parsed with ``json.loads``, so it should stay ``'json'``.

    Returns:
        The decoded JSON reply as a Python dictionary.

    Raises:
        urllib.error.URLError: If the request fails on a cache miss.
        json.JSONDecodeError: If the reply (or cached file) is not valid JSON.
    """
    # Imported locally so the function does not depend on urllib.request
    # pulling urllib.parse into scope as a side effect.
    from urllib.parse import quote

    cache_dir = os.path.join('.cache', 'person_id')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(cache_dir, exist_ok=True)

    # Define the DBLP API URL to retrieve the author
    api_url = 'https://dblp.org/search/author/api?'
    # Define the format, currently it is json
    format_url = ('format=%s' % (outputtype))

    # If firstname consists of multiple words, use only the first word;
    # if it is blank, search by last name only.
    search_terms = firstname.split()[:1]
    search_terms.append(lastname)

    # Construct the query, please refer to https://dblp.org/faq/1474589.html
    req_query = 'q=' + '+'.join('$%s$' % quote(term) for term in search_terms)
    req_url = ('%s%s&%s' % (api_url, req_query, format_url))
    req_hash = hashlib.sha256(req_query.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_dir, req_hash + '.json')

    # Serve from the cache when this query has been answered before.
    if os.path.isfile(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    # Fetch author data using the DBLP API; the context manager closes the
    # HTTP response even if reading fails.
    with urllib.request.urlopen(req_url) as resource:
        raw_str = resource.read()
    json_dict = json.loads(raw_str.decode('utf-8'))

    # Store the parsed reply for later runs.
    with open(cache_path, 'w', encoding='utf-8') as fp:
        json.dump(json_dict, fp)
    return json_dict
62+
63+ #%% Function to merge affiliation
# DBLP may return multiple affiliations. This function merges all affiliations into a list of strings.
def merge_affiliation(pc_json, entrynum):
    """Collect the affiliation strings of one hit in a DBLP author-search reply.

    Args:
        pc_json: Parsed reply; the hit is read from
            ``pc_json['result']['hits']['hit'][entrynum]['info']``.
        entrynum: Index of the hit to inspect.

    Returns:
        A list with the ``'text'`` of every note whose ``'@type'`` is
        ``'affiliation'``; empty when the hit has no ``notes``/``note`` entry.

    Note:
        When the hit carries a single note as a dictionary, it is wrapped in a
        list inside ``pc_json`` itself (the input is modified in place).
    """
    info = pc_json['result']['hits']['hit'][entrynum]['info']
    if 'notes' not in info or 'note' not in info['notes']:
        return []

    notes = info['notes']
    # A lone note arrives as a dict; normalise it to a one-element list.
    if isinstance(notes['note'], dict):
        notes['note'] = [notes['note']]

    return [
        note['text']
        for note in notes['note']
        if note['@type'] == 'affiliation'
    ]
77+
78+ #%% Function to Convert JSON returned by DBLP to Python Dictionary
79+ # This function converts each possible person with a given name returned by DBLP to Python Dictionary
def convert_to_dict(pc_member, pc_json):
    """Turn a DBLP author-search reply into one record per returned person.

    Args:
        pc_member: Dictionary describing the person being looked up; the keys
            ``'full'``, ``'first'``, ``'last'``, ``'affiliation'`` and
            ``'email'`` are read.
        pc_json: Parsed DBLP reply. The number of hits is taken from
            ``pc_json['result']['hits']['@sent']``.

    Returns:
        A list of dictionaries. With zero hits it holds a single placeholder
        record (``isError`` = 1, empty DBLP fields). Otherwise it holds one
        record per hit with ``isError`` = 0 and ``isUnique`` = 1 only when
        exactly one hit was sent. Both confidence fields start at 0.
    """
    hits = pc_json['result']['hits']
    hit_count = int(hits['@sent'])

    def build_entry(unique_flag, error_flag, index, info, affiliations):
        # One record: the member's own data followed by the DBLP-side data.
        return {
            "full_name": pc_member['full'],
            "first_name": pc_member['first'],
            "last_name": pc_member['last'],
            "affiliation": pc_member['affiliation'],
            "email": pc_member['email'],
            "isUnique": unique_flag,
            "isError": error_flag,
            "entrynum": index,
            "name_confidence": 0,
            "affl_confidence": 0,
            "name_dblp": info['author'],
            "url_dblp": info['url'],
            "affiliation_dblp": affiliations,
        }

    # Nothing found: report a single error record with empty DBLP fields.
    if hit_count == 0:
        return [build_entry(0, 1, 0, {'author': '', 'url': ''}, [])]

    unique_flag = 1 if hit_count == 1 else 0
    entries = []
    # iterate over each entry in the reply
    for index in range(hit_count):
        # merge multiple affiliations (if any) before building the record
        affiliations = merge_affiliation(pc_json, index)
        info = hits['hit'][index]['info']
        entries.append(build_entry(unique_flag, 0, index, info, affiliations))
    return entries
129+
130+ #%% Function to Filter the returned list of people based on the name.
131+ # This filter is not perfect. It uses fuzzy match to the string.
def filter_name(pc_member_dblp_list, confidence_threshold=90):
    """Keep only the candidates whose DBLP name fuzzily matches the full name.

    Args:
        pc_member_dblp_list: Candidate records; ``'full_name'`` and
            ``'name_dblp'`` are compared case-insensitively with
            ``fuzz.ratio``.
        confidence_threshold: Minimum score a candidate needs to be kept.

    Returns:
        The input list itself when it has at most one entry (no filtering).
        Otherwise a new list with the candidates that reached the threshold;
        each kept record has ``'name_confidence'`` set to its score.
    """
    # only filter if multiple entries are found
    if not (len(pc_member_dblp_list) > 1):
        return pc_member_dblp_list

    survivors = []
    for candidate in pc_member_dblp_list:
        wanted_name = candidate['full_name'].lower()
        dblp_name = candidate['name_dblp'].lower()
        score = fuzz.ratio(wanted_name, dblp_name)
        if score >= confidence_threshold:
            candidate['name_confidence'] = score
            survivors.append(candidate)
    return survivors
148+
149+ #%% Function to Filter the returned list of people based on the affiliation.
150+ # This filter is not perfect. It uses fuzzy match to the string.
151+ # Use this filter after filtering based on the name
def filter_affiliation(pc_member_dblp_list, confidence_threshold=80):
    """Keep the candidates whose DBLP affiliation fuzzily matches the given one.

    Intended to be applied after filtering by name.

    Args:
        pc_member_dblp_list: Candidate records; ``'affiliation'`` is compared
            case-insensitively against every string in ``'affiliation_dblp'``
            with ``fuzz.partial_ratio``.
        confidence_threshold: Minimum best score a candidate needs to be kept.

    Returns:
        The input list itself when it has at most one entry (no filtering).
        Otherwise a new list containing, each at most once:
          * candidates whose best score reaches the threshold, with
            ``'affl_confidence'`` set to that score, and
          * candidates without any DBLP affiliation, for which the filter is
            inconclusive.
    """
    # only filter if multiple entries are found
    if len(pc_member_dblp_list) <= 1:
        return pc_member_dblp_list

    pc_member_dblp_list_filtered = []
    for pc_member_dict in pc_member_dblp_list:
        expected = pc_member_dict['affiliation'].lower()
        dblp_affiliations = pc_member_dict['affiliation_dblp']
        # Best score over all DBLP affiliations; 0 when there are none.
        max_confidence = max(
            [0] + [fuzz.partial_ratio(expected, affiliation_dblp.lower())
                   for affiliation_dblp in dblp_affiliations]
        )
        if max_confidence >= confidence_threshold:
            pc_member_dict['affl_confidence'] = max_confidence
            pc_member_dblp_list_filtered.append(pc_member_dict)
        elif len(dblp_affiliations) == 0:
            # Inconclusive: no affiliation information is available on DBLP.
            # `elif` keeps this branch from appending a second time when the
            # threshold is <= 0 and the branch above already kept the entry.
            pc_member_dblp_list_filtered.append(pc_member_dict)
    return pc_member_dblp_list_filtered
174+
175+
176+ #%% Function to retrieve PC member's all publications from DBLP
def request_publication_list(dblp_link, retry_num=2, outputtype='xml'):
    """Fetch a DBLP person page as XML and return it as a dictionary.

    The document at ``<dblp_link>.xml`` is downloaded, parsed with
    ``xmltodict`` and cached under ``.cache/pub_id/<sha256 of the URL>``.
    Later calls with the same link are served from that cache file.

    Args:
        dblp_link: URL of the DBLP person page, without the ``.xml`` suffix.
        retry_num: Accepted for interface compatibility; not used here.
        outputtype: Accepted for interface compatibility; not used here
            (the ``.xml`` document is always requested).

    Returns:
        The parsed XML document as nested plain ``dict`` objects.

    Raises:
        urllib.error.URLError: If the download fails on a cache miss.
        Exception: Whatever ``xmltodict.parse`` raises on malformed XML.
    """
    cache_dir = os.path.join('.cache', 'pub_id')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(cache_dir, exist_ok=True)

    # Construct the request URL, please refer to https://dblp.org/faq/1474589.html
    req_url = ('%s.xml' % (dblp_link))
    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_dir, req_hash)

    # Serve from the cache when available. The file is read as UTF-8 to match
    # the encoding it is written with below, independent of the locale.
    if os.path.isfile(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as fp:
            return xmltodict.parse(fp.read(), dict_constructor=dict)

    # Download the XML; the context manager closes the HTTP response.
    with urllib.request.urlopen(req_url) as resource:
        raw_str = resource.read()
    # Convert XML to Python Dictionary
    xml_dict = xmltodict.parse(raw_str.decode('utf-8'), dict_constructor=dict)

    # xmltodict.unparse declares UTF-8 in the XML header, so write UTF-8.
    with open(cache_path, 'w', encoding='utf-8') as fp:
        fp.write(xmltodict.unparse(xml_dict))
    return xml_dict
201+
#%% Function to retrieve PC member's affiliation from DBLP
def request_affiliation(dblp_link, retry_num=2, outputtype='xml'):
    """Return the affiliation(s) DBLP lists for a person, as a tagged string.

    The document at ``<dblp_link>.xml`` is read from the cache in
    ``.cache/pub_id/<sha256 of the URL>`` or downloaded and cached there.
    Affiliations are the ``note`` elements of
    ``xml_dict['dblpperson']['person']`` whose ``@type`` is ``'affiliation'``.

    This is a best-effort look-up: any failure to fetch, parse or locate the
    notes yields ``'NONE <DBLP>'`` instead of an exception.

    Args:
        dblp_link: URL of the DBLP person page, without the ``.xml`` suffix.
        retry_num: Accepted for interface compatibility; not used here.
        outputtype: Accepted for interface compatibility; not used here.

    Returns:
        The affiliations joined by ``','`` and followed by ``' <DBLP>'``, or
        ``'NONE <DBLP>'`` when no affiliation could be determined.
    """
    no_affiliation = 'NONE <DBLP>'

    cache_dir = os.path.join('.cache', 'pub_id')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(cache_dir, exist_ok=True)

    # Construct the request URL, please refer to https://dblp.org/faq/1474589.html
    req_url = ('%s.xml' % (dblp_link))
    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_dir, req_hash)

    try:
        if os.path.isfile(cache_path):
            with open(cache_path, 'r', encoding='utf-8') as fp:
                xml_dict = xmltodict.parse(fp.read(), dict_constructor=dict)
        else:
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(req_url) as resource:
                raw_str = resource.read()
            # Convert XML to Python Dictionary
            xml_dict = xmltodict.parse(raw_str.decode('utf-8'), dict_constructor=dict)
            # xmltodict.unparse declares UTF-8 in the XML header, so write UTF-8.
            with open(cache_path, 'w', encoding='utf-8') as fp:
                fp.write(xmltodict.unparse(xml_dict))
    except Exception:
        # Deliberately broad to keep the look-up best-effort, but unlike a
        # bare `except:` it lets KeyboardInterrupt/SystemExit propagate.
        return no_affiliation

    try:
        notes = xml_dict['dblpperson']['person']['note']
    except (KeyError, TypeError):
        # The person entry has no notes at all (or an unexpected shape).
        return no_affiliation

    # A single note is not wrapped in a list; normalise to a list.
    if not isinstance(notes, list):
        notes = [notes]

    # Skip notes that are not affiliation dictionaries with text, so that one
    # unrelated or attribute-less note does not discard the real affiliations.
    affiliation_list = [
        note['#text']
        for note in notes
        if isinstance(note, dict)
        and note.get('@type') == 'affiliation'
        and isinstance(note.get('#text'), str)
    ]
    if not affiliation_list:
        return no_affiliation
    return ','.join(affiliation_list) + ' <DBLP>'
0 commit comments