hibagus
diff --git a/‎s00_function.py‎
Lines changed: 242 additions & 0 deletions b/‎s00_function.py‎
Lines changed: 242 additions & 0 deletions
diff --git a/‎s01_pcname_to_dblp_person_id.py‎
Lines changed: 81 additions & 0 deletions b/‎s01_pcname_to_dblp_person_id.py‎
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,242 @@
+# Project: ISCA 2021 Script
+# Filename: s00_function.py
+# Date: March 16, 2021
+# Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
+# Title: Python Function File for ISCA 2021 Script
+# Description:
+## This script contains callable functions used by other Python scripts.
+## You don't need to run this script since it will be called by other Python scripts.
+
+#%% Import some libraries that are needed
+import urllib.request
+import json
+import unidecode
+import os
+from fuzzywuzzy import fuzz
+import xmltodict
+import hashlib
+
+#%% Function to Retrieve DBLP Person ID
+## This function retrieves candidate DBLP Person IDs through the DBLP API, based on a person's name.
+## Since several people may share the same name, the function returns the
+## decoded JSON response, which contains all people that match the name.
+def request_author_key(firstname, lastname, retry_num=2, outputtype='json'):
+    """Look up the DBLP author records that match a person name.
+
+    The DBLP author-search API is queried with the first word of
+    ``firstname`` and the whole ``lastname``. Every response is cached in
+    ``.cache/person_id/<sha256 of the query>.json`` (relative to the current
+    working directory), and a cached response is returned without touching
+    the network.
+
+    Args:
+        firstname: Given name(s). Only the first whitespace-separated word is
+            used. If it is blank, the search uses the last name alone.
+        lastname: Family name, used as-is.
+        retry_num: Accepted for interface compatibility; currently unused.
+        outputtype: Value of the ``format`` URL parameter. The response is
+            always parsed and cached as JSON, so only ``'json'`` is useful.
+
+    Returns:
+        The decoded JSON response as a Python dictionary.
+
+    Raises:
+        urllib.error.URLError: If the request to DBLP fails.
+        json.JSONDecodeError: If the response (or the cached file) is not
+            valid JSON.
+    """
+    # Imported explicitly: ``urllib.parse`` is otherwise only reachable as a
+    # side effect of ``import urllib.request``.
+    from urllib.parse import quote
+
+    cache_dir = os.path.join('.cache', 'person_id')
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Define the DBLP API URL to retrieve the author
+    api_url = 'https://dblp.org/search/author/api?'
+    # Define the format, currently it is json
+    format_url = 'format=%s' % (outputtype)
+    # Construct the query, please refer to https://dblp.org/faq/1474589.html
+    name_words = firstname.split()
+    req_lastname = quote(lastname)
+    if name_words:
+        # If firstname consists of multiple words, then use only the first word
+        req_query = 'q=$%s$+$%s$' % (quote(name_words[0]), req_lastname)
+    else:
+        # Blank first name: search by last name instead of failing on [0]
+        req_query = 'q=$%s$' % (req_lastname)
+    req_url = '%s%s&%s' % (api_url, req_query, format_url)
+    # The cache key depends on the query only, so existing cache files stay valid
+    req_hash = hashlib.sha256(req_query.encode('utf-8')).hexdigest()
+    cache_path = os.path.join(cache_dir, req_hash + '.json')
+
+    # Check if the request is already cached
+    if os.path.isfile(cache_path):
+        with open(cache_path, 'r', encoding='utf-8') as fp:
+            return json.load(fp)
+
+    # Fetch author data using the DBLP API; the response is closed on exit
+    with urllib.request.urlopen(req_url) as resource:
+        raw_str = resource.read()
+    # Convert JSON to Python Dictionary
+    json_dict = json.loads(raw_str.decode('utf-8'))
+    # Cache the decoded response for the next lookup of the same name
+    with open(cache_path, 'w', encoding='utf-8') as fp:
+        json.dump(json_dict, fp)
+    return json_dict
+
+#%% Function to merge affiliation
+# DBLP may return multiple affiliations. This function will merge all affiliation into a list of string
+def merge_affiliation(pc_json, entrynum):
+    """Collect the affiliation strings of one DBLP search hit.
+
+    Args:
+        pc_json: Decoded DBLP author-search response.
+        entrynum: Index of the hit inside ``pc_json['result']['hits']['hit']``.
+
+    Returns:
+        A list with the ``text`` of every note whose ``@type`` is
+        ``'affiliation'``; empty when the hit carries no notes.
+
+    Note:
+        When the hit has a single note stored as a dictionary, it is wrapped
+        into a one-element list inside ``pc_json`` itself.
+    """
+    hit_info = pc_json['result']['hits']['hit'][entrynum]['info']
+    if 'notes' not in hit_info or 'note' not in hit_info['notes']:
+        return []
+
+    notes_holder = hit_info['notes']
+    # A single note arrives as a dictionary; normalise it to a list in place
+    if isinstance(notes_holder['note'], dict):
+        notes_holder['note'] = [notes_holder['note']]
+
+    return [
+        single_note['text']
+        for single_note in notes_holder['note']
+        if single_note['@type'] == 'affiliation'
+    ]
+
+#%% Function to Convert JSON returned by DBLP to Python Dictionary
+# This function converts each possible person with a given name returned by DBLP to Python Dictionary
+def convert_to_dict(pc_member, pc_json):
+    """Turn a DBLP author-search response into one record per candidate.
+
+    Args:
+        pc_member: Mapping with the keys ``full``, ``first``, ``last``,
+            ``affiliation`` and ``email`` describing the PC member.
+        pc_json: Decoded DBLP author-search response for that member.
+
+    Returns:
+        A list of dictionaries. When DBLP reports zero hits the list holds a
+        single record with ``isError`` set to 1 and empty DBLP fields.
+        Otherwise it holds one record per hit; ``isUnique`` is 1 only when
+        there is exactly one hit. Both confidence fields start at 0.
+    """
+    hit_total = int(pc_json['result']['hits']['@sent'])
+
+    def build_record(unique_flag, error_flag, position, dblp_name, dblp_url, dblp_affiliations):
+        # One output row; the key order defines the column order downstream
+        return {
+            "full_name"        : pc_member['full'],
+            "first_name"       : pc_member['first'],
+            "last_name"        : pc_member['last'],
+            "affiliation"      : pc_member['affiliation'],
+            "email"            : pc_member['email'],
+            "isUnique"         : unique_flag,
+            "isError"          : error_flag,
+            "entrynum"         : position,
+            "name_confidence"  : 0,
+            "affl_confidence"  : 0,
+            "name_dblp"        : dblp_name,
+            "url_dblp"         : dblp_url,
+            "affiliation_dblp" : dblp_affiliations
+        }
+
+    # Nothing found on DBLP: emit a single placeholder row flagged as an error
+    if hit_total == 0:
+        return [build_record(0, 1, 0, '', '', [])]
+
+    unique_flag = 1 if hit_total == 1 else 0
+    records = []
+    # iterate over each entry in the JSON response
+    for position in range(hit_total):
+        # merge multiple affiliations (if any) into a list of strings
+        dblp_affiliations = merge_affiliation(pc_json, position)
+        hit_info = pc_json['result']['hits']['hit'][position]['info']
+        records.append(
+            build_record(unique_flag, 0, position, hit_info['author'], hit_info['url'], dblp_affiliations)
+        )
+    return records
+
+#%% Function to Filter the returned list of people based on the name.
+# This filter is not perfect. It uses fuzzy match to the string.
+def filter_name(pc_member_dblp_list, confidence_threshold=90):
+    """Keep the candidates whose DBLP name is close to the PC member's name.
+
+    Args:
+        pc_member_dblp_list: Candidate records with ``full_name`` and
+            ``name_dblp`` entries.
+        confidence_threshold: Minimum ``fuzz.ratio`` score (compared on
+            lower-cased names) a candidate needs to be kept.
+
+    Returns:
+        The input list itself when it has at most one entry. Otherwise a new
+        list with the candidates that reach the threshold; each kept
+        candidate gets its score stored in ``name_confidence``.
+    """
+    # only filter if multiple entries are found
+    if len(pc_member_dblp_list) <= 1:
+        return pc_member_dblp_list
+
+    survivors = []
+    for candidate in pc_member_dblp_list:
+        score = fuzz.ratio(candidate['full_name'].lower(), candidate['name_dblp'].lower())
+        if score < confidence_threshold:
+            continue
+        candidate['name_confidence'] = score
+        survivors.append(candidate)
+    return survivors
+
+#%% Function to Filter the returned list of people based on the affiliation.
+# This filter is not perfect. It uses fuzzy match to the string.
+# Use this filter after filtering based on the name
+def filter_affiliation(pc_member_dblp_list, confidence_threshold=80):
+    """Keep the candidates whose DBLP affiliation matches the PC member's.
+
+    Args:
+        pc_member_dblp_list: Candidate records with ``affiliation`` (the PC
+            member's own affiliation) and ``affiliation_dblp`` (list of
+            affiliation strings found on DBLP).
+        confidence_threshold: Minimum ``fuzz.partial_ratio`` score (compared
+            on lower-cased strings) for a candidate to be kept.
+
+    Returns:
+        The input list itself when it has at most one entry. Otherwise a new
+        list holding, each at most once:
+          * candidates whose best affiliation score reaches the threshold
+            (the score is stored in ``affl_confidence``), and
+          * candidates without any DBLP affiliation, because the filter is
+            inconclusive for them.
+    """
+    # only filter if multiple entries are found
+    if len(pc_member_dblp_list) <= 1:
+        return pc_member_dblp_list
+
+    pc_member_dblp_list_filtered = []
+    for pc_member_dict in pc_member_dblp_list:
+        dblp_affiliations = pc_member_dict['affiliation_dblp']
+        # inconclusive filtering since affiliation information is not available on DBLP
+        if not dblp_affiliations:
+            pc_member_dblp_list_filtered.append(pc_member_dict)
+            continue
+
+        # A blank affiliation cell is not a string (e.g. NaN or None); treat
+        # it as "no match" instead of failing on .lower()
+        own_affiliation = pc_member_dict['affiliation']
+        own_affiliation = own_affiliation.lower() if isinstance(own_affiliation, str) else ''
+
+        max_confidence = 0
+        if own_affiliation:
+            for affiliation_dblp in dblp_affiliations:
+                confidence_level = fuzz.partial_ratio(own_affiliation, affiliation_dblp.lower())
+                max_confidence = max(max_confidence, confidence_level)
+
+        if max_confidence >= confidence_threshold:
+            pc_member_dict['affl_confidence'] = max_confidence
+            pc_member_dblp_list_filtered.append(pc_member_dict)
+    return pc_member_dblp_list_filtered
+
+
+#%% Function to retrieve PC member's all publications from DBLP
+def request_publication_list(dblp_link, retry_num=2, outputtype='xml'):
+    """Fetch the XML page of a DBLP person and return it as a dictionary.
+
+    ``dblp_link`` with ``.xml`` appended is downloaded and parsed with
+    ``xmltodict``. The parsed document is cached in
+    ``.cache/pub_id/<sha256 of the URL>`` (relative to the current working
+    directory), and a cached document is returned without touching the
+    network.
+
+    Args:
+        dblp_link: URL of the DBLP person page, without the ``.xml`` suffix.
+        retry_num: Accepted for interface compatibility; currently unused.
+        outputtype: Accepted for interface compatibility; currently unused.
+
+    Returns:
+        The XML document converted to plain Python dictionaries.
+
+    Raises:
+        urllib.error.URLError: If the download fails.
+    """
+    cache_dir = os.path.join('.cache', 'pub_id')
+    os.makedirs(cache_dir, exist_ok=True)
+    # Construct the query, please refer to https://dblp.org/faq/1474589.html
+    req_url = '%s.xml' % (dblp_link)
+    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
+    cache_path = os.path.join(cache_dir, req_hash)
+
+    # Serve from the cache when possible. The cache is always read and written
+    # as UTF-8 so non-ASCII names do not depend on the platform's locale.
+    if os.path.isfile(cache_path):
+        with open(cache_path, 'r', encoding='utf-8') as fp:
+            return xmltodict.parse(fp.read(), dict_constructor=dict)
+
+    # Fetch the person page from DBLP; the response is closed on exit
+    with urllib.request.urlopen(req_url) as resource:
+        decoded_string = resource.read().decode('utf-8')
+    # Convert XML to Python Dictionary
+    xml_dict = xmltodict.parse(decoded_string, dict_constructor=dict)
+    with open(cache_path, 'w', encoding='utf-8') as fp:
+        fp.write(xmltodict.unparse(xml_dict))
+    return xml_dict
+
+#%% Function to retrieve a PC member's affiliation(s) from DBLP
+def request_affiliation(dblp_link, retry_num=2, outputtype='xml'):
+    """Return the affiliations listed on a DBLP person page as one string.
+
+    ``dblp_link`` with ``.xml`` appended is downloaded and parsed with
+    ``xmltodict``. The parsed document is cached in
+    ``.cache/pub_id/<sha256 of the URL>`` (relative to the current working
+    directory), and a cached document is used without touching the network.
+
+    The lookup is best effort: any failure to download, parse or find the
+    notes yields the placeholder instead of an exception.
+
+    Args:
+        dblp_link: URL of the DBLP person page, without the ``.xml`` suffix.
+        retry_num: Accepted for interface compatibility; currently unused.
+        outputtype: Accepted for interface compatibility; currently unused.
+
+    Returns:
+        The affiliation texts joined by ``','`` and followed by ``' <DBLP>'``,
+        or ``'NONE <DBLP>'`` when no affiliation could be determined.
+    """
+    cache_dir = os.path.join('.cache', 'pub_id')
+    os.makedirs(cache_dir, exist_ok=True)
+    # Construct the query, please refer to https://dblp.org/faq/1474589.html
+    req_url = '%s.xml' % (dblp_link)
+    req_hash = hashlib.sha256(req_url.encode('utf-8')).hexdigest()
+    cache_path = os.path.join(cache_dir, req_hash)
+
+    try:
+        if os.path.isfile(cache_path):
+            with open(cache_path, 'r', encoding='utf-8') as fp:
+                xml_dict = xmltodict.parse(fp.read(), dict_constructor=dict)
+        else:
+            # Fetch the person page from DBLP; the response is closed on exit
+            with urllib.request.urlopen(req_url) as resource:
+                decoded_string = resource.read().decode('utf-8')
+            # Convert XML to Python Dictionary
+            xml_dict = xmltodict.parse(decoded_string, dict_constructor=dict)
+            with open(cache_path, 'w', encoding='utf-8') as fp:
+                fp.write(xmltodict.unparse(xml_dict))
+        notes = xml_dict['dblpperson']['person']['note']
+    except Exception:
+        # Deliberate best effort. ``Exception`` (not a bare except) so that
+        # Ctrl-C and interpreter exit still propagate.
+        return 'NONE <DBLP>'
+
+    # A single note is not wrapped in a list by the parser
+    if not isinstance(notes, list):
+        notes = [notes]
+    # Skip notes that are not attribute dictionaries or carry no text, so one
+    # odd note does not discard the affiliations found in the others
+    affiliation_list = [
+        note['#text']
+        for note in notes
+        if isinstance(note, dict)
+        and note.get('@type') == 'affiliation'
+        and isinstance(note.get('#text'), str)
+    ]
+    if not affiliation_list:
+        return 'NONE <DBLP>'
+    return ','.join(affiliation_list) + ' <DBLP>'
@@ -0,0 +1,81 @@
+# Project: ISCA 2021 Script
+# Filename: s01_pcname_to_dblp_person_id.py
+# Date: March 16, 2021
+# Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
+# Title: PC Member Name to DBLP Person ID Lookup Script
+# Description:
+## This script looks up the DBLP Person ID that matches a given name
+## (first name and last name). The script uses fuzzy matching to compare
+## the given person name with the DBLP person name. Because of homonyms
+## (i.e., multiple persons with the same name), the script may output
+## several persons for one name, and a bit of manual work is required to
+## choose the correct person.
+
+#%% Import some libraries that are needed
+import pandas as pd
+import numpy as np
+import tqdm
+import os
+from fuzzywuzzy import fuzz
+from s00_function import request_author_key
+from s00_function import merge_affiliation
+from s00_function import convert_to_dict
+from s00_function import filter_name
+from s00_function import filter_affiliation
+
+#%% Define the input and output CSV filename
+# Input CSV filename
+## This is CSV file obtained from HotCRP by going to Users and select 'Program Committee' in the
+## filter. At the bottom of page, click Select All then Download 'PC info' and click 'Go'
+## 
+pc_info_hotcrp_filename = 'sample-data/input/isca2021-pcinfo.csv'
+
+# Output CSV filename
+pc_to_dblp_filename = 'sample-data/output/isca2021-pc-to-dblp.csv'
+
+# %% Load Input CSV to Pandas Dataframe
+# Load the PC Info from HotCRP
+pc_info_hotcrp_df = pd.read_csv(pc_info_hotcrp_filename)
+
+# %% Sanitize
+# Construct and Sanitize First name and Last name
+# A blank first/last cell is read as NaN; treat it as an empty string so the
+# string operations below do not fail on a float.
+full_names = (pc_info_hotcrp_df['first'].fillna('') + " " + pc_info_hotcrp_df['last'].fillna('')).str.strip()
+pc_members_name = pd.DataFrame({'full': full_names})
+# The last whitespace-separated word is the last name and everything before
+# it is the first name. Splitting on words (instead of removing the text
+# " <last>" from the full name) cannot cut into another word that merely
+# starts with the last name.
+name_words = pc_members_name['full'].str.split()
+pc_members_name['last'] = name_words.str[-1].fillna('')
+given_names = name_words.str[:-1].str.join(' ')
+# A one-word name has no separate given name: keep the whole name as first name
+pc_members_name['first'] = given_names.where(given_names != '', pc_members_name['full'])
+pc_members_name['affiliation'] = pc_info_hotcrp_df['affiliation']
+pc_members_name['email'] = pc_info_hotcrp_df['email']
+
+# %% Request DBLP Person ID for each PC Members
+pc_members_list = []
+
+# For every PC member: query DBLP, turn the hits into candidate rows, then
+# narrow the candidates down by name and, where possible, by affiliation.
+for _, pc_member in tqdm.tqdm(pc_members_name.iterrows(), total=pc_members_name.shape[0]):
+    pc_json = request_author_key(pc_member["first"], pc_member["last"])
+    pc_member_dblp_list = convert_to_dict(pc_member, pc_json)
+
+    # Filtering by name: keep the candidate(s) with the highest name
+    # confidence. Scoring every candidate once with threshold 0 and keeping
+    # the top score selects the same candidates as lowering the threshold
+    # from 100 one step at a time until something passes, without rescoring
+    # the whole list at every step.
+    # This is not 100% perfect.
+    pc_member_dblp_list = filter_name(pc_member_dblp_list, 0)
+    if len(pc_member_dblp_list) > 1:
+        best_confidence = max(candidate['name_confidence'] for candidate in pc_member_dblp_list)
+        pc_member_dblp_list = [
+            candidate for candidate in pc_member_dblp_list
+            if candidate['name_confidence'] == best_confidence
+        ]
+
+    # Filtering by Affiliation; fall back to the name-filtered candidates
+    # when no candidate survives the affiliation filter.
+    pc_member_dblp_list_filtered = filter_affiliation(pc_member_dblp_list)
+    if len(pc_member_dblp_list_filtered) > 0:
+        pc_member_dblp_list = pc_member_dblp_list_filtered
+
+    # Add to the Final List (in place, instead of rebuilding the list each time)
+    pc_members_list.extend(pc_member_dblp_list)
+
+# %% Write Output to CSV
+# This CSV needs to be inspected manually.
+## NOTE: To avoid overwriting, rename the final CSV file to something else.
+pc_members_df = pd.DataFrame(pc_members_list)
+# Create the output folder first, so the result of a long lookup run is not
+# lost because the target directory does not exist yet.
+output_dir = os.path.dirname(pc_to_dblp_filename)
+if output_dir:
+    os.makedirs(output_dir, exist_ok=True)
+pc_members_df.to_csv(pc_to_dblp_filename, index=False)
+
+# %%