Skip to content

Commit 3a8983a

Browse files
committed
Refactor combine_journal_lists scripts to improve quality
ref: JabRef#150
1 parent 10e6244 commit 3a8983a

File tree

2 files changed

+163
-51
lines changed

2 files changed

+163
-51
lines changed

scripts/combine_journal_lists_dotless.py

+76-21
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44
Python script for combining several journal abbreviation lists
55
and producing an alphabetically sorted list. If the same journal
6-
names are repeated, only the version found last is retained.
6+
names are repeated, only the version found first is retained.
77
88
This version of the script specifically combines the lists following the ISO4
99
standard WITHOUT dots after abbreviated words.
@@ -13,37 +13,92 @@
1313
Output: writes file 'journalList_dotless.csv'
1414
"""
1515

16+
import csv
17+
import json
18+
from pathlib import Path
19+
import re
1620
import sys
17-
import pandas as pd
1821

1922
# Define the list of CSV files
2023
# Source CSV lists, merged in this order; earlier files win on duplicates.
import_order = [
    "journals/journal_abbreviations_entrez.csv",
    "journals/journal_abbreviations_medicus.csv",
    "journals/journal_abbreviations_webofscience-dotless.csv",
]
2528

2629

27-
def main(output_filename):
28-
# Read and merge CSV files
29-
# dfs = [pd.read_csv(file, header=None) for file in import_order]
30-
dfs = []
31-
for file in import_order:
32-
df = pd.read_csv(file, header=None)
33-
dfs.append(df)
34-
print(f"{file}: {len(df)}")
35-
merged_df = pd.concat(dfs, ignore_index=True)
30+
def load_data(file_paths):
    """Load and combine journal (name, abbreviation) pairs from CSV files.

    Files are read in the given order; when the same journal name occurs
    more than once (exactly, or after normalization via normalize_name),
    only the first occurrence is kept, so earlier files take precedence.

    Args:
        file_paths: iterable of CSV paths whose rows are ``name,abbreviation``.

    Returns:
        dict mapping journal name to abbreviation, in insertion order.
    """
    journal_dict = {}
    normalized_keys = set()
    for path in file_paths:
        with open(path, mode="r", encoding="utf-8") as file:
            reader = csv.reader(file)
            for row in reader:
                # Skip blank or malformed rows instead of raising IndexError.
                if len(row) < 2:
                    continue
                name = row[0].strip()
                abbr = row[1].strip()

                # Discard entries where name or abbr is missing
                if not (name and abbr):
                    continue
                # Discard entries that are too long or too short
                if len(name) >= 80 or len(name) <= 3:
                    continue
                # Discard names that start with non-alphanumeric characters
                if not name[0].isalnum():
                    continue
                # Discard names that consist only of numbers
                if name.replace(" ", "").isnumeric():
                    continue
                # Discard names containing \
                if "\\" in name:
                    continue
                # Discard entries whose abbreviation does not start with the
                # same letter as the name, ignoring a leading article.
                # The previous replace()-based check left the leading space
                # behind ("The Lancet" -> " Lancet"), so every "The ..."
                # entry was wrongly discarded; strip the article properly
                # and compare case-insensitively instead.
                significant = name
                for article in ("The ", "A "):
                    if significant.startswith(article):
                        significant = significant[len(article):].lstrip()
                if not significant or abbr[0].lower() != significant[0].lower():
                    continue
                # Only keep the first occurrence
                if name in journal_dict:
                    continue
                # Generate normalizedKey, keeping only the first match
                normalized_key = normalize_name(name)
                if normalized_key in normalized_keys:
                    continue

                journal_dict[name] = abbr
                normalized_keys.add(normalized_key)  # Add to the set of used keys
    return journal_dict
4270

43-
# Save the result to the specified CSV file and ensure values are quoted
44-
sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)
4571

46-
print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
72+
def normalize_name(name):
    """
    Normalize a journal name for duplicate detection: drop the words
    "the"/"and" and the characters &-:, () (case-insensitively), lowercase.
    See src/utils/str.ts -> normalizeKey()
    """
    cleaned = re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE)
    return cleaned.lower()
78+
79+
80+
def save_to_json(data, output_file):
    """Write *data* to *output_file* as indented UTF-8 JSON (non-ASCII kept)."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(output_file, mode="w", encoding="utf-8") as fh:
        fh.write(serialized)
84+
85+
86+
def save_to_csv(data, output_file):
87+
"""Save the data to a CSV file."""
88+
with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
89+
writer = csv.writer(csv_file, quoting=1)
90+
for name, abbr in data.items():
91+
writer.writerow([name, abbr])
92+
93+
94+
def main(filename):
    """Combine the source journal lists and write the merged, sorted CSV.

    Args:
        filename: output CSV path, resolved against the current working
            directory (as are the input paths in ``import_order``).
    """
    # Path.cwd() is a classmethod; calling it on a throwaway Path() instance
    # was misleading, so call it on the class directly.
    base_path = Path.cwd()
    output_filename = base_path / filename
    import_paths = [base_path / file for file in import_order]

    journal_data = load_data(import_paths)
    # Sort by journal name for a stable, diff-friendly output file.
    sorted_journal_data = dict(sorted(journal_data.items()))
    save_to_csv(sorted_journal_data, output_filename)
47102

48103

49104
if __name__ == "__main__":

scripts/combine_journal_lists_dots.py

+87-30
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44
Python script for combining several journal abbreviation lists
55
and producing an alphabetically sorted list. If the same journal
6-
names are repeated, only the version found last is retained.
6+
names are repeated, only the version found first is retained.
77
88
This version of the script specifically combines the lists following the ISO4
99
standard WITH dots after abbreviated words.
@@ -13,45 +13,102 @@
1313
Output: writes file 'journalList_dots.csv' (or specified output file)
1414
"""
1515

16+
import csv
17+
import json
18+
from pathlib import Path
19+
import re
1620
import sys
17-
import pandas as pd
1821

19-
# Define the list of CSV files
2022
# Source CSV lists, merged in this order; earlier files win on duplicates.
import_order = [
    # Keep IEEE before ubc, because IEEE has its own style.
    "journals/journal_abbreviations_ieee.csv",
    "journals/journal_abbreviations_acs.csv",
    # Keep ubc before other jabref's, because ubc's data is more accurate.
    "journals/journal_abbreviations_ubc.csv",
    "journals/journal_abbreviations_ams.csv",
    "journals/journal_abbreviations_general.csv",
    "journals/journal_abbreviations_geology_physics.csv",
    "journals/journal_abbreviations_lifescience.csv",
    "journals/journal_abbreviations_mathematics.csv",
    "journals/journal_abbreviations_mechanical.csv",
    "journals/journal_abbreviations_meteorology.csv",
    "journals/journal_abbreviations_sociology.csv",
    "journals/journal_abbreviations_webofscience-dots.csv",
]
3338

3439

35-
def main(output_filename):
36-
# Read and merge CSV files
37-
# dfs = [pd.read_csv(file, header=None) for file in import_order]
38-
dfs = []
39-
for file in import_order:
40-
df = pd.read_csv(file, header=None)
41-
dfs.append(df)
42-
print(f"{file}: {len(df)}")
43-
merged_df = pd.concat(dfs, ignore_index=True)
40+
def load_data(file_paths):
    """Load and combine journal (name, abbreviation) pairs from CSV files.

    Files are read in the given order; when the same journal name occurs
    more than once (exactly, or after normalization via normalize_name),
    only the first occurrence is kept, so earlier files take precedence.

    Args:
        file_paths: iterable of CSV paths whose rows are ``name,abbreviation``.

    Returns:
        dict mapping journal name to abbreviation, in insertion order.
    """
    journal_dict = {}
    normalized_keys = set()
    for path in file_paths:
        with open(path, mode="r", encoding="utf-8") as file:
            reader = csv.reader(file)
            for row in reader:
                # Skip blank or malformed rows instead of raising IndexError.
                if len(row) < 2:
                    continue
                name = row[0].strip()
                abbr = row[1].strip()

                # Discard entries where name or abbr is missing
                if not (name and abbr):
                    continue
                # Discard entries that are too long or too short
                if len(name) >= 80 or len(name) <= 3:
                    continue
                # Discard names that start with non-alphanumeric characters
                if not name[0].isalnum():
                    continue
                # Discard names that consist only of numbers
                if name.replace(" ", "").isnumeric():
                    continue
                # Discard names containing \
                if "\\" in name:
                    continue
                # Discard entries whose abbreviation does not start with the
                # same letter as the name, ignoring a leading article.
                # The previous replace()-based check left the leading space
                # behind ("The Lancet" -> " Lancet"), so every "The ..."
                # entry was wrongly discarded; strip the article properly
                # and compare case-insensitively instead.
                significant = name
                for article in ("The ", "A "):
                    if significant.startswith(article):
                        significant = significant[len(article):].lstrip()
                if not significant or abbr[0].lower() != significant[0].lower():
                    continue
                # Only keep the first occurrence
                if name in journal_dict:
                    continue
                # Generate normalizedKey, keeping only the first match
                normalized_key = normalize_name(name)
                if normalized_key in normalized_keys:
                    continue

                journal_dict[name] = abbr
                normalized_keys.add(normalized_key)  # Add to the set of used keys
    return journal_dict
5080

51-
# Save the result to the specified CSV file and ensure values are quoted
52-
sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)
5381

54-
print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
82+
def normalize_name(name):
    """
    Normalize a journal name for duplicate detection: drop the words
    "the"/"and" and the characters &-:, () (case-insensitively), lowercase.
    See src/utils/str.ts -> normalizeKey()
    """
    cleaned = re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE)
    return cleaned.lower()
88+
89+
90+
def save_to_json(data, output_file):
    """Write *data* to *output_file* as indented UTF-8 JSON (non-ASCII kept)."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(output_file, mode="w", encoding="utf-8") as fh:
        fh.write(serialized)
94+
95+
96+
def save_to_csv(data, output_file):
97+
"""Save the data to a CSV file."""
98+
with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
99+
writer = csv.writer(csv_file, quoting=1)
100+
for name, abbr in data.items():
101+
writer.writerow([name, abbr])
102+
103+
104+
def main(filename):
    """Combine the source journal lists and write the merged, sorted CSV.

    Args:
        filename: output CSV path, resolved against the current working
            directory (as are the input paths in ``import_order``).
    """
    # Path.cwd() is a classmethod; calling it on a throwaway Path() instance
    # was misleading, so call it on the class directly.
    base_path = Path.cwd()
    output_filename = base_path / filename
    import_paths = [base_path / file for file in import_order]

    journal_data = load_data(import_paths)
    # Sort by journal name for a stable, diff-friendly output file.
    sorted_journal_data = dict(sorted(journal_data.items()))
    save_to_csv(sorted_journal_data, output_filename)
55112

56113

57114
if __name__ == "__main__":

0 commit comments

Comments
 (0)