|
3 | 3 | """
|
4 | 4 | Python script for combining several journal abbreviation lists
|
5 | 5 | and producing an alphabetically sorted list. If the same journal
|
6 |
| -names are repeated, only the version found last is retained. |
| 6 | +names are repeated, only the version found first is retained. |
7 | 7 |
|
8 | 8 | This version of the script specifically combines the lists following the ISO4
|
9 | 9 | standard WITH dots after abbreviated words.
|
|
13 | 13 | Output: writes file 'journalList_dots.csv' (or specified output file)
|
14 | 14 | """
|
15 | 15 |
|
| 16 | +import csv |
| 17 | +import json |
| 18 | +from pathlib import Path |
| 19 | +import re |
16 | 20 | import sys
|
17 |
| -import pandas as pd |
18 | 21 |
|
19 |
| -# Define the list of CSV files |
# Source CSV files in priority order: files are read top to bottom, and for a
# duplicated journal name (exact or normalized) only the FIRST occurrence is
# kept, so earlier files win over later ones.
import_order = [
    # Keep IEEE before ubc, because IEEE has its own style.
    "journals/journal_abbreviations_ieee.csv",
    "journals/journal_abbreviations_acs.csv",
    # Keep ubc before other jabref's, because ubc's data is more accurate.
    "journals/journal_abbreviations_ubc.csv",
    "journals/journal_abbreviations_ams.csv",
    "journals/journal_abbreviations_general.csv",
    "journals/journal_abbreviations_geology_physics.csv",
    "journals/journal_abbreviations_lifescience.csv",
    "journals/journal_abbreviations_mathematics.csv",
    "journals/journal_abbreviations_mechanical.csv",
    "journals/journal_abbreviations_meteorology.csv",
    "journals/journal_abbreviations_sociology.csv",
    "journals/journal_abbreviations_webofscience-dots.csv",
]
|
33 | 38 |
|
34 | 39 |
|
35 |
| -def main(output_filename): |
36 |
| - # Read and merge CSV files |
37 |
| - # dfs = [pd.read_csv(file, header=None) for file in import_order] |
38 |
| - dfs = [] |
39 |
| - for file in import_order: |
40 |
| - df = pd.read_csv(file, header=None) |
41 |
| - dfs.append(df) |
42 |
| - print(f"{file}: {len(df)}") |
43 |
| - merged_df = pd.concat(dfs, ignore_index=True) |
def load_data(file_paths):
    """Load and combine journal name/abbreviation pairs from CSV files.

    Files are processed in the given order; when the same journal name (or
    the same normalized key, see normalize_name) appears more than once,
    only the first occurrence is retained.

    Args:
        file_paths: iterable of paths to CSV files whose rows are
            (full journal name, abbreviation).

    Returns:
        dict mapping journal name -> abbreviation, in insertion order.
    """
    journal_dict = {}
    normalized_keys = set()
    for path in file_paths:
        with open(path, mode="r", encoding="utf-8") as file:
            reader = csv.reader(file)
            for row in reader:
                # Skip blank lines (csv.reader yields []) and malformed rows
                # with fewer than two columns; indexing them would raise
                # IndexError.
                if len(row) < 2:
                    continue
                name = row[0].strip()
                abbr = row[1].strip()

                # Discard entries where name or abbr is missing
                if not (name and abbr):
                    continue
                # Discard entries that are too long or too short
                if len(name) >= 80 or len(name) <= 3:
                    continue
                # Discard names that start with non-alphanumeric characters
                if not name[0].isalnum():
                    continue
                # Discard names that consist only of numbers
                if name.replace(" ", "").isnumeric():
                    continue
                # Discard names containing \ (likely LaTeX markup)
                if "\\" in name:
                    continue
                # Discard entries where the first letters of name and abbr do
                # not match, after dropping leading articles. Guard against
                # the article-stripping producing an empty string, which
                # would otherwise raise IndexError.
                stripped = name.replace("The", "").replace("A ", "")
                if not stripped or abbr[0] != stripped[0]:
                    continue
                # Only keep the first occurrence of an exact name
                if name in journal_dict:
                    continue
                # Generate normalizedKey, keeping only the first match so
                # near-duplicate spellings collapse to a single entry
                normalized_key = normalize_name(name)
                if normalized_key in normalized_keys:
                    continue

                journal_dict[name] = abbr
                normalized_keys.add(normalized_key)  # Add to the set of used keys
    return journal_dict
50 | 80 |
|
51 |
| - # Save the result to the specified CSV file and ensure values are quoted |
52 |
| - sorted_df.to_csv(output_filename, index=False, header=False, quoting=1) |
53 | 81 |
|
54 |
| - print(f"Write {output_filename}, Combined key count: {len(merged_df)}") |
def normalize_name(name):
    """
    Normalize a journal name for duplicate detection.

    Drops the standalone words "the"/"and" (case-insensitive) plus the
    characters &-:, () and spaces, then lowercases the remainder.
    See src/utils/str.ts -> normalizeKey()
    """
    pattern = r"\b(the|and)\b|[&\-:, ()]"
    cleaned = re.sub(pattern, "", name, flags=re.IGNORECASE)
    return cleaned.lower()
| 88 | + |
| 89 | + |
def save_to_json(data, output_file):
    """Serialize *data* to *output_file* as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written verbatim (ensure_ascii=False).
    """
    with open(output_file, mode="w", encoding="utf-8") as handle:
        payload = json.dumps(data, indent=2, ensure_ascii=False)
        handle.write(payload)
| 94 | + |
| 95 | + |
def save_to_csv(data, output_file):
    """Write *data* (name -> abbreviation) to *output_file* as CSV.

    Every field is wrapped in double quotes (QUOTE_ALL, the same as the
    numeric quoting=1).
    """
    with open(output_file, mode="w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle, quoting=csv.QUOTE_ALL)
        writer.writerows(data.items())
| 102 | + |
| 103 | + |
def main(filename):
    """Combine all source lists and write them, sorted, to *filename*.

    *filename* is resolved relative to the current working directory, as
    are the source paths in import_order.
    """
    cwd = Path.cwd()
    sources = [cwd / rel for rel in import_order]
    combined = load_data(sources)
    # Sort alphabetically by journal name before writing.
    ordered = dict(sorted(combined.items()))
    save_to_csv(ordered, cwd / filename)
55 | 112 |
|
56 | 113 |
|
57 | 114 | if __name__ == "__main__":
|
|
0 commit comments