Skip to content

Commit 9d01fc7

Browse files
committed
Added CSV sorting (closes emeryberger#7215).
1 parent 73d14cb commit 9d01fc7

File tree

3 files changed

+79
-0
lines changed

3 files changed

+79
-0
lines changed

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ faculty-affiliations.csv homepages.csv scholar.csv csrankings.csv: csrankings-*.
5757
# Clean the csrankings data, then sort the CSV files named in
# sort_directives.json. This target names a command, not a file, so it is
# declared phony to keep a stray file called "clean-csrankings" from
# suppressing it.
.PHONY: clean-csrankings
clean-csrankings:
	@echo "Cleaning."
	@$(PYTHON) util/clean-csrankings.py
	@$(PYTHON) util/sort-csv-files.py
	@echo "Done."
6162

6263
home-pages: faculty-affiliations.csv homepages.csv

sort_directives.json

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[
2+
{
3+
"files": ["country-info.csv"],
4+
"sort_columns": ["institution", "region", "countryabbrv" ]
5+
},
6+
{
7+
"files": [ "institutions.csv" ],
8+
"sort_columns": ["institution"]
9+
10+
},
11+
{
12+
"files" : ["csrankings-[a-z].csv"],
13+
"sort_columns": ["name"]
14+
},
15+
{
16+
"files": ["acm-fellows.csv", "turing.csv"],
17+
"sort_columns": ["year", "name"]
18+
},
19+
{
20+
"files" : ["old/industry.csv", "old/rip.csv", "old/emeritus.csv"],
21+
"sort_columns": ["name"]
22+
}
23+
]

util/sort-csv-files.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import pandas as pd
2+
import json
3+
import os
4+
from glob import glob
5+
6+
def get_line_ending(file_path):
    """Report the line terminator used by the first line of a file.

    The file is opened in binary mode and only its first line (as split by
    ``readline``) is inspected.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        '\\r\\n' if the first line contains CRLF, '\\r' if it contains a bare
        carriage return, and '\\n' otherwise (including for an empty file).
    """
    with open(file_path, 'rb') as stream:
        head = stream.readline()

    # Order matters: CRLF must be tested before the bare CR it contains.
    for marker, ending in ((b'\r\n', '\r\n'), (b'\r', '\r')):
        if marker in head:
            return ending
    return '\n'
15+
16+
def sort_csv_files(directives_file):
    """Sort CSV files in place as described by a JSON directives file.

    The directives file holds a JSON list of objects, each with:
      * "files": a list of glob patterns (resolved by ``glob`` against the
        current working directory);
      * "sort_columns": column names, or integer column positions, to sort
        by, in priority order;
      * "sort_orders" (optional): one boolean per sort column, True meaning
        ascending. Defaults to ascending for every column.

    Every matching file is rewritten with its rows sorted. The line ending
    reported by ``get_line_ending`` for the original file is used for the
    rewritten file.

    Args:
        directives_file: Path to the JSON directives file.

    Raises:
        KeyError: If a directive lacks "files" or "sort_columns".
        Exceptions from ``open``, ``json.load`` and pandas propagate
        unchanged (missing files, malformed JSON/CSV, unknown columns).
    """
    with open(directives_file, 'r') as f:
        directives = json.load(f)

    for directive in directives:
        sort_columns = directive['sort_columns']
        sort_orders = directive.get('sort_orders', [True] * len(sort_columns))

        for file_pattern in directive['files']:
            for file_path in glob(file_pattern):
                line_ending = get_line_ending(file_path)

                # Read every cell as text and disable NA detection so that
                # sorting never rewrites cell contents (e.g. "NA" -> blank,
                # or 2019 -> 2019.0 in a column that has empty cells).
                df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

                # Convert column indexes to names if specified as numbers.
                columns = df.columns
                sort_columns_actual = [
                    columns[col] if isinstance(col, int) else col
                    for col in sort_columns
                ]

                # mergesort is stable, so rows with equal keys keep their
                # existing relative order and re-running is a no-op.
                sorted_df = df.sort_values(
                    by=sort_columns_actual,
                    ascending=sort_orders,
                    kind='mergesort',
                    key=_numeric_aware_sort_key,
                )

                # pandas terminates rows with os.linesep, so strip a trailing
                # '\r' from each row before applying the file's own ending;
                # otherwise Windows output would contain '\r\r\n'.
                csv_text = sorted_df.to_csv(index=False)
                rows = [row.rstrip('\r') for row in csv_text.split('\n')]

                # Write the complete result to a sibling temp file and swap
                # it in with os.replace, so an interrupted run never leaves
                # a truncated CSV. UTF-8 matches pandas' read/write default.
                temp_file_path = file_path + '.tmp'
                try:
                    with open(temp_file_path, 'w', newline='',
                              encoding='utf-8') as temp_file:
                        # csv_text ends with a terminator, so the last item
                        # of rows is '' and join() emits the final ending.
                        temp_file.write(line_ending.join(rows))
                    os.replace(temp_file_path, file_path)
                except BaseException:
                    # Do not leave a stray .tmp file behind on failure.
                    if os.path.exists(temp_file_path):
                        os.remove(temp_file_path)
                    raise


def _numeric_aware_sort_key(column):
    """Return a numeric sort key for a text column when all of it parses.

    Cells are read as strings, so a column such as "year" would otherwise be
    ordered lexicographically. If pandas cannot parse the whole column as
    numbers, the column is returned unchanged and sorts as text.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column
52+
53+
# Script entry point. With no argument the directives are read from
# 'sort_directives.json' in the current working directory; an optional first
# command-line argument overrides that path.
if __name__ == "__main__":
    import sys

    sort_csv_files(sys.argv[1] if len(sys.argv) > 1 else 'sort_directives.json')

0 commit comments

Comments
 (0)