-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_biom_file_from_url.py
75 lines (63 loc) · 2.83 KB
/
prepare_biom_file_from_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
import sys
import re
import biom
import requests
import os
import pandas as pd
# Path to the Excel workbook that lists the file URLs to process.
# Taken from the first command-line argument; raises IndexError when the script is run without one.
excel_file = sys.argv[1]
# Function to download the file from url and save it locally
def download(url: str, dest_folder: str, timeout: float = 60.0):
    """Download *url* into *dest_folder* and return the local file path.

    The local file name is the last ``/``-separated component of the URL with
    spaces replaced by underscores.

    Args:
        url: Address of the file to fetch.
        dest_folder: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server to connect or to send data
            before ``requests`` gives up. Without a timeout a stalled server
            would block the script indefinitely.

    Returns:
        The path the file was (or would have been) written to. When the server
        answers with an error status the failure is only printed and the path
        is still returned, so the file may not exist (or may be a stale copy
        from an earlier run).

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If the folder or file cannot be created or written.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(dest_folder, exist_ok=True)

    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
    file_path = os.path.join(dest_folder, filename)

    # With stream=True the connection stays open until the response is closed;
    # the context manager releases it on every path, including exceptions.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.ok:
            print("saving to", os.path.abspath(file_path))
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
                # Force the data to disk once, after the last chunk, instead of
                # paying for an fsync on every 8 KiB chunk.
                f.flush()
                os.fsync(f.fileno())
        else:  # HTTP status code 4XX/5XX
            print("Download failed: status code {}\n{}".format(r.status_code, r.text))
    return file_path
# Callback used with re.sub to clean the column names
# It turns <sample_name>_Abundance into Abundance so the column can be referenced by one generic name when loading
def convert_group(match_obj):
    """Return capture group 1 of *match_obj* with its "<anything>_Abundance" part collapsed.

    Everything up to and including "_Abundance" inside the captured text is
    replaced by the bare word "Abundance"; text without "_Abundance" is
    returned unchanged. Returns None when group 1 did not take part in the
    match.
    """
    captured = match_obj.group(1)
    if captured is None:
        return None
    return re.sub(".*_Abundance", "Abundance", captured)
######### Python script #########
# 1. Downloads each file by URL and saves it locally
# 2. Converts BIOM format to TSV (tab-separated values) using the biom-format Python package
# 3. Modifies the TSV file to make it ready for neo4j loading
#################################

# Folder for the downloaded files and folder for the converted TSV files.
# Built with os.path.join instead of the literal ".\data", which contained the
# invalid escape sequence "\d" and only formed a real path on Windows.
_DOWNLOAD_DIR = os.path.join(".", "data")
_IMPORT_DIR = os.path.join(".", "import_data")


def _rewrite_tsv_for_neo4j(tsv_filename):
    """Rewrite the TSV file in place so that neo4j can load it.

    The first line of the file is dropped. The second line (the column header)
    loses its first character (the "#" that neo4j doesn't like), "OTU ID"
    becomes "OTU_ID", every "<sample_name>_Abundance" becomes "Abundance" and
    "-RPKs" becomes "_RPKs". In every remaining line the character in front of
    each "s__" is replaced by "|".

    Raises:
        ValueError: If the file has fewer than two lines, i.e. no header line.
    """
    with open(tsv_filename, 'r') as fin:
        lines = fin.readlines()
    if len(lines) < 2:
        raise ValueError("no header line found in {}".format(tsv_filename))

    header = lines[1][1:].replace("OTU ID", "OTU_ID")
    header = re.sub(r"(\w*\W)", convert_group, header)
    header = header.replace("-RPKs", "_RPKs")

    # NOTE(review): the "." in '.s__' is unescaped, so it matches ANY character
    # in front of "s__", not only a literal dot - confirm whether r'\.s__' was
    # intended before tightening it.
    new_lines = [re.sub('.s__', '|s__', s) for s in [header] + lines[2:]]

    with open(tsv_filename, 'w') as fout:
        fout.writelines(new_lines)


# Read file URLs from Excel (the context manager closes the workbook handle)
with pd.ExcelFile(excel_file) as workbook:
    df = workbook.parse('Sheet1')

# The converted files are written here, so make sure the folder exists.
os.makedirs(_IMPORT_DIR, exist_ok=True)

# Loop through urls; blank cells (NaN) are not URLs and are skipped
for url in df['urls'].dropna().values:
    try:
        print(url)
        # TODO: check if the file was already processed
        # Download file
        file_path = download(url, dest_folder=_DOWNLOAD_DIR)
        # Convert biom into tsv format
        biom_table = biom.load_table(file_path)
        # Same file name, placed in the import folder. Building the path from
        # the base name avoids str.replace("data", ...) rewriting a "data"
        # substring that happens to occur inside the file name itself.
        tsv_filename = os.path.join(_IMPORT_DIR, os.path.basename(file_path) + ".tsv")
        with open(tsv_filename, "w") as f:
            biom_table.to_tsv(header_key="taxonomy", header_value="taxonomy", direct_io=f)
        # Modify tsv file to prepare it for neo4j loading
        _rewrite_tsv_for_neo4j(tsv_filename)
    except Exception as exc:
        # "except Exception" keeps the per-file best effort but lets Ctrl-C and
        # SystemExit stop the script; format() also works for non-string urls,
        # where "..." + url would raise inside the handler.
        print("Something went wrong with the file: {} ({}: {})".format(
            url, type(exc).__name__, exc))