-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtss_from_gtf.py
59 lines (44 loc) · 1.26 KB
/
tss_from_gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Extract transcription start sites from an Ensembl GTF. Used to create the TSS reference file used in
~/ATAC_PIPELINE/atac-seq-pipeline/scripts/build_genome_data.sh
Usage:
python tss_from_gtf.py <gtf.gz> <tss_outfile.gz>
"""
import gzip
import re
import sys
in_gtf = sys.argv[1]
out_tss = sys.argv[2]
# make TSS annotation file from GTF
# chr1 69089 69090 ENSG00000186092.4 0 +
# chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5";
with gzip.open(in_gtf, 'rb') as gtf, gzip.open(out_tss, 'wb') as out:
for line in gtf:
if line.startswith('#'):
continue
l = line.strip().split('\t')
if not l[2] == 'gene':
continue
strand = l[6]
if strand == '+':
tss = l[3]
elif strand == '-':
tss = l[4]
else:
print 'Strand not recognized'
break
chrom = l[0]
# print (l[8])
gene_id = l[8].split(';')[0].split(' ')[1]
gene_id = re.sub("\"", "", gene_id)
# print (gene_id)
for i in(l[8].split(';')):
if "gene_biotype" in i :
gene_type=i.split(" ")[2]
gene_type = re.sub("\"", "", gene_type)
if not gene_type == 'protein_coding':
continue
# print (gene_type)
end = tss
start = int(tss) - 1
out.write('\t'.join([chrom, str(start), end, gene_id, '0', strand])+'\n')