forked from tomdeman-bio/Sequence-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalc_N50_GC_genomesize.pl
140 lines (119 loc) · 2.73 KB
/
calc_N50_GC_genomesize.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#! /usr/bin/perl
#Written by Tom de Man
#Calculates basic contig stats like G+C content, N50, and number of contigs
use strict;
use warnings;
use List::Util qw(sum min max);
use Getopt::Long;
use File::Basename;
my $As = 0;
my $Ts = 0;
my $Gs = 0;
my $Cs = 0;
my $Ns = 0;
my $file;
my $helpAsked;
my $outFile = "";
GetOptions(
"i=s" => \$file,
"h|help" => \$helpAsked,
"o|outputFile=s" => \$outFile,
);
if(defined($helpAsked)) {
Usage();
exit;
}
if(!defined($file)) {
Error("No input files are provided");
}
my ($fileName, $filePath) = fileparse($file);
$outFile = $file . "_n50_GC_genomesize_stats" if($outFile eq "");
open(IN, "<$file") or die "Cannot open file: $file\n";
open(OUT, ">$outFile") or die "Cannot open file: $outFile\n";
my @len = ();
my $prevFastaSeqId = "";
my $fastaSeqId = "";
my $fastaSeq = "";
while(<IN>) {
chomp;
if($_ =~ /^>/) {
$prevFastaSeqId = $fastaSeqId;
$fastaSeqId = $_;
if($fastaSeq ne "") {
push(@len, length $fastaSeq);
baseCount($fastaSeq);
}
$fastaSeq = "";
}
else {
$fastaSeq .= $_;
}
}
close IN;
if($fastaSeq ne "") {
$prevFastaSeqId = $fastaSeqId;
push(@len, length $fastaSeq);
baseCount($fastaSeq);
}
my $totalContigs = scalar @len;
my $bases = sum(@len);
my $minContigLen = min(@len);
my $maxContigLen = max(@len);
my $n50 = calcN50(\@len, 50);
my $GCcont = ($Gs+$Cs)/$bases*100;
#MMB sheet order
print "$totalContigs\t$bases\n";
printf OUT "%-25s %d\n", "Number of reads/contigs", $totalContigs;
printf OUT "%-25s %d\n", "Total assembly length", $bases;
#GC
printf OUT "%-25s %0.2f %s\n", "(G + C)s", ($Gs+$Cs)/$bases*100, "%";
#N50
printf OUT "%-25s %d\n", "N50 length", $n50;
print "Contig Statistics file: $outFile\n";
close OUT;
exit;
sub calcN50 {
my @x = @{$_[0]};
my $n = $_[1];
@x=sort{$b<=>$a} @x;
my $total = sum(@x);
my ($count, $n50)=(0,0);
for (my $j=0; $j<@x; $j++){
$count+=$x[$j];
if(($count>=$total*$n/100)){
$n50=$x[$j];
last;
}
}
return $n50;
}
sub baseCount {
my $seq = $_[0];
my $tAs += $seq =~ s/A/A/gi;
my $tTs += $seq =~ s/T/T/gi;
my $tGs += $seq =~ s/G/G/gi;
my $tCs += $seq =~ s/C/C/gi;
$Ns += (length $seq) - $tAs - $tTs - $tGs - $tCs;
$As += $tAs;
$Ts += $tTs;
$Gs += $tGs;
$Cs += $tCs;
}
sub Usage {
print "\n Usage: perl $0 <options>\n\n";
print "### Input reads/contigs (FASTA) (Required)\n";
print " -i <Read/Sequence file>\n";
print " Read/Sequence in fasta format\n";
print " -o | -outputFile <Output file name>\n";
print " default: By default, N50 statistics file will be stored where the input file is\n";
print " -h | -help\n";
print " Prints this help\n";
print "\n";
}
sub Error {
my $msg = $_[0];
printf STDERR "|%-70s|\n", " Error!!";
printf STDERR "|%-70s|\n", " $msg";
Usage();
exit;
}