#!/usr/bin/perl # Time-stamp: <2005-04-28 22:41:03 kasper> use warnings; use strict; use LFasta; use LFastaIO; use Getopt::Long; use constant USAGE =>< Label to consider. Default is 'C'. --short One line output: Base SN, Base SP, Exon SN, Exon SP --long Prints stats on first, internal, last and single exons, as well as gene stats. --fuzzy Reports the exon stats allowing borders to be at most off relative to the annotation. --exonsplot prints 100*correct_exons/annotated_exons and 100*exons_overlapping_annotated/annotated_exons per gene to . With the --fuzzy option the fuzzy stats are printed too. EXAMPLES: predstat.pl file.lfa file.stats cat file.lfa | predstat.pl > file.stats DEPENDENCIES Classes LFasta and LFastaIO. AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy, modify, and redistribute it under the same terms as Perl itself. END my $help = 0; my $chunkletter = 'C'; my $short = 0; my $long = 0; my $fuzzy = 0; my $errorcounts = 0; my $exonsplot = ''; my $borderplot = ''; GetOptions( "help" => \$help, "short" => \$short, "long" => \$long, "fuzzy=i" => \$fuzzy, "chunkletter=s" => \$chunkletter, "exonsplot=s" => \$exonsplot, "borderplot=s" => \$borderplot, "errorcounts" => \$errorcounts, ) or die USAGE; $help and die USAGE; @ARGV = ('-') unless @ARGV; my $input = shift @ARGV; open my $in, "$input" or die "$input: $!\n"; @ARGV = ('>&STDOUT') unless @ARGV; my $output = shift @ARGV; open my $out, ">$output" or die "$output: $!\n"; # Redefine input to LFastaIO: $in = LFastaIO->new(fh => $in); my $exonsplotFh; if ($exonsplot) { open $exonsplotFh, ">$exonsplot" or die "$exonsplot: $!\n"; } # Hash to hold global stats: my %s; my %exonsplot; my %borderplot; my %borderfiles; my @nochunkwarning = (); my $C = $chunkletter; # Number of sequences: my $N=0; # Number of bases: my $NB=0; my $nr_preds = 0; # Count and do stats for each sequence: while (my $lfa = <$in>) { my $id = $lfa->id; my $lab = $lfa->labels; my $preds = $lfa->preds; my $seqlength = $lfa->length; # Turn it into a regular array where 0, 2, 4 ... are prediction # labels and 1, 3, 5 ... are the predictions: my @preds = (); foreach my $arrayref (@$preds) { push @preds, @$arrayref; } # Number of prediction tracks: $nr_preds = $#preds; ++$N; $NB += length($lab); for (my $k=1; $k<=$nr_preds; $k+=2) { # Initialise all the per gene stats: my $total_annot = 0; my $nucl_correct = 0; my $gene_total_annot = 0; my $gene_total_pred = 0; my $gene_correct = 0; my $gene_fuzzycorrect = 0; my $genes_overlap_annot = 0; my $genes_overlap_pred = 0; my $exon_total_annot = 0; my $exon_total_pred = 0; my $exon_correct = 0; my $exon_fuzzycorrect = 0; my $exons_overlap_annot = 0; my $exons_overlap_pred = 0; my $first_exon_correct = 0; my $first_exon_fuzzycorrect = 0; my $first_overlap_annot = 0; my $first_overlap_pred = 0; my $first_exon_total_annot = 0; my $first_exon_total_pred = 0; my $last_exon_correct = 0; my $last_exon_fuzzycorrect = 0; my $last_overlap_annot = 0; my $last_overlap_pred = 0; my $last_exon_total_annot = 0; my $last_exon_total_pred = 0; my $single_exon_correct = 0; my $single_exon_fuzzycorrect = 0; my $single_overlap_annot = 0; my $single_overlap_pred = 0; my $single_exon_total_annot = 0; my $single_exon_total_pred = 0; my $internal_exon_correct = 0; my $internal_exon_fuzzycorrect = 0; my $internal_overlap_annot = 0; my $internal_overlap_pred = 0; my $internal_exon_total_annot = 0; my $internal_exon_total_pred = 0; my $donor_total_pred = 0; my $inframe_upst_donor = 0; my $inframe_downst_donor = 0; my $outframe_upst_donor = 0; my $outframe_downst_donor = 0; my $inframe_upst_start = 0; my $inframe_downst_start = 0; my $outframe_upst_start = 0; my $outframe_downst_start = 0; my $acc_total_pred = 0; my $inframe_upst_acc = 0; my $inframe_downst_acc = 0; my $outframe_upst_acc = 0; my $outframe_downst_acc = 0; my $inframe_upst_end = 0; my $inframe_downst_end = 0; my $outframe_upst_end = 0; my $outframe_downst_end = 0; my $intronphase = 0; my $introns_total_annot = 0; my $upst_start_right_first_don = 0; my $downst_start_right_first_don = 0; my $wrong_first_exon_right_first_acc = 0; # Make the counts for the bases: my $seq0=$lab; my $seq1=$preds[$k]; # Match regions of $C: while ($seq0 =~ /$C+/g) { my $begin = length($`); my $length = length($&); # cut out region in second sequence: my $z = substr($seq1,$begin,$length); # Remove non-$C: $z =~ s/[^$C]//g; $total_annot += $length; $nucl_correct += length($z); } my $z = $seq1; # Remove non-$C: $z =~ s/[^$C]//g; my $total_pred = length($z); # Put non-$C flanks on the ends so that we are always # able to get a flank char on each chunk: $seq0= "_" . $lab . "_"; $seq1= "_" . $preds[$k] . "_"; # Keep track of what block we are at: my $blocks = 0; # Total number of annotation and prediction blocks: my $nr_annot_blocks = $seq0 =~ s/$C+/$&/g; my $nr_pred_blocks = $seq1 =~ s/$C+/$&/g; # Do counts in the annotation: while ($seq0 =~ /$C+/g) { $blocks++; my $begin = length($`); my $length = length($&); # Cut out region in prediction: my $z = substr($seq1, $begin-1, $length+2); my $annot_upst_flank = substr($seq0, $begin-1, 1); # See if it is a perfect match to the prediction (except # flanks): if ($z =~ /^[^$C]$C+[^$C]$/) { ++$exon_correct; ++$exon_fuzzycorrect; if ($nr_annot_blocks > 1) { $first_exon_correct = 1 if $blocks == 1; $first_exon_fuzzycorrect = 1 if $blocks == 1; $last_exon_correct = 1 if $blocks == $nr_annot_blocks; $last_exon_fuzzycorrect = 1 if $blocks == $nr_annot_blocks; ++$internal_exon_correct if $nr_annot_blocks > 2; ++$internal_exon_fuzzycorrect if $nr_annot_blocks > 2; } elsif ($nr_annot_blocks == 1) { $single_exon_correct = 1; $single_exon_fuzzycorrect = 1; } else { die; } } elsif ($fuzzy) { # Do counts of fuzzy exons: my $fuzzy_upst = substr($seq1, $begin-$fuzzy-1, 2*$fuzzy+2); my $fuzzy_downst = substr($seq1, $begin+$length-$fuzzy-1, 2*$fuzzy+2); my $at_start = 0; my $at_end = 0; $at_start = 1 if $begin-$fuzzy-1 < 0; $at_end = 1 if $begin+$length-$fuzzy-1 + 2*$fuzzy+2 > $seqlength; if ( $z =~ /^[^$C]*$C+[^$C]*$/ # There can be at most one chunk in there && ( $length < 2*$fuzzy # fuzzy regions overlap || ( ($at_start || $fuzzy_upst =~ /^[^$C]+$C+$/) # upst fuzzy region ok && ($at_end || $fuzzy_downst =~ /^$C+[^$C]+$/) # upst fuzzy region ok ) ) ) { ++$exon_fuzzycorrect; if ($nr_annot_blocks > 1) { $first_exon_fuzzycorrect = 1 if $blocks == 1; $last_exon_fuzzycorrect = 1 if $blocks == $nr_annot_blocks; ++$internal_exon_fuzzycorrect if $nr_annot_blocks > 2; } elsif ($nr_annot_blocks == 1) { $single_exon_fuzzycorrect = 1; } else { die; } } } if ($borderplot && $z =~ /^[^$C]*$C+[^$C]*$/) { # we only consider chunks with one chunk in it. my $upst_border; my $downst_border; if ($z =~ /^[^$C]$C+/) { # border is correct $upst_border = 0; } elsif ($z =~ /^[^$C]{2,}($C)/) { # border is inside $upst_border = $-[1] - 1; } else { # Take the upst sequence including the first base of z: my $u = substr($seq1, 0, $begin); if ($u =~ /([^$C])$C+$/) { $upst_border = -1 * (length($u) - $-[1] - 1); } else { die; } } if ($z =~ /$C+[^$C]$/) { $downst_border = 0; } elsif ($z =~ /($C)[^$C]{2,}$/) { $downst_border = -1 * (length($z) - $-[1] - 2); } else { my $d = substr($seq1, $begin+$length, length($seq1)-($begin+$length)); if ($d =~ /^$C+([^$C])/) { $downst_border = $-[1] - 1; } else { die; } } push @{$borderplot{$preds[$k-1]}}, [$upst_border, $downst_border]; # print $borderplotFh "$upst_border\t$downst_border\n"; } # See if there is at least an overlap: if ($z =~ /^.+$C.+$/) { ++$exons_overlap_annot; if ($nr_annot_blocks > 1) { $first_overlap_annot = 1 if $blocks == 1; $last_overlap_annot = 1 if $blocks == $nr_annot_blocks; ++$internal_overlap_annot if $nr_annot_blocks > 2; } elsif ($nr_annot_blocks == 1) { $single_overlap_annot = 1; } if ($errorcounts) { # Make start and donor stats: ++$donor_total_pred; # See if there is no flanking non-chunkletter, in which case # the predicted start/donor site is too upstream: my $pred_upst = substr($seq1, 0, $begin); my $pred_upst_flank; if ($z =~ /^$C/) { # Lets see how much: $pred_upst =~ /([^$C])($C*)$/; $pred_upst_flank = $1; my $shift = length($2); if ($blocks == 1) { # Make a quick check upstream to make sure there are no # first wrong exons: if ($pred_upst !~ /$C+/g) { if ($shift % 3) { ++$outframe_upst_start; } else { ++$inframe_upst_start; } } } else { if ($shift % 3) { ++$outframe_upst_donor; } else { ++$inframe_upst_donor; } } } elsif ($z =~ /^[^$C]{2,}/) { # two or more because ther is one flanking non-C $pred_upst_flank = substr($z, 0, 1); # Start or donor site is too downstream: my $shift = length($&) - 1; if ($blocks == 1) { if ($pred_upst !~ /$C+/) { if ($shift % 3) { ++$outframe_downst_start; } else { ++$inframe_downst_start; } } } else { if ($shift % 3) { ++$outframe_downst_donor; } else { ++$inframe_downst_donor; } } } else { $pred_upst_flank = substr($z, 0, 1); } # Check that the intron phase is right: if ($blocks != 1) { if ($pred_upst_flank ne $annot_upst_flank) { ++$intronphase; } ++$introns_total_annot; } # Make end and acceptor stats: ++$acc_total_pred; my $pred_downst = substr($seq1, $begin+$length, $lfa->length - $begin+$length); if ($z =~ /$C$/) { # Acceptor site is too downstream. Lets see how much: my $pos = $begin+$length; $seq1 =~ /^.{$pos}(.*)/; my $tmp = $1; $tmp =~ /^$C+/; my $shift = length($&); if ($blocks == $nr_annot_blocks) { # Make a quick check downstream to see if there are any # wrong last exons: if ($pred_downst !~ /$C+/) { if ($shift % 3) { ++$outframe_downst_end; } else { ++$inframe_downst_end; } } } else { if ($shift % 3) { ++$outframe_downst_acc; } else { ++$inframe_downst_acc; } } } elsif ($z =~ /[^$C]{2,}$/) { # two or more because ther is one flanking non-C # Acceptor site is too upstream: my $shift = length($&) - 1; if ($blocks == $nr_annot_blocks) { if ($pred_downst !~ /$C+/) { if ($shift % 3) { ++$outframe_upst_end; } else { ++$inframe_upst_end; } } } else { if ($shift % 3) { ++$outframe_upst_acc; } else { ++$inframe_upst_acc; } } } } } # Record numbers of different types of blocks in annotation: ++$exon_total_annot; if ($nr_annot_blocks > 1) { $first_exon_total_annot = 1 if $blocks == 1; $last_exon_total_annot = 1 if $blocks == $nr_annot_blocks; ++$internal_exon_total_annot if $nr_annot_blocks > 2; } elsif ($nr_annot_blocks == 1) { $single_exon_total_annot = 1; } } # Reset block count: $blocks = 0; # Count in the prediction: while ($seq1 =~ /$C+/g) { $blocks++; my $b = length($`); my $l = length($&); # Cut out region in the prediction: my $z = substr($seq0,$b,$l); if ($z =~ /^.+$C.+$/) { ++$exons_overlap_pred; if ($nr_pred_blocks > 1) { $first_overlap_pred = 1 if $blocks == 1; $last_overlap_pred = 1 if $blocks == $nr_pred_blocks; ++$internal_overlap_pred if $nr_pred_blocks > 2; } elsif ($nr_pred_blocks == 1) { $single_overlap_pred = 1; } } # Record numbers of different types of blocks in prediction: ++$exon_total_pred; if ($nr_pred_blocks > 1) { $first_exon_total_pred = 1 if $blocks == 1; $last_exon_total_pred = 1 if $blocks == $nr_pred_blocks; ++$internal_exon_total_pred if $nr_pred_blocks > 2; } elsif ($nr_pred_blocks == 1) { $single_exon_total_pred = 1; } # See how many of the wrong starts that are back on track by # first donor, and if not by first acceptor: if ($inframe_upst_start || $outframe_upst_start) { # How many upst start are right again by first donor: $seq0 =~ /^.[^$C]+$C+/; my $annot_don = length $&; $seq1 =~ /^.[^$C]+$C+/; my $pred_don = length $&; if ($pred_don == $annot_don) { $upst_start_right_first_don = 1; } } elsif ($inframe_downst_start || $outframe_downst_start) { # How many downst start are right again by first donor: $seq0 =~ /^.[^$C]+$C+/; my $annot_acc = length $&; $seq1 =~ /^.[^$C]+$C+/; my $pred_acc = length $&; if ($pred_acc == $annot_acc) { $downst_start_right_first_don = 1; } } if ($first_exon_total_pred-$first_overlap_pred) { # We have a wrong first exon $seq0 =~ /^.[^$C]+/; my $start_offset_annot = length $&; my $str = substr $seq1, $start_offset_annot - 1, 2; if ($str =~ /[^$C]$C/) { $wrong_first_exon_right_first_acc = 1; } } } $gene_total_annot = 1 if $nr_annot_blocks; $gene_total_pred = 1 if $nr_pred_blocks; $gene_correct = 1 if $exon_correct == $nr_annot_blocks; $gene_fuzzycorrect = 1 if $exon_fuzzycorrect == $nr_annot_blocks; $genes_overlap_annot = 1 if $exons_overlap_annot; $genes_overlap_pred = 1 if $exons_overlap_pred; if ($exonsplot) { if (!$exon_total_annot) { push @nochunkwarning, $id; } else { push @{$exonsplot{$id}}, sprintf("%.2f", 100*$exon_correct/$exon_total_annot), sprintf("%.2f", 100*$exons_overlap_annot/$exon_total_annot); if ($fuzzy) { push @{$exonsplot{$id}}, sprintf("%.2f", 100*$exon_fuzzycorrect/$exon_total_annot); } } } elsif (!$exon_total_annot) { push @nochunkwarning, $id; } # Update global stats: # Update stats for bases: $s{nucl_correct}[$k] += $nucl_correct; $s{nucl_total_annot}[$k] += $total_annot; $s{nucl_total_pred}[$k] += $total_pred; $s{nucl_SN}[$k] += $total_annot > 0 ? $nucl_correct/$total_annot : 1; $s{nucl_SP}[$k] += $total_pred > 0 ? $nucl_correct/$total_pred : 1; # Update stats for genes: $s{gene_correct}[$k] += $gene_correct; $s{gene_fuzzycorrect}[$k] += $gene_fuzzycorrect; $s{gene_total_annot}[$k] += $gene_total_annot; $s{gene_total_pred}[$k] += $gene_total_pred; $s{genes_overlap_annot}[$k] += $genes_overlap_annot; $s{genes_overlap_pred}[$k] += $genes_overlap_pred; $s{gene_SN}[$k] += $gene_total_annot > 0 ? $gene_correct/$gene_total_annot : 1; $s{gene_fuzzySN}[$k] += $gene_total_annot > 0 ? $gene_fuzzycorrect/$gene_total_annot : 1; $s{gene_SP}[$k] += $gene_total_pred > 0 ? $gene_correct/$gene_total_pred : 1; $s{gene_fuzzySP}[$k] += $gene_total_pred > 0 ? $gene_fuzzycorrect/$gene_total_pred : 1; $s{missing_genes}[$k] += $gene_total_annot > 0 ? ($gene_total_annot-$genes_overlap_annot)/$gene_total_annot : 0; $s{wrong_genes}[$k] += $gene_total_pred > 0 ? ($gene_total_pred-$genes_overlap_pred)/$gene_total_pred : 0; # Intron phases: $s{intronphase}[$k] += $intronphase; $s{introns_total_annot}[$k] += $introns_total_annot; # Update stats all exons: $s{exon_correct}[$k] += $exon_correct; $s{exon_fuzzycorrect}[$k] += $exon_fuzzycorrect; $s{exon_total_annot}[$k] += $exon_total_annot; $s{exon_total_pred}[$k] += $exon_total_pred; $s{exons_overlap_annot}[$k] += $exons_overlap_annot; $s{exons_overlap_pred}[$k] += $exons_overlap_pred; $s{exon_SN}[$k] += $exon_total_annot > 0 ? $exon_correct/$exon_total_annot : 1; $s{exon_fuzzySN}[$k] += $exon_total_annot > 0 ? $exon_fuzzycorrect/$exon_total_annot : 1; $s{exon_SP}[$k] += $exon_total_pred > 0 ? $exon_correct/$exon_total_pred : 1; $s{exon_fuzzySP}[$k] += $exon_total_pred > 0 ? $exon_fuzzycorrect/$exon_total_pred : 1; $s{missing_exons}[$k] += $exon_total_annot > 0 ? ($exon_total_annot-$exons_overlap_annot)/$exon_total_annot : 0; $s{wrong_exons}[$k] += $exon_total_pred > 0 ? ($exon_total_pred-$exons_overlap_pred)/$exon_total_pred : 0; # Update stats internal exons; $s{internal_exon_correct}[$k] += $internal_exon_correct; $s{internal_exon_fuzzycorrect}[$k] += $internal_exon_fuzzycorrect; $s{internal_exon_total_annot}[$k] += $internal_exon_total_annot; $s{internal_exon_total_pred}[$k] += $internal_exon_total_pred; $s{internal_overlap_annot}[$k] += $internal_overlap_annot; $s{internal_overlap_pred}[$k] += $internal_overlap_pred; $s{internal_exon_SN}[$k] += $internal_exon_total_annot > 0 ? $internal_exon_correct/$internal_exon_total_annot : 1; $s{internal_exon_fuzzySN}[$k] += $internal_exon_total_annot > 0 ? $internal_exon_fuzzycorrect/$internal_exon_total_annot : 1; $s{internal_exon_SP}[$k] += $internal_exon_total_pred > 0 ? $internal_exon_correct/$internal_exon_total_pred : 1; $s{internal_exon_fuzzySP}[$k] += $internal_exon_total_pred > 0 ? $internal_exon_fuzzycorrect/$internal_exon_total_pred : 1; $s{internal_missing_exons}[$k] += $internal_exon_total_annot > 0 ? ($internal_exon_total_annot-$internal_overlap_annot)/$internal_exon_total_annot : 0; $s{internal_wrong_exons}[$k] += $internal_exon_total_pred > 0 ? ($internal_exon_total_pred-$internal_overlap_pred)/$internal_exon_total_pred : 0; # Update stats first exons: $s{first_exon_correct}[$k] += $first_exon_correct; $s{first_exon_fuzzycorrect}[$k] += $first_exon_fuzzycorrect; $s{first_exon_total_annot}[$k] += $first_exon_total_annot; $s{first_exon_total_pred}[$k] += $first_exon_total_pred; $s{first_overlap_annot}[$k] += $first_overlap_annot; $s{first_overlap_pred}[$k] += $first_overlap_pred; $s{first_exon_SN}[$k] += $first_exon_total_annot > 0 ? $first_exon_correct/$first_exon_total_annot : 1; $s{first_exon_fuzzySN}[$k] += $first_exon_total_annot > 0 ? $first_exon_fuzzycorrect/$first_exon_total_annot : 1; $s{first_exon_SP}[$k] += $first_exon_total_pred > 0 ? $first_exon_correct/$first_exon_total_pred : 1; $s{first_exon_fuzzySP}[$k] += $first_exon_total_pred > 0 ? $first_exon_fuzzycorrect/$first_exon_total_pred : 1; $s{first_missing_exons}[$k] += $first_exon_total_annot > 0 ? ($first_exon_total_annot-$first_overlap_annot)/$first_exon_total_annot : 0; $s{first_wrong_exons}[$k] += $first_exon_total_pred > 0 ? ($first_exon_total_pred-$first_overlap_pred)/$first_exon_total_pred : 0; # Update stats last exons: $s{last_exon_correct}[$k] += $last_exon_correct; $s{last_exon_fuzzycorrect}[$k] += $last_exon_fuzzycorrect; $s{last_exon_total_annot}[$k] += $last_exon_total_annot; $s{last_exon_total_pred}[$k] += $last_exon_total_pred; $s{last_overlap_annot}[$k] += $last_overlap_annot; $s{last_overlap_pred}[$k] += $last_overlap_pred; $s{last_exon_SN}[$k] += $last_exon_total_annot > 0 ? $last_exon_correct/$last_exon_total_annot : 1; $s{last_exon_fuzzySN}[$k] += $last_exon_total_annot > 0 ? $last_exon_fuzzycorrect/$last_exon_total_annot : 1; $s{last_exon_SP}[$k] += $last_exon_total_pred > 0 ? $last_exon_correct/$last_exon_total_pred : 1; $s{last_exon_fuzzySP}[$k] += $last_exon_total_pred > 0 ? $last_exon_fuzzycorrect/$last_exon_total_pred : 1; $s{last_missing_exons}[$k] += $last_exon_total_annot > 0 ? ($last_exon_total_annot-$last_overlap_annot)/$last_exon_total_annot : 0; $s{last_wrong_exons}[$k] += $last_exon_total_pred > 0 ? ($last_exon_total_pred-$last_overlap_pred)/$last_exon_total_pred : 0; # Update stats single exons; $s{single_exon_correct}[$k] += $single_exon_correct; $s{single_exon_fuzzycorrect}[$k] += $single_exon_fuzzycorrect; $s{single_exon_total_annot}[$k] += $single_exon_total_annot; $s{single_exon_total_pred}[$k] += $single_exon_total_pred; $s{single_overlap_annot}[$k] += $single_overlap_annot; $s{single_overlap_pred}[$k] += $single_overlap_pred; $s{single_exon_SN}[$k] += $single_exon_total_annot > 0 ? $single_exon_correct/$single_exon_total_annot : 1; $s{single_exon_fuzzySN}[$k] += $single_exon_total_annot > 0 ? $single_exon_fuzzycorrect/$single_exon_total_annot : 1; $s{single_exon_SP}[$k] += $single_exon_total_pred > 0 ? $single_exon_correct/$single_exon_total_pred : 1; $s{single_exon_fuzzySP}[$k] += $single_exon_total_pred > 0 ? $single_exon_fuzzycorrect/$single_exon_total_pred : 1; $s{single_missing_exons}[$k] += $single_exon_total_annot > 0 ? ($single_exon_total_annot-$single_overlap_annot)/$single_exon_total_annot : 0; $s{single_wrong_exons}[$k] += $single_exon_total_pred > 0 ? ($single_exon_total_pred-$single_overlap_pred)/$single_exon_total_pred : 0; $s{donor_total_pred}[$k] += $donor_total_pred; $s{inframe_upst_donor}[$k] += $inframe_upst_donor; $s{inframe_downst_donor}[$k] += $inframe_downst_donor; $s{outframe_upst_donor}[$k] += $outframe_upst_donor; $s{outframe_downst_donor}[$k] += $outframe_downst_donor; $s{inframe_upst_start}[$k] += $inframe_upst_start; $s{inframe_downst_start}[$k] += $inframe_downst_start; $s{outframe_upst_start}[$k] += $outframe_upst_start; $s{outframe_downst_start}[$k] += $outframe_downst_start; $s{acc_total_pred}[$k] += $acc_total_pred; $s{inframe_upst_acc}[$k] += $inframe_upst_acc; $s{inframe_downst_acc}[$k] += $inframe_downst_acc; $s{outframe_upst_acc}[$k] += $outframe_upst_acc; $s{outframe_downst_acc}[$k] += $outframe_downst_acc; $s{inframe_upst_end}[$k] += $inframe_upst_end; $s{inframe_downst_end}[$k] += $inframe_downst_end; $s{outframe_upst_end}[$k] += $outframe_upst_end; $s{outframe_downst_end}[$k] += $outframe_downst_end; # These are some hack stats: $s{upst_start_right_first_don}[$k] += $upst_start_right_first_don; $s{downst_start_right_first_don}[$k] += $downst_start_right_first_don; $s{wrong_first_exon_right_first_acc}[$k] += $wrong_first_exon_right_first_acc; } if ($exonsplot && $N==1) { my @cols; for (my $k=0; $k<=$nr_preds; $k+=2) { my $cols = $fuzzy ? 3 : 2; for (1..$cols) { push @cols, $preds[$k]; } } print $exonsplotFh "# ", join("\t", @cols), "\n"; } # Print out a line with the prediction prefixes: if ($N==1 && !$short) { print $out "\nPrediction"; for (my $k=0; $k<=$nr_preds; $k+=2) { printf($out "%+22s",$preds[$k]); print $out " " x 8; } print $out "\n"; print $out "-" x 14; for (my $k=1; $k<=$nr_preds; $k+=2) { print $out "-" x 2, "+"; print $out "-" x 27; } printf($out "--+\n"); print STDERR "Processing..."; } } print STDERR "\b" x length('Processing...'); if ($exonsplot) { for my $id (keys %exonsplot) { print $exonsplotFh join("\t", @{$exonsplot{$id}}), "\n"; } } if ($borderplot) { for my $prefix (keys %borderplot) { open my $fh, ">$borderplot.$prefix.tbl" or die; for my $ref (@{$borderplot{$prefix}}) { print $fh join("\t", @{$ref}), "\n"; } } } # Do the printing: if ($short) { for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out "%6.2f\t%6.2f\t%6.2f\t%6.2f\n", 100*$s{nucl_SN}[$k]/$N, 100*$s{nucl_SP}[$k]/$N, 100*$s{exon_SN}[$k]/$N, 100*$s{exon_SP}[$k]/$N); } } else { # Bases ##################################### # Base sensitivity: print $out "Base SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{nucl_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{nucl_correct}[$k], $s{nucl_total_annot}[$k], 100.*$s{nucl_correct}[$k]/$s{nucl_total_annot}[$k],100.*$s{nucl_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{nucl_correct}[$k], $s{nucl_total_annot}[$k], 100, 100.*$s{nucl_SN}[$k]/$N); } } printf($out "\n"); # Base specificity: print $out "Base SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{nucl_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{nucl_correct}[$k], $s{nucl_total_pred}[$k], 100.*$s{nucl_correct}[$k]/$s{nucl_total_pred}[$k], 100.*$s{nucl_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{nucl_correct}[$k], $s{nucl_total_pred}[$k], 100, 100.*$s{nucl_SP}[$k]/$N); } } printf($out "\n"); # Exons ##################################### # Exon sensitivity: print $out "Exon SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_correct}[$k], $s{exon_total_annot}[$k], 100.*$s{exon_correct}[$k]/$s{exon_total_annot}[$k], 100.*$s{exon_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_correct}[$k], $s{exon_total_annot}[$k], 100, 100.*$s{exon_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Exon sensitivity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_fuzzycorrect}[$k], $s{exon_total_annot}[$k], 100.*$s{exon_fuzzycorrect}[$k]/$s{exon_total_annot}[$k], 100.*$s{exon_fuzzySN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_fuzzycorrect}[$k], $s{exon_total_annot}[$k], 100, 100.*$s{exon_fuzzySN}[$k]/$N); } } printf($out "\n"); } # Exon specificity: print $out "Exon SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_correct}[$k], $s{exon_total_pred}[$k], 100.*$s{exon_correct}[$k]/$s{exon_total_pred}[$k], 100.*$s{exon_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_correct}[$k], $s{exon_total_pred}[$k], 100, 100.*$s{exon_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Exon specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_fuzzycorrect}[$k], $s{exon_total_pred}[$k], 100.*$s{exon_fuzzycorrect}[$k]/$s{exon_total_pred}[$k], 100.*$s{exon_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_fuzzycorrect}[$k], $s{exon_total_pred}[$k], 100, 100.*$s{exon_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing exons: print $out "Missing exons "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_total_annot}[$k]-$s{exons_overlap_annot}[$k], $s{exon_total_annot}[$k], 100.*($s{exon_total_annot}[$k]-$s{exons_overlap_annot}[$k])/$s{exon_total_annot}[$k], 100.*$s{missing_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_total_annot}[$k]-$s{exons_overlap_annot}[$k], $s{exon_total_annot}[$k], 100, 100.*$s{missing_exons}[$k]/$N); } } printf($out "\n"); # Wrong exons print $out "Wrong exons "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{exon_total_pred}[$k]-$s{exons_overlap_pred}[$k], $s{exon_total_pred}[$k], 100.*($s{exon_total_pred}[$k]-$s{exons_overlap_pred}[$k])/$s{exon_total_pred}[$k], 100.*$s{wrong_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{exon_total_pred}[$k]-$s{exons_overlap_pred}[$k], $s{exon_total_pred}[$k], 100, 100.*$s{wrong_exons}[$k]/$N); } } if ($long) { printf($out "\n\n"); # Internal exons ############################ # Internal exon sensitivity: print $out "Internal exon SN"; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_correct}[$k], $s{internal_exon_total_annot}[$k], 100.*$s{internal_exon_correct}[$k]/$s{internal_exon_total_annot}[$k], 100.*$s{internal_exon_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_correct}[$k], $s{internal_exon_total_annot}[$k], 100, 100.*$s{internal_exon_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Internal exon sensitivity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_fuzzycorrect}[$k], $s{internal_exon_total_annot}[$k], 100.*$s{internal_exon_fuzzycorrect}[$k]/$s{internal_exon_total_annot}[$k], 100.*$s{internal_exon_fuzzySN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_fuzzycorrect}[$k], $s{internal_exon_total_annot}[$k], 100, 100.*$s{internal_exon_fuzzySN}[$k]/$N); } } printf($out "\n"); } # Internal exon specificity: print $out "Internal exon SP"; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_correct}[$k], $s{internal_exon_total_pred}[$k], 100.*$s{internal_exon_correct}[$k]/$s{internal_exon_total_pred}[$k], 100.*$s{internal_exon_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_correct}[$k], $s{internal_exon_total_pred}[$k], 100, 100.*$s{internal_exon_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Internal exon specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_fuzzycorrect}[$k], $s{internal_exon_total_pred}[$k], 100.*$s{internal_exon_fuzzycorrect}[$k]/$s{internal_exon_total_pred}[$k], 100.*$s{internal_exon_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_fuzzycorrect}[$k], $s{internal_exon_total_pred}[$k], 100, 100.*$s{internal_exon_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing internal exons: print $out "Missing int. ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_total_annot}[$k]-$s{internal_overlap_annot}[$k], $s{internal_exon_total_annot}[$k], 100.*($s{internal_exon_total_annot}[$k]-$s{internal_overlap_annot}[$k])/$s{internal_exon_total_annot}[$k], 100.*$s{internal_missing_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_total_annot}[$k]-$s{internal_overlap_annot}[$k], $s{internal_exon_total_annot}[$k], 100, 100.*$s{internal_missing_exons}[$k]/$N); } } printf($out "\n"); # Wrong internal exons print $out "Wrong int. ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{internal_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_total_pred}[$k]-$s{internal_overlap_pred}[$k], $s{internal_exon_total_pred}[$k], 100.*($s{internal_exon_total_pred}[$k]-$s{internal_overlap_pred}[$k])/$s{internal_exon_total_pred}[$k], 100.*$s{internal_wrong_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{internal_exon_total_pred}[$k]-$s{internal_overlap_pred}[$k], $s{internal_exon_total_pred}[$k], 100, 100.*$s{internal_wrong_exons}[$k]/$N); } } printf($out "\n\n"); # First exons ############################### # First exon sensitivity: print $out "First exon SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_correct}[$k], $s{first_exon_total_annot}[$k], 100.*$s{first_exon_correct}[$k]/$s{first_exon_total_annot}[$k], 100.*$s{first_exon_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_correct}[$k], $s{first_exon_total_annot}[$k], 100, 100.*$s{first_exon_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # First exon sensitivity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_fuzzycorrect}[$k], $s{first_exon_total_annot}[$k], 100.*$s{first_exon_fuzzycorrect}[$k]/$s{first_exon_total_annot}[$k], 100.*$s{first_exon_fuzzySN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_fuzzycorrect}[$k], $s{first_exon_total_annot}[$k], 100, 100.*$s{first_exon_fuzzySN}[$k]/$N); } } printf($out "\n"); } # First exon specificity: print $out "First exon SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_correct}[$k], $s{first_exon_total_pred}[$k], 100.*$s{first_exon_correct}[$k]/$s{first_exon_total_pred}[$k], 100.*$s{first_exon_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_correct}[$k], $s{first_exon_total_pred}[$k], 100, 100.*$s{first_exon_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # First exon specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_fuzzycorrect}[$k], $s{first_exon_total_pred}[$k], 100.*$s{first_exon_fuzzycorrect}[$k]/$s{first_exon_total_pred}[$k], 100.*$s{first_exon_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_fuzzycorrect}[$k], $s{first_exon_total_pred}[$k], 100, 100.*$s{first_exon_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing first exons: print $out "Missing first ex"; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_total_annot}[$k]-$s{first_overlap_annot}[$k], $s{first_exon_total_annot}[$k], 100.*($s{first_exon_total_annot}[$k]-$s{first_overlap_annot}[$k])/$s{first_exon_total_annot}[$k], 100.*$s{first_missing_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_total_annot}[$k]-$s{first_overlap_annot}[$k], $s{first_exon_total_annot}[$k], 100, 100.*$s{first_missing_exons}[$k]/$N); } } printf($out "\n"); # Wrong first exons print $out "Wrong first ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{first_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_total_pred}[$k]-$s{first_overlap_pred}[$k], $s{first_exon_total_pred}[$k], 100.*($s{first_exon_total_pred}[$k]-$s{first_overlap_pred}[$k])/$s{first_exon_total_pred}[$k], 100.*$s{first_wrong_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{first_exon_total_pred}[$k]-$s{first_overlap_pred}[$k], $s{first_exon_total_pred}[$k], 100, 100.*$s{first_wrong_exons}[$k]/$N); } } printf($out "\n\n"); # Last exons ################################ # Last exon sensitivity: print $out "Last exon SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_correct}[$k], $s{last_exon_total_annot}[$k], 100.*$s{last_exon_correct}[$k]/$s{last_exon_total_annot}[$k], 100.*$s{last_exon_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_correct}[$k], $s{last_exon_total_annot}[$k], 100, 100.*$s{last_exon_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Last exon sensitivity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_fuzzycorrect}[$k], $s{last_exon_total_annot}[$k], 100.*$s{last_exon_fuzzycorrect}[$k]/$s{last_exon_total_annot}[$k], 100.*$s{last_exon_fuzzySN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_fuzzycorrect}[$k], $s{last_exon_total_annot}[$k], 100, 100.*$s{last_exon_fuzzySN}[$k]/$N); } } printf($out "\n"); } # Last exon specificity: print $out "Last exon SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_correct}[$k], $s{last_exon_total_pred}[$k], 100.*$s{last_exon_correct}[$k]/$s{last_exon_total_pred}[$k], 100.*$s{last_exon_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_correct}[$k], $s{last_exon_total_pred}[$k], 100, 100.*$s{last_exon_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Last exon specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_fuzzycorrect}[$k], $s{last_exon_total_pred}[$k], 100.*$s{last_exon_fuzzycorrect}[$k]/$s{last_exon_total_pred}[$k], 100.*$s{last_exon_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_fuzzycorrect}[$k], $s{last_exon_total_pred}[$k], 100, 100.*$s{last_exon_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing last exons: print $out "Missing last ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_total_annot}[$k]-$s{last_overlap_annot}[$k], $s{last_exon_total_annot}[$k], 100.*($s{last_exon_total_annot}[$k]-$s{last_overlap_annot}[$k])/$s{last_exon_total_annot}[$k], 100.*$s{last_missing_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_total_annot}[$k]-$s{last_overlap_annot}[$k], $s{last_exon_total_annot}[$k], 100, 100.*$s{last_missing_exons}[$k]/$N); } } printf($out "\n"); # Wrong last exons print $out "Wrong last ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{last_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_total_pred}[$k]-$s{last_overlap_pred}[$k], $s{last_exon_total_pred}[$k], 100.*($s{last_exon_total_pred}[$k]-$s{last_overlap_pred}[$k])/$s{last_exon_total_pred}[$k], 100.*$s{last_wrong_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{last_exon_total_pred}[$k]-$s{last_overlap_pred}[$k], $s{last_exon_total_pred}[$k], 100, 100.*$s{last_wrong_exons}[$k]/$N); } } printf($out "\n\n"); # Single exons ############################## # Single exon sensitivity: print $out "Single exon SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_correct}[$k], $s{single_exon_total_annot}[$k], 100.*$s{single_exon_correct}[$k]/$s{single_exon_total_annot}[$k], 100.*$s{single_exon_SN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_correct}[$k], $s{single_exon_total_annot}[$k], 100, 100.*$s{single_exon_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Single exon sensitivity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_fuzzycorrect}[$k], $s{single_exon_total_annot}[$k], 100.*$s{single_exon_fuzzycorrect}[$k]/$s{single_exon_total_annot}[$k], 100.*$s{single_exon_fuzzySN}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_fuzzycorrect}[$k], $s{single_exon_total_annot}[$k], 100, 100.*$s{single_exon_fuzzySN}[$k]/$N); } } printf($out "\n"); } # Single exon specificity: print $out "Single exon SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_correct}[$k], $s{single_exon_total_pred}[$k], 100.*$s{single_exon_correct}[$k]/$s{single_exon_total_pred}[$k], 100.*$s{single_exon_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_correct}[$k], $s{single_exon_total_pred}[$k], 100, 100.*$s{single_exon_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Single exon specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_fuzzycorrect}[$k], $s{single_exon_total_pred}[$k], 100.*$s{single_exon_fuzzycorrect}[$k]/$s{single_exon_total_pred}[$k], 100.*$s{single_exon_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_fuzzycorrect}[$k], $s{single_exon_total_pred}[$k], 100, 100.*$s{single_exon_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing single exons: print $out "Missing singl ex"; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_total_annot}[$k]-$s{single_overlap_annot}[$k], $s{single_exon_total_annot}[$k], 100.*($s{single_exon_total_annot}[$k]-$s{single_overlap_annot}[$k])/$s{single_exon_total_annot}[$k], 100.*$s{single_missing_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_total_annot}[$k]-$s{single_overlap_annot}[$k], $s{single_exon_total_annot}[$k], 100, 100.*$s{single_missing_exons}[$k]/$N); } } printf($out "\n"); # Wrong single exons print $out "Wrong single ex "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{single_exon_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_total_pred}[$k]-$s{single_overlap_pred}[$k], $s{single_exon_total_pred}[$k], 100.*($s{single_exon_total_pred}[$k]-$s{single_overlap_pred}[$k])/$s{single_exon_total_pred}[$k], 100.*$s{single_wrong_exons}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{single_exon_total_pred}[$k]-$s{single_overlap_pred}[$k], $s{single_exon_total_pred}[$k], 100, 100.*$s{single_wrong_exons}[$k]/$N); } } printf($out "\n\n"); # Genes ##################################### # Gene sensitivity: print $out "Gene SN "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_correct}[$k], $s{gene_total_annot}[$k], 100.*$s{gene_correct}[$k]/$s{gene_total_annot}[$k], 100.*$s{gene_SN}[$k]/$N ); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_correct}[$k], $s{gene_total_annot}[$k], 100, 100.*$s{gene_SN}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_fuzzycorrect}[$k], $s{gene_total_annot}[$k], 100.*$s{gene_fuzzycorrect}[$k]/$s{gene_total_annot}[$k], 100.*$s{gene_fuzzySN}[$k]/$N ); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_fuzzycorrect}[$k], $s{gene_total_annot}[$k], 100, 100.*$s{gene_fuzzySN}[$k]/$N); } } printf($out "\n"); } # Gene specificity: print $out "Gene SP "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_correct}[$k], $s{gene_total_pred}[$k], 100.*$s{gene_correct}[$k]/$s{gene_total_pred}[$k], 100.*$s{gene_SP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_correct}[$k], $s{gene_total_pred}[$k], 100, 100.*$s{gene_SP}[$k]/$N); } } printf($out "\n"); if ($fuzzy) { # Gene specificity: printf $out "... fuzzy %-6d", $fuzzy; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_fuzzycorrect}[$k], $s{gene_total_pred}[$k], 100.*$s{gene_fuzzycorrect}[$k]/$s{gene_total_pred}[$k], 100.*$s{gene_fuzzySP}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_fuzzycorrect}[$k], $s{gene_total_pred}[$k], 100, 100.*$s{gene_fuzzySP}[$k]/$N); } } printf($out "\n"); } # Missing genes: print $out "Missing genes "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_annot}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_total_annot}[$k]-$s{genes_overlap_annot}[$k], $s{gene_total_annot}[$k], 100.*($s{gene_total_annot}[$k]-$s{genes_overlap_annot}[$k])/$s{gene_total_annot}[$k], 100.*$s{missing_genes}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_total_annot}[$k]-$s{genes_overlap_annot}[$k], $s{gene_total_annot}[$k], 100, 100.*$s{missing_genes}[$k]/$N); } } printf($out "\n"); # Wrong genes print $out "Wrong genes "; for (my $k=1; $k<=$nr_preds; $k+=2) { if ($s{gene_total_pred}[$k]>0) { printf($out " %7d %7d %6.2f %6.2f", $s{gene_total_pred}[$k]-$s{genes_overlap_pred}[$k], $s{gene_total_pred}[$k], 100.*($s{gene_total_pred}[$k]-$s{genes_overlap_pred}[$k])/$s{gene_total_pred}[$k], 100.*$s{wrong_genes}[$k]/$N); } else { printf($out " %7d %7d %6.2f %6.2f", $s{gene_total_pred}[$k]-$s{genes_overlap_pred}[$k], $s{gene_total_pred}[$k], 100, 100.*$s{wrong_genes}[$k]/$N); } } printf($out "\n\n"); } # Errorcounts #################################### if ($errorcounts) { print $out "-- Error counts: -----"; for (my $k=1; $k<=$nr_preds; $k+=2) { print $out "-" x 30; } print $out "\n\n"; print $out "wrong intr.phase"; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{intronphase}[$k], $s{introns_total_annot}[$k], 100*$s{intronphase}[$k]/$s{introns_total_annot}[$k], 'xxx'); } print $out "\n\n"; print $out "In-fr 5' start "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_upst_start}[$k], $N, 100*$s{inframe_upst_start}[$k]/$N, 'xxx'); } print $out "\n"; print $out "In-fr 3' start "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_downst_start}[$k], $N, 100*$s{inframe_downst_start}[$k]/$N, 'xxx'); } print $out "\n"; print $out "Out-fr 5' start "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_upst_start}[$k], $N, 100*$s{outframe_upst_start}[$k]/$N, 'xxx'); } print $out "\n"; print $out "Out-fr 3' start "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_downst_start}[$k], $N, 100*$s{outframe_downst_start}[$k]/$N, 'xxx'); } print $out "\n\n"; print $out "In-fr 5' end "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_upst_end}[$k], $N, 100*$s{inframe_upst_end}[$k]/$N, 'xxx'); } print $out "\n"; print $out "In-fr 3' end "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_downst_end}[$k], $N, 100*$s{inframe_downst_end}[$k]/$N, 'xxx'); } print $out "\n"; print $out "Out-fr 5' end "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_upst_end}[$k], $N, 100*$s{outframe_upst_end}[$k]/$N, 'xxx'); } print $out "\n"; print $out "Out-fr 3' end "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", 100*$s{outframe_downst_end}[$k], $N, $s{outframe_downst_end}[$k]/$N, 'xxx'); } print $out "\n\n"; print $out "In-fr 5' donor "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_upst_donor}[$k], $s{donor_total_pred}[$k], 100*$s{inframe_upst_donor}[$k]/$s{donor_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "In-fr 3' donor "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_downst_donor}[$k], $s{donor_total_pred}[$k], 100*$s{inframe_downst_donor}[$k]/$s{donor_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "Out-fr 5' donor "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_upst_donor}[$k], $s{donor_total_pred}[$k], 100*$s{outframe_upst_donor}[$k]/$s{donor_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "Out-fr 3' donor "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_downst_donor}[$k], $s{donor_total_pred}[$k], 100*$s{outframe_downst_donor}[$k]/$s{donor_total_pred}[$k], 'xxx'); } print $out "\n\n"; print $out "In-fr 5' acc "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_upst_acc}[$k], $s{acc_total_pred}[$k], 100*$s{inframe_upst_acc}[$k]/$s{acc_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "In-fr 3' acc "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{inframe_downst_acc}[$k], $s{acc_total_pred}[$k], 100*$s{inframe_downst_acc}[$k]/$s{acc_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "Out-fr 5' acc "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_upst_acc}[$k], $s{acc_total_pred}[$k], 100*$s{outframe_upst_acc}[$k]/$s{acc_total_pred}[$k], 'xxx'); } print $out "\n"; print $out "Out-fr 3' acc "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{outframe_downst_acc}[$k], $s{acc_total_pred}[$k], 100*$s{outframe_downst_acc}[$k]/$s{acc_total_pred}[$k], 'xxx'); } print $out "\n\n"; print $out "-- Extras: -----"; for (my $k=1; $k<=$nr_preds; $k+=2) { print $out "-" x 30; } print $out "\n\n"; # Some additional stats #################### print $out "5'st. 1.don OK "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{upst_start_right_first_don}[$k], $s{outframe_upst_start}[$k] + $s{inframe_upst_start}[$k], 100*$s{outframe_upst_start}[$k] + $s{inframe_upst_start}[$k] ? $s{upst_start_right_first_don}[$k] / ($s{outframe_upst_start}[$k] + $s{inframe_upst_start}[$k]) : 0, 'xxx'); } print $out "\n"; print $out "3'st. 1.don OK "; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{downst_start_right_first_don}[$k], $s{outframe_downst_start}[$k] + $s{inframe_downst_start}[$k], 100*$s{outframe_downst_start}[$k] + $s{inframe_downst_start}[$k] ? $s{downst_start_right_first_don}[$k] / ($s{outframe_downst_start}[$k] + $s{inframe_downst_start}[$k]) : 0, 'xxx'); } print $out "\n"; print $out "wr.f.ex 1.acc OK"; for (my $k=1; $k<=$nr_preds; $k+=2) { printf($out " %7d %7d %6.2f %6s", $s{wrong_first_exon_right_first_acc}[$k], $s{wrong_exons}[$k], 100*$s{wrong_exons}[$k] ? $s{wrong_first_exon_right_first_acc}[$k] / $s{wrong_exons}[$k] : 0, 'xxx'); } print $out "\n\n"; } print $out "\n"; print $out "-" x 14; for (my $k=1; $k<=$nr_preds; $k+=2) { print $out "-" x 2, "+"; print $out "-" x 27; } printf($out "--+\n"); # Sequence and base count: print $out "Sequences: $N\n"; print $out "Bases: $NB\n\n"; } if (@nochunkwarning) { print $out "WARNING: Found entries without chunkletters in label track:\n"; print join("\n", @nochunkwarning), "\n\n"; } sub min() { if ($_[0] < $_[1]) { return $_[0]; } else { return $_[1]; } } sub max() { if ($_[0] > $_[1]) { return $_[0]; } else { return $_[1]; } } =head1 SYNOPSIS: predstat.pl [OPTIONS] [infile [outfile]] =head1 DESCRIPTION: This script makes statistics for one or more predictions in a LFasta file. The LFasta file has to hold both the label track together with the prediction track(s) that you want to compare to. NOTE that it makes the assumption that each LFasta entry holds one and only one annotated gene. So you may get strange results for the gene stats it there is more than one in the annotation or prediction. =head1 OPTIONS: =over 4 =item --help Prints this help. =item --errorcounts Prints counts of some types of errors. =item --chunkletter Label to consider. Default is 'C'. =item --short One line output: Base SN, Base SP, Exon SN, Exon SP =item --long Prints stats on first, internal, last and single exons, as well as gene stats. =item --fuzzy Reports the exon stats allowing borders to be at most off relative to the annotation. =back =head1 EXAMPLES: predstat.pl file.lfa file.stats cat file.lfa | predstat.pl > file.stats DEPENDENCIES Classes LFasta and LFastaIO. =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy, modify, and redistribute it under the same terms as Perl itself. =cut