#!/usr/bin/perl # Time-stamp: <2005-07-06 15:34:52 kasper> use warnings; use strict; use LFasta; use LFastaIO; use Getopt::Long; use constant USAGE =>< [infile [outfile] ] DESCRIPTION: This script adds a prediction track to labeled Fasta entries as specified by a gff file. This is usefull for comaparing predictions. You can add more than one track of predictions from one gff file. In this case the prefix will be the first letter (capitalised) of the source field. Unless you use the onlydefined option you must use the prefix option. Otherwise the script will not know which prefix that goes with the tracks in entries that are not part of a prediction. NOTE: It skips predictions on the negative strand. OPTIONS: --help Prints this help. --onlydefined Prints only entries that are not are part of a prediction on the new track. --prefix Specifies the prefix for the added prediction track. --labelkey Specifies which labels that go with which features. Default is: 'CDS:C exon:C initial:C internal:C terminal:C single:C intron:I default:x' EXAMPLES: cat input.lfa | addprediction.pl pred.gff > ouput.lfa addprediction.pl pred.gff input.lfa ouput.lfa AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $labelkey = 'CDS:C exon:C First:C initial:C Internal:C internal:C Terminal:C terminal:C Single:C single:C intron:I default:x'; my $help = 0; my $onlydefined = 0; my $prefix = ''; GetOptions( "help" => \$help, "onlydefined" => \$onlydefined, "prefix=s" => \$prefix, "labelkey=s" => \$labelkey, ) or die USAGE; $help and die USAGE; $onlydefined || $prefix or die USAGE; # Input and output: @ARGV or die USAGE; my $gff = shift; @ARGV = ('-') unless @ARGV; my $input = shift @ARGV; my $in = LFastaIO->new(file => $input) or die;; @ARGV = ('&STDOUT') unless @ARGV; my $output = shift @ARGV; my $out = LFastaIO->new(file => ">$output", plpprint => 1) or die;; $labelkey = { split /[:\s]+/, $labelkey }; # Get all the gff lines into a hash of arrays: my %gff = (); my $fh; if ($gff) { open $fh, "$gff" or die "$gff: $!\n";; while (my $gff = <$fh>) { my $seqname = (split /\t/, $gff)[0]; push @{$gff{$seqname}}, $gff; } } while (my $lfa = <$in>) { if ( defined $gff{$lfa->id} ) { # next unless defined $gff{$lfa->id}; # Make the prefix the uppercase first letter of the source field: $prefix ||= uc( (split(//, (split /\t/,${$gff{$lfa->id}}[0] )[1] ) )[0] ); # Generate a label string from the GFFs: my $pred = gff2labels($labelkey, $gff{$lfa->id}, $lfa->seq); next unless $pred; if (length $pred != length $lfa->seq) { warn "Length of prediciton line (",length($pred),") not equal to length of sequence (",length($lfa->seq),") for entry: ",$lfa->id,"\n","Probably GFF prediction outside sequence. Length:", length($lfa->seq),"\n", " @{$gff{$lfa->id}}\nSkipping...\n"; next; } # Add the prediction line: $lfa->add_pred( prefix => $prefix, pred => $pred ); print $out $lfa; } elsif (not $onlydefined) { my $pred = $$labelkey{default} x length($lfa->seq); # Add the "prediction" line: $lfa->add_pred( prefix => $prefix, pred => $pred ); print $out $lfa; } } sub gff2labels { my ($labelkey, $gffarray, $sequence) = @_; # Make a default string: my $labels = $$labelkey{default} x length($sequence); # split the GFFs: my @gff; foreach (@$gffarray) { my @array = split( /\t/ ); push @gff, \@array, } # Make sure that the gffs does not overlap: @gff = sort { $a->[3] <=> $b->[3] || $a->[4] <=> $b->[4] } @gff; for (my $i=1; $i<@gff; $i++) { if ($gff[$i][3] <= $gff[$i-1][4] && $gff[$i][6] eq $gff[$i-1][6]) { warn "The gff features overlap for sequence: $gff[$i][0]\n"; return ''; } } # Change the labeling from default in accordance with the GFFs: for my $gff (@gff) { my $replacement = $$labelkey{$$gff[2]} x ($$gff[4]-$$gff[3]+1); next if $$gff[6] eq '-'; substr $labels, $$gff[3]-1, $$gff[4]-$$gff[3]+1, $replacement; } return $labels; } =head1 SYNOPSIS: addprediction.pl [OPTIONS] [infile [outfile] ] =head1 DESCRIPTION: This script adds a prediction track to labeled Fasta entries as specified by a gff file. This is usefull for comaparing predictions. You can add more than one track of predictions from one gff file. In this case the prefix will be the first letter (capitalised) of the source field. Unless you use the onlydefined option you must use the prefix option. Otherwise the script will not know which prefix that goes with the tracks in entries that are not part of a prediction. NOTE: It skips predictions on the negative strand. =head1 OPTIONS: =over 4 =item --help Prints this help. =item --onlydefined Prints only entries that are not are part of a prediction on the new track. =item --prefix Specifies the prefix to to with the added prediction track. =item --labelkey Specifies which labels that go with which features. Default is: 'CDS:C exon:C initial:C internal:C terminal:C single:C intron:I default:x' =back =head1 EXAMPLES: cat input.lfa | addprediction.pl pred.gff > ouput.lfa addprediction.pl pred.gff input.lfa ouput.lfa =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut