#!/usr/bin/perl use Getopt::Long; use constant USAGE =>< Label for introns not in CDS and hence not in any phase. Defaults to 'x'. --help Prints this help. EXAMPLES: cat file1.lfasta | $0 > file2.lfsta AUTHOR: Andes Krogh / (Kasper Munch) COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $flankingintronlabel = 'x'; GetOptions( "flankingintronlabel=s" => \$flankingintronlabel, "help" => \$help, ) or die USAGE; $help and die USAGE; while ( @entry = read_entry() ) { $n = 0; while ( $entry[2] =~ /C+[CI]+C+/g ) { $genebeg[$n] = length($`); $genelen[$n] = length($&); ++$n; } $l = length($entry[1]); # print STDERR "Number of genes: $n\n"; $error = 0; # Find the phase for the first intron: if ($n) { # If first gene does NOT start at pos 0: if ( $genebeg[0] > 0) { $phase = 0; } # If first gene DOES start at pos 0 and is contained in the # fasta entry: elsif ( $genebeg[0] + $genelen[0] < $l ) { $x = substr($entry[2],$genebeg[0],$genelen[0]); $x =~ s/I//g; $phase = 3 - length($x) % 3; } # Find the most likely frame: else { $ferror = 0; @CDSlist = splice_gene(\@entry); $cds = $CDSlist[0]; $cds =~ s/_//g; $cds =~ s/(TAA|TGA|TAG)$//; for ($i=0; $i<3; ++$i) { $cds =~ m/^(...)+/; $x = $&; $x =~ s/(...)/ &change_stop($1) /ge; if ( $x =~ /\@/ ) { ++$ferror; } else { $phase=3-$i; } $cds = substr($cds,1); } # If there is an error in all three frames, # or the frame is ambiguous, report if ( $ferror != 2 ) { ++$error; } } } if ($error) { die "There is an error in all three frames, or the frame is ambiguous.\n"; } if ( !$error ) { $phase = $phase % 3; # This is smart but extremely slow! # $entry[2] =~ s/(.C*)(I+)/ &intron_label($1,$2) /ge; $x = $entry[2]; $entry[2] = ""; $nintron = 0; # Make progressive matching: while ( $x =~ /I+/) { $li = length($&); $preceding = $`; $entry[2] .= $`; $following = $'; $x = $'; if ($following =~ /^C+/ && $preceding =~ /C+$/) { # Get the length of the preceding exon. $le = length($&); # New gene if ( $entry[2] =~ /[x$flankingintronlabel]C+$/) { $nintron = $phase = 0; } $phase = ($phase+$le)%3; ++$nintron ; $entry[2] .= "$phase" x $li; } else { $entry[2] .= "$flankingintronlabel" x $li; } } $entry[2] .= $x; print_entry(\@entry); } } sub change_stop { if ( $_[0] =~ /TAA|TGA|TAG/ ) {return '@';} else {$_= '';} } sub intron_label { my $l1 = length($_[0])-1; my $l2 = length($_[1]); if ($_[0] !~ /^I/ ) { $phase = 0; } elsif ( $_[0] =~ /^C/ ) { $l1 += 1; } $phase = ($phase+$l1)%3; return $_[0] . "$phase" x $l2; } sub read_entry { if (defined($_[0])) { $fh = $_[0]; } else {$fh = \*STDIN;} @entry = (); %labels = (); $nr=0; $nl = 3; $entry[0] = ""; $nid = 0; if ($lastline) { $entry[0]=$lastline; $lastline = ""; $nid=1; } else {$nid=0;} $entry[1] = ""; while (<$fh>) { chop $_; if ($_ =~ /^>/) { if ($nid==1) { $lastline = $_; return @entry; } $entry[0] = $_; ++$nid; ++$nr; } elsif ($nid && $_ =~ /^%/) { # Append comments to the id line $entry[0] = $entry[0] . "\n" . $_; } elsif ($nid) { if ($_ =~ /^#/) { if ( !defined($entry[2]) ) { $entry[2] = ""; } $_ =~ s/^#//; $k = 2; } elsif ($_ =~ /^\?/) { $lab = substr($_,1,1); if (defined($labels{$lab})) { $k=$labels{$lab}; } else { $entry[$nl] = $lab; $labels{$lab} = $nl+1; $k = $nl+1; $nl +=2; $entry[$k] = ""; } $_ =~ s/^\?.//; } else { $k = 1; } $_ =~ s/ //g; $entry[$k] .= $_; ++$nr; } } if (!$nr) { @entry = () ;} return @entry; } # Splice genes according to labels # If a gene touches an end or starts/ends with intron, an '_' it appended # to that end. sub splice_gene { @entry = @{$_[0]}; my $k=0; my @cds = ''; while ( $entry[2] =~ /C+/g ) { $n=length($`); $l=length($&); if (!defined($cds[$k])) { $cds[$k] = ''; } $cds[$k] = $cds[$k] . substr($entry[1],$n,$l); if ( $n==0 || $` =~ /^I+$/ ) {$cds[$k] = '_' . $cds[$k]; } if (length($')==0 || $' =~ /^I+$/) {$cds[$k] = $cds[$k] . '_'; } if ( $' =~ /^[0EXx]/ ) { ++$k; } } return @cds; } # Print an entry with a label # First arg is the entry, second (optional) is the filehandle # Eg. # print_entry(\@entry,\*OFILE); sub print_entry { if (defined($_[1])) { $fh = $_[1]; } else {$fh = \*STDOUT;} $linelen = 70; @entry = @{$_[0]}; print $fh ">" if ( $entry[0]!~/^>/ ); print $fh "$entry[0]\n"; $movein = ""; if (defined($entry[2])) { $movein = ' '; $labpref = '# ' } if ($#entry>2) { $movein = ' '; $labpref = '# '; } $l = length($entry[1]); for ($i=0; $i<$l; $i += $linelen) { print $fh $movein . substr($entry[1],$i,$linelen) . "\n"; if (defined($entry[2])) { print $fh $labpref . substr($entry[2],$i,$linelen) . "\n"; } for ($k=3; $k<$#entry; $k += 2) { if ($entry[$k+1]) { if (defined($entry[$k])) { print $fh "?$entry[$k] ";} else { print $fh '? '; } print $fh substr($entry[$k+1],$i,$linelen) . "\n"; } } } } =head1 SYNOPSIS: intronphase.pl [OPTIONS] =head1 DESCRIPTION: This script changes all I labeling (intron) to 0, 1, or 2. =head1 OPTIONS: =over 4 =item --flankingintronlabel Prints this help. =item --help Prints this help. =back =head1 EXAMPLES: cat file1.lfasta | $0 > file2.lfsta =head1 AUTHOR: Andes Krogh / (Kasper Munch) =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut