#!/usr/local/bin/perl -w # Time-stamp: <2004-04-12 20:14:53 kasper> use warnings; use Getopt::Long; use constant USAGE =><= ... DESCRIPTION: Takes a labeled Fasta from STDIN and transforms labels to GFF entries The label-to-feature translations are given as eg C=coding I=intron etc. Only specified labels are turned into GFF lines Labels are grouped by '-g transcript=EIiC' for instance to group labels EIiC into a transcript (which are enumerated) OPTIONS: --group = Specifies grouping by parent field. (see example) --process Regular expression for the line prefixes to process. --help Prints this help. EXAMPLES: cat file.lfa | lfa2gff.pl --group transcript=CI C=exon I=intron # will produce something like: 76XGX787 lfa2gff.pl exon 504 659 . . transcript0 76XGX787 lfa2gff.pl intron 660 713 . . transcript0 76XGX787 lfa2gff.pl exon 714 1297 . . transcript0 76XGX787 lfa2gff.pl intron 1298 1344 . . transcript0 76XGX787 lfa2gff.pl exon 1345 1858 . . transcript0 AUTHOR: Anders Krogh COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $group = ''; my $process = '#|0'; GetOptions( "help" => \$help, "group=s" => \$group, "process=s" => \$process, ) or die USAGE; $help and die USAGE; # Extract label translations and get filename for $a ( @ARGV ) { if ( $a =~ /\=/ ) { ($x,$y) = split(/\=/,$a); $featurekey{$x} = $y; print STDERR "Label $x translated to \"$y\"\n"; } else { die "didn't understand $a\n"; } } # Make the group regular expression if ($group) { ( $groupnm, $groups ) = split(/\=/,$group); $groups = "[$groups]+"; print STDERR "Group \"$groupnm\", regular expression: $groups\n"; } # Make label regular expression $label_regexp = join("+|",keys(%featurekey)); $label_regexp .= "+"; print STDERR "label regular expression: $label_regexp\n"; while ( @entry = &read_entry(\*STDIN) ) { $entry[0] =~ /^>*(\S+)/; $id = $1; # This deletes the sequence! $entry[1] = '#'; $i = 2; while ( defined($entry[$i]) ) { print "$entry[$i-1]\n"; if ( $entry[$i-1] !~ $process ) { $i += 2; next; } # Look for groups first $ng = $ig = 0; while ( $group && $entry[$i] =~ /$groups/g ) { $gb[$ng] = length($`); $ge[$ng] = $gb[$ng]+length($&); ++$ng; } # Look for labeled regions while ( $entry[$i] =~ /$label_regexp/g ) { $b = length($`); $e = $b + length($&); $key = substr($&,0,1); if ($group) { while ( $b > $ge[$ig] ) { ++$ig; } if ($b >= $gb[$ig]) { $group = "$groupnm$ig"; } else { $group = "."; } } $method = $0; $method =~ s/.*?([^\/]+)$/$1/; @gff = ($id,$method,$featurekey{$key},$b+1,$e,".",".",$group); print_gff(\@gff); } $i += 2; } } sub read_entry { if (defined($_[0])) { $fh = $_[0]; } else { $fh = \*STDIN; } @entry = (); %labels = (); $nr=0; $nl = 3; $entry[0] = ""; $nid = 0; if ($lastline) { $entry[0]=$lastline; $lastline = ""; $nid=1; } else { $nid=0; } $entry[1] = ""; while (<$fh>) { chop $_; if ($_ =~ /^>/) { if ($nid==1) { $lastline = $_; return @entry; } $entry[0] = $_; ++$nid; ++$nr; } elsif ($nid && $_ =~ /^%/) { # Append comments to the id line $entry[0] = $entry[0] . "\n" . $_; } elsif ($nid) { if ($_ =~ /^#/) { if ( !defined($entry[2]) ) { $entry[2] = ""; } $_ =~ s/^#//; $k = 2; } elsif ($_ =~ /^\?/) { $lab = substr($_,1,1); if (defined($labels{$lab})) { $k=$labels{$lab}; } else { $entry[$nl] = $lab; $labels{$lab} = $nl+1; $k = $nl+1; $nl +=2; $entry[$k] = ""; } $_ =~ s/^\?.//; } else { $k = 1; } $_ =~ s/ //g; $entry[$k] .= $_; ++$nr; } } if (!$nr) { @entry = () ; } return @entry; } sub print_gff { my @entry = @{$_[0]}; my $i=0; for ($i=0; $i<8; ++$i) { $entry[$i] = "." if (!defined($entry[$i])); print "$entry[$i]"; print "\t" if ($i<7); } print "\t$entry[8]" if (defined($entry[8])); print "\t$entry[9]" if (defined($entry[9])); print "\n"; } =head1 SYNOPSIS: lfa2gff.pl [OPTIONS] = ... =head1 DESCRIPTION: Takes a labeled Fasta from STDIN and transforms labels to GFF entries The label-to-feature translations are given as eg C=coding I=intron etc. Only specified labels are turned into GFF lines Labels are grouped by '-g transcript=EIiC' for instance to group labels EIiC into a transcript (which are enumerated) =head1 OPTIONS: =over 4 =item --group = Specifies grouping by parent field. (see example) =item --process Regular expression for the line prefixes to process. =item --help Prints this help. =back =head1 EXAMPLES: cat file.lfa | lfa2gff.pl --group transcript=CI C=exon I=intron # will produce something like: 76XGX787 lfa2gff.pl exon 504 659 . . transcript0 76XGX787 lfa2gff.pl intron 660 713 . . transcript0 76XGX787 lfa2gff.pl exon 714 1297 . . transcript0 76XGX787 lfa2gff.pl intron 1298 1344 . . transcript0 76XGX787 lfa2gff.pl exon 1345 1858 . . transcript0 =head1 AUTHOR: Anders Krogh =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut # LocalWords: lfa