#!/usr/bin/perl # Time-stamp: <2004-02-20 16:56:46 kasper> use warnings; use strict; use Getopt::Long; use Bio::SeqIO; use Bio::LocatableSeq; use constant USAGE =>< Specifies format of sequences. Default is EMBL. --regex <(perl) regular expression> Regex to specify what features to include. EXAMPLES: AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $format = 'EMBL'; my $regex = '.'; GetOptions( "help" => \$help, "format=s" => \$format, "regex=s" => \$regex, ) or die USAGE; $help and die USAGE; die "Specified format must be GenBank, EMBL or Swissprot\n" unless $format =~ /GenBank|EMBL|Swissprot/; # Create a SeqIO obj. for input: my $in; if (@ARGV) { my $arg = shift @ARGV; $in = Bio::SeqIO->newFh(-file => "$arg" , '-format' => $format ); } else { $in = Bio::SeqIO->newFh(-fh => \*STDIN , '-format' => $format ); } while (<$in>) { my $seq = $_; my @features = $seq->all_SeqFeatures(); my $i = 1; for my $f (@features) { # Check if the prim tag matches regex: next unless $f->primary_tag() =~ $regex; my $id = $seq->accession . "_" . eval{$i++}; my $gff = make_gff($f, $id, $seq); print "$gff\n"; # If 'mRNA' of 'CDS' features contain sublocations, make GFF # strings for exons and introns: if (ref($f->location()) eq 'Bio::Location::Split' and $f->primary_tag() =~ /mRNA|CDS/i) { # Get the array of Bio::Location::Simple objs.: my @sl = $f->location->sub_Location(); for (my $j = 0; $j < @sl; $j++) { my $subid = $id . "_e" . eval{$j + 1}; my $subgff = make_sub_gff('exon', $sl[$j]->start(), $sl[$j]->end(), $id, $subid, $f); print "$subgff\n"; unless ($j == @sl - 1) { my $subid = $id . "_i" . eval{$j + 1}; $subgff = make_sub_gff('intron', $sl[$j]->end() + 1, $sl[$j+1]->start() - 1, $id, $subid, $f); print "$subgff\n"; } } } } } sub make_gff { my ($feature, $id, $Seq) = @_; # FIGURE OUT WHETHER THE GROUP FIELD SHOULD HAVE A TARGET:, SEQUENCE: # OR OTHER TAG. FOR NOW WE CALL EVERYTHING FOR SEQUENCE. my $type = 'Sequence'; my $gff = $feature->gff_string(); $gff =~ s/^(\S+)(\t\S+\t\S+\t)(\S+)\t(\S+)(\t\S+\t\S+\t\S+\t)(.*)$/ $Seq->accession() . $2 . $feature->start() . "\t" . $feature->end() . $5 . "$type:$id 1 " . eval{$feature->end() - $feature->start() + 1} . " ; " . $6/e; return $gff; } sub make_sub_gff { my ($name, $start, $end, $id, $subid, $feature) = @_; # FIGURE OUT WHETHER THE GROUP FIELD SHOULD HAVE A TARGET:, SEQUENCE: # OR OTHER TAG. FOR NOW WE CALL EVERYTHING FOR SEQUENCE. my $type = 'Sequence'; my $gff = sprintf "$id\tEMBL/GenBank/SwissProt\t$name\t$start\t$end\t.\t%s\t.\t$type:$subid 1 %d", eval{ $feature->strand() == 1 ? '+' : '-' }, eval{$end - $start + 1}; return $gff; # my $gff = $id . "\t" . # "Seq2gff.pl" . "\t" . # $name . "\t" . # $start . "\t" . # $end . "\t" . # "." . "\t" . # eval{ $feature->strand() == 1 ? '+' : '-' } . "\t" . # "." . "\t" . # "$type:" . $subid . " " . '1' . " " . eval{$end - $start + 1}; # This is actually prettier: } =head1 SYNOPSIS: seq2gff.pl [OPTIONS] format inputfile [regex] =head1 DESCRIPTION: This script generates gff from seqfiles. =head1 OPTIONS: =over 4 =item --help Prints this help. =item --format Specifies format of sequences. Default is EMBL. =item --regex <(perl) regular expression> Regex to specify what features to include. =back =head1 EXAMPLES: =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut