#! /usr/bin/perl use warnings; use strict; use Bio::SeqIO; use Getopt::Long; use constant USAGE =>< Replaces ORFs with a string of that letter. --minlength Minimum ORF length accepted. EXAMPLES: cat file.fa | orfprune.pl --mask N --minlength 100 > pruned.fa orfprune.pl --mask N --minlength 100 file.embl pruned.gb AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help; my $usage; my $mask; my $minorflength = 360; GetOptions("help" => \$help, "mask=s" => \$mask, "minlength=s" => \$minorflength) or die $usage; die $usage if $help; my $in; if (@ARGV) { my $arg = shift @ARGV; $arg =~ /\.(.*)$/ or die "Could not quess format from suffix.\n"; $format = guessformat($1) or die "Could not quess informat from suffix.\n"; $in = Bio::SeqIO->newFh(-file => "$arg" , '-format' => $format ); } else { $in = Bio::SeqIO->newFh(-fh => \*STDIN , '-format' => 'Fasta' ); } # Create a the output file handle: my $out; if (@ARGV) { my $arg = shift @ARGV; $arg =~ /\.(.*)$/ or die "Could not quess format from suffix.\n"; $format = guessformat($1) or die "Could not quess informat from suffix.\n"; $out = Bio::SeqIO->newFh(-file => ">$arg" , '-format' => $format ); } else { $out = Bio::SeqIO->newFh(-fh => \*STDOUT , '-format' => 'Fasta' ); } if ($mask) { while (my $seq = <$in>) { my $str = $seq->seq(); if ($str =~ s/(ATG(?:...){$minlength,}((?:TAA)|(?:TAG)|(?:TGA)))/$mask x length($1)/egi) { $seq->seq($str); } print $out $seq; } } else { while (<$in>) { print $out $_ if $_->seq() !~ m/(ATG(?:...)*((?:TAA)|(?:TAG)|(?:TGA)))/i; } } sub guessformat { my $s = shift @_; my $failed = 0; my $format; SW: { if ($s =~ /(^fasta)|(^fast)|(^fst)|(^fsa)|(^ft)|(^fs)|(^fa)/i) { $format = 'Fasta'; last SW; } if ($s =~ /(lfasta)|(fla)|(lfast)|(lfst)|(lfsa)|(lft)|(lfs)/i) { $format = 'LabeledFasta'; last SW; } if ($s =~ /(embl)|(emb)|(em)|(eml)/i) { $format = 'EMBL'; last SW; } if ($s =~ /(genebank)|(genbank)|(genb)|(geneb)|(gbank)|(gb)/i) { $format = 'GenBank'; last SW; } if ($s =~ /(swissprot)|(sprt)|(swissp)|(sprot)|(sp)|(spr)/i) { $format = 'Swissprot'; last SW; } if ($s =~ /pir/i) { $format = 'PIR'; last SW; } if ($s =~ /gcg/i) { $format = 'GCG'; last SW; } if ($s =~ /scf/i) { $format = 'SCF'; last SW; } if ($s =~ /ace/i) { $format = 'Ace'; last SW; } if ($s =~ /phd/i) { $format = 'phd'; last SW; } if ($s =~ /phred/i) { $format = 'phred'; last SW; } if ($s =~ /raw/i) { $format = 'raw'; last SW; } $failed++; } return eval{$failed ? 0 : $format}; } =head1 SYNOPSIS: orfprune.pl [OPTIONS] [infile [outfile]] Prunes a seqfile of open reading frames. Either by masking or by removing sequence entries (this is default). =head1 OPTIONS: =over 4 =item --help Prints a help. =item --mask Replaces ORFs with a string of that letter. =item --minlength Minimum ORF length accepted. =back =head1 EXAMPLES: cat file.fa | orfprune.pl --mask N --minlength 100 > pruned.fa orfprune.pl --mask N --minlength 100 file.embl pruned.gb =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut