#!/usr/bin/perl use warnings; use strict; use Bio::SeqIO; use Getopt::Long; use Bio::DB::Fasta; use constant USAGE =>< DESCRIPTION: Picks out sequence entries from a sequence file based on id/accession. It takes a newline separated list of ids/accessions to pick, either as first argument or from STDIN. Genbank and EMBL entries is identified by their accession. Fasta by id. Indexing is only supported for Fasta files. Note that the whole description line is not returned when using indexes. Only the first non-white-space after the '>' is returned which is also that the index uses as unique lookup id. OPTIONS: --informat Specifies the input format. --outformat Specifies the output format. Defaults to input format. --swop Swops the id and sequence streams. Handy when you need to pipe the sequences to STDIN. --list <'id id ...'> Specify list of ids instead of a file. Will ignore specified id file. --not Prints the entries NOT that are not specifies in the id-file. --index Makes the script use indexing. Doesn't work with standard input and only with Fasta files. --reindex Forces creation of new index even if one exists. --justindex Like --reindex but exits after index creation. --help Prints a help. EXAMPLES: cat file.tbl | pickseq.pl --informat EMBL file.embl > newfile.embl pickseq.pl --index file.tbl file.fa > newfile.fa cat file.embl | pickseq.pl --informat embl --list RGYZM0 > newfile.embl cat file.fa | pickseq.pl --swop --list 'RGYZM0 TSYZM03' > newfile.embl AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $list = 0; my $informat = 'Fasta'; my $outformat = ''; my $reindex = 0; my $index = 0; my $justindex = 0; my $not = 0; my $swop = 0; GetOptions("help" => \$help, "informat=s" => \$informat, "outformat=s" => \$outformat, "not" => \$not, "list=s" => \$list, "swop" => \$swop, "index" => \$index, "justindex" => \$justindex, "reindex" => \$reindex) or die USAGE; my $seqfile; my $input; $seqfile = pop @ARGV or die USAGE unless $list and $swop; unless ($list) { $input = pop @ARGV if @ARGV; } my $fh1; if ($input) { open $fh1, "$input" or die $!; } else { open $fh1, "-" or die $!; } $not && ( $index || $justindex || $reindex) and die "--not does not work with indexing.\n"; $reindex ||= $justindex; $swop && ( $index || $reindex || $justindex) and die "Don't use indexing with --swop or --list\n"; $informat = guessformat($informat) or $seqfile =~ /[^\.]+$/ and $informat = guessformat($&) or die "Could not guess informat.\n"; if ($outformat) { $outformat = guessformat($outformat) or die "Could not guess informat.\n"; } else { $outformat = $informat; } my $out = Bio::SeqIO->newFh(-fh => \*STDOUT, -format => $outformat ); if ($index || $reindex || $justindex) { die "Option --index incompatible with STDIN.\n" if $seqfile eq '-'; die "Specified format incompatible with STDIN. (Has to be Fasta).\n" if $outformat ne 'Fasta'; my $fileindex = Bio::DB::Fasta->new($seqfile, -reindex => $reindex); ref($fileindex) eq 'Bio::DB::Fasta' or die "First arg. to fetch not a Bio::DB::Fasta obj.\n"; exit 1 if $justindex; if ($list) { # Get the specified list: my @list = split /\s+/, $list; for my $id (@list) { my $seq = $fileindex->get_Seq_by_id($id) or die "Can't find sequence id: $id in index. Consider the --reindex option.\n"; print $out $seq; } } else { while (my $id = <$fh1>) { chomp $id; my $seq = $fileindex->get_Seq_by_id($id) or die "Can't find sequence id: $id in index. Consider the --reindex option.\n"; print $out $seq; } } } else { my $fh2; my %ids; if ($list) { # Get the specified list: my @list = split /\s+/, $list; for my $id (@list) { $ids{$id} = 1 } # Make $fh2 read from STDIN: open $fh2, ">&STDIN" or die "$seqfile: $!"; # Close unused filehandle: close $fh1; } else { open $fh2, "$seqfile" or die "$seqfile: $!"; if ($swop) { # Interchange the id and sequence stream: ($fh1, $fh2) = swop($fh1, $fh2); } # Get the list from the stream: while (my $id = <$fh1>) { chomp $id; $id =~ /^>*(\S+)/ or die $id; $id = $1; $ids{$id} = 1 } } my $ids = scalar (keys %ids); # Convert sequence filehandle to SeqIO: $fh2 = Bio::SeqIO->newFh(-fh => $fh2, -format => $informat ); my $nr = 0; while (my $seq = <$fh2>) { if (!$not && (defined $ids{$seq->id} or defined $ids{$seq->accession_number})) { print $out $seq; last if ++$nr == $ids; } elsif ($not && not (defined $ids{$seq->id} or defined $ids{$seq->accession_number})) { print $out $seq; last if ++$nr == $ids; } } } 1; sub swop { my ($fh1, $fh2) = @_; # my $tmp = $fh2; # $fh2 = $fh1; # $fh1 = $tmp; return ($fh2, $fh1); } sub indexid { my $str = shift; # if ($format =~ /EMBL|GenBank/) { # $str =~ s/^>[^\|]+\|[^\|]+\|\w+\|([^\|]+)\|/$1/ # } else { $str =~ s/^>(\S+)/$1/; # } return $str; } sub guessformat { my $s = shift @_; my $failed = 0; my $format; SW: { if ($s =~ /(^fasta)|(^fast)|(^fst)|(^fsa)|(^ft)|(^fs)|(^fa)/i) { $format = 'Fasta'; last SW; } if ($s =~ /(lfasta)|(fla)|(lfast)|(lfst)|(lfsa)|(lft)|(lfs)/i) { $format = 'LabeledFasta'; last SW; } if ($s =~ /(embl)|(emb)|(em)|(eml)/i) { $format = 'EMBL'; last SW; } if ($s =~ /(genebank)|(genbank)|(genb)|(geneb)|(gbank)|(gb)/i) { $format = 'GenBank'; last SW; } if ($s =~ /(swissprot)|(sprt)|(swissp)|(sprot)|(sp)|(spr)/i) { $format = 'Swissprot'; last SW; } if ($s =~ /pir/i) { $format = 'PIR'; last SW; } if ($s =~ /gcg/i) { $format = 'GCG'; last SW; } if ($s =~ /scf/i) { $format = 'SCF'; last SW; } if ($s =~ /ace/i) { $format = 'Ace'; last SW; } if ($s =~ /phd/i) { $format = 'phd'; last SW; } if ($s =~ /phred/i) { $format = 'phred'; last SW; } if ($s =~ /raw/i) { $format = 'raw'; last SW; } $failed++; } return eval{$failed ? 0 : $format}; } =head1 SYNOPSIS: pickseq.pl [OPTIONS] [id_file] =head1 DESCRIPTION: Picks out sequence entries from a sequence file based on id/accession. It takes a newline separated list of ids/accessions to pick, either as first argument or from STDIN. Genbank and EMBL entries is identified by their accession. Fasta by id. Indexing is only supported for Fasta files. Note that the whole description line is not returned when using indexes. Only the first non-white-space after the '>' is returned which is also that the index uses as unique lookup id. =head1 OPTIONS: =over 4 =item --informat Specifies the input format. =item --outformat Specifies the output format. Defaults to input format. =item --swop Swops the id and sequence streams. Handy when you need to pipe the sequences to STDIN. =item --list <'id id ...'> Specify list of ids instead of a file. Will ignore specified id file. =item --not Prints the entries NOT that are not specifies in the id-file. =item --index Makes the script use indexing. Doesn't work with standard input and only with Fasta files. =item --reindex Forces creation of new index even if one exists. =item --justindex Like --reindex but exits after index creation. =item --help Prints a help. =back =head1 EXAMPLES: cat file.tbl | pickseq.pl --informat EMBL file.embl > newfile.embl pickseq.pl --index file.tbl file.fa > newfile.fa cat file.embl | pickseq.pl --informat embl --list RGYZM0 > newfile.embl cat file.fa | pickseq.pl --swop --list 'RGYZM0 TSYZM03' > newfile.embl =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut