#!/usr/bin/perl

use warnings;
use strict;
use Bio::SeqIO;
use Getopt::Long;
use Bio::DB::Fasta;
use constant USAGE =><<END;

SYNOPSIS:

 pickseq.pl [OPTIONS] [id_file] <file_to_pick_from>

DESCRIPTION:

Picks out sequence entries from a sequence file based on id/accession.
It takes a newline separated list of ids/accessions to pick, either as
first argument or from STDIN. Genbank and EMBL entries is identified
by their accession. Fasta by id.

Indexing is only supported for Fasta files. Note that the whole
description line is not returned when using indexes. Only the first
non-white-space after the '>' is returned which is also that the index
uses as unique lookup id.

OPTIONS:

       --informat <format>
              Specifies the input format.
       --outformat <format>
              Specifies the output format. Defaults to input format.
       --swop
              Swops the id and sequence streams. Handy when you need
              to pipe the sequences to STDIN.
       --list <'id id ...'>
              Specify list of ids instead of a file. Will ignore
              specified id file.
       --not
              Prints the entries NOT that are not specifies in the
              id-file.
       --index
              Makes the script use indexing. Doesn't work with
              standard input and only with Fasta files.
       --reindex
              Forces creation of new index even if one exists.
       --justindex
              Like --reindex but exits after index creation.
       --help
             Prints a help.

EXAMPLES:

 cat file.tbl | pickseq.pl --informat EMBL file.embl > newfile.embl

 pickseq.pl --index file.tbl file.fa > newfile.fa

 cat file.embl | pickseq.pl --informat embl --list RGYZM0 > newfile.embl

 cat file.fa | pickseq.pl --swop --list 'RGYZM0 TSYZM03' > newfile.embl

AUTHOR:

Kasper Munch

COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.

END

my $help = 0;
my $list = 0;
my $informat = 'Fasta';
my $outformat = '';
my $reindex = 0;
my $index = 0;
my $justindex = 0;
my $not = 0;
my $swop = 0;

GetOptions("help" => \$help,
           "informat=s" => \$informat,
           "outformat=s" => \$outformat,
           "not" => \$not,
           "list=s" => \$list,
           "swop" => \$swop,
           "index" => \$index,
           "justindex" => \$justindex,
           "reindex" => \$reindex) or die USAGE;

my $seqfile;
my $input;

$seqfile = pop @ARGV or die USAGE unless $list and $swop;
unless ($list) {
  $input = pop @ARGV if @ARGV;
}

my $fh1;
if ($input) {
  open $fh1, "$input" or die $!;
} else {
  open $fh1, "-" or die $!;
}

$not && ( $index || $justindex || $reindex)
  and die "--not does not work with indexing.\n";

$reindex ||= $justindex;

$swop && ( $index || $reindex || $justindex)
  and die "Don't use indexing with --swop or --list\n";

$informat = guessformat($informat)
  or $seqfile =~ /[^\.]+$/
  and $informat = guessformat($&)
  or die "Could not guess informat.\n";

if ($outformat) {
  $outformat = guessformat($outformat)
    or die "Could not guess informat.\n";
} else {
  $outformat = $informat;
}

my $out = Bio::SeqIO->newFh(-fh => \*STDOUT,
                            -format => $outformat );

if ($index || $reindex || $justindex) {

  die "Option --index incompatible with STDIN.\n" if $seqfile eq '-';
  die "Specified format incompatible with STDIN. (Has to be Fasta).\n" if $outformat ne 'Fasta';

  my $fileindex = Bio::DB::Fasta->new($seqfile, -reindex => $reindex);
  ref($fileindex) eq 'Bio::DB::Fasta' or die "First arg. to fetch not a Bio::DB::Fasta obj.\n";

  exit 1 if $justindex;

  if ($list) {
    # Get the specified list:
    my @list = split /\s+/, $list;
    for my $id (@list) {
      my $seq = $fileindex->get_Seq_by_id($id)
        or die "Can't find sequence id: $id in index. Consider the --reindex option.\n";
      print $out $seq;
    }
  } else {
    while (my $id = <$fh1>) {
      chomp $id;
      my $seq = $fileindex->get_Seq_by_id($id)
        or die "Can't find sequence id: $id in index. Consider the --reindex option.\n";
      print $out $seq;
    }
  }
} else {

  my $fh2;
  my %ids;
  if ($list) {
    # Get the specified list:
    my @list = split /\s+/, $list;
    for my $id (@list) {
      $ids{$id} = 1
    }
    # Make $fh2 read from STDIN:
    open $fh2, ">&STDIN" or die "$seqfile: $!";
    # Close unused filehandle:
    close $fh1;
  } else {
    open $fh2, "$seqfile" or die "$seqfile: $!";
    if ($swop) {
      # Interchange the id and sequence stream:
      ($fh1, $fh2) = swop($fh1, $fh2);
    }
    # Get the list from the stream:
    while (my $id = <$fh1>) {
      chomp $id;
      $id =~ /^>*(\S+)/ or die $id;
      $id = $1;
      $ids{$id} = 1
    }
  }
  my $ids = scalar (keys %ids);

  # Convert sequence filehandle to SeqIO:
  $fh2 = Bio::SeqIO->newFh(-fh => $fh2, -format => $informat );

  my $nr = 0;
  while (my $seq = <$fh2>) {
    if (!$not && (defined $ids{$seq->id} or defined $ids{$seq->accession_number})) {
      print $out $seq;
      last if ++$nr == $ids;
    } elsif ($not && not (defined $ids{$seq->id} or defined $ids{$seq->accession_number})) {
      print $out $seq;
      last if ++$nr == $ids;
    }
  }

}
1;

sub swop {
  my ($fh1, $fh2) = @_;
#   my $tmp = $fh2;
#   $fh2 = $fh1;
#   $fh1 = $tmp;
  return ($fh2, $fh1);
}


sub indexid {
  my $str = shift;
#   if ($format =~ /EMBL|GenBank/) {
#     $str =~ s/^>[^\|]+\|[^\|]+\|\w+\|([^\|]+)\|/$1/
#   } else {
    $str =~ s/^>(\S+)/$1/;
#  }
  return $str;
}

sub guessformat {
  my $s = shift @_;
  my $failed = 0;
  my $format;
 SW: {
    if ($s =~ /(^fasta)|(^fast)|(^fst)|(^fsa)|(^ft)|(^fs)|(^fa)/i) {
      $format = 'Fasta'; last SW;
    }
    if ($s =~ /(lfasta)|(fla)|(lfast)|(lfst)|(lfsa)|(lft)|(lfs)/i) {
      $format = 'LabeledFasta'; last SW;
    }
    if ($s =~ /(embl)|(emb)|(em)|(eml)/i) {
      $format = 'EMBL'; last SW;
    }
    if ($s =~ /(genebank)|(genbank)|(genb)|(geneb)|(gbank)|(gb)/i) {
      $format = 'GenBank'; last SW;
    }
    if ($s =~ /(swissprot)|(sprt)|(swissp)|(sprot)|(sp)|(spr)/i) {
      $format = 'Swissprot'; last SW;
    }
    if ($s =~ /pir/i) {
      $format = 'PIR'; last SW;
    }
    if ($s =~ /gcg/i) {
      $format = 'GCG'; last SW;
    }
    if ($s =~ /scf/i) {
      $format = 'SCF'; last SW;
    }
    if ($s =~ /ace/i) {
      $format = 'Ace'; last SW;
    }
    if ($s =~ /phd/i) {
      $format = 'phd'; last SW;
    }
    if ($s =~ /phred/i) {
      $format = 'phred'; last SW;
    }
    if ($s =~ /raw/i) {
      $format = 'raw'; last SW;
    }
    $failed++;
  }
  return eval{$failed ? 0 : $format};
}

=head1 SYNOPSIS:

 pickseq.pl [OPTIONS] [id_file] <file_to_pick_from>

=head1 DESCRIPTION:

Picks out sequence entries from a sequence file based on id/accession.
It takes a newline separated list of ids/accessions to pick, either as
first argument or from STDIN. Genbank and EMBL entries is identified
by their accession. Fasta by id.

Indexing is only supported for Fasta files. Note that the whole
description line is not returned when using indexes. Only the first
non-white-space after the '>' is returned which is also that the index
uses as unique lookup id.

=head1 OPTIONS:

=over 4

=item --informat <format>

Specifies the input format.

=item --outformat <format>

Specifies the output format. Defaults to input format.

=item --swop

Swops the id and sequence streams. Handy when you need
to pipe the sequences to STDIN.

=item --list <'id id ...'>

Specify list of ids instead of a file. Will ignore
specified id file.

=item --not

Prints the entries NOT that are not specifies in the
id-file.

=item --index

Makes the script use indexing. Doesn't work with
standard input and only with Fasta files.

=item --reindex

Forces creation of new index even if one exists.

=item --justindex

Like --reindex but exits after index creation.

=item --help

Prints a help.

=back

=head1 EXAMPLES:

 cat file.tbl | pickseq.pl --informat EMBL file.embl > newfile.embl

 pickseq.pl --index file.tbl file.fa > newfile.fa

 cat file.embl | pickseq.pl --informat embl --list RGYZM0 > newfile.embl

 cat file.fa | pickseq.pl --swop --list 'RGYZM0 TSYZM03' > newfile.embl

=head1 AUTHOR:

Kasper Munch

=head1 COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.


=cut