#!/usr/bin/perl

# Time-stamp: <2005-07-21 17:04:10 kasper>

use warnings;
use strict;
use LFasta;
use LFastaIO;
use Getopt::Long;
use constant USAGE =><<END;

SYNOPSIS:

 dumpprediction.pl [OPTIONS] [infile [outfile]]

DESCRIPTION:


OPTIONS:

       --help
             Prints this help.
       --prefix <char>
             Specifies the prefix to to with the added prediction
             track.
       --labels
             Dump annotation.
       --nbest
             Dump all prediction tracks with integers as labels and
             print GTF lines.
       --labelkey <string>
             Specifies which labels that go with which
             features. Default is: 'C:CDS E:exon I:intron x:default'
       --labelgroup <string>
             Specifies which labels that together form features.
             Eg: 'CEI:transcript'

EXAMPLES:

 cat input.lfa | dumpprediction.pl > ouput.gff

 dumpprediction.pl input.lfa ouput.gff

AUTHOR:

Kasper Munch

COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.

END

my $labelkey = 'C:CDS E:exon I:intron x:default';
my $labelgroups = '';
my $help = 0;
my $onlydefined = 0;
my $prefix = undef;
my $labels = 0;
my $nbest = 0;
my $onlygroups = 0;

GetOptions(
           "help" => \$help,
           "prefix=s" => \$prefix,
           "labels" => \$labels,
           "nbest" => \$nbest,
           "labelkey=s" => \$labelkey,
           "labelgroups=s" => \$labelgroups,
           "onlygroups" => \$onlygroups,
          ) or die USAGE;


$help and die USAGE;

$onlydefined || defined $prefix || $labels || $nbest or die USAGE;

# Input and output:
@ARGV = ('-') unless @ARGV;
my $input = shift @ARGV;
open my $in, "$input" or die "$input: $!\n";
@ARGV = ('>&STDOUT') unless @ARGV;
my $output = shift @ARGV;
open my $out, ">$output" or die "$output: $!\n";

$in = LFastaIO->new(fh => $in);

$labelkey = { split /[:\s]+/, $labelkey };
$labelgroups = { split /[:\s]+/, $labelgroups };

use Data::Dumper;

while (my $lfa = <$in>) {

  $lfa->labelkey($labelkey);
  $lfa->labelgroups($labelgroups) if $labelgroups;
  my $preds = $lfa->preds_hash();
  my $id = $lfa->id();

  if ($labels) {
    my $gfflines = labels2gff({prefix => 'labels', pred => $lfa->labels}, $lfa->id, $labelkey);
    print $out $_ for (@$gfflines);
  } elsif ($nbest) {
    for my $pred (@$preds) {
      if ($$pred{prefix} =~ /\d+/) {
        my $gfflines = labels2gff($pred, $lfa->id, $labelkey);
        print $out $_ for (@$gfflines);
      }
    }
  } else {
    for my $pred (@$preds) {
      if ($$pred{prefix} eq $prefix) {
        my $gfflines = labels2gff($pred, $lfa->id, $labelkey);
        my $idx = 0;
        for my $l (@$gfflines) {
          $idx++;
          $l =~ s/status=prediction/gene_id "$id"; transcript_id "$id.$idx";/;
          print $out $l;
        }
      }
    }
  }
}

sub labels2gff {
  my $p = shift;
  my $id = shift;
  my $labelkey = shift;

  my @gfflines = ();

  # Make label regular expression
  my $label_regexp .= join("+|",keys(%{$labelkey}));
  $label_regexp .= "+";

  my $i = 1;

  my $attribute = '';
  if ($$p{prefix} eq 'labels') {
    $attribute = 'status=known';
  } elsif ($$p{prefix} =~ /\d+/) {
    $attribute = 'status=prediction';
  }
  # Look for labeled regions
  unless ($onlygroups) {
    while ( $$p{pred} =~ /$label_regexp/g ) {
      my $b = length($`);
      my $e = $b + length($&);
      my $key = substr($&,0,1);
      my $method = $0;
      $method =~ s/.*?([^\/]+)$/$1/;
      my $fase = $key =~ /\d/ ? $key : '.';
      my @gff = ($id,$$p{prefix},$$labelkey{$key},$b+1,$e,".","+","$fase","$attribute");
      my $gffstr = join("\t", @gff) . "\n";
      push @gfflines, $gffstr;
    }
  }
  while (my ($labels, $groupname) = each %{$labelgroups}) {
    my $regex = "[$labels]+";
    while ( $$p{pred} =~ /$regex/g ) {
      my $b = length($`);
      my $e = $b + length($&);
      my $key = substr($&,0,1);
      $key = '[CEIx012]+';
      my @gff = ($id,$$p{prefix},$groupname,$b+1,$e,".","+",".","$attribute");
      my $gffstr = join("\t", @gff) . "\n";
      push @gfflines, $gffstr;
    }
  }

  return \@gfflines;

}

=head1 SYNOPSIS:

 dumpprediction.pl [OPTIONS] [infile [outfile]]

=head1 DESCRIPTION:


=head1 OPTIONS:

=over 4

=item --help

Prints this help.

=item --prefix <char>

Specifies the prefix to to with the added prediction
track.

=item --labels

Dump annotation.

=item --nbest

Dump all prediction tracks with integers as labels and
print GTF lines.

=item --labelkey <string>

Specifies which labels that go with which
features. Default is: 'C:CDS E:exon I:intron x:default'

=item --labelgroup <string>

Specifies which labels that together form features.
Eg: 'CEI:transcript'

=back

=head1 EXAMPLES:

 cat input.lfa | dumpprediction.pl > ouput.gff

 dumpprediction.pl input.lfa ouput.gff

=head1 AUTHOR:

Kasper Munch

=head1 COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.


=cut