#!/usr/bin/perl -w

use Getopt::Long;
use constant USAGE =><<END;

SYNOPSIS:

 homologyreduce.pl [OPTIONS] file.fa

DESCRIPTION:

Reads a file with three columns: id1 id2 score from STDIN. Self scores are
assumed to be included. Apply hobohm 1 or 2 and output list of reduced
IDs. Ids MUST be unique!!

Hobohm 1:
Original alg is more efficient, because you don't need all
pairwise comparisons in advance. This is better however, because
it keeps the ones with fewest neighbors first.
Sort on the number of neighbors.
1. Start with the one with lowest number of neighbors.
2. If seq not removed, remove all neighbors from the list
3. Next sequence, repeat from 2 until done.

Hobohm 2:
Two lists: keep and potential. Initially all on potential.
1. Sort on the number of neighbors on potential list.
2. Move all with no neighbors from potential to keep list
3. Remove the one with highest number of neighbors from
   potential list. Remove its name from all neighbor lists.
5. Stop if potential list emty, othberwise go to 1.

OPTIONS:

       --help
             Prints this help.
       --type
             Type of reduction used: 1: Hobohm1, 2: Hobohm2.
       --cutoff
             Cut-off on the significance score.
       --reverse
             Use reverse scoring (the higher the better, eg bits).

EXAMPLES:

 cat table.tbl | homologyreduce.pl > newtable.tbl

AUTHOR:

Thomas Schou Larsen

COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.

END

my $help = 0;
my $type = 2;                    # Type of reduction used:
                                # 1: Hobohm1
                                # 2: Hobohm2
my $cutoff = 1.e-3;                 # Cut-off on the significance score
my $reverse = 0;     # Use reverse scoring (the higher the better, eg bits)

GetOptions(
           "help" => \$help,
           "type=i" => \$type,
           "cutoff=f" => \$cutoff,
           "reverse!" => \$reverse,
          ) or die USAGE;

$help and die USAGE;

@empty = ();

while ( <> ) {
  @x = split(/\s+/);
  # Fix a problem with numbers in Blast output
  $x[2] = "1".$x[2] if ($x[2]=~/^e/);
  # If not seen before, initialize list
  if (!defined($matches{$x[0]})) {
    $matches{$x[0]} = [ @empty ];
  }
  # Push on the list if required
  if ($reverse) {
    if ( $x[2]>=$cutoff && $x[0] ne $x[1] ) {
      push(@{$matches{$x[0]}},$x[1]);
    }
  } else {
    if ( $x[2]<=$cutoff && $x[0] ne $x[1] ) {
      push(@{$matches{$x[0]}},$x[1]);
    }
  }
}

# Symmetrize matrix, so if x matches y, y match x:
for $x (keys(%matches)) {
  for $y (@{$matches{$x}}) {
    push @{$matches{$y}},$x if (!grep(/^$x$/,@{$matches{$y}}));
  }
}

if ($type==1) {
  &hobohm1();
} elsif ($type==2) {
  &hobohm2();
} else {
  die "Type has to be either 1 or 2";
}

sub hobohm1 {
  # Sort list according to the number of matches (fewest first)
  @ids = sort { $#{$matches{$a}} <=> $#{$matches{$b}} } keys(%matches);

  # Initialize keep list
  for $x (@ids) {
    $keep{$x}=1;
  }

  for $x (@ids) {
    # All neighbors are thrown out:
    if ($keep{$x}) {
      for $y ( @{$matches{$x}} ) {
        $keep{$y}=0;
      }
    }
  }

  for $x (@ids) {
    print "$x\n" if ($keep{$x});
  }
}

sub hobohm2 {

  # Get all the ids:
  @ids = keys(%matches);

  # Go through the list until all are done
  @keep = ();
  while ( $#ids>=0 ) {
    # Sort list according to the number of matches (lesser first)
    @ids = sort { $#{$matches{$a}} <=> $#{$matches{$b}} } @ids;

    # Put all the ones that doesn't have a match on to the list of the
    # ones we want to keep:
    while ( $#ids>=0 && $#{$matches{$ids[0]}}<0 ) {
      push(@keep, shift @ids);
    }

    if ($#ids>=0) {
      # Remove the ones with most edges:
      $last = pop @ids;
      # Remove this from all the rest:
      for $x ( @{$matches{$last}} ) {
        $matches{$x} = [ grep(!/^$last$/,@{$matches{$x}}) ];
      }
    }
  }

  # Print the final list:
  for $x (@keep) {
    print "$x\n";
  }
}

=head1 SYNOPSIS:

 homologyreduce.pl [OPTIONS] file.fa

=head1 DESCRIPTION:

Reads a file with three columns: id1 id2 score from STDIN. Self scores are
assumed to be included. Apply hobohm 1 or 2 and output list of reduced
IDs. Ids MUST be unique!!

Hobohm 1:
Original alg is more efficient, because you don't need all
pairwise comparisons in advance. This is better however, because
it keeps the ones with fewest neighbors first.
Sort on the number of neighbors.
1. Start with the one with lowest number of neighbors.
2. If seq not removed, remove all neighbors from the list
3. Next sequence, repeat from 2 until done.

Hobohm 2:
Two lists: keep and potential. Initially all on potential.
1. Sort on the number of neighbors on potential list.
2. Move all with no neighbors from potential to keep list
3. Remove the one with highest number of neighbors from
   potential list. Remove its name from all neighbor lists.
5. Stop if potential list emty, othberwise go to 1.

=head1 OPTIONS:

=over 4

=item --help

Prints this help.

=item --type

Type of reduction used: 1: Hobohm1, 2: Hobohm2.

=item --cutoff

Cut-off on the significance score.

=item --reverse

Use reverse scoring (the higher the better, eg bits).

=back

=head1 EXAMPLES:

 cat table.tbl | homologyreduce.pl > newtable.tbl

=head1 AUTHOR:

Thomas Schou Larsen

=head1 COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.


=cut