#!/usr/bin/perl -w use Getopt::Long; use constant USAGE =>< newtable.tbl AUTHOR: Thomas Schou Larsen COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $type = 2; # Type of reduction used: # 1: Hobohm1 # 2: Hobohm2 my $cutoff = 1.e-3; # Cut-off on the significance score my $reverse = 0; # Use reverse scoring (the higher the better, eg bits) GetOptions( "help" => \$help, "type=i" => \$type, "cutoff=f" => \$cutoff, "reverse!" => \$reverse, ) or die USAGE; $help and die USAGE; @empty = (); while ( <> ) { @x = split(/\s+/); # Fix a problem with numbers in Blast output $x[2] = "1".$x[2] if ($x[2]=~/^e/); # If not seen before, initialize list if (!defined($matches{$x[0]})) { $matches{$x[0]} = [ @empty ]; } # Push on the list if required if ($reverse) { if ( $x[2]>=$cutoff && $x[0] ne $x[1] ) { push(@{$matches{$x[0]}},$x[1]); } } else { if ( $x[2]<=$cutoff && $x[0] ne $x[1] ) { push(@{$matches{$x[0]}},$x[1]); } } } # Symmetrize matrix, so if x matches y, y match x: for $x (keys(%matches)) { for $y (@{$matches{$x}}) { push @{$matches{$y}},$x if (!grep(/^$x$/,@{$matches{$y}})); } } if ($type==1) { &hobohm1(); } elsif ($type==2) { &hobohm2(); } else { die "Type has to be either 1 or 2"; } sub hobohm1 { # Sort list according to the number of matches (fewest first) @ids = sort { $#{$matches{$a}} <=> $#{$matches{$b}} } keys(%matches); # Initialize keep list for $x (@ids) { $keep{$x}=1; } for $x (@ids) { # All neighbors are thrown out: if ($keep{$x}) { for $y ( @{$matches{$x}} ) { $keep{$y}=0; } } } for $x (@ids) { print "$x\n" if ($keep{$x}); } } sub hobohm2 { # Get all the ids: @ids = keys(%matches); # Go through the list until all are done @keep = (); while ( $#ids>=0 ) { # Sort list according to the number of matches (lesser first) @ids = sort { $#{$matches{$a}} <=> $#{$matches{$b}} } @ids; # Put all the ones that doesn't have a match on to the list of the # ones we want to keep: while ( $#ids>=0 && $#{$matches{$ids[0]}}<0 ) { push(@keep, shift @ids); } if ($#ids>=0) { # Remove the ones with most edges: $last = pop @ids; # Remove this from all the rest: for $x ( @{$matches{$last}} ) { $matches{$x} = [ grep(!/^$last$/,@{$matches{$x}}) ]; } } } # Print the final list: for $x (@keep) { print "$x\n"; } } =head1 SYNOPSIS: homologyreduce.pl [OPTIONS] file.fa =head1 DESCRIPTION: Reads a file with three columns: id1 id2 score from STDIN. Self scores are assumed to be included. Apply hobohm 1 or 2 and output list of reduced IDs. Ids MUST be unique!! Hobohm 1: Original alg is more efficient, because you don't need all pairwise comparisons in advance. This is better however, because it keeps the ones with fewest neighbors first. Sort on the number of neighbors. 1. Start with the one with lowest number of neighbors. 2. If seq not removed, remove all neighbors from the list 3. Next sequence, repeat from 2 until done. Hobohm 2: Two lists: keep and potential. Initially all on potential. 1. Sort on the number of neighbors on potential list. 2. Move all with no neighbors from potential to keep list 3. Remove the one with highest number of neighbors from potential list. Remove its name from all neighbor lists. 5. Stop if potential list emty, othberwise go to 1. =head1 OPTIONS: =over 4 =item --help Prints this help. =item --type Type of reduction used: 1: Hobohm1, 2: Hobohm2. =item --cutoff Cut-off on the significance score. =item --reverse Use reverse scoring (the higher the better, eg bits). =back =head1 EXAMPLES: cat table.tbl | homologyreduce.pl > newtable.tbl =head1 AUTHOR: Thomas Schou Larsen =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut