#!/usr/bin/perl use warnings; use strict; use Bio::SeqIO; use Getopt::Long; use constant USAGE =>< DESCRIPTION: Reports the differences between two sequence files, and prints either the sequences unique to file1 or the sequences that the files have in common. OPTIONS: --help Prints a help. --informat Format of the sequence files. Fasta is default. --outformat Format of the sequence files. Fasta is default. --common Prints the sequences that the files have in common. --onlyfile2 Prints the sequences that are unique to file2 instead. --onlyid Prints the id of each sequence only. EXAMPLES: # Print the ids that the two fasta files have in common: seqfilediff.pl --common --onlyid file1.fa file2.fa # Print the entries unique to each of two EMBL files: cat file2.embl | seqfilediff.pl --informat EMBL file1.embl AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $informat = 'Fasta'; my $outformat = ''; my $common = 0; my $onlyfile2 = 0; my $onlyid = 0; GetOptions("help" => \$help, "informat|format=s" => \$informat, "outformat=s" => \$outformat, "common" => \$common, "onlyfile2" => \$onlyfile2, "onlyid" => \$onlyid) or die USAGE; $common && $onlyfile2 and die "Incompatible options: --common and --onlyfile2\n"; my $file2 = pop @ARGV or die USAGE;; my $file1 = pop @ARGV if @ARGV; $outformat ||= $informat; # Make file/steam handles: my $fh1; my $fh2; if ($file2) { $fh2 = Bio::SeqIO->newFh(-file => "$file2", -format => $informat); } if ($file1) { $fh1 = Bio::SeqIO->newFh(-file => "$file1", -format => $informat); } else { $fh1 = Bio::SeqIO->newFh(-fh => \*STDIN, -format => $informat); } # Create an output file handle: my $out; $out = Bio::SeqIO->newFh(-fh => \*STDOUT , '-format' => $outformat ); # Hash the sequence entries: my %file1; while (<$fh1>) { $file1{$_->id()} = $_; } my %file2; while (<$fh2>) { $file2{$_->id()} = $_; } # Do the proper printing: if ($common) { for (keys %file1) { if ($onlyid) { print "$_\n" if defined $file2{$_}; } else { print $out $file1{$_} if defined $file2{$_}; } } } elsif ($onlyfile2) { for (keys %file2) { if ($onlyid) { print "$_\n" unless defined $file1{$_}; } else { print $out $file2{$_} unless defined $file1{$_}; } } } else { for (keys %file1) { if ($onlyid) { print "$_\n" unless defined $file2{$_}; } else { print $out $file1{$_} unless defined $file2{$_}; } } } =head1 SYNOPSIS: seqfilediff.pl [OPTIONS] [file1] =head1 DESCRIPTION: Reports the differences between two sequence files, and prints either the sequences unique to file1 or the sequences that the files have in common. =head1 OPTIONS: =over 4 =item --help Prints a help. =item --format Format of the sequence files. Fasta is default. =item --common Prints the sequences that the files have in common. =item --onlyfile2 Prints the sequences that are unique to file2 instead. =item --onlyid Prints the id of each sequence only. =back =head1 EXAMPLES: # Print the ids that the two fasta files have in common: seqfilediff.pl --common --onlyid file1.fa file2.fa # Print the entries unique to each of two EMBL files: cat file2.embl | seqfilediff.pl --informat EMBL file1.embl =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut