#!/usr/bin/perl # Time-stamp: <2007-07-12 13:32:41 kasper> use Bio::Tools::Run::StandAloneBlast; use Bio::SeqIO; use Bio::SearchIO; use Getopt::Long; use SQLWriter; use DBI; use constant USAGE =>< DESCRIPTION: Blasts a file against a blast database, parses the blastreports and outputs GFF lines for query, hits and hsps. OPTIONS: --help Prints a help. --ungapped Makes ungapped BLAST. --program Specifies the program to run. blastn id default. --description Prints a description of the parsed BLAST report. --format Can be GenBank, EMBL, Fasta ... Defaults to Fasta --(no)hsps Whether or not to print GFF lines for each HSP. Default does. --(no)hits Whether or not to print GFF lines for each Hit. Defaults doesn't. --cutoff Sets the E-value cutoff. Default is 10e-50 --outfile Specifies output file. STDOUT is default. --self Makes the script blast the input file against itself. Hence, ignores the blastbase argument if given. --querys Print a GFF line for each query to a file. --sepfiles Puts the HSPs for each query into seperate files named 'seqid'. --sqldump Dumps in SQL to result, hit and hsp tables (that must be precreated). --reportdump Dumps all reports in file: . EXAMPLES: blast.pl query.fa database.fa > output.gff blast.pl --reportdump report.blr query.fa database.fa > output.gff blast.pl --self file.fa > output.gff DEPENDENCIES: The class: SQLWriter. Not on CPAN. But send me an email (kasper at binf.ku.dk) and I will be more than happy to send it. AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $description = 0; my $long = 0; my $ungapped = 0; my $program = 'blastn'; my $format = 'Fasta'; my $hsps = 1; my $hits = 0; my $querys = 0; my $outfile = ''; my $cutoffvalue = 10e-50; my $sqldump = ''; my $reportdump = ''; my $sepfiles = 0; GetOptions("help" => \$help, "program" => \$program, "ungapped" => \$ungapped, "description" => \$description, "long" => \$long, "format=s" => \$format, "hsps!" => \$hsps, "hits!" => \$hits, "cutoff=f" => \$cutoffvalue, "self" => \$self, "sqldump=s" => \$sqldump, "reportdump=s" => \$reportdump, "sepfiles" => \$sepfiles, "outfile|output=s" => \$outfile, "querys=s" => \$query) or die USAGE; $help and die USAGE; my $blastbase = pop @ARGV or die $usage unless $self; my $infile = @ARGV ? pop @ARGV : '' ; if ($infile) { $in = Bio::SeqIO->newFh(-file => "$infile" , '-format' => 'Fasta' ); } else { $in = Bio::SeqIO->newFh(-fh => \*STDIN , '-format' => 'Fasta' ); } if ($outfile) { open $out, ">$outfile" or die "Couldn't open $outfile: $1\n"; } else { open $out, ">&STDOUT" or die "Couldn't copy STDOUT filehandle\n"; } if ($querys) { open my $queryout, ">$querys"; } # If we blast against self, then make the infile the blastbase: if ($self) { $blastbase = $infile or die "Sorry, we don't buffer SDTDIN.\n"; } # Make a blastbase if there is not one allready: unless (-e "$blastbase.nhr" && -e "$blastbase.nin" && -e "$blastbase.nsq") { system("formatdb -p F -i $blastbase"); } # Blast-parameter initialization: my @params = ('database' => $blastbase, '_READMETHOD' => 'Blast', 'g' => eval{$ungapped ? 'F' : 'T'}, # gapped = False/True) 'e' => $cutoffvalue, 'program' => $program, # 'outfile' => 'blast_tmp', # 'f' => 0, 'W' => 10, 'y' => 5, ); # Create standaloneblast obj.: my $factory = Bio::Tools::Run::StandAloneBlast->new(@params); $factory->o('blast_tmp') if $reportdump; if ($sqldump) { my %attr = (PrintError => 0, RaiseError => 1, AutoCommit => 0); # Create the database $sqldump: system(qq{mysql -e 'CREATE database $sqldump'}); my $dsn = "DBI:mysql:$sqldump;mysql_read_default_file=$ENV{HOME}/.my.cnf"; my $dbh = DBI->connect($dsn); } # Blast each sequence against the database: while (my $seq = <$in>) { if ($sepfiles) { close $out; my $queryfilename = $seq->id(); open $out, ">$queryfilename"; } my $blastobj = $factory->blastall($seq); system("cat blast_tmp >> $reportdump") if $reportdump; if ($sqldump) { my $writer = SQLWriter->new(dbh => $dbh); $writer->make_tables(); $writer->dumpsql($blastobj); next; } # Get a ResultI obj: my $result = $blastobj->next_result(); # Print some report information: print $out "###################################################################################\n", "# Query name:\t\t\t", $result->query_name(), "\n", "# Description:\t\t\t", $result->query_description(), "\n", "# Database name:\t\t", $result->database_name(), "\n", "# Database size (letters):\t", $result->database_letters(), "\n", "# Database entries:\t\t", $result->database_entries(), "\n", "# Cutoff value \t\t\t$cutoffvalue\n", "# Gap parameter:\t\t", eval{$result->get_parameter('gapext') ? $result->get_parameter('gapext') : 'Ungaped blast!'}, "\n", "# Statistic (Kappa):\t\t", $result->get_statistic('kappa'), "\n", "###################################################################################\n" if $description; # Get a blast obj: # Get the next hit while ( my $hit = $result->next_hit ) { # PRINT HIT GFF UNLESS --nohits ..... next unless $hsps; while ( my $hsp = $hit->next_hsp ) { # Print the GFF line for the HSP: print $out $hit->accession(), "\t", $hsp->source_tag(), "\t", "similarity\t", $hsp->hit->start(), "\t", $hsp->hit->end(), "\t", $hsp->evalue(), "\t", eval{$hsp->hit->strand() == 1 ? '+' : '-'}, "\t", ".\t", "Target ", $result->query_name(), " ", $hsp->query->start(), " ", $hsp->query->end(); if ($long) { print $out " ; ", # This is admittedly not pretty but we have to extract the query # description from the seq obj. because the $result->query_description() # methosd returns null (probably a bug): "query_desc \"", $seq->desc(),"\" ; ", "hit_frac_ident ", $hsp->frac_identical('hit'), " ; ", "query_frac_ident ", $hsp->frac_identical('query'), " ; ", "total_frac_ident ", $hsp->frac_identical('total'), " ; ", "hit_frac_cons ", $hsp->frac_conserved('hit'), " ; ", "query_frac_cons ", $hsp->frac_conserved('query'), " ; ", "total_frac_cons ", $hsp->frac_conserved('total'), " ; ", "percent_ident ", $hsp->percent_identity, " ; ", "hit_gaps ", $hsp->gaps('hit'), " ; ", "query_gaps ", $hsp->gaps('query'), " ; ", "total_gaps ", $hsp->gaps('total'); } print $out "\n"; } } if ($querys) { # Print the GFF lines for the query seq: print $queryout $seq->id(), "\t", "reference\t", "Component\t", $seq->start(), "\t", $seq->end(), "\t.\t.\t.\t", "Sequence ", $seq->id()," ; ID ", $seq->id(),"\n"; } } if ($sqldump) { $dbh->disconnect() or warn $dbh->errstr(); } system('rm blast_tmp') if $reportdump; # Returns complementary strand sub plus2minus { my $string = shift; my @compl; my @string = split //, $string; while (my $base = pop @string) { SWITCH: { if ($base =~ /A/i) { push @compl, ('T'); last SWITCH; } if ($base =~ /T/i) { push @compl, ('A'); last SWITCH; } if ($base =~ /C/i) { push @compl, ('G'); last SWITCH; } if ($base =~ /G/i) { push @compl, ('C'); last SWITCH; } if ($base =~ /N/i) { push @compl, ('N'); last SWITCH; } print STDERR "Sequence letter not recognized\n"; } } return join "", @compl; } =head1 SYNOPSIS: blast.pl [OPTIONS] =head1 DESCRIPTION: Blasts a file against a blast database, parses the blastreports and outputs GFF lines for query, hits and hsps. =head1 OPTIONS: =over 4 =item --help Prints a help. =item --ungapped Makes ungapped BLAST. =item --program Specifies the program to run. blastn id default. =item --description Prints a description of the parsed BLAST report. =item --format Can be GenBank, EMBL, Fasta ... Defaults to Fasta =item --(no)hsps Whether or not to print GFF lines for each HSP. Default does. =item --(no)hits Whether or not to print GFF lines for each Hit. Defaults doesn't. =item --cutoff Sets the E-value cutoff. Default is 10e-50 =item --outfile Specifies output file. STDOUT is default. =item --self Makes the script blast the input file against itself. Hence, ignores the blastbase argument if given. =item --querys Print a GFF line for each query to a file. =item --sepfiles Puts the HSPs for each query into seperate files named 'seqid'. =item --sqldump Dumps in SQL to result, hit and hsp tables (that must be precreated). =item --reportdump Dumps all reports in file: . =back =head1 EXAMPLES: blast.pl query.fa database.fa > output.gff blast.pl --reportdump report.blr query.fa database.fa > output.gff blast.pl --self file.fa > output.gff =head1 DEPENDENCIES: The class: SQLWriter. Not on CPAN. But send me an email (kasper at binf.ku.dk) and I will be more than happy to send it. =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut