#! /usr/bin/perl # Time-stamp: <2004-05-17 22:44:42 kasper> use warnings; use strict; use Getopt::Long; use Bio::SeqIO; use DBI; use constant USAGE =>< ... DESCRIPTION: This script creates a sql table from one or more sequence files in a pre-created SQL database. OPTIONS: --help Prints this help. --format Specifies format of input sequences. --quiet Makes the script run without progress messages. --overwrite Forces overwrite of old database of same name. EXAMPLES: seq2sql.pl --prefix myseq_ --overwrite mydatabase file1.fa file2.gb DEPENDENCIES: A MySQL database. AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $prefix = ''; my $overwrite = 0; my $quiet = 0; my $adding = 0; my $format = ''; GetOptions( "help" => \$help, "quiet" => \$quiet, "overwrite" => \$overwrite, "format=s" => \$format, ) or die USAGE; $help and die USAGE; my $tablename = shift @ARGV or die USAGE; my $databasename = shift @ARGV or die USAGE; my @files = @ARGV; my $files = scalar @files; my %attr = (PrintError => 0, RaiseError => 1, AutoCommit => 0); my $dsn = "DBI:mysql:$databasename;mysql_read_default_file=$ENV{HOME}/.my.cnf"; my $dbh = DBI->connect($dsn); # If the tables are already there don't try to make them unless # overwrite. if (exist("$tablename")) { if ($overwrite) { # Remove table: if ($overwrite) { $dbh->do("DROP TABLE $tablename"); } # Create table: $dbh->do("CREATE TABLE $tablename ( id VARCHAR(25) NOT NULL, header TEXT, seq TEXT NOT NULL, PRIMARY KEY (id) )" ); } else { $adding = 1 } } else { # Create table: $dbh->do("CREATE TABLE $tablename ( id VARCHAR(25) NOT NULL, header TEXT, seq TEXT NOT NULL, PRIMARY KEY (id) )" ); } print STDERR "\nDumping sequence entries to table $tablename in database $databasename" if !$quiet and $files == 0; print STDERR "\nDumping sequence file to table $tablename in database $databasename" if !$quiet and $files == 1; print STDERR "\nDumping $files files to table $tablename in database $databasename" if !$quiet and $files > 1; print STDERR ", adding to existing existing table" if !$quiet && $adding; print STDERR ", overwriting table if existing" if !$quiet && $overwrite; print STDERR ".\n\n"; my $str = "INSERT INTO $tablename ( id, header, seq ) VALUES (" . "\?, " x 2 . "\?)"; my $sth = $dbh->prepare($str); if (@files) { foreach my $file (@files) { if ($format) { $format = guessformat($format); } else { $file =~ /\.(.*)$/ or die "Could not quess format. Use --format or a sensible suffix for the files.\n"; $format = guessformat($1) or die "Could not quess format. Use --format or a sensible suffix for the files.\n"; } my $in; eval{$in = Bio::SeqIO->newFh(-file => "$file" , -format => $format )} or die "$file: $!"; while (my $seq = <$in>) { my @data = ($seq->id(), $seq->desc(), $seq->seq()); $sth->execute(@data); } } } else { $format or die "You have to specify format of standard input.\n"; $format = guessformat($format) or die "You have to specify format of sequences from standard input.\n"; my $in; eval{$in = Bio::SeqIO->newFh(-fh => \*STDIN , -format => $format )} or die $!; while (my $seq = <$in>) { my @data = ($seq->id(), $seq->desc(), $seq->seq()); $sth->execute(@data); } } $dbh->disconnect() or warn $dbh->errstr(); sub exist { my $table = shift; my $exist = 0; my $check = $dbh->prepare(qq{show tables}); $check->execute(); my $tables = $check->fetchall_arrayref(); for my $t (@$tables) { $exist = 1 if $$t[0] eq "$table"; } return $exist; } sub guessformat { my $s = shift @_; my $failed = 0; my $format; SW: { if ($s =~ /(^fasta)|(^fast)|(^fst)|(^fsa)|(^ft)|(^fs)|(^fa)/i) { $format = 'Fasta'; last SW; } if ($s =~ /(lfasta)|(fla)|(lfast)|(lfst)|(lfsa)|(lft)|(lfs)/i) { $format = 'LabeledFasta'; last SW; } if ($s =~ /(embl)|(emb)|(em)|(eml)/i) { $format = 'EMBL'; last SW; } if ($s =~ /(genebank)|(genbank)|(genb)|(geneb)|(gbank)|(gb)/i) { $format = 'GenBank'; last SW; } if ($s =~ /(swissprot)|(sprt)|(swissp)|(sprot)|(sp)|(spr)/i) { $format = 'Swissprot'; last SW; } if ($s =~ /pir/i) { $format = 'PIR'; last SW; } if ($s =~ /gcg/i) { $format = 'GCG'; last SW; } if ($s =~ /scf/i) { $format = 'SCF'; last SW; } if ($s =~ /ace/i) { $format = 'Ace'; last SW; } if ($s =~ /phd/i) { $format = 'phd'; last SW; } if ($s =~ /phred/i) { $format = 'phred'; last SW; } if ($s =~ /raw/i) { $format = 'raw'; last SW; } $failed++; } return eval{$failed ? 0 : $format}; } =head1 SYNOPSIS: seq2sql.pl [OPTIONS] ... =head1 DESCRIPTION: This script creates a sql table from one or more sequence files in a pre-created SQL database. =head1 OPTIONS: =over 4 =item --help Prints this help. =item --format Specifies format of input sequences. =item --quiet Makes the script run without progress messages. =item --overwrite Forces overwrite of old database of same name. =back =head1 EXAMPLES: seq2sql.pl --prefix myseq_ --overwrite mydatabase file1.fa file2.gb =head1 DEPENDENCIES: A MySQL database. =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut