#!/usr/bin/perl use warnings; use strict; use Getopt::Long; use constant USAGE =>< Breaks into files with at most entries. --files Breaks into seqfiles. --random Same as --files but randomises the order of the entries before breaking. --size Breaks into files of size at most Mb. --like Similar to the files option, but breaks into files so that entries are distributed in the same same way as for with the reamaining entries distributed as appropriate. This is usefull when running cross-validations where we train on one set of files and test on another. --format Specifies sequence format. Fasta is default. EXAMPLES: # Break into ten files: seqfilebreak.pl --files 10 # Break into files of 4 Mb: seqfilebreak.pl --size 4 # Break into files each containint 30 seqs: seqfilebreak.pl --seqs 30 AUTHOR: Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $files = 0; my $seqs = 0; my $size = 0; my $format = 'Fasta'; my $like = ''; my $random = 0; GetOptions("help" => \$help, "files=i" => \$files, "random=i" => \$random, "format=s" => \$format, "like=s" => \$like, "seqs=i" => \$seqs, "size=f" => \$size) or die USAGE; $help && die USAGE; my $infile = shift @ARGV or die USAGE; my $outdir = shift @ARGV if @ARGV; $outdir ||= "$infile.subfiles"; $files || $seqs || $size || $like || $random or die "You must specify either --files , --random , --seqs , --size , or --like \n"; $like and $format !~ /fasta/i and die "Format has to be Fasta when using --like\n"; if (-d $outdir) { system "rm -rf $outdir"; die "Old outdir still there" if -d $outdir; } system "mkdir -p $outdir"; my $in; open $in, "$infile" or die "$infile: $!\n"; my $sep; my $bracket;; my $end; if ($format =~ /EMBL|Genbank/i) { $/ = "//\n"; $bracket = ''; $end = "//\n"; } else { $/ = ">"; $bracket = '>'; $end = ''; } if ($files || $like || $random) { my $name = $infile; $name =~ s/^.*\/([^\/]+)$/$1/; my $f; my $q; my $n = 0; open $q, "$infile" or die "$infile: $!\n"; system 'sync'; my %fh; if ($like) { # Get the ids from the break specified by --like -e "$like" or die "$like does not exist. Break that first.\n"; for my $file (glob("$like/*[0-9]")) { $files++; open my $in, "$file" or die "Could not open $file\n"; open my $fh, ">$outdir/$name.$n" or die "Could not open $file\n"; while (my $str = <$in>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$q>; $str = $l; chomp $str; } $str =~ /\S+/ or die; $fh{$&} = $fh; } $n++; close $in; } } elsif ($random) { # Get the ids open my $in, "$infile" or die "Could not open $infile\n"; my @fh; for my $n (0..$random-1) { open my $fh, ">$outdir/$name.$n" or die "Could not open $infile\n"; push @fh, $fh; } my @ids; while (my $str = <$in>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$q>; $str = $l; chomp $str; } $str =~ /\S+/ or die; push @ids, $&; } while (@ids) { for my $n (0..$random-1) { if (@ids) { my $offset = int(rand(scalar(@ids))); my $id = $ids[$offset]; splice(@ids, $offset, 1); $fh{$id} = $fh[$n]; } } } close $in; } if ($like || $random) { # Start by filling in the entries we know where have to be and put # the rest in a tmpfile: open my $tmp, ">$outdir/tmpfile" or die; while (my $str = <$q>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$q>; $str = $l; chomp $str; } $str =~ /\S+/ or die; if (defined $fh{$&}) { my $fh = $fh{$&}; my $ofh = select $fh; $| = 1; select $ofh; print $fh "$bracket$str$end"; } else { print $tmp "$bracket$str$end"; } } $n = 0; # Reopen the $q file handle to read from the tmpfile close $q; open $q, "$outdir/tmpfile" or die "$outdir/tmpfile: $!\n"; # If we are doing random we are done filling into files: if ($random) { unlink "$outdir/tmpfile"; exit 1; } } # Keep track of how much the total size of the chunks up to now is off # with respect to the ideal distribution and use that to adjust how # large the remaining (the next) needs to be: my $filesize = (stat($q))[7]; # The size of the mother file. my $chunksize = $filesize/$files; # The size of each subfile. my $chunksum = 0; # The sum of the subfiles written to far. my $size = 0; while (my $str = <$q>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$q>; $str = $l; chomp $str; } $size += length($str); # Create a file for the first chunk of sequences: if ($n == 0) { if ($like) { open $f, ">>$outdir/$name.$n" or die "Could not open $outdir/$name.$n\n"; } else { open $f, ">$outdir/$name.$n" or die "Could not open $outdir/$name.$n\n"; } $n++; } # If the chunk is not empty or not the last one, make a new chunkfile # if this one is already to large or if adding the next sequence # would make it more off than just leaving it as it is: elsif ($n != $files && ($size > $chunksize || length($str) > 2 * ($chunksize - $size)) ) { # Add the size to the last chunk to the sum of chunks, and # adjust the chunksize accordingly: $chunksum += $size; $chunksize = ($filesize - $chunksum) / ($files - $n); close $f; if ($like) { open $f, ">>$outdir/$name.$n" or die "Could not open $outdir/$name.$n\n"; } else { open $f, ">$outdir/$name.$n" or die "Could not open $outdir/$name.$n\n"; } $n++; $size = 0; } print $f "$bracket$str$end"; } $/ = "\n"; unlink "$outdir/tmpfile"; } if ($seqs) { my $name = $infile; $name =~ s/^.*\/([^\/]+)$/$1/; my $fh; my $n = 0; my $i = 0; open $fh, ">$outdir/$name.$n" or die $!; while (my $str = <$in>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$in>; $str = $l; chomp $str; } if ($i == $seqs) { close $fh; $n++; open $fh, ">$outdir/$name.$n"; $i = 0; } print $fh "$bracket$str$end"; $i++; } } if ($size) { my $name = $infile; $name =~ s/^.*\/([^\/]+)$/$1/; my $fh; my $n = 0; my $i = 0; open $fh, ">$outdir/$name.$n"; while (my $str = <$in>) { chomp $str; next unless $str; if ($format =~ /Fasta/i && $str !~ /\n$/) { # Seems we only got part of an entry, so we probably encountered a # '>' in the description. So let's add another line: my $l = $str; $l = "$l$bracket" . <$in>; $str = $l; chomp $str; } print $fh "$bracket$str$end"; system 'sync'; if ((stat($fh))[7] > $size * 1048576) { close $fh; $n++; open $fh, ">$outdir/$name.$n"; $i = 0; } $i++; } } =head1 SYNOPSIS: seqfilebreak.pl [OPTIONS] [Fasta file [output dir]] =head1 DESCRIPTION: Breaks a fasta file into smaller files. It evaluates each small file after a new seq entry has been written to it. That means that the specified limit at most may be exceded by the size of the next sequence. =head1 OPTIONS: =over 4 =item --help Prints a help. =item --seqs Breaks into files with at most entries. =item --files Breaks into seqfiles. =item --random Same as --files but randomises the order of the entries before breaking. =item --size Breaks into files of size at most Mb. =item --like Similar to the files option, but breaks into files so that entries are distributed in the same same way as for with the reamaining entries distributed as appropriate. This is usefull when running cross-validations where we train on one set of files and test on another. =item --format Specifies sequence format. Fasta is default. =back =head1 EXAMPLES: # Break into ten files: seqfilebreak.pl --files 10 # Break into files of 4 Mb: seqfilebreak.pl --size 4 # Break into files each containint 30 seqs: seqfilebreak.pl --seqs 30 =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut