#!/usr/bin/perl

use warnings;
use strict;

use Bio::SeqIO;
use Getopt::Long;

# Help text printed for --help and whenever the command line is invalid.
use constant USAGE => <<'END';

SYNOPSIS:

 seqfilehead.pl [OPTIONS] <nr> [seqfile]

DESCRIPTION:

 Prints the first <nr> entries of a sequence file. With --skip the
 first <nr> entries are dropped and the remaining entries are printed
 instead. Reads from STDIN when no seqfile is given.

OPTIONS:

 --informat <format>
     Specifies the input format. Default is Fasta.

 --outformat <format>
     Specifies the output format. Default is informat.

 --skip
     Skips first entries instead of printing the first.

 --help
     Prints a help.

EXAMPLES:

 # Get the first 5 entries in a Fasta file
 cat file.fa | seqfilehead.pl 5 > newfile.fa

 # Get the first 5 entries in an EMBL file
 seqfilehead.pl --informat EMBL 5 file.embl > newfile.embl

AUTHOR:

 Kasper Munch

COPYRIGHT:

 This program is free software. You may copy and redistribute it under
 the same terms as Perl itself.

END

my $help      = 0;
my $informat  = 'Fasta';
my $outformat = '';
my $skip      = 0;

GetOptions(
    "help"        => \$help,
    "skip"        => \$skip,
    "informat=s"  => \$informat,
    "outformat=s" => \$outformat,
) or die USAGE;

# --help is a successful request for the usage text, not an error.
if ($help) {
    print USAGE;
    exit 0;
}

$outformat ||= $informat;

@ARGV or die USAGE;

# Number of leading entries to print (or to drop when --skip is given).
my $nr = shift @ARGV;
defined $nr && $nr =~ /\A\d+\z/
    or die "Specify the number of seqs you want.\n";

# Optional input file; shift yields undef when no argument is left, so no
# conditional declaration is needed.
my $seqfile   = shift @ARGV;
my $have_file = defined $seqfile && length $seqfile;

# Fasta-to-Fasta is handled by copying raw records, which needs no parsing
# and keeps the entries byte-for-byte. Any other combination (including
# Fasta input with a different output format) goes through Bio::SeqIO so
# that --outformat is honoured.
my $raw_fasta = $informat eq 'Fasta' && lc $outformat eq 'fasta';

# Count of entries seen so far.
my $seen = 0;

if ($raw_fasta) {

    my $fh;
    if ($have_file) {
        open $fh, '<', $seqfile
            or die "Cannot open $seqfile: $!\n";
    }
    else {
        $fh = \*STDIN;
    }

    # Read one record per '>' so that each read yields one Fasta entry
    # (without its leading '>').
    local $/ = ">";

    ENTRY:
    while (defined(my $entry = <$fh>)) {
        chomp $entry;

        # The text before the first '>' is empty; nothing to count.
        next ENTRY unless length $entry;

        # A complete entry ends with a newline. If it does not, the record
        # was cut at a '>' inside the description line, so keep gluing the
        # following chunks back on until the entry is complete or the input
        # is exhausted (last entry of a file without trailing newline).
        while ($entry !~ /\n\z/ and not eof $fh) {
            my $rest = <$fh>;
            last unless defined $rest;
            chomp $rest;
            $entry .= ">$rest";
        }

        my $in_head = $seen++ < $nr;

        if ($skip) {
            print ">$entry" unless $in_head;
        }
        elsif ($in_head) {
            print ">$entry";
        }
        else {
            # All requested entries are printed; stop reading.
            last ENTRY;
        }
    }

    close $fh if $have_file;
}
else {

    my $in = $have_file
        ? Bio::SeqIO->newFh(-file => $seqfile,  '-format' => $informat)
        : Bio::SeqIO->newFh(-fh   => \*STDIN,   '-format' => $informat);
    $in or die "$!\n";

    my $out = Bio::SeqIO->newFh(-fh => \*STDOUT, '-format' => $outformat)
        or die "$!\n";

    SEQ:
    while (my $seq = <$in>) {
        my $in_head = $seen++ < $nr;

        if ($skip) {
            print $out $seq unless $in_head;
        }
        elsif ($in_head) {
            print $out $seq;
        }
        else {
            # Finishing early is the normal outcome, so leave the loop and
            # let the script end with exit status 0.
            last SEQ;
        }
    }
}

exit 0;

=head1 NAME

seqfilehead.pl - print or skip the first entries of a sequence file

=head1 SYNOPSIS:

 seqfilehead.pl [OPTIONS] <nr> [seqfile]

=head1 DESCRIPTION:

Prints the first <nr> entries of a sequence file. With --skip the first
<nr> entries are dropped and the remaining entries are printed instead.
Reads from STDIN when no seqfile is given.

=head1 OPTIONS:

=over 4

=item --informat <format>

Specifies the input format. Default is Fasta.

=item --outformat <format>

Specifies the output format. Default is informat.

=item --skip

Skips first entries instead of printing the first.

=item --help

Prints a help.

=back

=head1 EXAMPLES:

 # Get the first 5 entries in a Fasta file
 cat file.fa | seqfilehead.pl 5 > newfile.fa

 # Get the first 5 entries in an EMBL file
 seqfilehead.pl --informat EMBL 5 file.embl > newfile.embl

=head1 AUTHOR:

Kasper Munch

=head1 COPYRIGHT:

This program is free software. You may copy and redistribute it under
the same terms as Perl itself.

=cut