#!/usr/local/bin/perl use warnings; use strict; use Getopt::Long; use Bio::SeqIO; use SeqFunctions; use constant USAGE =>< Specifies a gff file to read from. --feature ',...' Specifies a list of features to mask. EXAMPLES: cat file.fa | mask.pl --gff file.gff > file.masked.fa cat file.gb | mask.pl --format gb --gff file.gff > file.masked.gb mask.pl --feature 'repeat_region,mRNA' file.embl > file.masked.embl AUTHOR: Kasper Munch DEPENDENCIES The module SeqFuntions. Not on CPAN but send me and email (kasper at binf.ku.dk) and I will be more than happy to send it. COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $format = ''; my $gff = ''; my $feature = ''; my $quiet = 0; GetOptions("help" => \$help, "gff=s" => \$gff, "feature=s" => \$feature, "format=s" => \$format, "quiet" => \$quiet) or die USAGE; $help && die USAGE; my %gff = (); my $fh; if ($gff) { open $fh, "$gff" or die "$gff: $!\n";; # Get all the gff lines into a hash of arrays: while (my $gff = <$fh>) { my $seqname = (split /\t/, $gff)[0]; push @{$gff{$seqname}}, $gff; } } my $regex; if ($feature) { $regex = join("|", split(/,/, $feature)); } @ARGV = ('-') unless @ARGV; my $input = shift @ARGV; $format = SeqFunctions::guessformat($format ? $format : $input) or die "Could not guess format.\n"; my $output = @ARGV ? shift @ARGV : "&STDOUT"; open my $infh, "$input" or die $!; my $in = Bio::SeqIO->newFh(-fh => $infh , -format => $format ); open my $outfh, ">$output" or die $!; my $out = Bio::SeqIO->newFh(-fh => $outfh, -format => $format ); while (my $seq = <$in>) { my $id = $seq->accession_number(); # Get the features included in the list from the seq: foreach my $f ($seq->get_SeqFeatures()) { # Check if the prim tag matches regex: next unless $regex && $f->primary_tag() =~ $regex; if (ref($f->location()) eq 'Bio::Location::Split') { # Get the array of Bio::Location::Simple objs.: my @sl = $f->location->sub_Location(); for (my $j = 0; $j < @sl; $j++) { my $gff = join "\t", $seq->accession_number, '.', '.', $sl[$j]->start, $sl[$j]->end, '.', $sl[$j]->strand, '.'; push @{$gff{$id}}, $gff; } } else { my $gff = join "\t", $seq->accession_number, '.', '.', $f->location->start, $f->location->end, '.', $f->location->strand, '.'; push @{$gff{$id}}, $gff; } } # Get the sequence: my $str = $seq->seq(); # Mask it using the gff or feature information we have collected: foreach my $line (@{$gff{$id}}) { my ($start, $end) = (split /\t/, $line)[3,4]; warn $id, " ", length($str), " ", $end, "\n" if length($str) < $end; substr($str,$start-1,$end-$start+1) =~ tr/ATGCatgc/NNNNnnnn/; } $seq->seq($str); print $out $seq; } =head1 SYNOPSIS: mask.pl [OPTIONS] [[infile] outfile] =head1 DESCRIPTION: Takes one of more seqfiles (STDIN is default) and masks them using gff lines from an external file and/or named features from the sequence annotation. =head1 OPTIONS: =over 4 =item --help Produces this help. =item --format In and out format of the sequence files. Fasta is default when using STDIN. =item --gff Specifies a gff file to read from. =item --feature ',...' Specifies a list of features to mask. =back =head1 EXAMPLES: cat file.fa | mask.pl --gff file.gff > file.masked.fa cat file.gb | mask.pl --format gb --gff file.gff > file.masked.gb mask.pl --feature 'repeat_region,mRNA' file.embl > file.masked.embl =head1 AUTHOR: Kasper Munch DEPENDENCIES The module SeqFuntions. Not on CPAN but send me and email (kasper at binf.ku.dk) and I will be more than happy to send it. =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut