#!/usr/bin/perl # Time-stamp: <2008-03-05 10:21:28 kasper> use warnings; use strict; use LFasta; use LFastaIO; use Getopt::Long; use constant USAGE =>< [infile [outfile]] DESCRIPTION: Extract sub-sequences from sequences on stdin based on a (perl) regular expression given on the cmd line. Input sequences in labeled fasta format. By default the labels are searched using the regexp. Note that the IDs on the output are made unique by adding an incrementing suffix for each match in an entry. This can be avoided by using the keepid option. OPTIONS: General options: --help Prints this help. --quiet No output to STDERR. --meta Print the meta info contained in the input. Defualt does not. --[no]labels Print label sequence corresponding to matches. Default does. --pred Specifies the prediction track to match with regexp. The character specifies must be one of the prediction track labels. --match The number in the entry array to match with regexp. N=2 corresponds to the labels and is default. N=1 is the sequence (-seq is equaivalent to '-match 1'). Prediction lines correspond to N = 4+2*k (k=0, 1, 2,...), so first prediction is -match 4, second is -match 6, etc. This option exists for historic reasons. Use the pred option instead. --sequence Match the sequence instead of labels. Same as '-match 1'. Over-writes any -match given. --count Print only the number of matches (like grep -c). --length Print only length of matching region or entry. (If used with --count and --perentry it prints the length or each entry in addition to number of matches.) --submatch Print only the 'th submatch. That is, the string captured by the 'th set of parenthesises --wildcards Interpolate wildcards Y:CT R:AG N:CTAG. --gff Print gff lines corresponding to matches instead. The feature field in the gff lines will then be . Options for counting matching in each entry: --perentry Used with --count to print only the id and number of matches for each entry. --suffix str Use str as suffix on sequence id --ff N First flank added to matching sequence (can be negative). --lf N Last flank added to matching sequence (can be negative). --[no]ForceFlanks Only print out if flanks are large enough. If -F, -T, -L are used, print if sequence is long enough. --single Print each match as single line. --splice Splice matches from a sequence together instead of printing them individually --keepid The IDs on the Labeled Fasta output are not made unique. The id from the mother sequence is retained. --[no]description Whether to print description line describing where the mactch was. Default does. --[no]global With noglobal only the first matching segment is retrieved. Default is to retrieve all matching segments. Options for whole entries: --entries Print the whole entry matching the regexp (on labels or seqeunce as specified by the label and seq options). --header Print The whole entry matching with a header matching the regexp. The entries options is implied when using this option. --onlyid|id Print only the id of the entries matching the regexp. --not Used with --entries. Print the whole entry NOT matching the regexp. Options for use without regexp: --F N Extract from letter number N (no regular exp). Must be used with -L or -T. --T N Extract to letter number N. --L N Extract length N. EXAMPLES: # Data file signal.lfa is eg: >seq1 MASKATLLLAFTLLFATCIARHQQRQQQQNQCQLQNIEALEPIEVIQAEA # xxxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy >seq2 MARSSLFTFLCLAVFINGCLSQIEQQSPWEFQGSEVWQQHRYQSPRACRL # xxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy >seq3 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVG # xxxxxxxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyy # Extract the first 5 letters of each seq: cat signal.lfa | grepseq.pl '^.{5}' # or cat signal.lfa | grepseq.pl -F 1 -L 5 # Extract 2 letters before boundary between x and y and 5 letters after: cat signal.lfa | grepseq.pl 'xxy{5}' # Find all matches to at least two Ls in the primary sequence and # print the match with 5 letters flank: cat signal.lfa | grepseq.pl -seq -nolabels -ff 5 -lf 5 'LL+' # Result: >seq1_1 match 2 15 flanks 5,5 "LL+" ASKATLLLAFTLL >seq1_2 match 8 20 flanks 5,5 "LL+" LLAFTLLFATCI >seq3_1 match 5 19 flanks 5,5 "LL+" APRTVLLLLSAALA # Print all the entries that don't have an exon longer than 300 next # to a intron smaller than 50: cat file.lfa | grepseq.pl --entries --not 'E{300,}I{,50} AUTHOR: Anders Krogh / Kasper Munch COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. END my $help = 0; my $regexp = ''; my $not = 0; my $meta = 0; my $onlyid = 0; my $labels = 1; my $header = 0; my $match = 0; my $wildcards = 0; my $F = 0; my $T = 0; my $L = 0; my $suffix = ''; my $ff = 0; my $lf = 0; my $ForceFlanks = 0; my $splice = 0; my $submatch = 0; my $single = 0; my $keepid = 0; my $description = 1; my $length = 0; my $sequence = 0; my $entries = 0; my $perentry = 0; my $count = 0; my $global = 1; my $gff = ''; my $pred = undef; my $quiet = 0; GetOptions( "help" => \$help, "quiet" => \$quiet, "meta" => \$meta, "not" => \$not, "F=i" => \$F, "T=i" => \$T, "L=i" => \$L, "suffix=s" => \$suffix, "ff=i" => \$ff, "lf=i" => \$lf, "submatch=i" => \$submatch, "wildcards" => \$wildcards, "ForceFlanks=i" => \$ForceFlanks, "labels!" => \$labels, "single" => \$single, "global!" => \$global, "onlyid|id" => \$onlyid, "length" => \$length, "match=i" => \$match, "sequence" => \$sequence, "header" => \$header, "keepid" => \$keepid, "description!" => \$description, "splice" => \$splice, "entries" => \$entries, "perentry" => \$perentry, "count" => \$count, "gff=s" => \$gff, "pred=s" => \$pred, ) or die USAGE; $help and die USAGE; $regexp = shift @ARGV if (@ARGV && !$F); if ($wildcards) { $regexp =~ s/Y/[CT]/g; $regexp =~ s/R/[AG]/g; $regexp =~ s/N/[CTAG]/g; } @ARGV = ('-') unless @ARGV; my $input = shift @ARGV; open my $in, "$input" or die "$input: $!"; my $out; if (@ARGV) { my $outfile = shift @ARGV; open $out, ">$outfile" or die "$outfile: $!"; } else { open $out, ">&STDOUT" or die "Can't dub STDOUT: $!"; } $in = LFastaIO->new(fh => $in); -e $regexp and die "Specified regexp is a file. Ommit only the regexp when using -F\n"; # Die on incompatible options: ($entries || $not) && ($F || $T || $L || $suffix || $ff || $lf || $ForceFlanks || $splice || $single || $length ) and die "Options:\n\t--entries\n\t--not\nincompatible with\n\t--F,\n\t--T,\n\t--L,\n\t--suffix,\n\t--ff,\n\t--lf,\n\t--ForceFlanks,\n\t--splice,\n\t--single,\n\t--length\n"; if ($gff) { ($splice || $single || $length) and die "You can't use --length, --splice and --single with --gff\n"; } if ($count) { ($splice || $single) and die "You can't use --splice and --single with --count\n"; } if ($header) { $sequence and die "Options: --header incompatible with --labels and --seq\n"; } if ($perentry) { $entries and die "Don't use --perentry with --entries"; $count or die "Don't use --perentry without --count"; } if ($submatch) { $regexp or die "Don't use --submatch without a regular expression.\n"; $regexp =~ /.*$.*$.*/ or die "Don't use --submatch without paranthesises.\n"; } if ($length && $count) { $perentry or die "Don't use --count --length without --perentry"; } $T && $L and die "Don't use both -T and -L\n"; my $lastline = ""; my $matches = 0; $entries = 1 if $header or $onlyid; $labels = 0 if not $labels; $onlyid = 1 if $entries && $length; if ($sequence) { $match = 1; } elsif ($header) { $match = 0; } else { $match ||= 2; } if ($entries) { my $doing = 'Printing'; $doing = 'Counting' if $count; my $what; if ($header) { $what = 'header'; } elsif ($sequence) { $what = 'sequence;' } else { $what = 'labels' } print STDERR "$doing entries with $what ", "not " x $not, "matching \"$regexp\"", "\n" unless $quiet; # print STDERR "$doing entries with ", "sequence " x $sequence, "labels " x $labels, # "not " x $not, "matching \"$regexp\"", "\n" unless $quiet; } else { if ( $regexp ) { if (defined $pred) { print STDERR "Chopping out \"$regexp\" based on prediction labeled '$pred'\n" unless $quiet; } elsif ($match==2) { print STDERR "Chopping out \"$regexp\" based on labels\n" unless $quiet; } elsif ($match==1) { print STDERR "Chopping out \"$regexp\" based on sequence\n" unless $quiet; } else { print STDERR "Chopping out \"$regexp\" based on prediction track ". ($match-2)/2 ."\n" unless $quiet; } } elsif ($F) { print STDERR "Chopping out from $F to " unless $quiet; $T ||= $F+$L-1 if $L; if ($T) { print STDERR "$T\n" unless $quiet; $L = $T-$F+1; } else { print STDERR "end\n" unless $quiet; } } else { die "ERROR: nothing to chop out\n"; } } while ( my $lfa = <$in> ) { # Trun lfa into an array: (we should change this later.) my @entry = (); $entry[0] = $lfa->header; $entry[1] = $lfa->seq; $entry[2] = $lfa->labels; my $preds = $lfa->preds; # Turn it into a regular array where 0, 2, 4 ... are prediction # labels and 1, 3, 5 ... are the predictions: foreach my $arrayref (@$preds) { push @entry, @$arrayref; } print "$entry[0]\n"; if (defined $pred) { my $ok = 0; for (my $k=3; $k<@entry; $k=$k+2) { if ($entry[$k] eq $pred) { $match = $k+1; $ok = 1; } } $ok or die "Could not find specifies prefix\n"; } if (!$entry[$match]) { $entry[0] =~ />(\S+)/; my $id = $1; die "No sequence in entry $id\n" if $match == 1; die "No labels in entry $id\n" if $match == 2; my $line = ($match-4)/2; die "No prediction line corresponding to --match $match (which is line $line in the labeled FASTA)\n"; } if ($entries) { if (!$not && $entry[$match] =~ /$regexp/) { if ($count) { $matches++ } elsif ($onlyid) { $entry[0] =~ />(\S+)/; print "$1"; print "\t", length $entry[1] if $length; print "\n"; } else { print_entry(\@entry); } } elsif ($not && $entry[$match] !~ /$regexp/) { if ($count) { $matches++ } elsif ($onlyid) { $entry[0] =~ />(\S+)/; print "$1"; print "\t", length $entry[1] if $length; print "\n"; } else { print_entry(\@entry); } } } else { # Anders grepseq code (tightened up somewhat): my $id = $entry[0]; $id =~ s/ .*//; $id = $id . $suffix; my $n=0; my $len = length($entry[1]); my @splice; if ($splice && !$keepid) { @splice = ( $id."spliced" ) if ($splice); } elsif ($splice) { @splice = ( $id ) if ($splice); } if ( !$regexp ) { my $b = $F-1; next if ($b>$len); my $l = $L; if ($b+$l>$len) { next if ($ForceFlanks); $l = $len-$b } my @pentry = (); $pentry[1] = substr($entry[1],$b,$l); $pentry[2] = substr($entry[2],$b,$l) if ($entry[2] && $labels); if ( $single ) { print $out "$pentry[1]\n"; # print $out "$pentry[2]\n" if ($labels); } else { $pentry[0] = $id; $pentry[0] .= ++$n unless $keepid; $pentry[0] = $pentry[0] . " from $F to " . ($F+$l-1); if ($count) { $matches++; } else { print_entry(\@pentry); } } } else { my $b; my $e; my $entry_matches = 0; while ( $entry[$match] =~ /$regexp/g ) { my $l; $b = $-[$submatch] - $ff; $l = $+[$submatch] - $-[$submatch] + $ff + $lf; next if ($l<=0 || $b>$len || $b+$l<1 ); next if ( $ForceFlanks && ( $b < 0 || $b + $l > $len ) ); $b = 0 if $b < 0; $l = $len - $b if ($b + $l > $len); if ($length && !$perentry) { print $out "$l\n"; next; } if ($gff) { $entry[0] =~ /^>(\S+)/; my $id = $1; print $out "$id\tgrepseq.pl\t$gff\t$b\t",$b + $l - 1 ,"\t.\t+\t0\n"; next; } my @pentry = (); $pentry[1] = substr($entry[1],$b,$l); # $pentry[2] = substr($entry[2],$b,$l) if ($labels); if ($labels && defined($entry[2]) ) { $pentry[2] = substr($entry[2],$b,$l); for (my $i=3; $i<$#entry; $i+=2 ) { $pentry[$i] = $entry[$i]; $pentry[$i+1] = substr($entry[$i+1],$b,$l); } } $b += 1; $e = $b + $l; if ($splice) { $splice[1] .= $pentry[1]; if ($#pentry>1) { $splice[2] .= $pentry[2]; for (my $i=3; $i<$#entry; $i+=2 ) { $splice[$i] = $pentry[$i]; $splice[$i+1] .= $pentry[$i+1]; } } } else { if ( $single ) { print $out "$pentry[1]\n"; if ($labels) { for (my $i=2; $i<$#entry; $i+=2 ) { print $out "$pentry[$i]\n"; } } } else { $pentry[0] = $id; $pentry[0] .= ++$n unless $keepid; if ($description) { $pentry[0] = $pentry[0] . " match $b $e flanks $ff,$lf \"$regexp\""; } if ($count) { if ($perentry) { $entry_matches++; } else { $matches++; } } else { print_entry(\@pentry); } } } last unless $global; } if ($perentry) { $id =~ s/^>//; print "$id\t$entry_matches"; print "\t", length($entry[1]) if $length; print "\n"; } if ($splice && $splice[1]) { if ( $single ) { print $out "$splice[1]\n"; if ($labels) { for (my $i=2; $i<$#splice; $i+=2 ) { print $out "$splice[$i]\n"; } } } else { if ($description) { $splice[0] .= " match $b $e flanks $ff,$lf \"$regexp\""; } print_entry(\@splice); } } } } } if ($count && !$perentry) { print $out "$matches\n"; } sub print_entry { my $linelen = 70; my @entry = @{$_[0]}; print $out ">" if ( $entry[0]!~/^>/ ); print $out "$entry[0]\n"; my $movein = ""; my $labpref; if (defined($entry[2])) { $movein = ' '; $labpref = '# ' } if ($#entry>2) { $movein = ' '; $labpref = '# '; } my $l = length($entry[1]); for (my $i=0; $i<$l; $i += $linelen) { print $out $movein . substr($entry[1],$i,$linelen) . "\n"; if (defined($entry[2])) { print $out $labpref . substr($entry[2],$i,$linelen) . "\n"; } for (my $k=3; $k<$#entry; $k += 2) { if ($entry[$k+1]) { if (defined($entry[$k])) { print $out "?$entry[$k] "; } else { print $out '? '; } print $out substr($entry[$k+1],$i,$linelen) . "\n"; } } } } exit 0; =head1 SYNOPSIS: grepseq.pl [OPTIONS] [infile [outfile]] =head1 DESCRIPTION: Extract sub-sequences from sequences on stdin based on a (perl) regular expression given on the cmd line. Input sequences in labeled fasta format. By default the labels are searched using the regexp. Note that the IDs on the output are made unique by adding an incrementing suffix for each match in an entry. This can be avoided by using the keepid option. =head1 OPTIONS: General options: =over 4 =item --help Prints this help. =item --quiet No output to STDERR. =item --meta Print the meta info contained in the input. Defualt does not. =item --[no]labels Print label sequence corresponding to matches. Default does. =item --pred Specifies the prediction track to match with regexp. The character specifies must be one of the prediction track labels. =item --match The number in the entry array to match with regexp. N=2 corresponds to the labels and is default. N=1 is the sequence (-seq is equaivalent to '-match 1'). Prediction lines correspond to N = 4+2*k (k=0, 1, 2,...), so first prediction is -match 4, second is -match 6, etc. This option exists for historic reasons. Use the pred option instead. =item --sequence Match the sequence instead of labels. Same as '-match 1'. Over-writes any -match given. =item --count Print only the number of matches (like grep -c). =item --length Print only length of matching region or entry. (If used with --count and --perentry it prints the length or each entry in addition to number of matches.) =item --submatch Print only the 'th submatch. That is, the string captured by the 'th set of parenthesises =item --wildcards Interpolate wildcards Y:CT R:AG N:CTAG. =item --gff Print gff lines corresponding to matches instead. The feature field in the gff lines will then be . =back Options for counting matching in each entry: =over 4 =item --perentry Used with --count to print only the id and number of matches for each entry. =item --suffix str Use str as suffix on sequence id =item --ff N First flank added to matching sequence (can be negative). =item --lf N Last flank added to matching sequence (can be negative). =item --[no]ForceFlanks Only print out if flanks are large enough. If -F, -T, -L are used, print if sequence is long enough. =item --single Print each match as single line. =item --splice Splice matches from a sequence together instead of printing them individually =item --keepid The IDs on the Labeled Fasta output are not made unique. The id from the mother sequence is retained. =item --[no]global With noglobal only the first matching segment is retrieved. Default is to retrieve all matching segments. =back Options for whole entries: =over 4 =item --entries Print the whole entry matching the regexp (on labels or seqeunce as specified by the label and seq options). =item --header Print The whole entry matching with a header matching the regexp. The entries options is implied when using this option. =item --onlyid|id Print only the id of the entries matching the regexp. =item --not Used with --entries. Print the whole entry NOT matching the regexp. =back Options for use without regexp: =over 4 =item --F N Extract from letter number N (no regular exp). Must be used with -L or -T. =item --T N Extract to letter number N. =item --L N Extract length N. =back =head1 EXAMPLES: # Data file signal.lfa is eg: >seq1 MASKATLLLAFTLLFATCIARHQQRQQQQNQCQLQNIEALEPIEVIQAEA # xxxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy >seq2 MARSSLFTFLCLAVFINGCLSQIEQQSPWEFQGSEVWQQHRYQSPRACRL # xxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy >seq3 MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVG # xxxxxxxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyy # Extract the first 5 letters of each seq: cat signal.lfa | grepseq.pl '^.{5}' # or cat signal.lfa | grepseq.pl -F 1 -L 5 # Extract 2 letters before boundary between x and y and 5 letters after: cat signal.lfa | grepseq.pl 'xxy{5}' # Find all matches to at least two Ls in the primary sequence and # print the match with 5 letters flank: cat signal.lfa | grepseq.pl -seq -nolabels -ff 5 -lf 5 'LL+' # Result: >seq1_1 match 2 15 flanks 5,5 "LL+" ASKATLLLAFTLL >seq1_2 match 8 20 flanks 5,5 "LL+" LLAFTLLFATCI >seq3_1 match 5 19 flanks 5,5 "LL+" APRTVLLLLSAALA # Print all the entries that don't have an exon longer than 300 next # to a intron smaller than 50: cat file.lfa | grepseq.pl --entries --not 'E{300,}I{,50} =head1 AUTHOR: Anders Krogh / Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut