#!/usr/local/bin/perl #use strict; use warnings; use Getopt::Long; use constant USAGE =>< \$not, "all" => \$all, "presorted" => \$presorted, "self" => \$self, "quiet" => \$quiet, "byparent" => \$byparent, "print1" => \$print1, "regex=s" => \$regex, "minfrac1=f" => \$minfrac1, "minfrac2=f" => \$minfrac2, "minhits=i" => \$minhits, "maxhits=i" => \$maxhits, "near=i" => \$near, "name1=s" => \$name1, "shutit" => \$shutit) or die USAGE; if (@ARGV==0 && $self) { die "Sorry - perl doesn't buffer STDIN, so you'll have to use a temporary file for that. Died"; } if (@ARGV==1) { push @ARGV, $self ? $ARGV[0] : '-'; } @ARGV==2 or die USAGE; ($file1,$file2) = @ARGV; $name1 = $file1 unless $name1; if ($self) { $name1 = "self&$name1"; } # Put the files into arrays of the lines: undef $/; open my $fh1, "$file1"; $_ = <$fh1>; $file1 = [split /^/]; open my $fh2, "$file2"; $_ = <$fh2>; $file2 = [split /^/]; $/ = "\n"; # Make mappings from line to linenr: for (my $i=0; $i < @$file1; $i++) { $file1{$$file1[$i]} = $i+1; } for (my $i=0; $i < @$file2; $i++) { $file2{$$file2[$i]} = $i+1; } unless ($presorted) { # Sort the arrays: $file1 = [sort generic @$file1]; $file2 = [sort generic @$file2]; } # $line1 holds next line from file1; @f1 holds fields of $line1 ($line1,@f1) = getline($file1,\$n1); # $line2 holds next line from file2; @f2 holds fields of $line2 ($line2,@f2) = getline($file2,\$n2); $aref = \@file1; # Loop is over all lines in file2: while ($line2) { $seqname2 = $f2[0]; # printbuffer1 holds lines to be printed from file1. output these now, if it's time. while (@printbuffer1) { my @f = split /\t/, $printbuffer1[0]; if ($f[0] lt $seqname2 or ($f[0] eq $seqname2 and $f[3] <= $f2[3])) { print shift @printbuffer1; } else { last; } } if ($seqname2 ne $lastseqname2) { @file1 = (); $lastseqname2 = $seqname2; } # Keep sucking lines from file1 and sticking GFF coords into @file1, # until past $line2 endpoint. Entries are "start end linenumber" # while ($line1 and $seqname1 = $f1[0], ($seqname1 lt $seqname2 or ($seqname1 eq $seqname2 and $f1[3] - $near <= $f2[4]))) { if ($seqname1 eq $seqname2) { push @file1, "@f1[3,4] $n1"; } # put lines from file1 into file1 printing buffer if -print1 switch is set if ($print1) { if ($seqname1 lt $seqname2 or ($seqname1 eq $seqname2 and $f1[3] <= $f2[3])) { print $line1; } else { push @printbuffer1, $line1; } } ($line1,@f1) = getline($file1,\$n1); } # use Data::Dumper; # print Data::Dumper->Dump([$aref]); # die; # loop over every entry in @$aref until past $line2 endpoint $skip # and $hitstart2yet are used to keep track of entries in @$aref that # may be discarded. # @lines = (); $hitstart2yet = $skip = 0; ($start2,$end2) = @f2[3,4]; for ($i=0;$i<@$aref;$i++) { $entry1 = $aref->[$i]; last if $entry1 - $near > $end2; ($start1,$end1,$n) = split /\s+/, $entry1; # check for entries that may be discarded # if ($end1 + $near < $start2) { if (!$hitstart2yet) { $skip = $i + 1; } } else { # if line from file1 overlaps line from file2, store linenumber in @lines array # $hitstart2yet = 1; ($effstart1,$effend1) = ($start1-$near/2,$end1+$near/2); ($effstart2,$effend2) = ($start2-$near/2,$end2+$near/2); $maxstart = $effstart1 > $effstart2 ? $effstart1 : $effstart2; $minend = $effend1 < $effend2 ? $effend1 : $effend2; $overlaplen = $minend + 1 - $maxstart; $len1 = $effend1 + 1 - $effstart1; $len2 = $effend2 + 1 - $effstart2; if (($len1 && $overlaplen/$len1 >= $minfrac1) && ($len2 && $overlaplen/$len2 >= $minfrac2)) { push @lines, $n; } } } while ($skip-- > 0) { shift @$aref; } # print if eligible # if ($maxhits != 1) { $test = (@lines >= $minhits && @lines <= $maxhits); } else { $test = (@lines >= $minhits); } if ($test && ($all || !$not)) { if ($quiet) { print $line2; } else { chomp $line2; print "$line2 intersect($name1)=(@{[sort {$a<=>$b} @lines]})\n"; } } elsif (!$test && ($all || $not)) { print $line2; } ($line2,@f2) = getline($file2,\$n2); } print $line1 if $print1 && $line1; sub getline { my ($file, $nref) = @_; my $line = ''; while (@$file and !$line || $line !~ /\S/) { $line = shift @$file; } $$nref = $file1{$line}; my @f = split(/\t/,$line,9); # Hack to make the script work with the parent info instead of the info in the main fields: if ($byparent) { my @p = split " ", $f[8]; ($f[0], $f[1], $f[2], $f[3], $f[4], $f[5], $f[6], $f[7], $f[8]) = ($p[1], '.', '.', $p[2], $p[3], '.', '+', '.', '.'); } if (@f == 0) { return (undef); } if ($regex) { $f[0] =~ /($regex)/; $f[0] = $1; } if (!$shutit) { if (@f < 9) { warn "Warning: fewer than 9 fields at $file line $$nref\n"; } if (join("",@f[0..7]) =~ / /) { warn "Warning: space in tab-delimited field at $file line $$nref\n"; } } ($line,@f); } sub generic { my $sorted = 0; my @a = split /\t/, $a; my @b = split /\t/, $b; for (my $i = 0; $i<@a && !$sorted ; $i++) { if ($a[$i] =~ /^\d+$/ && $b[$i] =~ /^\d+$/) { $sorted = $a[$i] <=> $b[$i]; } else { $sorted = $a[$i] cmp $b[$i]; } } $sorted; } =head1 SYNOPSIS: gffintersect.pl [OPTIONS] file1 file2 =head1 DESCRIPTION: Finds intersection of two GFF files. Prints lines from file2 that intersect (or do not intersect) with file1 =head1 OPTIONS: =over 4 =item --not To print lines from file2 that don't match the criteria rather than lines that do =item --all To print everything from file2 =item --self for comparisons vs self (omit file2); also adds \"self&\" prefix to name1 =item --minfrac1 xxx To specify minimum fractional overlap for file1 features =item --minfrac2 xxx To specify minimum fractional overlap for file2 features =item --near xxx To extend definition of \"overlap\" to nearby GFFs =item --minhits xxx To specify minimum number of hits required to print lines from file2 (default is 1) =item --maxhits xxx To specify maximum number of hits required to print lines from file2 (default is 1) =item --name1 If file1 is a name that won't look good in \"intersect(..)=(....)\" messages =item --quiet To suppress \"intersect(..)=(....)\" messages altogether =item --print1 To copy all lines from file1 to output =item --regex To specify a (perl) regular expression determining how much of the reference id in field one that is to be taken as reference id and how much that is just an exon, intron or alt.spl (Eg. --regex '^[A-Za-z0-9]+_[A-Za-z0-9]+'' specifying that only NC_003279 of NC_003279_3 is to determine the reference sequence.) =item --shutit To suppress GFF format warnings =item --presorted The GFF files are already sorted. =back =head1 EXAMPLES: =head1 AUTHOR: Kasper Munch =head1 COPYRIGHT: This program is free software. You may copy and redistribute it under the same terms as Perl itself. =cut