#!/usr/bin/python # Time-stamp: <2006-08-24 11:21:35 kasper> import sys import re import string import copy from optparse import OptionParser class GFF: "Class for holding a GFF line" def __init__(self, list): self.sanityCheck(list) self.seq = list[0] self.source = list[1] self.feature = list[2] self.start = int(list[3]) self.end = int(list[4]) if list[5] == '.': self.score = list[5] else: self.score = float(list[5]) self.strand = list[6] if list[7] == '.': self.frame = list[7] else: self.frame = int(list[7]) if len(list) > 8 and list[8] != '.': list[8].strip() self.attrib = {} self.attribstr = list[8] if re.search('gene_id\s+"\S+"\s*;\s+transcript_id\s+"\S+"\s*;', list[8]): # This is GTF format: for m in re.finditer('(\S+) "(\S+)"\s*;', list[8]): self.attrib[m.group(1)] = m.group(2) else: # This is GFF format for s in re.split("\s*;\s*", list[8]): if re.search('=', s): # Probably GFF3 # ID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003 pair = s.split("=") assert len(pair) == 2; key = pair[0] values = pair[1].split("\s*,\s*") self.attrib[key] = values # elif re.search(';', s): # # Probably GFF2 # # Target "HBA_HUMAN" 11 55 ; E_value 0.0003 # # how do I avoid maching ; and , and = inside quotes? else: self.attrib = '' self.attribstr = '' def __cmp__(self, other): return cmp(self.seq, other.seq) \ or cmp(self.start, other.start) \ or cmp(self.end, other.end) def __repr__(self): list = [ self.seq, self.source, self.feature, str(self.start), str(self.end), str(self.score), self.strand, str(self.frame)] s = "\t".join(list) if self.attribstr: s += "\t" + self.attribstr return s def getline(self): list = [self.seq, self.source, self.feature, str(self.start), str(self.end), self.score, self.strand, self.frame] s = "\t".join(list) if self.attrib: list = ["=".join(x) for x in self.attrib.items()] s += "\t" + ";".join(list) s += "\n" return s def sanityCheck(self, list): assert len(list) == 8 or len(list) == 9, "Not appropriate nr of entries" assert str(list[3]).isdigit(), "Start not digits: " + str(list[3]) assert str(list[4]).isdigit(), "End not digits: " + str(list[4]) def onlyCoordinates(self): self.feature = '.' self.strand = '.' self.source = '.' self.score = '.' self.frame = '.' def overlaps(self, other): if self.seq == other.seq \ and (self.start >= other.start and self.start <= other.end \ or self.end >= other.start and self.end <= other.end \ or self.start < other.start and self.end > other.start): return 1 else: return 0 def endToEnd(self, other): if self.seq == other.seq and (self.start == other.end + 1 or self.end == other.start + 1): return 1 else: return 0 def before(self,other): if self.seq < other.seq or self.seq == other.seq and self.end < other.start: return 1 else: return 0 def after(self,other): if self.seq > other.seq or self.seq == other.seq and self.start > other.end: return 1 else: return 0 class GFFSet: "Class for holding a set of GFF lines" def __init__(self, list=[]): if len(list): self.list = list self.seq = list[0].seq self.start = list[0].start ends = [item.end for item in self.list] self.end = max(ends) self.strand = list[0].strand else: self.list = list self.seq = False self.start = False self.end = False self.strand = False def __len__(self): return len(self.list) def __getitem__(self, item): return self.list[item] def __delitem__(self, item): del self.list[item] def __iter__(self): self.index = 0 return self def next(self): if self.index < len(self.list): self.index += 1 return self.list[self.index-1] else: raise StopIteration def clear(self): self.list = [] self.seq = False self.start = False self.end = False self.strand = False def add(self, other): self.list.append(other) self.list.sort() if not self.seq: self.seq = other.seq else: assert self.seq == other.seq, "Tried to add a GFF from a diffrent sequence " + other.seq + " ne " + self.seq if not self.start: self.start = other.start if self.end: self.end = max(self.end, other.end) else: self.end = other.end def len(self): return len(self.list) class GFFIterator: "Iterator for iterating over GFF entries" def __init__(self, fp): self.fp = fp def __iter__(self): return self def next(self): line = self.fp.readline() while re.match("^#|^\s$", line): line = self.fp.readline() line = line.strip() if line: list = line.split("\t") gff = GFF(list); if gff: return gff else: raise StopIteration else: raise StopIteration class BufferedList: "Buffered Iterator for iterating over GFF gff list" def __init__(self, list): self.list = list def __iter__(self): return self def next(self): gff = False if len(self.list): gff = self.list.pop() return gff else: raise StopIteration def putback(self, gff): self.list.append(gff) class BufferedIterator: "Buffered List for iterating over GFF gff input stream" def __init__(self, iterator): self.buffer = [] self.iterator = iterator def __iter__(self): return self def next(self): gff = False if len(self.buffer): gff = self.buffer.pop() else: gff = self.iterator.next() if gff: return gff else: raise StopIteration def putback(self, gff): self.buffer.append(gff) def parseOptions(): "Parser of options" parser = OptionParser(usage="%prog [options] file1 [file2 [output]]", version="%prog 1.0") parser.add_option("-p", "--presorted", action="store_true", dest="presorted", default=False, help="Use this option if the GFF input is sorted") parser.add_option("-S", "--self", action="store_true", dest="self", default=False, help="Compares file1 to itself. Specify only file1.") parser.add_option("-s", "--strands", action="store_true", dest="strands", default=False, help="Do strand specific sorting") parser.add_option("-f", "--features", action="store_true", dest="features", default=False, help="Do feature specific sorting") parser.add_option("-i", "--inverse", action="store_true", dest="inverse", default=False, help="Prints inverse of the overlap to file2 (or file1 with --self)") parser.add_option("-N", "--nocollapse", action="store_true", dest="nocollapse", default=False, help="Obscure option that prevents collapsing of overlapping GFFs within files before finding overlaps between files.") parser.add_option("-m", "--match", action="store_true", dest="match", default=False, help="Print only GFFs with a match in file2 (or file1 with --self)") parser.add_option("-n", "--nomatch", action="store_true", dest="nomatch", default=False, help="Print GFFs with no match in file2 (or file1 with --self)") parser.add_option("-b", "--bestmatch", action="store_true", dest="bestmatch", default=False, help="Prints only the best matching GFFs in file2 (don't use with --self)") (options, args) = parser.parse_args() # Invalid options combinations: assert not options.strands, "Not implemented" assert not options.features, "Not implemented" return (options, args) def printSet(set, out): "Prints a set of gff opjects." for gff in set: out.write(str(gff) + "\n") def selfMatch(file1, out, options): "Finds GFFs in a sorted stream that overlaps and prints either these or the ones without overlaps" # Previous gff prevGff = False # Last one we decided to print or not print prevDecided = False # Figure out what GFFs that match others: for gff in file1: if prevGff: # Check for matches to previous: if gff.overlaps(prevGff): if not options.nomatch: if prevGff is not prevDecided: out.write(str(prevGff) + "\n") prevDecided = prevGff out.write(str(gff) + "\n") prevDecided = gff else: prevDecided = gff else: if options.nomatch: if prevGff is not prevDecided: out.write(str(prevGff) + "\n") printed = prevGff prevGff = gff # We need to take case of the last GFF line: if gff and gff is not prevGff: if gff.overlaps(prevGff): if not options.nomatch: if prevGff is not prevDecided: out.write(str(prevGff) + "\n") prevDecided = prevGff out.write(str(gff) + "\n") prevDecided = gff else: prevDecided = gff else: if options.nomatch: if prevGff is not prevDecided: out.write(str(prevGff) + "\n") printed = prevGff # In case there is only one line in the file: elif gff and options.nomatch: out.write(str(gff) + "\n") def selfCollapse(file1, out, options): "Collapses overlapping gffs from a sorted stream and prints them" prev = False buffer = GFFSet([]) if options.inverse: assert not options.inverse, "--inverse with --self not implemented" else: for gff in file1: if not prev: buffer.add(gff) elif gff.overlaps(buffer): buffer.add(gff) else: buffer = collapse(buffer) printSet(buffer, out) buffer.clear() buffer.add(gff) prev = gff buffer = collapse(buffer) printSet(buffer, out) def getBestmatch(set1, set2): "Finds best match for each line in file2" overlap = [] rest = [] assert 0, "bestmatch not implemented yet" # my $array1 = shift; # my $array2 = shift; # # my @bestmatches = (); # my @rest = (); # # # Get to script name: # $0 =~ s/.*?([^\/]+)$/$1/; # # for (my $i=0; $i<@$array1; $i++) { # my $best = ''; # my $longest = 0; # for (my $j=0; $j<@$array2; $j++) { # if ($$array1[$i][3] >= $$array2[$j][3] && $$array1[$i][3] <= $$array2[$j][4] # || $$array1[$i][4] >= $$array2[$j][3] && $$array1[$i][4] <= $$array2[$j][4] # or $$array2[$j][3] >= $$array1[$i][3] && $$array2[$j][3] <= $$array1[$i][4] # && $$array2[$j][4] >= $$array1[$i][3] && $$array2[$j][4] <= $$array1[$i][4]) { # # Get length of overlap: # my $length = min($$array1[$i][4], $$array2[$j][4]) - max($$array1[$i][3], $$array2[$j][3]) + 1; # # See if it is better than the one we have: # if ($length > $longest) { # $best = $$array2[$j]; # $longest = $length; # } # } # } # if ($best) { # push @bestmatches, $best; # } # } # # removeDoubles(\@bestmatches) if @bestmatches; # removeDoubles(\@rest) if @rest; # # return (\@bestmatches, \@rest); # } return (overlap, rest) def getMatch(set1, set2): "get match" overlap = [] rest = [] # where we start looking in set2 (it is the index where all # previous gffs have ends before the start of the next set1 gff): set2Start = 0 # for gff in set1: for i in range(len(set1)): # Indicates whether we have found the set2Start value: set2StartFound = 0 # Indicates whether the set1 gff is matched: matched = 0 for j in range(set2Start, len(set2)): # Check for overlap: if set1[i].overlaps(set2[j]): matched = 1 new = copy.deepcopy(set1[i]) overlap.append(new) # Keep track of where to start in set2 next time: if not set2StartFound: if i < len(set1) - 1 and set2[j].end < set1[i+1].start: assert j == set2Start + 1, "Incremented by more than one." set2Start = j else: set2StartFound = 1 elif set1[i].before(set2[j]): # Because the sets are sorted we are not going to find an overlap later on: break if not matched: new = copy.deepcopy(set1[i]) rest.append(new) return (overlap, rest) def getOverlap(set1, set2): "Overlap of overlapping gffs in two sorted sets of objects." overlap = [] rest = [] # where we start looking in set2 (it is the index where all # previous gffs have ends before the start of the next set1 gff): set2Start = 0 # for gff in set1: for i in range(len(set1)): # Indicates whether we have found the set2Start value: set2StartFound = 0 for j in range(set2Start, len(set2)): # Check for overlap: if set1[i].overlaps(set2[j]): # make an overlap gff: start = max(set1[i].start, set2[j].start) end = min(set1[i].end, set2[j].end) new = GFF([set1[i].seq, set1[i].source, set1[i].feature, start, \ end, set1[i].score, set1[i].strand, set1[i].frame]) overlap.append(new) # Keep track of where to start in set2 next time: if not set2StartFound: if i < len(set1) - 1 and set2[j].end < set1[i+1].start: assert j == set2Start + 1, "Incremented by more than one." set2Start = j else: set2StartFound = 1 elif set1[i].before(set2[j]): # Because the sets are sorted we are not going to find an overlap later on: break for j in range(len(overlap)): # Get the parts of the gff that is not in overlap: if j == 0 and overlap[j].start > set1[i].start: # Put in the start flank: end = overlap[j].start - 1 new = GFF([set1[i].seq, set1[i].source, set1[i].feature, set1[i].start, \ end, set1[i].score, set1[i].strand, set1[i].frame]) rest.append(new) if overlap[j].end < set1[i].end: # Put in the end flank: start = overlap[j].end + 1 if j != len(overlap) - 1: end = overlap[j+1].start-1 else: end = set1[i].end new = GFF([set1[i].seq, set1[i].source, set1[i].feature, start, \ end, set1[i].score, set1[i].strand, set1[i].frame]) rest.append(new) overlap = GFFSet(overlap) rest = GFFSet(rest) return (overlap, rest) def nextCluster(buffIter): "Gets the next cluster of overlapping gffs from the file1 stream." cluster = [] for gff in buffIter: # Get a cluster (often a singleton) of overlapping file1 gffs: if not len(cluster): # If cluster1 is empty we add the line to it: cluster.append(gff) continue elif gff.overlaps(cluster[-1]): # If cluster1 is not empty and gff overlaps the previous gff: cluster.append(gff) continue else: # If we don't have overlap we buffer the gff for the next loop and go on: buffIter.putback(gff) break cluster = GFFSet(cluster) return cluster def nextOverlapCluster(buffIter, otherCluster): """Gets the next cluster of overlapping gffs from the file2 stream that overlaps a cluster from the file1 stream.""" cluster = [] # Get a line from buffIter: for gff in buffIter: if gff.before(otherCluster): # Skip the gff line if it is before the first line in set1: continue elif gff.after(otherCluster): # Buffer and terminate loop if the gff line is after the last line in set1: buffIter.putback(gff) break else: # gff overlaps set1 so we put it in set2: cluster.append(gff) continue cluster = GFFSet(cluster) return cluster def collapse(set): "Collapses overlapping gffs in a sorted set of objects." i = 0 j = 1 collapsedSet = [] new = copy.deepcopy(set) while j < len(new): while j < len(new) and new[i].overlaps(new[j]): new[i].end = max(new[i].end, new[j].end) j += 1 new[i].onlyCoordinates() collapsedSet.append(new[i]) i = j # if we are now at the last gff we add it: if i == len(new) - 1: new[i].onlyCoordinates() collapsedSet.append(new[i]) break j += 1 if len(new) == 1: new[0].onlyCoordinates() collapsedSet.append(new[0]) # new = GFFSet(collapsedSet) # return new return GFFSet(collapsedSet) # def collapse(set): # "Collapses overlapping gffs in a sorted set of objects." # i = 0 # j = 1 # collapsedSet = [] # # new = copy.deepcopy(set) # # while j < len(new): # while j < len(new) and new[i].overlaps(new[j]): # sys.exit() # new[i].end = max(new[i].end, new[j].end) # j += 1 # # # new[i].onlyCoordinates() # # collapsedSet.append(new[i]) # i = j # # if we are now at the last gff we add it: # if i == len(new) - 1: # # # new[i].onlyCoordinates() # # collapsedSet.append(new[i]) # break # j += 1 # if len(new) == 1: # # # new[0].onlyCoordinates() # # collapsedSet.append(new[0]) # # new = GFFSet(collapsedSet) # # return new # return GFFSet(collapsedSet) def main(): # Parse Options: options, args = parseOptions() # Parse arguments: assert len(args), "You have to specify at least one file" infile1 = args.pop(0) instream1 = open(infile1, 'r') if not options.self: if len(args): infile2 = args.pop(0) instream2 = open(infile2, 'r') else: instream2 = sys.stdin if len(args): outfile = args.pop(0) outstream = open(outfile, 'w') else: outstream = sys.stdout # Iterators to read in gff lines: gffIterator1 = GFFIterator(instream1) if not options.self: gffIterator2 = GFFIterator(instream2) # Sort and make iterators: if not options.presorted: list1 = [] for gff in gffIterator1: list1.append(gff) list1.sort() list1.reverse() list2 = [] # Make buffered iterator: file1 = BufferedList(list1) if not options.self: for gff in gffIterator2: list2.append(gff) list2.sort() list2.reverse() # Make buffered iterator: file2 = BufferedList(list2) else: # Make buffered iterators: file1 = BufferedIterator(gffIterator1) if not options.self: file2 = BufferedIterator(gffIterator2) # Ovelaps/matches to self: if options.self: if options.match or options.nomatch: selfMatch(file1, outstream, options) else: # Get overlap to self: selfCollapse(file1, outstream, options) # Ovelaps/matches to file2: else: # Initialise Sets to hold overlapping gff lines within each file: cluster1 = nextCluster(file1) cluster2 = [] while cluster1: # Get set of overlapping gffs from file2: cluster2 = nextOverlapCluster(file2, cluster1) orig2 = copy.deepcopy(cluster2) if len(cluster2): # If cluster2 is empty (line one in file one didn't match anything in # file two) then we just print the lines in cluster1: if not len(cluster2): overlap = [] rest = cluster1 else: # Find the overlaps between the two clusters: if options.bestmatch: overlap, rest = getBestmatch(cluster1, cluster2) elif options.match: overlap, rest = getMatch(cluster1, cluster2) else: if not options.nocollapse: cluster1 = collapse(cluster1) # print "---------------" # for g in cluster2: print g # print "---------------" cluster2 = collapse(cluster2) # print "###############" # for g in cluster1: print "1:", g # for g in cluster2: print "2:", g overlap, rest = getOverlap(cluster1, cluster2) # for g in overlap: print "I:", g # for g in rest: print "R:", g # print "###############" if options.bestmatch or options.match: printSet(overlap, outstream) else: if options.inverse: printSet(rest, outstream) else: printSet(overlap, outstream) # We might need the gff from cluster2 that extends # past the end of cluster1. So the collapsed regions # from file2 that we might need to look at again are # put back: for gff in orig2: file2.putback(gff) # for gff in cluster2: # if gff.end < cluster1.end: # file2.putback(gff) elif options.nomatch or options.inverse: printSet(cluster1, outstream) # Reinitialise: cluster1 = nextCluster(file1) cluster2 = [] main() sys.exit(1)