Newer
Older
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from gt.core import *
from gt.extended import *
from gt.annotationsketch import *
from gt.annotationsketch.custom_track import CustomTrack
from gt.core.gtrange import Range
#a contains b as subinterval
def contains(a,b):
return a[0] <= b[0] and a[1] >= b[1]
#remove a from b
def remove(a,b):
return [[b[0],a[0]], [a[1], b[1]]]
#get intervals of a cropped by b
def crop(a, b):
if not len(b):
return [a]
#sort children
b.sort(key=lambda x: x[0])
cropped = [[a[0], b[0][0]]]
for i in range(len(b) - 1):
cropped.append([b[i][1], b[i+1][0]])
cropped.append([b[-1][1], a[1]])
return cropped
def sketch(nested, genes):
print 'Running GT annotation sketch...'
for g in genes:
createGFF(genes[g], nested[g])
visualize(g)
def createGFF(gene, nested):
#find closest parent
if nested:
nested[-1]['parent'] = -1
for i in range(len(nested) - 1):
nested[i]['parent'] = -1
for j in range(i+1, len(nested)):
if contains(nested[j]['location'], nested[i]['location']):
nested[i]['parent'] = j
break
#append direct children
for i in reversed(range(len(nested))):
nested[i]['direct_children_locations'] = []
parent = nested[i]['parent']
if parent != -1:
nested[parent]['direct_children_locations'].append(nested[i]['location'])
#GFF
for i in range(len(nested)):
#insert BaseLine
features.append(SeqFeature(FeatureLocation(nested[i]['location'][0], nested[i]['location'][1]),
type='te_base',
strand=0,
qualifiers={'ID': 'TE_BASE {}'.format(i)}))
#Insert croppedIntervals
children = nested[i]['direct_children_locations']
cropped = crop(nested[i]['location'], children)
for subinterval in cropped:
features.append(SeqFeature(FeatureLocation(subinterval[0],subinterval[1]),
type='te',
strand=0,
qualifiers={'ID': 'TE {}'.format(i), 'Parent': 'TE_BASE {}'.format(i)}))
#Insert elements (domains, ppt, pbs, ...)
j = 0
for domain in nested[i]['features']['domains']:
sign = (lambda x: x and (1, -1)[x < 0])(domain['frame'][0])
if sign < 0:
domain['location'] = [domain['location'][1], domain['location'][0]]
overlap = [x for x in children if contains(domain['location'], x)]
cropped_domain = crop(domain['location'], overlap)
for part in cropped_domain:
features.append(SeqFeature(FeatureLocation(part[0], part[1]),
strand=sign,
#qualifiers={'Parent': 'DOMAIN {}-{}'.format(i,j)}))
qualifiers={'ID': 'DOMAIN {}-{}'.format(i,j),'Parent': 'TE_BASE {}'.format(i)}))
style.load_file('gt.style')
#Feature index
feature_index = FeatureIndexMemory()
feature_index.add_gff3file('data/' + gene_id + '.gff')
#create diagram for first sequence ID
seqid = feature_index.get_first_seqid()
range = feature_index.get_range_for_seqid(seqid)
diagram = Diagram.from_index(feature_index, seqid, range, style)
#create layout
canvas = CanvasCairoFile(style, 1400, height)
#canvas = CanvasCairoFilePDF(style, 1400, height)