Newer
Older
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from gt.core import *
from gt.extended import *
from gt.annotationsketch import *
from gt.annotationsketch.custom_track import CustomTrack
from gt.core.gtrange import Range
#a contains b as subinterval
def contains(a,b):
return a[0] <= b[0] and a[1] >= b[1]
#remove a from b
def remove(a,b):
return [[b[0],a[0]], [a[1], b[1]]]
#get intervals of a cropped by b
def crop(a, b):
if not len(b):
return [a]
#sort children
b.sort(key=lambda x: x[0])
cropped = [[a[0], b[0][0]]]
for i in range(len(b) - 1):
cropped.append([b[i][1], b[i+1][0]])
cropped.append([b[-1][1], a[1]])
return cropped
def sketch(nested, genes):
print 'Running GT annotation sketch...'
for g in genes:
createGFF(genes[g], nested[g])
visualize(g)
def createGFF(gene, nested):
#find closest parent
if nested:
nested[-1]['parent'] = -1
for i in range(len(nested) - 1):
nested[i]['parent'] = -1
for j in range(i+1, len(nested)):
if contains(nested[j]['location'], nested[i]['location']):
nested[i]['parent'] = j
break
#append direct children
for i in reversed(range(len(nested))):
nested[i]['direct_children_locations'] = []
parent = nested[i]['parent']
if parent != -1:
nested[parent]['direct_children_locations'].append(nested[i]['location'])
#GFF
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
for i in range(len(nested)):
#insert BaseLine
features.append(SeqFeature(FeatureLocation(nested[i]['location'][0], nested[i]['location'][1]),
type='te_base',
strand=0,
qualifiers={'ID': 'TE_BASE {}'.format(i)}))
#Insert croppedIntervals
children = nested[i]['direct_children_locations']
cropped = crop(nested[i]['location'], children)
for subinterval in cropped:
features.append(SeqFeature(FeatureLocation(subinterval[0],subinterval[1]),
type='te',
strand=0,
qualifiers={'ID': 'TE {}'.format(i), 'Parent': 'TE_BASE {}'.format(i)}))
#Insert elements (domains, ppt, pbs, ...)
j = 0
for domain in nested[i]['features']['domains']:
sign = (lambda x: x and (1, -1)[x < 0])(domain['frame'][0])
if sign < 0:
domain['location'] = [domain['location'][1], domain['location'][0]]
#features.append(SeqFeature(FeatureLocation(domain['location'][0], domain['location'][1]),
# type='domain_base',
# strand=sign,
# qualifiers={'ID': 'DOMAIN {}-{}'.format(i,j), 'Parent': 'TE_BASE {}'.format(i)}))
overlap = [x for x in children if contains(domain['location'], x)]
cropped_domain = crop(domain['location'], overlap)
for part in cropped_domain:
features.append(SeqFeature(FeatureLocation(part[0], part[1]),
type='domain',
strand=sign,
#qualifiers={'Parent': 'DOMAIN {}-{}'.format(i,j)}))
qualifiers={'Parent': 'TE_BASE {}'.format(i)}))
j += 1
style.load_file('gt.style')
#Feature index
feature_index = FeatureIndexMemory()
feature_index.add_gff3file('data/' + gene_id + '.gff')
#create diagram for first sequence ID
seqid = feature_index.get_first_seqid()
range = feature_index.get_range_for_seqid(seqid)
diagram = Diagram.from_index(feature_index, seqid, range, style)
#create layout
canvas = CanvasCairoFile(style, 1400, height)
#canvas = CanvasCairoFilePDF(style, 1400, height)