Skip to content
Snippets Groups Projects
Unverified Commit f8dc4d52 authored by rlapar's avatar rlapar Committed by GitHub
Browse files

Merge pull request #1 from rlapar/nesting-test-transposon

Nesting test transposon
parents 831e896a 21d1cb12
No related branches found
No related tags found
No related merge requests found
......@@ -20,11 +20,27 @@
style =
{
gene = {
te_base = {
-- Color definitions
stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
stroke_marked = {red=1.0, green=0.0, blue=0.0},
fill = {red=0.0, green=0.9, blue=1.0},
stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
fill = {red=0.8, green=0.8, blue=0.2, alpha = 1.0},
style = "box",
-- Collapsing options
collapse_to_parent = true,
split_lines = true,
-- Caption options
max_capt_show_width= nil,
-- Display this track only if the viewport is not wider than this
-- number of nucleotides. Set to 0 to disable type track.
max_show_width = nil,
},
--------------------------------------
gene = {
-- Color definitions
stroke = {red=0.0, green=0.0, blue=0.0, alpha = 0.2},
stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 0.2},
fill = {red=0.0, green=0.0, blue=0.0, alpha = 0.2},
style = "box",
-- Collapsing options
collapse_to_parent = false,
......
......@@ -10,19 +10,22 @@ def runBlastx(genes):
#print 'Running blastx...'
for g in genes:
genes[g]['domains'] = findDomains(genes[g])
return genes
def findDomains(gene):
domains = {'INT':[],'GAG':[],'AP':[],'RNaseH':[],'RT':[],'XX':[]}
blastx_cline = NcbiblastxCommandline(db=config.blastx_gydb_protein_db, outfmt=5,
num_threads=config.blastx_args['num_threads'], dbsize=config.blastx_args['dbsize'],
word_size=config.blastx_args['word_size'], evalue=config.blastx_args['evalue'])
xml_out, stderr = blastx_cline(stdin=str(gene['sequence']))
blast_records = NCBIXML.parse(StringIO(xml_out))
blast_record = next(blast_records).alignments
try:
blast_record = next(blast_records).alignments
except ValueError:
return domains
#print len(blast_record)
domains = {'INT':[],'GAG':[],'AP':[],'RNaseH':[],'RT':[],'XX':[]}
if blast_record:
for alignment in blast_record:
for hsp in alignment.hsps:
......
import sys
import pprint
class IntervalTree:
def __init__(self, intervalList, transposonList):
self.intervalList = intervalList
self.transposonList = transposonList
self.__buildTree()
def __buildTree(self):
self.root = TreeNode([-3000000000,3000000000], None)
for i in reversed(range(len(self.intervalList))):
self.root.addNode(self.intervalList[i], self.transposonList[i])
def toDict(self):
return self.root.toDict()
class TreeNode:
def __init__(self, interval, transposon):
self.children = []
self.interval = interval
self.parent = None
self.transposon = transposon
def contains(self, value):
return value >= self.interval[0] and value <= self.interval[1]
def setParent(self, node):
self.parent = node
def addNode(self, interval, transposon):
for child in self.children:
if child.contains(interval[0]):
child.addNode(interval, transposon)
return
node = TreeNode(interval, transposon)
node.setParent(self)
self.children.append(node)
def toDict(self):
node = {
'interval': self.interval,
'transposon': self.transposon,
'children': []
}
for child in self.children:
node['children'].append(child.toDict())
return node
......@@ -45,8 +45,11 @@ def runLtrFinder(genes):
transposons[seq_name] = parseLTRTable(s.split('\n')[1:])
#append to genes
for ltr in transposons:
genes[ltr]['ltr_finder'] =transposons[ltr]
for ltr in genes:
if ltr in transposons:
genes[ltr]['ltr_finder'] = transposons[ltr]
else:
genes[ltr]['ltr_finder'] = []
return genes
......
import math
import pprint
from BCBio import GFF
from Bio.Seq import Seq
......@@ -11,6 +12,62 @@ from gt.annotationsketch import *
from gt.annotationsketch.custom_track import CustomTrack
from gt.core.gtrange import Range
def containsSubinterval(a, b):
#a contains b as subinterval
return b[0] >= a[0] and b[1] <= a[1]
def removeSubinterval(a,b):
#remove subinterval b from a
return [[a[0], b[0]], [b[1], a[1]]]
def intervalsToGFF(gene, tree):
#crop intervals
croppedTree = []
for i in range(len(tree)):
#find direct children
directChildren = []
for j in reversed(range(0,i)):
directChild = True
for dc in directChildren:
if containsSubinterval(dc, tree[j]):
directChild = False
if not directChild:
continue
if containsSubinterval(tree[i], tree[j]):
directChildren.append(tree[j])
directChildren.sort()
node = [tree[i]]
for child in directChildren:
node = node[:-1] + removeSubinterval(node[-1], child)
croppedTree.append(node)
#GFF
rec = SeqRecord(gene['sequence'], gene['id'])
features = []
i = 0
for interval in croppedTree:
#insert base
feat = SeqFeature(FeatureLocation(interval[0][0], interval[-1][1]), type='gene', strand=0, qualifiers={'ID': 'Transposon {}'.format(i)})
feat.sub_features = []
#insert all intervals
for subinterval in interval:
feat.sub_features.append(SeqFeature(FeatureLocation(subinterval[0],subinterval[1]), type='te_base', strand=0))
features.append(feat)
i += 1
rec.features = features
#possible other formats as well
with open('data/' + gene['id'] + '.gff', 'w+') as out_handle:
gff = GFF.write([rec], out_handle)
def sketch(genes):
print 'Running GT annotation sketch...'
for g in genes:
intervalsToGFF(genes[g], genes[g]['nested']['intervals'])
#treeToGFF(genes[g]['nested']['tree'])
visualize(genes[g])
def geneToGFF(gene):
#insert gene
rec = SeqRecord(gene['sequence'], gene['id'])
......@@ -20,24 +77,25 @@ def geneToGFF(gene):
#insert LTR finder results
for transposon in gene['ltr_finder']:
LTR_feature = SeqFeature(FeatureLocation(transposon['Location'][0], transposon['Location'][1]),
LTR_feature = SeqFeature(FeatureLocation(transposon['location'][0], transposon['location'][1]),
type='LTR_finder_TE', strand=1)
LTR_feature.sub_features = []
if not math.isnan(transposon['PPT'][0]):
if not math.isnan(transposon['ppt'][0]):
strand = 1
if transposon['PPT'][0] > transposon['PPT'][1]:
if transposon['ppt'][0] > transposon['ppt'][1]:
strand = -1
LTR_feature.sub_features.append(SeqFeature(FeatureLocation(transposon['PPT'][0], transposon['PPT'][1]),
type='LTR_PPT', strand=1))
if not math.isnan(transposon['PBS'][0]):
LTR_feature.sub_features.append(SeqFeature(FeatureLocation(transposon['ppt'][0], transposon['ppt'][1]),
type='LTR_ppt', strand=1))
if not math.isnan(transposon['pbs'][0]):
strand = 1
if transposon['PBS'][0] > transposon['PBS'][1]:
if transposon['pbs'][0] > transposon['pbs'][1]:
strand = -1
LTR_feature.sub_features.append(SeqFeature(FeatureLocation(transposon['PBS'][0], transposon['PBS'][1]),
type='LTR_PBS', strand=1))
LTR_feature.sub_features.append(SeqFeature(FeatureLocation(transposon['pbs'][0], transposon['pbs'][1]),
type='LTR_pbs', strand=1))
features.append(LTR_feature)
#domain type: INT, GAG, AP, RT, RNaseH, Unkwnown
'''
domain_features = {}
domain_features['INT'] = SeqFeature(FeatureLocation(0, len(gene['sequence'])), type='INT', strand=0)
domain_features['GAG'] = SeqFeature(FeatureLocation(0, len(gene['sequence'])), type='GAG', strand=0)
......@@ -50,8 +108,8 @@ def geneToGFF(gene):
#insert domains
for domain in gene['domains']:
strand = 1
x,y = domain['Location'][0], domain['Location'][1]
if domain['Location'][0] > domain['Location'][1]:
x,y = domain['location'][0], domain['location'][1]
if domain['location'][0] > domain['location'][1]:
strand = -1
x,y = y,x
domain_type = domain['Type'].split('_')[0]
......@@ -61,14 +119,14 @@ def geneToGFF(gene):
for df in domain_features:
features.append(domain_features[df])
'''
rec.features = features
#possible other formats as well
with open('data/' + gene['id'] + '.gff', 'w+') as out_handle:
gff = GFF.write([rec], out_handle)
def visualize(gene):
geneToGFF(gene)
#geneToGFF(gene)
#Style
style = Style()
......
......@@ -4,11 +4,11 @@ import pprint
from python import config
from python import ltr as pythonLtr
from python import progressBar
#from python import progressBar
from python import sketch as pythonSketch
from python import domains as pythonBlast
from python import geneGraph
#from python import intervalTree as it
#from python import geneGraph
from python import intervalTree as it
def expandIntervals(intervals):
for i in range(len(intervals) - 2, -1, -1):
......@@ -23,21 +23,21 @@ def expandIntervals(intervals):
def findNestedTranspononsTree(fasta_sequences):
nested = findNestedTransposons(fasta_sequences)
genes = pythonLtr.getGeneDict(fasta_sequences)
for g in genes:
genes[g]['nested'] = nested[g]
#reconstruct
#ITERATE FROM THE BACK AND RECONSTRUCT TREE
for n in nested:
nested[n] = expandIntervals(nested[n])
print n, ':', nested[n]
#tree = it.IntervalTree(nested[n])
#tree.printIntervalList()
#intervals = [[200,400], [1200,1400], [800,1000], [700,900], [300,500], [100,400], [0,400]]
#print expandIntervals(intervals)
#intervals = [[200,300], [600,700], [100,700]]
for g in genes:
genes[g]['nested']['intervals'] = expandIntervals(genes[g]['nested']['intervals'])
print g, ':', genes[g]['nested']['intervals']
genes[g]['nested']['tree'] = it.IntervalTree(genes[g]['nested']['intervals'], genes[g]['nested']['transposons']).toDict()
pythonSketch.sketch(genes)
def findNestedTransposons(fasta_sequences):
#print '______________________'
genes = pythonLtr.getGeneDict(fasta_sequences)
#Find LTR pairs using LTR_finder
......@@ -55,26 +55,33 @@ def findNestedTransposons(fasta_sequences):
#SAVE COORDINATES FOR BACKTRACK
nested = {}
for g in genes:
if math.isnan(genes[g]['best_transposon'][0]):
nested[g] = []
if math.isnan(genes[g]['best_transposon']['location'][0]):
nested[g] = {
'intervals': [],
'transposons': []
}
else:
nested[g] = [genes[g]['best_transposon']]
nested[g] = {
'intervals': [genes[g]['best_transposon']['location']],
'transposons': [genes[g]['best_transposon']]
}
#CROP SEQUENCES AND RECURSIVELY CALL
cropped = False
cropped_sequences = []
for sequence in fasta_sequences:
if not math.isnan(genes[sequence.id]['best_transposon'][0]):
if not math.isnan(genes[sequence.id]['best_transposon']['location'][0]):
#print 'Cropped: ' + str(genes[sequence.id]['best_transposon'])
cropped = True
location = genes[sequence.id]['best_transposon']
location = genes[sequence.id]['best_transposon']['location']
cropped_sequences.append(sequence[:(location[0] - 1)] + sequence[(location[1] + 1):])
#RECURSIVELY CALL
if cropped:
nested_cropped = findNestedTransposons(cropped_sequences)
for a in nested_cropped:
nested[a] += nested_cropped[a]
nested[a]['intervals'] += nested_cropped[a]['intervals']
nested[a]['transposons'] += nested_cropped[a]['transposons']
return nested
......@@ -87,16 +94,23 @@ def findBestCandidate(genes):
#get best evaluated transposon
for gene in genes:
if not len(genes[gene]['ltr_finder']):
genes[gene]['best_transposon'] = [float('nan'), float('nan')]
genes[gene]['best_transposon'] = {
'location': [float('nan'), float('nan')]
}
else:
best_transposon = max(genes[gene]['ltr_finder'], key=lambda d: d['evaluated'])
genes[gene]['best_transposon'] = best_transposon['location']
best_transposon = max(genes[gene]['ltr_finder'], key=lambda d: d['evaluated']['score'])
genes[gene]['best_transposon'] = best_transposon
return genes
def evaluateTransposon(transposon, domains):
score = 0
#print 'Evaluation transposon: ' + transposon['seqid'] + ' ' + transposon['index']
features = {
'domains': [],
'pbs': None,
'ppt': None
}
#print 'Evaluation transposon: ' + transposon['seqid'] + ' ' + str(transposon['location'])
#Look for domains
for domain_type in domains:
......@@ -107,13 +121,21 @@ def evaluateTransposon(transposon, domains):
#found domain
if domain_location[0] >= transposon['location'][0] and domain_location[1] <= transposon['location'][1]:
score += domain['score']
features['domains'].append(domain)
#Check PBS
if not math.isnan(transposon['pbs'][0]):
score += 1000
features['pbs'] = transposon['pbs']
#Check PPT
if not math.isnan(transposon['ppt'][0]):
score += 1000
features['ppt'] = transposon['ppt']
return score
score /= float(transposon['location'][1] - transposon['location'][0])
return {
'score': score,
'features': features
}
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment