Skip to content
Snippets Groups Projects
Commit 493e3575 authored by rlapar's avatar rlapar
Browse files

restructuring project

parent e3125900
No related branches found
No related tags found
1 merge request!6Reengineering
*.pyc
python/*pyc
python/*.pyc
data
#!/usr/bin/env python3
import sys
import os
import shutil
import click
......@@ -7,6 +8,7 @@ import click
from Bio import SeqIO
from python.nester import Nester
from python import sketch
def setup():
if not os.path.exists('data'):
......@@ -27,23 +29,27 @@ def main(input_fasta, sketch_only):
#RUN
sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
for sequence in sequences:
if not os.path.exists('data/{}'.format(sequence.id)):
os.makedirs('data/{}'.format(sequence.id))
if not os.path.exists('data/{}/TE'.format(sequence.id)):
os.makedirs('data/{}/TE'.format(sequence.id))
print('Running Nester for {}...'.format(sequence.id))
nester = Nester(sequence)
#TODO
#createGFF nester.nestedList
#TODO
#sketch only without nester
#remove
for a in nester.nestedList:
print(a.location)
#remove
try:
if not os.path.exists('data/{}'.format(sequence.id)):
os.makedirs('data/{}'.format(sequence.id))
if not os.path.exists('data/{}/TE'.format(sequence.id)):
os.makedirs('data/{}/TE'.format(sequence.id))
print('Running Nester for {}...'.format(sequence.id))
if not sketch_only:
nester = Nester(sequence)
#for a in nester.nestedList:
# print(a)
# for b in a.features['domains']:
# print('--------------------')
# print(b)
# print('##########################')
sketch.createGFF(sequence.id, sequence.seq, nester.nestedList)
sketch.sketch(sequence.id)
except KeyboardInterrupt:
raise
#except:
# print('Unexpected error:', sys.exc_info()[0])
cleanup()
......
#!/usr/bin/env python3
import copy
from python import te as pythonTe
from python import domain as pythonDomain
from python.transposonGraph import TransposonGraph
class Gene(object):
def __init__(self, seqid=None, sequence=None):
......@@ -29,13 +32,7 @@ class Gene(object):
return self.teList[scores.index(max(scores))]
def _evaluateTe(self, te):
#TODO scores and features (domains)
#TODO genegraph
#graph = Genegraph()
#score, features = graph.getScore()
te.score = 1 / float(te.location[1] - te.location[0])
te.features = None
return te.score
\ No newline at end of file
graph = TransposonGraph(te, self.domainList)
te.score, te.features = graph.getScore()
te.score /= float(te.location[1] - te.location[0])
return te.score
def expandInterval(source, target):
def expand(source, target):
length = (source[1] - source[0])
if source[0] <= target[0]:
target[0] += length
if source[0] <= target[1]:
target[1] += length
return target
\ No newline at end of file
return target
def compare(a, b):
return -1 if a[1] < b[0] else (0 if a[1] == b[0] else 1)
#a contains b as subinterval
def contains(a, b):
return a[0] <= b[0] and a[1] >= b[1]
#remove a from b
def remove(a, b):
return [[b[0],a[0]], [a[1], b[1]]]
#get intervals of a cropped by b
def crop(a, b):
if not len(b):
return [a]
#sort children
b.sort(key=lambda x: x[0])
cropped = [[a[0], b[0][0]]]
for i in range(len(b) - 1):
cropped.append([b[i][1], b[i+1][0]])
cropped.append([b[-1][1], a[1]])
return cropped
......@@ -6,24 +6,26 @@ from python.gene import Gene
class Nester(object):
def __init__(self, sequence):
self._seqid = sequence.id
self._sequence = sequence.seq
self.seqid = sequence.id
self.sequence = sequence.seq
self.nestedList = []
self._findNestedTransposonList()
def _findNestedTransposonList(self):
self.nestedList = self._getUnexpandedTransposonList(self._sequence)
self.nestedList = self._getUnexpandedTransposonList(self.sequence)
self._expandTransposonList()
def _expandTransposonList(self):
for i in reversed(range(len(self.nestedList) - 1)):
for j in range(i + 1, len(self.nestedList)):
self.nestedList[j].location = intervals.expandInterval(self.nestedList[i].location, self.nestedList[j].location)
#TODO expand domains
#TODO expand ppt, pbs
self.nestedList[j].location = intervals.expand(self.nestedList[i].location, self.nestedList[j].location)
for domain in self.nestedList[j].features['domains']:
domain.location = intervals.expand(self.nestedList[i].location, domain.location)
self.nestedList[j].features['ppt'] = intervals.expand(self.nestedList[i].location, self.nestedList[j].features['ppt'])
self.nestedList[j].features['pbs'] = intervals.expand(self.nestedList[i].location, self.nestedList[j].features['pbs'])
def _getUnexpandedTransposonList(self, sequence):
gene = Gene(self._seqid, sequence)
gene = Gene(self.seqid, sequence)
bestCandidate = gene.getBestCandidate()
if not bestCandidate:
......
import pprint
import subprocess
import config
from Bio import SeqIO
from BCBio import GFF
......@@ -8,151 +6,82 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
try:
from gt.core import *
from gt.extended import *
from gt.annotationsketch import *
from gt.annotationsketch.custom_track import CustomTrack
from gt.core.gtrange import Range
except ImportError:
print 'Warning: Could not import gt, sketch needs to be called externally!'
from python import config
from python import intervals
#a contains b as subinterval
def contains(a,b):
return a[0] <= b[0] and a[1] >= b[1]
#remove a from b
def remove(a,b):
return [[b[0],a[0]], [a[1], b[1]]]
#get intervals of a cropped by b
def crop(a, b):
if not len(b):
return [a]
#sort children
b.sort(key=lambda x: x[0])
cropped = [[a[0], b[0][0]]]
for i in range(len(b) - 1):
cropped.append([b[i][1], b[i+1][0]])
cropped.append([b[-1][1], a[1]])
return cropped
def sketch(nested, genes):
print 'Running GT annotation sketch...'
for g in genes:
createGFF(genes[g], nested[g])
visualize(g)
def createGFF(gene, nested):
def createGFF(id, sequence, nestedList):
#find closest parent
if nested:
nested[-1]['parent'] = -1
for i in range(len(nested) - 1):
nested[i]['parent'] = -1
for j in range(i+1, len(nested)):
if contains(nested[j]['location'], nested[i]['location']):
nested[i]['parent'] = j
parents = [-1] * len(nestedList)
for i in range(len(parents) - 1):
for j in range(i + 1, len(parents)):
if intervals.contains(nestedList[j].location, nestedList[i].location):
parents[i] = j
break
#append direct children
for i in reversed(range(len(nested))):
nested[i]['direct_children_locations'] = []
parent = nested[i]['parent']
#append direct children
directChildren = [[] for i in range(len(nestedList))]
for i in reversed(range(len(directChildren))):
parent = parents[i]
if parent != -1:
nested[parent]['direct_children_locations'].append(nested[i]['location'])
directChildren[parent].append(nestedList[i].location)
#GFF
rec = SeqRecord(gene['sequence'], gene['id'])
rec = SeqRecord(sequence, id)
features = []
for i in range(len(nested)):
for i in range(len(nestedList)):
#insert BaseLine
features.append(SeqFeature(FeatureLocation(nested[i]['location'][0], nested[i]['location'][1]),
type='te_base',
strand=0,
qualifiers={'ID': 'TE_BASE {}'.format(i)}))
#Insert croppedIntervals
children = nested[i]['direct_children_locations']
cropped = crop(nested[i]['location'], children)
open('data/{}/TE/{}.fa'.format(gene['id'], i), 'w').close
features.append(SeqFeature(
FeatureLocation(nestedList[i].location[0], nestedList[i].location[1]),
type='te_base',
strand=0,
qualifiers={'ID': 'TE_BASE {}'.format(i)}
))
#insert element cropped by its children
#save element fasta
subseq = Seq('')
children = directChildren[i]
cropped = intervals.crop(nestedList[i].location, children)
for subinterval in cropped:
subseq += gene['sequence'][subinterval[0]: (subinterval[1] + 1)]
features.append(SeqFeature(FeatureLocation(subinterval[0],subinterval[1]),
type='te',
strand=0,
qualifiers={'ID': 'TE {}'.format(i), 'Parent': 'TE_BASE {}'.format(i)}))
with open('data/{}/TE/{}.fa'.format(gene['id'], i), 'a') as outfile:
SeqIO.write(SeqRecord(subseq, id=gene['id'], description='TE-{}'.format(i)),
outfile,
'fasta')
#Insert elements (domains, ppt, pbs, ...)
subseq += sequence[subinterval[0]: (subinterval[1] + 1)]
features.append(SeqFeature(
FeatureLocation(subinterval[0], subinterval[1]),
type='te',
strand=0,
qualifiers={'ID': 'TE {}'.format(i), 'Parent': 'TE_BASE {}'.format(i)}
))
with open('data/{}/TE/{}.fa'.format(id, i), 'w') as outfile:
SeqIO.write(
SeqRecord(subseq, id=id, description='TE-{}'.format(i)),
outfile,
'fasta'
)
#insert domains
j = 0
for domain in nested[i]['features']['domains']:
sign = (lambda x: x and (1, -1)[x < 0])(domain['frame'][0])
for domain in nestedList[i].features['domains']:
sign = (lambda x: x and (1, -1)[x < 0])(domain.frame[0])
if sign < 0:
domain['location'] = [domain['location'][1], domain['location'][0]]
overlap = [x for x in children if contains(domain['location'], x)]
domain.location = [domain.location[1], domain.location[0]]
cropped_domain = crop(domain['location'], overlap)
overlap = [x for x in children if intervals.contains(domain.location, x)]
cropped_domain = intervals.crop(domain.location, overlap)
for part in cropped_domain:
features.append(SeqFeature(FeatureLocation(part[0], part[1]),
type=domain['type'],
strand=sign,
#qualifiers={'Parent': 'DOMAIN {}-{}'.format(i,j)}))
qualifiers={'ID': 'DOMAIN {}-{}'.format(i,j),'Parent': 'TE_BASE {}'.format(i)}))
features.append(SeqFeature(
FeatureLocation(part[0], part[1]),
type=domain.type,
strand=sign,
qualifiers={'ID': 'DOMAIN {}-{}'.format(i,j),'Parent': 'TE_BASE {}'.format(i)}
))
j += 1
rec.features = features
#possible other formats as well
with open('data/{}/'.format(gene['id']) + gene['id'] + '.gff', 'w+') as out_handle:
gff = GFF.write([rec], out_handle)
def visualize(gene_id):
if config.call_gt_sketch_externally:
process = subprocess.Popen([config.gt_sketch_path] + config.gt_sketch_args + ['data/{}/{}.png'.format(gene_id, gene_id)] + ['data/{}/{}.gff'.format(gene_id, gene_id)])
return
#Visualize using python
#Style
style = Style()
style.load_file('gt.style')
#Feature index
feature_index = FeatureIndexMemory()
#add gff file
feature_index.add_gff3file('data/{}/'.format(gene_id) + gene_id + '.gff')
#create diagram for first sequence ID
seqid = feature_index.get_first_seqid()
range = feature_index.get_range_for_seqid(seqid)
diagram = Diagram.from_index(feature_index, seqid, range, style)
#create layout
layout = Layout(diagram, 1400, style)
height = layout.get_height()
#create canvas
canvas = CanvasCairoFile(style, 1400, height)
#canvas = CanvasCairoFilePDF(style, 1400, height)
rec.features = features
#other formats possible
with open('data/{}/{}.gff'.format(id, id), 'w+') as out_handle:
GFF.write([rec], out_handle)
#sketch layout on canvas
layout.sketch(canvas)
def sketch(id):
process = subprocess.Popen([config.gt_sketch_path] + config.gt_sketch_args + ['data/{}/{}.png'.format(id, id)] + ['data/{}/{}.gff'.format(id, id)])
#write to file
pngfile = 'data/{}/'.format(gene_id) + gene_id + '.png'
canvas.to_file(pngfile)
......@@ -13,13 +13,24 @@ class TE(object):
self.ppt = entry['ppt']
self.pbs = entry['pbs']
self.location = entry['location']
self.ltrLeftLocation = [
entry['location'][0],
entry['location'][0] + int(entry['ltr len'].split(',')[0])
]
self.ltrRightLocation = [
entry['location'][1] - int(entry['ltr len'].split(',')[1]),
entry['location'][1]
]
self.features = None
self.score = None
def __str__(self):
lines = ['{{location = {},'.format(self.location),
' left ltr = {},'.format(self.ltrLeftLocation),
' right ltr = {},'.format(self.ltrRightLocation),
' ppt = {},'.format(self.ppt),
' pbs = {},'.format(self.pbs),
#' features = {},'.format(self.features),
' score = {}}}'.format(self.score)]
return '\n'.join(lines)
......
#!/usr/bin/env python3
import math
import copy
import networkx as nx
from python import intervals
#COPIA
#[LTR, PBS, GAG, AP, INT, RT, RNaseH, PPT, LTR]
#GYPSY
#[LTR, PBS, GAG, AP, RT, RNaseH, INT, PPT, LTR]
class TransposonGraph(object):
def __init__(self, te, domainList):
self._graph = None
self._buildGraph(te, domainList)
def _buildGraph(self, te, domainList):
#TODO
#check negative strands
self._graph = nx.DiGraph()
self._addNodes(te, domainList)
self._addEdges()
def _addNodes(self, te, domainsList):
#LTR nodes
self._graph.add_node(
n='ltr_left',
location=te.ltrLeftLocation,
score=1,
node_class='ltr_left',
strand='0',
features={}
)
self._graph.add_node(
n='ltr_right',
location=te.ltrRightLocation,
score=1,
node_class='ltr_right',
strand='0',
features={}
)
#PBS/PPT nodes
if not math.isnan(te.ppt[0]):
location = copy.deepcopy(te.ppt)
strand = '0'
if location[0] > location[1]:
location = [location[1], location[0]]
strand = '-'
if (intervals.compare(te.ltrLeftLocation, location) != 1
and intervals.compare(location, te.ltrRightLocation) != 1):
self._graph.add_node(
n='ppt',
location=location,
score=1,
node_class='ppt',
strand=strand,
features={}
)
if not math.isnan(te.pbs[0]):
location = copy.deepcopy(te.pbs)
strand = '0'
if location[0] > location[1]:
location = [location[1], location[0]]
strand = '-'
if (intervals.compare(te.ltrLeftLocation, location) != 1
and intervals.compare(location, te.ltrRightLocation) != 1):
self._graph.add_node(
n='pbs',
location=location,
score=1,
node_class='pbs',
strand=strand,
features={}
)
#DOMAINS
#TODO negative strands
i = 0
for domain in domainsList:
if domain.type not in ['GAG', 'AP', 'INT', 'RT', 'RNaseH']:
continue
if (intervals.compare(te.ltrLeftLocation, domain.location) != 1
and intervals.compare(domain.location, te.ltrRightLocation) != 1):
self._graph.add_node(
n='domain_{}'.format(i),
location=domain.location,
score=domain.score,
node_class=domain.type,
strand='+',
features={'domain': domain}
)
i += 1
def _addEdges(self):
for node1 in self._graph.nodes(data=True):
for node2 in self._graph.nodes(data=True):
self._addEdge(node1, node2)
def _addEdge(self, node1, node2):
if node1 == node2 or node1[1]['node_class'] == node2[1]['node_class']:
return
copia = ['ltr_left', 'pbs', 'GAG', 'AP', 'INT', 'RT', 'RNaseH', 'ppt', 'ltr_right']
#node1 before node2
if intervals.compare(node1[1]['location'], node2[1]['location']) != 1:
index1 = copia.index(node1[1]['node_class'])
index2 = copia.index(node2[1]['node_class'])
diff = index2 - index1
if diff == 1:
self._graph.add_edge(node1[0], node2[0], weight=float(1)/node2[1]['score'])
elif diff > 1:
self._graph.add_edge(node1[0], node2[0], weight=1000 * diff)
else:
self._graph.add_edge(node1[0], node2[0], weight=1000 * 2 * (-diff))
def getScore(self):
pathScore = 0
pathFeatures = {
'domains': [],
'pbs': [float('nan'), float('nan')],
'ppt': [float('nan'), float('nan')]
}
path = nx.dijkstra_path(self._graph, 'ltr_left', 'ltr_right', 'weight')[1:-1]
scores = nx.get_node_attributes(self._graph, 'score')
locations = nx.get_node_attributes(self._graph, 'location')
features = nx.get_node_attributes(self._graph, 'features')
for node in path:
pathScore += scores[node]
if node == 'pbs' or node == 'ppt':
pathFeatures[node] = locations[node]
else:
pathFeatures['domains'].append(features[node]['domain'])
return pathScore, pathFeatures
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment