tandem repeat finder integrated, not tested properly yet (86e4fdf4) · Commits · Matej Lexa / nested

nested/core/nester.py

+13 −2

Original line number	Original line	Diff line number	Diff line
	@@ -8,6 +8,7 @@ from nested.core.nested_element import NestedElement
	from nested.core.solo_ltrs import SoloLtrs		from nested.core.solo_ltrs import SoloLtrs
	from nested.logging.logger import NesterLogger		from nested.logging.logger import NesterLogger
	from nested.config.config import config		from nested.config.config import config
			import nested.core.tandem_repeat as tandemRepeatFinder


	class Nester(object):		class Nester(object):
	@@ -32,7 +33,11 @@ class Nester(object):
	self._find_nesting()		self._find_nesting()

	def _find_nesting(self):		def _find_nesting(self):
	nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons		# tandem repeat finder
			nested_list = tandemRepeatFinder.run(self.seqid, self.sequence)
			cropped_sequence = self._crop_sequence(nested_list, self.sequence)

			nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons
	nested_list = self._expand_transposon_list(nested_list)		nested_list = self._expand_transposon_list(nested_list)
	nested_list = self._filter_nested_list(nested_list)		nested_list = self._filter_nested_list(nested_list)
	self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)		self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
	@@ -87,8 +92,8 @@ class Nester(object):
	return nested_list		return nested_list

	def _expand_transposon_list(self, nested_list):		def _expand_transposon_list(self, nested_list):
	for i in reversed(range(len(nested_list) - 1)):
	# backwards expanding of intervals according to previously found and cropped elements		# backwards expanding of intervals according to previously found and cropped elements
			for i in reversed(range(len(nested_list) - 1)):
	for j in range(i + 1, len(nested_list)):		for j in range(i + 1, len(nested_list)):
	nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location)		nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location)
	nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location,		nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location,
	@@ -115,4 +120,10 @@ class Nester(object):
	result.append(te)		result.append(te)
	return result		return result

			def _crop_sequence(self, elements, sequence):
			cropped = sequence
			for element in elements:
			cropped = cropped[:element.location[0]] + cropped[element.location[1]:]
			return cropped

nested/core/tandem_repeat.py

0 → 100644

+104 −0

Original line number	Original line	Diff line number	Diff line
			#!/usr/bin/env python3
			import os
			import subprocess
			from itertools import islice

			from Bio import SeqIO
			from Bio.SeqRecord import SeqRecord

			from nested.core.te import TE
			from nested.utils import intervals

			class TandemRepeat(TE):
			    """A tandem repeat candidate, stored as a TE with extra statistics.

			    ``location`` and ``score`` are handed to the ``TE`` base class; the
			    remaining values (period size, number of copies, percentage of
			    matches, percentage of indels, entropy) are kept on the instance.
			    """

			    def __init__(self, loc, period_size, copies, matches, indels, scor, entropy):
			        """Store the repeat's location, score and descriptive statistics.

			        :param loc: location of the repeat, forwarded to ``TE`` as ``location``
			        :param period_size: period size of the repeat
			        :param copies: number of copies of the repeated unit
			        :param matches: percentage of matches
			        :param indels: percentage of indels
			        :param scor: score, forwarded to ``TE`` as ``score``
			        :param entropy: entropy value of the repeat
			        """
			        super().__init__(score=scor, location=loc)
			        self.period_size = period_size
			        self.copies = copies
			        self.matches = matches
			        self.indels = indels
			        self.entropy = entropy

			    def is_better(self, other):
			        """Tell whether this repeat should be preferred over `other`.

			        Four criteria are compared pairwise: more matches, fewer indels,
			        higher score and higher entropy. Winning more than two of them
			        means this repeat is better; winning exactly two is a tie that
			        is broken in favour of the strictly longer location. Anything
			        that is not a ``TandemRepeat`` is never beaten.

			        :param other: object to compare against
			        :return: True if this repeat is better, False otherwise
			        """
			        if not isinstance(other, TandemRepeat):
			            return False

			        # each comparison contributes 1 when this repeat wins it
			        wins = sum((
			            self.matches > other.matches,
			            self.indels < other.indels,
			            self.score > other.score,
			            self.entropy > other.entropy,
			        ))
			        if wins != 2:
			            return wins > 2
			        # tie on criteria: the longer interval wins
			        return intervals.length(self.location) > intervals.length(other.location)

			    def __str__(self):
			        """Return a multi-line, human-readable description of the repeat."""
			        labelled = (
			            ('location: {}', self.location),
			            ('period size: {}', self.period_size),
			            ('number of copies: {}', self.copies),
			            ('% of matches: {}', self.matches),
			            ('% of indels: {}', self.indels),
			            ('score: {}', self.score),
			            ('entropy: {}', self.entropy),
			        )
			        return os.linesep.join(template.format(value) for template, value in labelled)

			def run(seqid, sequence):
			    """Run the external ``trf`` tool on one sequence and return its repeats.

			    The sequence is written as a FASTA record to
			    ``/tmp/nested/trf/<seqid>.fa``, ``trf`` is executed on that file, and
			    its standard output is parsed with ``get_candidates`` and reduced
			    with ``filter_candidates``.

			    NOTE(review): the parsing step assumes the repeat table is present on
			    ``trf``'s standard output; confirm this against the installed TRF
			    version, since ``-d`` also asks TRF to write a data file.

			    :param seqid: sequence identifier; used as the FASTA record id and as
			        the temporary file name
			    :param sequence: sequence object accepted by ``SeqRecord``
			    :return: list of ``TandemRepeat`` sorted by start coordinate,
			        highest start first
			    :raises OSError: if the working directory or FASTA file cannot be
			        created, or the ``trf`` executable cannot be started
			    """
			    work_dir = '/tmp/nested/trf'
			    # exist_ok avoids the check-then-create race when several processes
			    # start at once; intermediate '/tmp/nested' is created as well.
			    os.makedirs(work_dir, exist_ok=True)

			    fasta_path = '{}/{}.fa'.format(work_dir, seqid)
			    with open(fasta_path, 'w') as tmp_file:
			        SeqIO.write(SeqRecord(sequence, id=seqid), tmp_file, 'fasta')

			    # Positional TRF parameters, in order: match, mismatch, delta, PM, PI,
			    # minscore, maxperiod. Flags: -m masked sequence file, -d data file,
			    # -h suppress HTML output.
			    command = ['trf', fasta_path,
			               '2', '5', '7', '80', '10', '50', '2000',
			               '-m', '-d', '-h']
			    process = subprocess.Popen(command,
			                               stdout=subprocess.PIPE,
			                               stderr=subprocess.PIPE)

			    # stderr is drained so the child cannot block on a full pipe, but it
			    # is not inspected; the exit status is not checked either.
			    stdout, _ = process.communicate()

			    repeats = filter_candidates(get_candidates(stdout))
			    repeats.sort(key=lambda r: r.location[0], reverse=True)
			    return repeats

			def get_candidates(raw_output):
			    """Parse raw TRF output bytes into ``TandemRepeat`` candidates.

			    The output is decoded as UTF-8 and split on ``os.linesep``. The first
			    13 lines and the final piece after the last separator are discarded;
			    every remaining line is split on single spaces and converted into one
			    ``TandemRepeat``.

			    NOTE(review): the fixed 13-line skip presumably corresponds to the TRF
			    header; confirm against the actual output of the TRF version in use.

			    :param raw_output: bytes captured from the TRF process
			    :return: list of ``TandemRepeat`` in the order the lines appear
			    :raises ValueError: if a field of a data line is not numeric
			    :raises IndexError: if a data line has fewer than 13 fields
			    """
			    text = raw_output.decode('utf-8')
			    rows = text.split(os.linesep)[13:-1]

			    found = []
			    for row in rows:
			        fields = row.split(' ')
			        # fields 0/1: start and end, 2: period size, 3: copies,
			        # 5: matches, 6: indels, 7: score, 12: entropy
			        repeat = TandemRepeat(
			            loc=[int(fields[0]), int(fields[1])],
			            period_size=int(fields[2]),
			            copies=float(fields[3]),
			            matches=int(fields[5]),
			            indels=int(fields[6]),
			            scor=int(fields[7]),
			            entropy=float(fields[12]),
			        )
			        found.append(repeat)
			    return found

			def filter_candidates(candidates):
			    """Select the repeats to keep from a list of candidates.

			    The input list is sorted in place by start coordinate. A candidate is
			    kept unless one of two checks rejects it:

			    1. It intersects another candidate that it does not contain, and
			       ``is_better`` says it is not better than that candidate.
			    2. An already accepted repeat contains it.

			    Both checks are always evaluated, even if the first one has already
			    rejected the candidate.

			    NOTE(review): ``intervals.contains(a, b)`` is read here as "a contains
			    b" and ``intervals.intersect`` as an overlap test; confirm against
			    ``nested.utils.intervals``.

			    :param candidates: list of repeat candidates; sorted in place
			    :return: new list with the accepted candidates, ordered by start
			    """
			    def start_of(item):
			        return item.location[0]

			    candidates.sort(key=start_of)
			    accepted = []

			    for current in candidates:
			        keep = True

			        # Check 1: compare against every candidate that starts no later
			        # than the end of the current one (the list is sorted by start).
			        for rival in candidates:
			            if start_of(rival) > current.location[1]:
			                break
			            if intervals.contains(current.location, rival.location):
			                continue
			            if not intervals.intersect(current.location, rival.location):
			                continue
			            if not current.is_better(rival):
			                keep = False
			                break

			        # Check 2: drop the candidate if an accepted repeat covers it.
			        for winner in accepted:
			            if start_of(winner) > current.location[1]:
			                break
			            if intervals.contains(winner.location, current.location):
			                keep = False
			                break

			        if keep:
			            accepted.append(current)

			    return accepted