Loading nested/core/nester.py +13 −2 Original line number Original line Diff line number Diff line Loading @@ -8,6 +8,7 @@ from nested.core.nested_element import NestedElement from nested.core.solo_ltrs import SoloLtrs from nested.core.solo_ltrs import SoloLtrs from nested.logging.logger import NesterLogger from nested.logging.logger import NesterLogger from nested.config.config import config from nested.config.config import config import nested.core.tandem_repeat as tandemRepeatFinder class Nester(object): class Nester(object): Loading @@ -32,7 +33,11 @@ class Nester(object): self._find_nesting() self._find_nesting() def _find_nesting(self): def _find_nesting(self): nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons # tandem repeat finder nested_list = tandemRepeatFinder.run(self.seqid, self.sequence) cropped_sequence = self._crop_sequence(nested_list, self.sequence) nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons nested_list = self._expand_transposon_list(nested_list) nested_list = self._expand_transposon_list(nested_list) nested_list = self._filter_nested_list(nested_list) nested_list = self._filter_nested_list(nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) Loading Loading @@ -87,8 +92,8 @@ class Nester(object): return nested_list return nested_list def _expand_transposon_list(self, nested_list): def _expand_transposon_list(self, nested_list): for i in reversed(range(len(nested_list) - 1)): # backwards expanding of intervals according to previously found and cropped elements # backwards expanding of intervals according to previously found and cropped elements for i in reversed(range(len(nested_list) - 1)): for j in range(i + 1, len(nested_list)): for j in range(i + 1, len(nested_list)): nested_list[j].location = 
intervals.expand(nested_list[i].location, nested_list[j].location) nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location) nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location, nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location, Loading @@ -115,4 +120,10 @@ class Nester(object): result.append(te) result.append(te) return result return result def _crop_sequence(self, elements, sequence): cropped = sequence for element in elements: cropped = cropped[:element.location[0]] + cropped[element.location[1]:] return cropped nested/core/tandem_repeat.py 0 → 100644 +104 −0 Original line number Original line Diff line number Diff line #!/usr/bin/env python3 import os import subprocess from itertools import islice from Bio import SeqIO from Bio.SeqRecord import SeqRecord from nested.core.te import TE from nested.utils import intervals class TandemRepeat(TE): def __init__(self, loc, period_size, copies, matches, indels, scor, entropy): super().__init__(location=loc, score=scor) self.matches = matches self.indels = indels self.entropy = entropy self.period_size = period_size self.copies = copies def is_better(self, other): if not isinstance(other, TandemRepeat): return False score = 0 score += self.matches > other.matches score += self.indels < other.indels score += self.score > other.score score += self.entropy > other.entropy if score == 2: return intervals.length(self.location) > intervals.length(other.location) return score > 2 def __str__(self): lines = ['location: {}'.format(self.location), 'period size: {}'.format(self.period_size), 'number of copies: {}'.format(self.copies), '% of matches: {}'.format(self.matches), '% of indels: {}'.format(self.indels), 'score: {}'.format(self.score), 'entropy: {}'.format(self.entropy)] return os.linesep.join(lines) def run(seqid, sequence): if not os.path.exists('/tmp/nested'): os.makedirs('/tmp/nested') if not os.path.exists('/tmp/nested/trf'): 
os.makedirs('/tmp/nested/trf') with open('/tmp/nested/trf/{}.fa'.format(seqid), 'w+') as tmp_file: SeqIO.write(SeqRecord(sequence, id=seqid), tmp_file, 'fasta') process = subprocess.Popen( ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7), str(80), str(10), str(50), str(2000), '-m', '-d', '-h'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() repeats = filter_candidates(get_candidates(stdout)) repeats.sort(key=lambda r: r.location[0], reverse=True) return repeats def get_candidates(raw_output): entries = raw_output.decode('utf-8').split(os.linesep)[13:-1] candidates = [] for entry in entries: split = entry.split(' ') candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]), int(split[5]), int(split[6]), int(split[7]), float(split[12]))) return candidates def filter_candidates(candidates): candidates.sort(key=lambda r: r.location[0]) repeats = [] for candidate in candidates: add = True for candidate2 in candidates: if candidate2.location[0] > candidate.location[1]: break if intervals.contains(candidate.location, candidate2.location): continue elif intervals.intersect(candidate.location, candidate2.location): if not candidate.is_better(candidate2): add = False break for repeat in repeats: if repeat.location[0] > candidate.location[1]: break if intervals.contains(repeat.location, candidate.location): add = False break if add: repeats.append(candidate) return repeats Loading
nested/core/nester.py +13 −2

@@ -8,6 +8,7 @@ from nested.core.nested_element import NestedElement
 from nested.core.solo_ltrs import SoloLtrs
 from nested.logging.logger import NesterLogger
 from nested.config.config import config
+import nested.core.tandem_repeat as tandemRepeatFinder
 
 
 class Nester(object):
@@ -32,7 +33,11 @@ class Nester(object):
         self._find_nesting()
 
     def _find_nesting(self):
-        nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold)  # find list of nested transposons
+        # tandem repeat finder
+        nested_list = tandemRepeatFinder.run(self.seqid, self.sequence)
+        cropped_sequence = self._crop_sequence(nested_list, self.sequence)
+
+        nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold)  # find list of nested transposons
         nested_list = self._expand_transposon_list(nested_list)
         nested_list = self._filter_nested_list(nested_list)
         self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
@@ -87,8 +92,8 @@ class Nester(object):
         return nested_list
 
     def _expand_transposon_list(self, nested_list):
-        for i in reversed(range(len(nested_list) - 1)):
         # backwards expanding of intervals according to previously found and cropped elements
+        for i in reversed(range(len(nested_list) - 1)):
             for j in range(i + 1, len(nested_list)):
                 nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location)
                 nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location,
@@ -115,4 +120,10 @@ class Nester(object):
                 result.append(te)
         return result
 
+    def _crop_sequence(self, elements, sequence):
+        cropped = sequence
+        for element in elements:
+            cropped = cropped[:element.location[0]] + cropped[element.location[1]:]
+        return cropped
+
 
#!/usr/bin/env python3
# nested/core/tandem_repeat.py
"""Tandem repeat detection backed by the external ``trf`` executable.

The module writes a sequence to a FASTA file, runs ``trf`` on it, parses the
text captured from the program's standard output into ``TandemRepeat``
objects and filters overlapping candidates.
"""

import os
import subprocess
from itertools import islice

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from nested.core.te import TE
from nested.utils import intervals

# Directory that receives the FASTA file handed to trf.
_TRF_WORK_DIR = '/tmp/nested/trf'

# Positional numeric arguments passed to trf after the input file name.
# Per the trf usage text these are: Match, Mismatch, Delta, PM, PI,
# Minscore, MaxPeriod.
_TRF_PARAMETERS = ('2', '5', '7', '80', '10', '50', '2000')

# Option flags appended after the numeric arguments.
_TRF_FLAGS = ('-m', '-d', '-h')

# Number of leading lines of the captured output skipped before parsing.
_TRF_HEADER_LINES = 13


class TandemRepeat(TE):
    """A tandem repeat candidate: a ``TE`` carrying the extra trf statistics."""

    def __init__(self, loc, period_size, copies, matches, indels, scor, entropy):
        """Store the repeat location, score and per-repeat statistics.

        Args:
            loc: Two-element location ``[start, end]``; forwarded to ``TE``.
            period_size: Period size parsed from the trf output.
            copies: Number of copies parsed from the trf output.
            matches: Percentage of matches.
            indels: Percentage of indels.
            scor: Alignment score; forwarded to ``TE`` as ``score``.
            entropy: Entropy value parsed from the trf output.
        """
        super().__init__(location=loc, score=scor)
        self.matches = matches
        self.indels = indels
        self.entropy = entropy
        self.period_size = period_size
        self.copies = copies

    def is_better(self, other):
        """Return True when this repeat beats ``other`` on the quality criteria.

        Four criteria are compared: more matches, fewer indels, higher score
        and higher entropy. Winning more than two of them is a win; winning
        exactly two is decided by the longer location. Anything that is not
        a ``TandemRepeat`` is never beaten.
        """
        if not isinstance(other, TandemRepeat):
            return False

        criteria_won = sum((
            self.matches > other.matches,
            self.indels < other.indels,
            self.score > other.score,
            self.entropy > other.entropy,
        ))

        if criteria_won == 2:
            # Tie on the criteria: prefer the longer repeat.
            return intervals.length(self.location) > intervals.length(other.location)
        return criteria_won > 2

    def __str__(self):
        lines = ['location: {}'.format(self.location),
                 'period size: {}'.format(self.period_size),
                 'number of copies: {}'.format(self.copies),
                 '% of matches: {}'.format(self.matches),
                 '% of indels: {}'.format(self.indels),
                 'score: {}'.format(self.score),
                 'entropy: {}'.format(self.entropy)]
        return os.linesep.join(lines)


def run(seqid, sequence):
    """Run trf on ``sequence`` and return the filtered tandem repeats.

    Args:
        seqid: Sequence identifier; used as the FASTA record id and as the
            stem of the temporary file name.
        sequence: Sequence object accepted by ``Bio.SeqRecord.SeqRecord``.

    Returns:
        A list of ``TandemRepeat`` objects sorted by start position in
        descending order.

    Raises:
        FileNotFoundError: If the ``trf`` executable cannot be found.
    """
    # exist_ok avoids the check-then-create race when several runs start
    # at the same time; intermediate directories are created as needed.
    os.makedirs(_TRF_WORK_DIR, exist_ok=True)

    # NOTE(review): seqid is interpolated into the path unchanged, exactly as
    # before; an id containing a path separator would escape the directory.
    fasta_path = '{}/{}.fa'.format(_TRF_WORK_DIR, seqid)
    with open(fasta_path, 'w') as tmp_file:
        SeqIO.write(SeqRecord(sequence, id=seqid), tmp_file, 'fasta')

    # trf is started only after the file is closed, so it never reads a
    # partially flushed FASTA file.
    # NOTE(review): the exit status is deliberately not checked, matching the
    # previous behaviour; trf is reported to exit non-zero on success, so
    # confirm before adding check=True.
    completed = subprocess.run(
        ['trf', fasta_path] + list(_TRF_PARAMETERS) + list(_TRF_FLAGS),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    repeats = filter_candidates(get_candidates(completed.stdout))
    # Descending start order: presumably so callers can cut repeats out of
    # the sequence back to front without shifting earlier coordinates.
    repeats.sort(key=lambda r: r.location[0], reverse=True)
    return repeats


def get_candidates(raw_output):
    """Parse captured trf output into ``TandemRepeat`` candidates.

    The first ``_TRF_HEADER_LINES`` lines and the final line-separator
    remainder are discarded. Each remaining non-blank line is split on
    whitespace; fields 0 and 1 give the location, 2 the period size, 3 the
    number of copies, 5 the matches, 6 the indels, 7 the score and 12 the
    entropy.

    Args:
        raw_output: Bytes captured from the trf process.

    Returns:
        A list of ``TandemRepeat`` objects in output order.

    Raises:
        ValueError, IndexError: If a non-blank line does not have the
            expected numeric fields.
    """
    entries = raw_output.decode('utf-8').split(os.linesep)[_TRF_HEADER_LINES:-1]
    candidates = []
    for entry in entries:
        fields = entry.split()
        if not fields:
            # A blank line carries no record; skip it instead of crashing.
            continue
        candidates.append(TandemRepeat([int(fields[0]), int(fields[1])],
                                       int(fields[2]),
                                       float(fields[3]),
                                       int(fields[5]),
                                       int(fields[6]),
                                       int(fields[7]),
                                       float(fields[12])))
    return candidates


def filter_candidates(candidates):
    """Select the repeats to keep from a list of overlapping candidates.

    A candidate is dropped when it intersects another candidate it is not
    better than, or when an already accepted repeat contains it. The input
    list is left unmodified.

    Args:
        candidates: Iterable of ``TandemRepeat`` objects.

    Returns:
        A new list of the accepted repeats, sorted by start position.
    """
    # Sort a copy: both scans below stop early based on start order, and the
    # caller's list should not be reordered as a side effect.
    ordered = sorted(candidates, key=lambda r: r.location[0])
    repeats = []
    for candidate in ordered:
        if _loses_to_overlapping(candidate, ordered):
            continue
        if _is_covered(candidate, repeats):
            continue
        repeats.append(candidate)
    return repeats


def _loses_to_overlapping(candidate, ordered):
    """Return True if ``candidate`` intersects a candidate it does not beat."""
    for other in ordered:
        if other.location[0] > candidate.location[1]:
            # Sorted by start: nothing further can overlap the candidate.
            break
        # presumably contains(a, b) tests whether a contains b - confirm
        # against nested.utils.intervals
        if intervals.contains(candidate.location, other.location):
            continue
        if (intervals.intersect(candidate.location, other.location)
                and not candidate.is_better(other)):
            return True
    return False


def _is_covered(candidate, accepted):
    """Return True if an already accepted repeat contains ``candidate``."""
    for repeat in accepted:
        if repeat.location[0] > candidate.location[1]:
            break
        if intervals.contains(repeat.location, candidate.location):
            return True
    return False