Loading nested/cli/nester.py +2 −1 Original line number Original line Diff line number Diff line Loading @@ -56,6 +56,7 @@ def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, ou nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver) nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) sketcher.create_trf_gff(nester.trf, nester.seqid, dirpath=output_folder) if sketch: if sketch: if format != 'default': if format != 'default': sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) Loading nested/core/nester.py +8 −3 Original line number Original line Diff line number Diff line Loading @@ -30,11 +30,14 @@ class Nester(object): self.threshold = threshold self.threshold = threshold self.multiplier = multiplier self.multiplier = multiplier self.dependency_resolver = dependency_resolver self.dependency_resolver = dependency_resolver self.trf = [] self._find_nesting() self._find_nesting() def _find_nesting(self): def _find_nesting(self): # tandem repeat finder # tandem repeat finder nested_list = tandemRepeatFinder.run(self.seqid, self.sequence) self.trf = tandemRepeatFinder.run(self.seqid, self.sequence) nested_list = self.trf[:] cropped_sequence = self._crop_sequence(nested_list, self.sequence) cropped_sequence = self._crop_sequence(nested_list, self.sequence) nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons Loading Loading @@ -102,8 +105,10 @@ class Nester(object): nested_list[j].ltr_right_location) nested_list[j].ltr_right_location) for domain in nested_list[j].features['domains']: for domain in nested_list[j].features['domains']: domain.location = intervals.expand(nested_list[i].location, domain.location) domain.location = intervals.expand(nested_list[i].location, domain.location) if 'ppt' in nested_list[j].features.keys(): nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location, nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location, nested_list[j].features['ppt']) nested_list[j].features['ppt']) if 'pbs' in nested_list[j].features.keys(): nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs']) nested_list[j].features['pbs']) nested_list[j].tsr_left = intervals.expand(nested_list[i].location, nested_list[j].tsr_left = intervals.expand(nested_list[i].location, Loading nested/core/tandem_repeat.py +11 −10 Original line number Original line Diff line number Diff line Loading @@ -10,13 +10,14 @@ from nested.core.te import TE from nested.utils import intervals from nested.utils import intervals class TandemRepeat(TE): class TandemRepeat(TE): def __init__(self, loc, period_size, copies, matches, indels, scor, entropy): def __init__(self, loc, period_size, copies, matches, indels, scor, entropy, monomer): super().__init__(location=loc, score=scor) super().__init__(location=loc, score=scor) self.matches = matches self.matches = matches self.indels = indels self.indels = indels self.entropy = entropy self.entropy = entropy self.period_size = period_size self.period_size = period_size self.copies = copies self.copies = copies self.monomer = monomer def is_better(self, other): def is_better(self, other): if not isinstance(other, TandemRepeat): if not isinstance(other, TandemRepeat): Loading Loading @@ -56,23 +57,23 @@ def run(seqid, sequence): process = subprocess.Popen( process = subprocess.Popen( ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7), ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7), str(80), str(10), str(50), str(2000), '-m', '-d', '-h'], str(80), str(10), str(50), str(2000), '-m', '-d', '-h'], stdout=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) stderr=subprocess.DEVNULL) stdout, stderr = process.communicate() stdout, stderr = process.communicate() repeats = filter_candidates(get_candidates(stdout)) repeats = filter_candidates(get_candidates(seqid)) repeats.sort(key=lambda r: r.location[0], reverse=True) repeats.sort(key=lambda r: r.location[0], reverse=True) return repeats return repeats def get_candidates(raw_output): def get_candidates(seqid): entries = raw_output.decode('utf-8').split(os.linesep)[13:-1] candidates = [] candidates = [] for entry in entries: with open('{}.fa.2.5.7.80.10.50.2000.dat'.format(seqid)) as file: for entry in islice(file, 15, None): split = entry.split(' ') split = entry.split(' ') candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]), candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]), int(split[5]), int(split[6]), int(split[7]), float(split[12]))) int(split[5]), int(split[6]), int(split[7]), float(split[12]), split[-1])) return candidates return candidates def filter_candidates(candidates): def filter_candidates(candidates): Loading nested/core/te.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -27,7 +27,7 @@ class TE(object): def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0], def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0], ltr_left_location=[0, 0], ltr_right_location=[0, 0], ltr_left_location=[0, 0], ltr_right_location=[0, 0], tsr_left=[0, 0], tsr_right=[0, 0], features={}, score=None): tsr_left=[0, 0], tsr_right=[0, 0], features={'domains': []}, score=None): self.ppt = ppt self.ppt = ppt self.pbs = pbs self.pbs = pbs self.location = location self.location = location Loading nested/output/sketcher.py +6 −1 Original line number Original line Diff line number Diff line Loading @@ -6,6 +6,7 @@ import subprocess from nested.config.config import config, args_dict_to_list from nested.config.config import config, args_dict_to_list from nested.output.gff import GFFMaker from nested.output.gff import GFFMaker from nested.output.solo_gff import SoloGFFMaker from nested.output.solo_gff import SoloGFFMaker from nested.output.trf_gff import TrfGFFMaker DEFAULT_DIRPATH = 'data' DEFAULT_DIRPATH = 'data' Loading @@ -13,6 +14,7 @@ class Sketcher(object): def __init__(self): def __init__(self): self._gff_maker = GFFMaker() self._gff_maker = GFFMaker() self._solo_gff_maker = SoloGFFMaker() self._solo_gff_maker = SoloGFFMaker() self._trf_gff_maker = TrfGFFMaker() self._gff_path = '' self._gff_path = '' def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'): def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'): Loading @@ -24,6 +26,9 @@ class Sketcher(object): self._solo_gff_maker.create_solo_gff(solo_ltrs, path) self._solo_gff_maker.create_solo_gff(solo_ltrs, path) self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path) self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path) def create_trf_gff(self, trf, seqId, dirpath): path = os.path.join(dirpath, DEFAULT_DIRPATH) self._trf_gff_maker.create_gff(trf, seqId, path) def sketch(self, id, dirpath): def sketch(self, id, dirpath): path = os.path.join(dirpath, DEFAULT_DIRPATH) path = os.path.join(dirpath, DEFAULT_DIRPATH) Loading Loading
nested/cli/nester.py +2 −1 Original line number Original line Diff line number Diff line Loading @@ -56,6 +56,7 @@ def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, ou nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver) nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) sketcher.create_trf_gff(nester.trf, nester.seqid, dirpath=output_folder) if sketch: if sketch: if format != 'default': if format != 'default': sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) Loading
nested/core/nester.py +8 −3 Original line number Original line Diff line number Diff line Loading @@ -30,11 +30,14 @@ class Nester(object): self.threshold = threshold self.threshold = threshold self.multiplier = multiplier self.multiplier = multiplier self.dependency_resolver = dependency_resolver self.dependency_resolver = dependency_resolver self.trf = [] self._find_nesting() self._find_nesting() def _find_nesting(self): def _find_nesting(self): # tandem repeat finder # tandem repeat finder nested_list = tandemRepeatFinder.run(self.seqid, self.sequence) self.trf = tandemRepeatFinder.run(self.seqid, self.sequence) nested_list = self.trf[:] cropped_sequence = self._crop_sequence(nested_list, self.sequence) cropped_sequence = self._crop_sequence(nested_list, self.sequence) nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold) # find list of nested transposons Loading Loading @@ -102,8 +105,10 @@ class Nester(object): nested_list[j].ltr_right_location) nested_list[j].ltr_right_location) for domain in nested_list[j].features['domains']: for domain in nested_list[j].features['domains']: domain.location = intervals.expand(nested_list[i].location, domain.location) domain.location = intervals.expand(nested_list[i].location, domain.location) if 'ppt' in nested_list[j].features.keys(): nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location, nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location, nested_list[j].features['ppt']) nested_list[j].features['ppt']) if 'pbs' in nested_list[j].features.keys(): nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs']) nested_list[j].features['pbs']) nested_list[j].tsr_left = intervals.expand(nested_list[i].location, nested_list[j].tsr_left = intervals.expand(nested_list[i].location, Loading
nested/core/tandem_repeat.py +11 −10 Original line number Original line Diff line number Diff line Loading @@ -10,13 +10,14 @@ from nested.core.te import TE from nested.utils import intervals from nested.utils import intervals class TandemRepeat(TE): class TandemRepeat(TE): def __init__(self, loc, period_size, copies, matches, indels, scor, entropy): def __init__(self, loc, period_size, copies, matches, indels, scor, entropy, monomer): super().__init__(location=loc, score=scor) super().__init__(location=loc, score=scor) self.matches = matches self.matches = matches self.indels = indels self.indels = indels self.entropy = entropy self.entropy = entropy self.period_size = period_size self.period_size = period_size self.copies = copies self.copies = copies self.monomer = monomer def is_better(self, other): def is_better(self, other): if not isinstance(other, TandemRepeat): if not isinstance(other, TandemRepeat): Loading Loading @@ -56,23 +57,23 @@ def run(seqid, sequence): process = subprocess.Popen( process = subprocess.Popen( ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7), ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7), str(80), str(10), str(50), str(2000), '-m', '-d', '-h'], str(80), str(10), str(50), str(2000), '-m', '-d', '-h'], stdout=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) stderr=subprocess.DEVNULL) stdout, stderr = process.communicate() stdout, stderr = process.communicate() repeats = filter_candidates(get_candidates(stdout)) repeats = filter_candidates(get_candidates(seqid)) repeats.sort(key=lambda r: r.location[0], reverse=True) repeats.sort(key=lambda r: r.location[0], reverse=True) return repeats return repeats def get_candidates(raw_output): def get_candidates(seqid): entries = raw_output.decode('utf-8').split(os.linesep)[13:-1] candidates = [] candidates = [] for entry in entries: with open('{}.fa.2.5.7.80.10.50.2000.dat'.format(seqid)) as file: for entry in islice(file, 15, None): split = entry.split(' ') split = entry.split(' ') candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]), candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]), int(split[5]), int(split[6]), int(split[7]), float(split[12]))) int(split[5]), int(split[6]), int(split[7]), float(split[12]), split[-1])) return candidates return candidates def filter_candidates(candidates): def filter_candidates(candidates): Loading
nested/core/te.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -27,7 +27,7 @@ class TE(object): def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0], def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0], ltr_left_location=[0, 0], ltr_right_location=[0, 0], ltr_left_location=[0, 0], ltr_right_location=[0, 0], tsr_left=[0, 0], tsr_right=[0, 0], features={}, score=None): tsr_left=[0, 0], tsr_right=[0, 0], features={'domains': []}, score=None): self.ppt = ppt self.ppt = ppt self.pbs = pbs self.pbs = pbs self.location = location self.location = location Loading
nested/output/sketcher.py +6 −1 Original line number Original line Diff line number Diff line Loading @@ -6,6 +6,7 @@ import subprocess from nested.config.config import config, args_dict_to_list from nested.config.config import config, args_dict_to_list from nested.output.gff import GFFMaker from nested.output.gff import GFFMaker from nested.output.solo_gff import SoloGFFMaker from nested.output.solo_gff import SoloGFFMaker from nested.output.trf_gff import TrfGFFMaker DEFAULT_DIRPATH = 'data' DEFAULT_DIRPATH = 'data' Loading @@ -13,6 +14,7 @@ class Sketcher(object): def __init__(self): def __init__(self): self._gff_maker = GFFMaker() self._gff_maker = GFFMaker() self._solo_gff_maker = SoloGFFMaker() self._solo_gff_maker = SoloGFFMaker() self._trf_gff_maker = TrfGFFMaker() self._gff_path = '' self._gff_path = '' def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'): def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'): Loading @@ -24,6 +26,9 @@ class Sketcher(object): self._solo_gff_maker.create_solo_gff(solo_ltrs, path) self._solo_gff_maker.create_solo_gff(solo_ltrs, path) self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path) self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path) def create_trf_gff(self, trf, seqId, dirpath): path = os.path.join(dirpath, DEFAULT_DIRPATH) self._trf_gff_maker.create_gff(trf, seqId, path) def sketch(self, id, dirpath): def sketch(self, id, dirpath): path = os.path.join(dirpath, DEFAULT_DIRPATH) path = os.path.join(dirpath, DEFAULT_DIRPATH) Loading