Loading nested/cli/generator.py +28 −3 Original line number Diff line number Diff line #!/usr/bin/env python3 import os import click from datetime import datetime Loading @@ -15,7 +16,13 @@ from nested.output.sketcher import Sketcher @click.option('--filter', '-f', is_flag=True, default=False, type=str, help='Filter database and create new one with given output db path.') @click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].') @click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].') def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset): @click.option('--percentage', '-p', type=int, help='Percentage of elements in generated sequence.') @click.option('--average_element', '-a', type=int, help='Average element length.') @click.option('--expected_length', '-e', type=int, help='Expected output sequence length [ONLY WORKS WITH -percentage and -average_element].') @click.option('--output_directory', '-d', default=os.getcwd(), type=str, help='Output directory.') def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset, percentage, average_element, expected_length, output_directory): #number_of_errors = 0 start_time = datetime.now() generator = Generator(input_db) Loading @@ -30,16 +37,34 @@ def main(input_db, output_db, baselength, number_of_iterations, number_of_elemen if baselength: params['baselength'] = baselength if number_of_iterations: params['number_of_iterations'] = number_of_iterations if number_of_elements: params['number_of_elements'] = number_of_elements if percentage and average_element: if expected_length: params['baselength'] = calc_baselength(expected_length, percentage) if 'baselength' in params.keys(): params['number_of_iterations'] = calc_number_of_iterations(params['baselength'], percentage, average_element) generator.generate_random_nested_elements(**params) generator.save_elements_to_fasta('generated_data/{}'.format(output_db)) if not os.path.exists('{}/generated_data'.format(output_directory)): os.makedirs('{}/generated_data'.format(output_directory)) generator.save_elements_to_fasta('{}/generated_data/{}'.format(output_directory, output_db)) sketcher = Sketcher() for element in generator.elements: sketcher.create_gff(element, 'generated_data') sketcher.create_gff(element, '{}/generated_data'.format(output_directory)) # sketcher.sketch(element.id, 'generated_data') end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) #print('Number of errors: {}'.format(number_of_errors)) def calc_number_of_iterations(length, percentage, average_element): p = percentage / 100 remainder = int(p / (1 - p) * length) return remainder // average_element def calc_baselength(expected_length, percentage): p = percentage / 100 return int((1 - p) * expected_length) if __name__ == '__main__': main() nested/cli/nester.py +22 −19 Original line number Diff line number Diff line Loading @@ -17,8 +17,10 @@ from nested.output.sketcher import Sketcher @click.option('--format', '-f', default='genome_browser', help='Format for GFF.') @click.option('--output_fasta_offset', '-o', default=0, help='Number of bases around the element included in output fasta files.') @click.option('--output_folder', '-d', default="", help='Output data folder.') @click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.') @click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.') # TODO DATA_FOLDER def main(input_fasta, sketch, format, output_fasta_offset, output_folder): def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): number_of_errors = 0 start_time = datetime.now() sequences = list(SeqIO.parse(open(input_fasta), 'fasta')) Loading @@ -30,24 +32,25 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder): seq_start_time = datetime.now() strlen = 15 print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r') #try: nester = Nester(sequence, i) sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset, format=format, dirpath=output_folder) try: nester = Nester(sequence, i, initial_threshold, threshold_multiplier) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) if sketch: if format != 'default': sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id, dirpath=output_folder) seq_end_time = datetime.now() print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time)) #except KeyboardInterrupt: # raise #except CalledProcessError: # number_of_errors += 1 # print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) #except: # number_of_errors += 1 # print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) except KeyboardInterrupt: # delete ltrs.fa raise except CalledProcessError: number_of_errors += 1 print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) except: number_of_errors += 1 print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) Loading nested/core/gene.py +1 −1 Original line number Diff line number Diff line Loading @@ -67,7 +67,7 @@ class Gene(object): gypsy_minus = Graph(te, self.domain_list, True, True) graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus] te.score, te.features = self._best_score(graphs) te.score /= float(intervals.length(te.location)) # te.score /= float(intervals.length(te.location)) ''' # evaluate LTR pair using graph, set up score and features Loading nested/core/generator.py +1 −1 Original line number Diff line number Diff line Loading @@ -58,7 +58,7 @@ class Generator(object): #expand intervals nested_intervals = intervals.expand_list(nested_intervals) nested_tes = [] for interval in nested_intervals: for interval in nested_intervals[:-1]: nested_tes.append(TE( location=interval )) Loading nested/core/nester.py +15 −6 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ class Nester(object): nested_element (NestedElement): nested element """ def __init__(self, sequence, it): def __init__(self, sequence, it, threshold, multiplier): self.it = it self.seqid = sequence.id self.sequence = sequence.seq Loading @@ -27,21 +27,24 @@ class Nester(object): self.solo_ltrs = SoloLtrs(sequence.id) self._iteration = 0 self._logger = NesterLogger('{}/{}'.format(config['logdir'], self.seqid)) self.threshold = threshold self.multiplier = multiplier self._find_nesting() def _find_nesting(self): nested_list = self._get_unexpanded_transposon_list(self.sequence) # find list of nested transposons nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons nested_list = self._expand_transposon_list(nested_list) nested_list = self._filter_nested_list(nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) def _get_unexpanded_transposon_list(self, sequence): # threshold causes loss of elements def _get_unexpanded_transposon_list(self, sequence, threshold): # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons self._iteration += 1 gene = Gene(self._iteration, self.it, self.seqid, sequence) candidates = gene.get_candidates_above_threshold(threshold=0) candidates = gene.get_candidates_above_threshold(threshold) if not candidates: best_candidate = gene.get_best_candidate() if not best_candidate: Loading @@ -60,11 +63,16 @@ class Nester(object): for element in nested_list: if intervals.intersect(candidate.location, element.location): choose = False # not sure about the impact # choose = intervals.contains(candidate.location, element.location) break for other_candidate in candidates: if candidate.location == other_candidate.location: continue if intervals.contains(candidate.location, other_candidate.location): # if candidate.score >= other_candidate.score: # choose = True # else: choose = False break if choose: Loading @@ -83,7 +91,7 @@ class Nester(object): # LOG self._logger.log_iteration(self._iteration, nested_list) nested_list += self._get_unexpanded_transposon_list(cropped_sequence) nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.multiplier * threshold) return nested_list Loading @@ -110,6 +118,7 @@ class Nester(object): return nested_list def _filter_nested_list(self, nested_list): # return list(filter(lambda x: not math.isnan(x.ltr_left_location[0]), nested_list) result = [] for te in nested_list: if not math.isnan(te.ltr_left_location[0]): Loading Loading
nested/cli/generator.py +28 −3 Original line number Diff line number Diff line #!/usr/bin/env python3 import os import click from datetime import datetime Loading @@ -15,7 +16,13 @@ from nested.output.sketcher import Sketcher @click.option('--filter', '-f', is_flag=True, default=False, type=str, help='Filter database and create new one with given output db path.') @click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].') @click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].') def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset): @click.option('--percentage', '-p', type=int, help='Percentage of elements in generated sequence.') @click.option('--average_element', '-a', type=int, help='Average element length.') @click.option('--expected_length', '-e', type=int, help='Expected output sequence length [ONLY WORKS WITH -percentage and -average_element].') @click.option('--output_directory', '-d', default=os.getcwd(), type=str, help='Output directory.') def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset, percentage, average_element, expected_length, output_directory): #number_of_errors = 0 start_time = datetime.now() generator = Generator(input_db) Loading @@ -30,16 +37,34 @@ def main(input_db, output_db, baselength, number_of_iterations, number_of_elemen if baselength: params['baselength'] = baselength if number_of_iterations: params['number_of_iterations'] = number_of_iterations if number_of_elements: params['number_of_elements'] = number_of_elements if percentage and average_element: if expected_length: params['baselength'] = calc_baselength(expected_length, percentage) if 'baselength' in params.keys(): params['number_of_iterations'] = calc_number_of_iterations(params['baselength'], percentage, average_element) generator.generate_random_nested_elements(**params) generator.save_elements_to_fasta('generated_data/{}'.format(output_db)) if not os.path.exists('{}/generated_data'.format(output_directory)): os.makedirs('{}/generated_data'.format(output_directory)) generator.save_elements_to_fasta('{}/generated_data/{}'.format(output_directory, output_db)) sketcher = Sketcher() for element in generator.elements: sketcher.create_gff(element, 'generated_data') sketcher.create_gff(element, '{}/generated_data'.format(output_directory)) # sketcher.sketch(element.id, 'generated_data') end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) #print('Number of errors: {}'.format(number_of_errors)) def calc_number_of_iterations(length, percentage, average_element): p = percentage / 100 remainder = int(p / (1 - p) * length) return remainder // average_element def calc_baselength(expected_length, percentage): p = percentage / 100 return int((1 - p) * expected_length) if __name__ == '__main__': main()
nested/cli/nester.py +22 −19 Original line number Diff line number Diff line Loading @@ -17,8 +17,10 @@ from nested.output.sketcher import Sketcher @click.option('--format', '-f', default='genome_browser', help='Format for GFF.') @click.option('--output_fasta_offset', '-o', default=0, help='Number of bases around the element included in output fasta files.') @click.option('--output_folder', '-d', default="", help='Output data folder.') @click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.') @click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.') # TODO DATA_FOLDER def main(input_fasta, sketch, format, output_fasta_offset, output_folder): def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): number_of_errors = 0 start_time = datetime.now() sequences = list(SeqIO.parse(open(input_fasta), 'fasta')) Loading @@ -30,24 +32,25 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder): seq_start_time = datetime.now() strlen = 15 print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r') #try: nester = Nester(sequence, i) sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset, format=format, dirpath=output_folder) try: nester = Nester(sequence, i, initial_threshold, threshold_multiplier) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder) if sketch: if format != 'default': sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id, dirpath=output_folder) seq_end_time = datetime.now() print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time)) #except KeyboardInterrupt: # raise #except CalledProcessError: # number_of_errors += 1 # print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) #except: # number_of_errors += 1 # print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) except KeyboardInterrupt: # delete ltrs.fa raise except CalledProcessError: number_of_errors += 1 print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) except: number_of_errors += 1 print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) Loading
nested/core/gene.py +1 −1 Original line number Diff line number Diff line Loading @@ -67,7 +67,7 @@ class Gene(object): gypsy_minus = Graph(te, self.domain_list, True, True) graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus] te.score, te.features = self._best_score(graphs) te.score /= float(intervals.length(te.location)) # te.score /= float(intervals.length(te.location)) ''' # evaluate LTR pair using graph, set up score and features Loading
nested/core/generator.py +1 −1 Original line number Diff line number Diff line Loading @@ -58,7 +58,7 @@ class Generator(object): #expand intervals nested_intervals = intervals.expand_list(nested_intervals) nested_tes = [] for interval in nested_intervals: for interval in nested_intervals[:-1]: nested_tes.append(TE( location=interval )) Loading
nested/core/nester.py +15 −6 Original line number Diff line number Diff line Loading @@ -19,7 +19,7 @@ class Nester(object): nested_element (NestedElement): nested element """ def __init__(self, sequence, it): def __init__(self, sequence, it, threshold, multiplier): self.it = it self.seqid = sequence.id self.sequence = sequence.seq Loading @@ -27,21 +27,24 @@ class Nester(object): self.solo_ltrs = SoloLtrs(sequence.id) self._iteration = 0 self._logger = NesterLogger('{}/{}'.format(config['logdir'], self.seqid)) self.threshold = threshold self.multiplier = multiplier self._find_nesting() def _find_nesting(self): nested_list = self._get_unexpanded_transposon_list(self.sequence) # find list of nested transposons nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons nested_list = self._expand_transposon_list(nested_list) nested_list = self._filter_nested_list(nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) def _get_unexpanded_transposon_list(self, sequence): # threshold causes loss of elements def _get_unexpanded_transposon_list(self, sequence, threshold): # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons self._iteration += 1 gene = Gene(self._iteration, self.it, self.seqid, sequence) candidates = gene.get_candidates_above_threshold(threshold=0) candidates = gene.get_candidates_above_threshold(threshold) if not candidates: best_candidate = gene.get_best_candidate() if not best_candidate: Loading @@ -60,11 +63,16 @@ class Nester(object): for element in nested_list: if intervals.intersect(candidate.location, element.location): choose = False # not sure about the impact # choose = intervals.contains(candidate.location, element.location) break for other_candidate in candidates: if candidate.location == other_candidate.location: continue if intervals.contains(candidate.location, other_candidate.location): # if candidate.score >= other_candidate.score: # choose = True # else: choose = False break if choose: Loading @@ -83,7 +91,7 @@ class Nester(object): # LOG self._logger.log_iteration(self._iteration, nested_list) nested_list += self._get_unexpanded_transposon_list(cropped_sequence) nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.multiplier * threshold) return nested_list Loading @@ -110,6 +118,7 @@ class Nester(object): return nested_list def _filter_nested_list(self, nested_list): # return list(filter(lambda x: not math.isnan(x.ltr_left_location[0]), nested_list) result = [] for te in nested_list: if not math.isnan(te.ltr_left_location[0]): Loading