Loading nested/cli/nester.py +39 −28 Original line number Diff line number Diff line Loading @@ -2,6 +2,8 @@ import sys import click import glob from concurrent import futures from datetime import datetime from subprocess import CalledProcessError Loading @@ -9,6 +11,7 @@ from Bio import SeqIO from nested.core.nester import Nester from nested.output.sketcher import Sketcher from nested.core.sequence_thread import SequenceThread @click.command() Loading @@ -19,19 +22,26 @@ from nested.output.sketcher import Sketcher @click.option('--output_folder', '-d', default="", help='Output data folder.') @click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.') @click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.') # TODO DATA_FOLDER def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): @click.option('--threads', '-n', type=int, default=1, help='Number of threads') def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads): number_of_errors = 0 start_time = datetime.now() sequences = list(SeqIO.parse(open(input_fasta), 'fasta')) sketcher = Sketcher() i = 0 with futures.ThreadPoolExecutor(threads) as executor: for sequence in sequences: executor.submit(process_sequence, sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) print('Number of errors: {}'.format(number_of_errors)) def process_sequence(sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): i += 1 sequence.id = sequence.id.replace('/', '--') seq_start_time = datetime.now() strlen = 15 print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r') print("Processing {}".format(sequence.id)) try: nester = Nester(sequence, i, initial_threshold, threshold_multiplier) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) Loading @@ -41,21 +51,22 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initia sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id, dirpath=output_folder) seq_end_time = datetime.now() print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time)) print('Processing {a}: DONE [{b}]'.format(a=sequence.id, b=seq_end_time - seq_start_time)) except KeyboardInterrupt: # delete ltrs.fa cleanup() raise except CalledProcessError: cleanup() number_of_errors += 1 print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) except: cleanup() number_of_errors += 1 print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) print('Number of errors: {}'.format(number_of_errors)) def cleanup(): for file in glob.glob("*ltrs.fa"): os.remove(file) if __name__ == '__main__': main() nested/config/gt.style +51 −3 Original line number Diff line number Diff line Loading @@ -68,7 +68,7 @@ style = max_show_width = nil, }, -------------------------------------- AP = { PROT = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, Loading Loading @@ -116,7 +116,55 @@ style = max_show_width = nil, }, -------------------------------------- RNaseH = { RH = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- aRH = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- CHD = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- CHDCR = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, Loading nested/core/domain_dante.py +2 −24 Original line number Diff line number Diff line Loading @@ -53,18 +53,9 @@ def run_blastx(sequence): list[Domain]: list of matched domains """ domains = [] # args = dict(config['blastx']['args']) # args['db'] = "/opt/nester/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa" # args['max_target_seqs'] = 1950 # args['num_threads'] = 6 # args['evalue'] = 1e-5 #blastx_cline = NcbiblastxCommandline(**args) blastx_cline = NcbiblastxCommandline(**config['blastx']['args']) xml_out, stderr = blastx_cline(stdin=str(sequence)) #output = open("xml_output_blast.xml", "a+").write(str(xml_out)) blast_records = NCBIXML.parse(StringIO(xml_out)) try: blast_record = next(blast_records).alignments Loading @@ -85,10 +76,8 @@ def run_blastx(sequence): else: domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)] #domain.type = alignment.title.split(' ')[1].split('_')[0] domain.type = alignment.title.split('__')[0].split('-')[1] domain.annotation = alignment.title.split('__')[1] #print(domain.type, domain.location) domains.append(domain) return domains Loading Loading @@ -134,14 +123,3 @@ def parse_last_output(raw_output): domains.append(domain) print(len(domains)) return domains nested/core/gene.py +0 −1 Original line number Diff line number Diff line Loading @@ -67,7 +67,6 @@ class Gene(object): gypsy_minus = Graph(te, self.domain_list, True, True) graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus] te.score, te.features = self._best_score(graphs) # te.score /= float(intervals.length(te.location)) ''' # evaluate LTR pair using graph, set up score and features Loading nested/core/nester.py +2 −12 Original line number Diff line number Diff line Loading @@ -34,12 +34,9 @@ class Nester(object): def _find_nesting(self): nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons nested_list = self._expand_transposon_list(nested_list) nested_list = self._filter_nested_list(nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) # threshold causes loss of elements def _get_unexpanded_transposon_list(self, sequence, threshold): # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons self._iteration += 1 Loading @@ -48,7 +45,6 @@ class Nester(object): if not candidates: best_candidate = gene.get_best_candidate() if not best_candidate: # call blastn here self.solo_ltrs.find_solo_ltrs(sequence) self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True) return self.solo_ltrs.solo_ltrs Loading @@ -63,16 +59,11 @@ class Nester(object): for element in nested_list: if intervals.intersect(candidate.location, element.location): choose = False # not sure about the impact # choose = intervals.contains(candidate.location, element.location) break for other_candidate in candidates: if candidate.location == other_candidate.location: continue if intervals.contains(candidate.location, other_candidate.location): # if candidate.score >= other_candidate.score: # choose = True # else: choose = False break if choose: Loading Loading @@ -110,7 +101,6 @@ class Nester(object): nested_list[j].features['ppt']) nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs']) #TODO check if works in corner case nested_list[j].tsr_left = intervals.expand(nested_list[i].location, nested_list[j].tsr_left) nested_list[j].tsr_right = intervals.expand(nested_list[i].location, Loading Loading
nested/cli/nester.py +39 −28 Original line number Diff line number Diff line Loading @@ -2,6 +2,8 @@ import sys import click import glob from concurrent import futures from datetime import datetime from subprocess import CalledProcessError Loading @@ -9,6 +11,7 @@ from Bio import SeqIO from nested.core.nester import Nester from nested.output.sketcher import Sketcher from nested.core.sequence_thread import SequenceThread @click.command() Loading @@ -19,19 +22,26 @@ from nested.output.sketcher import Sketcher @click.option('--output_folder', '-d', default="", help='Output data folder.') @click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.') @click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.') # TODO DATA_FOLDER def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): @click.option('--threads', '-n', type=int, default=1, help='Number of threads') def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads): number_of_errors = 0 start_time = datetime.now() sequences = list(SeqIO.parse(open(input_fasta), 'fasta')) sketcher = Sketcher() i = 0 with futures.ThreadPoolExecutor(threads) as executor: for sequence in sequences: executor.submit(process_sequence, sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) print('Number of errors: {}'.format(number_of_errors)) def process_sequence(sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier): i += 1 sequence.id = sequence.id.replace('/', '--') seq_start_time = datetime.now() strlen = 15 print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r') print("Processing {}".format(sequence.id)) try: nester = Nester(sequence, i, initial_threshold, threshold_multiplier) sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format) Loading @@ -41,21 +51,22 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initia sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset) sketcher.sketch(sequence.id, dirpath=output_folder) seq_end_time = datetime.now() print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time)) print('Processing {a}: DONE [{b}]'.format(a=sequence.id, b=seq_end_time - seq_start_time)) except KeyboardInterrupt: # delete ltrs.fa cleanup() raise except CalledProcessError: cleanup() number_of_errors += 1 print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen])) except: cleanup() number_of_errors += 1 print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0]) end_time = datetime.now() print('Total time: {}'.format(end_time - start_time)) print('Number of errors: {}'.format(number_of_errors)) def cleanup(): for file in glob.glob("*ltrs.fa"): os.remove(file) if __name__ == '__main__': main()
nested/config/gt.style +51 −3 Original line number Diff line number Diff line Loading @@ -68,7 +68,7 @@ style = max_show_width = nil, }, -------------------------------------- AP = { PROT = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, Loading Loading @@ -116,7 +116,55 @@ style = max_show_width = nil, }, -------------------------------------- RNaseH = { RH = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- aRH = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- CHD = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0}, style = "box", -- Collapsing options collapse_to_parent = true, split_lines = true, -- Caption options max_capt_show_width= nil, -- Display this track only if the viewport is not wider than this -- number of nucleotides. Set to 0 to disable type track. max_show_width = nil, }, -------------------------------------- CHDCR = { -- Color definitions stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0}, stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0}, Loading
nested/core/domain_dante.py +2 −24 Original line number Diff line number Diff line Loading @@ -53,18 +53,9 @@ def run_blastx(sequence): list[Domain]: list of matched domains """ domains = [] # args = dict(config['blastx']['args']) # args['db'] = "/opt/nester/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa" # args['max_target_seqs'] = 1950 # args['num_threads'] = 6 # args['evalue'] = 1e-5 #blastx_cline = NcbiblastxCommandline(**args) blastx_cline = NcbiblastxCommandline(**config['blastx']['args']) xml_out, stderr = blastx_cline(stdin=str(sequence)) #output = open("xml_output_blast.xml", "a+").write(str(xml_out)) blast_records = NCBIXML.parse(StringIO(xml_out)) try: blast_record = next(blast_records).alignments Loading @@ -85,10 +76,8 @@ def run_blastx(sequence): else: domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)] #domain.type = alignment.title.split(' ')[1].split('_')[0] domain.type = alignment.title.split('__')[0].split('-')[1] domain.annotation = alignment.title.split('__')[1] #print(domain.type, domain.location) domains.append(domain) return domains Loading Loading @@ -134,14 +123,3 @@ def parse_last_output(raw_output): domains.append(domain) print(len(domains)) return domains
nested/core/gene.py +0 −1 Original line number Diff line number Diff line Loading @@ -67,7 +67,6 @@ class Gene(object): gypsy_minus = Graph(te, self.domain_list, True, True) graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus] te.score, te.features = self._best_score(graphs) # te.score /= float(intervals.length(te.location)) ''' # evaluate LTR pair using graph, set up score and features Loading
nested/core/nester.py +2 −12 Original line number Diff line number Diff line Loading @@ -34,12 +34,9 @@ class Nester(object): def _find_nesting(self): nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons nested_list = self._expand_transposon_list(nested_list) nested_list = self._filter_nested_list(nested_list) self.nested_element = NestedElement(self.seqid, self.sequence, nested_list) # threshold causes loss of elements def _get_unexpanded_transposon_list(self, sequence, threshold): # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons self._iteration += 1 Loading @@ -48,7 +45,6 @@ class Nester(object): if not candidates: best_candidate = gene.get_best_candidate() if not best_candidate: # call blastn here self.solo_ltrs.find_solo_ltrs(sequence) self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True) return self.solo_ltrs.solo_ltrs Loading @@ -63,16 +59,11 @@ class Nester(object): for element in nested_list: if intervals.intersect(candidate.location, element.location): choose = False # not sure about the impact # choose = intervals.contains(candidate.location, element.location) break for other_candidate in candidates: if candidate.location == other_candidate.location: continue if intervals.contains(candidate.location, other_candidate.location): # if candidate.score >= other_candidate.score: # choose = True # else: choose = False break if choose: Loading Loading @@ -110,7 +101,6 @@ class Nester(object): nested_list[j].features['ppt']) nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location, nested_list[j].features['pbs']) #TODO check if works in corner case nested_list[j].tsr_left = intervals.expand(nested_list[i].location, nested_list[j].tsr_left) nested_list[j].tsr_right = intervals.expand(nested_list[i].location, Loading