Commit adb3c45b authored by Ivan Vanát's avatar Ivan Vanát
Browse files

Comments gone, multithreading done

parent b6415c8f
Loading
Loading
Loading
Loading
+39 −28
Original line number Original line Diff line number Diff line
@@ -2,6 +2,8 @@


import sys
import sys
import click
import click
import glob
from concurrent import futures
from datetime import datetime
from datetime import datetime
from subprocess import CalledProcessError
from subprocess import CalledProcessError


@@ -9,6 +11,7 @@ from Bio import SeqIO


from nested.core.nester import Nester
from nested.core.nester import Nester
from nested.output.sketcher import Sketcher
from nested.output.sketcher import Sketcher
from nested.core.sequence_thread import SequenceThread




@click.command()
@click.command()
@@ -19,19 +22,26 @@ from nested.output.sketcher import Sketcher
@click.option('--output_folder', '-d', default="", help='Output data folder.')
@click.option('--output_folder', '-d', default="", help='Output data folder.')
@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
# TODO DATA_FOLDER
@click.option('--threads', '-n', type=int, default=1, help='Number of threads')
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads):
    number_of_errors = 0
    number_of_errors = 0
    start_time = datetime.now()
    start_time = datetime.now()
    sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
    sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
    sketcher = Sketcher()
    sketcher = Sketcher()
    i = 0
    i = 0
    with futures.ThreadPoolExecutor(threads) as executor:
        for sequence in sequences:
        for sequence in sequences:
            executor.submit(process_sequence, sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier)
    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
    print('Number of errors: {}'.format(number_of_errors))

def process_sequence(sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):
    i += 1
    i += 1
    sequence.id = sequence.id.replace('/', '--')
    sequence.id = sequence.id.replace('/', '--')
    seq_start_time = datetime.now()
    seq_start_time = datetime.now()
    strlen = 15
    strlen = 15
        print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r')
    print("Processing {}".format(sequence.id))
    try:
    try:
        nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
        nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
        sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
        sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
@@ -41,21 +51,22 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initia
                sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)            
                sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)            
            sketcher.sketch(sequence.id, dirpath=output_folder)
            sketcher.sketch(sequence.id, dirpath=output_folder)
        seq_end_time = datetime.now()
        seq_end_time = datetime.now()
            print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time))
        print('Processing {a}: DONE [{b}]'.format(a=sequence.id, b=seq_end_time - seq_start_time))
    except KeyboardInterrupt:
    except KeyboardInterrupt:
            # delete ltrs.fa            
        cleanup()
        raise
        raise
    except CalledProcessError:
    except CalledProcessError:
        cleanup()
        number_of_errors += 1
        number_of_errors += 1
        print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
        print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
    except:
    except:
        cleanup()
        number_of_errors += 1
        number_of_errors += 1
        print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])
        print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])


    end_time = datetime.now()
def cleanup():
    print('Total time: {}'.format(end_time - start_time))
    for file in glob.glob("*ltrs.fa"):
    print('Number of errors: {}'.format(number_of_errors))
        os.remove(file)

        
        
if __name__ == '__main__':
if __name__ == '__main__':
    main()
    main()
+51 −3
Original line number Original line Diff line number Diff line
@@ -68,7 +68,7 @@ style =
    max_show_width     = nil,
    max_show_width     = nil,
  },
  },
--------------------------------------
--------------------------------------
  AP = {
  PROT = {
    -- Color definitions
    -- Color definitions
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
@@ -116,7 +116,55 @@ style =
    max_show_width     = nil,
    max_show_width     = nil,
  },
  },
--------------------------------------
--------------------------------------
  RNaseH = {
  RH = {
    -- Color definitions
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
    fill               = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
    style              = "box",
    -- Collapsing options
    collapse_to_parent = true,
    split_lines        = true,
    -- Caption options
    max_capt_show_width= nil,
    -- Display this track only if the viewport is not wider than this
    -- number of nucleotides. Set to 0 to disable type track.
    max_show_width     = nil,
  },
--------------------------------------
  aRH = {
    -- Color definitions
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
    fill               = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
    style              = "box",
    -- Collapsing options
    collapse_to_parent = true,
    split_lines        = true,
    -- Caption options
    max_capt_show_width= nil,
    -- Display this track only if the viewport is not wider than this
    -- number of nucleotides. Set to 0 to disable type track.
    max_show_width     = nil,
  },
--------------------------------------
  CHD = {
    -- Color definitions
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
    fill               = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
    style              = "box",
    -- Collapsing options
    collapse_to_parent = true,
    split_lines        = true,
    -- Caption options
    max_capt_show_width= nil,
    -- Display this track only if the viewport is not wider than this
    -- number of nucleotides. Set to 0 to disable type track.
    max_show_width     = nil,
  },
--------------------------------------
  CHDCR = {
    -- Color definitions
    -- Color definitions
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke             = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
    stroke_marked      = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
+2 −24
Original line number Original line Diff line number Diff line
@@ -53,18 +53,9 @@ def run_blastx(sequence):
        list[Domain]: list of matched domains
        list[Domain]: list of matched domains
    """
    """
    domains = []    
    domains = []    
    # args = dict(config['blastx']['args'])
    # args['db'] = "/opt/nester/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa"
    # args['max_target_seqs'] = 1950
    # args['num_threads'] = 6
    # args['evalue'] = 1e-5

    #blastx_cline = NcbiblastxCommandline(**args)    
    blastx_cline = NcbiblastxCommandline(**config['blastx']['args'])
    blastx_cline = NcbiblastxCommandline(**config['blastx']['args'])
    xml_out, stderr = blastx_cline(stdin=str(sequence))
    xml_out, stderr = blastx_cline(stdin=str(sequence))


    #output = open("xml_output_blast.xml", "a+").write(str(xml_out))

    blast_records = NCBIXML.parse(StringIO(xml_out))
    blast_records = NCBIXML.parse(StringIO(xml_out))
    try:
    try:
        blast_record = next(blast_records).alignments
        blast_record = next(blast_records).alignments
@@ -85,10 +76,8 @@ def run_blastx(sequence):
                else:
                else:
                    domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)]
                    domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)]


                #domain.type = alignment.title.split(' ')[1].split('_')[0]
                domain.type = alignment.title.split('__')[0].split('-')[1]
                domain.type = alignment.title.split('__')[0].split('-')[1]
                domain.annotation = alignment.title.split('__')[1]
                domain.annotation = alignment.title.split('__')[1]
                #print(domain.type, domain.location)
                domains.append(domain)
                domains.append(domain)
    return domains
    return domains


@@ -134,14 +123,3 @@ def parse_last_output(raw_output):
        domains.append(domain)
        domains.append(domain)
    print(len(domains))
    print(len(domains))
    return domains   
    return domains   
                             
        

    

    

    

    
    
+0 −1
Original line number Original line Diff line number Diff line
@@ -67,7 +67,6 @@ class Gene(object):
        gypsy_minus = Graph(te, self.domain_list, True, True)
        gypsy_minus = Graph(te, self.domain_list, True, True)
        graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus]
        graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus]
        te.score, te.features = self._best_score(graphs)
        te.score, te.features = self._best_score(graphs)
        # te.score /= float(intervals.length(te.location))


        '''
        '''
        # evaluate LTR pair using graph, set up score and features
        # evaluate LTR pair using graph, set up score and features
+2 −12
Original line number Original line Diff line number Diff line
@@ -34,12 +34,9 @@ class Nester(object):
    def _find_nesting(self):
    def _find_nesting(self):
        nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold)  # find list of nested transposons
        nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold)  # find list of nested transposons
        nested_list = self._expand_transposon_list(nested_list)        
        nested_list = self._expand_transposon_list(nested_list)        
        
        nested_list = self._filter_nested_list(nested_list)        
        nested_list = self._filter_nested_list(nested_list)        
        
        self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
        self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
        
        
    # threshold causes loss of elements
    def _get_unexpanded_transposon_list(self, sequence, threshold):
    def _get_unexpanded_transposon_list(self, sequence, threshold):
        # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons
        # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons
        self._iteration += 1
        self._iteration += 1
@@ -48,7 +45,6 @@ class Nester(object):
        if not candidates:
        if not candidates:
            best_candidate = gene.get_best_candidate()
            best_candidate = gene.get_best_candidate()
            if not best_candidate:
            if not best_candidate:
                # call blastn here
                self.solo_ltrs.find_solo_ltrs(sequence)
                self.solo_ltrs.find_solo_ltrs(sequence)
                self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True)
                self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True)
                return self.solo_ltrs.solo_ltrs
                return self.solo_ltrs.solo_ltrs
@@ -63,16 +59,11 @@ class Nester(object):
            for element in nested_list:
            for element in nested_list:
                if intervals.intersect(candidate.location, element.location):
                if intervals.intersect(candidate.location, element.location):
                    choose = False
                    choose = False
                    # not sure about the impact
                    # choose = intervals.contains(candidate.location, element.location)
                    break
                    break
            for other_candidate in candidates:
            for other_candidate in candidates:
                if candidate.location == other_candidate.location:
                if candidate.location == other_candidate.location:
                    continue
                    continue
                if intervals.contains(candidate.location, other_candidate.location):
                if intervals.contains(candidate.location, other_candidate.location):
                   # if candidate.score >= other_candidate.score:
                   #     choose = True
                   # else:
                    choose = False
                    choose = False
                    break
                    break
            if choose:
            if choose:
@@ -110,7 +101,6 @@ class Nester(object):
                                                                  nested_list[j].features['ppt'])
                                                                  nested_list[j].features['ppt'])
                nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,
                nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,
                                                                  nested_list[j].features['pbs'])
                                                                  nested_list[j].features['pbs'])
                #TODO check if works in corner case
                nested_list[j].tsr_left = intervals.expand(nested_list[i].location,
                nested_list[j].tsr_left = intervals.expand(nested_list[i].location,
                                                           nested_list[j].tsr_left)
                                                           nested_list[j].tsr_left)
                nested_list[j].tsr_right = intervals.expand(nested_list[i].location,
                nested_list[j].tsr_right = intervals.expand(nested_list[i].location,
Loading