Commit b6415c8f authored by Ivan Vanát's avatar Ivan Vanát
Browse files

Current version

parent 9d1551e4
Loading
Loading
Loading
Loading
+28 −3
Original line number Diff line number Diff line
#!/usr/bin/env python3

import os
import click
from datetime import datetime

@@ -15,7 +16,13 @@ from nested.output.sketcher import Sketcher
@click.option('--filter', '-f', is_flag=True, default=False, type=str, help='Filter database and create new one with given output db path.')
@click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].')
@click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].')
def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset):
@click.option('--percentage', '-p', type=int, help='Percentage of elements in generated sequence.')
@click.option('--average_element', '-a', type=int, help='Average element length.')
@click.option('--expected_length', '-e', type=int, help='Expected output sequence length [ONLY WORKS WITH -percentage and -average_element].')
@click.option('--output_directory', '-d', default=os.getcwd(), type=str, help='Output directory.')
def main(input_db, output_db, baselength, number_of_iterations,
         number_of_elements, filter, filter_string, filter_offset,
         percentage, average_element, expected_length, output_directory):
    #number_of_errors = 0
    start_time = datetime.now()
    generator = Generator(input_db)
@@ -30,16 +37,34 @@ def main(input_db, output_db, baselength, number_of_iterations, number_of_elemen
        if baselength: params['baselength'] = baselength
        if number_of_iterations: params['number_of_iterations'] = number_of_iterations
        if number_of_elements: params['number_of_elements'] = number_of_elements
        if percentage and average_element:
            if expected_length:
                params['baselength'] = calc_baselength(expected_length, percentage)
            if 'baselength' in params.keys():
                params['number_of_iterations'] = calc_number_of_iterations(params['baselength'], percentage, average_element)

        generator.generate_random_nested_elements(**params)
        generator.save_elements_to_fasta('generated_data/{}'.format(output_db))

        if not os.path.exists('{}/generated_data'.format(output_directory)):
            os.makedirs('{}/generated_data'.format(output_directory))
        generator.save_elements_to_fasta('{}/generated_data/{}'.format(output_directory, output_db))
        sketcher = Sketcher()
        for element in generator.elements:
            sketcher.create_gff(element, 'generated_data')
            sketcher.create_gff(element, '{}/generated_data'.format(output_directory))
            # sketcher.sketch(element.id, 'generated_data')
    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
    #print('Number of errors: {}'.format(number_of_errors))

def calc_number_of_iterations(length, percentage, average_element):
     p = percentage / 100
     remainder = int(p / (1 - p) * length)
     return remainder // average_element

def calc_baselength(expected_length, percentage):
    p = percentage / 100
    return int((1 - p) * expected_length)
    

if __name__ == '__main__':
    main()
+22 −19
Original line number Diff line number Diff line
@@ -17,8 +17,10 @@ from nested.output.sketcher import Sketcher
@click.option('--format', '-f', default='genome_browser', help='Format for GFF.')
@click.option('--output_fasta_offset', '-o', default=0, help='Number of bases around the element included in output fasta files.')
@click.option('--output_folder', '-d', default="", help='Output data folder.')
@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
# TODO DATA_FOLDER
def main(input_fasta, sketch, format, output_fasta_offset, output_folder):
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):
    number_of_errors = 0
    start_time = datetime.now()
    sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
@@ -30,24 +32,25 @@ def main(input_fasta, sketch, format, output_fasta_offset, output_folder):
        seq_start_time = datetime.now()
        strlen = 15
        print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r')
        #try:
        nester = Nester(sequence, i)
        sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset, format=format, dirpath=output_folder)
        try:
            nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
            sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
            sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)  
            if sketch:
                if format != 'default':
                sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset)            
            sketcher.sketch(sequence.id)
                    sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)            
                sketcher.sketch(sequence.id, dirpath=output_folder)
            seq_end_time = datetime.now()
            print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time))
        #except KeyboardInterrupt:
        #    raise
        #except CalledProcessError:
        #    number_of_errors += 1
        #    print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
        #except:
        #    number_of_errors += 1
        #    print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])
        except KeyboardInterrupt:
            # delete ltrs.fa            
            raise
        except CalledProcessError:
            number_of_errors += 1
            print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
        except:
            number_of_errors += 1
            print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])

    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
+1 −1
Original line number Diff line number Diff line
@@ -67,7 +67,7 @@ class Gene(object):
        gypsy_minus = Graph(te, self.domain_list, True, True)
        graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus]
        te.score, te.features = self._best_score(graphs)
        te.score /= float(intervals.length(te.location))
        # te.score /= float(intervals.length(te.location))

        '''
        # evaluate LTR pair using graph, set up score and features
+1 −1
Original line number Diff line number Diff line
@@ -58,7 +58,7 @@ class Generator(object):
        #expand intervals
        nested_intervals = intervals.expand_list(nested_intervals)
        nested_tes = []
        for interval in nested_intervals:
        for interval in nested_intervals[:-1]:
            nested_tes.append(TE(
                location=interval
            ))
+15 −6
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ class Nester(object):
        nested_element (NestedElement): nested element
    """

    def __init__(self, sequence, it):
    def __init__(self, sequence, it, threshold, multiplier):
        self.it = it
        self.seqid = sequence.id
        self.sequence = sequence.seq
@@ -27,21 +27,24 @@ class Nester(object):
        self.solo_ltrs = SoloLtrs(sequence.id)
        self._iteration = 0
        self._logger = NesterLogger('{}/{}'.format(config['logdir'], self.seqid))
        self.threshold = threshold
        self.multiplier = multiplier
        self._find_nesting()

    def _find_nesting(self):
        nested_list = self._get_unexpanded_transposon_list(self.sequence)  # find list of nested transposons
        nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold)  # find list of nested transposons
        nested_list = self._expand_transposon_list(nested_list)
        
        nested_list = self._filter_nested_list(nested_list)
        
        self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
        
    def _get_unexpanded_transposon_list(self, sequence):
    # threshold causes loss of elements
    def _get_unexpanded_transposon_list(self, sequence, threshold):
        # recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons
        self._iteration += 1
        gene = Gene(self._iteration, self.it, self.seqid, sequence)
        candidates = gene.get_candidates_above_threshold(threshold=0)
        candidates = gene.get_candidates_above_threshold(threshold)
        if not candidates:
            best_candidate = gene.get_best_candidate()
            if not best_candidate:
@@ -60,11 +63,16 @@ class Nester(object):
            for element in nested_list:
                if intervals.intersect(candidate.location, element.location):
                    choose = False
                    # not sure about the impact
                    # choose = intervals.contains(candidate.location, element.location)
                    break
            for other_candidate in candidates:
                if candidate.location == other_candidate.location:
                    continue
                if intervals.contains(candidate.location, other_candidate.location):
                   # if candidate.score >= other_candidate.score:
                   #     choose = True
                   # else:
                    choose = False
                    break
            if choose:
@@ -83,7 +91,7 @@ class Nester(object):
        # LOG
        self._logger.log_iteration(self._iteration, nested_list)

        nested_list += self._get_unexpanded_transposon_list(cropped_sequence)
        nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.multiplier * threshold)

        return nested_list

@@ -110,6 +118,7 @@ class Nester(object):
        return nested_list

    def _filter_nested_list(self, nested_list):
        # return list(filter(lambda x: not math.isnan(x.ltr_left_location[0]), nested_list)
        result = []
        for te in nested_list:
            if not math.isnan(te.ltr_left_location[0]):
Loading