Commit 65e2f929 authored by Pavel Jedlicka
Browse files

Fixed TRF candidate parsing (read the TRF .dat file); added tandem-repeat GFF output

parent 86e4fdf4
Loading
Loading
Loading
Loading
+2 −1
Original line number Original line Diff line number Diff line
@@ -56,6 +56,7 @@ def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, ou
        nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver)
        nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver)
        sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
        sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
        sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
        sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
        sketcher.create_trf_gff(nester.trf, nester.seqid, dirpath=output_folder)
        if sketch:
        if sketch:
            if format != 'default':
            if format != 'default':
                sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
                sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
+8 −3
Original line number Original line Diff line number Diff line
@@ -30,11 +30,14 @@ class Nester(object):
        self.threshold = threshold
        self.threshold = threshold
        self.multiplier = multiplier
        self.multiplier = multiplier
        self.dependency_resolver = dependency_resolver
        self.dependency_resolver = dependency_resolver
        self.trf = []
        self._find_nesting()
        self._find_nesting()


    def _find_nesting(self):
    def _find_nesting(self):
        # tandem repeat finder
        # tandem repeat finder
        nested_list = tandemRepeatFinder.run(self.seqid, self.sequence)
        self.trf = tandemRepeatFinder.run(self.seqid, self.sequence)
        nested_list = self.trf[:]
        
        cropped_sequence = self._crop_sequence(nested_list, self.sequence)
        cropped_sequence = self._crop_sequence(nested_list, self.sequence)
        
        
        nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold)  # find list of nested transposons
        nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold)  # find list of nested transposons
@@ -102,8 +105,10 @@ class Nester(object):
                                                                     nested_list[j].ltr_right_location)
                                                                     nested_list[j].ltr_right_location)
                for domain in nested_list[j].features['domains']:
                for domain in nested_list[j].features['domains']:
                    domain.location = intervals.expand(nested_list[i].location, domain.location)
                    domain.location = intervals.expand(nested_list[i].location, domain.location)
                if 'ppt' in nested_list[j].features.keys():
                    nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location,
                    nested_list[j].features['ppt'] = intervals.expand(nested_list[i].location,
                                                                  nested_list[j].features['ppt'])
                                                                  nested_list[j].features['ppt'])
                if 'pbs' in nested_list[j].features.keys():                                                               
                    nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,
                    nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,
                                                                  nested_list[j].features['pbs'])
                                                                  nested_list[j].features['pbs'])
                nested_list[j].tsr_left = intervals.expand(nested_list[i].location,
                nested_list[j].tsr_left = intervals.expand(nested_list[i].location,
+11 −10
Original line number Original line Diff line number Diff line
@@ -10,13 +10,14 @@ from nested.core.te import TE
from nested.utils import intervals
from nested.utils import intervals


class TandemRepeat(TE):
class TandemRepeat(TE):
    def __init__(self, loc, period_size, copies, matches, indels, scor, entropy):
    def __init__(self, loc, period_size, copies, matches, indels, scor, entropy, monomer):
        super().__init__(location=loc, score=scor)
        super().__init__(location=loc, score=scor)
        self.matches = matches
        self.matches = matches
        self.indels = indels
        self.indels = indels
        self.entropy = entropy
        self.entropy = entropy
        self.period_size = period_size
        self.period_size = period_size
        self.copies = copies
        self.copies = copies
        self.monomer = monomer


    def is_better(self, other):
    def is_better(self, other):
        if not isinstance(other, TandemRepeat):
        if not isinstance(other, TandemRepeat):
@@ -56,23 +57,23 @@ def run(seqid, sequence):
    process = subprocess.Popen(
    process = subprocess.Popen(
            ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7),
            ['trf', '/tmp/nested/trf/{}.fa'.format(seqid), str(2), str(5), str(7),
            str(80), str(10), str(50), str(2000), '-m', '-d', '-h'],
            str(80), str(10), str(50), str(2000), '-m', '-d', '-h'],
            stdout=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE)
            stderr=subprocess.DEVNULL)
    
    
    stdout, stderr = process.communicate()
    stdout, stderr = process.communicate()


    repeats = filter_candidates(get_candidates(stdout))
    repeats = filter_candidates(get_candidates(seqid))
    repeats.sort(key=lambda r: r.location[0], reverse=True)
    repeats.sort(key=lambda r: r.location[0], reverse=True)
    return repeats
    return repeats


def get_candidates(raw_output):
def get_candidates(seqid):
    entries = raw_output.decode('utf-8').split(os.linesep)[13:-1]
    candidates = []
    candidates = []


    for entry in entries:
    with open('{}.fa.2.5.7.80.10.50.2000.dat'.format(seqid)) as file:
        for entry in islice(file, 15, None):
            split = entry.split(' ')
            split = entry.split(' ')
            candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]),
            candidates.append(TandemRepeat([int(split[0]), int(split[1])], int(split[2]), float(split[3]),
                int(split[5]), int(split[6]), int(split[7]), float(split[12])))
                    int(split[5]), int(split[6]), int(split[7]), float(split[12]), split[-1]))
    return candidates
    return candidates


def filter_candidates(candidates):
def filter_candidates(candidates):
+1 −1
Original line number Original line Diff line number Diff line
@@ -27,7 +27,7 @@ class TE(object):


    def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0],
    def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0],
                 ltr_left_location=[0, 0], ltr_right_location=[0, 0],
                 ltr_left_location=[0, 0], ltr_right_location=[0, 0],
                 tsr_left=[0, 0], tsr_right=[0, 0], features={}, score=None):
                 tsr_left=[0, 0], tsr_right=[0, 0], features={'domains': []}, score=None):
        self.ppt = ppt
        self.ppt = ppt
        self.pbs = pbs
        self.pbs = pbs
        self.location = location
        self.location = location
+6 −1
Original line number Original line Diff line number Diff line
@@ -6,6 +6,7 @@ import subprocess
from nested.config.config import config, args_dict_to_list
from nested.config.config import config, args_dict_to_list
from nested.output.gff import GFFMaker
from nested.output.gff import GFFMaker
from nested.output.solo_gff import SoloGFFMaker
from nested.output.solo_gff import SoloGFFMaker
from nested.output.trf_gff import TrfGFFMaker


DEFAULT_DIRPATH = 'data'
DEFAULT_DIRPATH = 'data'


@@ -13,6 +14,7 @@ class Sketcher(object):
    def __init__(self):
    def __init__(self):
        self._gff_maker = GFFMaker()
        self._gff_maker = GFFMaker()
        self._solo_gff_maker = SoloGFFMaker()
        self._solo_gff_maker = SoloGFFMaker()
        self._trf_gff_maker = TrfGFFMaker()
        self._gff_path = ''
        self._gff_path = ''


    def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'):
    def create_gff(self, nested_element, dirpath, output_fasta_offset=0, format='default'):
@@ -24,6 +26,9 @@ class Sketcher(object):
        self._solo_gff_maker.create_solo_gff(solo_ltrs, path)
        self._solo_gff_maker.create_solo_gff(solo_ltrs, path)
        self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path)
        self._solo_gff_maker.move_ltrs_spliced(solo_ltrs.seqId, path)
    
    
    def create_trf_gff(self, trf, seqId, dirpath):
        """Emit the tandem-repeat GFF through the TRF GFF maker.

        The output location is the ``DEFAULT_DIRPATH`` subdirectory of
        ``dirpath``; ``trf`` and ``seqId`` are forwarded unchanged to
        ``TrfGFFMaker.create_gff``.

        Args:
            trf: tandem repeats to export (forwarded as-is).
            seqId: identifier of the processed sequence (forwarded as-is).
            dirpath: base output directory.
        """
        target_dir = os.path.join(dirpath, DEFAULT_DIRPATH)
        maker = self._trf_gff_maker
        maker.create_gff(trf, seqId, target_dir)


    def sketch(self, id, dirpath):
    def sketch(self, id, dirpath):
        path = os.path.join(dirpath, DEFAULT_DIRPATH)
        path = os.path.join(dirpath, DEFAULT_DIRPATH)
Loading