Commit 86e4fdf4 authored by Pavel Jedlicka's avatar Pavel Jedlicka
Browse files

tandem repeat finder integrated, not properly tested yet

parent 6fc973ec
Loading
Loading
Loading
Loading
+13 −2
Original line number Original line Diff line number Diff line
@@ -8,6 +8,7 @@ from nested.core.nested_element import NestedElement
from nested.core.solo_ltrs import SoloLtrs
from nested.core.solo_ltrs import SoloLtrs
from nested.logging.logger import NesterLogger
from nested.logging.logger import NesterLogger
from nested.config.config import config
from nested.config.config import config
import nested.core.tandem_repeat as tandemRepeatFinder




class Nester(object):
class Nester(object):
@@ -32,7 +33,11 @@ class Nester(object):
        self._find_nesting()
        self._find_nesting()


    def _find_nesting(self):
    def _find_nesting(self):
        nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold)  # find list of nested transposons
        # tandem repeat finder
        nested_list = tandemRepeatFinder.run(self.seqid, self.sequence)
        cropped_sequence = self._crop_sequence(nested_list, self.sequence)
        
        nested_list += self._get_unexpanded_transposon_list(cropped_sequence, self.threshold)  # find list of nested transposons
        nested_list = self._expand_transposon_list(nested_list)        
        nested_list = self._expand_transposon_list(nested_list)        
        nested_list = self._filter_nested_list(nested_list)        
        nested_list = self._filter_nested_list(nested_list)        
        self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
        self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)
@@ -87,8 +92,8 @@ class Nester(object):
        return nested_list
        return nested_list


    def _expand_transposon_list(self, nested_list):
    def _expand_transposon_list(self, nested_list):
        for i in reversed(range(len(nested_list) - 1)):
        # backwards expanding of intervals according to previously found and cropped elements
        # backwards expanding of intervals according to previously found and cropped elements
        for i in reversed(range(len(nested_list) - 1)):
            for j in range(i + 1, len(nested_list)):
            for j in range(i + 1, len(nested_list)):
                nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location)
                nested_list[j].location = intervals.expand(nested_list[i].location, nested_list[j].location)
                nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location,
                nested_list[j].ltr_left_location = intervals.expand(nested_list[i].location,
@@ -115,4 +120,10 @@ class Nester(object):
                result.append(te)
                result.append(te)
        return result
        return result
    
    
    def _crop_sequence(self, elements, sequence):
        cropped = sequence
        for element in elements:
            cropped = cropped[:element.location[0]] + cropped[element.location[1]:]
        return cropped
    
    
    
+104 −0
Original line number Original line Diff line number Diff line
#!/usr/bin/env python3
import os
import subprocess
from itertools import islice

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from nested.core.te import TE
from nested.utils import intervals

class TandemRepeat(TE):
    """A tandem repeat candidate together with its reported statistics.

    The location and score are handed to the ``TE`` base class; the remaining
    statistics are stored directly on the instance.
    """

    def __init__(self, loc, period_size, copies, matches, indels, scor, entropy):
        """Store the repeat's location and statistics.

        Args:
            loc: Location of the repeat, forwarded to ``TE`` as ``location``.
            period_size: Size of the repeated unit.
            copies: Number of copies of the repeated unit.
            matches: Percentage of matches.
            indels: Percentage of indels.
            scor: Score, forwarded to ``TE`` as ``score``.
            entropy: Entropy value of the repeat.
        """
        super().__init__(location=loc, score=scor)
        self.period_size = period_size
        self.copies = copies
        self.matches = matches
        self.indels = indels
        self.entropy = entropy

    def is_better(self, other):
        """Tell whether this repeat should be preferred over ``other``.

        Four criteria are counted in favour of ``self``: more matches, fewer
        indels, a higher score and a higher entropy. More than two wins make
        ``self`` better, fewer than two make it worse, and an exact 2:2 tie
        is decided by which location is longer.

        Args:
            other: The object to compare against.

        Returns:
            bool: ``True`` when ``self`` is preferred. Always ``False`` when
            ``other`` is not a ``TandemRepeat``.
        """
        if not isinstance(other, TandemRepeat):
            return False

        # Booleans add up as 0/1, giving the number of criteria won by self.
        wins = sum((
            self.matches > other.matches,
            self.indels < other.indels,
            self.score > other.score,
            self.entropy > other.entropy,
        ))
        if wins > 2:
            return True
        if wins < 2:
            return False
        # Exactly two criteria each: the longer interval wins.
        return intervals.length(self.location) > intervals.length(other.location)

    def __str__(self):
        """Return a multi-line, human-readable summary of the repeat."""
        labelled_values = (
            ('location', self.location),
            ('period size', self.period_size),
            ('number of copies', self.copies),
            ('% of matches', self.matches),
            ('% of indels', self.indels),
            ('score', self.score),
            ('entropy', self.entropy),
        )
        return os.linesep.join(
            '{}: {}'.format(label, value) for label, value in labelled_values
        )

def run(seqid, sequence):
    """Run the external ``trf`` tool on one sequence and return its repeats.

    The sequence is written as a FASTA record to
    ``/tmp/nested/trf/<seqid>.fa``, ``trf`` is executed on that file, and its
    standard output is parsed with ``get_candidates`` and reduced with
    ``filter_candidates``.

    Args:
        seqid: Identifier of the sequence; used as the FASTA record id and as
            the base name of the temporary file.
        sequence: Sequence object wrapped into a ``SeqRecord`` for writing.

    Returns:
        list: The filtered repeats, sorted by start position in descending
        order.

    Raises:
        OSError: If the working directory or FASTA file cannot be created, or
            the ``trf`` executable cannot be started.
    """
    work_dir = '/tmp/nested/trf'
    # A single makedirs call creates '/tmp/nested' as well, and exist_ok
    # removes the check-then-create race when several runs start at once.
    os.makedirs(work_dir, exist_ok=True)

    # NOTE(review): seqid goes into the file name unchanged; an id that
    # contains a path separator would point outside work_dir.
    fasta_path = '{}/{}.fa'.format(work_dir, seqid)
    with open(fasta_path, 'w') as tmp_file:
        SeqIO.write(SeqRecord(sequence, id=seqid), tmp_file, 'fasta')

    # The seven numeric arguments are presumably TRF's Match, Mismatch,
    # Delta, PM, PI, Minscore and MaxPeriod -- confirm against the installed
    # trf version's usage text.
    command = ['trf', fasta_path,
               str(2), str(5), str(7), str(80), str(10), str(50), str(2000),
               '-m', '-d', '-h']
    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # stderr is captured only to keep it off the console; neither it nor the
    # exit status is inspected.
    stdout, _ = process.communicate()

    repeats = filter_candidates(get_candidates(stdout))
    repeats.sort(key=lambda r: r.location[0], reverse=True)
    return repeats

def get_candidates(raw_output):
    """Parse raw tool output into a list of ``TandemRepeat`` candidates.

    The bytes are decoded as UTF-8 and split on ``os.linesep``. The first 13
    resulting lines and the last one are discarded; every remaining line is
    split on single spaces and converted into one ``TandemRepeat``.

    Args:
        raw_output (bytes): Output captured from the repeat finder.

    Returns:
        list: One ``TandemRepeat`` per parsed line, in input order.

    Raises:
        ValueError: If a field cannot be converted to a number.
        IndexError: If a line has fewer than 13 space-separated fields.
    """
    rows = raw_output.decode('utf-8').split(os.linesep)[13:-1]

    def parse_row(row):
        # Fields 0-1 form the location; 2, 3, 5, 6, 7 and 12 feed the
        # remaining constructor arguments. Fields 4 and 8-11 are not used.
        fields = row.split(' ')
        location = [int(fields[0]), int(fields[1])]
        period_size = int(fields[2])
        copies = float(fields[3])
        matches = int(fields[5])
        indels = int(fields[6])
        score = int(fields[7])
        entropy = float(fields[12])
        return TandemRepeat(location, period_size, copies,
                            matches, indels, score, entropy)

    return [parse_row(row) for row in rows]

def filter_candidates(candidates):
    """Select the repeats worth keeping from a list of candidates.

    The input list is sorted in place by start position. A candidate is
    kept unless one of two checks rejects it:

    1. Some candidate that intersects it (and is not skipped by the
       ``intervals.contains`` test) is one it is not better than, according
       to ``is_better``.
    2. An already accepted repeat passes
       ``intervals.contains(repeat.location, candidate.location)``.

    Args:
        candidates (list): Candidate repeats; reordered by this call.

    Returns:
        list: The accepted candidates, in ascending start order.
    """
    # NOTE(review): intervals.contains(a, b) presumably means "a contains b"
    # -- confirm against nested.utils.intervals.
    candidates.sort(key=lambda r: r.location[0])
    accepted = []

    for current in candidates:
        keep = True

        # Check 1: compare against every candidate that starts no later than
        # the end of the current one (the list is sorted by start).
        for rival in candidates:
            if rival.location[0] > current.location[1]:
                break
            if intervals.contains(current.location, rival.location):
                continue
            if (intervals.intersect(current.location, rival.location)
                    and not current.is_better(rival)):
                keep = False
                break

        # Check 2: reject when an accepted repeat already covers this one.
        for kept in accepted:
            if kept.location[0] > current.location[1]:
                break
            if intervals.contains(kept.location, current.location):
                keep = False
                break

        if keep:
            accepted.append(current)

    return accepted