Commit 3dafa070 authored by Pavel Jedlicka's avatar Pavel Jedlicka
Browse files

Merge branch 'feature/custom-discovery-tool' into 'master'

Feature/custom discovery tool

See merge request !23
parents d693a0de b69bdbc2
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -7,3 +7,8 @@ tmp
images
results
config.yml
LTR_Finder/
build/
nested.egg-info/
dist/
.vscode/

LTR_Finder @ eefef6a6

Original line number Diff line number Diff line
Subproject commit eefef6a6046338f0f69ef8003077c0dabcaedf8e
+9 −0
Original line number Diff line number Diff line
@@ -141,6 +141,7 @@ Options:
  -s, --sketch_only       If true, nesting is not computed. Genes are sketched
                          only from existing gff files.
  -d, --data_folder TEXT  Output data folder.
  -dt, --discovery_tool TEXT  Discovery tool to use. Default is LTR_finder.
  --help                  Show this message and exit.
```

@@ -162,3 +163,11 @@ Options:
  -d, --data_folder TEXT          Output data folder.
  --help                          Show this message and exit.
```

## Adding a new LTR discovery tool

* Implement the `run()` method in a new TE class based on core/TE_template.py.

* Add a new value to core/discovery_tool.py, including any aliases. Make sure the numeric value differs from those of the already implemented tools and is the same for all of your aliases. (This value is used when calling nester: ``-dt <value you set>``.)

* Import your newly implemented class in utils/dependency_resolver.py and add it to the ``valueToClassDict`` attribute. (The key is the numeric value you set in core/discovery_tool.py.)
+11 −1
Original line number Diff line number Diff line
@@ -15,6 +15,16 @@ ltr:
    a: *prosite_path
    s: *tRNAdb_path

ltr_harvest:
  path: 'gt'
  command: 'ltrharvest'
  args:
    similar: 40
    maxlenltr: 3000
    maxdistltr: 20000
    mintsd: 4
    maxtsd: 10
       
gt:
  path: 'gt'
  command: 'sketch'
+33 −14
Original line number Diff line number Diff line
#!/usr/bin/env python3

import os
import sys
import click
import glob
from concurrent import futures
@@ -12,7 +11,8 @@ from Bio import SeqIO

from nested.core.nester import Nester
from nested.output.sketcher import Sketcher
from nested.core.sequence_thread import SequenceThread
from nested.core.discovery_tool import DiscoveryTool
from nested.utils.dependency_resolver import DependencyResolver


@click.command()
@@ -24,30 +24,39 @@ from nested.core.sequence_thread import SequenceThread
@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
@click.option('--threads', '-n', type=int, default=1, help='Number of threads')
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads):    
@click.option('--discovery_tool', '-dt', default=DiscoveryTool.LTR_finder.name,
    type=click.Choice(list(map(lambda val: val, DiscoveryTool.__members__))),
    help='Determines which tool is used for retrotransoson discovery. Default: LTR_finder')
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, 
        threshold_multiplier, threads, discovery_tool):
    check_ram(input_fasta, threads)
    check_permissions(output_folder, os.W_OK | os.X_OK)
    number_of_errors = [0]
    start_time = datetime.now()
    sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
    sketcher = Sketcher()
    futuress = []
    dependencyResolver = DependencyResolver(discovery_tool)
    with futures.ThreadPoolExecutor(threads) as executor:
        for sequence in sequences:
            futuress.append(executor.submit(process_sequence, sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, number_of_errors))
        for sequence in SeqIO.parse(input_fasta, 'fasta'):
            futuress.append(executor.submit(process_sequence, sequence, sketcher, 
                sketch, format, output_fasta_offset, output_folder, 
                initial_threshold, threshold_multiplier, number_of_errors,
                dependencyResolver))
        futures.wait(futuress)
    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
    print('Number of errors: {}'.format(number_of_errors[0]))

def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, errors):
def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, errors, dependency_resolver):
    sequence.id = sequence.id.replace('/', '--')
    seq_start_time = datetime.now()
    strlen = 15
    print("Processing {}".format(sequence.id))
    try:
        nester = Nester(sequence, initial_threshold, threshold_multiplier)
        nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver)
        sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
        sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
        sketcher.create_trf_gff(nester.trf, nester.seqid, dirpath=output_folder)
        if sketch:
            if format != 'default':
                sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
@@ -57,14 +66,14 @@ def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, ou
    except KeyboardInterrupt:
        cleanup()
        raise
    except CalledProcessError:
    except CalledProcessError as ex:
        cleanup()
        errors[0] += 1
        print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
    except Exception:
        print('Processing {}: SUBPROCESS ERROR: {}'.format(sequence.id[:strlen], ex))
    except Exception as ex:
        cleanup()
        errors[0] += 1
        print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])
        print('Processing {}: UNEXPECTED ERROR: {}'.format(sequence.id[:strlen], ex))

def check_permissions(path, permissions):
    path = os.getcwd() if len(path) == 0 else path
@@ -76,6 +85,16 @@ def check_permissions(path, permissions):
        except Exception:
            return        

def check_ram(input_file, n):
    """Print a warning when the analysis may exhaust physical memory.

    The check is advisory only: it never raises because of the platform
    and never aborts the run. It compares an estimate based on the ``n``
    longest sequences in ``input_file`` with 80% of the machine's
    physical memory.

    Args:
        input_file: FASTA file (anything ``SeqIO.parse`` accepts) holding
            the sequences to be analysed.
        n: Number of sequences considered at once; callers pass the
            worker thread count. Values <= 0 select no sequences.

    Returns:
        None. The only effect is an optional warning on stdout.
    """
    try:
        page_size = os.sysconf('SC_PAGE_SIZE')
        page_count = os.sysconf('SC_PHYS_PAGES')
    except (AttributeError, ValueError, OSError):
        # os.sysconf is missing on non-POSIX platforms and raises for
        # names the system does not define. This is only a courtesy
        # warning, so skip it instead of crashing the whole run.
        return
    if page_size <= 0 or page_count <= 0:
        # sysconf can report an indeterminate limit as a non-positive
        # value; there is nothing meaningful to compare against.
        return

    gib = 1024 ** 3
    mem_gib = page_size * page_count / gib

    # Sequence lengths expressed in GiB (one byte per residue), ascending.
    seq_sizes = sorted(
        len(seq) / gib for seq in SeqIO.parse(input_file, 'fasta')
    )
    # A plain seq_sizes[-n:] would select *every* sequence for n == 0
    # because -0 == 0, so guard the slice explicitly.
    largest = seq_sizes[-n:] if n > 0 else []

    # NOTE(review): the factor 15 looks like an empirical estimate of the
    # memory needed per sequence byte -- confirm with the author.
    if (sum(largest) * 15) > (0.8 * mem_gib):
        print("\033[93m" + "During the analysis memory usage may exceed 80% of system's total physical memory." + "\033[0m")

def cleanup():
    for file in glob.glob("*ltrs.fa"):
        os.remove(file)
Loading