Skip to content
Snippets Groups Projects
Commit 64f80f3b authored by Pavel Jedlicka's avatar Pavel Jedlicka
Browse files

LTRharvest implemented; can add more discovery tools

parent ec377525
No related branches found
No related tags found
1 merge request!23Feature/custom discovery tool
......@@ -7,3 +7,8 @@ tmp
images
results
config.yml
LTR_Finder/
build/
nested.egg-info/
dist/
.vscode/
......@@ -14,7 +14,17 @@ ltr:
w: 2 #don't change, parsing format required!
a: *prosite_path
s: *tRNAdb_path
ltr_harvest:
path: 'gt'
command: 'ltrharvest'
args:
similar: 40
maxlenltr: 3000
maxdistltr: 20000
mintsd: 4
maxtsd: 10
gt:
path: 'gt'
command: 'sketch'
......
......@@ -12,7 +12,8 @@ from Bio import SeqIO
from nested.core.nester import Nester
from nested.output.sketcher import Sketcher
from nested.core.sequence_thread import SequenceThread
from nested.core.discovery_tool import DiscoveryTool
from nested.utils.dependency_resolver import DependencyResolver
@click.command()
......@@ -24,28 +25,36 @@ from nested.core.sequence_thread import SequenceThread
@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
@click.option('--threads', '-n', type=int, default=1, help='Number of threads')
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads):
@click.option('--discovery_tool', '-dt', default=DiscoveryTool.LTR_finder.name,
type=click.Choice(list(map(lambda val: val, DiscoveryTool.__members__))),
help='Determines which tool is used for retrotransoson discovery. Default: LTR_finder')
def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold,
threshold_multiplier, threads, discovery_tool):
check_permissions(output_folder, os.W_OK | os.X_OK)
number_of_errors = [0]
start_time = datetime.now()
sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
sketcher = Sketcher()
futuress = []
dependencyResolver = DependencyResolver(discovery_tool)
with futures.ThreadPoolExecutor(threads) as executor:
for sequence in sequences:
futuress.append(executor.submit(process_sequence, sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, number_of_errors))
futuress.append(executor.submit(process_sequence, sequence, sketcher,
sketch, format, output_fasta_offset, output_folder,
initial_threshold, threshold_multiplier, number_of_errors,
dependencyResolver))
futures.wait(futuress)
end_time = datetime.now()
print('Total time: {}'.format(end_time - start_time))
print('Number of errors: {}'.format(number_of_errors[0]))
def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, errors):
def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, errors, dependency_resolver):
sequence.id = sequence.id.replace('/', '--')
seq_start_time = datetime.now()
strlen = 15
print("Processing {}".format(sequence.id))
try:
nester = Nester(sequence, initial_threshold, threshold_multiplier)
nester = Nester(sequence, initial_threshold, threshold_multiplier, dependency_resolver)
sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
if sketch:
......@@ -57,14 +66,14 @@ def process_sequence(sequence, sketcher, sketch, format, output_fasta_offset, ou
except KeyboardInterrupt:
cleanup()
raise
except CalledProcessError:
except CalledProcessError as ex:
cleanup()
errors[0] += 1
print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
except Exception:
print('Processing {}: SUBPROCESS ERROR: {}'.format(sequence.id[:strlen], ex))
except Exception as ex:
cleanup()
errors[0] += 1
print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])
print('Processing {}: UNEXPECTED ERROR: {}'.format(sequence.id[:strlen], ex))
def check_permissions(path, permissions):
path = os.getcwd() if len(path) == 0 else path
......
#!/usr/bin/env python3
from enum import Enum
class DiscoveryTool(Enum):
LTR_finder = 1
LTRharvest = 2
#add aliases with same value as the tool above
finder = 1
harvest = 2
#!/usr/bin/env python3
from nested.core.te import run_ltr_finder
#from nested.core.te import run_ltr_finder
# from nested.core.domain import run_blastx
# from nested.core.domain_dante import run_last
from nested.core.domain_dante import run_blastx
from nested.scoring.graph import Graph
from nested.utils import intervals
from nested.utils.dependency_resolver import DependencyResolver
class Gene(object):
......@@ -20,10 +21,11 @@ class Gene(object):
"""
def __init__(self, seqid=None, sequence=None):
def __init__(self, dependency_resolver, seqid=None, sequence=None):
self.seqid = seqid
self.sequence = sequence
self.te_list = run_ltr_finder(seqid, sequence)
self.te_list = dependency_resolver.discovery_tool.run(seqid, sequence)
#self.te_list = run_ltr_finder(seqid, sequence)
self.domain_list = run_blastx(sequence)
# self.domain_list = run_last(sequence)
self.scores = []
......
......@@ -19,7 +19,7 @@ class Nester(object):
nested_element (NestedElement): nested element
"""
def __init__(self, sequence, threshold, multiplier):
def __init__(self, sequence, threshold, multiplier, dependency_resolver):
self.seqid = sequence.id
self.sequence = sequence.seq
self.nested_element = None
......@@ -28,6 +28,7 @@ class Nester(object):
self._logger = NesterLogger('{}/{}'.format(config['logdir'], self.seqid))
self.threshold = threshold
self.multiplier = multiplier
self.dependency_resolver = dependency_resolver
self._find_nesting()
def _find_nesting(self):
......@@ -39,7 +40,7 @@ class Nester(object):
def _get_unexpanded_transposon_list(self, sequence, threshold):
# recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons
self._iteration += 1
gene = Gene(self.seqid, sequence)
gene = Gene(self.dependency_resolver, self.seqid, sequence)
candidates = gene.get_candidates_above_threshold(threshold)
if not candidates:
best_candidate = gene.get_best_candidate()
......
......@@ -49,58 +49,58 @@ class TE(object):
# ' features = {},'.format(self.features),
' score = {}}}'.format(self.score)]
return '\n'.join(lines)
def run_ltr_finder(seqid, sequence):
"""Run LTR finder on sequence and return list of transposons
Arguments:
seqid (str): sequence id
sequence (Bio.Seq.Seq): sequence
tmp_dir (str): Auxiliary existing directory
Returns:
list[TE]: list of found ltr pairs as a TE class
"""
transposons = []
if not os.path.exists('/tmp/nested'):
os.makedirs('/tmp/nested')
if not os.path.exists('/tmp/nested/ltr'):
os.makedirs('/tmp/nested/ltr')
with open('/tmp/nested/ltr/{}.fa'.format(seqid), 'w+') as tmp_file: # prepare tmp fasta file for ltr finder
SeqIO.write(SeqRecord(sequence, id=seqid),
tmp_file,
'fasta')
# call LTR finder and feed stdin to it
process = subprocess.Popen(
[config['ltr']['path']] + args_dict_to_list(config['ltr']['args']) + ['/tmp/nested/ltr/{}.fa'.format(seqid)],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# os.remove('/tmp/nested/ltr/{}.fa'.format(seqid))
stdout, stderr = process.communicate()
parsed_output = parse_raw_output(stdout)
for entry in parsed_output:
params = {
'ppt': entry['ppt'],
'pbs': entry['pbs'],
'location': entry['location'],
'ltr_left_location': entry['ltr_left_location'],
'ltr_right_location': entry['ltr_right_location'],
'tsr_right': entry['tsr_right'],
'tsr_left': entry['tsr_left'],
}
transposons.append(TE(**params))
return transposons
@staticmethod
def run(seqid, sequence):
"""Run LTR finder on sequence and return list of transposons
Arguments:
seqid (str): sequence id
sequence (Bio.Seq.Seq): sequence
tmp_dir (str): Auxiliary existing directory
Returns:
list[TE]: list of found ltr pairs as a TE class
"""
transposons = []
if not os.path.exists('/tmp/nested'):
os.makedirs('/tmp/nested')
if not os.path.exists('/tmp/nested/ltr'):
os.makedirs('/tmp/nested/ltr')
with open('/tmp/nested/ltr/{}.fa'.format(seqid), 'w+') as tmp_file: # prepare tmp fasta file for ltr finder
SeqIO.write(SeqRecord(sequence, id=seqid),
tmp_file,
'fasta')
# call LTR finder and feed stdin to it
process = subprocess.Popen(
[config['ltr']['path']] + args_dict_to_list(config['ltr']['args']) + ['/tmp/nested/ltr/{}.fa'.format(seqid)],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# os.remove('/tmp/nested/ltr/{}.fa'.format(seqid))
stdout, stderr = process.communicate()
parsed_output = parse_raw_output(stdout)
for entry in parsed_output:
params = {
'ppt': entry['ppt'],
'pbs': entry['pbs'],
'location': entry['location'],
'ltr_left_location': entry['ltr_left_location'],
'ltr_right_location': entry['ltr_right_location'],
'tsr_right': entry['tsr_right'],
'tsr_left': entry['tsr_left'],
}
transposons.append(TE(**params))
return transposons
def parse_raw_output(raw_output):
"""Parse raw output from LTR finder
......
#!/usr/bin/env python3
import os
import subprocess
import re
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from nested.config.config import config, args_dict_to_list
class TE_harvest(object):
"""Class representing TE. Every location is in format [from, to].
Attributes:
ppt (list): location of ppt
pbs (list): location of pbs
location (list): position of TE in gene
ltr_left_location (list): location of left ltr
ltr_right_location (list): location of right ltr
tsr_left (list): location of left tsr
tsr_right (list): location of right tsr
features (dict): dictionary of features assigned to TE (i.e. list of domains on the optimal path in graph)
score (float): evaluated score of TE
"""
def __init__(self, ppt=[0, 0], pbs=[0, 0], location=[0, 0],
ltr_left_location=[0, 0], ltr_right_location=[0, 0],
tsr_left=[0, 0], tsr_right=[0, 0], features={}, score=None):
self.ppt = ppt
self.pbs = pbs
self.location = location
self.ltr_left_location = ltr_left_location
self.ltr_right_location = ltr_right_location
self.tsr_left = tsr_left
self.tsr_right = tsr_right
self.features = features
self.score = score
def __str__(self):
lines = ['{{location = {},'.format(self.location),
' left ltr = {},'.format(self.ltr_left_location),
' right ltr = {},'.format(self.ltr_right_location),
' left tsr = {},'.format(self.tsr_left),
' right tsr = {},'.format(self.tsr_right),
' ppt = {},'.format(self.ppt),
' pbs = {},'.format(self.pbs),
# ' features = {},'.format(self.features),
' score = {}}}'.format(self.score)]
return '\n'.join(lines)
@staticmethod
def run(seqid, sequence):
"""Run LTR finder on sequence and return list of transposons
Arguments:
seqid (str): sequence id
sequence (Bio.Seq.Seq): sequence
tmp_dir (str): Auxiliary existing directory
Returns:
list[TE]: list of found ltr pairs as a TE class
"""
transposons = []
if not os.path.exists('/tmp/nested'):
os.makedirs('/tmp/nested')
if not os.path.exists('/tmp/nested/ltr'):
os.makedirs('/tmp/nested/ltr')
with open('/tmp/nested/ltr/{}.fa'.format(seqid), 'w+') as tmp_file: # prepare tmp fasta file for ltr finder
SeqIO.write(SeqRecord(sequence, id=seqid),
tmp_file,
'fasta')
process = subprocess.Popen(['gt', 'suffixerator', '-db', '/tmp/nested/ltr/{}.fa'.format(seqid),
'-indexname', '/tmp/nested/ltr/{}.fa'.format(seqid), '-tis', '-suf', '-lcp', '-des', '-ssp', '-sds', '-dna'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
process.communicate()
# call LTR harvest
process = subprocess.Popen(
[config['ltr_harvest']['path']] + [config['ltr_harvest']['command']] +
args_dict_to_list(config['ltr_harvest']['args']) +
['-index', '/tmp/nested/ltr/{}.fa'.format(seqid)] +
['-longoutput'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
parsed_output = parse_raw_output(stdout)
for entry in parsed_output:
params = {
'ppt': entry['ppt'],
'pbs': entry['pbs'],
'location': entry['location'],
'ltr_left_location': entry['ltr_left_location'],
'ltr_right_location': entry['ltr_right_location'],
'tsr_right': entry['tsr_right'],
'tsr_left': entry['tsr_left'],
}
transposons.append(TE_harvest(**params))
return transposons
def parse_raw_output(raw_output):
"""Parse raw output from LTR harvest
Arguments:
raw_output (str): ltr finder raw output (args -w 2)
Returns:
dict: parsed raw output
"""
entries = raw_output.decode('utf-8').split(os.linesep)[13:-1]
result = []
for entry in entries:
split_line = entry.split(' ')
transposon = { "location": [int(split_line[0]), int(split_line[1])],
"ltr_left_location": [int(split_line[3]), int(split_line[4])],
"ltr_right_location": [int(split_line[8]), int(split_line[9])],
"ppt": [float("nan"), float("nan")],
"pbs": [float("nan"), float("nan")]
}
try:
tsr_right = [int(split_line[1]) + 1, int(split_line[1]) + int(split_line[7])]
tsr_left = [int(split_line[0]) - int(split_line[7]), int(split_line[0]) - 1]
except:
tsr_right = [float("nan"), float("nan")]
tsr_left = [float("nan"), float("nan")]
transposon["tsr_right"] = tsr_right
transposon["tsr_left"] = tsr_left
result.append(transposon)
return result
......@@ -29,8 +29,15 @@ class Sketcher(object):
path = os.path.join(dirpath, DEFAULT_DIRPATH)
print(path)
null = open(os.devnull, 'w')
process = subprocess.check_output(
[config['gt']['path'], config['gt']['command']] +
try:
process = subprocess.check_output(
[config['gt']['path'], config['gt']['command']] +
args_dict_to_list(config['gt']['args']) +
['{}/{}/{}.png'.format(path, id, id)] +
['{}/{}/{}.gff'.format(path, id, id)], stderr=null)
except Exception:
command = " ".join([config['gt']['path'], config['gt']['command']] +
args_dict_to_list(config['gt']['args']) +
['{}/{}/{}.png'.format(path, id, id)] +
['{}/{}/{}.gff'.format(path, id, id)], stderr=null)
['{}/{}/{}.gff'.format(path, id, id)])
raise Exception("Command \'{}\' returned non-zero status code. Problem with package genometools.".format(command))
#!/usr/bin/env python3
from nested.core.te_harvest import TE_harvest #ltr harvest
from nested.core.te import TE #ltr finder
from nested.core.discovery_tool import DiscoveryTool
class DependencyResolver():
def __init__(self, tool_name):
self.discovery_tool = self.resolve_discovery_tool(tool_name)
def resolve_discovery_tool(self, tool_name):
valueToClassDict = {1: TE, 2: TE_harvest}
value = 1
if (tool_name in DiscoveryTool.__members__):
value = DiscoveryTool[tool_name].value
return valueToClassDict[value]
\ No newline at end of file
#!/bin/bash
processEcho () {
echo ""
echo "#####"
echo "# $1"
echo "#####"
echo ""
}
installApt () {
# check [$1=binName; $2=aptName]
if hash $1 2>/dev/null; then
processEcho "$1 is already installed!"
else
if [[ $1 == gt ]]; then
processEcho "Installing libgenometools0, libgenometools-dev and genometools"
apt-get install libgenometools0 libgenometools0-dev genometools
else
apt-get install $2
fi
fi
}
installPip3Modul () {
processEcho "Installing $1"
pip3 install $1
}
# apt dependencies
aptBinNames=(build-essential gcc pip3 git gt blastx)
aptNames=(build-essential gcc python3-pip git genometools blast2)
# add also libgenometools0 libgenometools0-dev for gt
# pip3 dependencies
pip3Moduls=(setuptools biopython bcbio-gff click decorator networkx numpy PyYAML six)
# main process
if [[ $EUID -ne 0 ]]; then
echo "You should run as root!"
exit 1
else
# Prequisites installation
processEcho "CHECKING AND INSTALLING DEPENDENCIES"
processEcho "APT PACKAGES"
apt-get update
for i in "${!aptBinNames[@]}"
do
installApt "${aptBinNames[$i]}" "${aptNames[$i]}"
done
processEcho "PYTHON3 PACKAGES"
for pckg in "${pip3Moduls[@]}"
do
processEcho "Installing $pckg"
pip3 install $pckg
done
# LTR_Finder installation
# check
processEcho "LTR_FINDER CHECK & INSTALLATION"
path2LtrFinder=""
if hash ltr_finder 2>/dev/null; then
processEcho "$1 is already installed!"
path2LtrFinder="ltr_finder"
else
processEcho "LTR_Finder installation"
git clone https://github.com/xzhub/LTR_Finder.git
cd LTR_Finder/source
make
path=$(pwd)
path2LtrFinder=$(pwd)"/ltr_finder"
cd ../../
fi
# Setup paths in 'config.yml'
processEcho "GENERATING 'config.yml'"
# strings to be changed
ltrFinder="/path/to/ltr_finder_repository/LTR_Finder/source/ltr_finder"
prosite="path/to/repository/dbs/prosite"
tRNAs="/path/to/ltr_finder_repository/LTR_Finder/source/tRNA/Athal-tRNAs.fa"
gtStyle="/path/to/nested/nested/config/gt.style"
domDb="path/to/repository/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa"
origPaths=($ltrFinder $prosite $tRNAs $gtStyle $domDb)
# real paths
path2prosite=$(pwd)"/dbs/prosite"
path2tRNAs=$(pwd)"/LTR_Finder/source/tRNA/Athal-tRNAs.fa"
path2gtStyle=$(pwd)"/nested/config/gt.style"
path2domDb=$(pwd)"/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa"
realPaths=($path2LtrFinder $path2prosite $path2tRNAs $path2gtStyle $path2domDb)
# replacing paths
IFS=''
while read -r line
do
line=${line//$(echo $ltrFinder | sed 's_/_\\/_g')/$(echo $path2LtrFinder)}
line=${line//$(echo $prosite | sed 's_/_\\/_g')/$(echo $path2prosite)}
line=${line//$(echo $tRNAs | sed 's_/_\\/_g')/$(echo $path2tRNAs)}
line=${line//$(echo $gtStyle | sed 's_/_\\/_g')/$(echo $path2gtStyle)}
line=${line//$(echo $domDb | sed 's_/_\\/_g')/$(echo $path2domDb)}
echo $line >> config.yml
done<config_template.yml
processEcho "'config.yml' was generated"
# installing nester
processEcho "INSTALLING Te-greedy-nester"
./setup.sh
processEcho "Te-greedy-nester has been installed"
fi
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment