Skip to content
Snippets Groups Projects
Commit a4ca8760 authored by rlapar's avatar rlapar
Browse files

setuptools

parent 0ec1e3e6
No related branches found
No related tags found
1 merge request: !9 setuptools
Showing
with 178 additions and 23 deletions
# Package metadata; setup.py reads both values via `nested.__version__`
# and `nested.__author__` when building the distribution.
__version__ = '0.1.0'
__author__ = 'Radovan Lapár'
#!/usr/bin/env python3
import click
from datetime import datetime
from nested.core.generator import Generator
from nested.output.sketcher import Sketcher
@click.command()
@click.argument('input_db', required=True, type=click.Path(exists=True))
@click.argument('output_db', required=True, type=str)
@click.option('--baselength', '-l', type=int, help='Baselength for generated elements.')
@click.option('--number_of_iterations', '-i', type=int, help='Number of iterations in generating.')
@click.option('--number_of_elements', '-n', type=int, help='Number of generated elements.')
# NOTE: a boolean flag must not carry `type=str`; with a string type the
# default `False` can be converted to the truthy string 'False' (depends on
# the installed click version), which would switch filtering on permanently.
@click.option('--filter', '-f', is_flag=True, default=False, help='Filter database and create new one with given output db path.')
@click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].')
@click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].')
@click.option('--data_folder', '-d', type=str, help='Output data folder.')
def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset, data_folder):
    """Generate artificial nested elements from INPUT_DB, or filter INPUT_DB into OUTPUT_DB."""
    # The docstring above is intentionally short: click shows it as the
    # command's --help text.
    #
    # Two modes, selected by --filter:
    #   * filter mode   - write the filtered copy of INPUT_DB to OUTPUT_DB;
    #   * generate mode - build random nested elements, save them as fasta
    #                     under the data folder and create a gff + sketch
    #                     for every generated element.
    # Options that were not given on the command line are not forwarded, so
    # the Generator methods fall back to their own defaults.
    # `filter` shadows the builtin, but the name is dictated by the option.
    start_time = datetime.now()
    generator = Generator(input_db)

    if filter:
        params = {'verbose': True}
        if filter_string:
            params['filter_string'] = filter_string
        # An offset of 0 is a meaningful request (no slack allowed), so test
        # against None instead of truthiness.
        if filter_offset is not None:
            params['ltr_offset'] = filter_offset
        generator.filter_db(output_db, **params)
    else:
        import os  # local import: only needed to prepare the output folder

        # Honour --data_folder; keep the previous hard-coded folder as the
        # default so existing invocations behave the same.
        output_folder = data_folder if data_folder else 'generated_data'
        os.makedirs(output_folder, exist_ok=True)

        params = {}
        if baselength:
            params['baselength'] = baselength
        if number_of_iterations:
            params['number_of_iterations'] = number_of_iterations
        if number_of_elements:
            params['number_of_elements'] = number_of_elements
        generator.generate_random_nested_elements(**params)
        generator.save_elements_to_fasta('{}/{}'.format(output_folder, output_db))

        sketcher = Sketcher()
        for element in generator.elements:
            sketcher.create_gff(element, output_folder)
            sketcher.sketch(element.id, output_folder)

    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
if __name__ == '__main__':
main()
\ No newline at end of file
#!/usr/bin/env python3
import sys
import click
from datetime import datetime
from subprocess import CalledProcessError
from Bio import SeqIO
from nested.core.nester import Nester
from nested.output.sketcher import Sketcher
@click.command()
@click.argument('input_fasta', required=True, type=click.Path(exists=True))
@click.option('--sketch_only', '-s', is_flag=True, help='If true, nesting is not computed. Genes are sketched only from existing gff files.')
@click.option('--data_folder', '-d', type=str, help='Output data folder.')
def main(input_fasta, sketch_only, data_folder):
    """Nest the sequences of INPUT_FASTA and sketch the result for each of them."""
    # The docstring above is intentionally short: click shows it as the
    # command's --help text.
    #
    # `data_folder` has to be a parameter here: click passes every declared
    # option as a keyword argument, so omitting it makes each invocation fail
    # with "unexpected keyword argument 'data_folder'".
    number_of_errors = 0
    start_time = datetime.now()

    # Read all records up front and close the input file right away.
    with open(input_fasta) as fasta_file:
        sequences = list(SeqIO.parse(fasta_file, 'fasta'))

    sketcher = Sketcher()
    # Forward the folder only when it was given, so the Sketcher's own
    # default is used otherwise.
    # NOTE(review): assumes create_gff/sketch take the output folder as their
    # second positional argument - confirm against nested.output.sketcher.
    folder_args = (data_folder,) if data_folder else ()

    strlen = 15  # ids are truncated to this many characters in progress output
    for sequence in sequences:
        # '/' is replaced in the id before it is handed to Nester/Sketcher
        # (presumably because the id ends up in file names - verify).
        sequence.id = sequence.id.replace('/', '--')
        short_id = sequence.id[:strlen]
        seq_start_time = datetime.now()
        print('Processing {a}...'.format(a=short_id), end='\r')
        try:
            if not sketch_only:
                nester = Nester(sequence)
                sketcher.create_gff(nester.nested_element, *folder_args)
            sketcher.sketch(sequence.id, *folder_args)
            seq_end_time = datetime.now()
            print('Processing {a}: DONE [{b}]'.format(a=short_id, b=seq_end_time - seq_start_time))
        except CalledProcessError:
            number_of_errors += 1
            print('Processing {}: SUBPROCESS ERROR'.format(short_id))
        except Exception as error:
            # Keep going with the next sequence. KeyboardInterrupt and
            # SystemExit are not Exception subclasses and still propagate.
            number_of_errors += 1
            print('Processing {}: UNEXPECTED ERROR:'.format(short_id), type(error))

    end_time = datetime.now()
    print('Total time: {}'.format(end_time - start_time))
    print('Number of errors: {}'.format(number_of_errors))
if __name__ == '__main__':
main()
\ No newline at end of file
File moved
...@@ -5,7 +5,7 @@ from io import StringIO ...@@ -5,7 +5,7 @@ from io import StringIO
from Bio.Blast import NCBIXML from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastxCommandline from Bio.Blast.Applications import NcbiblastxCommandline
import config from nested.config import config
class Domain(object): class Domain(object):
"""Class representing domain parsed from blastx output. """Class representing domain parsed from blastx output.
......
...@@ -23,10 +23,10 @@ class Gene(object): ...@@ -23,10 +23,10 @@ class Gene(object):
def __str__(self):
    """Return a short multi-line summary of the gene.

    The id and the sequence are each cut to 15 characters; '...' is appended
    to a value only when that value itself was longer than the cut.

    Returns:
        str: summary with id, sequence, te_list size and domain_list size
    """
    strlen = 15
    # The ellipsis after the id must depend on the id's own length, not on
    # the sequence length.
    id_suffix = '...' if len(self.seqid) > strlen else ''
    sequence_suffix = '...' if len(self.sequence) > strlen else ''
    lines = ['{{id = {a}{b},'.format(a=self.seqid[:strlen], b=id_suffix),
             ' sequence = {a}{b},'.format(a=self.sequence[:strlen], b=sequence_suffix),
             ' te_list.size = {},'.format(len(self.te_list)),
             ' domain_list.size = {}}}'.format(len(self.domain_list))]
    return '\n'.join(lines)
def get_best_candidate(self): def get_best_candidate(self):
......
...@@ -5,6 +5,7 @@ import random ...@@ -5,6 +5,7 @@ import random
from Bio import SeqIO from Bio import SeqIO
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
from nested.core.gene import Gene
from nested.core.te import TE from nested.core.te import TE
from nested.core.nested_element import NestedElement from nested.core.nested_element import NestedElement
from nested.utils import intervals from nested.utils import intervals
...@@ -13,16 +14,35 @@ class Generator(object): ...@@ -13,16 +14,35 @@ class Generator(object):
"""Class used to generate artificial nested elements for testing purposes """Class used to generate artificial nested elements for testing purposes
Attributes: Attributes:
sourceFile (str): path to source database of TE's in fasta format source_db (str): path to source database of TE's in fasta format
""" """
def __init__(self, source_db): def __init__(self, source_db):
self._source_db = source_db self._source_db = source_db
self.elements = [] self.elements = []
def filter_db(self, filtered_db_path, filter_string='LTR', ltr_offset=100, verbose=False):
    """Write a filtered copy of the source database to `filtered_db_path`.

    An entry is kept when its id contains `filter_string` and the best
    candidate found for it starts within `ltr_offset` of the beginning of
    the sequence and ends within `ltr_offset` of its end.

    Arguments:
        filtered_db_path (str): path of the fasta file to create
        filter_string (str): substring that an entry id has to contain
        ltr_offset (int): maximum allowed distance of the best candidate
            from either end of the sequence
        verbose (bool): print progress to stdout
    """
    # Read the matching records and close the source file right away.
    with open(self._source_db) as source_file:
        sequences = [record for record in SeqIO.parse(source_file, 'fasta')
                     if filter_string in record.id]

    number_of_entries = len(sequences)
    with open(filtered_db_path, 'w+') as filtered_file:
        # Count from 1 so the progress line ends at N/N [100.00%].
        for position, sequence in enumerate(sequences, start=1):
            if verbose:
                print('Filtering entries: {}/{} [{:.2f}%]'.format(position, number_of_entries, (100 * float(position)) / number_of_entries), end='\r')
            sequence.id = sequence.id.replace('/', '--')
            gene = Gene(sequence.id, sequence.seq)
            best_candidate = gene.get_best_candidate()
            # NOTE(review): location looks like [from, to] with 1-based
            # positions (hence the `- 1`) - confirm against nested.core.te.
            if (best_candidate and
                    best_candidate.location[0] - 1 <= ltr_offset and
                    len(sequence) - best_candidate.location[1] <= ltr_offset):
                SeqIO.write(sequence, filtered_file, 'fasta')

    if verbose:
        print('Filtering entries: DONE')
def generate_random_nested_element(self, baselength=10, number_of_iterations=1):
sequences = list(SeqIO.parse(open(self._source_db), 'fasta'))
#random initial sequence #random initial sequence
element_sequence = ''.join([random.choice('atgc') for x in range(baselength)]) element_sequence = ''.join([random.choice('atgc') for x in range(baselength)])
...@@ -31,7 +51,6 @@ class Generator(object): ...@@ -31,7 +51,6 @@ class Generator(object):
for i in range(number_of_iterations): for i in range(number_of_iterations):
chosen_id = random.randint(0, len(sequences) - 1) chosen_id = random.randint(0, len(sequences) - 1)
chosen_seq = sequences[chosen_id].seq chosen_seq = sequences[chosen_id].seq
#print('Chosen sequence ({}) with length {}'.format(sequences[chosen_id].id, len(chosen_seq)))
insert_position = random.randint(0, len(element_sequence) - 1) insert_position = random.randint(0, len(element_sequence) - 1)
nested_intervals = [[insert_position, insert_position + len(chosen_seq) - 1]] + nested_intervals nested_intervals = [[insert_position, insert_position + len(chosen_seq) - 1]] + nested_intervals
element_sequence = element_sequence[:insert_position] + chosen_seq + element_sequence[insert_position:] element_sequence = element_sequence[:insert_position] + chosen_seq + element_sequence[insert_position:]
...@@ -48,9 +67,9 @@ class Generator(object): ...@@ -48,9 +67,9 @@ class Generator(object):
element = NestedElement(element_id, element_sequence, nested_tes) element = NestedElement(element_id, element_sequence, nested_tes)
self.elements.append(element) self.elements.append(element)
def generate_random_nested_elements(self, number_of_elements=1, baselength=10, number_of_iterations=1, filter_string='LTR'): def generate_random_nested_elements(self, number_of_elements=1, baselength=10, number_of_iterations=1):
for i in range(number_of_elements): for i in range(number_of_elements):
self.generate_random_nested_element(baselength=baselength, number_of_iterations=number_of_iterations, filter_string=filter_string) self.generate_random_nested_element(baselength=baselength, number_of_iterations=number_of_iterations)
def save_elements_to_fasta(self, filepath): def save_elements_to_fasta(self, filepath):
sequences = [SeqRecord(x.sequence, id=x.id, description='') for x in self.elements] sequences = [SeqRecord(x.sequence, id=x.id, description='') for x in self.elements]
......
#!/usr/bin/env python3 #!/usr/bin/env python3
import os
import subprocess import subprocess
import re import re
from Bio import SeqIO from Bio import SeqIO
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
import config from nested.config import config
class TE(object): class TE(object):
"""Class representing TE. Every location is in format [from, to]. """Class representing TE. Every location is in format [from, to].
...@@ -45,21 +46,30 @@ def run_ltr_finder(seqid, sequence): ...@@ -45,21 +46,30 @@ def run_ltr_finder(seqid, sequence):
Arguments: Arguments:
seqid (str): sequence id seqid (str): sequence id
sequence (Bio.Seq.Seq): sequence sequence (Bio.Seq.Seq): sequence
tmp_dir (str): Auxiliary existing directory
Returns: Returns:
list[TE]: list of found ltr pairs as a TE class list[TE]: list of found ltr pairs as a TE class
""" """
transposons = [] transposons = []
with open('tmp/tmp_{}.fa'.format(seqid), 'w+') as tmp_file: #prepare tmp fasta file for ltr finder if not os.path.exists('/tmp/nested'):
os.makedirs('/tmp/nested')
if not os.path.exists('/tmp/nested/ltr'):
os.makedirs('/tmp/nested/ltr')
with open('/tmp/nested/ltr/{}.fa'.format(seqid), 'w+') as tmp_file: #prepare tmp fasta file for ltr finder
SeqIO.write(SeqRecord(sequence, id=seqid), SeqIO.write(SeqRecord(sequence, id=seqid),
tmp_file, tmp_file,
'fasta') 'fasta')
#call LTR finder and feed stdin to it #call LTR finder and feed stdin to it
process = subprocess.Popen([config.ltr_finder_path] + config.ltr_finder_args + ['tmp/tmp_{}.fa'.format(seqid)], process = subprocess.Popen([config.ltr_finder_path] + config.ltr_finder_args + ['/tmp/nested/ltr/{}.fa'.format(seqid)],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
#os.remove('/tmp/nested/ltr/{}.fa'.format(seqid))
stdout, stderr = process.communicate() stdout, stderr = process.communicate()
parsed_output = parse_raw_output(stdout) parsed_output = parse_raw_output(stdout)
......
...@@ -9,7 +9,7 @@ from Bio.Seq import Seq ...@@ -9,7 +9,7 @@ from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation from Bio.SeqFeature import SeqFeature, FeatureLocation
import config from nested.config import config
from nested.utils import intervals from nested.utils import intervals
DEFAULT_DIRPATH = 'data' DEFAULT_DIRPATH = 'data'
......
...@@ -29,7 +29,7 @@ class Graph(object): ...@@ -29,7 +29,7 @@ class Graph(object):
def _add_nodes(self, te, domain_list): #add all necessary nodes def _add_nodes(self, te, domain_list): #add all necessary nodes
#LTR nodes #LTR nodes
self._graph.add_node( self._graph.add_node(
n='ltr_left', 'ltr_left',
location=te.ltr_left_location, location=te.ltr_left_location,
score=1, score=1,
node_class='ltr_left', node_class='ltr_left',
...@@ -38,7 +38,7 @@ class Graph(object): ...@@ -38,7 +38,7 @@ class Graph(object):
) )
self._graph.add_node( self._graph.add_node(
n='ltr_right', 'ltr_right',
location=te.ltr_right_location, location=te.ltr_right_location,
score=1, score=1,
node_class='ltr_right', node_class='ltr_right',
...@@ -56,7 +56,7 @@ class Graph(object): ...@@ -56,7 +56,7 @@ class Graph(object):
if (intervals.compare(te.ltr_left_location, location) != 1 if (intervals.compare(te.ltr_left_location, location) != 1
and intervals.compare(location, te.ltr_right_location) != 1): and intervals.compare(location, te.ltr_right_location) != 1):
self._graph.add_node( self._graph.add_node(
n='ppt', 'ppt',
location=location, location=location,
score=1, score=1,
node_class='ppt', node_class='ppt',
...@@ -73,7 +73,7 @@ class Graph(object): ...@@ -73,7 +73,7 @@ class Graph(object):
if (intervals.compare(te.ltr_left_location, location) != 1 if (intervals.compare(te.ltr_left_location, location) != 1
and intervals.compare(location, te.ltr_right_location) != 1): and intervals.compare(location, te.ltr_right_location) != 1):
self._graph.add_node( self._graph.add_node(
n='pbs', 'pbs',
location=location, location=location,
score=1, score=1,
node_class='pbs', node_class='pbs',
...@@ -90,7 +90,7 @@ class Graph(object): ...@@ -90,7 +90,7 @@ class Graph(object):
if (intervals.compare(te.ltr_left_location, domain.location) != 1 if (intervals.compare(te.ltr_left_location, domain.location) != 1
and intervals.compare(domain.location, te.ltr_right_location) != 1): and intervals.compare(domain.location, te.ltr_right_location) != 1):
self._graph.add_node( self._graph.add_node(
n='domain_{}'.format(i), 'domain_{}'.format(i),
location=domain.location, location=domain.location,
score=domain.score, score=domain.score,
node_class=domain.type, node_class=domain.type,
......
bcbio-gff==0.6.4
biopython==1.70
click==6.7
decorator==4.2.1
networkx==2.1
numpy==1.14.2
six==1.11.0
from setuptools import setup, find_packages
import nested
# Runtime dependencies, pinned to the exact versions the package is used with.
_INSTALL_REQUIRES = [
    'bcbio-gff==0.6.4',
    'biopython==1.70',
    'click==6.7',
    'networkx==2.1',
]

# Command line tools installed with the package: `<command>=<module>:<callable>`.
_CONSOLE_SCRIPTS = [
    'nested-generator=nested.cli.generator:main',
    'nested-nester=nested.cli.nester:main',
]

# Version and author are read from the `nested` package itself.
setup(
    name='nested',
    version=nested.__version__,
    description='Nested description',
    author=nested.__author__,
    packages=find_packages(),
    install_requires=_INSTALL_REQUIRES,
    entry_points={'console_scripts': _CONSOLE_SCRIPTS},
)
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment