Comments gone, multithreading done (adb3c45b) · Commits · Matej Lexa / nested

nested/cli/nester.py

+39 −28

Original line number	Original line	Diff line number	Diff line
	@@ -2,6 +2,8 @@

	import sys		import sys
	import click		import click
			import glob
			from concurrent import futures
	from datetime import datetime		from datetime import datetime
	from subprocess import CalledProcessError		from subprocess import CalledProcessError

	@@ -9,6 +11,7 @@ from Bio import SeqIO

	from nested.core.nester import Nester		from nested.core.nester import Nester
	from nested.output.sketcher import Sketcher		from nested.output.sketcher import Sketcher
			from nested.core.sequence_thread import SequenceThread


	@click.command()		@click.command()
	@@ -19,43 +22,51 @@ from nested.output.sketcher import Sketcher
	@click.option('--output_folder', '-d', default="", help='Output data folder.')		@click.option('--output_folder', '-d', default="", help='Output data folder.')
	@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')		@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
	@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')		@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
	# TODO DATA_FOLDER		@click.option('--threads', '-n', type=int, default=1, help='Number of threads')
	def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):		def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads):
	number_of_errors = 0		number_of_errors = 0
	start_time = datetime.now()		start_time = datetime.now()
	sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))		sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
	sketcher = Sketcher()		sketcher = Sketcher()
	i = 0		i = 0
	for sequence in sequences:		with futures.ThreadPoolExecutor(threads) as executor:
	i += 1		for sequence in sequences:
	sequence.id = sequence.id.replace('/', '--')		executor.submit(process_sequence, sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier)
	seq_start_time = datetime.now()
	strlen = 15
	print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r')
	try:
	nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
	sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
	sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
	if sketch:
	if format != 'default':
	sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
	sketcher.sketch(sequence.id, dirpath=output_folder)
	seq_end_time = datetime.now()
	print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time))
	except KeyboardInterrupt:
	# delete ltrs.fa
	raise
	except CalledProcessError:
	number_of_errors += 1
	print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
	except:
	number_of_errors += 1
	print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])

	end_time = datetime.now()		end_time = datetime.now()
	print('Total time: {}'.format(end_time - start_time))		print('Total time: {}'.format(end_time - start_time))
	print('Number of errors: {}'.format(number_of_errors))		print('Number of errors: {}'.format(number_of_errors))

			def process_sequence(sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):
			i += 1
			sequence.id = sequence.id.replace('/', '--')
			seq_start_time = datetime.now()
			strlen = 15
			print("Processing {}".format(sequence.id))
			try:
			nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
			sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
			sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
			if sketch:
			if format != 'default':
			sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
			sketcher.sketch(sequence.id, dirpath=output_folder)
			seq_end_time = datetime.now()
			print('Processing {a}: DONE [{b}]'.format(a=sequence.id, b=seq_end_time - seq_start_time))
			except KeyboardInterrupt:
			cleanup()
			raise
			except CalledProcessError:
			cleanup()
			number_of_errors += 1
			print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
			except:
			cleanup()
			number_of_errors += 1
			print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])

			def cleanup():
			for file in glob.glob("*ltrs.fa"):
			os.remove(file)

	if __name__ == '__main__':		if __name__ == '__main__':
	main()		main()

nested/config/gt.style

+51 −3

Original line number	Original line	Diff line number	Diff line
	@@ -68,7 +68,7 @@ style =
	max_show_width = nil,		max_show_width = nil,
	},		},
	--------------------------------------		--------------------------------------
	AP = {		PROT = {
	-- Color definitions		-- Color definitions
	stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},		stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
	stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},		stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
	@@ -116,7 +116,55 @@ style =
	max_show_width = nil,		max_show_width = nil,
	},		},
	--------------------------------------		--------------------------------------
	RNaseH = {		RH = {
			-- Color definitions
			stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
			stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
			fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
			style = "box",
			-- Collapsing options
			collapse_to_parent = true,
			split_lines = true,
			-- Caption options
			max_capt_show_width= nil,
			-- Display this track only if the viewport is not wider than this
			-- number of nucleotides. Set to 0 to disable type track.
			max_show_width = nil,
			},
			--------------------------------------
			aRH = {
			-- Color definitions
			stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
			stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
			fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
			style = "box",
			-- Collapsing options
			collapse_to_parent = true,
			split_lines = true,
			-- Caption options
			max_capt_show_width= nil,
			-- Display this track only if the viewport is not wider than this
			-- number of nucleotides. Set to 0 to disable type track.
			max_show_width = nil,
			},
			--------------------------------------
			CHD = {
			-- Color definitions
			stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
			stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
			fill = {red=0.0510, green=0.2784, blue=0.6314, alpha = 5.0},
			style = "box",
			-- Collapsing options
			collapse_to_parent = true,
			split_lines = true,
			-- Caption options
			max_capt_show_width= nil,
			-- Display this track only if the viewport is not wider than this
			-- number of nucleotides. Set to 0 to disable type track.
			max_show_width = nil,
			},
			--------------------------------------
			CHDCR = {
	-- Color definitions		-- Color definitions
	stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},		stroke = {red=0.0, green=0.0, blue=0.0, alpha = 1.0},
	stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},		stroke_marked = {red=1.0, green=0.0, blue=0.0, alpha = 1.0},
	@@ -246,4 +294,4 @@ style =
	default_stroke_color = {red=0.1, green=0.1, blue=0.1, alpha = 1.0},		default_stroke_color = {red=0.1, green=0.1, blue=0.1, alpha = 1.0},
	background_color = {red=1.0, green=1.0, blue=1.0, alpha = 1.0},		background_color = {red=1.0, green=1.0, blue=1.0, alpha = 1.0},
	},		},
	}		}
	No newline at end of file

nested/core/domain_dante.py

+2 −24

Original line number	Original line	Diff line number	Diff line
	@@ -52,19 +52,10 @@ def run_blastx(sequence):
	Returns:		Returns:
	list[Domain]: list of matched domains		list[Domain]: list of matched domains
	"""		"""
	domains = []		domains = []
	# args = dict(config['blastx']['args'])
	# args['db'] = "/opt/nester/dbs/gydb_arh_chdcr/gydb_arh_chdcr.fa"
	# args['max_target_seqs'] = 1950
	# args['num_threads'] = 6
	# args['evalue'] = 1e-5

	#blastx_cline = NcbiblastxCommandline(**args)
	blastx_cline = NcbiblastxCommandline(**config['blastx']['args'])		blastx_cline = NcbiblastxCommandline(**config['blastx']['args'])
	xml_out, stderr = blastx_cline(stdin=str(sequence))		xml_out, stderr = blastx_cline(stdin=str(sequence))

	#output = open("xml_output_blast.xml", "a+").write(str(xml_out))

	blast_records = NCBIXML.parse(StringIO(xml_out))		blast_records = NCBIXML.parse(StringIO(xml_out))
	try:		try:
	blast_record = next(blast_records).alignments		blast_record = next(blast_records).alignments
	@@ -85,10 +76,8 @@ def run_blastx(sequence):
	else:		else:
	domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)]		domain.location = [hsp.query_start, hsp.query_start + 3 * (hsp.align_length - hsp.gaps)]

	#domain.type = alignment.title.split(' ')[1].split('_')[0]
	domain.type = alignment.title.split('__')[0].split('-')[1]		domain.type = alignment.title.split('__')[0].split('-')[1]
	domain.annotation = alignment.title.split('__')[1]		domain.annotation = alignment.title.split('__')[1]
	#print(domain.type, domain.location)
	domains.append(domain)		domains.append(domain)
	return domains		return domains

	@@ -133,15 +122,4 @@ def parse_last_output(raw_output):
	domain.annotation = split_line[1].split('__')[1]		domain.annotation = split_line[1].split('__')[1]
	domains.append(domain)		domains.append(domain)
	print(len(domains))		print(len(domains))
	return domains		return domains

nested/core/gene.py

+0 −1

Original line number	Original line	Diff line number	Diff line
	@@ -67,7 +67,6 @@ class Gene(object):
	gypsy_minus = Graph(te, self.domain_list, True, True)		gypsy_minus = Graph(te, self.domain_list, True, True)
	graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus]		graphs = [copia_plus, copia_minus, gypsy_plus, gypsy_minus]
	te.score, te.features = self._best_score(graphs)		te.score, te.features = self._best_score(graphs)
	# te.score /= float(intervals.length(te.location))

	'''		'''
	# evaluate LTR pair using graph, set up score and features		# evaluate LTR pair using graph, set up score and features

nested/core/nester.py

+2 −12

Original line number	Original line	Diff line number	Diff line
	@@ -33,13 +33,10 @@ class Nester(object):

	def _find_nesting(self):		def _find_nesting(self):
	nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons		nested_list = self._get_unexpanded_transposon_list(self.sequence, self.threshold) # find list of nested transposons
	nested_list = self._expand_transposon_list(nested_list)		nested_list = self._expand_transposon_list(nested_list)
			nested_list = self._filter_nested_list(nested_list)
	nested_list = self._filter_nested_list(nested_list)

	self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)		self.nested_element = NestedElement(self.seqid, self.sequence, nested_list)

	# threshold causes loss of elements
	def _get_unexpanded_transposon_list(self, sequence, threshold):		def _get_unexpanded_transposon_list(self, sequence, threshold):
	# recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons		# recursivelly find and crop best evaluated transposon, return unexpanded list of found transposons
	self._iteration += 1		self._iteration += 1
	@@ -48,7 +45,6 @@ class Nester(object):
	if not candidates:		if not candidates:
	best_candidate = gene.get_best_candidate()		best_candidate = gene.get_best_candidate()
	if not best_candidate:		if not best_candidate:
	# call blastn here
	self.solo_ltrs.find_solo_ltrs(sequence)		self.solo_ltrs.find_solo_ltrs(sequence)
	self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True)		self.solo_ltrs.solo_ltrs.sort(key=lambda x: x.location[0], reverse=True)
	return self.solo_ltrs.solo_ltrs		return self.solo_ltrs.solo_ltrs
	@@ -63,16 +59,11 @@ class Nester(object):
	for element in nested_list:		for element in nested_list:
	if intervals.intersect(candidate.location, element.location):		if intervals.intersect(candidate.location, element.location):
	choose = False		choose = False
	# not sure about the impact
	# choose = intervals.contains(candidate.location, element.location)
	break		break
	for other_candidate in candidates:		for other_candidate in candidates:
	if candidate.location == other_candidate.location:		if candidate.location == other_candidate.location:
	continue		continue
	if intervals.contains(candidate.location, other_candidate.location):		if intervals.contains(candidate.location, other_candidate.location):
	# if candidate.score >= other_candidate.score:
	# choose = True
	# else:
	choose = False		choose = False
	break		break
	if choose:		if choose:
	@@ -110,7 +101,6 @@ class Nester(object):
	nested_list[j].features['ppt'])		nested_list[j].features['ppt'])
	nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,		nested_list[j].features['pbs'] = intervals.expand(nested_list[i].location,
	nested_list[j].features['pbs'])		nested_list[j].features['pbs'])
	#TODO check if works in corner case
	nested_list[j].tsr_left = intervals.expand(nested_list[i].location,		nested_list[j].tsr_left = intervals.expand(nested_list[i].location,
	nested_list[j].tsr_left)		nested_list[j].tsr_left)
	nested_list[j].tsr_right = intervals.expand(nested_list[i].location,		nested_list[j].tsr_right = intervals.expand(nested_list[i].location,