Merge branch 'master' of https://gitlab.fi.muni.cz/lexa/nested (231a3895) · Commits · Matej Lexa / nested

2805_full.fa

0 → 100644

+1386 −0

File added.

Preview size limit exceeded, changes collapsed.

config_template.yml

+20 −5

Original line number	Original line	Diff line number	Diff line
	ltr:		ltr:
	path: '/home/pavel/Documents/Soft/LTR_Finder/source/ltr_finder'		path: '/home/xvanat/ltr_finder_repo/LTR_Finder/source/ltr_finder'
	prosite_path: &prosite_path /media/pavel/AA2C29922C295B19/Data_PJ/IBP_data/The_Age_of_TEs_project/prosite		prosite_path: &prosite_path /opt/nester/dbs/prosite
	tRNAdb_path: &tRNAdb_path /home/pavel/Documents/Soft/LTR_Finder/source/tRNA/Athal-tRNAs.fa		tRNAdb_path: &tRNAdb_path /home/xvanat/ltr_finder_repo/LTR_Finder/source/tRNA/Athal-tRNAs.fa
	args:		args:
	l: 32		l: 32
	S: 3		S: 3
	@@ -18,13 +18,13 @@ ltr:
	gt:		gt:
	path: 'gt'		path: 'gt'
	command: 'sketch'		command: 'sketch'
	style_path: &style_path /home/pavel/Documents/Soft/nested/nested/config/gt.style		style_path: &style_path /home/xvanat/nested/nested/config/gt.style
	args:		args:
	width: 1400		width: 1400
	style: *style_path		style: *style_path

	blastx:		blastx:
	db_path: &db_path /media/pavel/AA2C29922C295B19/Data_PJ/IBP_data/The_Age_of_TEs_project/gydb_proteins_db/gydb_proteins.fa		db_path: &db_path /opt/nester/dbs/gydb_proteins_db/gydb_proteins.fa
	args:		args:
	db: *db_path		db: *db_path
	outfmt: 5 #don't change, parsing format required!		outfmt: 5 #don't change, parsing format required!
	@@ -43,4 +43,19 @@ gff_format:
	ltr: long_terminal_repeat		ltr: long_terminal_repeat
	tsr: target_site_duplication		tsr: target_site_duplication

			igv_colors:
			tsr_left: '#333333'
			tsr_right: '#333333'
			ltr_left: '#f032e6'
			ltr_right: '#f032e6'
			pbs: '#c7e9c0'
			ppt: '#41ab5d'
			GAG: '#225ea8'
			AP: '#ffeda0'
			RT: '#feb24c'
			RNaseH: '#fc4e2a'
			INT: '#b10026'
			default: '#a9a9a9'
			CHR: '#6a51a3'

	logdir: /tmp/nested/logs		logdir: /tmp/nested/logs

doc/bibm_lexa_etal_2018.tex

+5 −5

Original line number	Original line	Diff line number	Diff line
	@@ -13,7 +13,7 @@
	T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}		T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
	\begin{document}		\begin{document}

	\title{TE-nester: a recursive software tool for structure-based discovery of transposable elements fragmented by insertion of repetitive sequences (application to retrotransposons in plant genomes)\\		\title{TE-nester: a recursive software tool for structure-based discovery of nested transposable elements\\
	\thanks{This research is funded by the Czech Grant Agency grant No. GA18-00258S to EK and ML).}		\thanks{This research is funded by the Czech Grant Agency grant No. GA18-00258S to EK and ML).}
	}		}

	@@ -57,7 +57,7 @@ kejnovsk@ibp.cz}
	\maketitle		\maketitle

	\begin{abstract}		\begin{abstract}
	Eukaryotic genomes are generally rich in repetitive sequences. LTR retrotransposons are the most abundant class of repetitive sequences in plant genomes. They form segments of genomic sequences that accumulate via individual events and bursts of retrotransposition. The individual copies then undergo various types of evolutionary erosion as well as fixation, resulting in a complex mix of fragments present in different parts of the genomes. A limited number of tools exist that can identify fragments of repetitive sequences that likely originate from a longer, originally unfragmented element, using mostly sequence similarity to guide reconstruction of fragmented sequences. Here, we test a slightly different approach based on structural (as opposed to sequence similarity) detection of unfragmented full-length elements, which are then recursively eliminated from the analyzed sequence to repeatedly uncover unfragmented copies hidden underneath more recent insertions. This approach has the potential to detect relatively old and highly fragmented copies. We created a software tool for this kind of analysis called TE-nester and applied it to a number of assembled plant genomes to discover pairs of nested LTR retrotransposons of various age and fragmentation state. We test hypotheses about genome evolution and TE life cycle and insertion history against this unique and novel dataset. The software, still under development, is available for download from a repository at \url{https://gitlab.fi.muni.cz/lexa/nested}.		Eukaryotic genomes are generally rich in repetitive sequences. LTR retrotransposons are the most abundant class of repetitive sequences in plant genomes. They form segments of genomic sequences that accumulate via individual events and bursts of retrotransposition. A limited number of tools exist that can identify fragments of repetitive sequences that likely originate from a longer, originally unfragmented element, using mostly sequence similarity to guide reconstruction of fragmented sequences. Here, we use a slightly different approach based on structural (as opposed to sequence similarity) detection of unfragmented full-length elements, which are then recursively eliminated from the analyzed sequence to repeatedly uncover unfragmented copies hidden underneath more recent insertions. This approach has the potential to detect relatively old and highly fragmented copies. We created a software tool for this kind of analysis called TE-nester and applied it to a number of assembled plant genomes to discover pairs of nested LTR retrotransposons of various age and fragmentation state. TE-nester will allow us to test hypotheses about genome evolution, TE life cycle and insertion history. The software, still under improvement, is available for download from a repository at \url{https://gitlab.fi.muni.cz/lexa/nested}.
	\end{abstract}		\end{abstract}

	\begin{IEEEkeywords}		\begin{IEEEkeywords}
	@@ -94,7 +94,7 @@ After several rounds of design decisions we arrived at a procedure that works in
	Evaluation of full-length TE candidates is done by constructing a weighted directed graph, where nodes represent required sites in a full-length element (such as domains, pbs, ppt, tsd) (Fig.~\ref{fig1}). The program is trying to find a path from left LTR to the right LTR, whilst visiting every required node in the correct order (domains are ordered differently in Gypsy and Copia families, some, like {\it env} are family-specific or optional). By assigning weight to the edges, we prioritize a path that has as complete a structure as possible. At the same time, we allow alternative paths with respective penalties in case of either a missing node, or an incorrect order of available nodes.		Evaluation of full-length TE candidates is done by constructing a weighted directed graph, where nodes represent required sites in a full-length element (such as domains, pbs, ppt, tsd) (Fig.~\ref{fig1}). The program is trying to find a path from left LTR to the right LTR, whilst visiting every required node in the correct order (domains are ordered differently in Gypsy and Copia families, some, like {\it env} are family-specific or optional). By assigning weight to the edges, we prioritize a path that has as complete a structure as possible. At the same time, we allow alternative paths with respective penalties in case of either a missing node, or an incorrect order of available nodes.

	\begin{figure}[htbp]		\begin{figure}[htbp]
	\centerline{\includegraphics[width=\columnwidth]{img/fig1.png}}		\centerline{\includegraphics[width=\columnwidth, natwidth=756,natheight=231]{img/fig1.png}}
	\caption{The weighted directed graph used to evaluate individual TE candidates.}		\caption{The weighted directed graph used to evaluate individual TE candidates.}
	\label{fig1}		\label{fig1}
	\end{figure}		\end{figure}
	@@ -102,13 +102,13 @@ Evaluation of full-length TE candidates is done by constructing a weighted direc
	We also need a way to recover various subsequences of the analyzed sequence, such as the original unfragmented sequences of older TEs fragmented by nesting. The identified features also must be properly annotated to the analyzed sequence. This is achieved by a procedure where the removed sequences are virtually returned to their positions in the genome and the coordinates of TEs and their features are adjusted for the inserted element. Once all TEs that were removed in the first phase are processed, we generate a GFF3 file with coordinates that map to the analyzed sequence (Fig.~\ref{fig2}). The final GFF output file can be used to visualize all the identified features with specialized software, such as Genome Tools Annotation Sketch (Fig.~\ref{fig3})\cite{gremme_etal_2013}, a genome browser, or to extract sequences for certain features using bedtools, for example.		We also need a way to recover various subsequences of the analyzed sequence, such as the original unfragmented sequences of older TEs fragmented by nesting. The identified features also must be properly annotated to the analyzed sequence. This is achieved by a procedure where the removed sequences are virtually returned to their positions in the genome and the coordinates of TEs and their features are adjusted for the inserted element. Once all TEs that were removed in the first phase are processed, we generate a GFF3 file with coordinates that map to the analyzed sequence (Fig.~\ref{fig2}). The final GFF output file can be used to visualize all the identified features with specialized software, such as Genome Tools Annotation Sketch (Fig.~\ref{fig3})\cite{gremme_etal_2013}, a genome browser, or to extract sequences for certain features using bedtools, for example.

	\begin{figure}[htbp]		\begin{figure}[htbp]
	\centerline{\includegraphics[width=\columnwidth]{img/fig2.png}}		\centerline{\includegraphics[width=\columnwidth, natwidth=1015,natheight=337]{img/fig2.png}}
	\caption{Example output GFF3 file produced by TE-nester.}		\caption{Example output GFF3 file produced by TE-nester.}
	\label{fig2}		\label{fig2}
	\end{figure}		\end{figure}

	\begin{figure}[htbp]		\begin{figure}[htbp]
	\centerline{\includegraphics[width=\columnwidth]{img/fig3.png}}		\centerline{\includegraphics[width=\columnwidth, natwidth=1400,natheight=220]{img/fig3.png}}
	\caption{Visualization of TE-nester output with Genome Tools Annotation Sketch.}		\caption{Visualization of TE-nester output with Genome Tools Annotation Sketch.}
	\label{fig3}		\label{fig3}
	\end{figure}		\end{figure}

nested/cli/generator.py

+30 −5

Original line number	Original line	Diff line number	Diff line
	#!/usr/bin/env python3		#!/usr/bin/env python3

			import os
	import click		import click
	from datetime import datetime		from datetime import datetime

	@@ -15,7 +16,13 @@ from nested.output.sketcher import Sketcher
	@click.option('--filter', '-f', is_flag=True, default=False, type=str, help='Filter database and create new one with given output db path.')		@click.option('--filter', '-f', is_flag=True, default=False, type=str, help='Filter database and create new one with given output db path.')
	@click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].')		@click.option('--filter_string', '-s', type=str, help='Filter entries by given string [ONLY RELEVANT WITH -filter OPTION].')
	@click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].')		@click.option('--filter_offset', '-o', type=int, help='LTR offset allowed [ONLY RELEVANT WITH -filter OPTION].')
	def main(input_db, output_db, baselength, number_of_iterations, number_of_elements, filter, filter_string, filter_offset):		@click.option('--percentage', '-p', type=int, help='Percentage of elements in generated sequence.')
			@click.option('--average_element', '-a', type=int, help='Average element length.')
			@click.option('--expected_length', '-e', type=int, help='Expected output sequence length [ONLY WORKS WITH -percentage and -average_element].')
			@click.option('--output_directory', '-d', default=os.getcwd(), type=str, help='Output directory.')
			def main(input_db, output_db, baselength, number_of_iterations,
			number_of_elements, filter, filter_string, filter_offset,
			percentage, average_element, expected_length, output_directory):
	#number_of_errors = 0		#number_of_errors = 0
	start_time = datetime.now()		start_time = datetime.now()
	generator = Generator(input_db)		generator = Generator(input_db)
	@@ -30,16 +37,34 @@ def main(input_db, output_db, baselength, number_of_iterations, number_of_elemen
	if baselength: params['baselength'] = baselength		if baselength: params['baselength'] = baselength
	if number_of_iterations: params['number_of_iterations'] = number_of_iterations		if number_of_iterations: params['number_of_iterations'] = number_of_iterations
	if number_of_elements: params['number_of_elements'] = number_of_elements		if number_of_elements: params['number_of_elements'] = number_of_elements
			if percentage and average_element:
			if expected_length:
			params['baselength'] = calc_baselength(expected_length, percentage)
			if 'baselength' in params.keys():
			params['number_of_iterations'] = calc_number_of_iterations(params['baselength'], percentage, average_element)

	generator.generate_random_nested_elements(**params)		generator.generate_random_nested_elements(**params)
	generator.save_elements_to_fasta('generated_data/{}'.format(output_db))
			if not os.path.exists('{}/generated_data'.format(output_directory)):
			os.makedirs('{}/generated_data'.format(output_directory))
			generator.save_elements_to_fasta('{}/generated_data/{}'.format(output_directory, output_db))
	sketcher = Sketcher()		sketcher = Sketcher()
	for element in generator.elements:		for element in generator.elements:
	sketcher.create_gff(element, 'generated_data')		sketcher.create_gff(element, '{}/generated_data'.format(output_directory))
	sketcher.sketch(element.id, 'generated_data')		# sketcher.sketch(element.id, 'generated_data')
	end_time = datetime.now()		end_time = datetime.now()
	print('Total time: {}'.format(end_time - start_time))		print('Total time: {}'.format(end_time - start_time))
	#print('Number of errors: {}'.format(number_of_errors))		#print('Number of errors: {}'.format(number_of_errors))

			def calc_number_of_iterations(length, percentage, average_element):
			p = percentage / 100
			remainder = int(p / (1 - p) * length)
			return remainder // average_element

			def calc_baselength(expected_length, percentage):
			p = percentage / 100
			return int((1 - p) * expected_length)


	if __name__ == '__main__':		if __name__ == '__main__':
	main()		main()

nested/cli/nester.py

+42 −26

Original line number	Original line	Diff line number	Diff line
	@@ -2,6 +2,8 @@

	import sys		import sys
	import click		import click
			import glob
			from concurrent import futures
	from datetime import datetime		from datetime import datetime
	from subprocess import CalledProcessError		from subprocess import CalledProcessError

	@@ -9,6 +11,7 @@ from Bio import SeqIO

	from nested.core.nester import Nester		from nested.core.nester import Nester
	from nested.output.sketcher import Sketcher		from nested.output.sketcher import Sketcher
			from nested.core.sequence_thread import SequenceThread


	@click.command()		@click.command()
	@@ -16,41 +19,54 @@ from nested.output.sketcher import Sketcher
	@click.option('--sketch', '-s', is_flag=True, default=False, help='Sketch output.')		@click.option('--sketch', '-s', is_flag=True, default=False, help='Sketch output.')
	@click.option('--format', '-f', default='genome_browser', help='Format for GFF.')		@click.option('--format', '-f', default='genome_browser', help='Format for GFF.')
	@click.option('--output_fasta_offset', '-o', default=0, help='Number of bases around the element included in output fasta files.')		@click.option('--output_fasta_offset', '-o', default=0, help='Number of bases around the element included in output fasta files.')
	# TODO DATA_FOLDER		@click.option('--output_folder', '-d', default="", help='Output data folder.')
	def main(input_fasta, sketch, format, output_fasta_offset):		@click.option('--initial_threshold', '-t', type=int, default=500, help='Initial threshold value.')
			@click.option('--threshold_multiplier', '-m', type=float, default=1, help='Threshold multiplier.')
			@click.option('--threads', '-n', type=int, default=1, help='Number of threads')
			def main(input_fasta, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier, threads):
	number_of_errors = 0		number_of_errors = 0
	start_time = datetime.now()		start_time = datetime.now()
	sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))		sequences = list(SeqIO.parse(open(input_fasta), 'fasta'))
	sketcher = Sketcher()		sketcher = Sketcher()
	i = 0		i = 0
			with futures.ThreadPoolExecutor(threads) as executor:
	for sequence in sequences:		for sequence in sequences:
			executor.submit(process_sequence, sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier)
			end_time = datetime.now()
			print('Total time: {}'.format(end_time - start_time))
			print('Number of errors: {}'.format(number_of_errors))

			def process_sequence(sequence, sketcher, i, sketch, format, output_fasta_offset, output_folder, initial_threshold, threshold_multiplier):
	i += 1		i += 1
	sequence.id = sequence.id.replace('/', '--')		sequence.id = sequence.id.replace('/', '--')
	seq_start_time = datetime.now()		seq_start_time = datetime.now()
	strlen = 15		strlen = 15
	print('Processing {a}...'.format(a=sequence.id[:strlen]), end='\r')		print("Processing {}".format(sequence.id))
	try:		try:
	nester = Nester(sequence, i)		nester = Nester(sequence, i, initial_threshold, threshold_multiplier)
	sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset, format=format)		sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset, format=format)
			sketcher.create_solo_ltr_gff(nester.solo_ltrs, dirpath=output_folder)
	if sketch:		if sketch:
	if format != 'default':		if format != 'default':
	sketcher.create_gff(nester.nested_element, output_fasta_offset=output_fasta_offset)		sketcher.create_gff(nester.nested_element, dirpath=output_folder, output_fasta_offset=output_fasta_offset)
	sketcher.sketch(sequence.id)		sketcher.sketch(sequence.id, dirpath=output_folder)
	seq_end_time = datetime.now()		seq_end_time = datetime.now()
	print('Processing {a}: DONE [{b}]'.format(a=sequence.id[:strlen], b=seq_end_time - seq_start_time))		print('Processing {a}: DONE [{b}]'.format(a=sequence.id, b=seq_end_time - seq_start_time))
	except KeyboardInterrupt:		except KeyboardInterrupt:
			cleanup()
	raise		raise
	except CalledProcessError:		except CalledProcessError:
			cleanup()
	number_of_errors += 1		number_of_errors += 1
	print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))		print('Processing {}: SUBPROCESS ERROR'.format(sequence.id[:strlen]))
	except:		except:
			cleanup()
	number_of_errors += 1		number_of_errors += 1
	print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])		print('Processing {}: UNEXPECTED ERROR:'.format(sequence.id[:strlen]), sys.exc_info()[0])

	end_time = datetime.now()		def cleanup():
	print('Total time: {}'.format(end_time - start_time))		for file in glob.glob("*ltrs.fa"):
	print('Number of errors: {}'.format(number_of_errors))		os.remove(file)


	if __name__ == '__main__':		if __name__ == '__main__':
	main()		main()