diff -Nru busco-4.1.4/bin/busco busco-5.0.0/bin/busco --- busco-4.1.4/bin/busco 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/bin/busco 2021-01-26 11:28:47.000000000 +0000 @@ -1,28 +1,30 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -try: - from busco import run_BUSCO -except ImportError as err: + +if __name__ == "__main__": try: - import re - pattern_search = re.search("cannot import name '(?P[\w]+)", err.msg) - missing_module = pattern_search.group("module_name") - if missing_module == "run_BUSCO": - print("BUSCO must be installed before it is run. Please enter 'python setup.py install (--user)'. " - "See the user guide for more information.") - elif missing_module == "Bio": - print("Please install BioPython (https://biopython.org/) before running BUSCO.") - elif missing_module == "numpy": - print("Please install NumPy before running BUSCO.") - else: - print("Unable to find module {}. Please make sure it is installed. See the user guide and the GitLab issue " - "board (https://gitlab.com/ezlab/busco/issues) if you need further assistance." - "".format(missing_module)) + from busco import run_BUSCO + except ImportError as err: + try: + import re + pattern_search = re.search("cannot import name '(?P[\w]+)", err.msg) + missing_module = pattern_search.group("module_name") + if missing_module == "run_BUSCO": + print("BUSCO must be installed before it is run. Please enter 'python setup.py install (--user)'. " + "See the user guide for more information.") + elif missing_module == "Bio": + print("Please install BioPython (https://biopython.org/) before running BUSCO.") + elif missing_module == "numpy": + print("Please install NumPy before running BUSCO.") + else: + print("Unable to find module {}. Please make sure it is installed. See the user guide and the GitLab issue " + "board (https://gitlab.com/ezlab/busco/issues) if you need further assistance." 
+ "".format(missing_module)) - except: - print(err.msg) - print("There was a problem installing BUSCO or importing one of its dependencies. See the user guide and the " - "GitLab issue board (https://gitlab.com/ezlab/busco/issues) if you need further assistance.") - raise SystemExit(0) + except: + print(err.msg) + print("There was a problem installing BUSCO or importing one of its dependencies. See the user guide and the " + "GitLab issue board (https://gitlab.com/ezlab/busco/issues) if you need further assistance.") + raise SystemExit(1) -run_BUSCO.main() + run_BUSCO.main() diff -Nru busco-4.1.4/CHANGELOG busco-5.0.0/CHANGELOG --- busco-4.1.4/CHANGELOG 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/CHANGELOG 2021-01-26 11:28:47.000000000 +0000 @@ -1,10 +1,30 @@ -4.1.4 -- Fix Augustus parsing bug +5.0.0 +- Implement metaeuk exon overlap filter +- Use metaeuk for eukaryote transcriptomes +- Issue #198 fixed +- Issue #248 fixed +- Issue #256 fixed +- Issue #260 fixed +- Issue #267 fixed +- Issue #277 fixed +- Issue #306 fixed +- Issue #313 fixed +- Issue #318 fixed +- Issue #326 fixed +- Issue #340 fixed +- Issue #344 fixed +- Issue #351 fixed +- Issue #357 fixed +- Issue #373 fixed +- Issue #385 fixed -4.1.3 -- Issue #296 fixed -- Issue #305 fixed -- Augustus parser improved + + +5.beta.1 +- Provide option to run augustus instead of metaeuk + +5.beta +- Replace Augustus with Metaeuk 4.1.2 - Issue #295 fixed diff -Nru busco-4.1.4/config/config.ini busco-5.0.0/config/config.ini --- busco-4.1.4/config/config.ini 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/config/config.ini 2021-01-26 11:28:47.000000000 +0000 @@ -1,15 +1,20 @@ +# This is the BUSCOv5 configuration file template. +# It is not necessary to use this, as BUSCO will use the dependencies available on your PATH by default. +# The busco run parameters can all be set on the command line. See the help prompt (busco -h) for details. # -# This is the BUSCOv4 default configuration file. 
-# Rename it as config.ini and edit the path and command values to match your own environnment. -# You can use the script busco_configurator.py to do it for you -# Many of the options in the busco_run section can alternatively be set using command line arguments. See the help prompt (busco -h) for details. -# WARNING: passing a parameter through the command line overrides the value specified in this file. -# -# You need to set the path to this file in the environment variable BUSCO_CONFIG_FILE -# as follows: -# export BUSCO_CONFIG_FILE="/path/to/myconfig.ini" +# To use this file for an alternative configuration, or to specify particular versions of dependencies: +# 1) edit the path and command values to match your desired dependency versions. +# WARNING: passing a parameter through the command line overrides the value specified in this file. # -# Enable a parameter by removing ";" +# 2) Enable a parameter by removing ";" +# +# 3) Make this config file available to BUSCO either by setting an environment variable +# +# export BUSCO_CONFIG_FILE="/path/to/myconfig.ini" +# +# or by passing it as a command line argument +# +# busco --config /path/to/config.ini # [busco_run] # Input file @@ -38,12 +43,12 @@ ;evalue = 1e-3 # How many candidate regions (contigs, scaffolds) to consider for each BUSCO ;limit = 3 -# Augustus long mode for retraining (True/False) -;long = False -# Augustus species -;augustus_species = human +# Metaeuk parameters for initial run +;metaeuk_parameters='--param1=value1,--param2=value2' +# Metaeuk parameters for rerun +;metaeuk_rerun_parameters="" # Augustus parameters -;augustus_parameters='--genemodel=intronless,--singlestrand=false' +;augustus_parameters='--param1=value1,--param2=value2' # Quiet mode (True/False) ;quiet = False # Local destination path for downloaded lineage datasets @@ -56,6 +61,8 @@ ;download_base_url = https://busco-data.ezlab.org/v4/data/ # Download most recent BUSCO data and files ;update-data = True +# Use Augustus gene 
predictor instead of metaeuk +;use_augustus = True [tblastn] path = /ncbi-blast-2.10.1+/bin/ @@ -65,6 +72,10 @@ path = /ncbi-blast-2.10.1+/bin/ command = makeblastdb +[metaeuk] +path = /metaeuk/build/bin/ +command = metaeuk + [augustus] path = /augustus/bin/ command = augustus diff -Nru busco-4.1.4/debian/changelog busco-5.0.0/debian/changelog --- busco-4.1.4/debian/changelog 2020-10-08 13:01:52.000000000 +0000 +++ busco-5.0.0/debian/changelog 2021-01-28 11:29:01.000000000 +0000 @@ -1,3 +1,10 @@ +busco (5.0.0-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.5.1 (routine-update) + + -- Nilesh Patra Thu, 28 Jan 2021 16:59:01 +0530 + busco (4.1.4-1) unstable; urgency=medium * Team upload. diff -Nru busco-4.1.4/debian/control busco-5.0.0/debian/control --- busco-4.1.4/debian/control 2020-10-08 13:01:51.000000000 +0000 +++ busco-5.0.0/debian/control 2021-01-28 11:29:01.000000000 +0000 @@ -2,12 +2,12 @@ Section: science Priority: optional Maintainer: Debian Med Packaging Team -Uploaders: Andreas Tille +Uploaders: Andreas Tille , Nilesh Patra Build-Depends: debhelper-compat (= 13), dh-python, python3, python3-setuptools -Standards-Version: 4.5.0 +Standards-Version: 4.5.1 Vcs-Browser: https://salsa.debian.org/med-team/busco Vcs-Git: https://salsa.debian.org/med-team/busco.git Homepage: https://gitlab.com/ezlab/busco diff -Nru busco-4.1.4/debian/rules busco-5.0.0/debian/rules --- busco-4.1.4/debian/rules 2020-10-08 13:01:50.000000000 +0000 +++ busco-5.0.0/debian/rules 2021-01-28 11:29:01.000000000 +0000 @@ -4,24 +4,6 @@ export LC_ALL=C.UTF-8 include /usr/share/dpkg/default.mk -# this provides: -# DEB_SOURCE: the source package name -# DEB_VERSION: the full version of the package (epoch + upstream vers. 
+ revision) -# DEB_VERSION_EPOCH_UPSTREAM: the package's version without the Debian revision -# DEB_VERSION_UPSTREAM_REVISION: the package's version without the Debian epoch -# DEB_VERSION_UPSTREAM: the package's upstream version -# DEB_DISTRIBUTION: the distribution(s) listed in the current entry of debian/changelog -# SOURCE_DATE_EPOCH: the source release date as seconds since the epoch, as -# specified by - -# for hardening you might like to uncomment this: -# export DEB_BUILD_MAINT_OPTIONS=hardening=+all %: dh $@ --with python3 --buildsystem=pybuild - -### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected -#override_dh_auto_test: -#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) -# do_stuff_for_testing -#endif diff -Nru busco-4.1.4/debian/upstream/metadata busco-5.0.0/debian/upstream/metadata --- busco-4.1.4/debian/upstream/metadata 2020-10-08 13:00:51.000000000 +0000 +++ busco-5.0.0/debian/upstream/metadata 2021-01-28 11:26:35.000000000 +0000 @@ -1,19 +1,22 @@ Bug-Database: https://gitlab.com/ezlab/busco/issues Bug-Submit: https://gitlab.com/ezlab/busco/issues/new Reference: - Author: > + - Author: > Mathieu Seppey and Mosè Manni and Evgeny M. Zdobnov - Title: > + Title: > BUSCO: Assessing Genome Assembly and Annotation Completeness - Journal: Methods Mol Biol. - Year: 2019 - Volume: 1962 - Pages: 227-245 - DOI: 10.1007/978-1-4939-9173-0_14 - PMID: 31020564 - URL: https://link.springer.com/protocol/10.1007%2F978-1-4939-9173-0_14 + Journal: Methods Mol Biol. 
+ Year: 2019 + Volume: 1962 + Pages: 227-245 + DOI: 10.1007/978-1-4939-9173-0_14 + PMID: 31020564 + URL: > + https://link.springer.com/protocol/10.1007%2F978-1-4939-9173-0_14 Registry: - Name: conda:bioconda Entry: busco + - Name: bio.tools + Entry: busco Repository: https://gitlab.com/ezlab/busco.git Repository-Browse: https://gitlab.com/ezlab/busco diff -Nru busco-4.1.4/LICENSE busco-5.0.0/LICENSE --- busco-4.1.4/LICENSE 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/LICENSE 2021-01-26 11:28:47.000000000 +0000 @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff -Nru busco-4.1.4/README.md busco-5.0.0/README.md --- busco-4.1.4/README.md 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/README.md 2021-01-26 11:28:47.000000000 +0000 @@ -1,43 +1,35 @@ -**BUSCOv4 - Benchmarking sets of Universal Single-Copy Orthologs.** - -######Note: v4.1.4 is the latest stable release. To access v5.beta clone this repository and checkout the v5 branch with `git checkout v5.beta` - -######Note: A critical bug was discovered to have been introduced in v4.1.3. Any analysis on eukaryote genomes done with this version should be repeated with v4.1.4. +## BUSCOv5 - Benchmarking sets of Universal Single-Copy Orthologs. For full documentation please consult the user guide: https://busco.ezlab.org/busco_userguide.html -Main changes in v4: - -- Automated selection of lineages issued from https://www.orthodb.org/ release 10 +Main changes in v5: -- Automated download of all necessary files and datasets to conduct a run +- Metaeuk is used as default gene predictor for eukaryote pipeline. Augustus is maintained and can be used optionally instead of Metaeuk. 
+- The folder structure has changed, so if doing a manual installation, make sure to completely remove any previous versions of BUSCO before installing v5. -- Use prodigal for non-eukaryotic genomes +*** +### Installation -- **BUSCO is now Python 3.3+ only !** +#### Conda +Conda installation instructions are in the userguide here: +https://busco.ezlab.org/busco_userguide.html#conda-package -To install, clone the repository and enter ``sudo python3 setup.py install`` or ``python3 setup.py install --user`` +#### Docker +BUSCO is available through DockerHub - instructions here: +https://busco.ezlab.org/busco_userguide.html#docker-image +#### Manual installation +Manual installation is possible, though it is important to validate each of the dependencies before running BUSCO. More details in the user guide: https://busco.ezlab.org/busco_userguide.html#manual-installation -Do not forget to edit the ``config/config.ini`` file to match your environment. The script `scripts/busco_configurator.py` can help with this. -You can set the ``BUSCO_CONFIG_FILE`` environment variable to define the path (including the filename) to that ``config.ini`` file. - -``` -export BUSCO_CONFIG_FILE="/path/to/myconfig.ini" -``` -Alternatively you can pass the config file path as a command line argument using ``--config /path/to/config.ini``. - - -If you have trouble installing one of the many third-party tools, try the official Docker container: https://hub.docker.com/r/ezlabgva/busco/tags - -Report problems on the BUSCO issue board at https://gitlab.com/ezlab/busco/issues - +*** +### Troubleshooting To get help with BUSCO use: ``busco -h`` and ``python3 scripts/generate_plot.py -h`` -**!!!** Do not use "odb9" datasets with BUSCOv4. 
If you need to reproduce previous analyses, use BUSCOv3 (https://gitlab.com/ezlab/busco/-/tags/3.0.2) +Report problems on the BUSCO issue board at https://gitlab.com/ezlab/busco/issues -**How to cite BUSCO** +*** +### How to cite BUSCO *BUSCO: Assessing Genome Assembly and Annotation Completeness.* Mathieu Seppey, Mosè Manni, Evgeny M. Zdobnov @@ -54,5 +46,5 @@ *Bioinformatics*, published online June 9, 2015 doi: 10.1093/bioinformatics/btv351 -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. diff -Nru busco-4.1.4/scripts/busco_configurator.py busco-5.0.0/scripts/busco_configurator.py --- busco-4.1.4/scripts/busco_configurator.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/scripts/busco_configurator.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 -# -# This file fills the config.ini file with info matching your environment -# -# python3 busco_configurator.py config.ini.default yourconfig.ini -import sys -import shutil -paths = {} -try: - sys.argv[1] - sys.argv[2] -except IndexError: - print('\nUsage: python3 busco_configurator.py config.ini.default yourconfig.ini\n') - exit() -for line in open(sys.argv[1]): - if line.startswith('['): - name = line.strip().replace('[','').replace(']','') - if name not in ['busco_run','sepp']: - paths.update({name:shutil.which(name)}) - elif name == 'sepp': - paths.update({name:shutil.which('run_sepp.py')}) -outp = open(sys.argv[2],'w') -name = '' -for line in open(sys.argv[1]): - if line.startswith('['): - name = line.strip().replace('[','').replace(']','') - if line.startswith('path ='): - try: - outp.write('path = %s/\n' % '/'.join(paths[name].split('/')[0:-1])) - except AttributeError: - raise SystemExit('Cannot find the path for the command `%s`, add it in your $PATH and rerun this script' % name) - continue - elif line.startswith('command ='): - try: - outp.write('command = 
%s\n' % paths[name].split('/')[-1]) - except AttributeError: - raise SystemExit('Cannot find the path for the command `%s`, add it in your $PATH and rerun this script' % name) - continue - outp.write(line) diff -Nru busco-4.1.4/scripts/generate_plot.py busco-5.0.0/scripts/generate_plot.py --- busco-4.1.4/scripts/generate_plot.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/scripts/generate_plot.py 2021-01-26 11:28:47.000000000 +0000 @@ -19,7 +19,7 @@ You can find both the resulting R script for customisation and the figure in the working directory. -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. """ @@ -37,9 +37,9 @@ from busco.BuscoLogger import BuscoLogger #: working directory -_plot_dir = '' +_plot_dir = "" #: r file name -_r_file = 'busco_figure.R' +_r_file = "busco_figure.R" # to avoid running R _no_r = False @@ -47,109 +47,111 @@ #: Get an instance of _logger for keeping track of events _logger = BuscoLogger.get_logger(__name__) -RCODE = '######################################\n'\ - '#\n'\ - '# BUSCO summary figure\n'\ - '# @version 4.0.0\n'\ - '# @since BUSCO 2.0.0\n'\ - '# \n' \ - '# Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org)\n'\ - '# Licensed under the MIT license. See LICENSE.md file.\n'\ - '#\n'\ - '######################################\n'\ - '\n'\ - '# Load the required libraries\n'\ - 'library(ggplot2)\n'\ - 'library("grid")\n'\ - '\n'\ - '# !!! CONFIGURE YOUR PLOT HERE !!! 
\n'\ - '# Output\n'\ - 'my_output <- paste(%s1,"busco_figure.png",sep="/") \n' \ - 'my_width <- 20\n' \ - 'my_height <- 15\n'\ - 'my_unit <- "cm"\n'\ - '\n'\ - '# Colors\n'\ - 'my_colors <- c("#56B4E9", "#3492C7", "#F0E442", "#F04442")\n'\ - '# Bar height ratio\n'\ - 'my_bar_height <- 0.75\n'\ - '\n'\ - '# Legend\n'\ - 'my_title <- "BUSCO Assessment Results"\n'\ - '\n'\ - '# Font\n'\ - 'my_family <- "sans"\n'\ - 'my_size_ratio <- 1\n'\ - '\n'\ - '# !!! SEE YOUR DATA HERE !!! \n'\ - '# Your data as generated by python, remove or add more\n'\ - 'my_species <- c%s2\n'\ - 'my_species <- factor(my_species)\n'\ - 'my_species <- factor(my_species,levels(my_species)[c(length(levels(my_species)):1)]) ' \ - '# reorder your species here just by changing the values in the vector :\n'\ - 'my_percentage <- c%s3\n'\ - 'my_values <- c%s4\n'\ - '\n'\ - '######################################\n'\ - '######################################\n'\ - '######################################\n'\ - '# Code to produce the graph\n' \ - 'labsize = 1\n' \ - 'if (length(levels(my_species)) > 10){\n'\ - ' labsize = 0.66\n'\ - '}\n'\ - 'print("Plotting the figure ...")\n'\ - 'category <- c(rep(c("S","D","F","M"),c%s5))\n'\ - 'category <-factor(category)\n'\ - 'category = factor(category,levels(category)[c(4,1,2,3)])\n'\ - 'df = data.frame(my_species,my_percentage,my_values,category)\n'\ - '\n'\ - 'figure <- ggplot() + \n'\ - ' \n'\ - ' geom_bar(aes(y = my_percentage, x = my_species, fill = category), position = position_stack(reverse = TRUE), data = df, stat="identity", ' \ - 'width=my_bar_height) + \n'\ - ' coord_flip() + \n' \ - ' theme_gray(base_size = 8) + \n' \ - ' scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100)) + \n'\ - ' scale_fill_manual(values = my_colors,labels =c(" Complete (C) and single-copy (S) ",\n'\ - ' " Complete (C) and duplicated (D)",\n'\ - ' " Fragmented (F) ",\n'\ - ' " Missing (M)")) + \n'\ - ' ggtitle(my_title) + \n'\ - ' xlab("") 
+ \n'\ - ' ylab("\\n%BUSCOs") + \n'\ - '\n'\ - ' theme(plot.title = element_text(family=my_family, hjust=0.5, colour = "black", size = rel(2.2)*my_size_ratio, face = ' \ - '"bold")) + \n'\ - ' theme(legend.position="top",legend.title = element_blank()) + \n'\ - ' theme(legend.text = element_text(family=my_family, size = rel(1.2)*my_size_ratio)) + \n'\ - ' theme(panel.background = element_rect(color="#FFFFFF", fill="white")) + \n'\ - ' theme(panel.grid.minor = element_blank()) + \n'\ - ' theme(panel.grid.major = element_blank()) +\n'\ - ' theme(axis.text.y = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + \n'\ - ' theme(axis.text.x = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + \n'\ - ' theme(axis.line = element_line(size=1*my_size_ratio, colour = "black")) + \n'\ - ' theme(axis.ticks.length = unit(.85, "cm")) + \n'\ - ' theme(axis.ticks.y = element_line(colour="white", size = 0)) + \n'\ - ' theme(axis.ticks.x = element_line(colour="#222222")) + \n'\ - ' theme(axis.ticks.length = unit(0.4, "cm")) + \n'\ - ' theme(axis.title.x = element_text(family=my_family, size=rel(1.2)*my_size_ratio)) + \n'\ - ' \n'\ - ' guides(fill = guide_legend(override.aes = list(colour = NULL))) +\n'\ - ' guides(fill=guide_legend(nrow=2,byrow=TRUE))\n'\ - ' \n'\ - ' for(i in rev(c(1:length(levels(my_species))))){\n'\ - ' detailed_values <- my_values[my_species==my_species[my_species==levels(my_species)[i]]]\n'\ - ' total_buscos <- sum(detailed_values)\n'\ - ' figure <- figure + \n' \ - ' annotate("text", label=paste("C:", detailed_values[1] + detailed_values[2], " [S:", detailed_values[1], ' \ - '", D:", detailed_values[2], "], F:", detailed_values[3], ", M:", detailed_values[4], ", n:", total_buscos, ' \ - 'sep=""), \n' \ - ' y=3, x = i, size = labsize*4*my_size_ratio, colour = "black", hjust=0, family=my_family)\n'\ - ' }\n'\ - ' \n'\ - 'ggsave(figure, file=my_output, width = my_width, height = my_height, unit = 
my_unit)\n'\ - 'print("Done")\n' +RCODE = ( + "######################################\n" + "#\n" + "# BUSCO summary figure\n" + "# @version 4.0.0\n" + "# @since BUSCO 2.0.0\n" + "# \n" + "# Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org)\n" + "# Licensed under the MIT license. See LICENSE.md file.\n" + "#\n" + "######################################\n" + "\n" + "# Load the required libraries\n" + "library(ggplot2)\n" + 'library("grid")\n' + "\n" + "# !!! CONFIGURE YOUR PLOT HERE !!! \n" + "# Output\n" + 'my_output <- paste(%s1,"busco_figure.png",sep="/") \n' + "my_width <- 20\n" + "my_height <- 15\n" + 'my_unit <- "cm"\n' + "\n" + "# Colors\n" + 'my_colors <- c("#56B4E9", "#3492C7", "#F0E442", "#F04442")\n' + "# Bar height ratio\n" + "my_bar_height <- 0.75\n" + "\n" + "# Legend\n" + 'my_title <- "BUSCO Assessment Results"\n' + "\n" + "# Font\n" + 'my_family <- "sans"\n' + "my_size_ratio <- 1\n" + "\n" + "# !!! SEE YOUR DATA HERE !!! \n" + "# Your data as generated by python, remove or add more\n" + "my_species <- c%s2\n" + "my_species <- factor(my_species)\n" + "my_species <- factor(my_species,levels(my_species)[c(length(levels(my_species)):1)]) " + "# reorder your species here just by changing the values in the vector :\n" + "my_percentage <- c%s3\n" + "my_values <- c%s4\n" + "\n" + "######################################\n" + "######################################\n" + "######################################\n" + "# Code to produce the graph\n" + "labsize = 1\n" + "if (length(levels(my_species)) > 10){\n" + " labsize = 0.66\n" + "}\n" + 'print("Plotting the figure ...")\n' + 'category <- c(rep(c("S","D","F","M"),c%s5))\n' + "category <-factor(category)\n" + "category = factor(category,levels(category)[c(4,1,2,3)])\n" + "df = data.frame(my_species,my_percentage,my_values,category)\n" + "\n" + "figure <- ggplot() + \n" + " \n" + ' geom_bar(aes(y = my_percentage, x = my_species, fill = category), position = position_stack(reverse = TRUE), data = df, 
stat="identity", ' + "width=my_bar_height) + \n" + " coord_flip() + \n" + " theme_gray(base_size = 8) + \n" + ' scale_y_continuous(labels = c("0","20","40","60","80","100"), breaks = c(0,20,40,60,80,100)) + \n' + ' scale_fill_manual(values = my_colors,labels =c(" Complete (C) and single-copy (S) ",\n' + ' " Complete (C) and duplicated (D)",\n' + ' " Fragmented (F) ",\n' + ' " Missing (M)")) + \n' + " ggtitle(my_title) + \n" + ' xlab("") + \n' + ' ylab("\\n%BUSCOs") + \n' + "\n" + ' theme(plot.title = element_text(family=my_family, hjust=0.5, colour = "black", size = rel(2.2)*my_size_ratio, face = ' + '"bold")) + \n' + ' theme(legend.position="top",legend.title = element_blank()) + \n' + " theme(legend.text = element_text(family=my_family, size = rel(1.2)*my_size_ratio)) + \n" + ' theme(panel.background = element_rect(color="#FFFFFF", fill="white")) + \n' + " theme(panel.grid.minor = element_blank()) + \n" + " theme(panel.grid.major = element_blank()) +\n" + ' theme(axis.text.y = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + \n' + ' theme(axis.text.x = element_text(family=my_family, colour = "black", size = rel(1.66)*my_size_ratio)) + \n' + ' theme(axis.line = element_line(size=1*my_size_ratio, colour = "black")) + \n' + ' theme(axis.ticks.length = unit(.85, "cm")) + \n' + ' theme(axis.ticks.y = element_line(colour="white", size = 0)) + \n' + ' theme(axis.ticks.x = element_line(colour="#222222")) + \n' + ' theme(axis.ticks.length = unit(0.4, "cm")) + \n' + " theme(axis.title.x = element_text(family=my_family, size=rel(1.2)*my_size_ratio)) + \n" + " \n" + " guides(fill = guide_legend(override.aes = list(colour = NULL))) +\n" + " guides(fill=guide_legend(nrow=2,byrow=TRUE))\n" + " \n" + " for(i in rev(c(1:length(levels(my_species))))){\n" + " detailed_values <- my_values[my_species==my_species[my_species==levels(my_species)[i]]]\n" + " total_buscos <- sum(detailed_values)\n" + " figure <- figure + \n" + ' annotate("text", 
label=paste("C:", detailed_values[1] + detailed_values[2], " [S:", detailed_values[1], ' + '", D:", detailed_values[2], "], F:", detailed_values[3], ", M:", detailed_values[4], ", n:", total_buscos, ' + 'sep=""), \n' + ' y=3, x = i, size = labsize*4*my_size_ratio, colour = "black", hjust=0, family=my_family)\n' + " }\n" + " \n" + "ggsave(figure, file=my_output, width = my_width, height = my_height, unit = my_unit)\n" + 'print("Done")\n' +) def _check_wd(): @@ -158,11 +160,11 @@ :raises SystemExit: if the folder is absent or the user has no write permission """ if not os.path.exists(_plot_dir): - _logger.warning('Impossible to read %s' % _plot_dir) - raise SystemExit + _logger.warning("Impossible to read %s" % _plot_dir) + raise SystemExit() if not os.access(_plot_dir, os.W_OK): - _logger.warning('Impossible to write into %s' % _plot_dir) - raise SystemExit + _logger.warning("Impossible to write into %s" % _plot_dir) + raise SystemExit() def _write_r_code(data): @@ -171,13 +173,14 @@ :param data: the data loaded from the run folders used to generate the R file :type data: dict """ - r_file = open('%s%s' % (_plot_dir, _r_file), 'w') - r_file.write(RCODE - .replace('%s1', '"%s"' % _plot_dir) - .replace('%s2', str(tuple(data['species']))) - .replace('%s3', str(tuple(data['percentages']))) - .replace('%s4', str(tuple(data['values']))) - .replace('%s5', '(1)')) + r_file = open("%s%s" % (_plot_dir, _r_file), "w") + r_file.write( + RCODE.replace("%s1", '"%s"' % _plot_dir) + .replace("%s2", str(tuple(data["species"]))) + .replace("%s3", str(tuple(data["percentages"]))) + .replace("%s4", str(tuple(data["values"]))) + .replace("%s5", "(1)") + ) def _run_r_code(): @@ -188,71 +191,104 @@ # first try to load the two required package and warn the user if an error occur # package ggplot2 need_to_exit = False - ggplot2 = subprocess.Popen(['R', '-e', 'library(ggplot2)', '--quiet'], - stderr=subprocess.PIPE, stdout=subprocess.PIPE) + ggplot2 = subprocess.Popen( + ["R", "-e", 
"library(ggplot2)", "--quiet"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) ggplot2_out = ggplot2.stderr.readlines() + ggplot2.stdout.readlines() - if 'Error' in str(ggplot2_out): - _logger.warning('Impossible to run R. The package ggplot2 does not seem to be installed. ' - 'Please check your R installation. See also the --no_r option to avoid this message') + if "Error" in str(ggplot2_out): + _logger.warning( + "Impossible to run R. The package ggplot2 does not seem to be installed. " + "Please check your R installation. See also the --no_r option to avoid this message" + ) need_to_exit = True # package grid - grid = subprocess.Popen(['R', '-e', 'library(grid)', '--quiet'], - stderr=subprocess.PIPE, stdout=subprocess.PIPE) + grid = subprocess.Popen( + ["R", "-e", "library(grid)", "--quiet"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) grid_out = grid.stderr.readlines() + grid.stdout.readlines() - if 'Error' in str(grid_out): - _logger.warning('Impossible to run R. The package grid does not seem to be installed. ' - 'Please check your R installation. See also the --no_r option to avoid this message') + if "Error" in str(grid_out): + _logger.warning( + "Impossible to run R. The package grid does not seem to be installed. " + "Please check your R installation. 
See also the --no_r option to avoid this message" + ) need_to_exit = True if need_to_exit: return None # do not run the code, but no need to stop the execution # run R - if which('Rscript') is not None: - r_script = ['Rscript','%s%s' % (_plot_dir, _r_file)] - p = subprocess.Popen(r_script, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if which("Rscript") is not None: + r_script = ["Rscript", "%s%s" % (_plot_dir, _r_file)] + p = subprocess.Popen( + r_script, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) out, err = p.communicate() if out: - _logger.info('\n%s' % str(out.decode('utf-8'))) + _logger.info("\n%s" % str(out.decode("utf-8"))) if err: - _logger.error('\n%s' % str(err.decode('utf-8'))) + _logger.error("\n%s" % str(err.decode("utf-8"))) else: - _logger.error( - '\"Rscript\" is not accessible') - raise SystemExit + _logger.error('"Rscript" is not accessible') + raise SystemExit() def _set_args(): """ This function sets the parameters provided by the user """ - parser = argparse.ArgumentParser(description='BUSCO plot generation tool.\n' - 'Place all BUSCO short summary files (short_summary.[generic|specific].dataset.label.txt) in a single folder. ' - 'It will be ' - 'your working directory, in which the generated plot files' - ' will be written' - '\nSee also the user guide' - ' for additional information', - usage='python3 generate_plot.py -wd [WORKING_DIRECTORY] [OTHER OPTIONS]', - formatter_class=RawTextHelpFormatter, add_help=False) + parser = argparse.ArgumentParser( + description="BUSCO plot generation tool.\n" + "Place all BUSCO short summary files (short_summary.[generic|specific].dataset.label.txt) in a single folder. 
" + "It will be " + "your working directory, in which the generated plot files" + " will be written" + "\nSee also the user guide" + " for additional information", + usage="python3 generate_plot.py -wd [WORKING_DIRECTORY] [OTHER OPTIONS]", + formatter_class=RawTextHelpFormatter, + add_help=False, + ) - required = parser.add_argument_group('required arguments') - optional = parser.add_argument_group('optional arguments') + required = parser.add_argument_group("required arguments") + optional = parser.add_argument_group("optional arguments") required.add_argument( - '-wd', '--working_directory', metavar='PATH', required=True, dest='working_directory', - help='Define the location of your working directory') + "-wd", + "--working_directory", + metavar="PATH", + required=True, + dest="working_directory", + help="Define the location of your working directory", + ) optional.add_argument( - '-rt', '--run_type', required=False, dest='run_type', - help='type of summary to use, `generic` or `specific`') + "-rt", + "--run_type", + required=False, + dest="run_type", + help="type of summary to use, `generic` or `specific`", + ) optional.add_argument( - '--no_r', help='To avoid to run R. It will just create the R script file in the working directory', - action="store_true", dest='no_r') + "--no_r", + help="To avoid to run R. 
It will just create the R script file in the working directory", + action="store_true", + dest="no_r", + ) optional.add_argument( - '-q', '--quiet', help='Disable the info logs, displays only errors', action="store_true", dest='quiet') - optional.add_argument('-h', '--help', action="help", help="Show this help message and exit") + "-q", + "--quiet", + help="Disable the info logs, displays only errors", + action="store_true", + dest="quiet", + ) + optional.add_argument( + "-h", "--help", action="help", help="Show this help message and exit" + ) args = vars(parser.parse_args()) if args["quiet"]: _logger.setLevel(logging.ERROR) @@ -261,56 +297,64 @@ _no_r = True global _plot_dir _plot_dir = args["working_directory"] - if _plot_dir[-1] != '/': - _plot_dir += '/' + if _plot_dir[-1] != "/": + _plot_dir += "/" global _run_type - _run_type = '*' + _run_type = "*" if args["run_type"]: _run_type = args["run_type"] + def _load_data(): """ :return: """ - data = {'species': [], 'values': [], 'percentages': [], 'species_tmp': []} + data = {"species": [], "values": [], "percentages": [], "species_tmp": []} datasets = set([]) - for f in glob.glob('%s/short_summary.%s.*.*.txt' % (_plot_dir, _run_type)): + for f in glob.glob("%s/short_summary.%s.*.*.txt" % (_plot_dir, _run_type)): try: - datasets.add(f.split('/')[-1].split('.')[1]) + datasets.add(f.split("/")[-1].split(".")[1]) content = open(f) comp = 0 dupl = 0 frag = 0 miss = 0 for line in content: - if 'Complete and single-copy BUSCOs' in line: - comp = int(line.split('\t')[1]) - elif 'Complete and duplicated BUSCOs' in line: - dupl = int(line.split('\t')[1]) - elif 'Fragmented BUSCOs' in line: - frag = int(line.split('\t')[1]) - elif 'Missing BUSCOs' in line: - miss = int(line.split('\t')[1]) - data['species_tmp'] += [".".join(f.split('/')[-1].split('.')[3:-1]+[f.split('/')[-1].split('.')[2]])]*4 - data['values'] += [comp, dupl, frag, miss] + if "Complete and single-copy BUSCOs" in line: + comp = int(line.split("\t")[1]) + elif 
"Complete and duplicated BUSCOs" in line: + dupl = int(line.split("\t")[1]) + elif "Fragmented BUSCOs" in line: + frag = int(line.split("\t")[1]) + elif "Missing BUSCOs" in line: + miss = int(line.split("\t")[1]) + data["species_tmp"] += [ + ".".join( + f.split("/")[-1].split(".")[3:-1] + [f.split("/")[-1].split(".")[2]] + ) + ] * 4 + data["values"] += [comp, dupl, frag, miss] total = comp + dupl + frag + miss - comp_pc = round(comp/float(total)*100, 1) - dupl_pc = round(dupl/float(total)*100, 1) - frag_pc = round(frag/float(total)*100, 1) + comp_pc = round(comp / float(total) * 100, 1) + dupl_pc = round(dupl / float(total) * 100, 1) + frag_pc = round(frag / float(total) * 100, 1) miss_pc = round(100 - comp_pc - dupl_pc - frag_pc, 1) - data['percentages'] += [comp_pc, dupl_pc, frag_pc, miss_pc] - _logger.info('Loaded %s successfully' % f) + data["percentages"] += [comp_pc, dupl_pc, frag_pc, miss_pc] + _logger.info("Loaded %s successfully" % f) except IOError: - _logger.warning('Impossible to use the file %s' % f) + _logger.warning("Impossible to use the file %s" % f) # if only one dataset, remove it from species label if len(datasets) == 1: - data['species'] = [label.split('.')[0] for label in data['species_tmp']] + data["species"] = [label.split(".")[0] for label in data["species_tmp"]] else: - data['species'] = data['species_tmp'] - if len(data['species']) == 0: - _logger.warning('No files matching the pattern short_summary.%s were found in %s' % (_run_type,_plot_dir)) - raise SystemExit + data["species"] = data["species_tmp"] + if len(data["species"]) == 0: + _logger.warning( + "No files matching the pattern short_summary.%s were found in %s" + % (_run_type, _plot_dir) + ) + raise SystemExit() return data @@ -324,57 +368,66 @@ try: - _logger.info('****************** Start plot generation at %s ******************' - % (time.strftime("%m/%d/%Y %H:%M:%S"))) + _logger.info( + "****************** Start plot generation at %s ******************" + % 
(time.strftime("%m/%d/%Y %H:%M:%S")) + ) # check working directory _check_wd() # load data - _logger.info('Load data ...') + _logger.info("Load data ...") data = _load_data() # write R code - _logger.info('Generate the R code ...') + _logger.info("Generate the R code ...") _write_r_code(data) # run R code if not _no_r: - _logger.info('Run the R code ...') + _logger.info("Run the R code ...") _run_r_code() else: - _logger.info('You chose not to run R') + _logger.info("You chose not to run R") if not _logger.has_warning(): - _logger.info('Plot generation done. Total running time: %s seconds' % str(time.time() - start_time)) + _logger.info( + "Plot generation done. Total running time: %s seconds" + % str(time.time() - start_time) + ) else: - _logger.info('Plot generation done with WARNING(s). Total running time: %s seconds' - % str(time.time() - start_time)) - _logger.info('Results written in %s\n' % _plot_dir) + _logger.info( + "Plot generation done with WARNING(s). Total running time: %s seconds" + % str(time.time() - start_time) + ) + _logger.info("Results written in %s\n" % _plot_dir) except SystemExit: - _logger.error('Plot generation failed !') + _logger.error("Plot generation failed !") _logger.info( - 'Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues' - ) - raise SystemExit + "Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues" + ) + raise except KeyboardInterrupt: - _logger.error('A signal was sent to kill the process') - _logger.error('Plot generation failed !') + _logger.error("A signal was sent to kill the process") + _logger.error("Plot generation failed !") _logger.info( - 'Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues' - ) - raise SystemExit + "Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues" + ) + raise except 
BaseException: exc_type, exc_value, exc_traceback = sys.exc_info() - _logger.critical('Unhandled exception occurred: %s' % repr(traceback.format_exception(exc_type, exc_value, - exc_traceback))) - _logger.error('Plot generation failed !\n') + _logger.critical( + "Unhandled exception occurred: %s" + % repr(traceback.format_exception(exc_type, exc_value, exc_traceback)) + ) + _logger.error("Plot generation failed !\n") _logger.info( - 'Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues' - ) - raise SystemExit + "Check the logs, read the user guide, and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues" + ) + raise SystemExit() # Entry point diff -Nru busco-4.1.4/setup.py busco-5.0.0/setup.py --- busco-4.1.4/setup.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/setup.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,10 +1,10 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ .. versionadded:: 3.0.0 -.. versionchanged:: 4.0.beta1 +.. versionchanged:: 5.0.0 -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. This script proceeds to the BUSCO packages installation @@ -12,20 +12,22 @@ """ from distutils.core import setup + version = {} with open("src/busco/_version.py") as version_file: exec(version_file.read(), version) -setup(name='BUSCO', - version=version['__version__'], - author='ezlab', - license='Licensed under the MIT license. 
See LICENSE.md file.', - author_email='ez@ezlab.org', - long_description='Assessing genome assembly and annotation completeness ' - 'with Benchmarking Universal Single-Copy Orthologs ', - url='https://busco.ezlab.org/', - platforms='Unix like', - packages=['busco'], - package_dir={'busco': 'src/busco'}, - scripts=['bin/busco'] - ) +setup( + name="BUSCO", + version=version["__version__"], + author="ezlab", + license="Licensed under the MIT license. See LICENSE.md file.", + author_email="ez@ezlab.org", + long_description="Assessing genome assembly and annotation completeness " + "with Benchmarking Universal Single-Copy Orthologs ", + url="https://busco.ezlab.org/", + platforms="Unix like", + packages=["busco", "busco.analysis", "busco.busco_tools"], + package_dir={"busco": "src/busco"}, + scripts=["bin/busco"], +) diff -Nru busco-4.1.4/src/busco/Actions.py busco-5.0.0/src/busco/Actions.py --- busco-4.1.4/src/busco/Actions.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/Actions.py 2021-01-26 11:28:47.000000000 +0000 @@ -5,10 +5,11 @@ import os import sys -logger = BuscoLogger.get_logger(__name__) class ListLineagesAction(argparse.Action): + logger = BuscoLogger.get_logger(__name__) + def __init__(self, option_strings, dest, nargs=0, default="==SUPPRESS==", **kwargs): super().__init__(option_strings, dest, nargs=nargs, default=default, **kwargs) @@ -16,16 +17,15 @@ try: self.config_manager = BuscoConfigManager({}) except SystemExit as se: - logger.error("The config file is necessary here as it contains remote and local path locations for " - "downloading dataset information") - logger.error(se) - raise SystemExit + type(self).logger.error(se) + raise SystemExit() + self.config = PseudoConfig(self.config_manager.config_file) try: self.config.load() self.print_lineages() except SystemExit as se: - logger.error(se) + type(self).logger.error(se) finally: os.remove("busco_{}.log".format(BuscoLogger.random_id)) parser.exit() @@ -39,12 +39,13 @@ 
print("".join(f.readlines())) def download_lineages_list(self): - lineages_list_file = self.config.downloader.get("lineages_list.txt", "information") + lineages_list_file = self.config.downloader.get( + "lineages_list.txt", "information" + ) return lineages_list_file class CleanHelpAction(argparse.Action): - def __init__(self, option_strings, dest, nargs=0, default="==SUPPRESS==", **kwargs): super().__init__(option_strings, dest, nargs=nargs, default=default, **kwargs) @@ -58,8 +59,15 @@ class CleanVersionAction(argparse.Action): - - def __init__(self, option_strings, version=None, dest="==SUPPRESS==", nargs=0, default="==SUPPRESS==", **kwargs): + def __init__( + self, + option_strings, + version=None, + dest="==SUPPRESS==", + nargs=0, + default="==SUPPRESS==", + **kwargs + ): super().__init__(option_strings, dest, nargs=nargs, default=default, **kwargs) self.version = version diff -Nru busco-4.1.4/src/busco/analysis/Analysis.py busco-5.0.0/src/busco/analysis/Analysis.py --- busco-4.1.4/src/busco/analysis/Analysis.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/analysis/Analysis.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,163 @@ +from Bio import SeqIO +from busco.BuscoLogger import BuscoLogger +from abc import ABCMeta +import os +from busco.busco_tools.blast import TBLASTNRunner, MKBLASTRunner + +logger = BuscoLogger.get_logger(__name__) + + +class NucleotideAnalysis(metaclass=ABCMeta): + + LETTERS = ["A", "C", "T", "G", "N"] + + # explanation of ambiguous codes found here: https://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html + AMBIGUOUS_CODES = ["Y", "R", "W", "S", "K", "M", "D", "V", "H", "B"] + + def __init__(self): + + super().__init__() # Initialize BuscoAnalysis + if not self.check_nucleotide_file(self.input_file): + raise SystemExit("The input file does not contain nucleotide sequences.") + + def check_nucleotide_file(self, filename): + i = 0 + num_records = 0 + for record in SeqIO.parse(filename, "fasta"): + num_records += 
1 + for letter in record.seq.upper(): + if i > 5000: + break + i += 1 + if ( + letter not in type(self).LETTERS + and letter not in type(self).AMBIGUOUS_CODES + ): + return False + else: + continue # only continue to next record if 5000 has not been hit + break # If for loop exits with "break", the else clause is skipped and the outer loop also breaks. + + if num_records == 0: + return False + + return True + + def init_tools(self): + super().init_tools() + + +class ProteinAnalysis: + + LETTERS = [ + "F", + "L", + "I", + "M", + "V", + "S", + "P", + "T", + "A", + "Y", + "X", + "H", + "Q", + "N", + "K", + "D", + "E", + "C", + "W", + "R", + "G", + ] + NUCL_LETTERS = ["A", "C", "T", "G", "N"] + + def __init__(self): + super().__init__() + if not self.check_protein_file(self.input_file): + raise SystemExit("Please provide a protein file as input") + + def check_protein_file(self, filename): + + for i, record in enumerate(SeqIO.parse(filename, "fasta")): + if i > 10: + break + for letter in record.seq: + if ( + letter.upper() not in type(self).NUCL_LETTERS + and letter.upper() in type(self).LETTERS + ): + return True + elif letter.upper() not in type(self).LETTERS: + return False + else: + continue + return False # if file only contains "A", "T", "C", "G", "N", it is unlikely to be a protein file + + +class BLASTAnalysis(metaclass=ABCMeta): + def __init__(self): + super().__init__() + + def init_tools(self): + super().init_tools() + self.mkblast_runner = MKBLASTRunner() + self.tblastn_runner = TBLASTNRunner() + + if self.mkblast_runner.version != self.tblastn_runner.version: + logger.warning( + "You are using version {} of makeblastdb and version {} of tblastn.".format( + self.mkblast_runner.version, self.tblastn_runner.version + ) + ) + + def _run_mkblast(self): + if self.restart and self.mkblast_runner.check_previous_completed_run(): + logger.info( + "Skipping makeblastdb as BLAST DB already exists at {}".format( + self.mkblast_runner.output_db + ) + ) + else: + 
self.restart = False # Turn off restart mode if this is the entry point + self.config.set("busco_run", "restart", str(self.restart)) + self.mkblast_runner.run() + if len(os.listdir(os.path.split(self.mkblast_runner.output_db)[0])) == 0: + raise SystemExit( + "makeblastdb failed to create a BLAST DB at {}".format( + self.mkblast_runner.output_db + ) + ) + + def _run_tblastn(self, missing_and_frag_only=False, ancestral_variants=False): + + incomplete_buscos = ( + self.hmmer_runner.missing_buscos + + list(self.hmmer_runner.fragmented_buscos.keys()) + if missing_and_frag_only + else None + ) # This parameter is only used on the re-run + + self.tblastn_runner.configure_runner( + self.mkblast_runner.output_db, + missing_and_frag_only, + ancestral_variants, + incomplete_buscos, + ) + if self.restart and self.tblastn_runner.check_previous_completed_run(): + logger.info( + "Skipping tblastn as results already exist at {}".format( + self.tblastn_runner.blast_filename + ) + ) + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.tblastn_runner.run() + self.tblastn_runner.get_coordinates() + self.tblastn_runner.filter_best_matches() + self.tblastn_runner.write_coordinates_to_file() # writes to "coordinates.tsv" + self.tblastn_runner.write_contigs() + return diff -Nru busco-4.1.4/src/busco/analysis/BuscoAnalysis.py busco-5.0.0/src/busco/analysis/BuscoAnalysis.py --- busco-4.1.4/src/busco/analysis/BuscoAnalysis.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/analysis/BuscoAnalysis.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: BuscoAnalysis + :synopsis: BuscoAnalysis implements general BUSCO analysis specifics +.. versionadded:: 3.0.0 +.. versionchanged:: 5.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. 
+""" + +from abc import ABCMeta, abstractmethod +from busco.BuscoConfig import BuscoConfig, BuscoConfigAuto +from busco.busco_tools.hmmer import HMMERRunner +import os +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log + +logger = BuscoLogger.get_logger(__name__) + + +class BuscoAnalysis(metaclass=ABCMeta): + """ + This abstract base class (ABC) defines methods required for most of BUSCO analyses and has to be extended + by each specific analysis class + """ + + config = None + + def __init__(self): + """ + 1) load parameters + 2) load and validate tools + 3) check data and dataset integrity + 4) Ready for analysis + """ + super().__init__() + + # Get paths + self._lineage_results_dir = self.config.get("busco_run", "lineage_results_dir") + self.main_out = self.config.get("busco_run", "main_out") + self._working_dir = ( + os.path.join(self.main_out, "auto_lineage") + if isinstance(self.config, BuscoConfigAuto) + else self.main_out + ) + self.run_folder = os.path.join(self._working_dir, self._lineage_results_dir) + self._log_folder = os.path.join(self.main_out, "logs") + + # Get other useful variables + self.input_file = self.config.get("busco_run", "in") + self._lineage_dataset = self.config.get("busco_run", "lineage_dataset") + self._lineage_name = os.path.basename(self._lineage_dataset) + self._domain = self.config.get("busco_run", "domain") + self._has_variants_file = os.path.exists( + os.path.join(self._lineage_dataset, "ancestral_variants") + ) + self._dataset_creation_date = self.config.get("busco_run", "creation_date") + self.restart = self.config.getboolean("busco_run", "restart") + + self.gene_details = ( + {} + ) # Dictionary containing coordinate information for predicted genes. 
+ self.headers = set() + + self._lineages_download_path = os.path.join( + self.config.get("busco_run", "download_path"), "lineages" + ) + + self.hmmer_runner = None + + # Create optimized command line call for the given input + # self.busco_type = "main" if isinstance(self._config, BuscoConfigMain) else "auto" + # if self.busco_type == "main": + # self.set_rerun_busco_command(self._config.clargs) # todo: rework rerun command + + @abstractmethod + def cleanup(self): + # Delete any non-decompressed files in busco_downloads + try: + for dataset_name in os.listdir(self._lineages_download_path): + if dataset_name.endswith((".gz", ".tar")): + os.remove(dataset_name) + except OSError: + pass + + @abstractmethod + @log( + "Running BUSCO using lineage dataset {0} ({1}, {2})", + logger, + attr_name=["_lineage_name", "_domain", "_dataset_creation_date"], + on_func_exit=True, + ) + def run_analysis(self): + """ + Abstract method, override to call all needed steps for running the child analysis. + """ + self._create_dirs() + self.init_tools() + self._check_data_integrity() + + @log("***** Run HMMER on gene sequences *****", logger) + def run_hmmer(self, input_sequences, busco_ids=None): + """ + This function runs hmmsearch. + """ + if not busco_ids: + files = sorted(os.listdir(os.path.join(self._lineage_dataset, "hmms"))) + busco_ids = [ + os.path.splitext(f)[0] for f in files + ] # Each Busco ID has a HMM file of the form ".hmm" + self.hmmer_runner.configure_runner( + input_sequences, busco_ids, self._mode, self.gene_details + ) + if self.restart and self.hmmer_runner.check_previous_completed_run(): + logger.info("Skipping HMMER run as output already processed") + elif len(os.listdir(self.hmmer_runner.results_dir)) > 0: + raise SystemExit( + "HMMER results directory not empty. If you are running in restart mode, make sure you are " + "using the same eukaryotic gene predictor (metaeuk/augustus) as before." 
+ ) + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.hmmer_runner.run() + self.hmmer_runner.process_output() + self.validate_output() + self.hmmer_runner.filter() + self.hmmer_runner.consolidate_busco_lists() + output = self.hmmer_runner.create_output_content() + self.hmmer_runner.write_hmmer_results(output) + self.hmmer_runner.produce_hmmer_summary() + return + + def validate_output( + self, + ): # Transparent method that can be overwritten by child classes + return + + @log("Checking dataset for HMM profiles", logger, debug=True) + def _check_dataset_integrity(self): + """ + Check the input dataset for hmm profiles, both files and folder are available + Note: score and length cutoffs are checked when read by hmmer_runner: see _load_scores and _load_lengths + Note: dataset.cfg file is not mandatory for offline mode + # todo: implement a check for dataset.cfg file if not using offline mode + + :raises SystemExit: if the dataset is missing files or folders + """ + + # Check hmm files exist + files = os.listdir(os.path.join(self._lineage_dataset, "hmms")) + if not files: + raise SystemExit( + "The dataset you provided lacks hmm profiles in {}".format( + os.path.join(self._lineage_dataset, "hmms") + ) + ) + + if self._domain == "eukaryota": + # Check prfl folder exists and contains profiles + for dirpath, dirnames, files in os.walk( + os.path.join(self._lineage_dataset, "prfl") + ): + if not files: + raise SystemExit( + "The dataset you provided lacks elements in {}".format( + os.path.join(self._lineage_dataset, "prfl") + ) + ) + + if not self._has_variants_file: + logger.warning( + "The dataset you provided does not contain the file ancestral_variants, likely because it " + 'is an old version. 
All blast steps will use the file "ancestral" instead' + ) + + return + + def _check_data_integrity(self): + self._check_dataset_integrity() + if not os.stat(self.input_file).st_size > 0: + raise SystemExit("Input file is empty.") + with open(self.input_file) as f: + for line in f: + if line.startswith(">"): + self._check_fasta_header(line) + self._check_seq_uniqueness(line) + return + + def _check_seq_uniqueness(self, line): + seq_id = line.split(" ")[0] + if seq_id in self.headers: + raise SystemExit("Duplicate of sequence {} in input file".format(seq_id)) + self.headers.add(seq_id) + return + + @staticmethod + def _check_fasta_header(header): + """ + This function checks problematic characters in fasta headers, + and warns the user and stops the execution + :param header: a fasta header to check + :type header: str + :raises SystemExit: if a problematic character is found + """ + for char in BuscoConfig.FORBIDDEN_HEADER_CHARS: + if char in header: + raise SystemExit( + 'The character "%s" is present in the fasta header %s, ' + "which will crash BUSCO. Please clean the header of your " + "input file." % (char, header.strip()) + ) + + for char in BuscoConfig.FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT: + if char in header.split()[0]: + raise SystemExit( + 'The character "%s" is present in the fasta header %s, ' + "which will crash Reader. Please clean the header of your" + " input file." % (char, header.split()[0].strip()) + ) + + if header.split()[0] == ">": + raise SystemExit( + "A space is present in the fasta header %s, directly after " + '">" which will crash Reader. Please clean the header of ' + "your input file." 
% (header.strip()) + ) + + def _create_dirs(self): + """ + Create the run (main) directory, log directory and the temporary directories + :return: + """ + self._create_main_dir() + self._create_log_dir() + # self._create_tmp_dir() + + def _create_log_dir(self): + """ + Create a subfolder of the main output folder that contains all log files from BUSCO and the external tools used. + :return: + """ + if not os.path.exists(self._log_folder): + os.mkdir(self._log_folder) + return + + def _create_main_dir(self): + """ + This function creates the run (main) directory + :raises SystemExit: if write permissions are not available to the specified location + """ + try: + os.makedirs(self.run_folder) + except FileExistsError: + if not self.restart: + raise SystemExit( + "Something went wrong. BUSCO stopped before overwriting run folder " + "{}".format(self.run_folder) + ) + except PermissionError: + raise SystemExit( + "Cannot write to the output directory, please make sure " + "you have write permissions to {}".format(self.run_folder) + ) + return + + @log("Check all required tools are accessible...", logger, debug=True) + def init_tools(self): + """ + Init the tools needed for the analysis. HMMER is needed for all BUSCO analysis types. 
+ """ + self.hmmer_runner = HMMERRunner() + return + + @property + @abstractmethod + def _mode(self): + pass + + # def _run_tarzip_hmmer_output(self): # todo: rewrite using tarfile + # """ + # This function tarzips "hmmer_output" results folder + # """ + # self._p_open(["tar", "-C", "%s" % self.run_folder, "-zcf", "%shmmer_output.tar.gz" % self.run_folder, + # "hmmer_output", "--remove-files"], "bash", shell=False) + # + # @log("To reproduce this run: {}", logger, attr_name="_rerun_cmd", on_func_exit=True) + # def set_rerun_busco_command(self, clargs): # todo: reconfigure + # """ + # This function sets the command line to call to reproduce this run + # """ + # + # # Find python script path + # entry_point = "" + # frame_ind = -1 + # while "run_BUSCO.py" not in entry_point: + # entry_point = inspect.stack()[frame_ind].filename + # frame_ind -= 1 + # + # # Add required parameters and other options + # self._rerun_cmd = "python %s -i %s -o %s -l %s -m %s -c %s" % (entry_point, self._input_file, os.path.basename(self.main_out), + # self._lineage_dataset, self._mode, self._cpus) + # + # try: + # if self._long: + # self._rerun_cmd += " --long" + # if self._region_limit != BuscoConfig.DEFAULT_ARGS_VALUES["limit"]: + # self._rerun_cmd += " --limit %s" % self._region_limit + # # if self._tmp != BuscoConfig.DEFAULT_ARGS_VALUES["tmp_path"]: + # # self._rerun_cmd += " -t %s" % self._tmp + # if self._ev_cutoff != BuscoConfig.DEFAULT_ARGS_VALUES["evalue"]: + # self._rerun_cmd += " -e %s" % self._ev_cutoff + # # if self._tarzip: + # # self._rerun_cmd += " -z" + # except AttributeError: + # pass + # + # # Include any command line arguments issued by the user + # # arg_aliases = {"-i": "--in", "-o": "--out", "-l": "--lineage_dataset", "-m": "--mode", "-c": "--cpu", + # # "-e": "--evalue", "-f": "--force", "-sp": "--species", "-z": "--tarzip", + # # "-r": "--restart", "-q": "--quiet", "-v": "--version", "-h": "--help"} + # arg_aliases.update(dict(zip(arg_aliases.values(), 
arg_aliases.keys()))) + # for a, arg in enumerate(clargs): + # if arg.startswith("-") and not arg in self._rerun_cmd: + # if arg in arg_aliases: + # if arg_aliases[arg] in self._rerun_cmd: + # continue + # if a + 1 < len(clargs) and not clargs[a + 1].startswith("-"): + # self._rerun_cmd += " %s %s" % (arg, clargs[a + 1]) + # else: + # self._rerun_cmd += " %s" % arg + # return + + # TODO: catch unicode encoding exception and report invalid character line instead of doing content validation + # todo: check config file exists before parsing diff -Nru busco-4.1.4/src/busco/analysis/GeneSetAnalysis.py busco-5.0.0/src/busco/analysis/GeneSetAnalysis.py --- busco-4.1.4/src/busco/analysis/GeneSetAnalysis.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/analysis/GeneSetAnalysis.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: GeneSetAnalysis + :synopsis: GeneSetAnalysis implements genome analysis specifics +.. versionadded:: 3.0.0 +.. versionchanged:: 3.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. + +""" +from busco.analysis.BuscoAnalysis import BuscoAnalysis +from busco.BuscoLogger import BuscoLogger +from busco.analysis.Analysis import ProteinAnalysis +from Bio import SeqIO + +logger = BuscoLogger.get_logger(__name__) + + +class GeneSetAnalysis(ProteinAnalysis, BuscoAnalysis): + """ + This class runs a BUSCO analysis on a gene set. + """ + + _mode = "proteins" + + def __init__(self): + """ + Initialize an instance. + :param params: Values of all parameters that have to be defined + :type params: PipeConfig + """ + super().__init__() + self.sequences_aa = { + record.id: record for record in list(SeqIO.parse(self.input_file, "fasta")) + } + + def cleanup(self): + super().cleanup() + + def run_analysis(self): + """ + This function calls all needed steps for running the analysis. 
+ """ + super().run_analysis() + self.run_hmmer(self.input_file) + self.hmmer_runner.write_buscos_to_file(self.sequences_aa) + # if self._tarzip: + # self._run_tarzip_hmmer_output() + return diff -Nru busco-4.1.4/src/busco/analysis/GenomeAnalysis.py busco-5.0.0/src/busco/analysis/GenomeAnalysis.py --- busco-4.1.4/src/busco/analysis/GenomeAnalysis.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/analysis/GenomeAnalysis.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,785 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: GenomeAnalysis + :synopsis: GenomeAnalysis implements genome analysis specifics +.. versionadded:: 3.0.0 +.. versionchanged:: 5.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. + +""" +from busco.analysis.BuscoAnalysis import BuscoAnalysis +from busco.analysis.Analysis import NucleotideAnalysis, BLASTAnalysis +from busco.busco_tools.prodigal import ProdigalRunner +from busco.busco_tools.metaeuk import MetaeukRunner +from busco.busco_tools.augustus import ( + AugustusRunner, + GFF2GBRunner, + NewSpeciesRunner, + ETrainingRunner, + OptimizeAugustusRunner, +) +from busco.busco_tools.base import NoRerunFile, NoGenesError +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from abc import ABCMeta, abstractmethod +from configparser import NoOptionError +import time +import os +import pandas as pd +from collections import defaultdict +import subprocess + +logger = BuscoLogger.get_logger(__name__) + + +class GenomeAnalysis(NucleotideAnalysis, BuscoAnalysis, metaclass=ABCMeta): + + _mode = "genome" + + def __init__(self): + super().__init__() + + @abstractmethod + def run_analysis(self): + super().run_analysis() + + def init_tools(self): + """ + Initialize tools needed for Genome Analysis. 
+ :return: + """ + super().init_tools() + + # def _run_tarzip_augustus_output(self): # Todo: rewrite using tarfile + # """ + # This function tarzips results folder + # """ + # # augustus_output/predicted_genes + # + # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, + # "-zcf", "%saugustus_output/predicted_genes.tar.gz" % + # self.main_out, "predicted_genes", "--remove-files"], + # "bash", shell=False) + # # augustus_output/extracted_proteins + # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, + # "-zcf", "%saugustus_output/extracted_proteins.tar.gz" % + # self.main_out, "extracted_proteins", "--remove-files"], + # "bash", shell=False) + # # augustus_output/gb + # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, + # "-zcf", "%saugustus_output/gb.tar.gz" % self.main_out, "gb", "--remove-files"], + # "bash", shell=False) + # # augustus_output/gffs + # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, + # "-zcf", "%saugustus_output/gffs.tar.gz" % + # self.main_out, "gffs", "--remove-files"], "bash", shell=False) + # # single_copy_busco_sequences + # self._p_open(["tar", "-C", "%s" % self.main_out, "-zcf", + # "%ssingle_copy_busco_sequences.tar.gz" % self.main_out, + # "single_copy_busco_sequences", "--remove-files"], "bash", shell=False) + + # def set_rerun_busco_command(self, clargs): + # """ + # This function sets the command line to call to reproduce this run + # """ + # clargs.extend(["-sp", self._target_species]) + # super().set_rerun_busco_command(clargs) + + +class GenomeAnalysisProkaryotes(GenomeAnalysis): + """ + This class runs a BUSCO analysis on a genome. + """ + + def __init__(self): + """ + Initialize an instance. + """ + super().__init__() + self.prodigal_runner = None + + def cleanup(self): + super().cleanup() + + def run_analysis(self): + """ + This function calls all needed steps for running the analysis. 
+ """ + super().run_analysis() + self._run_prodigal() + self.run_hmmer(self.prodigal_runner.output_faa) + self.hmmer_runner.write_buscos_to_file(self.sequences_aa, self.sequences_nt) + return + + def init_tools(self): + """ + Init the tools needed for the analysis + """ + super().init_tools() + self.prodigal_runner = ProdigalRunner() + + @log("***** Run Prodigal on input to predict and extract genes *****", logger) + def _run_prodigal(self): + """ + Run Prodigal on input file to detect genes. + :return: + """ + if self.restart and self.prodigal_runner.check_previous_completed_run(): + logger.info("Skipping Prodigal run as it has already completed") + self.prodigal_runner.get_gene_details() + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.prodigal_runner.run() + self.gene_details = self.prodigal_runner.gene_details + self.sequences_nt = self.prodigal_runner.sequences_nt + self.sequences_aa = self.prodigal_runner.sequences_aa + + return + + +class GenomeAnalysisEukaryotes(GenomeAnalysis): + """ + This class runs a BUSCO analysis on a eukaryote genome. 
+ """ + + def __init__(self): + super().__init__() + + self.sequences_nt = {} + self.sequences_aa = {} + + def cleanup(self): + """ + This function cleans temporary files + """ + super().cleanup() + + def init_tools(self): + """ + Initialize all required tools for Genome Eukaryote Analysis: + metaeuk + :return: + """ + super().init_tools() + + return + + @abstractmethod + def run_analysis(self): + super().run_analysis() + + # def set_rerun_busco_command(self, clargs): + # """ + # This function sets the command line to call to reproduce this run + # """ + # clargs.extend(["-sp", self._target_species]) + # if self._augustus_parameters: + # clargs.extend(["--augustus_parameters", "\"%s\"" % self._augustus_parameters]) + # super().set_rerun_busco_command(clargs) + + +class GenomeAnalysisEukaryotesAugustus(BLASTAnalysis, GenomeAnalysisEukaryotes): + def __init__(self): + super().__init__() + self._long = self.config.getboolean("busco_run", "long") + try: + self._target_species = self.config.get("busco_run", "augustus_species") + except KeyError: + raise SystemExit( + "Something went wrong. Eukaryota datasets should specify an augustus species." 
+ ) + try: + self._augustus_parameters = self.config.get( + "busco_run", "augustus_parameters" + ).replace(",", " ") + except NoOptionError: + self._augustus_parameters = "" + self.mkblast_runner = None + self.tblastn_runner = None + self.augustus_runner = None + self.gff2gb_runner = None + self.new_species_runner = None + self.etraining_runner = None + self.optimize_augustus_runner = None + + def init_tools(self): + super().init_tools() + self.augustus_runner = AugustusRunner() + self.gff2gb_runner = GFF2GBRunner() + self.new_species_runner = NewSpeciesRunner() + self.etraining_runner = ETrainingRunner() + + if self._long: + self.optimize_augustus_runner = OptimizeAugustusRunner() + + def cleanup(self): + try: + if self._target_species.startswith("BUSCO"): + self.augustus_runner.move_retraining_parameters() + except OSError: + pass + super().cleanup() + + def run_analysis(self): + """This function calls all needed steps for running the analysis.""" + super().run_analysis() + self._run_mkblast() + self._run_tblastn() + self._run_augustus(self.tblastn_runner.coords) + self.gene_details = self.augustus_runner.gene_details + self.run_hmmer(self.augustus_runner.output_sequences) + self._rerun_analysis() + + def _rerun_augustus(self, coords): + missing_and_fragmented_buscos = self.hmmer_runner.missing_buscos + list( + self.hmmer_runner.fragmented_buscos.keys() + ) + logger.info( + "Re-running Augustus with the new metaparameters, number of target BUSCOs: {}".format( + len(missing_and_fragmented_buscos) + ) + ) + missing_and_fragmented_coords = { + busco: coords[busco] + for busco in coords + if busco in missing_and_fragmented_buscos + } + logger.debug("Trained species folder is {}".format(self._target_species)) + self._run_augustus(missing_and_fragmented_coords) + return + + @log( + "Starting second step of analysis. 
The gene predictor Augustus is retrained using the results from the " + "initial run to yield more accurate results.", + logger, + ) + def _rerun_analysis(self): + + self.augustus_runner.make_gff_files(self.hmmer_runner.single_copy_buscos) + self._run_tblastn( + missing_and_frag_only=True, ancestral_variants=self._has_variants_file + ) + self._run_gff2gb() + self._run_new_species() + self.config.set( + "busco_run", "augustus_species", self.new_species_runner.new_species_name + ) + self._target_species = self.new_species_runner.new_species_name + self._run_etraining() + + if self._long: + self._run_optimize_augustus(self.new_species_runner.new_species_name) + self._run_etraining() + + try: + self._rerun_augustus(self.tblastn_runner.coords) + self.gene_details.update(self.augustus_runner.gene_details) + self.run_hmmer(self.augustus_runner.output_sequences) + self.augustus_runner.make_gff_files(self.hmmer_runner.single_copy_buscos) + self.augustus_runner.make_gff_files(self.hmmer_runner.multi_copy_buscos) + self.augustus_runner.make_gff_files(self.hmmer_runner.fragmented_buscos) + self.hmmer_runner.write_buscos_to_file(self.sequences_aa, self.sequences_nt) + except NoGenesError: + logger.warning("No genes found on Augustus rerun.") + + # if self._tarzip: # todo: zip folders with a lot of output + # self._run_tarzip_augustus_output() + # self._run_tarzip_hmmer_output() + # remove the checkpoint, run is done + # self._set_checkpoint() + return + + @log("Running Augustus gene predictor on BLAST search results.", logger) + def _run_augustus(self, coords): + self.augustus_runner.configure_runner( + self.tblastn_runner.output_seqs, + coords, + self.sequences_aa, + self.sequences_nt, + ) + + if self.restart and self.augustus_runner.check_previous_completed_run(): + run = "2nd" if self.augustus_runner.run_number == 2 else "1st" + logger.info( + "Skipping {} augustus run as output already processed".format(run) + ) + else: + self.restart = False + self.config.set("busco_run", 
"restart", str(self.restart)) + self.augustus_runner.run() + self.augustus_runner.process_output() + self.sequences_nt = self.augustus_runner.sequences_nt + self.sequences_aa = self.augustus_runner.sequences_aa + + def _run_etraining(self): + """Train on new training set (complete single copy buscos)""" + self.etraining_runner.configure_runner(self.new_species_runner.new_species_name) + if self.restart and self.etraining_runner.check_previous_completed_run(): + logger.info("Skipping etraining as it has already been done") + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.etraining_runner.run() + return + + @log("Converting predicted genes to short genbank files", logger) + def _run_gff2gb(self): + self.gff2gb_runner.configure_runner(self.hmmer_runner.single_copy_buscos) + if self.restart and self.gff2gb_runner.check_previous_completed_run(): + logger.info("Skipping gff2gb conversion as it has already been done") + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.gff2gb_runner.run() + return + + @log( + "All files converted to short genbank files, now training Augustus using Single-Copy Complete BUSCOs", + logger, + ) + def _run_new_species(self): + """Create new species config file from template""" + if self.restart and self.new_species_runner.check_previous_completed_run(): + logger.info("Skipping new species creation as it has already been done") + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.new_species_runner.run() + return + + def _run_optimize_augustus(self, new_species_name): + """ long mode (--long) option - runs all the Augustus optimization scripts (adds ~1 day of runtime)""" + logger.warning( + "Optimizing augustus metaparameters, this may take a very long time, started at {}".format( + time.strftime("%m/%d/%Y %H:%M:%S") + ) + ) + self.optimize_augustus_runner.configure_runner( + 
self.augustus_runner.output_folder, new_species_name + ) + self.optimize_augustus_runner.run() + return + + +class GenomeAnalysisEukaryotesMetaeuk(GenomeAnalysisEukaryotes): + def __init__(self): + super().__init__() + self.metaeuk_runner = None + self.gene_details = {} + + def init_tools(self): + super().init_tools() + + self.metaeuk_runner = MetaeukRunner() + + def run_analysis(self): + """This function calls all needed steps for running the analysis.""" + super().run_analysis() + incomplete_buscos = None + for i in range(2): + try: + self._run_metaeuk(incomplete_buscos) + self.gene_details.update(self.metaeuk_runner.gene_details) + self.sequences_aa.update(self.metaeuk_runner.sequences_aa) + self.run_hmmer( + self.metaeuk_runner.pred_protein_seqs_modified, + busco_ids=incomplete_buscos, + ) + incomplete_buscos = self.hmmer_runner.missing_buscos + list( + self.hmmer_runner.fragmented_buscos.keys() + ) + if len(incomplete_buscos) == 0: + break + except NoRerunFile: + if i == 1: + logger.info("Metaeuk rerun did not find any genes") + else: + raise SystemExit( + "Metaeuk did not find any genes in the input file." + ) + + try: + self.metaeuk_runner.combine_run_results() + except FileNotFoundError: + # This exception should only happen if the rerun file does not exist. If the initial run file was + # missing there would have been a SystemExit call above. The index 0 sets the "combined" file to the + # output of the initial run. 
+ self.metaeuk_runner.combined_pred_protein_seqs = ( + self.metaeuk_runner.pred_protein_mod_files[0] + ) + self.hmmer_runner.write_buscos_to_file(self.sequences_aa) + + def _run_metaeuk(self, incomplete_buscos): + self.metaeuk_runner.configure_runner(incomplete_buscos) + if self.restart and self.metaeuk_runner.check_previous_completed_run(): + logger.info("Skipping Metaeuk run as already run") + else: + self.restart = False + self.config.set("busco_run", "restart", str(self.restart)) + self.metaeuk_runner.run() + + self.metaeuk_runner.edit_protein_file() + self.metaeuk_runner.get_gene_details() # The gene details contain the overlaps that were removed when editing + # the protein file, but it doesn't matter, as it is just a look-up + # dictionary + + def validate_output(self): + if len(self.metaeuk_runner.headers_files) < 2: + return + hmmer_results = { + **self.hmmer_runner.is_complete, + **self.hmmer_runner.is_very_large, + **self.hmmer_runner.is_fragment, + } + if len(hmmer_results) > 0: + exon_records = self.get_exon_records(hmmer_results) + df = self.exons_to_df(exon_records) + overlaps = self.find_overlaps(df) + if overlaps: + inds_to_remove = self.handle_overlaps(overlaps, df) + inds_to_remove = list(set(inds_to_remove)) + df.drop(inds_to_remove, inplace=True) + complete, matched_genes_complete = self.reconstruct_hmmer_results( + df, self.hmmer_runner.is_complete + ) + v_large, matched_genes_v_large = self.reconstruct_hmmer_results( + df, self.hmmer_runner.is_very_large + ) + fragmented, matched_genes_fragmented = self.reconstruct_hmmer_results( + df, self.hmmer_runner.is_fragment + ) + + # Update hmmer runner with new dictionaries + self.hmmer_runner.is_complete = complete + self.hmmer_runner.is_very_large = v_large + self.hmmer_runner.is_fragment = fragmented + self.hmmer_runner.matched_genes_complete = matched_genes_complete + self.hmmer_runner.matched_genes_vlarge = matched_genes_v_large + self.hmmer_runner.matched_genes_fragment = 
matched_genes_fragmented + self.hmmer_runner.gene_details = self.gene_details + return + + def get_exon_records( + self, busco_dict + ): # Placed in the GenomeAnalysis module because it draws on both hmmer_runner and metaeuk_runner methods + + initial_run_results = self.metaeuk_runner.headers_files[0] + rerun_results = self.metaeuk_runner.headers_files[1] + + exon_records = [] + for busco_id, gene_match in busco_dict.items(): + for gene_id, details in gene_match.items(): + sequence, coords = gene_id.rsplit(":", 1) + gene_start, gene_end = coords.split("-") + strand = self.gene_details[gene_id][0]["strand"] + score = details[0]["bitscore"] + + # Need to determine run using HMMER results instead of metaeuk results. This is because exons can be + # matched identically to different BUSCO IDs, both on the same and on different runs. The presence of a + # match in the metaeuk rerun results does not indicate that the HMMER match in question is associated + # .with that metaeuk match + run_found = ( + "2" + if os.path.exists( + os.path.join( + self.hmmer_runner.rerun_results_dir, + "{}.out".format(busco_id), + ) + ) + else "1" + ) + + if run_found == "2": + matches = subprocess.check_output( + [ + "grep", + "{}|{}|.*|{}|{}|".format( + sequence, strand, gene_start, gene_end + ), + rerun_results, + ] + ).decode("utf-8") + else: + matches = subprocess.check_output( + [ + "grep", + "{}|{}|.*|{}|{}|".format( + sequence, strand, gene_start, gene_end + ), + initial_run_results, + ] + ).decode("utf-8") + + # The following line is a relic from when the previous grep search tried to match busco_id instead of + # gene details. The find_match method is still needed though to clean up the match, even though it + # redundantly matches the gene coordinates again. 
+ good_match = self.metaeuk_runner.find_match( + matches, + ["|{}|".format(gene_start), "|{}|".format(gene_end), sequence], + ) + + if good_match: + low_coords, high_coords = self.metaeuk_runner.extract_exon_coords( + good_match + ) + for i, entry in enumerate(low_coords): + record = ( + busco_id, + sequence, + entry, + high_coords[i], + strand, + score, + run_found, + ) + exon_records.append(record) + return exon_records + + def reconstruct_hmmer_results(self, df, hmmer_result_dict): + busco_groups = df.groupby(["Busco id"]) + hmmer_result_dict_new = defaultdict(dict) + matched_genes_new = defaultdict(list) + for busco_id, matches in hmmer_result_dict.items(): + try: + busco_group = busco_groups.get_group(busco_id) + except KeyError: # if busco was removed during overlap filtering + continue + busco_score_groups = busco_group.groupby(["Score"]) + for _, busco_score_group in busco_score_groups: + min_coord = None + for idx, row in busco_score_group.iterrows(): + low_coord = row["Start"] + high_coord = row["Stop"] + score = row["Score"] + seq = row["Sequence"] + if min_coord: + min_coord = min(min_coord, low_coord) + max_coord = max(max_coord, high_coord) + else: + min_coord = low_coord + max_coord = high_coord + for gene_match, details in matches.items(): + if details[0]["bitscore"] == score: + new_gene_match = "{}:{}-{}".format(seq, min_coord, max_coord) + hmmer_result_dict_new[busco_id].update( + {new_gene_match: details} + ) + matched_genes_new[new_gene_match].append(busco_id) + self.gene_details[new_gene_match] = self.gene_details[ + gene_match + ] + self.sequences_aa[ + new_gene_match + ] = self.metaeuk_runner.sequences_aa[gene_match] + return hmmer_result_dict_new, matched_genes_new + + @log("Validating exons and removing overlapping matches", logger) + def exons_to_df(self, records): + df = self.metaeuk_runner.records_to_df(records) + df["Start"] = df["Start"].astype(int) + df["Stop"] = df["Stop"].astype(int) + df["Score"] = df["Score"].astype(float) + 
df["Run found"] = df["Run found"].astype(int) + df.loc[df["Strand"] == "-", ["Start", "Stop"]] = df.loc[ + df["Strand"] == "-", ["Stop", "Start"] + ].values # reverse coordinates on negative strand + return df + + def find_overlaps(self, df): + overlaps = self.metaeuk_runner.test_for_overlaps(df) + busco_overlaps = [] + for overlap in overlaps: + match1 = df.loc[overlap[0]] + match2 = df.loc[overlap[1]] + if (match1["Busco id"] != match2["Busco id"]) and ( + match1["Start"] % 3 == match2["Start"] % 3 + ): + # check the overlaps are for two different BUSCOs and check overlaps are in the same reading frame + busco_overlaps.append(overlap) + return busco_overlaps + + def handle_overlaps(self, overlaps, df): + indices_to_remove = [] + for overlap_inds in overlaps: + bad_inds = self.handle_diff_busco_overlap(overlap_inds, df) + indices_to_remove.extend(bad_inds) + return indices_to_remove + + def handle_diff_busco_overlap(self, overlap_inds, df): + match1 = df.loc[overlap_inds[0]] + match2 = df.loc[overlap_inds[1]] + seq = match1["Sequence"] + busco_match1 = match1["Busco id"] + run_match1 = match1["Run found"] + busco_match2 = match2["Busco id"] + run_match2 = match2["Run found"] + exons1 = df.loc[(df["Busco id"] == busco_match1) & (df["Sequence"] == seq)] + exons2 = df.loc[(df["Busco id"] == busco_match2) & (df["Sequence"] == seq)] + hmmer_run_folder1 = ( + self.hmmer_runner.initial_results_dir + if run_match1 == 1 + else self.hmmer_runner.rerun_results_dir + ) + hmmer_run_folder2 = ( + self.hmmer_runner.initial_results_dir + if run_match2 == 1 + else self.hmmer_runner.rerun_results_dir + ) + hmmer_result1 = os.path.join(hmmer_run_folder1, "{}.out".format(busco_match1)) + hmmer_result2 = os.path.join(hmmer_run_folder2, "{}.out".format(busco_match2)) + hmmer_match_details1 = self.hmmer_runner.parse_hmmer_output( + hmmer_result1, busco_match1 + ) + hmmer_match_details2 = self.hmmer_runner.parse_hmmer_output( + hmmer_result2, busco_match2 + ) + gene_id1 = 
list(hmmer_match_details1.keys())[0] + gene_id2 = list(hmmer_match_details2.keys())[0] + if ( + hmmer_match_details1[gene_id1]["score"] + > hmmer_match_details2[gene_id2]["score"] + ): + priority_match = hmmer_match_details1 + secondary_match = hmmer_match_details2 + priority_exons = exons1 + secondary_exons = exons2 + priority_gene_id = gene_id1 + secondary_gene_id = gene_id2 + else: + priority_match = hmmer_match_details2 + secondary_match = hmmer_match_details1 + priority_exons = exons2 + secondary_exons = exons1 + priority_gene_id = gene_id2 + secondary_gene_id = gene_id1 + priority_env_coords = iter(priority_match[priority_gene_id]["env_coords"]) + secondary_env_coords = iter(secondary_match[secondary_gene_id]["env_coords"]) + priority_used_exons, priority_unused_exons = self.find_unused_exons( + priority_env_coords, priority_exons + ) + secondary_used_exons, secondary_unused_exons = self.find_unused_exons( + secondary_env_coords, secondary_exons + ) + + priority_used_exons = ( + pd.DataFrame.from_records(priority_used_exons, index="index") + if priority_used_exons + else None + ) + priority_unused_exons = ( + pd.DataFrame.from_records(priority_unused_exons, index="index") + if priority_unused_exons + else None + ) + secondary_used_exons = ( + pd.DataFrame.from_records(secondary_used_exons, index="index") + if secondary_used_exons + else None + ) + secondary_unused_exons = ( + pd.DataFrame.from_records(secondary_unused_exons, index="index") + if secondary_unused_exons + else None + ) + + indices_to_remove = [] + # Check if secondary match uses priority match exons + used_exons = pd.concat([priority_used_exons, secondary_used_exons]) + overlaps = self.metaeuk_runner.test_for_overlaps(used_exons) + if overlaps: + # Remove secondary match + indices_to_remove = secondary_exons.index + return indices_to_remove + + # Check to see if unused priority exons are used by secondary match + indices_to_remove.extend( + self.get_indices_to_remove(secondary_used_exons, 
priority_unused_exons) + ) + + # Check to see if unused secondary exons are used by priority match + indices_to_remove.extend( + self.get_indices_to_remove(priority_used_exons, secondary_unused_exons) + ) + + # Check to see if unused secondary exons overlap with priority unused exons + indices_to_remove.extend( + self.get_indices_to_remove(priority_unused_exons, secondary_unused_exons) + ) + + return indices_to_remove + + def get_indices_to_remove(self, priority_exons, secondary_exons): + indices_to_remove = [] + try: + exons_to_check = pd.concat([priority_exons, secondary_exons]) + except ValueError: + # all exons are None + return indices_to_remove + + overlaps = self.metaeuk_runner.test_for_overlaps(exons_to_check) + if overlaps: + for overlap in overlaps: + match1 = exons_to_check.loc[overlap[0]] + index_to_remove = ( + overlap[0] + if secondary_exons.iloc[0]["Busco id"] == match1["Busco id"] + else overlap[1] + ) + indices_to_remove.append(index_to_remove) + return indices_to_remove + + @staticmethod + def find_unused_exons(env_coords, exons): + remaining_hmm_region = 0 + unused_exons = [] + used_exons = [] + hmm_coords = next(env_coords) + exon_cumul_len = 0 + for idx, entry in exons.iterrows(): + entry["index"] = idx + exon_matched = False + exon_size_nt = int(entry["Stop"]) - int(entry["Start"]) + 1 + if not exon_size_nt % 3 == 0: + raise SystemExit( + "The exon coordinates contain fractional reading frames and are ambiguous." 
+ ) + exon_size_aa = exon_size_nt / 3 + exon_cumul_len += exon_size_aa + if remaining_hmm_region > exon_size_aa: + remaining_hmm_region -= exon_size_aa + exon_matched = True + + elif remaining_hmm_region: + exon_matched = True + + elif hmm_coords: + while hmm_coords[0] < exon_cumul_len + 1: + # hmm starts within exon + exon_matched = True + if hmm_coords[1] <= exon_cumul_len + 1: + # hmm ends within exon; cycle to the next hmm region + try: + hmm_coords = next(env_coords) + except StopIteration: + hmm_coords = None + break + continue + else: + remaining_hmm_region = hmm_coords[1] - exon_size_aa + 1 + break + if exon_matched: + used_exons.append(entry) + else: + unused_exons.append(entry) + return used_exons, unused_exons + + def cleanup(self): + try: + self.metaeuk_runner.remove_tmp_files() + except OSError: + pass + super().cleanup() diff -Nru busco-4.1.4/src/busco/analysis/TranscriptomeAnalysis.py busco-5.0.0/src/busco/analysis/TranscriptomeAnalysis.py --- busco-4.1.4/src/busco/analysis/TranscriptomeAnalysis.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/analysis/TranscriptomeAnalysis.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: TranscriptomeAnalysis + :synopsis:TranscriptomeAnalysis implements genome analysis specifics +.. versionadded:: 3.0.0 +.. versionchanged:: 5.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. 
+ +""" +import os +from busco.analysis.BuscoAnalysis import BuscoAnalysis +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from Bio.Seq import reverse_complement, translate +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from busco.analysis.Analysis import NucleotideAnalysis, BLASTAnalysis +from busco.analysis.GenomeAnalysis import GenomeAnalysisEukaryotesMetaeuk +from abc import ABCMeta +from collections import defaultdict + + +logger = BuscoLogger.get_logger(__name__) + + +class TranscriptomeAnalysis(NucleotideAnalysis, metaclass=ABCMeta): + """ + Analysis on a transcriptome. + """ + + _mode = "transcriptome" + + def __init__(self): + """ + Initialize an instance. + """ + super().__init__() + + +class TranscriptomeAnalysisProkaryotes( + TranscriptomeAnalysis, BLASTAnalysis, BuscoAnalysis +): + """ + Analysis on a transcriptome. + """ + + def __init__(self): + """ + Initialize an instance. + """ + super().__init__() + self.all_sequences = defaultdict(lambda: defaultdict(dict)) + + self.sequences_aa = {} + self.complete_seqs_nt = {} + self.complete_seqs_aa = {} + + self.single_copy_proteins_file = os.path.join( + self.run_folder, "single_copy_proteins.faa" + ) + + def run_analysis(self): + """ + This function calls all needed steps for running the analysis. + """ + + super().run_analysis() + + self._run_mkblast() + self._run_tblastn(ancestral_variants=self._has_variants_file) + + protein_seq_files = self._translate_seqs(self.tblastn_runner.coords) + + self.run_hmmer(protein_seq_files) + + self.prepare_sequences() + self.write_complete_seqs() + + self.hmmer_runner.write_buscos_to_file(self.sequences_aa, self.sequences_nt) + + # if self._tarzip: + # self._run_tarzip_hmmer_output() + # self._run_tarzip_translated_proteins() + return + + def init_tools(self): + super().init_tools() + + def cleanup(self): + """ + This function cleans temporary files. 
+ """ + super().cleanup() + + def match_hmmer_results(self, hmmer_results): + seqs_aa = {} + seqs_nt = {} + + for busco_id, gene_matches in hmmer_results.items(): + for gene_id in gene_matches: + gene_info = gene_matches[gene_id][0] + frame = gene_info["frame"] + seq_aa = self.all_sequences[busco_id][gene_id]["translations"][frame] + seq_nt = self.all_sequences[busco_id][gene_id]["original"] + seqs_aa[gene_id] = seq_aa + seqs_nt[gene_id] = seq_nt + return seqs_aa, seqs_nt + + def prepare_sequences(self): + hmmer_results_complete = self.hmmer_runner.single_copy_buscos + + sc_seqs_aa, sc_seqs_nt = self.match_hmmer_results(hmmer_results_complete) + self.complete_seqs_aa = [val for key, val in sc_seqs_aa.items()] + + hmmer_results_remainder = { + **self.hmmer_runner.multi_copy_buscos, + **self.hmmer_runner.fragmented_buscos, + } + + remainder_seqs_aa, remainder_seqs_nt = self.match_hmmer_results( + hmmer_results_remainder + ) + + self.sequences_aa = {**sc_seqs_aa, **remainder_seqs_aa} + self.sequences_nt = {**sc_seqs_nt, **remainder_seqs_nt} + + return + + def write_complete_seqs(self): + + with open(self.single_copy_proteins_file, "w") as out_faa: + SeqIO.write(self.complete_seqs_aa, out_faa, "fasta") + + return + + @staticmethod + def six_frame_translation(record): + """ + Gets the sixframe translation for the provided sequence + :param seq: the sequence to be translated + :type seq: str + :return: the six translated sequences + :rtype: list + """ + descriptions = { + 1: "orig_seq_frame_1", + 2: "orig_seq_frame_2", + 3: "orig_seq_frame_3", + -1: "rev_comp_frame_1", + -2: "rev_comp_frame_2", + -3: "rev_comp_frame_3", + } + + # Based on code excerpt from https://biopython.org/DIST/docs/api/Bio.SeqUtils-pysrc.html#six_frame_translations + anti = reverse_complement(record.seq) + translated_seqs = {} + for i in range(3): + fragment_length = 3 * ((len(record.seq) - i) // 3) + translated_seqs[descriptions[i + 1]] = SeqRecord( + translate(record.seq[i : i + fragment_length], 
stop_symbol="X"), + id=record.id, + name=record.id, + description=descriptions[i + 1], + ) + translated_seqs[descriptions[-(i + 1)]] = SeqRecord( + translate(anti[i : i + fragment_length], stop_symbol="X"), + id=record.id, + name=record.id, + description=descriptions[i + 1], + ) + return translated_seqs + + @staticmethod + def _reformats_seq_id(seq_id): + """ + This function reformats the sequence id to its original values + :param seq_id: the seq id to reformats + :type seq_id: str + :return: the reformatted seq_id + :rtype: str + """ + return "_".join(seq_id.split("_")[:-1]) + + @log("Translating candidate transcripts", logger) + def _translate_seqs(self, coords): + + translated_proteins_dir = os.path.join(self.main_out, "translated_proteins") + if not os.path.exists(translated_proteins_dir): + os.makedirs(translated_proteins_dir) + + contig_names = [] + for contig_info in coords.values(): + for contig in contig_info: + contig_names.append(contig) + + protein_seq_files = [] + for busco_id, contig_info in coords.items(): + output_filename = os.path.join( + translated_proteins_dir, "{}.faa".format(busco_id) + ) + protein_seq_files.append(output_filename) + translated_records = [] + for contig_name in contig_info: + tmp_filename = os.path.join( + self.tblastn_runner.output_seqs, "{}.temp".format(contig_name[:100]) + ) # Avoid very long filenames + for record in SeqIO.parse( + tmp_filename, "fasta" + ): # These files will only ever have one sequence, + # but BioPython examples always parse them in an iterator. 
+ translated_seqs = self.six_frame_translation(record) + gene_id = "{}:{}-{}".format( + record.id, + contig_info[contig_name]["contig_start"], + contig_info[contig_name]["contig_end"], + ) + self.all_sequences[busco_id][gene_id][ + "translations" + ] = translated_seqs + self.all_sequences[busco_id][gene_id]["original"] = record + for ( + desc_id + ) in translated_seqs: # There are six possible translated sequences + prot_seq = translated_seqs[desc_id] + prot_seq.id = gene_id + translated_records.append(prot_seq) + + with open(output_filename, "w") as out_faa: + SeqIO.write(translated_records, out_faa, "fasta") + + return protein_seq_files + + # def _run_tarzip_translated_proteins(self): + # """ + # This function tarzips results folder + # """ + # # translated_proteins # Todo: rewrite with tarfile module + # self._p_open(["tar", "-C", "%s" % self.mainout, "-zcf", + # "%stranslated_proteins.tar.gz" % self.mainout, "translated_proteins", "--remove-files"], "bash", + # shell=False) + + +class TranscriptomeAnalysisEukaryotes( + TranscriptomeAnalysis, GenomeAnalysisEukaryotesMetaeuk +): + def __init__(self): + super().__init__() diff -Nru busco-4.1.4/src/busco/Analysis.py busco-5.0.0/src/busco/Analysis.py --- busco-4.1.4/src/busco/Analysis.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/Analysis.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,99 +0,0 @@ -from Bio import SeqIO -from busco.BuscoTools import TBLASTNRunner, MKBLASTRunner -from busco.BuscoLogger import BuscoLogger -import os -from abc import ABCMeta - -logger = BuscoLogger.get_logger(__name__) - - -class NucleotideAnalysis(metaclass=ABCMeta): - - LETTERS = ["A", "C", "T", "G", "N"] - - # explanation of ambiguous codes found here: https://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html - AMBIGUOUS_CODES = ["Y", "R", "W", "S", "K", "M", "D", "V", "H", "B"] - - def __init__(self): - - super().__init__() # Initialize BuscoAnalysis - if not self.check_nucleotide_file(self._input_file): - 
raise SystemExit("Please provide a nucleotide file as input") - - def check_nucleotide_file(self, filename): - i = 0 - for record in SeqIO.parse(filename, "fasta"): - for letter in record.seq.upper(): - if i > 5000: - break - i += 1 - if letter not in type(self).LETTERS and letter not in type(self).AMBIGUOUS_CODES: - return False - else: - continue # only continue to next record of 5000 has not been hit - break # If for loop exits with "break", the else clause is skipped and the outer loop also breaks. - - return True - - def init_tools(self): - super().init_tools() - self.mkblast_runner = MKBLASTRunner() - self.tblastn_runner = TBLASTNRunner() - - if self.mkblast_runner.version != self.tblastn_runner.version: - logger.warning("You are using version {} of makeblastdb and version {} of tblastn.".format( - self.mkblast_runner.version, self.tblastn_runner.version)) - - def _run_mkblast(self): - if self.restart and self.mkblast_runner.check_previous_completed_run(): - logger.info("Skipping makeblastdb as BLAST DB already exists at {}".format(self.mkblast_runner.output_db)) - else: - self.restart = False # Turn off restart mode if this is the entry point - self.config.set("busco_run", "restart", str(self.restart)) - self.mkblast_runner.run() - if len(os.listdir(os.path.split(self.mkblast_runner.output_db)[0])) == 0: - raise SystemExit("makeblastdb failed to create a BLAST DB at {}".format(self.mkblast_runner.output_db)) - - def _run_tblastn(self, missing_and_frag_only=False, ancestral_variants=False): - - incomplete_buscos = (self.hmmer_runner.missing_buscos + list(self.hmmer_runner.fragmented_buscos.keys()) - if missing_and_frag_only else None) # This parameter is only used on the re-run - - self.tblastn_runner.configure_runner(self.mkblast_runner.output_db, missing_and_frag_only, - ancestral_variants, incomplete_buscos) - if self.restart and self.tblastn_runner.check_previous_completed_run(): - logger.info("Skipping tblastn as results already exist at 
{}".format(self.tblastn_runner.blast_filename)) - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.tblastn_runner.run() - self.tblastn_runner.get_coordinates() - self.tblastn_runner.filter_best_matches() - self.tblastn_runner.write_coordinates_to_file() # writes to "coordinates.tsv" - self.tblastn_runner.write_contigs() - return - - -class ProteinAnalysis: - - LETTERS = ["F", "L", "I", "M", "V", "S", "P", "T", "A", "Y", "X", "H", "Q", "N", "K", "D", "E", "C", "W", "R", "G"] - NUCL_LETTERS = ["A", "C", "T", "G", "N"] - - def __init__(self): - super().__init__() - if not self.check_protein_file(self._input_file): - raise SystemExit('Please provide a protein file as input') - - def check_protein_file(self, filename): - - for i, record in enumerate(SeqIO.parse(filename, "fasta")): - if i > 10: - break - for letter in record.seq: - if letter.upper() not in type(self).NUCL_LETTERS and letter.upper() in type(self).LETTERS: - return True - elif letter.upper() not in type(self).LETTERS: - return False - else: - continue - return False # if file only contains "A", "T", "C", "G", "N", it is unlikely to be a protein file diff -Nru busco-4.1.4/src/busco/AutoLineage.py busco-5.0.0/src/busco/AutoLineage.py --- busco-4.1.4/src/busco/AutoLineage.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/AutoLineage.py 2021-01-26 11:28:47.000000000 +0000 @@ -5,10 +5,16 @@ from busco.BuscoLogger import LogDecorator as log from busco.BuscoRunner import BuscoRunner import numpy as np -import numpy.ma as ma logger = BuscoLogger.get_logger(__name__) + +class NoGenesError(SystemExit): + + def __init__(self): + super().__init__("No genes were recognized by BUSCO. Please check the content of your input file.") + + class AutoSelectLineage: """ Class for selecting the best lineage dataset for the input data. @@ -16,6 +22,8 @@ highest BUSCO score. 
""" + runners = [] + @log("***** Starting Auto Select Lineage *****\n\t" "This process runs BUSCO on the generic lineage datasets for the domains archaea, bacteria and eukaryota. " "Once the optimal domain is selected, BUSCO automatically attempts to find the most appropriate BUSCO dataset " @@ -44,6 +52,7 @@ self.f_percents = [] self.best_match_lineage_dataset = None self.current_lineage = None + self.virus_pipeline = False def record_results(self, s_buscos, d_buscos, f_buscos, s_percent, d_percent, f_percent): """ @@ -70,11 +79,37 @@ """ root_runners = self.run_lineages_list(self.all_lineages) self.get_best_match_lineage(root_runners) + if self.virus_check(): + self.virus_pipeline = True + self.run_virus_datasets() + self.get_best_match_lineage(type(self).runners) + + if (len(self.selected_runner.analysis.hmmer_runner.single_copy_buscos) == + len(self.selected_runner.analysis.hmmer_runner.multi_copy_buscos) == + len(self.selected_runner.analysis.hmmer_runner.fragmented_buscos) == 0): + raise NoGenesError + + logger.info("{} selected\n".format(os.path.basename(self.best_match_lineage_dataset))) self.config.set("busco_run", "domain_run_name", os.path.basename(self.best_match_lineage_dataset)) BuscoRunner.final_results.append(self.selected_runner.analysis.hmmer_runner.hmmer_results_lines) BuscoRunner.results_datasets.append(os.path.basename(self.best_match_lineage_dataset)) return + def virus_check(self): + return (self.selected_runner.analysis.hmmer_runner.s_percent < 3.0) & \ + (os.stat(self.selected_runner.analysis.input_file).st_size < 500000) + + @log("Running virus detection pipeline", logger) + def run_virus_datasets(self): + lineages_to_check = [] + virus_datasets = self.selected_runner.config.downloader.get("virus_datasets.txt", "information") + with open(virus_datasets, "r") as vir_sets: + for line in vir_sets: + lineages_to_check.append(line.strip().split("_odb")[0]) + self.run_lineages_list(lineages_to_check) + return + + def run_lineages_list(self, 
lineages_list): root_runners = [] for l in lineages_list: @@ -82,11 +117,22 @@ autoconfig = BuscoConfigAuto(self.config, self.current_lineage) busco_run = BuscoRunner(autoconfig) busco_run.run_analysis(callback=self.callback) - root_runners.append(busco_run) # Save all root runs so they can be recalled if chosen + root_runners.append(busco_run) + type(self).runners.append(busco_run) # Save all root runs so they can be recalled if chosen return root_runners + def get_max_ind(self, arr): + """ + Return maximum ind(s) of array. If max value appears twice, two indices are returned. + :param arr: + :return: + """ + inds = np.arange(len(arr)) + max_mask = arr == np.amax(arr) + max_ind = inds[max_mask] + return max_ind - def evaluate(self): + def evaluate(self, runners): """ Evaluate output scores from all BUSCO runs. Lineage with the highest number of complete (single + multiple) copy BUSCOs is assigned as the best_match_lineage. @@ -96,35 +142,35 @@ If still a tie, use the first match. :return """ - total_complete = np.array(self.s_buscos) + np.array(self.d_buscos) - inds = np.arange(len(total_complete)) + self.collate_results(runners) - max_mask = total_complete == np.amax(total_complete) - max_ind = inds[max_mask] + max_ind = self.get_max_ind(np.array(self.s_buscos) + np.array(self.d_buscos)) if len(max_ind) > 1: - self.f_buscos = np.array(self.f_buscos) - tie_break = ma.array(self.f_buscos, mask=~max_mask) - max_mask &= tie_break == ma.max(tie_break) - max_ind = inds[max_mask] + max_ind2 = self.get_max_ind(np.array(self.f_buscos)[max_ind]) + max_ind = max_ind[max_ind2] if len(max_ind) > 1: if ((self.s_buscos[max_ind[0]] == 0.0) and (self.d_buscos[max_ind[0]] == 0.0) and (self.f_buscos[max_ind[0]] == 0.0)): - raise SystemExit("No genes were recognized by BUSCO. 
Please check the content of your input file.") + return int(0) else: - self.s_percents = np.array(self.s_percents) - tie_break = ma.array(self.s_percents, mask=~max_mask) - max_mask &= tie_break == ma.max(tie_break) - max_ind = inds[max_mask] + max_ind3 = self.get_max_ind(np.array(self.s_percents)[max_ind]) + max_ind = max_ind[max_ind3] if len(max_ind) > 1: logger.warning("Two lineage runs scored exactly the same. Proceeding with the first.") # I don't expect this error message will ever be used. max_ind = max_ind[0] - return max_ind + return int(max_ind) + + def collate_results(self, runners): + self.s_buscos = [runner.analysis.hmmer_runner.single_copy for runner in runners] + self.d_buscos = [runner.analysis.hmmer_runner.multi_copy for runner in runners] + self.f_buscos = [runner.analysis.hmmer_runner.only_fragments for runner in runners] + self.s_percents = [runner.analysis.hmmer_runner.s_percent for runner in runners] + return - @log("{} selected\n", logger, attr_name="best_match_lineage_dataset", apply="basename", on_func_exit=True) def get_best_match_lineage(self, runners): - max_ind = self.evaluate() + max_ind = self.evaluate(runners) self.selected_runner = runners[int(max_ind)] self.best_match_lineage_dataset = self.selected_runner.config.get("busco_run", "lineage_dataset") runners.pop(int(max_ind)) @@ -133,7 +179,8 @@ def cleanup_disused_runs(self, disused_runners): for runner in disused_runners: - runner.analysis.cleanup() + if not runner.cleaned_up: + runner.cleanup() def get_lineage_dataset(self): # todo: rethink structure after BuscoPlacer is finalized and protein mode with mollicutes is fixed. 
@@ -143,7 +190,7 @@ """ if self.selected_runner.domain == "eukaryota": self.run_busco_placer() - elif (self.selected_runner.mode in ["proteins", "prot", "transcriptome", "tran"] and + elif (self.selected_runner.mode in ["proteins", "prok_tran"] and os.path.basename(self.selected_runner.config.get("busco_run", "lineage_dataset")).startswith("bacteria")): logger.info( "Certain mollicute clades use a different genetic code to the rest of bacteria. They are not part " @@ -167,25 +214,15 @@ self._run_3_datasets() BuscoRunner.final_results.append(self.selected_runner.analysis.hmmer_runner.hmmer_results_lines) BuscoRunner.results_datasets.append(os.path.basename(self.best_match_lineage_dataset)) + elif self.selected_runner.domain == "viruses": + pass else: self.run_busco_placer() return def check_mollicutes(self): - self.s_buscos = [] - self.d_buscos = [] - self.f_buscos = [] - self.s_percents = [] - self.d_percents = [] - self.f_percents = [] runners = self.run_lineages_list(["mollicutes"]) runners.append(self.selected_runner) - self.s_buscos.append(self.selected_runner.analysis.hmmer_runner.single_copy) - self.d_buscos.append(self.selected_runner.analysis.hmmer_runner.multi_copy) - self.f_buscos.append(self.selected_runner.analysis.hmmer_runner.only_fragments) - self.s_percents.append(self.selected_runner.analysis.hmmer_runner.s_percent) - self.d_percents.append(self.selected_runner.analysis.hmmer_runner.d_percent) - self.f_percents.append(self.selected_runner.analysis.hmmer_runner.f_percent) self.get_best_match_lineage(runners) return @@ -194,9 +231,18 @@ if self.selected_runner.domain == "prokaryota": protein_seqs = self.selected_runner.analysis.prodigal_runner.output_faa elif self.selected_runner.domain == "eukaryota": - protein_seqs_dir = self.selected_runner.analysis.augustus_runner.extracted_prot_dir - protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir) - if f.split(".")[-2] == "faa"] + if self.config.getboolean("busco_run", 
"use_augustus"): + protein_seqs_dir = self.selected_runner.analysis.augustus_runner.extracted_prot_dir + protein_seqs = [os.path.join(protein_seqs_dir, f) for f in os.listdir(protein_seqs_dir) + if f.split(".")[-2] == "faa"] + else: + protein_seqs = self.selected_runner.analysis.metaeuk_runner.combined_pred_protein_seqs + elif "tran" in self.selected_runner.mode: + if self.selected_runner.mode == "euk_tran": + protein_seqs = self.selected_runner.analysis.metaeuk_runner.combined_pred_protein_seqs + elif self.selected_runner.mode == "prok_tran": + protein_seqs = self.selected_runner.analysis.single_copy_proteins_file + else: protein_seqs = self.selected_runner.config.get("busco_run", "in") out_path = self.config.get("busco_run", "main_out") @@ -214,21 +260,9 @@ def _run_3_datasets(self, mollicutes_runner=None): if mollicutes_runner: datasets = ["mycoplasmatales", "entomoplasmatales"] - self.s_buscos = [mollicutes_runner.analysis.hmmer_runner.single_copy] - self.d_buscos = [mollicutes_runner.analysis.hmmer_runner.multi_copy] - self.f_buscos = [mollicutes_runner.analysis.hmmer_runner.only_fragments] - self.s_percents = [mollicutes_runner.analysis.hmmer_runner.s_percent] - self.d_percents = [mollicutes_runner.analysis.hmmer_runner.d_percent] - self.f_percents = [mollicutes_runner.analysis.hmmer_runner.f_percent] dataset_runners = [mollicutes_runner] else: datasets = ["mollicutes", "mycoplasmatales", "entomoplasmatales"] - self.s_buscos = [] - self.d_buscos = [] - self.f_buscos = [] - self.s_percents = [] - self.d_percents = [] - self.f_percents = [] dataset_runners = [] dataset_runners += self.run_lineages_list(datasets) self.get_best_match_lineage(dataset_runners) diff -Nru busco-4.1.4/src/busco/BuscoAnalysis.py busco-5.0.0/src/busco/BuscoAnalysis.py --- busco-4.1.4/src/busco/BuscoAnalysis.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoAnalysis.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,280 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -""" 
-.. module:: BuscoAnalysis - :synopsis: BuscoAnalysis implements general BUSCO analysis specifics -.. versionadded:: 3.0.0 -.. versionchanged:: 4.0.7 - -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) -Licensed under the MIT license. See LICENSE.md file. -""" - -from abc import ABCMeta, abstractmethod -from busco.BuscoConfig import BuscoConfig, BuscoConfigAuto -from busco.BuscoTools import HMMERRunner -import os -from busco.BuscoLogger import BuscoLogger -from busco.BuscoLogger import LogDecorator as log - -logger = BuscoLogger.get_logger(__name__) - - -class BuscoAnalysis(metaclass=ABCMeta): - """ - This abstract base class (ABC) defines methods required for most of BUSCO analyses and has to be extended - by each specific analysis class - """ - config = None - - def __init__(self): - """ - 1) load parameters - 2) load and validate tools - 3) check data and dataset integrity - 4) Ready for analysis - """ - super().__init__() - - # Get paths - self._lineage_results_dir = self.config.get("busco_run", "lineage_results_dir") - self.main_out = self.config.get("busco_run", "main_out") - self._working_dir = (os.path.join(self.main_out, "auto_lineage") - if isinstance(self.config, BuscoConfigAuto) - else self.main_out) - self._run_folder = os.path.join(self._working_dir, self._lineage_results_dir) - self._log_folder = os.path.join(self.main_out, "logs") - - # Get other useful variables - self._input_file = self.config.get("busco_run", "in") - self._lineage_dataset = self.config.get("busco_run", "lineage_dataset") - self._lineage_name = os.path.basename(self._lineage_dataset) - self._domain = self.config.get("busco_run", "domain") - self._has_variants_file = os.path.exists(os.path.join(self._lineage_dataset, "ancestral_variants")) - self._dataset_creation_date = self.config.get("busco_run", "creation_date") - self.restart = self.config.getboolean("busco_run", "restart") - - self.gene_details = None # Dictionary containing coordinate information for predicted genes. 
- - self._lineages_download_path = os.path.join(self.config.get("busco_run", "download_path"), "lineages") - - self.hmmer_runner = None - - # Create optimized command line call for the given input - # self.busco_type = "main" if isinstance(self._config, BuscoConfigMain) else "auto" - # if self.busco_type == "main": - # self.set_rerun_busco_command(self._config.clargs) # todo: rework rerun command - - @abstractmethod - def cleanup(self): - # Delete any non-decompressed files in busco_downloads - try: - for dataset_name in os.listdir(self._lineages_download_path): - if dataset_name.endswith((".gz", ".tar")): - os.remove(dataset_name) - except OSError: - pass - - @abstractmethod - @log("Running BUSCO using lineage dataset {0} ({1}, {2})", logger, - attr_name=["_lineage_name", "_domain", "_dataset_creation_date"], on_func_exit=True) - def run_analysis(self): - """ - Abstract method, override to call all needed steps for running the child analysis. - """ - self._create_dirs() - self.init_tools() - self._check_data_integrity() - - @log("***** Run HMMER on gene sequences *****", logger) - def run_hmmer(self, input_sequences): - """ - This function runs hmmsearch. 
- """ - files = sorted(os.listdir(os.path.join(self._lineage_dataset, "hmms"))) - busco_ids = [os.path.splitext(f)[0] for f in files] # Each Busco ID has a HMM file of the form ".hmm" - self.hmmer_runner.configure_runner(input_sequences, busco_ids, self._mode, self.gene_details) - if self.restart and self.hmmer_runner.check_previous_completed_run(): - logger.info("Skipping HMMER run as output already processed") - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.hmmer_runner.run() - self.hmmer_runner.process_output() - self.hmmer_runner.write_hmmer_results() - self.hmmer_runner.produce_hmmer_summary() - return - - @log("Checking dataset for HMM profiles", logger, debug=True) - def _check_dataset_integrity(self): - """ - Check the input dataset for hmm profiles, both files and folder are available - Note: score and length cutoffs are checked when read by hmmer_runner: see _load_scores and _load_lengths - Note: dataset.cfg file is not mandatory for offline mode - # todo: implement a check for dataset.cfg file if not using offline mode - - :raises SystemExit: if the dataset is missing files or folders - """ - - # Check hmm files exist - files = os.listdir(os.path.join(self._lineage_dataset, "hmms")) - if not files: - raise SystemExit("The dataset you provided lacks hmm profiles in {}".format( - os.path.join(self._lineage_dataset, "hmms"))) - - if self._domain == "eukaryota": - # Check prfl folder exists and contains profiles - for dirpath, dirnames, files in os.walk(os.path.join(self._lineage_dataset, "prfl")): - if not files: - raise SystemExit("The dataset you provided lacks elements in {}".format( - os.path.join(self._lineage_dataset, "prfl"))) - - if not self._has_variants_file: - logger.warning("The dataset you provided does not contain the file ancestral_variants, likely because it " - "is an old version. 
All blast steps will use the file \"ancestral\" instead") - - return - - def _check_data_integrity(self): - self._check_dataset_integrity() - if not os.stat(self._input_file).st_size > 0: - raise SystemExit("Input file is empty.") - with open(self._input_file) as f: - for line in f: - if line.startswith(">"): - self._check_fasta_header(line) - return - - @staticmethod - def _check_fasta_header(header): - """ - This function checks problematic characters in fasta headers, - and warns the user and stops the execution - :param header: a fasta header to check - :type header: str - :raises SystemExit: if a problematic character is found - """ - for char in BuscoConfig.FORBIDDEN_HEADER_CHARS: - if char in header: - raise SystemExit( - "The character \"%s\" is present in the fasta header %s, " - "which will crash BUSCO. Please clean the header of your " - "input file." % (char, header.strip())) - - for char in BuscoConfig.FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT: - if char in header.split()[0]: - raise SystemExit( - "The character \"%s\" is present in the fasta header %s, " - "which will crash Reader. Please clean the header of your" - " input file." % (char, header.split()[0].strip())) - - if header.split()[0] == ">": - raise SystemExit( - "A space is present in the fasta header %s, directly after " - "\">\" which will crash Reader. Please clean the header of " - "your input file." % (header.strip())) - - def _create_dirs(self): - """ - Create the run (main) directory, log directory and the temporary directories - :return: - """ - self._create_main_dir() - self._create_log_dir() - # self._create_tmp_dir() - - def _create_log_dir(self): - """ - Create a subfolder of the main output folder that contains all log files from BUSCO and the external tools used. 
- :return: - """ - if not os.path.exists(self._log_folder): - os.mkdir(self._log_folder) - return - - def _create_main_dir(self): - """ - This function creates the run (main) directory - :raises SystemExit: if write permissions are not available to the specified location - """ - try: - os.makedirs(self._run_folder) - except FileExistsError: - if not self.restart: - raise SystemExit("Something went wrong. BUSCO stopped before overwriting run folder " - "{}".format(self._run_folder)) - except PermissionError: - raise SystemExit( - "Cannot write to the output directory, please make sure " - "you have write permissions to {}".format(self._run_folder)) - return - - @log("Check all required tools are accessible...", logger, debug=True) - def init_tools(self): - """ - Init the tools needed for the analysis. HMMER is needed for all BUSCO analysis types. - """ - self.hmmer_runner = HMMERRunner() - return - - @property - @abstractmethod - def _mode(self): - pass - - # def _run_tarzip_hmmer_output(self): # todo: rewrite using tarfile - # """ - # This function tarzips "hmmer_output" results folder - # """ - # self._p_open(["tar", "-C", "%s" % self.run_folder, "-zcf", "%shmmer_output.tar.gz" % self.run_folder, - # "hmmer_output", "--remove-files"], "bash", shell=False) - # - # @log("To reproduce this run: {}", logger, attr_name="_rerun_cmd", on_func_exit=True) - # def set_rerun_busco_command(self, clargs): # todo: reconfigure - # """ - # This function sets the command line to call to reproduce this run - # """ - # - # # Find python script path - # entry_point = "" - # frame_ind = -1 - # while "run_BUSCO.py" not in entry_point: - # entry_point = inspect.stack()[frame_ind].filename - # frame_ind -= 1 - # - # # Add required parameters and other options - # self._rerun_cmd = "python %s -i %s -o %s -l %s -m %s -c %s" % (entry_point, self._input_file, os.path.basename(self.main_out), - # self._lineage_dataset, self._mode, self._cpus) - # - # try: - # if self._long: - # 
self._rerun_cmd += " --long" - # if self._region_limit != BuscoConfig.DEFAULT_ARGS_VALUES["limit"]: - # self._rerun_cmd += " --limit %s" % self._region_limit - # # if self._tmp != BuscoConfig.DEFAULT_ARGS_VALUES["tmp_path"]: - # # self._rerun_cmd += " -t %s" % self._tmp - # if self._ev_cutoff != BuscoConfig.DEFAULT_ARGS_VALUES["evalue"]: - # self._rerun_cmd += " -e %s" % self._ev_cutoff - # # if self._tarzip: - # # self._rerun_cmd += " -z" - # except AttributeError: - # pass - # - # # Include any command line arguments issued by the user - # # arg_aliases = {"-i": "--in", "-o": "--out", "-l": "--lineage_dataset", "-m": "--mode", "-c": "--cpu", - # # "-e": "--evalue", "-f": "--force", "-sp": "--species", "-z": "--tarzip", - # # "-r": "--restart", "-q": "--quiet", "-v": "--version", "-h": "--help"} - # arg_aliases.update(dict(zip(arg_aliases.values(), arg_aliases.keys()))) - # for a, arg in enumerate(clargs): - # if arg.startswith("-") and not arg in self._rerun_cmd: - # if arg in arg_aliases: - # if arg_aliases[arg] in self._rerun_cmd: - # continue - # if a + 1 < len(clargs) and not clargs[a + 1].startswith("-"): - # self._rerun_cmd += " %s %s" % (arg, clargs[a + 1]) - # else: - # self._rerun_cmd += " %s" % arg - # return - - # TODO: catch unicode encoding exception and report invalid character line instead of doing content validation - # todo: check config file exists before parsing diff -Nru busco-4.1.4/src/busco/BuscoConfig.py busco-5.0.0/src/busco/BuscoConfig.py --- busco-4.1.4/src/busco/BuscoConfig.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoConfig.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,5 +1,5 @@ from configparser import ConfigParser -from configparser import NoOptionError +from configparser import NoOptionError, NoSectionError from configparser import ParsingError from configparser import DuplicateOptionError from busco.BuscoLogger import BuscoLogger @@ -13,17 +13,53 @@ logger = BuscoLogger.get_logger(__name__) + class 
BaseConfig(ConfigParser): - DEFAULT_ARGS_VALUES = {"out_path": os.getcwd(), "cpu": 1, "force": False, "restart": False, "evalue": 1e-3, - "limit": 3, "long": False, "quiet": False, - "download_path": os.path.join(os.getcwd(), "busco_downloads"), "datasets_version": "odb10", - "offline": False, "download_base_url": "https://busco-data.ezlab.org/v4/data/", - "auto-lineage": False, "auto-lineage-prok": False, "auto-lineage-euk": False, - "update-data": False} + DEFAULT_ARGS_VALUES = { + "out_path": os.getcwd(), + "cpu": 1, + "force": False, + "restart": False, + "quiet": False, + "download_path": os.path.join(os.getcwd(), "busco_downloads"), + "datasets_version": "odb10", + "offline": False, + "download_base_url": "https://busco-data.ezlab.org/v5/data/", + "auto-lineage": False, + "auto-lineage-prok": False, + "auto-lineage-euk": False, + "update-data": False, + "evalue": 1e-3, + "limit": 3, + "use_augustus": False, + "long": False, + } + + DEPENDENCY_SECTIONS = { + "tblastn", + "makeblastdb", + "prodigal", + "sepp", + "metaeuk", + "augustus", + "etraining", + "gff2gbSmallDNA.pl", + "new_species.pl", + "optimize_augustus.pl", + "hmmsearch", + } def __init__(self): super().__init__() + config_dict = {"busco_run": type(self).DEFAULT_ARGS_VALUES} + config_dict.update( + { + tool: {"path": "", "command": ""} + for tool in type(self).DEPENDENCY_SECTIONS + } + ) + self.read_dict(config_dict) def _load_config_file(self): """ @@ -36,9 +72,15 @@ except IOError: raise SystemExit("Config file {} cannot be found".format(self.conf_file)) except ParsingError: - raise SystemExit("Unable to parse the contents of config file {}".format(self.conf_file)) + raise SystemExit( + "Unable to parse the contents of config file {}".format(self.conf_file) + ) except DuplicateOptionError: - raise SystemExit("Duplicated entry in config file {}. Unable to load configuration.".format(self.conf_file)) + raise SystemExit( + "Duplicated entry in config file {}. 
Unable to load configuration.".format( + self.conf_file + ) + ) return def _init_downloader(self): @@ -55,13 +97,15 @@ class PseudoConfig(BaseConfig): - def __init__(self, conf_file): super().__init__() self.conf_file = conf_file def load(self): - self._load_config_file() + if self.conf_file != "local environment": + self._load_config_file() + else: + self.set("busco_run", "update-data", "True") self._fill_default_values() self._init_downloader() @@ -71,29 +115,48 @@ try: self.get("busco_run", "download_base_url") except NoOptionError: - self.set("busco_run", "download_base_url", type(self).DEFAULT_ARGS_VALUES["download_base_url"]) + self.set( + "busco_run", + "download_base_url", + type(self).DEFAULT_ARGS_VALUES["download_base_url"], + ) try: self.get("busco_run", "download_path") except NoOptionError: - self.set("busco_run", "download_path", type(self).DEFAULT_ARGS_VALUES["download_path"]) + self.set( + "busco_run", + "download_path", + type(self).DEFAULT_ARGS_VALUES["download_path"], + ) try: self.update = self.getboolean("busco_run", "update-data") if not self.update: - self.existing_downloads = sorted(glob.glob(os.path.join(self.get("busco_run", "download_path"), "information", "lineages_list*.txt")))[::-1] + self.existing_downloads = sorted( + glob.glob( + os.path.join( + self.get("busco_run", "download_path"), + "information", + "lineages_list*.txt", + ) + ) + )[::-1] if self.existing_downloads: - logger.warning("The datasets list shown may not be up-to-date. To get current information, make sure " - "you have set 'update-data=True' in your config file.") + logger.warning( + "The datasets list shown may not be up-to-date. To get current information, make sure " + "you have set 'update-data=True' in your config file." + ) else: - raise SystemExit("Unable to download list of datasets. Please make sure you have set " - "update-data=True in your config file.") + raise SystemExit( + "Unable to download list of datasets. 
Please make sure you have set " + "update-data=True in your config file." + ) except NoOptionError: self.set("busco_run", "update-data", "True") - class BuscoConfig(ConfigParser, metaclass=ABCMeta): """ This class extends busco.PipeConfig to read the config.ini file. Furthermore, it uses extra args that can be @@ -101,11 +164,49 @@ containing all correct parameters to be injected to a busco.BuscoAnalysis instance. """ - FORBIDDEN_HEADER_CHARS = ["ç", "¬", "¢", "´", "ê", "î", "ô", "ŵ", "ẑ", "û", "â", "ŝ", "ĝ", "ĥ", "ĵ", "ŷ", - "ĉ", "é", "ï", "ẅ", "ë", "ẅ", "ë", "ẗ,", "ü", "í", "ö", "ḧ", "é", "ÿ", "ẍ", "è", "é", - "à", "ä", "¨", "€", "£", "á"] + FORBIDDEN_HEADER_CHARS = [ + "ç", + "¬", + "¢", + "´", + "ê", + "î", + "ô", + "ŵ", + "ẑ", + "û", + "â", + "ŝ", + "ĝ", + "ĥ", + "ĵ", + "ŷ", + "ĉ", + "é", + "ï", + "ẅ", + "ë", + "ẅ", + "ë", + "ẗ,", + "ü", + "í", + "ö", + "ḧ", + "é", + "ÿ", + "ẍ", + "è", + "é", + "à", + "ä", + "¨", + "€", + "£", + "á", + ] - FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT = ["/", "\""] + FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT = ["/", '"'] HMMER_VERSION = 3.1 @@ -129,26 +230,41 @@ :return: """ try: - with open(os.path.join(self.get("busco_run", "lineage_dataset"), "dataset.cfg"), "r") as target_species_file: - dataset_kwargs = dict(line.strip().split("=") for line in target_species_file) + with open( + os.path.join(self.get("busco_run", "lineage_dataset"), "dataset.cfg"), + "r", + ) as target_species_file: + dataset_kwargs = dict( + line.strip().split("=") for line in target_species_file + ) for key, value in dataset_kwargs.items(): if key == "species": try: config_species = self.get("busco_run", "augustus_species") if config_species != value: - logger.warning("An augustus species was mentioned in the config file or on the command " - "line, dataset default species ({}) will be ignored".format(value)) + logger.warning( + "An augustus species was mentioned in the config file or on the command " + "line, dataset default species ({}) will be ignored".format( + 
value + ) + ) except NoOptionError: self.set("busco_run", "augustus_species", value) - elif key in ["prodigal_genetic_code", "ambiguous_cd_range_upper", "ambiguous_cd_range_lower"]: + if key in [ + "prodigal_genetic_code", + "ambiguous_cd_range_upper", + "ambiguous_cd_range_lower", + ]: self.set("prodigal", key, value) else: self.set("busco_run", key, value) except IOError: - logger.warning("The dataset you provided does not contain the file dataset.cfg and is not valid for " - "BUSCO v4.0 or higher") + logger.warning( + "The dataset you provided does not contain the file dataset.cfg and is not valid for " + "BUSCO v4.0 or higher" + ) return @abstractmethod @@ -161,12 +277,15 @@ os.makedirs(main_out) def set_results_dirname(self, lineage): - self.set("busco_run", "lineage_results_dir", "run_{}".format(os.path.basename(lineage))) + self.set( + "busco_run", + "lineage_results_dir", + "run_{}".format(os.path.basename(lineage)), + ) return class BuscoConfigAuto(BuscoConfig): - def __init__(self, config, lineage, **kwargs): super().__init__(**kwargs) self._propagate_config(config) @@ -203,45 +322,65 @@ MANDATORY_USER_PROVIDED_PARAMS = ["in", "out", "mode"] - CONFIG_STRUCTURE = {"busco_run": ["in", "out", "out_path", "mode", "auto-lineage", "auto-lineage-prok", - "auto-lineage-euk", "cpu", "force", "restart", "download_path", - "datasets_version", "evalue", "limit", "long", "quiet", "offline", - "download_base_url", "lineage_dataset", "update-data", "augustus_parameters", - "augustus_species", "main_out"], - "tblastn": ["path", "command"], - "makeblastdb": ["path", "command"], - "prodigal": ["path", "command"], - "sepp": ["path", "command"], - "augustus": ["path", "command"], - "etraining": ["path", "command"], - "gff2gbSmallDNA.pl": ["path", "command"], - "new_species.pl": ["path", "command"], - "optimize_augustus.pl": ["path", "command"], - "hmmsearch": ["path", "command"]} + PERMITTED_OPTIONS = [ + "in", + "out", + "out_path", + "mode", + "auto-lineage", + 
"auto-lineage-prok", + "auto-lineage-euk", + "cpu", + "force", + "restart", + "download_path", + "datasets_version", + "quiet", + "offline", + "long", + "augustus_parameters", + "augustus_species", + "download_base_url", + "lineage_dataset", + "update-data", + "metaeuk_parameters", + "metaeuk_rerun_parameters", + "evalue", + "limit", + "use_augustus", + ] - def __init__(self, conf_file, params, clargs, **kwargs): + def __init__(self, conf_file, params, **kwargs): """ :param conf_file: a path to a config.ini file :type conf_file: str - :param args: key and values matching BUSCO parameters to override config.ini values - :type args: dict + :param params: key and values matching BUSCO parameters to override config.ini values + :type params: dict """ super().__init__(**kwargs) self.conf_file = conf_file - self._load_config_file() - self.clargs = clargs - # Update the config with args provided by the user, else keep config - self._update_config_with_args(params) - self._fill_default_values() + self.params = params + self.main_out = None + def configure(self): + if self.conf_file != "local environment": + self._load_config_file() + # Update the config with args provided by the user, else keep config + self._update_config_with_args(self.params) + self._check_value_constraints() + @classmethod + def merge_two_dicts(cls, x, y): + """Given two dictionaries, merge them into a new dict as a shallow copy.""" + z = x.copy() + z.update(y) + return z def validate(self): self._check_mandatory_keys_exist() self._check_no_previous_run() - self._create_required_paths() - self._check_allowed_keys() + self._create_required_paths() self._cleanup_config() self._check_required_input_exists() @@ -265,23 +404,28 @@ if "_odb" in lineage_dataset: dataset_version = lineage_dataset.rsplit("_")[-1].rstrip("/") if datasets_version != dataset_version: - logger.warning("There is a conflict in your config. 
You specified a dataset from {0} while " - "simultaneously requesting the datasets_version parameter be {1}. Proceeding with " - "the lineage dataset as specified from {0}".format(dataset_version, - datasets_version)) + logger.warning( + "There is a conflict in your config. You specified a dataset from {0} while " + "simultaneously requesting the datasets_version parameter be {1}. Proceeding with " + "the lineage dataset as specified from {0}".format( + dataset_version, datasets_version + ) + ) self.set("busco_run", "datasets_version", dataset_version) - else: # Make sure the ODB version is in the dataset name + else: # Make sure the ODB version is in the dataset name lineage_dataset = "_".join([lineage_dataset, datasets_version]) self.set("busco_run", "lineage_dataset", lineage_dataset) datasets_version = self.get("busco_run", "datasets_version") if datasets_version != "odb10": - raise SystemExit("BUSCO v4 only works with datasets from OrthoDB v10 (with the suffix '_odb10'). " - "For a full list of available datasets, enter 'busco --list-datasets'. " - "You can also run BUSCO using auto-lineage, to allow BUSCO to automatically select " - "the best dataset for your input data.") + raise SystemExit( + "BUSCO v4 only works with datasets from OrthoDB v10 (with the suffix '_odb10'). " + "For a full list of available datasets, enter 'busco --list-datasets'. " + "You can also run BUSCO using auto-lineage, to allow BUSCO to automatically select " + "the best dataset for your input data." + ) return True - except NoOptionError: + except NoOptionError: # todo: need a helpful error message if datasets_version is not set but lineage_dataset is. return False def _check_evalue(self): @@ -289,7 +433,10 @@ Warn the user if the config contains a non-standard e-value cutoff. 
:return: """ - if self.getfloat("busco_run", "evalue") != type(self).DEFAULT_ARGS_VALUES["evalue"]: + if ( + self.getfloat("busco_run", "evalue") + != type(self).DEFAULT_ARGS_VALUES["evalue"] + ): # todo: introduce systemexit if not float logger.warning("You are using a custom e-value cutoff") return @@ -298,10 +445,14 @@ Check the value of limit. Ensure it is between 1 and 20, otherwise raise SystemExit. :return: """ - limit_val = self.getint("busco_run", "limit") + limit_val = self.getint( + "busco_run", "limit" + ) # todo: introduce systemexit if not int. if limit_val <= 0 or limit_val > 20: - raise SystemExit("Limit must be an integer between 1 and 20 (you have used: {}). Note that this parameter " - "is not needed by the protein mode.".format(limit_val)) + raise SystemExit( + "Limit must be an integer between 1 and 20 (you have used: {}). Note that this parameter " + "is not needed by the protein mode.".format(limit_val) + ) return @log("Mode is {0}", logger, attr_name="_mode", on_func_exit=True, log_once=True) @@ -314,12 +465,47 @@ try: value = self.get("busco_run", param) if param == "mode": - synonyms = {"genome": ["genome", "geno", "genomes", "Genome", "Genomes", "Geno"], - "transcriptome": ["transcriptome", "tran", "transcriptomes", "trans", "Transcriptome", "Transcriptomes", "Tran", "Trans"], - "proteins": ["proteins", "prot", "protein", "Proteins", "Protein", "Prot", "proteome", "proteomes", "Proteome", "Proteomes"]} - if value not in list(synonyms["genome"] + synonyms["transcriptome"] + synonyms["proteins"]): - raise SystemExit("Unknown mode {}.\n'Mode' parameter must be one of " - "['genome', 'transcriptome', 'proteins']".format(value)) + synonyms = { + "genome": [ + "genome", + "geno", + "genomes", + "Genome", + "Genomes", + "Geno", + ], + "transcriptome": [ + "transcriptome", + "tran", + "transcriptomes", + "trans", + "Transcriptome", + "Transcriptomes", + "Tran", + "Trans", + ], + "proteins": [ + "proteins", + "prot", + "protein", + "Proteins", + 
"Protein", + "Prot", + "proteome", + "proteomes", + "Proteome", + "Proteomes", + ], + } + if value not in list( + synonyms["genome"] + + synonyms["transcriptome"] + + synonyms["proteins"] + ): + raise SystemExit( + "Unknown mode {}.\n'Mode' parameter must be one of " + "['genome', 'transcriptome', 'proteins']".format(value) + ) if value in synonyms["genome"]: self.set("busco_run", "mode", "genome") elif value in synonyms["transcriptome"]: @@ -331,19 +517,36 @@ if param == "out": if value == "": - raise SystemExit("Please specify an output name for the BUSCO run. " - "This can be done using the -o flag or in the config file") + raise SystemExit( + "Please specify an output name for the BUSCO run. " + "This can be done using the -o flag or in the config file" + ) except NoOptionError: - raise SystemExit("The parameter \"{} (--{})\" was not provided. " - "Please add it in the config file or provide it " - "through the command line".format(param, param)) + raise SystemExit( + 'The parameter "{} (--{})" was not provided. 
' + "Please add it in the config file or provide it " + "through the command line".format(param, param) + ) return def _check_allowed_keys(self): - for section_name, options in type(self).CONFIG_STRUCTURE.items(): - for option in self.options(section_name): - if option not in options: - raise SystemExit("Unrecognized option '{}' in config file".format(option)) + full_dict = {"busco_run": type(self).PERMITTED_OPTIONS} + full_dict.update( + { + dependency: ["path", "command"] + for dependency in type(self).DEPENDENCY_SECTIONS + } + ) + + for section_name, options in full_dict.items(): + try: + for option in self.options(section_name): + if option not in options: + raise SystemExit( + "Unrecognized option '{}' in config file".format(option) + ) + except NoSectionError: + logger.warning("Section {} not found".format(section_name)) return def _check_out_value(self): @@ -352,11 +555,19 @@ :return: """ if "/" in self.get("busco_run", "out"): - raise SystemExit("Please do not provide a full path in --out parameter, no slash. " - "Use out_path in the config.ini file to specify the full path.") + raise SystemExit( + "Please do not provide a full path in --out parameter, no slash. " + "Use out_path in the config.ini file to specify the full path." + ) return - @log("Input file is {}", logger, attr_name="_input_filepath", on_func_exit=True, log_once=True) + @log( + "Input file is {}", + logger, + attr_name="_input_filepath", + on_func_exit=True, + log_once=True, + ) def _check_required_input_exists(self): """ Test for existence of input file. 
@@ -364,22 +575,36 @@ """ self._input_filepath = self.get("busco_run", "in") if not os.path.exists(self._input_filepath): - raise SystemExit("Input file {} does not exist".format(self._input_filepath)) + raise SystemExit( + "Input file {} does not exist".format(self._input_filepath) + ) return def _check_no_previous_run(self): - self.main_out = os.path.join(self.get("busco_run", "out_path"), self.get("busco_run", "out")) + self.main_out = os.path.join( + self.get("busco_run", "out_path"), self.get("busco_run", "out") + ) if os.path.exists(self.main_out): if self.getboolean("busco_run", "force"): self._force_remove_existing_output_dir(self.main_out) elif self.getboolean("busco_run", "restart"): - logger.info("Attempting to restart the run using the following directory: {}".format(self.main_out)) + logger.info( + "Attempting to restart the run using the following directory: {}".format( + self.main_out + ) + ) else: - raise SystemExit("A run with the name {} already exists...\n" - "\tIf you are sure you wish to overwrite existing files, " - "please use the -f (force) option".format(self.main_out)) + raise SystemExit( + "A run with the name {} already exists...\n" + "\tIf you are sure you wish to overwrite existing files, " + "please use the -f (force) option".format(self.main_out) + ) elif self.getboolean("busco_run", "restart"): - logger.warning("Restart mode not available as directory {} does not exist.".format(self.main_out)) + logger.warning( + "Restart mode not available as directory {} does not exist.".format( + self.main_out + ) + ) self.set("busco_run", "restart", "False") return @@ -395,7 +620,9 @@ self._expand_all_paths() @staticmethod - @log("'Force' option selected; overwriting previous results directory", logger) # todo: review log messages + @log( + "'Force' option selected; overwriting previous results directory", logger + ) # todo: review log messages def _force_remove_existing_output_dir(dirpath): """ Remove main output folder from a previous BUSCO 
run. @@ -423,27 +650,34 @@ if item[0].endswith("_path") or item[0] == "path" or item[0] == "in": if item[1].startswith("~"): self.set(key, item[0], os.path.expanduser(item[1])) - elif item[1].startswith("."): + elif item[1].startswith(".") or (item[1] and "/" not in item[1]): self.set(key, item[0], os.path.abspath(item[1])) + return - def _fill_default_values(self): + def _check_value_constraints(self): """ Load default values into config if not provided in config file or on the command line. :return: """ - for param in list(type(self).DEFAULT_ARGS_VALUES.keys()): - try: - self.get("busco_run", param) - except NoOptionError: - self.set("busco_run", param, str(type(self).DEFAULT_ARGS_VALUES[param])) + # for param in list(type(self).DEFAULT_ARGS_VALUES.keys()): + # try: + # self.get("busco_run", param) + # except NoOptionError: + # self.set("busco_run", param, str(type(self).DEFAULT_ARGS_VALUES[param])) # Set auto-lineage to True if either auto-lineage-prok or auto-lineage-euk is selected - if self.getboolean("busco_run", "auto-lineage-prok") or self.getboolean("busco_run", "auto-lineage-euk"): + if self.getboolean("busco_run", "auto-lineage-prok") or self.getboolean( + "busco_run", "auto-lineage-euk" + ): self.set("busco_run", "auto-lineage", "True") - if self.getboolean("busco_run", "auto-lineage-prok") and self.getboolean("busco_run", "auto-lineage-euk"): - logger.warning("You have specified both --auto-lineage-prok and --auto-lineage-euk. This has the same behaviour as --auto-lineage.") + if self.getboolean("busco_run", "auto-lineage-prok") and self.getboolean( + "busco_run", "auto-lineage-euk" + ): + logger.warning( + "You have specified both --auto-lineage-prok and --auto-lineage-euk. This has the same behaviour as --auto-lineage." + ) self.set("busco_run", "auto-lineage-prok", "False") self.set("busco_run", "auto-lineage-euk", "False") @@ -452,13 +686,13 @@ def _update_config_with_args(self, args): """ Include command line arguments in config. 
Overwrite any values given in the config file. - :param args: Dictionary of parsed command line arguments. To see full list, inspect run_BUSCO.py or + :param args: Dictionary of parsed command line arguments. To see full list, inspect run_BUSCO_unittests.py or type busco -h :type args: dict :return: """ for key, val in args.items(): - if key in type(self).CONFIG_STRUCTURE["busco_run"]: + if key in type(self).PERMITTED_OPTIONS: if val is not None and type(val) is not bool: self.set("busco_run", key, str(val)) elif val: # if True @@ -467,8 +701,9 @@ # Code taken from https://dave.dkjones.org/posts/2013/pretty-print-log-python/ -class PrettyLog(): +class PrettyLog: def __init__(self, obj): self.obj = obj + def __repr__(self): - return pprint.pformat(self.obj) \ No newline at end of file + return pprint.pformat(self.obj) diff -Nru busco-4.1.4/src/busco/BuscoDownloadManager.py busco-5.0.0/src/busco/BuscoDownloadManager.py --- busco-4.1.4/src/busco/BuscoDownloadManager.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoDownloadManager.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ .. module:: BuscoDownloadManager @@ -6,7 +6,7 @@ .. versionadded:: 4.0.0 .. versionchanged:: 4.0.0 -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. """ @@ -33,6 +33,7 @@ When the config parameter `auto_update_files` is set, new versions replace old versions.s Else, a warning is produced. 
""" + version_files = {} def __init__(self, config): @@ -62,12 +63,16 @@ dataset_name = line[0] dataset_date = line[1] dataset_hash = line[2] - type(self).version_files.update({dataset_name: (dataset_date, dataset_hash)}) + type(self).version_files.update( + {dataset_name: (dataset_date, dataset_hash)} + ) except URLError as e: if self.offline: - logger.warning("Unable to verify BUSCO datasets because of offline mode") + logger.warning( + "Unable to verify BUSCO datasets because of offline mode" + ) else: - SystemExit(e) + raise SystemExit(e) return @log("Downloading information on latest versions of BUSCO data...", logger) @@ -98,20 +103,26 @@ dataset_date = line[1] break if not dataset_date: - raise SystemExit("Creation date could not be extracted from dataset.cfg file.") + raise SystemExit( + "Creation date could not be extracted from dataset.cfg file." + ) return dataset_date def _check_existing_version(self, local_filepath, category, data_basename): try: latest_update = type(self).version_files[data_basename][0] except KeyError: - raise SystemExit("{} is not a valid option for '{}'".format(data_basename, category)) + raise SystemExit( + "{} is not a valid option for '{}'".format(data_basename, category) + ) path_basename, extension = os.path.splitext(data_basename) if category == "lineages": latest_version = ".".join([path_basename, latest_update]) try: - dataset_date = self._extract_creation_date(os.path.join(local_filepath, "dataset.cfg")) + dataset_date = self._extract_creation_date( + os.path.join(local_filepath, "dataset.cfg") + ) up_to_date = dataset_date == latest_update present = True except FileNotFoundError: @@ -119,11 +130,20 @@ present = False else: - latest_version = ".".join([path_basename, latest_update, extension.lstrip(".")]) + latest_version = ".".join( + [path_basename, latest_update, extension.lstrip(".")] + ) local_filepath = local_filepath.replace(data_basename, latest_version) up_to_date = os.path.exists(local_filepath) path_to_check, 
extension_to_check = os.path.splitext(local_filepath) - present = len(glob.glob("{}.*.{}".format(path_to_check[0:-11], extension_to_check[1:]))) > 0 + present = ( + len( + glob.glob( + "{}.*.{}".format(path_to_check[0:-11], extension_to_check[1:]) + ) + ) + > 0 + ) hash = type(self).version_files[data_basename][1] @@ -136,44 +156,74 @@ elif "/" in data_name: raise SystemExit("{} does not exist".format(data_name)) if self.offline: - if category == 'lineages': - local_dataset = os.path.join(self.local_download_path, category, data_name) + if category == "lineages": + local_dataset = os.path.join( + self.local_download_path, category, data_name + ) if os.path.exists(local_dataset): return local_dataset else: - raise SystemExit("Unable to run BUSCO in offline mode. Dataset {} does not " - "exist.".format(local_dataset)) + raise SystemExit( + "Unable to run BUSCO in offline mode. Dataset {} does not " + "exist.".format(local_dataset) + ) else: basename, extension = os.path.splitext(data_name) - placement_files = sorted(glob.glob(os.path.join( - self.local_download_path, category, "{}.*{}".format(basename, extension)))) + placement_files = sorted( + glob.glob( + os.path.join( + self.local_download_path, + category, + "{}.*{}".format(basename, extension), + ) + ) + ) if len(placement_files) > 0: return placement_files[-1] # todo: for offline mode, log which files are being used (in case of more than one glob match) else: - raise SystemExit("Unable to run BUSCO placer in offline mode. Cannot find necessary placement " - "files in {}".format(self.local_download_path)) + raise SystemExit( + "Unable to run BUSCO placer in offline mode. 
Cannot find necessary placement " + "files in {}".format(self.local_download_path) + ) data_basename = os.path.basename(data_name) local_filepath = os.path.join(self.local_download_path, category, data_basename) - present, up_to_date, latest_version, local_filepath, hash = self._check_existing_version( - local_filepath, category, data_basename) + ( + present, + up_to_date, + latest_version, + local_filepath, + hash, + ) = self._check_existing_version(local_filepath, category, data_basename) if (not up_to_date and self.update_data) or not present: # download self._create_category_dir(category) compression_extension = ".tar.gz" - remote_filepath = os.path.join(self.download_base_url, category, latest_version+compression_extension) - if present and category == 'lineages': + remote_filepath = os.path.join( + self.download_base_url, category, latest_version + compression_extension + ) + if present and category == "lineages": self._rename_old_version(local_filepath) - download_success = self._download_file(remote_filepath, local_filepath+compression_extension, hash) + download_success = self._download_file( + remote_filepath, local_filepath + compression_extension, hash + ) if download_success: - local_filepath = self._decompress_file(local_filepath+compression_extension) + local_filepath = self._decompress_file( + local_filepath + compression_extension + ) if present: - logger.warning("The file or folder {} was updated automatically.".format(data_basename)) + logger.warning( + "The file or folder {} was updated automatically.".format( + data_basename + ) + ) elif not up_to_date: - logger.warning("The file or folder {} is not the last available version. " - "To update all data files to the last version, add the parameter " - "--update-data in your next run.".format(local_filepath)) + logger.warning( + "The file or folder {} is not the last available version. 
" + "To update all data files to the last version, add the parameter " + "--update-data in your next run.".format(local_filepath) + ) return local_filepath @@ -182,12 +232,20 @@ if os.path.exists(local_filepath): try: os.rename(local_filepath, "{}.old".format(local_filepath)) - logger.info("Renaming {} into {}.old".format(local_filepath, local_filepath)) + logger.info( + "Renaming {} into {}.old".format(local_filepath, local_filepath) + ) except OSError: try: timestamp = time.time() - os.rename(local_filepath, "{}.old.{}".format(local_filepath, timestamp)) - logger.info("Renaming {} into {}.old.{}".format(local_filepath, local_filepath, timestamp)) + os.rename( + local_filepath, "{}.old.{}".format(local_filepath, timestamp) + ) + logger.info( + "Renaming {} into {}.old.{}".format( + local_filepath, local_filepath, timestamp + ) + ) except OSError: pass return @@ -198,13 +256,18 @@ urllib.request.urlretrieve(remote_filepath, local_filepath) observed_hash = type(self)._md5(local_filepath) if observed_hash != expected_hash: - logger.error("md5 hash is incorrect: {} while {} expected".format(str(observed_hash), - str(expected_hash))) + logger.error( + "md5 hash is incorrect: {} while {} expected".format( + str(observed_hash), str(expected_hash) + ) + ) logger.info("deleting corrupted file {}".format(local_filepath)) os.remove(local_filepath) - raise SystemExit("BUSCO was unable to download or update all necessary files") + raise SystemExit( + "BUSCO was unable to download or update all necessary files" + ) else: - logger.debug('md5 hash is {}'.format(observed_hash)) + logger.debug("md5 hash is {}".format(observed_hash)) except URLError: logger.error("Cannot reach {}".format(remote_filepath)) return False diff -Nru busco-4.1.4/src/busco/BuscoLogger.py busco-5.0.0/src/busco/BuscoLogger.py --- busco-4.1.4/src/busco/BuscoLogger.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoLogger.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env 
python +#!/usr/bin/env python3 # coding: utf-8 """ .. module:: BuscoLogger @@ -8,7 +8,7 @@ This is a logger for the pipeline that extends the default Python logger class -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. """ @@ -31,8 +31,18 @@ _log_once_keywords = {} - def __init__(self, msg, logger, - on_func_exit=False, func_arg=None, attr_name=None, iswarn=False, debug=False, apply=None, log_once=False): + def __init__( + self, + msg, + logger, + on_func_exit=False, + func_arg=None, + attr_name=None, + iswarn=False, + debug=False, + apply=None, + log_once=False, + ): self.msg = msg self.logger = logger self.on_func_exit = on_func_exit @@ -47,7 +57,7 @@ def __call__(self, func): def wrapped_func(*args, **kwargs): try: - if '{' in self.msg and self.on_func_exit: + if "{" in self.msg and self.on_func_exit: self.retval = func(*args, **kwargs) self.format_string(*args) else: @@ -56,6 +66,7 @@ return self.retval except SystemExit: raise + return wrapped_func def format_string(self, *args): @@ -67,8 +78,8 @@ if self.attr_name == "retvalue": string_arg = self.retval - if self.apply == 'join' and isinstance(string_arg, tuple): - string_arg = ' '.join(list(string_arg)) + if self.apply == "join" and isinstance(string_arg, tuple): + string_arg = " ".join(list(string_arg)) elif self.apply == "basename" and isinstance(string_arg, str): string_arg = os.path.basename(string_arg) log_msg = self.msg.format(string_arg) @@ -83,9 +94,11 @@ try: string_arg = getattr(obj_inst, self.attr_name) - if self.apply == 'join' and isinstance(string_arg, list): - string_arg = [str(arg) for arg in string_arg] # Ensure all parameters are joinable strings - string_arg = ' '.join(string_arg) + if self.apply == "join" and isinstance(string_arg, list): + string_arg = [ + str(arg) for arg in string_arg + ] # Ensure all parameters are joinable strings + string_arg = " ".join(string_arg) 
elif self.apply == "basename" and isinstance(string_arg, str): string_arg = os.path.basename(string_arg) @@ -98,7 +111,9 @@ self.logger.error("No such attribute {}".format(self.attr_name)) except IndexError: - self.logger.error("Index out of range for attribute {}".format(self.attr_name)) + self.logger.error( + "Index out of range for attribute {}".format(self.attr_name) + ) elif self.func_arg is not None: try: @@ -106,7 +121,9 @@ log_msg = self.msg.format(string_arg) except IndexError: - self.logger.error("Index out of range for function argument {}".format(self.func_arg)) + self.logger.error( + "Index out of range for function argument {}".format(self.func_arg) + ) else: log_msg = self.msg @@ -127,11 +144,12 @@ def __init__(self, filename): super().__init__(filename) self.setLevel(type(self)._level) - self._external_formatter = logging.Formatter('%(message)s') - self._file_hdlr = logging.FileHandler(filename, mode='a', encoding="UTF-8") + self._external_formatter = logging.Formatter("%(message)s") + self._file_hdlr = logging.FileHandler(filename, mode="a", encoding="UTF-8") self._file_hdlr.setFormatter(self._external_formatter) self.addHandler(self._file_hdlr) + # The following code was created by combining code based on a SO answer here: # https://stackoverflow.com/questions/4713932/decorate-delegate-a-file-object-to-add-functionality/4838875#4838875 # with code from the Python docs here: @@ -157,7 +175,7 @@ def _flusher_augustus_out(self): self._run = True - buf = b'' + buf = b"" timeout = 10 read_only = select.POLLIN | select.POLLPRI | select.POLLHUP | select.POLLERR # Switched from select.select() to select.poll() using examples at https://pymotw.com/2/select/ @@ -175,14 +193,14 @@ self.gene_found = True if b"command line" in buf: self.output_complete = True - data, buf = buf.split(b'\n', 1) + data, buf = buf.split(b"\n", 1) self.write(data.decode()) self._run = None def _flusher(self): self._run = True - buf = b'' + buf = b"" timeout = 10 read_only = 
select.POLLIN | select.POLLPRI | select.POLLHUP | select.POLLERR # Switched from select.select() to select.poll() using examples at https://pymotw.com/2/select/ @@ -196,7 +214,7 @@ if flag & (select.POLLIN | select.POLLPRI): buf += os.read(fd, 4096) while b"\n" in buf: - data, buf = buf.split(b'\n', 1) + data, buf = buf.split(b"\n", 1) self.write(data.decode()) self._run = None @@ -233,9 +251,11 @@ """ super(BuscoLogger, self).__init__(name) self.setLevel(BuscoLogger._level) - self._normal_formatter = logging.Formatter('%(levelname)s:\t%(message)s') - self._verbose_formatter = logging.Formatter('%(levelname)s:%(name)s\t%(message)s') - self._external_formatter = logging.Formatter('%(message)s') + self._normal_formatter = logging.Formatter("%(levelname)s:\t%(message)s") + self._verbose_formatter = logging.Formatter( + "%(levelname)s:%(name)s\t%(message)s" + ) + self._external_formatter = logging.Formatter("%(message)s") self._out_hdlr = logging.StreamHandler(sys.stdout) self._out_hdlr.addFilter(LessThanFilter(logging.ERROR)) @@ -250,10 +270,17 @@ try: # Random id used in filename to avoid complications for parallel BUSCO runs. 
- self._file_hdlr = logging.FileHandler("busco_{}.log".format(type(self).random_id), mode="a") + self._file_hdlr = logging.FileHandler( + "busco_{}.log".format(type(self).random_id), mode="a" + ) except IOError as e: - errStr = "No permission to write in the current directory: {}".format(os.getcwd()) if e.errno == 13 \ + errStr = ( + "No permission to write in the current directory: {}".format( + os.getcwd() + ) + if e.errno == 13 else "IO error({0}): {1}".format(e.errno, e.strerror) + ) raise SystemExit(errStr) self._file_hdlr.setLevel(logging.DEBUG) @@ -316,11 +343,12 @@ # Code from https://stackoverflow.com/a/31459386/4844311 + class LessThanFilter(logging.Filter): def __init__(self, exclusive_maximum, name=""): super(LessThanFilter, self).__init__(name) self.max_level = exclusive_maximum def filter(self, record): - #non-zero return means we log this message - return 1 if record.levelno < self.max_level else 0 \ No newline at end of file + # non-zero return means we log this message + return 1 if record.levelno < self.max_level else 0 diff -Nru busco-4.1.4/src/busco/BuscoPlacer.py busco-5.0.0/src/busco/BuscoPlacer.py --- busco-4.1.4/src/busco/BuscoPlacer.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoPlacer.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ @@ -8,19 +8,16 @@ .. versionadded:: 4.0.0 .. versionchanged:: 4.0.0 -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. 
""" -import glob import json import os -from busco.Toolset import Tool from busco.BuscoLogger import BuscoLogger from busco.BuscoLogger import LogDecorator as log -from busco.BuscoDownloadManager import BuscoDownloadManager from Bio import SeqIO -from busco.BuscoTools import SEPPRunner +from busco.busco_tools.sepp import SEPPRunner logger = BuscoLogger.get_logger(__name__) @@ -29,7 +26,10 @@ _logger = BuscoLogger.get_logger(__name__) - @log("***** Searching tree for chosen lineage to find best taxonomic match *****\n", logger) + @log( + "***** Searching tree for chosen lineage to find best taxonomic match *****\n", + logger, + ) def __init__(self, config, run_folder, protein_seqs, single_copy_buscos): self._config = config self._params = config @@ -46,43 +46,61 @@ self.datasets_version = self._config.get("busco_run", "datasets_version") self.protein_seqs = protein_seqs self.single_copy_buscos = single_copy_buscos # dict - self._init_tools() + self.init_tools() def _download_placement_files(self): - self.ref_markers_file = self.downloader.get("list_of_reference_markers.{0}_{1}.txt".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") - self.tree_nwk_file = self.downloader.get("tree.{0}_{1}.nwk".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") - self.tree_metadata_file = self.downloader.get("tree_metadata.{0}_{1}.txt".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") - self.supermatrix_file = self.downloader.get("supermatrix.aln.{0}_{1}.faa".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") - self.taxid_busco_file = self.downloader.get("mapping_taxids-busco_dataset_name.{0}_{1}.txt".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") - self.taxid_lineage_file = 
self.downloader.get("mapping_taxid-lineage.{0}_{1}.txt".format( - os.path.basename(self.run_folder).split("_")[-2], - self.datasets_version), - "placement_files") + self.ref_markers_file = self.downloader.get( + "list_of_reference_markers.{0}_{1}.txt".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) + self.tree_nwk_file = self.downloader.get( + "tree.{0}_{1}.nwk".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) + self.tree_metadata_file = self.downloader.get( + "tree_metadata.{0}_{1}.txt".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) + self.supermatrix_file = self.downloader.get( + "supermatrix.aln.{0}_{1}.faa".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) + self.taxid_busco_file = self.downloader.get( + "mapping_taxids-busco_dataset_name.{0}_{1}.txt".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) + self.taxid_lineage_file = self.downloader.get( + "mapping_taxid-lineage.{0}_{1}.txt".format( + os.path.basename(self.run_folder).split("_")[-2], self.datasets_version + ), + "placement_files", + ) return def _get_placement_file_versions(self): placement_file_versions = [ os.path.basename(filepath) - for filepath in [self.ref_markers_file, self.tree_nwk_file, self.tree_metadata_file, - self.supermatrix_file, self.taxid_busco_file, self.taxid_lineage_file]] + for filepath in [ + self.ref_markers_file, + self.tree_nwk_file, + self.tree_metadata_file, + self.supermatrix_file, + self.taxid_busco_file, + self.taxid_lineage_file, + ] + ] return placement_file_versions - @log("Extract markers...", logger) def define_dataset(self): # If mode is genome, substitute input with prodigal/augustus output @@ -95,15 +113,9 @@ return dataset, placement_file_versions - 
def _init_tools(self): + def init_tools(self): setattr(SEPPRunner, "config", self._config) self.sepp_runner = SEPPRunner() - # try: - # assert isinstance(self._sepp, Tool) - # except AttributeError: - # self._sepp = Tool("sepp", self._config) - # except AssertionError: - # raise SystemExit("SEPP should be a tool") def _pick_dataset(self): @@ -115,7 +127,12 @@ with open(self.taxid_busco_file) as f: for line in f: datasets_mapping.update( - {line.strip().split("\t")[0]: line.strip().split("\t")[1].split(",")[0]} + { + line.strip() + .split("\t")[0]: line.strip() + .split("\t")[1] + .split(",")[0] + } ) # load the lineage for each taxid in a dict {taxid:reversed_lineage} @@ -139,7 +156,9 @@ parents.update({t: line.strip().split("\t")[4].split(",")[0:i]}) for t in parents: - for p in parents[t][::-1]: # reverse the order to get the deepest parent, not the root one + for p in parents[t][ + ::-1 + ]: # reverse the order to get the deepest parent, not the root one if p in datasets_mapping: taxid_dataset.update({t: p}) break @@ -150,12 +169,16 @@ # figure out which taxid to use by using the highest number of markers and some extra rules try: - with open(os.path.join(self.placement_folder, "output_placement.json")) as json_file: + with open( + os.path.join(self.placement_folder, "output_placement.json") + ) as json_file: data = json.load(json_file) tree = data["tree"] placements = data["placements"] except FileNotFoundError: - raise SystemExit("Placements failed. Try to rerun increasing the memory or select a lineage manually.") + raise SystemExit( + "Placements failed. Try to rerun increasing the memory or select a lineage manually." 
+ ) node_weight = {} n_p = 0 @@ -192,7 +215,9 @@ choice = [] for key in node_weight: - type(self)._logger.debug('%s markers assigned to the taxid %s' % (node_weight[key],key)) + type(self)._logger.debug( + "%s markers assigned to the taxid %s" % (node_weight[key], key) + ) # taxid for which no threshold or minimal amount of placement should be considered. # If it is the best, go for it. @@ -218,8 +243,8 @@ if len(choice) > 1: # more than one taxid should be considered, pick the common ancestor choice = self._get_common_ancestor(choice, parents) - #print('last common') - #print(choice) + # print('last common') + # print(choice) elif len(choice) == 0: if run_folder.split("/")[-1].split("_")[-2] == "bacteria": choice.append("2") @@ -237,7 +262,8 @@ else: key_taxid = None # unexpected. Should throw an exception or use assert. type(self)._logger.info( - "Not enough markers were placed on the tree (%s). Root lineage %s is kept" % (max_markers, datasets_mapping[taxid_dataset[key_taxid]]) + "Not enough markers were placed on the tree (%s). 
Root lineage %s is kept" + % (max_markers, datasets_mapping[taxid_dataset[key_taxid]]) ) return [ datasets_mapping[taxid_dataset[key_taxid]], @@ -245,7 +271,14 @@ sum(node_weight.values()), ] - type(self)._logger.info('Lineage %s is selected, supported by %s markers out of %s' % (datasets_mapping[taxid_dataset[choice[0]]],max_markers,sum(node_weight.values()))) + type(self)._logger.info( + "Lineage %s is selected, supported by %s markers out of %s" + % ( + datasets_mapping[taxid_dataset[choice[0]]], + max_markers, + sum(node_weight.values()), + ) + ) return [ datasets_mapping[taxid_dataset[choice[0]]], @@ -260,12 +293,10 @@ # order will be lost with sets, so keep in a list the lineage of one entry to later pick the deepest ancestor ordered_lineage = [] for c in choice: - #print('c is %s' % c) - if len(parents[c]) > len( - ordered_lineage - ): - # print('len parent c is %s' % len(parents[c])) - # print('len ordered lineage is %s' % ordered_lineage) + # print('c is %s' % c) + if len(parents[c]) > len(ordered_lineage): + # print('len parent c is %s' % len(parents[c])) + # print('len ordered lineage is %s' % ordered_lineage) # probably useless. 
Init with parents[choice[0] should work ordered_lineage = parents[c] # print('ordered_lineage us %s' % ordered_lineage) @@ -279,16 +310,19 @@ @log("Place the markers on the reference tree...", logger) def _run_sepp(self): - # self.sepp_runner = SEPPRunner(self._sepp, self.run_folder, self.placement_folder, self.tree_nwk_file, - # self.tree_metadata_file, self.supermatrix_file, self.downloader, - # self.datasets_version, self.cpus) - self.sepp_runner.configure_runner(self.tree_nwk_file, self.tree_metadata_file, self.supermatrix_file, self.downloader) + self.sepp_runner.configure_runner( + self.tree_nwk_file, + self.tree_metadata_file, + self.supermatrix_file, + self.downloader, + ) if self.restart and self.sepp_runner.check_previous_completed_run(): logger.info("Skipping SEPP run as it has already been completed") else: self.restart = False self._config.set("busco_run", "restart", str(self.restart)) self.sepp_runner.run() + self.sepp_runner.cleanup() def _extract_marker_sequences(self): """ @@ -299,16 +333,15 @@ :type: str """ - - with open(self.ref_markers_file, "r") as f: marker_list = [line.strip() for line in f] - marker_genes_names = [] for busco, gene_matches in self.single_copy_buscos.items(): if busco in marker_list: - marker_genes_names.append(list(gene_matches.keys())[0]) # The list should only have one entry because they are single copy buscos + marker_genes_names.append( + list(gene_matches.keys())[0] + ) # The list should only have one entry because they are single copy buscos marker_genes_records = [] if isinstance(self.protein_seqs, (str,)): @@ -327,4 +360,3 @@ marker_genes_file = os.path.join(self.placement_folder, "marker_genes.fasta") with open(marker_genes_file, "w") as output: SeqIO.write(marker_genes_records, output, "fasta") - diff -Nru busco-4.1.4/src/busco/BuscoRunner.py busco-5.0.0/src/busco/BuscoRunner.py --- busco-4.1.4/src/busco/BuscoRunner.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoRunner.py 2021-01-26 
11:28:47.000000000 +0000 @@ -1,11 +1,17 @@ -from busco.BuscoAnalysis import BuscoAnalysis -from busco.GenomeAnalysis import GenomeAnalysisEukaryotes -from busco.TranscriptomeAnalysis import TranscriptomeAnalysis -from busco.GeneSetAnalysis import GeneSetAnalysis -from busco.GenomeAnalysis import GenomeAnalysisProkaryotes +from busco.analysis.BuscoAnalysis import BuscoAnalysis +from busco.analysis.GenomeAnalysis import ( + GenomeAnalysisEukaryotesAugustus, + GenomeAnalysisEukaryotesMetaeuk, +) +from busco.analysis.TranscriptomeAnalysis import ( + TranscriptomeAnalysisProkaryotes, + TranscriptomeAnalysisEukaryotes, +) +from busco.analysis.GeneSetAnalysis import GeneSetAnalysis +from busco.analysis.GenomeAnalysis import GenomeAnalysisProkaryotes from busco.BuscoLogger import BuscoLogger from busco.BuscoConfig import BuscoConfigMain -from busco.BuscoTools import NoGenesError, BaseRunner +from busco.busco_tools.base import NoGenesError, BaseRunner from configparser import NoOptionError import os import shutil @@ -15,9 +21,14 @@ class BuscoRunner: - mode_dict = {"euk_genome": GenomeAnalysisEukaryotes, "prok_genome": GenomeAnalysisProkaryotes, - "transcriptome": TranscriptomeAnalysis, "tran": TranscriptomeAnalysis, - "proteins": GeneSetAnalysis, "prot": GeneSetAnalysis} + mode_dict = { + "euk_genome_met": GenomeAnalysisEukaryotesMetaeuk, + "euk_genome_aug": GenomeAnalysisEukaryotesAugustus, + "prok_genome": GenomeAnalysisProkaryotes, + "euk_tran": TranscriptomeAnalysisEukaryotes, + "prok_tran": TranscriptomeAnalysisProkaryotes, + "proteins": GeneSetAnalysis, + } final_results = [] results_datasets = [] @@ -32,13 +43,29 @@ self.domain = self.config.get("busco_run", "domain") if self.mode == "genome": - if self.domain == "prokaryota": + if self.domain in ["prokaryota", "viruses"]: self.mode = "prok_genome" elif self.domain == "eukaryota": - self.mode = "euk_genome" + if self.config.getboolean("busco_run", "use_augustus"): + self.mode = "euk_genome_aug" + else: + self.mode = 
"euk_genome_met" + else: + raise SystemExit("Unrecognized mode {}".format(self.mode)) + + elif self.mode == "transcriptome": + if self.domain == "prokaryota": + self.mode = "prok_tran" + elif self.domain == "eukaryota": + self.mode = "euk_tran" + else: + raise SystemExit("Unrecognized mode {}".format(self.mode)) analysis_type = type(self).mode_dict[self.mode] self.analysis = analysis_type() - self.prok_fail_count = 0 # Needed to check if both bacteria and archaea return no genes. + self.prok_fail_count = ( + 0 # Needed to check if both bacteria and archaea return no genes. + ) + self.cleaned_up = False def run_analysis(self, callback=(lambda *args: None)): try: @@ -49,16 +76,28 @@ s_percent = self.analysis.hmmer_runner.s_percent d_percent = self.analysis.hmmer_runner.d_percent f_percent = self.analysis.hmmer_runner.f_percent - self.analysis.cleanup() + self.cleanup() except NoGenesError as nge: - no_genes_msg = "{0} did not recognize any genes matching the dataset {1} in the input file. " \ - "If this is unexpected, check your input file and your " \ - "installation of {0}\n".format(nge.gene_predictor, self.analysis._lineage_name) - fatal = (isinstance(self.config, BuscoConfigMain) - or (self.config.getboolean("busco_run", "auto-lineage-euk") and self.mode == "euk_genome") - or (self.config.getboolean("busco_run", "auto-lineage-prok") and self.mode == "prok_genome") - and self.prok_fail_count == 1) + no_genes_msg = ( + "{0} did not recognize any genes matching the dataset {1} in the input file. 
" + "If this is unexpected, check your input file and your " + "installation of {0}\n".format( + nge.gene_predictor, self.analysis._lineage_name + ) + ) + fatal = ( + isinstance(self.config, BuscoConfigMain) + or ( + self.config.getboolean("busco_run", "auto-lineage-euk") + and self.mode == "euk_genome" + ) + or ( + self.config.getboolean("busco_run", "auto-lineage-prok") + and self.mode == "prok_genome" + ) + and self.prok_fail_count == 1 + ) if fatal: raise SystemExit(no_genes_msg) else: @@ -68,24 +107,35 @@ self.prok_fail_count += 1 except SystemExit as se: - self.analysis.cleanup() + self.cleanup() + raise se return callback(s_buscos, d_buscos, f_buscos, s_percent, d_percent, f_percent) + def cleanup(self): + self.analysis.cleanup() + self.cleaned_up = True + def format_results(self): framed_output = [] if len(type(self).results_datasets) == 1: header1 = "Results from dataset {}\n".format(type(self).results_datasets[0]) else: - header1 = "Results from generic domain {}\n".format(type(self).results_datasets[0]) - final_output_results1 = "".join(self._check_parasitic(type(self).final_results[0][1:])) + header1 = "Results from generic domain {}\n".format( + type(self).results_datasets[0] + ) + final_output_results1 = "".join( + self._check_parasitic(type(self).final_results[0][1:]) + ) sb1 = SmartBox() framed_lines1 = sb1.create_results_box(header1, final_output_results1) framed_output.append(framed_lines1) if len(type(self).final_results) == 2: header2 = "Results from dataset {}\n".format(type(self).results_datasets[1]) - final_output_results2 = "".join(self._check_parasitic(type(self).final_results[1][1:])) + final_output_results2 = "".join( + self._check_parasitic(type(self).final_results[1][1:]) + ) sb2 = SmartBox() framed_lines2 = sb2.create_results_box(header2, final_output_results2) framed_output.append(framed_lines2) @@ -94,20 +144,39 @@ def _check_parasitic(self, final_output_results): try: - with open(os.path.join(self.analysis._lineage_dataset, 
"missing_in_parasitic.txt")) as parasitic_file: - missing_in_parasitic_buscos = [entry.strip() for entry in parasitic_file.readlines()] - if len(self.analysis.hmmer_runner.missing_buscos) >= 0.8*len(missing_in_parasitic_buscos) \ - and len(missing_in_parasitic_buscos) > 0: - intersection = [mb for mb in self.analysis.hmmer_runner.missing_buscos - if mb in missing_in_parasitic_buscos] + with open( + os.path.join(self.analysis._lineage_dataset, "missing_in_parasitic.txt") + ) as parasitic_file: + missing_in_parasitic_buscos = [ + entry.strip() for entry in parasitic_file.readlines() + ] + if ( + len(self.analysis.hmmer_runner.missing_buscos) + >= 0.8 * len(missing_in_parasitic_buscos) + and len(missing_in_parasitic_buscos) > 0 + ): + intersection = [ + mb + for mb in self.analysis.hmmer_runner.missing_buscos + if mb in missing_in_parasitic_buscos + ] percent_missing_in_parasites = round( - 100*len(intersection)/len(self.analysis.hmmer_runner.missing_buscos), 1) + 100 + * len(intersection) + / len(self.analysis.hmmer_runner.missing_buscos), + 1, + ) if percent_missing_in_parasites >= 80.0: - corrected_summary = self._recalculate_parasitic_scores(len(missing_in_parasitic_buscos)) - positive_parasitic_line = "\n!!! The missing BUSCOs match the pattern of a parasitic-reduced " \ - "genome. {}% of your missing BUSCOs are typically missing in these. " \ - "A corrected score would be: \n{}\n".format(percent_missing_in_parasites, - corrected_summary) + corrected_summary = self._recalculate_parasitic_scores( + len(missing_in_parasitic_buscos) + ) + positive_parasitic_line = ( + "\n!!! The missing BUSCOs match the pattern of a parasitic-reduced " + "genome. {}% of your missing BUSCOs are typically missing in these. 
" + "A corrected score would be: \n{}\n".format( + percent_missing_in_parasites, corrected_summary + ) + ) final_output_results.append(positive_parasitic_line) if not self.config.getboolean("busco_run", "auto-lineage"): auto_lineage_line = "\nConsider using the auto-lineage mode to select a more specific lineage." @@ -119,17 +188,24 @@ return final_output_results def _recalculate_parasitic_scores(self, num_missing_in_parasitic): - total_buscos = self.analysis.hmmer_runner.total_buscos - num_missing_in_parasitic + total_buscos = ( + self.analysis.hmmer_runner.total_buscos - num_missing_in_parasitic + ) single_copy = self.analysis.hmmer_runner.single_copy multi_copy = self.analysis.hmmer_runner.multi_copy fragmented_copy = self.analysis.hmmer_runner.only_fragments - s_percent = abs(round(100*single_copy/total_buscos, 1)) - d_percent = abs(round(100*multi_copy/total_buscos, 1)) - f_percent = abs(round(100*fragmented_copy/total_buscos, 1)) + s_percent = abs(round(100 * single_copy / total_buscos, 1)) + d_percent = abs(round(100 * multi_copy / total_buscos, 1)) + f_percent = abs(round(100 * fragmented_copy / total_buscos, 1)) one_line_summary = "C:{}%[S:{}%,D:{}%],F:{}%,M:{}%,n:{}\t\n".format( - round(s_percent + d_percent, 1), s_percent, d_percent, f_percent, - round(100-s_percent-d_percent-f_percent, 1), total_buscos) + round(s_percent + d_percent, 1), + s_percent, + d_percent, + f_percent, + round(100 - s_percent - d_percent - f_percent, 1), + total_buscos, + ) return one_line_summary def organize_final_output(self): @@ -137,14 +213,24 @@ try: domain_results_folder = self.config.get("busco_run", "domain_run_name") - root_domain_output_folder = os.path.join(main_out_folder, "auto_lineage", - "run_{}".format(domain_results_folder)) - root_domain_output_folder_final = os.path.join(main_out_folder, "run_{}".format(domain_results_folder)) + root_domain_output_folder = os.path.join( + main_out_folder, "auto_lineage", "run_{}".format(domain_results_folder) + ) + 
root_domain_output_folder_final = os.path.join( + main_out_folder, "run_{}".format(domain_results_folder) + ) os.rename(root_domain_output_folder, root_domain_output_folder_final) os.symlink(root_domain_output_folder_final, root_domain_output_folder) - shutil.copyfile(os.path.join(root_domain_output_folder_final, "short_summary.txt"), - os.path.join(main_out_folder, "short_summary.generic.{}.{}.txt".format( - domain_results_folder.replace("run_", ""), os.path.basename(main_out_folder)))) + shutil.copyfile( + os.path.join(root_domain_output_folder_final, "short_summary.txt"), + os.path.join( + main_out_folder, + "short_summary.generic.{}.{}.txt".format( + domain_results_folder.replace("run_", ""), + os.path.basename(main_out_folder), + ), + ), + ) except NoOptionError: pass @@ -155,9 +241,16 @@ finally: lineage_results_folder = self.config.get("busco_run", "lineage_results_dir") lineage_results_path = os.path.join(main_out_folder, lineage_results_folder) - shutil.copyfile(os.path.join(lineage_results_path, "short_summary.txt"), - os.path.join(main_out_folder, "short_summary.specific.{}.{}.txt".format( - lineage_results_folder.replace("run_", ""), os.path.basename(main_out_folder)))) + shutil.copyfile( + os.path.join(lineage_results_path, "short_summary.txt"), + os.path.join( + main_out_folder, + "short_summary.specific.{}.{}.txt".format( + lineage_results_folder.replace("run_", ""), + os.path.basename(main_out_folder), + ), + ), + ) return @staticmethod @@ -168,9 +261,16 @@ log_folder = os.path.join(config.get("busco_run", "main_out"), "logs") if not os.path.exists(log_folder): os.makedirs(log_folder) - os.rename("busco_{}.log".format(BuscoLogger.random_id), os.path.join(log_folder, "busco.log")) + shutil.move( + "busco_{}.log".format(BuscoLogger.random_id), + os.path.join(log_folder, "busco.log"), + ) except OSError: - logger.warning("Unable to move 'busco_{}.log' to the 'logs' folder.".format(BuscoLogger.random_id)) + logger.warning( + "Unable to move 
'busco_{}.log' to the 'logs' folder.".format( + BuscoLogger.random_id + ) + ) return def finish(self, elapsed_time): @@ -181,20 +281,29 @@ self.organize_final_output() if not logger.has_warning(): - logger.info("BUSCO analysis done. Total running time: {} seconds".format(str(round(elapsed_time)))) + logger.info( + "BUSCO analysis done. Total running time: {} seconds".format( + str(round(elapsed_time)) + ) + ) else: - logger.info("BUSCO analysis done with WARNING(s). Total running time: {} seconds\n\n" - "***** Summary of warnings: *****".format(str(round(elapsed_time)))) + logger.info( + "BUSCO analysis done with WARNING(s). Total running time: {} seconds\n\n" + "***** Summary of warnings: *****".format(str(round(elapsed_time))) + ) for item in type(logger).warn_output.getvalue().split("\n"): print(item) - logger.info("Results written in {}\n".format(self.analysis.main_out)) + logger.info("Results written in {}".format(self.analysis.main_out)) + logger.info( + "For assistance with interpreting the results, please consult the userguide: " + "https://busco.ezlab.org/busco_userguide.html\n" + ) self.move_log_file(self.config) class SmartBox: - def __init__(self): self.width = None @@ -245,14 +354,16 @@ lines = lines.strip().split("\n") formatted_lines = [] for line in lines: - line = "|{}".format(line.strip()) # left bar needs to be added before expanding tabs - whitespace = " "*(self.width - len(line.expandtabs())) + line = "|{}".format( + line.strip() + ) # left bar needs to be added before expanding tabs + whitespace = " " * (self.width - len(line.expandtabs())) format_line = "{}{}|".format(line, whitespace) formatted_lines.append(format_line) return formatted_lines def add_horizontal(self): - return "-"*self.width + return "-" * self.width def create_results_box(self, header_text, body_text): header = self.wrap_header(header_text) # Called first to define width diff -Nru busco-4.1.4/src/busco/busco_tools/augustus.py busco-5.0.0/src/busco/busco_tools/augustus.py 
--- busco-4.1.4/src/busco/busco_tools/augustus.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/augustus.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,812 @@ +from busco.busco_tools.base import BaseRunner, NoGenesError +import os +import re +from collections import defaultdict +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +import shutil +import numpy as np +from configparser import NoOptionError +import subprocess + +logger = BuscoLogger.get_logger(__name__) + + +class AugustusParsingError(Exception): + def __init__(self): + pass + + +class AugustusRunner(BaseRunner): + + ACCEPTED_PARAMETERS = [ + "strand", + "genemodel", + "singlestrand", + "hintsfile", + "extrinsicCfgFile", + "maxDNAPieceSize", + "protein", + "introns", + "start", + "stop", + "cds", + "AUGUSTUS_CONFIG_PATH", + "alternatives-from-evidence", + "alternatives-from-sampling", + "sample", + "minexonintronprob", + "minmeanexonintronprob", + "maxtracks", + "gff3", + "UTR", + "outfile", + "noInFrameStop", + "noprediction", + "contentmodels", + "translation_table", + "temperature", + "proteinprofile", + "progress", + "predictionStart", + "predictionEnd", + "uniqueGeneId", + ] + + name = "augustus" + cmd = "augustus" + + def __init__(self): + self.gene_details = None + self._augustus_config_path = os.environ.get("AUGUSTUS_CONFIG_PATH") + self.config.set("busco_run", "augustus_config_path", self._augustus_config_path) + self._target_species = self.config.get("busco_run", "augustus_species") + super().__init__() + self._output_folder = os.path.join(self.run_folder, "augustus_output") + self.tmp_dir = os.path.join(self._output_folder, "tmp") + self.pred_genes_dir_initial = os.path.join( + self._output_folder, "predicted_genes_initial_run" + ) + self.pred_genes_dir_rerun = os.path.join( + self._output_folder, "predicted_genes_rerun" + ) + 
self.extracted_prot_dir = os.path.join( + self._output_folder, "extracted_proteins" + ) + self.err_logfile = os.path.join(self.log_folder, "augustus_err.log") + + try: + self.extra_params = self.config.get( + "busco_run", "augustus_parameters" + ).replace(",", " ") + except NoOptionError: + self.extra_params = "" + self.chunksize = 10 + + self.gff_dir = os.path.join(self._output_folder, "gff") + self.err_logfiles = [] + self.any_gene_found = False + self.param_keys = [] + self.param_values = [] + + self.create_dirs([self._output_folder, self.extracted_prot_dir, self.gff_dir]) + + self.init_checkpoint_file() + + def configure_runner(self, seqs_path, coords, sequences_aa, sequences_nt): + self.run_number += 1 + + # Placed here to allow reconfiguration for rerun + self._target_species = self.config.get("busco_run", "augustus_species") + + self.check_tool_dependencies() + self.gene_details = defaultdict(list) + self.output_sequences = [] + + self.seqs_path = seqs_path + self.coords = coords + + self.sequences_aa = sequences_aa + self.sequences_nt = sequences_nt + + self.pred_genes_dir = ( + self.pred_genes_dir_rerun + if self.run_number == 2 + else self.pred_genes_dir_initial + ) + + # self.tmp_dir placed here to allow it to be recreated during reconfiguration for rerun + self.create_dirs([self.pred_genes_dir, self.tmp_dir]) + + @property + def output_folder(self): + return self._output_folder + + def check_tool_dependencies(self): + """ + check dependencies on files and folders + properly configured. 
+ :raises SystemExit: if Augustus config path is not writable or + not set at all + :raises SystemExit: if Augustus config path does not contain + the needed species + present + """ + try: + augustus_species_dir = os.path.join(self._augustus_config_path, "species") + if not os.access(augustus_species_dir, os.W_OK): + raise SystemExit( + "Cannot write to Augustus species folder, please make sure you have write " + "permissions to {}".format(augustus_species_dir) + ) + + except TypeError: + raise SystemExit("The environment variable AUGUSTUS_CONFIG_PATH is not set") + + if not os.path.exists(os.path.join(augustus_species_dir, self._target_species)): + # Exclude the case where this is a restarted run and the retraining parameters have already been moved. + if ( + self.config.getboolean("busco_run", "restart") + and self.run_number == 2 + and os.path.exists( + os.path.join( + self._output_folder, + "retraining_parameters", + self._target_species, + ) + ) + ): + pass + else: + raise SystemExit( + 'Impossible to locate the species "{0}" in Augustus species folder' + " ({1}), check that AUGUSTUS_CONFIG_PATH is properly set" + " and contains this species. 
\n\t\tSee the help if you want " + "to provide an alternative species".format( + self._target_species, augustus_species_dir + ) + ) + + @log( + "Running Augustus prediction using {} as species:", + logger, + attr_name="_target_species", + ) + def run(self): + super().run() + if self.extra_params: + logger.info( + "Additional parameters for Augustus are {}: ".format(self.extra_params) + ) + self.param_keys, self.param_values = self.parse_parameters() + + self.total = self._count_jobs() + self.run_jobs() + + def process_output(self): + logger.info("Extracting predicted proteins...") + files = [ + f + for f in sorted(os.listdir(self.pred_genes_dir)) + if any(busco_id in f for busco_id in self.coords) + ] + for filename in files: + self._extract_genes_from_augustus_output(filename) + + if not self.any_gene_found and self.run_number == 1: + raise NoGenesError("Augustus") + + self.gene_details = dict(self.gene_details) + + self._merge_stderr_logs() + self._remove_individual_err_logs() + + return + + def _count_jobs(self): + n = 0 + for busco_group, contigs in self.coords.items(): + for _ in contigs: + n += 1 + return n + + def sort_jobs(self): + jobs_size_info = [] + for busco_group, contigs in self.coords.items(): + + for contig_name, contig_info in contigs.items(): + contig_start = contig_info["contig_start"] + contig_end = contig_info["contig_end"] + pred_size = int(contig_end) - int(contig_start) + jobs_size_info.append( + { + "busco_group": busco_group, + "contig_name": contig_name, + "contig_start": contig_start, + "contig_end": contig_end, + "pred_size": pred_size, + } + ) + job_sizes = [item["pred_size"] for item in jobs_size_info] + new_job_order = np.argsort(job_sizes)[::-1] + ordered_jobs = [jobs_size_info[i] for i in new_job_order] + return ordered_jobs + + def generate_job_args(self): + contig_ordinal_inds = defaultdict(int) + njobs = 0 + + ordered_jobs = self.sort_jobs() + + for job_info in ordered_jobs: + contig_name = job_info["contig_name"] + busco_group 
= job_info["busco_group"] + contig_start = job_info["contig_start"] + contig_end = job_info["contig_end"] + contig_tmp_file = "{}.temp".format( + contig_name[:100] + ) # Avoid very long filenames + contig_ordinal_inds[busco_group] += 1 + output_index = contig_ordinal_inds[busco_group] + out_filename = os.path.join( + self.pred_genes_dir, "{}.out.{}".format(busco_group, output_index) + ) + njobs += 1 + + yield busco_group, contig_tmp_file, contig_start, contig_end, out_filename + + @log( + "Additional parameters for Augustus are {}: ", + logger, + attr_name="_target_species", + ) + def parse_parameters(self): + accepted_keys = [] + accepted_values = [] + if self.extra_params: + self.extra_params = self.extra_params.strip("\" '") + try: + if self.extra_params.startswith("--"): + key_val_pairs = self.extra_params.split(" --") + for kv in key_val_pairs: + key_vals = kv.strip("- ").split("=") + if len(key_vals) == 2: + key, val = key_vals + if key in type(self).ACCEPTED_PARAMETERS: + accepted_keys.append(key.strip()) + accepted_values.append(val.strip()) + else: + logger.warning( + "{} is not an accepted parameter for Augustus.".format( + key + ) + ) + else: + raise AugustusParsingError + else: + raise AugustusParsingError + except AugustusParsingError: + logger.warning( + "Augustus parameters are not correctly formatted. Please enter them as follows: " + '"--param1=value1 --param2=value2" etc. Proceeding without additional parameters.' 
+ ) + return [], [] + return accepted_keys, accepted_values + + def _merge_stderr_logs(self): + with open(self.err_logfile, "a") as f: + for err_logfile in self.err_logfiles: + with open(err_logfile, "r") as g: + content = g.readlines() + f.writelines(content) + return + + def _remove_individual_err_logs(self): + shutil.rmtree(self.tmp_dir) + return + + def get_version(self): # todo: need to handle all possible exceptions + augustus_help_output = subprocess.check_output( + [self.cmd, "--version"], stderr=subprocess.STDOUT, shell=False + ) + augustus_help_output = augustus_help_output.decode("utf-8") + s = augustus_help_output.split("\n")[0] + augustus_version = s[s.find("(") + 1 : s.find(")")] + return augustus_version + + def configure_job( + self, busco_group, contig_tmp_file, contig_start, contig_end, out_filename + ): + # Augustus does not provide an option to write to an output file, so have to change the pipe target from the + # log file to the desired output file + self.logfile_path_out = out_filename + err_logfile = os.path.join( + self.tmp_dir, os.path.basename(out_filename.rpartition(".out")[0] + ".err") + ) + self.logfile_path_err = err_logfile + self.err_logfiles.append(err_logfile) + + augustus_job = self.create_job() + augustus_job.add_parameter("--codingseq=1") + augustus_job.add_parameter( + "--proteinprofile={}".format( + os.path.join( + self.lineage_dataset, "prfl", "{}.prfl".format(busco_group) + ) + ) + ) + augustus_job.add_parameter("--predictionStart={}".format(contig_start)) + augustus_job.add_parameter("--predictionEnd={}".format(contig_end)) + augustus_job.add_parameter("--species={}".format(self._target_species)) + for k, key in enumerate(self.param_keys): + augustus_job.add_parameter("--{}={}".format(key, self.param_values[k])) + augustus_job.add_parameter(os.path.join(self.seqs_path, contig_tmp_file)) + return augustus_job + + def _extract_genes_from_augustus_output(self, filename): + # todo: consider parallelizing this and other parsing 
functions + + gene_id = None + gene_info = [] + sequences_aa = [] + sequences_nt = [] + gene_found = False + completed_record = False + + with open( + os.path.join(self.pred_genes_dir, filename), "r", encoding="utf-8" + ) as f: + # utf-8 encoding needed to handle the umlaut in the third line of the file. + gene_info_section = False + nt_sequence_section = False + aa_sequence_section = False + nt_sequence_parts = [] + aa_sequence_parts = [] + + for line in f: + + if aa_sequence_section: + if "]" in line: + line = line.strip().lstrip("# ").rstrip("]") + aa_sequence_parts.append(line) + aa_sequence_section = False + completed_record = True + if gene_id is not None: + aa_sequence = "".join(aa_sequence_parts) + nt_sequence = "".join(nt_sequence_parts) + seq_record_aa = SeqRecord( + Seq(aa_sequence.upper()), id=gene_id + ) + seq_record_nt = SeqRecord( + Seq(nt_sequence.upper()), id=gene_id + ) + sequences_aa.append(seq_record_aa) + sequences_nt.append(seq_record_nt) + aa_sequence_parts = [] + nt_sequence_parts = [] + gene_id = None + continue + + else: + line = line.strip().lstrip("# ").rstrip("]") + aa_sequence_parts.append(line) + continue + + if line.startswith("# protein"): + nt_sequence_section = False + aa_sequence_section = True + if "]" in line: + line = line.strip().rstrip("]").split("[") + aa_sequence_parts.append(line[1]) + aa_sequence_section = False + completed_record = True + if gene_id is not None: + aa_sequence = "".join(aa_sequence_parts) + nt_sequence = "".join(nt_sequence_parts) + seq_record_aa = SeqRecord( + Seq(aa_sequence.upper()), id=gene_id + ) + seq_record_nt = SeqRecord( + Seq(nt_sequence.upper()), id=gene_id + ) + sequences_aa.append(seq_record_aa) + sequences_nt.append(seq_record_nt) + aa_sequence_parts = [] + nt_sequence_parts = [] + gene_id = None + else: + line = line.strip().rstrip("]").split("[") + aa_sequence_parts.append(line[1]) + continue + + if nt_sequence_section: + line = line.strip().lstrip("# ").rstrip("]") + 
nt_sequence_parts.append(line) + continue + + if line.startswith("# coding sequence"): + gene_info = [] + gene_info_section = False + nt_sequence_section = True + line = ( + line.strip().rstrip("]").split("[") + ) # Extract sequence part of line + nt_sequence_parts.append(line[1]) + continue + + if gene_info_section: + line = line.strip().split() + seq_name = line[0].strip() + gene_start = line[3].strip() + gene_end = line[4].strip() + strand = line[6].strip() + if not gene_id: + gene_id = "{}:{}-{}".format(seq_name, gene_start, gene_end) + self.gene_details[gene_id].append( + { + "gene_start": gene_start, + "gene_end": gene_end, + "strand": strand, + } + ) + gene_info.append("\t".join(line)) + continue + + if line.startswith("# start gene"): + gene_found = True + self.any_gene_found = True + gene_info_section = True + completed_record = False + continue + + if gene_found and not completed_record: + logger.warning("Augustus output file {} truncated".format(filename)) + + self.sequences_aa.update({record.id: record for record in sequences_aa}) + self.sequences_nt.update({record.id: record for record in sequences_nt}) + if gene_found: + self._write_sequences_to_file(filename, sequences_nt, sequences_aa) + + return + + def make_gff_files(self, single_copy_buscos): + + for b in single_copy_buscos: + gene_info = [] + busco_files = [ + f for f in os.listdir(self.pred_genes_dir) if f.startswith(b) + ] + pred_genes_dir_current = self.pred_genes_dir + if ( + len(busco_files) == 0 + and pred_genes_dir_current == self.pred_genes_dir_rerun + ): + pred_genes_dir_current = self.pred_genes_dir_initial + busco_files = [ + f for f in os.listdir(pred_genes_dir_current) if f.startswith(b) + ] + gff_filename = os.path.join(self.gff_dir, "{}.gff".format(b)) + single_copy_busco_gene = list(single_copy_buscos[b].keys())[0] + gene_id_parts = single_copy_busco_gene.split(":") + if ( + len(gene_id_parts) > 2 + ): # if a ":" is present in the gene id, we don't want to break it up + 
gene_id_parts = [":".join(gene_id_parts[:-1]), gene_id_parts[-1]] + single_copy_busco_gene_id = gene_id_parts[0] + ( + single_copy_busco_gene_start_coord, + single_copy_busco_gene_end_coord, + ) = gene_id_parts[1].split("-") + gene_found = False + for filename in busco_files: + match_number = filename.split(".")[-1] + with open( + os.path.join(pred_genes_dir_current, filename), + "r", + encoding="utf-8", + ) as f: + gene_info_section = False + for line in f: + if gene_info_section and line.startswith("# coding sequence"): + with open(gff_filename, "w") as g: + g.write("\n".join(gene_info) + "\n") + gene_info = [] + break + + if line.startswith("# start gene"): + gene_info_section = True + continue + + if gene_info_section: + line = line.strip().split() + seq_name = line[0] + gene_start = line[3] + gene_end = line[4] + if gene_found or ( + seq_name == single_copy_busco_gene_id + and gene_start == single_copy_busco_gene_start_coord + and gene_end == single_copy_busco_gene_end_coord + ): + gene_found = True + gene_id_info = line[-1] + line[-1] = self.edit_gene_identifier( + gene_id_info, match_number + ) + if len(line) == 12: + gene_id_info_2 = line[-3] + line[-3] = self.edit_gene_identifier( + gene_id_info_2, match_number + ) + gene_info.append("\t".join(line)) + else: + gene_info_section = False + continue + if gene_found: + break + if not gene_found and self.run_number == 1: + raise SystemExit( + "Unable to find single copy BUSCO gene in Augustus output." 
+ ) + + return + + def edit_gene_identifier(self, orig_str, match_num): + modified_str = re.sub( + r"g([0-9])", r"r{}.m{}.g\1".format(self.run_number, match_num), orig_str + ) + return modified_str + + def _write_sequences_to_file(self, filename, sequences_nt, sequences_aa): + + filename_parts = filename.rpartition(".out") + output_fna = os.path.join( + self.extracted_prot_dir, filename_parts[0] + ".fna" + filename_parts[-1] + ) + output_faa = os.path.join( + self.extracted_prot_dir, filename_parts[0] + ".faa" + filename_parts[-1] + ) + self.output_sequences.append(output_faa) + + with open(output_fna, "w") as out_fna: + SeqIO.write(sequences_nt, out_fna, "fasta") + with open(output_faa, "w") as out_faa: + SeqIO.write(sequences_aa, out_faa, "fasta") + + return + + def move_retraining_parameters(self): + """ + This function moves retraining parameters from augustus species folder + to the run folder + """ + augustus_species_path = os.path.join( + self._augustus_config_path, "species", self._target_species + ) + if os.path.exists(augustus_species_path): + new_path = os.path.join( + self._output_folder, "retraining_parameters", self._target_species + ) + shutil.move(augustus_species_path, new_path) + elif self.config.getboolean("busco_run", "restart") and os.path.exists( + os.path.join( + self._output_folder, "retraining_parameters", self._target_species + ) + ): + pass + else: + logger.warning("Augustus did not produce a retrained species folder.") + return + + +class GFF2GBRunner(BaseRunner): + + name = "gff2gbSmallDNA.pl" + cmd = "gff2gbSmallDNA.pl" + + def __init__(self): + super().__init__() + self._output_folder = os.path.join(self.run_folder, "augustus_output") + self.gff_folder = os.path.join(self._output_folder, "gff") + self.gb_folder = os.path.join(self._output_folder, "gb") + self.create_dirs([self.gff_folder, self.gb_folder]) + + self.init_checkpoint_file() + + def configure_runner(self, single_copy_buscos): + self.run_number += 1 + 
self.single_copy_buscos = single_copy_buscos + + def run(self): + super().run() + self.total = self._count_jobs() + self.run_jobs() + + def _count_jobs(self): + n = len(self.single_copy_buscos) + return n + + def generate_job_args(self): + for busco_id in self.single_copy_buscos: + yield busco_id + + def configure_job(self, busco_id): + gff2_gb_small_dna_pl_job = self.create_job() + gff2_gb_small_dna_pl_job.add_parameter( + os.path.join(self.gff_folder, "{}.gff".format(busco_id)) + ) + gff2_gb_small_dna_pl_job.add_parameter(self.input_file) + gff2_gb_small_dna_pl_job.add_parameter("1000") + gff2_gb_small_dna_pl_job.add_parameter( + os.path.join(self.gb_folder, "{}.raw.gb".format(busco_id)) + ) + return gff2_gb_small_dna_pl_job + + def check_tool_dependencies(self): + pass + + def get_version(self): + return + + @property + def output_folder(self): + return self._output_folder + + +class NewSpeciesRunner(BaseRunner): + + name = "new_species.pl" + cmd = "new_species.pl" + + def __init__(self): + super().__init__() + self._output_folder = os.path.join(self.run_folder, "augustus_output") + self.new_species_name = "BUSCO_{}".format(os.path.basename(self.main_out)) + self.init_checkpoint_file() + self.run_number += 1 + + def run(self): + super().run() + self.total = 1 + self.run_jobs() + + def configure_job(self, *args): + + new_species_pl_job = self.create_job() + # bacteria clade needs to be flagged as "prokaryotic" + if self.domain == "prokaryota": + new_species_pl_job.add_parameter("--prokaryotic") + new_species_pl_job.add_parameter( + "--species={}".format(os.path.basename(self.new_species_name)) + ) + return new_species_pl_job + + def check_tool_dependencies(self): + pass + + def generate_job_args(self): + yield + + def get_version(self): + return + + @property + def output_folder(self): + return self._output_folder + + +class ETrainingRunner(BaseRunner): + + name = "etraining" + cmd = "etraining" + + def __init__(self): + super().__init__() + self._output_folder = 
os.path.join(self.run_folder, "augustus_output") + self._gb_folder = os.path.join(self._output_folder, "gb") + self.augustus_config_path = self.config.get("busco_run", "augustus_config_path") + self._training_file = os.path.join(self._output_folder, "training_set.db") + + self.init_checkpoint_file() + + def configure_runner(self, new_species_name): + self.run_number += 1 + self.new_species_name = new_species_name + self._merge_gb_files() + + def run(self): + super().run() + self.total = 1 + self.run_jobs() + self._validate_run() + + def check_tool_dependencies(self): + pass + + def generate_job_args(self): + yield + + def _merge_gb_files(self): + """Concatenate all GB files into one large file""" + with open(self._training_file, "w") as outfile: + for fname in os.listdir(self._gb_folder): + with open(os.path.join(self._gb_folder, fname), "r") as infile: + outfile.writelines(infile.readlines()) + return + + def _validate_run(self): + species_filepath = os.path.join( + self.augustus_config_path, "species", self.new_species_name + ) + if os.path.exists(species_filepath) and any( + "exon_probs" in f for f in os.listdir(species_filepath) + ): + return + else: + raise SystemExit( + "Retraining did not complete correctly. Check your Augustus config path environment variable." 
+ ) + + def configure_job(self, *args): + etraining_job = self.create_job() + etraining_job.add_parameter("--species={}".format(self.new_species_name)) + etraining_job.add_parameter( + os.path.join(self.run_folder, "augustus_output", "training_set.db") + ) + return etraining_job + + def get_version(self): + return + + @property + def output_folder(self): + return self._output_folder + + +class OptimizeAugustusRunner(BaseRunner): + + name = "optimize_augustus.pl" + cmd = "optimize_augustus.pl" + + def __init__(self): + super().__init__() + self._output_folder = None + self.training_set_db = None + self.new_species_name = None + + def configure_runner(self, output_folder, new_species_name): + self.run_number += 1 + self._output_folder = output_folder + self.training_set_db = os.path.join(self._output_folder, "training_set.db") + self.new_species_name = new_species_name + + self.init_checkpoint_file() + + def configure_job(self, *args): + optimize_augustus_pl_job = self.create_job() + optimize_augustus_pl_job.add_parameter("--cpus={}".format(self.cpus)) + optimize_augustus_pl_job.add_parameter( + "--species={}".format(self.new_species_name) + ) + optimize_augustus_pl_job.add_parameter(self.training_set_db) + return optimize_augustus_pl_job + + def run(self): + super().run() + self.total = 1 + self.run_jobs() + + def generate_job_args(self): + yield + + def check_tool_dependencies(self): + pass + + def get_version(self): + return + + @property + def output_folder(self): + return self._output_folder diff -Nru busco-4.1.4/src/busco/busco_tools/base.py busco-5.0.0/src/busco/busco_tools/base.py --- busco-4.1.4/src/busco/busco_tools/base.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/base.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,233 @@ +import os +from busco.BuscoLogger import BuscoLogger +from busco.busco_tools.Toolset import Tool +from shutil import which +from abc import ABCMeta, abstractmethod +from busco.BuscoConfig import 
BuscoConfigAuto +import time + +logger = BuscoLogger.get_logger(__name__) + + +class ToolException(Exception): + """ + Module-specific exception + """ + + def __init__(self, value): + self.value = value + + def __str__(self): + return self.value + + +class BaseRunner(Tool, metaclass=ABCMeta): + + config = None + + def __init__(self): + super().__init__() + self.run_number = 0 + self.input_file = self.config.get("busco_run", "in") + self.main_out = self.config.get("busco_run", "main_out") + self.working_dir = ( + os.path.join(self.main_out, "auto_lineage") + if isinstance(self.config, BuscoConfigAuto) + else self.main_out + ) + self.lineage_results_dir = self.config.get("busco_run", "lineage_results_dir") + self.run_folder = os.path.join(self.working_dir, self.lineage_results_dir) + self.log_folder = os.path.join(self.main_out, "logs") + self.cpus = self.config.getint("busco_run", "cpu") + self.lineage_dataset = self.config.get("busco_run", "lineage_dataset") + self.domain = self.config.get("busco_run", "domain") + + if not self.check_tool_available(): + raise ToolException( + "{} tool cannot be found. Please check the 'path' and 'command' parameters " + "provided in the config file. 
Do not include the command in the " + "path!".format(self.name) + ) + self.version = self.get_version() + self.check_tool_dependencies() + + self.checkpoint_file = None + + self.logfile_path_out = os.path.join( + self.config.get("busco_run", "main_out"), + "logs", + "{}_out.log".format(self.name), + ) + self.logfile_path_err = ( + self.logfile_path_out.rpartition("_out.log")[0] + "_err.log" + ) + + def init_checkpoint_file(self): + self.checkpoint_file = os.path.join(self.output_folder, ".checkpoint") + + def write_checkpoint_file(self): + with open(self.checkpoint_file, "a") as cpt_file: + cpt_file.write("Tool: {}\n".format(self.name)) + cpt_file.write("Version: {}\n".format(self.version)) + cpt_file.write("Run: {}\n".format(self.run_number)) + cpt_file.write("Time: {}\n".format(time.strftime("%m/%d/%Y %H:%M:%S"))) + cpt_file.write("Completed {} jobs\n\n".format(self.total)) + + def check_previous_completed_run(self): + if not os.path.exists(self.checkpoint_file): + return False + else: + with open(self.checkpoint_file, "r") as cpt_file: + lines = cpt_file.readlines() + tool_names = [s.strip().split(": ")[1] for s in lines[0::6]] + tool_versions = [s.strip().split(": ")[1] for s in lines[1::6]] + tool_run_numbers = [s.strip().split(": ")[1] for s in lines[2::6]] + try: + start_search = 0 + while True: + tool_ind = tool_names.index(self.name, start_search) + if str(self.version) != str(tool_versions[tool_ind]): + logger.warning( + "A previous run used {} version {}. " + "The restarted run is using {} version " + "{}".format( + self.name, + tool_versions[tool_ind], + self.name, + self.version, + ) + ) + if int(tool_run_numbers[tool_ind]) == int(self.run_number): + return True + elif int(tool_run_numbers[tool_ind]) < int(self.run_number): + start_search = tool_ind + 1 + else: + raise SystemExit( + "Something went wrong. 
Information for {} run {} missing but " + "information for run {} found.".format( + self.name, + self.run_number, + tool_run_numbers[tool_ind], + ) + ) + + except ValueError: + return False + + except TypeError: + logger.warning( + "Unable to parse {} file. Restart mode not available.".format( + self.checkpoint_file + ) + ) + + @abstractmethod + def check_tool_dependencies(self): + pass + + @abstractmethod + def configure_job(self, *args): + pass + + @abstractmethod + def generate_job_args(self): + pass + + @property + @abstractmethod + def output_folder(self): + raise NotImplementedError + + @property + @abstractmethod + def name(self): + raise NotImplementedError + + @abstractmethod + def run(self): + if self.version is not None: + logger.debug("Tool: {}".format(self.name)) + logger.debug("Version: {}".format(self.version)) + + @staticmethod + def create_dirs(dirnames): + """ + Create all required directories + + :param dirnames: list of paths already constructed + :return: + """ + if isinstance(dirnames, str): + os.makedirs(dirnames, exist_ok=True) + elif isinstance(dirnames, list): + for d in dirnames: + os.makedirs(d, exist_ok=True) + else: + raise TypeError("'dirnames' should be either a str or a list") + + def check_tool_available(self): + """ + Check tool's availability. + + + :return: True if the tool can be run, False if it is not the case + :rtype: bool + """ + try: + self.get_tool_from_config() + except ToolException: + self.get_tool_from_environment() + + return which(self.cmd) is not None # True if tool available + + def get_tool_from_environment(self): + which_tool = which(self.cmd) + if not which_tool: + raise ToolException("Tool {} not found".format(self.name)) + + def get_tool_from_config(self): + """ + 1. The section ['name'] is available in the config + 2. This section contains keys 'path' and 'command' + 3. 
The string resulted from concatenation of values of these two keys + represents the full path to the command + :return: + """ + if not self.config.has_section(self.name): + raise ToolException( + "Section for the tool [{}] is not present in the config file".format( + self.name + ) + ) + + if not self.config.has_option(self.name, "path") or not self.config.get( + self.name, "path" + ): + raise ToolException( + "Key 'path' in the section [{}] is not present in the config file".format( + self.name + ) + ) + + if self.config.has_option(self.name, "command"): + executable = self.config.get(self.name, "command") + else: + executable = self.name + + self.cmd = os.path.join(self.config.get(self.name, "path"), executable) + + return + + @abstractmethod + def get_version(self): + return + + +class NoGenesError(Exception): + def __init__(self, gene_predictor): + self.gene_predictor = gene_predictor + + +class NoRerunFile(Exception): + def __init__(self): + pass diff -Nru busco-4.1.4/src/busco/busco_tools/blast.py busco-5.0.0/src/busco/busco_tools/blast.py --- busco-4.1.4/src/busco/busco_tools/blast.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/blast.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,532 @@ +from busco.busco_tools.base import BaseRunner +import os +from collections import defaultdict +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from Bio import SeqIO +import subprocess + +logger = BuscoLogger.get_logger(__name__) + + +class MKBLASTRunner(BaseRunner): + + name = "makeblastdb" + cmd = "makeblastdb" + + def __init__(self): + super().__init__() + self.db_path = os.path.join( + self.config.get("busco_run", "main_out"), "blast_db" + ) + self.output_db = os.path.join(self.db_path, os.path.basename(self.input_file)) + self.create_dirs(self.db_path) + self.total = 1 + self.init_checkpoint_file() + self.run_number += 1 + + @log("Creating BLAST database with input file", logger) + def 
configure_job(self, *args): + mkblast_job = self.create_job() + mkblast_job.add_parameter("-in") + mkblast_job.add_parameter(self.input_file) + mkblast_job.add_parameter("-dbtype") + mkblast_job.add_parameter("nucl") + mkblast_job.add_parameter("-out") + mkblast_job.add_parameter(self.output_db) + return mkblast_job + + def run(self): + super().run() + if os.path.exists(self.db_path) and len(os.listdir(self.db_path)) > 0: + return + + self.run_jobs() + + def generate_job_args(self): + yield + + def get_version(self): + mkblastdb_version_call = subprocess.check_output( + [self.cmd, "-version"], stderr=subprocess.STDOUT, shell=False + ) + mkblastdb_version = ".".join( + mkblastdb_version_call.decode("utf-8").split("\n")[0].split()[1].rsplit(".") + ) + + return mkblastdb_version + + def check_tool_dependencies(self): + pass + + @property + def output_folder(self): + return self.db_path + + +class TBLASTNRunner(BaseRunner): + + name = "tblastn" + cmd = "tblastn" + + MAX_FLANK = 20000 + + def __init__(self): + self.coords = {} + super().__init__() + self._output_folder = os.path.join(self.run_folder, "blast_output") + self.output_seqs = os.path.join(self._output_folder, "sequences") + self.create_dirs([self._output_folder, self.output_seqs]) + self.total = 1 + + self.e_v_cutoff = self.config.getfloat("busco_run", "evalue") + self.region_limit = self.config.getint("busco_run", "limit") + self.flank = self._define_flank() + + self.init_checkpoint_file() + + def configure_runner( + self, blast_db, missing_and_frag_only, ancestral_variants, incomplete_buscos + ): + self.run_number += 1 + self.blast_db = blast_db + self.missing_and_frag_only = missing_and_frag_only + self.ancestral_variants = ancestral_variants + self.incomplete_buscos = incomplete_buscos + + self.ancestral_sfx = "_variants" if self.ancestral_variants else "" + self.ancestral_file = os.path.join( + self.lineage_dataset, "ancestral{}".format(self.ancestral_sfx) + ) + self.query_file = os.path.join( + 
self.lineage_dataset, "ancestral{}".format(self.ancestral_sfx) + ) + self.output_suffix = ( + "_missing_and_frag_rerun" if self.missing_and_frag_only else "" + ) + self.rerun_query_file = os.path.join( + self._output_folder, + "ancestral{}{}".format(self.ancestral_sfx, self.output_suffix), + ) + if self.missing_and_frag_only and self.ancestral_variants: + self._extract_incomplete_buscos_ancestral() + + self.blast_filename = os.path.join( + self._output_folder, "tblastn{}.tsv".format(self.output_suffix) + ) + self.coords_filename = os.path.join( + self._output_folder, "coordinates{}.tsv".format(self.output_suffix) + ) + + def configure_job(self, *args): + tblastn_job = self.create_job() + tblastn_job.add_parameter("-evalue") + tblastn_job.add_parameter(str(self.e_v_cutoff)) + tblastn_job.add_parameter("-num_threads") + tblastn_job.add_parameter(str(self.cpus)) + tblastn_job.add_parameter("-query") + tblastn_job.add_parameter(self.query_file) + tblastn_job.add_parameter("-db") + tblastn_job.add_parameter(self.blast_db) + tblastn_job.add_parameter("-out") + tblastn_job.add_parameter(self.blast_filename) + tblastn_job.add_parameter("-outfmt") + tblastn_job.add_parameter("7") + return tblastn_job + + @property + def output_folder(self): + return self._output_folder + + def _define_flank(self): + """ + TODO: Add docstring + :return: + """ + try: + size = os.path.getsize(self.input_file) / 1000 # size in mb + flank = int(size / 50) # proportional flank size + # Ensure value is between 5000 and MAX_FLANK + flank = min(max(flank, 5000), type(self).MAX_FLANK) + except IOError: # Input data is only validated during run_analysis. This will catch any IO issues before that. 
+ raise SystemExit( + "Impossible to read the fasta file {}".format(self.input_file) + ) + + return flank + + @log("Running a BLAST search for BUSCOs against created database", logger) + def run(self): + super().run() + self.run_jobs() + self._check_output() + return + + def check_tool_dependencies(self): + if ( + ".".join(self.version.split(".")[:-1]) not in ["2.2", "2.3"] + and self.version != "2.10.1+" + and float(".".join(self.version.split(".")[:-1])) < 2.11 + ): + # Known problems with multithreading on BLAST 2.4-2.10.0. + logger.warning( + "You are using BLAST version {}. This is known to yield inconsistent results when " + "multithreading. BLAST will run on a single core as a result. For performance improvement, " + "please upgrade to BLAST 2.10.1+.".format(self.version) + ) + self.cpus = 1 + + def get_version(self): + tblastn_version_call = subprocess.check_output( + [self.cmd, "-version"], stderr=subprocess.STDOUT, shell=False + ) + tblastn_version = ".".join( + tblastn_version_call.decode("utf-8").split("\n")[0].split()[1].rsplit(".") + ) + + return tblastn_version + + def generate_job_args(self): + yield + + def _check_output(self): + # check that blast worked + if not os.path.exists(self.blast_filename): + raise SystemExit("tblastn failed!") + + # check that the file is not truncated + with open(self.blast_filename, "r") as f: + try: + if "processed" not in f.readlines()[-1]: + raise SystemExit( + "tblastn has ended prematurely (the result file lacks the expected final line), " + "which will produce incomplete results in the next steps ! This problem likely " + "appeared in blast+ 2.4 and seems not fully fixed in 2.6. It happens only when " + "using multiple cores. You can use a single core (-c 1) or downgrade to " + "blast+ 2.2.x, a safe choice regarding this issue. See blast+ documentation for " + "more information." 
+ ) + + except IndexError: + # if the tblastn result file is empty, for example in phase 2 + # if 100% was found in phase 1 + pass + return + + def _extract_incomplete_buscos_ancestral(self): + + logger.info( + "Extracting missing and fragmented buscos from the file {}...".format( + os.path.basename(self.ancestral_file) + ) + ) + + matched_seqs = [] + busco_ids_retrieved = set() + with open(self.ancestral_file, "rU") as anc_file: + + for record in SeqIO.parse(anc_file, "fasta"): + if any(record.id.startswith(b) for b in self.incomplete_buscos): + # Remove the ancestral variant identifier ("_1" etc) so it matches all other BUSCO IDs. + # The identifier is still present in the "name" and "description" Sequence Record attributes. + record.id = record.id.split("_")[0] + busco_ids_retrieved.add(record.id) + matched_seqs.append(record) + + unmatched_incomplete_buscos = list( + set(self.incomplete_buscos) - set(busco_ids_retrieved) + ) + if len(unmatched_incomplete_buscos) > 0: + logger.debug( + "The BUSCO ID(s) {} were not found in the file {}".format( + unmatched_incomplete_buscos, os.path.basename(self.ancestral_file) + ) + ) + + self.query_file = self.rerun_query_file + with open( + self.query_file, "w" + ) as out_file: # Create new query file for second tblastn run + SeqIO.write(matched_seqs, out_file, "fasta") + + return + + def _get_all_boundaries(self, locations): + sorted_locs = sorted(locations, key=lambda x: int(x[0])) + all_boundaries = [sorted_locs[0]] + for loc in sorted_locs[1:]: + overlap, boundary = self._get_overlap(all_boundaries[-1], loc) + if overlap > 0: + all_boundaries[-1] = boundary + else: + all_boundaries.append(boundary) + return all_boundaries + + def get_coordinates(self): + self.coords = self._parse_blast_output() + if self.ancestral_variants: + self.coords = self._select_busco_variants() + self._prune() + return + + def _get_largest_regions(self, candidate_contigs, coords, busco_group): + size_lists = [] + + for contig in candidate_contigs: 
+ potential_locations = coords[busco_group][contig]["busco_coords"] + final_regions = self._get_all_boundaries(potential_locations) + + # Get sum of all potential match sizes for a contig + size_lists.append(self._sum_all_region_sizes(final_regions)) + + return size_lists + + @staticmethod + def _get_overlap(a, b): + """ + This function checks whether two regions overlap and returns the length of the overlap region along with the + boundaries of both regions combined as a [start, stop] list. + + :param a: first region, start and end + :type a: list + :param b: second region, start and end + :type b: list + :returns: overlap, boundary + :rtype: int, list + """ + a_start, a_end = a + b_start, b_end = b + overlap = min(a_end, b_end) - max(a_start, b_start) + if overlap > 0: + boundary = [min(a_start, b_start), max(a_end, b_end)] + elif b_start > a_start: + boundary = b + else: + boundary = a + return max(0, overlap), boundary + + def _parse_blast_output(self): + """ + Read the Blast output + """ + coords = defaultdict( + lambda: defaultdict(defaultdict) + ) # dict of busco_id -> contig_id -> {info} + with open(self.blast_filename, "r") as blast_file: + for line in blast_file: + if line.startswith("#"): + continue + else: + try: + line = line.strip().split() + busco_name = line[0] + contig_id = line[1] + busco_start = int(line[6]) + busco_end = int(line[7]) + contig_start = int(line[8]) + contig_end = int(line[9]) + blast_eval = float(line[10]) + except (IndexError, ValueError): + continue + + # for minus-strand genes, invert coordinates for convenience + if contig_end < contig_start: + contig_end, contig_start = contig_start, contig_end + + # Add all matches to dictionary. The top matches are selected out later. 
+ if contig_id not in coords[busco_name]: + coords[busco_name][contig_id] = { + "contig_start": contig_start, + "contig_end": contig_end, + "busco_coords": [[busco_start, busco_end]], + "blast_eval": blast_eval, + } + + elif ( + contig_id in coords[busco_name] + ): # i.e. if the same gene matched the busco more than once. + # now update coordinates + coords = self._update_coordinates( + coords, + busco_name, + contig_id, + busco_start, + busco_end, + contig_start, + contig_end, + blast_eval, + ) + + return dict(coords) + + def _select_busco_variants(self): + """ + Filter contig matches to prevent multiple BUSCO variants matching the same contig. + The current behaviour combines all contig matches for all BUSCO variants, as long as the contig matches are + different. There is an open question over whether or not we should only return the contig matches for a single + BUSCO variant instead of all of them combined. This should only be an issue for the Transcriptome mode. + :return: + """ + selected_coords = defaultdict(lambda: defaultdict(defaultdict)) + for busco_name, contigs in self.coords.items(): + busco_basename = busco_name.split("_")[0] + if busco_basename in selected_coords: + for contig_id in contigs: + if contig_id in selected_coords[busco_basename]: + if ( + contigs[contig_id]["blast_eval"] + < selected_coords[busco_basename][contig_id]["blast_eval"] + ): + selected_coords[busco_basename][contig_id] = contigs[ + contig_id + ] + else: + selected_coords[busco_basename][contig_id] = contigs[contig_id] + else: + selected_coords[busco_basename] = contigs + + return selected_coords + + def _prune(self): + for busco_name, contigs in self.coords.items(): + if len(contigs) > self.region_limit: + # Sort by blast eval, then isolate smallest values leaving just "region_limit" number of contigs per + # busco_name + contigs_to_remove = sorted( + contigs, key=lambda contig: contigs[contig]["blast_eval"] + )[self.region_limit :] + for c in contigs_to_remove: + 
self.coords[busco_name].pop(c) + return + + @staticmethod + def _sum_all_region_sizes(deck): + """ + Sum all interval sizes in input list + :param deck: + :type deck: list + :return: + :rtype: int + """ + total = 0 + for entry in deck: + total += entry[1] - entry[0] + return total + + @staticmethod + def _update_coordinates( + coords, + busco_name, + contig, + busco_start, + busco_end, + contig_start, + contig_end, + blast_eval, + ): + """ + If a contig match starts or ends withing 50 kb of a previous match, extend the recorded start and end positions + of the contig match, and record the start/end locations of the busco match. + If the contig match is entirely within a previous match, just record the start/end locations of the busco match. + If the match is outside 50 kb of a previous match, ignore it. The tblastn output file ranks matches in order of + bitscore (inverse order of eval) so these subsequent matches at different locations are guaranteed not to be + better than the ones already recorded for that contig. 
+ :param coords: # todo: fill in details + :param busco_name: + :param contig: + :param busco_start: + :param busco_end: + :param contig_start: + :param contig_end: + :param blast_eval: + :return: + """ + append_busco_coords = False + + # Check if contig starts before and within 50kb of current position + if 0 <= coords[busco_name][contig]["contig_start"] - contig_start <= 50000: + coords[busco_name][contig]["contig_start"] = contig_start + append_busco_coords = True + + # Check if contig ends after and within 50 kbs of current position + if 0 <= contig_end - coords[busco_name][contig]["contig_end"] <= 50000: + coords[busco_name][contig]["contig_end"] = contig_end + append_busco_coords = True + # Else, check if contig starts inside current coordinates + elif ( + coords[busco_name][contig]["contig_end"] + >= contig_start + >= coords[busco_name][contig]["contig_start"] + ): + # If contig ends inside current coordinates, just add alignment positions to list + if contig_end <= coords[busco_name][contig]["contig_end"]: + append_busco_coords = True + + # If contig ends after current coordinates, extend contig end + else: + coords[busco_name][contig]["contig_end"] = contig_end + append_busco_coords = True + + # moved to its own "if" statement to avoid multiple appends from the "if" statements above + if append_busco_coords: + coords[busco_name][contig]["busco_coords"].append([busco_start, busco_end]) + + if blast_eval < coords[busco_name][contig]["blast_eval"]: + coords[busco_name][contig]["blast_eval"] = blast_eval + + return coords + + def filter_best_matches(self): + + # Get a list of all start and stop positions of possible busco locations, merging overlapping regions + for busco_group in self.coords: + candidate_contigs = list(self.coords[busco_group].keys()) + size_lists = self._get_largest_regions( + candidate_contigs, self.coords, busco_group + ) + max_size = max(size_lists) # Get largest match size for a busco group + # Include all location matches for a busco as 
long as they are within 70% of the maximum size match + size_cutoff = int(0.7 * max_size) + for c, contig_name in enumerate(candidate_contigs): + if size_lists[c] < size_cutoff: + self.coords[busco_group].pop(contig_name) + return + + def write_coordinates_to_file(self): + + with open(self.coords_filename, "w") as out: + for busco_group, contig_matches in self.coords.items(): + for contig_name in contig_matches: + self.coords[busco_group][contig_name]["contig_start"] = max( + int(self.coords[busco_group][contig_name]["contig_start"]) + - self.flank, + 0, + ) + contig_start = self.coords[busco_group][contig_name]["contig_start"] + self.coords[busco_group][contig_name]["contig_end"] += self.flank + contig_end = int( + self.coords[busco_group][contig_name]["contig_end"] + ) + out.write( + "{}\t{}\t{}\t{}\n".format( + busco_group, contig_name, contig_start, contig_end + ) + ) + return + + def write_contigs(self): + # Extract all contig identifiers + contig_names = [] + for contig_info in self.coords.values(): + for contig in contig_info: + contig_names.append(contig) + + # Write sequences that match contig ids + with open(self.input_file, "rU") as f: + for record in SeqIO.parse(f, "fasta"): + if record.id in list(set(contig_names)): + with open( + os.path.join(self.output_seqs, "{}.temp".format(record.id)), "w" + ) as out: + SeqIO.write(record, out, "fasta") + return diff -Nru busco-4.1.4/src/busco/busco_tools/hmmer.py busco-5.0.0/src/busco/busco_tools/hmmer.py --- busco-4.1.4/src/busco/busco_tools/hmmer.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/hmmer.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,1059 @@ +from busco.busco_tools.base import BaseRunner +import os +from collections import defaultdict +import busco +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from busco.BuscoConfig import BuscoConfig, BuscoConfigMain +from Bio import SeqIO +import csv +import subprocess +from 
busco.BuscoConfig import BuscoConfigAuto + +logger = BuscoLogger.get_logger(__name__) + + +class HMMERRunner(BaseRunner): + + name = "hmmsearch" + cmd = "hmmsearch" + + def __init__(self): + super().__init__() + self._hmmer_output_folder = os.path.join(self.run_folder, "hmmer_output") + self.datasets_version = self.config.get("busco_run", "datasets_version") + self.dataset_creation_date = self.config.get("busco_run", "creation_date") + self.dataset_nb_species = self.config.get("busco_run", "number_of_species") + self.dataset_nb_buscos = self.config.get("busco_run", "number_of_BUSCOs") + self.domain = self.config.get("busco_run", "domain") + + self.single_copy_sequences_folder = os.path.join( + self.run_folder, "busco_sequences", "single_copy_busco_sequences" + ) + self.multi_copy_sequences_folder = os.path.join( + self.run_folder, "busco_sequences", "multi_copy_busco_sequences" + ) + self.fragmented_sequences_folder = os.path.join( + self.run_folder, "busco_sequences", "fragmented_busco_sequences" + ) + self.short_summary_file = os.path.join(self.run_folder, "short_summary.txt") + self.cutoff_dict = {} + self.single_copy_buscos = {} + self.multi_copy_buscos = {} + self.fragmented_buscos = {} + self.extra_columns = False + self.log_count = 0 # Dummy variable used to skip logging for intermediate eukaryote pipeline results. 
+ self.one_line_summary = None + + # to be initialized before run time + self.input_sequences = None + self.busco_ids = None + self.mode = None + self.gene_details = None + self.results_dir = None + + self.matched_genes_complete = {} + self.matched_genes_vlarge = {} + self.matched_genes_fragment = {} + self.is_complete = {} + self.is_fragment = {} + self.is_very_large = {} + + self.create_dirs( + [ + self._hmmer_output_folder, + self.single_copy_sequences_folder, + self.multi_copy_sequences_folder, + self.fragmented_sequences_folder, + ] + ) + if self.domain == "eukaryota": + self.initial_results_dir = os.path.join( + self._hmmer_output_folder, "initial_run_results" + ) + self.rerun_results_dir = os.path.join( + self._hmmer_output_folder, "rerun_results" + ) + self.create_dirs([self.initial_results_dir, self.rerun_results_dir]) + + self.single_copy = 0 + self.multi_copy = 0 + self.only_fragments = 0 + self.total_buscos = 0 + + # Get percentage of each kind of BUSCO match + self.s_percent = 0 + self.d_percent = 0 + self.f_percent = 0 + + self.hmmer_results_lines = None + + self.init_checkpoint_file() + + def configure_runner(self, input_sequences, busco_ids, mode, gene_details): + self.run_number += 1 + self.input_sequences = input_sequences + self.busco_ids = busco_ids + self.mode = mode + + self.is_fragment = {} + + self.single_copy_buscos = {} + self.multi_copy_buscos = {} + self.fragmented_buscos = {} + + self._already_used_genes = set() + self.hmmer_results_lines = [] + self.missing_buscos = [] + self.gene_details = gene_details + if len(self.cutoff_dict) == 0: + self.load_buscos() + + if self.domain == "eukaryota": + if self.run_number == 1: + self.results_dir = self.initial_results_dir + elif self.run_number == 2: + self.results_dir = self.rerun_results_dir + else: + raise ValueError( + "HMMER should not be run more than twice in the same Run instance." 
+ ) + else: + self.results_dir = self._hmmer_output_folder + # gene_details can only be None for proteins mode. In the other modes the gene locations are written to a file + # after the coordinates are loaded from this attribute + + def configure_job(self, busco_id, seq_filename, output_filename): + + hmmer_job = self.create_job() + hmmer_job.add_parameter("--domtblout") + hmmer_job.add_parameter(os.path.join(self.results_dir, output_filename)) + hmmer_job.add_parameter("--cpu") + hmmer_job.add_parameter("1") + hmmer_job.add_parameter( + os.path.join(self.lineage_dataset, "hmms", "{}.hmm".format(busco_id)) + ) + hmmer_job.add_parameter(seq_filename) + return hmmer_job + + def generate_job_args(self): + for busco_id in self.busco_ids: + if busco_id in self.cutoff_dict: + if isinstance(self.input_sequences, str): + output_filename = "{}.out".format(busco_id) + yield busco_id, self.input_sequences, output_filename + elif isinstance(self.input_sequences, list): + input_files = [ + f + for f in self.input_sequences + if os.path.basename(f).startswith(busco_id) + ] + for seq_filename in input_files: + filename_parts = os.path.basename(seq_filename).rpartition( + ".faa" + ) + output_filename = ( + filename_parts[0] + ".out" + filename_parts[-1] + ) + yield busco_id, seq_filename, output_filename + + @property + def output_folder(self): + return self._hmmer_output_folder + + def load_buscos(self): + """ + Load all BUSCOs for the lineage, along with their cutoff lengths and scores. + :return: + """ + self.cutoff_dict = defaultdict(dict) + self._load_length() + self._load_score() + self.cutoff_dict = dict(self.cutoff_dict) + return + + def run(self): + """ + Create a HMMER job for each BUSCO. Each job searches the input sequence file for matches for the BUSCO gene. 
+ :return: + """ + super().run() + self.total = self._count_jobs() + self.run_jobs() + + def _count_jobs(self): + n = 0 + for busco_id in self.busco_ids: + if busco_id in self.cutoff_dict: + if isinstance(self.input_sequences, str): + n += 1 + elif isinstance(self.input_sequences, list): + input_files = [ + f + for f in self.input_sequences + if os.path.basename(f).startswith(busco_id) + ] + n += len(input_files) + return n + + def get_version(self): + """ + check the Tool has the correct version + :raises SystemExit: if the version is not correct + """ + hmmer_version = subprocess.check_output( + [self.cmd, "-h"], stderr=subprocess.STDOUT, shell=False + ) + hmmer_version = hmmer_version.decode("utf-8") + try: + hmmer_version = hmmer_version.split("\n")[1].split()[2] + hmmer_version = float(hmmer_version[:3]) + except ValueError: + # to avoid a crash with a super old version + hmmer_version = hmmer_version.split("\n")[1].split()[1] + hmmer_version = float(hmmer_version[:3]) + finally: + return hmmer_version + + def check_tool_dependencies(self): + """ + check dependencies on tools + :raises SystemExit: if a Tool version is not supported + """ + # check hmm version + if not self.version >= BuscoConfig.HMMER_VERSION: + raise SystemExit( + "HMMer version detected is not supported, please use HMMer v.{} +".format( + BuscoConfig.HMMER_VERSION + ) + ) + return + + def process_output(self): + self.is_complete = defaultdict( + lambda: defaultdict(list), self.is_complete + ) # dict of a dict of lists of dicts + self.is_fragment = defaultdict(lambda: defaultdict(list), self.is_fragment) + self.is_very_large = defaultdict(lambda: defaultdict(list), self.is_very_large) + self.matched_genes_complete = defaultdict(list, self.matched_genes_complete) + self.matched_genes_vlarge = defaultdict(list, self.matched_genes_vlarge) + self.matched_genes_fragment = defaultdict(list, self.matched_genes_fragment) + + self._load_matched_genes() + + self.is_complete = dict(self.is_complete) + 
self.is_fragment = dict(self.is_fragment) + self.is_very_large = dict(self.is_very_large) + self.matched_genes_complete = dict(self.matched_genes_complete) + self.matched_genes_vlarge = dict(self.matched_genes_vlarge) + self.matched_genes_fragment = dict(self.matched_genes_fragment) + + return + + @staticmethod + def _get_matched_lengths(nested_dict): + """ + For each entry in a nested dictionary, return a dict with the total lengths of all gene matches for each entry. + :param nested_dict: + :type nested_dict: + :return: + :rtype: + """ + total_len = defaultdict(int) + for entry in nested_dict: + for hit in nested_dict[entry]: + total_len[entry] += hit[1] - hit[0] + return total_len + + def parse_hmmer_output(self, filename, busco_query): + """ + Read and parse HMMER output file. + :param filename: Name of HMMER output file + :param busco_query: Basename of file, used to identify BUSCO + :type filename: str + :type busco_query: str + :return: Dictionary of (gene_id, total_matched_length) pairs + :rtype: dict + """ + records = defaultdict(dict) + + with open(filename, "r") as f: + + # Read HMMER output file + for line in f: + if line.startswith("#"): + continue + else: + try: + line = line.strip().split() + gene_id = line[0] + tlen = int(line[2]) + bit_score = float(line[7]) + + # Extract frame information (present in transcriptome mode) + frame = str(line[-1]) if "frame" in str(line[-1]) else None + + # Store bitscore matches for each gene match. If match below cutoff, discard. 
+ if bit_score < float(self.cutoff_dict[busco_query]["score"]): + # todo: introduce upper bound - consult to see what a reasonable value would be + continue + if gene_id not in records: + records[gene_id] = { + "tlen": tlen, + "hmm_len": 0, + "env_coords": [], + "score": bit_score, + "frame": frame, + } + hmm_start = int(line[15]) + hmm_end = int(line[16]) + env_start = int(line[19]) + env_end = int(line[20]) + records[gene_id]["hmm_len"] += hmm_end - hmm_start + records[gene_id]["env_coords"].append((env_start, env_end)) + + except IndexError as e: + raise SystemExit( + e, "Cannot parse HMMER output file {}".format(filename) + ) + return records + + def _sort_matches(self, matched_record, busco_query): + """ + The HMMER gene matches are sorted into "complete", "v_large" and "fragmented" matches based on a comparison + with the cutoff value specified in the dataset cutoff_scores file + :param matched_lengths: dict of (gene_id, total_matched_length) pairs + :param busco_query: BUSCO identifier + :type matched_lengths: dict + :type busco_query: str + :return: busco_complete, busco_vlarge, busco_fragment - three dictionaries of the form + {gene_id: [{"bitscore": float, "length": int}, {...}, ...], ...} + :rtype: dict + """ + busco_complete = defaultdict(list) + busco_vlarge = defaultdict(list) + busco_fragment = defaultdict(list) + + # Determine whether matched gene represents a complete, very_large or fragment of a BUSCO + for gene_id, record in matched_record.items(): + size = record["hmm_len"] + frame = record["frame"] + + # Kind of like a z-score, but it is compared with a cutoff value, not a mean + zeta = (self.cutoff_dict[busco_query]["length"] - size) / self.cutoff_dict[ + busco_query + ]["sigma"] + + # gene match can only be either complete, v_large or fragment + if -2 <= zeta <= 2: + busco_type = busco_complete + match_type = self.matched_genes_complete + elif zeta < -2: + busco_type = busco_vlarge + match_type = self.matched_genes_vlarge + else: + busco_type 
= busco_fragment + match_type = self.matched_genes_fragment + + # Add information about match to dict + busco_type[gene_id].append( + dict({"bitscore": record["score"], "length": size, "frame": frame}) + ) + # Reference which busco_queries are associated with each gene match + match_type[gene_id].append(busco_query) + + return busco_complete, busco_vlarge, busco_fragment + + def _load_matched_genes(self): + """ + Load all gene matches from HMMER output and sort into dictionaries depending on match quality + (complete, v_large, fragment). + :return: + """ + if self.run_number == 1: + hmmer_results_files = sorted( + [ + os.path.join(self.results_dir, f) + for f in os.listdir(self.results_dir) + if not f.startswith(".") + ] + ) + elif self.run_number == 2: + hmmer_initial_run_files = [ + os.path.join(self.initial_results_dir, f) + for f in os.listdir(self.initial_results_dir) + if not f.startswith(".") + ] + hmmer_rerun_files = [ + os.path.join(self.rerun_results_dir, f) + for f in os.listdir(self.rerun_results_dir) + if not f.startswith(".") + ] + hmmer_results_files = sorted( + hmmer_rerun_files + ) # sorted(hmmer_initial_run_files + hmmer_rerun_files) + else: + raise ValueError( + "HMMER should not be run more than twice in the same Run instance." 
+ ) + + for filename in hmmer_results_files: + busco_query = str(os.path.basename(filename).split(".")[0]) + matched_record = self.parse_hmmer_output(filename, busco_query) + busco_complete, busco_vlarge, busco_fragment = self._sort_matches( + matched_record, busco_query + ) + + # Add all information for this busco_id to the full dictionary + if len(busco_complete) > 0: + self.is_complete[busco_query].update(busco_complete) + if len(busco_vlarge) > 0: + self.is_very_large[busco_query].update(busco_vlarge) + if len(busco_fragment) > 0: + self.is_fragment[busco_query].update(busco_fragment) + + return + + def _update_used_gene_set(self, busco_dict): + """ + Update set of already used genes to prevent processing the same gene twice. + :param busco_dict: One of [self.is_complete, self.is_very_large, self.is_fragment] + :type busco_dict: dict + :return: + """ + for entries in busco_dict.values(): + for gene_id in entries: + self._already_used_genes.add(gene_id) + return + + def _remove_lower_ranked_duplicates(self, busco_dict): + """ + Remove any genes and/or busco matches from input dictionary if they have previously been assigned to a better + quality match. 
+ :param busco_dict: one of [self.is_very_large, self.is_fragment] + :type busco_dict: dict + :return: + """ + # Determine which match ranks to worry about + if busco_dict == self.is_very_large: + higher_rank_buscos = self.is_complete.keys() + matched_genes = self.matched_genes_vlarge + elif busco_dict == self.is_fragment: + higher_rank_buscos = list(self.is_complete.keys()) + list( + self.is_very_large.keys() + ) + matched_genes = self.matched_genes_fragment + else: + raise SystemExit("Unrecognized dictionary of BUSCOs.") + + for busco_id in list(busco_dict.keys()): + matches = busco_dict[busco_id] + # Remove any buscos that appear in higher ranking dictionaries + if busco_id in higher_rank_buscos: + busco_dict.pop(busco_id) + for gene_id in matches: + matched_genes[gene_id] = [ + x for x in matched_genes[gene_id] if x != busco_id + ] # Remove all occurences of busco_id + if len(matched_genes[gene_id]) == 0: + matched_genes.pop(gene_id) + continue + + # Remove any genes that have previously been processed under a different and higher ranking busco match + for gene_id in list(matches.keys()): + if gene_id in self._already_used_genes: + busco_dict[busco_id].pop(gene_id) + matched_genes[gene_id] = [ + x for x in matched_genes[gene_id] if x != busco_id + ] # Remove all occurences of busco_id + if len(busco_dict[busco_id]) == 0: + busco_dict.pop(busco_id) + if len(matched_genes[gene_id]) == 0: + matched_genes.pop(gene_id) + + return + + def _remove_duplicates(self): + """ + Remove duplicate gene matches of lesser importance, i.e. keep the complete ones, then the very large ones and + finally the fragments. + Also remove duplicate BUSCO ID matches of lower importance. + Then search for any duplicate gene matches within the same rank for different BUSCOs and keep only the highest + scoring gene match. 
+ :return: + """ + self._update_used_gene_set(self.is_complete) + self._remove_lower_ranked_duplicates(self.is_very_large) + self._update_used_gene_set(self.is_very_large) + self._remove_lower_ranked_duplicates(self.is_fragment) + self._remove_remaining_duplicate_matches(self.is_complete) + self._remove_remaining_duplicate_matches(self.is_very_large) + self._remove_remaining_duplicate_matches(self.is_fragment) + return + + def _remove_remaining_duplicate_matches(self, busco_dict): + """ + For any genes matched under more than one BUSCO, keep only the highest scoring match in the input dictionary. + :param busco_dict: one of [self.is_complete, self.is_very_large, self.is_fragment] + :type busco_dict: dict + :return: + """ + # For a given input dictionary {busco_id: gene_ids}, make sure we are using the corresponding dictionary + # {gene_id: busco_matches} + if busco_dict == self.is_complete: + matched_genes = self.matched_genes_complete + elif busco_dict == self.is_very_large: + matched_genes = self.matched_genes_vlarge + elif busco_dict == self.is_fragment: + matched_genes = self.matched_genes_fragment + else: + raise SystemExit("Unrecognized dictionary of BUSCOs.") + + busco_matches_to_remove = [] + # Keep the best scoring gene if gene is matched by more than one busco with the same match rank + for gene_id, buscos in matched_genes.items(): + if len(buscos) > 1: + busco_bitscores = [] + busco_matches = [] + for busco in buscos: + matches = busco_dict[busco][gene_id] + for match in matches: + bitscore = match["bitscore"] + busco_bitscores.append(bitscore) + busco_matches.append(busco) + + if ( + len(set(buscos)) == 1 + ): # If only one busco is matched twice (initial run and rerun), don't remove it + continue + best_match_ind = max( + range(len(busco_bitscores)), key=busco_bitscores.__getitem__ + ) + buscos = [x for x in buscos if x != busco_matches[best_match_ind]] + # Remove lower scoring duplicates from dictionary. 
+ + for duplicate in list(set(buscos)): + # Use set to account for any duplicate entries (matched in both initial run and rerun) + busco_dict[duplicate].pop(gene_id) + if len(busco_dict[duplicate]) == 0: + busco_dict.pop(duplicate) + busco_matches_to_remove.append((gene_id, duplicate)) + + for gene_busco_pair in busco_matches_to_remove: + gene_id, busco_id = gene_busco_pair + matched_genes[gene_id].remove(busco_id) + if len(matched_genes[gene_id]) == 0: + matched_genes.pop(gene_id) + + return + + def _remove_low_scoring_matches(self, busco_dict): + """ + Go through input dictionary and remove any gene matches that score less than 85% of the top gene match score + for each BUSCO. + :param busco_dict: one of [self.is_complete, self.is_very_large, self.is_fragment] + :type busco_dict: dict + :return: + """ + empty_buscos = [] + + # For each busco, keep only matches within 85% of top bitscore match for that busco + for busco_id, matches in busco_dict.items(): + if len(matches) > 1: + _, max_bitscore = self._get_best_scoring_match(matches) + # Go through all matches again, removing any below the threshold + for gene_id in list(matches.keys()): + match_info = matches[gene_id] + matches_to_remove = [] + for m, match in enumerate(match_info): + if match["bitscore"] < 0.85 * max_bitscore: + matches_to_remove.append(m) + + # Remove dict from list of dicts. Safe way to delete without risking list size changing during + # iteration + for ind in sorted(matches_to_remove, reverse=True): + del match_info[ind] + + # Record dictionary address of empty gene records + if len(busco_dict[busco_id][gene_id]) == 0: + empty_buscos.append((busco_id, gene_id)) + + # Safe way to delete empty records without risking dictionary size changing while iterating + for item in empty_buscos: + busco_id, gene_id = item + busco_dict[busco_id].pop(gene_id) + + return + + @staticmethod + def _get_best_scoring_match(gene_matches): + """ + Find the highest bitscore in all gene matches. 
+ :param gene_matches: dictionary of the form + {gene_id: [{"bitscore": float, "length": int}, {"bitscore": float, "length": int}, ...], ...} + :type gene_matches: dict + :return: best_match_gene, best_match_bitscore + :rtype: str, float + """ + match_scores = [] + match_genes = [] + for gene_id, matches in gene_matches.items(): + for match in matches: + bitscore = match["bitscore"] + match_scores.append(bitscore) + match_genes.append(gene_id) + best_match_ind = max(range(len(match_scores)), key=match_scores.__getitem__) + best_match_gene = match_genes[best_match_ind] + best_match_bitscore = match_scores[best_match_ind] + return best_match_gene, best_match_bitscore + + def filter(self): + """ + Remove all duplicate matches and any matches below 85% of the top match for each BUSCO. + :return: + """ + self._remove_duplicates() + self._remove_low_scoring_matches(self.is_complete) + self._remove_low_scoring_matches(self.is_very_large) + self._remove_low_scoring_matches(self.is_fragment) + return + + def consolidate_busco_lists(self): + """ + Sort BUSCO matches into single-copy, multi-copy and fragments. + Only the highest scoring fragment for each BUSCO is kept. 
+ :return: + """ + for busco_dict in [self.is_complete, self.is_very_large]: + for busco_id, gene_matches in busco_dict.items(): + if len(gene_matches) == 1: + self.single_copy_buscos[busco_id] = busco_dict[busco_id] + else: + self.multi_copy_buscos[busco_id] = busco_dict[busco_id] + + for busco_id, gene_matches in self.is_fragment.items(): + if len(gene_matches) > 1: + best_fragment, _ = self._get_best_scoring_match(gene_matches) + self.fragmented_buscos[busco_id] = { + best_fragment: self.is_fragment[busco_id][best_fragment] + } + else: + self.fragmented_buscos[busco_id] = gene_matches + return + + def load_links_info(self): + links_info = defaultdict(dict) + links_file = os.path.join( + self.lineage_dataset, + "links_to_{}.txt".format(self.datasets_version.upper()), + ) + if os.path.exists(links_file): + with open(links_file, newline="") as f: + contents = csv.reader(f, delimiter="\t") + for row in contents: + busco_id, description, link = row + links_info[busco_id]["description"] = description + links_info[busco_id]["link"] = link + return links_info + + def _format_output_lines(self, busco_dict, label): + """ + Format BUSCO matches from input dictionary into output lines for writing to a file. 
+ :param busco_dict: one of [self.single_copy_buscos, self.multi_copy_buscos, self.fragmented_buscos] + :type busco_dict: dict + :return: output_lines + :rtype: list + """ + output_lines = [] + + links_info = self.load_links_info() + + for busco, matches in busco_dict.items(): + for gene_id, match_info in matches.items(): + for m, match in enumerate(match_info): + bit_score = match["bitscore"] + match_length = match["length"] + + if self.mode == "proteins" or self.mode == "transcriptome": + try: + desc = links_info[busco]["description"] + link = links_info[busco]["link"] + self.extra_columns = True + output_lines.append( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + busco, + label, + gene_id, + bit_score, + match_length, + link, + desc, + ) + ) + except KeyError: + output_lines.append( + "{}\t{}\t{}\t{}\t{}\n".format( + busco, label, gene_id, bit_score, match_length + ) + ) + elif self.mode == "genome": + scaffold = self.gene_details[gene_id][m] + location_pattern = ":{}-{}".format( + scaffold["gene_start"], scaffold["gene_end"] + ) + if gene_id.endswith(location_pattern): + gene_id = gene_id.replace(location_pattern, "") + try: + desc = links_info[busco]["description"] + link = links_info[busco]["link"] + self.extra_columns = True + output_lines.append( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + busco, + label, + gene_id, + scaffold["gene_start"], + scaffold["gene_end"], + scaffold["strand"], + bit_score, + match_length, + link, + desc, + ) + ) + except KeyError: + output_lines.append( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + busco, + label, + gene_id, + scaffold["gene_start"], + scaffold["gene_end"], + scaffold["strand"], + bit_score, + match_length, + ) + ) + return output_lines + + def create_output_content(self): + """ + Format output for all BUSCO matches. 
+ :return: output_lines + :rtype: list + """ + output_lines = [] + dict_labels = { + "Complete": self.single_copy_buscos, + "Duplicated": self.multi_copy_buscos, + "Fragmented": self.fragmented_buscos, + } + for label, busco_dict in dict_labels.items(): + output_lines += self._format_output_lines(busco_dict, label) + + return output_lines + + def _list_missing_buscos(self): + """ + Create a list of all BUSCOs that are missing after processing the HMMER output. + :return: output_lines, missing_buscos + :rtype: list, list + """ + output_lines = [] + for busco_group in self.cutoff_dict: + if not any( + busco_group in d + for d in [self.is_complete, self.is_very_large, self.is_fragment] + ): + output_lines.append("{}\tMissing\n".format(busco_group)) + self.missing_buscos.append(busco_group) + + if len(self.missing_buscos) == len(self.cutoff_dict): + logger.warning( + "BUSCO did not find any match. Make sure to check the log files if this is unexpected." + ) + + return output_lines, self.missing_buscos + + def _load_length(self): + """ + This function loads the length cutoffs file + :raises SystemExit: if the lengths_cutoff file cannot be read + """ + lengths_cutoff_file = os.path.join(self.lineage_dataset, "lengths_cutoff") + try: + with open(lengths_cutoff_file, "r") as f: + for line in f: + line = line.strip().split() + try: + taxid = line[0] + sd = float(line[2]) + length = float(line[3]) + + self.cutoff_dict[taxid]["sigma"] = sd + # there is an arthropod profile with sigma 0 + # that causes a crash on divisions + if sd == 0.0: + self.cutoff_dict[taxid]["sigma"] = 1 + self.cutoff_dict[taxid]["length"] = length + except IndexError as e: + raise SystemExit(e, "Error parsing the lengths_cutoff file.") + except IOError: + raise SystemExit( + "Impossible to read the lengths in {}".format( + os.path.join(lengths_cutoff_file) + ) + ) + return + + def _load_score(self): + """ + This function loads the score cutoffs file + :raises SystemExit: if the scores_cutoff file cannot 
be read + """ + scores_cutoff_file = os.path.join(self.lineage_dataset, "scores_cutoff") + try: + # open target scores file + with open(scores_cutoff_file, "r") as f: + for line in f: + line = line.strip().split() + try: + taxid = line[0] + score = float(line[1]) + self.cutoff_dict[taxid]["score"] = score + except IndexError as e: + raise SystemExit(e, "Error parsing the scores_cutoff file.") + except IOError: + raise SystemExit( + "Impossible to read the scores in {}".format(scores_cutoff_file) + ) + return + + def write_buscos_to_file(self, sequences_aa, sequences_nt=None): + """ + Write BUSCO matching sequences to output fasta files. Each sequence is printed in a separate file and both + nucleotide and amino acid versions are created. + :param sequences_aa: dict + :param sequences_nt: dict + :return: + """ + for busco_type in ["single_copy", "multi_copy", "fragmented"]: + if busco_type == "single_copy": + output_dir = self.single_copy_sequences_folder + busco_matches = self.single_copy_buscos + elif busco_type == "multi_copy": + output_dir = self.multi_copy_sequences_folder + busco_matches = self.multi_copy_buscos + elif busco_type == "fragmented": + output_dir = self.fragmented_sequences_folder + busco_matches = self.fragmented_buscos + + for busco, gene_matches in busco_matches.items(): + try: + aa_seqs, nt_seqs = zip( + *[ + (sequences_aa[gene_id], sequences_nt[gene_id]) + for gene_id in gene_matches + ] + ) + with open( + os.path.join(output_dir, "{}.fna".format(busco)), "w" + ) as f2: + SeqIO.write(nt_seqs, f2, "fasta") + except TypeError: + aa_seqs = [sequences_aa[gene_id] for gene_id in gene_matches] + with open(os.path.join(output_dir, "{}.faa".format(busco)), "w") as f1: + SeqIO.write(aa_seqs, f1, "fasta") + + def write_hmmer_results(self, output_lines): + """ + Create two output files: one with information on all BUSCOs for the given dataset and the other with a list of + all BUSCOs that were not found. 
+ :return: + """ + + with open(os.path.join(self.run_folder, "full_table.tsv"), "w") as f_out: + + self._write_output_header(f_out) + + with open( + os.path.join(self.run_folder, "missing_busco_list.tsv"), "w" + ) as miss_out: + + self._write_output_header(miss_out, missing_list=True) + + # todo: move to calculate busco percentages + missing_buscos_lines, missing_buscos = self._list_missing_buscos() + output_lines += missing_buscos_lines + + for missing_busco in sorted(missing_buscos): + miss_out.write("{}\n".format(missing_busco)) + + sorted_output_lines = self._sort_lines(output_lines) + for busco in sorted_output_lines: + f_out.write(busco) + return + + @staticmethod + def _sort_lines(lines): + sorted_lines = sorted(lines, key=lambda x: int(x.split("\t")[0].split("at")[0])) + return sorted_lines + + def produce_hmmer_summary(self): + ( + single_copy, + multi_copy, + only_fragments, + total_buscos, + ) = self._get_busco_percentages() + + self.hmmer_results_lines.append("***** Results: *****\n\n") + self.one_line_summary = "C:{}%[S:{}%,D:{}%],F:{}%,M:{}%,n:{}\t{}\n".format( + round(self.s_percent + self.d_percent, 1), + self.s_percent, + self.d_percent, + self.f_percent, + abs(round(100 - self.s_percent - self.d_percent - self.f_percent, 1)), + total_buscos, + " ", + ) + self.hmmer_results_lines.append(self.one_line_summary) + self.hmmer_results_lines.append( + "{}\tComplete BUSCOs (C)\t\t\t{}\n".format(single_copy + multi_copy, " ") + ) + self.hmmer_results_lines.append( + "{}\tComplete and single-copy BUSCOs (S)\t{}\n".format(single_copy, " ") + ) + self.hmmer_results_lines.append( + "{}\tComplete and duplicated BUSCOs (D)\t{}\n".format(multi_copy, " ") + ) + self.hmmer_results_lines.append( + "{}\tFragmented BUSCOs (F)\t\t\t{}\n".format(only_fragments, " ") + ) + self.hmmer_results_lines.append( + "{}\tMissing BUSCOs (M)\t\t\t{}\n".format( + total_buscos - single_copy - multi_copy - only_fragments, " " + ) + ) + self.hmmer_results_lines.append( + "{}\tTotal 
BUSCO groups searched\t\t{}\n".format(total_buscos, " ") + ) + + if isinstance(self.config, BuscoConfigAuto): + self._one_line_hmmer_summary() + elif self.domain == "eukaryota" and self.log_count == 0: + self.log_count += 1 + self._produce_full_hmmer_summary_debug() + else: + self._one_line_hmmer_summary() + + with open(self.short_summary_file, "w") as summary_file: + + self._write_output_header(summary_file, no_table_header=True) + summary_file.write( + "# Summarized benchmarking in BUSCO notation for file {}\n" + "# BUSCO was run in mode: {}\n".format(self.input_file, self.mode) + ) + if self.mode == "genome": + if self.config.get("busco_run", "domain") in ["prokaryota", "viruses"]: + gene_predictor = "prodigal" + elif self.config.getboolean("busco_run", "use_augustus"): + gene_predictor = "augustus" + else: + gene_predictor = "metaeuk" + summary_file.write("# Gene predictor used: {}\n".format(gene_predictor)) + summary_file.write("\n") + + for line in self.hmmer_results_lines: + summary_file.write("\t{}".format(line)) + + if ( + self.config.getboolean("busco_run", "auto-lineage") + and isinstance(self.config, BuscoConfigMain) + and hasattr(self.config, "placement_files") + ): + summary_file.write("\nPlacement file versions:\n") + for placement_file in self.config.placement_files: + summary_file.write("{}\n".format(placement_file)) + + return + + @log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True) + def _produce_full_hmmer_summary(self): + return + + @log( + "{}", + logger, + attr_name="hmmer_results_lines", + apply="join", + on_func_exit=True, + debug=True, + ) + def _produce_full_hmmer_summary_debug(self): + return + + @log("{}", logger, attr_name="one_line_summary", on_func_exit=True) + def _one_line_hmmer_summary(self): + self.one_line_summary = "Results:\t{}".format(self.one_line_summary) + return + + def _write_output_header( + self, file_object, missing_list=False, no_table_header=False + ): + """ + Write a standardized 
file header containing information on the BUSCO run. + :param file_object: Opened file object ready for writing + :type file_object: file + :return: + """ + file_object.write( + "# BUSCO version is: {} \n" + "# The lineage dataset is: {} (Creation date: {}, number of species: {}, number of BUSCOs: {}" + ")\n".format( + busco.__version__, + os.path.basename(self.lineage_dataset), + self.dataset_creation_date, + self.dataset_nb_species, + self.dataset_nb_buscos, + ) + ) + # if isinstance(self._config, BuscoConfigMain): # todo: wait until rerun command properly implemented again + # file_object.write("# To reproduce this run: {}\n#\n".format(self._rerun_cmd)) + + if no_table_header: + pass + elif missing_list: + file_object.write("# Busco id\n") + elif self.mode == "proteins" or self.mode == "transcriptome": + if self.extra_columns: + file_object.write( + "# Busco id\tStatus\tSequence\tScore\tLength\tOrthoDB url\tDescription\n" + ) + else: + file_object.write("# Busco id\tStatus\tSequence\tScore\tLength\n") + elif self.mode == "genome": + if self.extra_columns: + file_object.write( + "# Busco id\tStatus\tSequence\tGene Start\tGene End\tStrand\tScore\tLength\tOrthoDB url" + "\tDescription\n" + ) + else: + file_object.write( + "# Busco id\tStatus\tSequence\tGene Start\tGene End\tStrand\tScore\tLength\n" + ) + + return + + def _get_busco_percentages(self): + self.single_copy = len(self.single_copy_buscos) # int + self.multi_copy = len(self.multi_copy_buscos) # int + self.only_fragments = len(self.fragmented_buscos) # int + self.total_buscos = len(self.cutoff_dict) + + # Get percentage of each kind of BUSCO match + self.s_percent = abs(round((self.single_copy / self.total_buscos) * 100, 1)) + self.d_percent = abs(round((self.multi_copy / self.total_buscos) * 100, 1)) + self.f_percent = abs(round((self.only_fragments / self.total_buscos) * 100, 1)) + + return self.single_copy, self.multi_copy, self.only_fragments, self.total_buscos diff -Nru 
busco-4.1.4/src/busco/busco_tools/metaeuk.py busco-5.0.0/src/busco/busco_tools/metaeuk.py --- busco-4.1.4/src/busco/busco_tools/metaeuk.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/metaeuk.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,642 @@ +from busco.busco_tools.base import BaseRunner, NoRerunFile, NoGenesError +import os +from collections import defaultdict +from busco.BuscoLogger import BuscoLogger +from Bio import SeqIO +import shutil +from configparser import NoOptionError +import subprocess +import gzip +import pandas as pd +import numpy as np +import re + +logger = BuscoLogger.get_logger(__name__) + + +class MetaeukParsingError(Exception): + def __init__(self): + pass + + +class MetaeukRunner(BaseRunner): + + name = "metaeuk" + cmd = "metaeuk" + + ACCEPTED_PARAMETERS = [ + "comp-bias-corr", + "add-self-matches", + "seed-sub-mat", + "s", + "k", + "k-score", + "alph-size", + "max-seqs", + "split", + "split-mode", + "split-memory-limit", + "diag-score", + "exact-kmer-matching", + "mask", + "mask-lower-case", + "min-ungapped-score", + "spaced-kmer-mode", + "spaced-kmer-pattern", + "local-tmp", + "disk-space-limit", + "a", + "alignment-mode", + "wrapped-scoring", + "e", + "min-seq-id", + "min-aln-len", + "seq-id-mode", + "alt-ali", + "c", + "cov-mode", + "realign", + "max-rejected", + "max-accept", + "score-bias", + "gap-open", + "gap-extend", + "zdrop", + "pca", + "pcb", + "mask-profile", + "e-profile", + "wg", + "filter-msa", + "max-seq-id", + "qid", + "qsc", + "cov", + "diff", + "num-iterations", + "slice-search", + "rescore-mode", + "allow-deletion", + "min-length", + "max-length", + "max-gaps", + "contig-start-mode", + "contig-end-mode", + "orf-start-mode", + "forward-frames", + "reverse-frames", + "translation-table", + "translate", + "use-all-table-starts", + "id-offset", + "add-orf-stop", + "search-type", + "start-sens", + "sens-steps", + "metaeuk-eval", + "metaeuk-tcov", + "min-intron", + "min-exon-aa", + 
"max-overlap", + "set-gap-open", + "set-gap-extend", + "overlap", + "protein", + "target-key", + "reverse-fragments", + "sub-mat", + "db-load-mode", + "force-reuse", + "remove-tmp-files", + "filter-hits", + "sort-results", + "omit-consensus", + "create-lookup", + "chain-alignments", + "merge-query", + "strand", + "compressed", + "v", + "max-intron", + "max-seq-len", + ] + + PRESET_PARAMETERS = [ + "--max-intron", + "--max-seq-len", + "--min-exon-aa", + "--max-overlap", + "--min-intron", + "--overlap", + ] + + def __init__(self): + super().__init__() + self._output_folder = os.path.join(self.run_folder, "metaeuk_output") + self._initial_results_folder = os.path.join( + self._output_folder, "initial_results" + ) + self._rerun_results_folder = os.path.join(self._output_folder, "rerun_results") + self._tmp_folder = os.path.join(self._output_folder, "tmp") + + self.ancestral_file = os.path.join(self.lineage_dataset, "ancestral") + self.ancestral_variants_file = os.path.join( + self.lineage_dataset, "ancestral_variants" + ) + + self.max_intron = self.config.get("busco_run", "max_intron") + self.max_seq_len = self.config.get("busco_run", "max_seq_len") + self.overlap = 1 + self.s_set = False + + self.extra_params = None + self.param_keys = [] + self.param_values = [] + self.create_dirs( + [ + self._output_folder, + self._initial_results_folder, + self._rerun_results_folder, + ] + ) + self.gene_details = None + + self.headers_file = None + self.codon_file = None + self.pred_protein_seqs = None + self.pred_protein_seqs_modified = None + self.incomplete_buscos = None + self.sequences_aa = {} + self.init_checkpoint_file() + + self.pred_protein_files = [] + self.pred_protein_mod_files = [] + self.headers_files = [] + self.combined_pred_protein_seqs = os.path.join( + self._output_folder, "combined_pred_proteins.fas" + ) + + def configure_runner(self, incomplete_buscos=None): + self.run_number += 1 + self.incomplete_buscos = incomplete_buscos + + if self.run_number > 1: + 
self._output_basename = os.path.join( + self._rerun_results_folder, os.path.basename(self.input_file) + ) + self.refseq_db_rerun = os.path.join( + self._output_folder, "refseq_db_rerun.faa" + ) + self._extract_incomplete_buscos_ancestral() + self.refseq_db = self.refseq_db_rerun + self.min_exon_aa = 5 + self.max_overlap = 5 + self.min_intron = 1 + else: + self._output_basename = os.path.join( + self._initial_results_folder, os.path.basename(self.input_file) + ) + gzip_refseq = os.path.join(self.lineage_dataset, "refseq_db.faa.gz") + self.refseq_db = self.decompress_refseq_file(gzip_refseq) + self.min_exon_aa = 15 + self.max_overlap = 15 + self.min_intron = 5 + + try: + if self.run_number == 1: + self.extra_params = self.config.get( + "busco_run", "metaeuk_parameters" + ).replace(",", " ") + else: + self.extra_params = self.config.get( + "busco_run", "metaeuk_rerun_parameters" + ).replace(",", " ") + except NoOptionError: + self.extra_params = "" + + self.headers_file = "{}.headersMap.tsv".format(self._output_basename) + self.headers_files.append(self.headers_file) + self.codon_file = "{}.codon.fas".format(self._output_basename) + self.pred_protein_seqs = "{}.fas".format(self._output_basename) + self.pred_protein_files.append(self.pred_protein_seqs) + self.pred_protein_seqs_modified = self.pred_protein_seqs.replace( + ".fas", ".modified.fas" + ) + self.pred_protein_mod_files.append(self.pred_protein_seqs_modified) + + def decompress_refseq_file(self, gzip_file): + unzipped_filename = gzip_file.split(".gz")[0] + if not os.path.exists(unzipped_filename): + with gzip.open(gzip_file, "rb") as compressed_file: + with open(unzipped_filename, "wb") as decompressed_file: + for line in compressed_file: + decompressed_file.write(line) + if os.path.exists(gzip_file): + try: + os.remove(gzip_file) + except OSError: + logger.warning( + "Unable to remove compressed refseq file in dataset download" + ) + pass + return unzipped_filename + + def combine_run_results(self): + with 
open(self.combined_pred_protein_seqs, "w") as combined_output: + for run_result in self.pred_protein_mod_files: + with open(run_result, "r") as f: + shutil.copyfileobj(f, combined_output) + + return + + def records_to_df(self, records): + results = pd.DataFrame.from_records( + records, + columns=[ + "Busco id", + "Sequence", + "Start", + "Stop", + "Strand", + "Score", + "Run found", + ], + index=np.arange(len(records)), + ) + + return results + + @staticmethod + def detect_overlap(match1_details, match2_details): + return ( + match1_details["Strand"] == match2_details["Strand"] + ) # check overlaps are on the same strand + + def test_for_overlaps(self, record_df): + results_grouped = record_df.groupby("Sequence") + overlaps = [] + seq_ids = results_grouped.groups.keys() + for seq in seq_ids: + g1 = results_grouped.get_group(seq) + g1_sorted = g1.sort_values( + "Start" + ) # sort to facilitate a single-pass coordinate check + for idx1, row1 in g1_sorted.iterrows(): + start_val = g1_sorted.loc[idx1]["Start"] + stop_val = g1_sorted.loc[idx1]["Stop"] + matches = g1_sorted[g1_sorted["Start"] > start_val].loc[ + g1_sorted["Start"] < stop_val + ] # find entries with a start coordinate between the current exon start and end + for idx2 in matches.index.values: + if self.detect_overlap(g1_sorted.loc[idx1], g1_sorted.loc[idx2]): + overlaps.append((idx1, idx2)) + + return overlaps + + def find_match(self, matches, id_items): + matches_list = str(matches).split("\n") + good_match = None + good_matches = [] + for match in matches_list: + if all([i in match for i in id_items]): + good_match = match + good_matches.append(good_match) + if len(good_matches) > 1: + good_ind = self.select_higher_bitscore_ind(good_matches) + good_match = good_matches[good_ind] + return good_match + + @staticmethod + def select_higher_bitscore_ind(matches): + bitscores = [] + pos_pattern = "|+|" + neg_pattern = "|-|" + for match in matches: + strand_pattern = pos_pattern if pos_pattern in match else 
neg_pattern + score = re.search( + r"{}(.*?)\|".format(re.escape(strand_pattern)), match + ).group(1) + bitscores.append(int(score)) + max_ind = bitscores.index(max(bitscores)) + return max_ind + + def extract_exon_coords(self, match): + parts = str(match).split("\t") + header = parts[5] + details = self.parse_header(header) + return ( + details["all_taken_low_exon_coords"], + details["all_taken_high_exon_coords"], + ) + + def check_tool_dependencies(self): + pass + + def configure_job(self, *args): + + metaeuk_job = self.create_job() + metaeuk_job.add_parameter("easy-predict") + metaeuk_job.add_parameter("--threads") + metaeuk_job.add_parameter(str(self.cpus)) + metaeuk_job.add_parameter(self.input_file) + metaeuk_job.add_parameter(self.refseq_db) + metaeuk_job.add_parameter(self._output_basename) + metaeuk_job.add_parameter(self._tmp_folder) + metaeuk_job.add_parameter("--max-intron") + metaeuk_job.add_parameter(str(self.max_intron)) + metaeuk_job.add_parameter("--max-seq-len") + metaeuk_job.add_parameter(str(self.max_seq_len)) + metaeuk_job.add_parameter("--min-exon-aa") + metaeuk_job.add_parameter(str(self.min_exon_aa)) + metaeuk_job.add_parameter("--max-overlap") + metaeuk_job.add_parameter(str(self.max_overlap)) + metaeuk_job.add_parameter("--min-intron") + metaeuk_job.add_parameter(str(self.min_intron)) + metaeuk_job.add_parameter("--overlap") + metaeuk_job.add_parameter(str(self.overlap)) + if self.run_number > 1 and self.s_set == False: + metaeuk_job.add_parameter("-s") + metaeuk_job.add_parameter("6") + for k, key in enumerate(self.param_keys): + dashes = "-" if len(key) == 1 else "--" + metaeuk_job.add_parameter("{}{}".format(dashes, key)) + metaeuk_job.add_parameter("{}".format(str(self.param_values[k]))) + + return metaeuk_job + + def generate_job_args(self): + yield + + @property + def output_folder(self): + return self._output_folder + + def remove_tmp_files(self): + shutil.rmtree(self._tmp_folder) + + def run(self): + super().run() + if 
self.extra_params:
+            logger.info(
+                "Additional parameters for Metaeuk are {}: ".format(self.extra_params)
+            )
+            self.param_keys, self.param_values = self.parse_parameters()
+
+        # self.cwd = self._output_folder
+        self.total = 1
+        self.run_jobs()
+
+    def _extract_incomplete_buscos_ancestral(self):
+
+        logger.info(
+            "Extracting missing and fragmented buscos from the file {}...".format(
+                os.path.basename(self.refseq_db)
+            )
+        )
+
+        matched_seqs = []
+        busco_ids_retrieved = set()
+        with open(self.refseq_db, "r") as refseq_file:  # "rU" mode was removed in Python 3.11; plain "r" is equivalent
+
+            for record in SeqIO.parse(refseq_file, "fasta"):
+                if any(record.id.startswith(b) for b in self.incomplete_buscos):
+                    # Remove the ancestral variant identifier ("_1" etc) so it matches all other BUSCO IDs.
+                    # The identifier is still present in the "name" and "description" Sequence Record attributes.
+                    record.id = record.id.split("_")[0]
+                    busco_ids_retrieved.add(record.id)
+                    matched_seqs.append(record)
+
+        unmatched_incomplete_buscos = list(
+            set(self.incomplete_buscos) - set(busco_ids_retrieved)
+        )
+        if len(unmatched_incomplete_buscos) > 0:
+            logger.debug(
+                "The BUSCO ID(s) {} were not found in the file {}".format(
+                    unmatched_incomplete_buscos, os.path.basename(self.refseq_db)
+                )
+            )
+
+        with open(
+            self.refseq_db_rerun, "w"
+        ) as out_file:  # Create new query file for second tblastn run
+            SeqIO.write(matched_seqs, out_file, "fasta")
+
+        return
+
+    def get_version(self):
+        help_output = subprocess.check_output(
+            [self.cmd, "-h"], stderr=subprocess.STDOUT, shell=False
+        )
+        lines = help_output.decode("utf-8").split("\n")
+        version = None
+        for line in lines:
+            if line.startswith("metaeuk Version:"):
+                version = line.strip().split(" ")[-1]
+        return version
+
+    def parse_parameters(self):
+        accepted_keys = []
+        accepted_values = []
+        if self.extra_params:
+            self.extra_params = self.extra_params.strip("\" '")
+            try:
+                if self.extra_params.startswith("-"):
+                    key_val_pairs = self.extra_params.split(" -")
+                    for kv in key_val_pairs:
+                        
key_vals = kv.strip("- ").split("=") + if len(key_vals) == 2: + key, val = key_vals + if key in type(self).ACCEPTED_PARAMETERS: + if key == "min-exon-aa": + self.min_exon_aa = val.strip() + continue + elif key == "max-intron": + self.max_intron = val.strip() + continue + elif key == "max-seq-len": + self.max_seq_len = val.strip() + continue + elif key == "max-overlap": + self.max_overlap = val.strip() + continue + elif key == "min-intron": + self.min_intron = val.strip() + continue + elif key == "overlap": + self.overlap = val.strip() + continue + elif key == "s": + self.s_set = True + accepted_keys.append(key.strip()) + accepted_values.append(val.strip()) + else: + logger.warning( + "{} is not an accepted parameter for Metaeuk.".format( + key + ) + ) + else: + raise MetaeukParsingError + else: + raise MetaeukParsingError + except MetaeukParsingError: + logger.warning( + "Metaeuk parameters are not correctly formatted. Please enter them as follows: " + '"--param1=value1 --param2=value2" etc. Proceeding without additional parameters.' 
+                )
+                return [], []
+        return accepted_keys, accepted_values
+
+    @staticmethod
+    def parse_header(header):
+        header_parts = header.split("|")
+        if not header_parts[2] in [
+            "+",
+            "-",
+        ]:  # Deal with sequence IDs that contain the symbol "|"
+            try:
+                strand_ind = header_parts.index("+")
+            except ValueError:
+                strand_ind = header_parts.index("-")
+            header_parts[1] = "|".join(header_parts[1:strand_ind])
+            for i in range(strand_ind - 1, 1, -1):  # pop from the end: forward popping shifts indices and deletes the strand field
+                header_parts.pop(i)
+
+        T_acc = header_parts[0]
+        C_acc = header_parts[1]
+        strand = header_parts[2]
+        bitscore = float(header_parts[3])
+        e_value = float(header_parts[4])
+        num_exons = int(header_parts[5])
+        low_coord = int(header_parts[6])
+        high_coord = int(header_parts[7])
+        exon_coords = header_parts[8:]
+
+        all_low_exon_coords = []
+        all_taken_low_exon_coords = []
+        all_high_exon_coords = []
+        all_taken_high_exon_coords = []
+        all_exon_nucl_len = []
+        all_taken_exon_nucl_len = []
+        for exon in exon_coords:
+            low_exon_coords, high_exon_coords, nucl_lens = exon.split(":")
+
+            low_exon_coord, taken_low_exon_coord = low_exon_coords.split("[")
+            all_low_exon_coords.append(int(low_exon_coord))
+            taken_low_exon_coord = int(taken_low_exon_coord.strip("]"))
+
+            high_exon_coord, taken_high_exon_coord = high_exon_coords.split("[")
+            all_high_exon_coords.append(int(high_exon_coord))
+            taken_high_exon_coord = int(taken_high_exon_coord.strip("]"))
+
+            nucl_len, taken_nucl_len = nucl_lens.split("[")
+            all_exon_nucl_len.append(int(nucl_len))
+            taken_nucl_len = int(taken_nucl_len.strip().rstrip("]"))
+
+            # Need to fix the metaeuk coordinate problem
+            if strand == "-":
+                if int(taken_high_exon_coord) + int(taken_nucl_len) - 1 != int(
+                    taken_low_exon_coord
+                ):
+                    taken_low_exon_coord = (
+                        int(taken_high_exon_coord) + int(taken_nucl_len) - 1
+                    )
+
+            all_taken_low_exon_coords.append(taken_low_exon_coord)
+            all_taken_high_exon_coords.append(taken_high_exon_coord)
+            all_taken_exon_nucl_len.append(taken_nucl_len)
+
+        gene_id = "{}:{}-{}".format(C_acc, low_coord, high_coord)
+
+        details = {
+            "T_acc": T_acc,
+            "C_acc": C_acc,
+            "S": strand,
+            "bitscore": bitscore,
+            "e-value": e_value,
+            "num_exons": num_exons,
+            "low_coord": low_coord,
+            "high_coord": high_coord,
+            "all_low_exon_coords": all_low_exon_coords,
+            "all_taken_low_exon_coords": all_taken_low_exon_coords,
+            "all_high_exon_coords": all_high_exon_coords,
+            "all_taken_high_exon_coords": all_taken_high_exon_coords,
+            "all_exon_nucl_len": all_exon_nucl_len,
+            "all_taken_exon_nucl_len": all_taken_exon_nucl_len,
+            "gene_id": gene_id,
+        }
+        return details
+
+    def edit_protein_file(self):
+        all_records = []
+        all_headers = []
+        try:
+            with open(self.pred_protein_seqs, "r") as f:  # "rU" mode was removed in Python 3.11; plain "r" is equivalent
+                for record in SeqIO.parse(f, "fasta"):
+                    header_details = self.parse_header(record.id)
+                    record.id = header_details["gene_id"]
+                    record.name = header_details["gene_id"]
+                    record.description = header_details["gene_id"]
+                    all_records.append(record)
+
+                    header_df_info = (
+                        header_details["T_acc"].split("_")[0],
+                        header_details["C_acc"],
+                        header_details["low_coord"],
+                        header_details["high_coord"],
+                        header_details["S"],
+                        header_details["bitscore"],
+                        None,
+                    )
+                    all_headers.append(header_df_info)
+
+        except FileNotFoundError:
+            raise NoGenesError("Metaeuk")
+
+        if all_headers:
+            matches_df = self.records_to_df(all_headers)
+            overlaps = self.test_for_overlaps(matches_df)
+            inds_to_remove = []
+            for overlap in overlaps:
+                match1 = matches_df.loc[overlap[0]]
+                match2 = matches_df.loc[overlap[1]]
+                if match1["Busco id"] == match2["Busco id"]:
+                    if float(match1["Score"]) > float(match2["Score"]):
+                        ind_to_remove = match2.name
+                    else:
+                        ind_to_remove = match1.name
+                    inds_to_remove.append(ind_to_remove)
+
+            filtered_records = [
+                i for j, i in enumerate(all_records) if j not in inds_to_remove
+            ]
+            for record in filtered_records:
+                self.sequences_aa[record.id] = record
+
+            with open(self.pred_protein_seqs_modified, "w") as f_mod:
+                SeqIO.write(filtered_records, f_mod, "fasta")
+
+    def 
get_gene_details(self): + self.gene_details = defaultdict(list) + + try: + with open(self.headers_file, "r") as f: + lines = f.readlines() + + try: + for line in lines: + header = line.split("\t")[-1] + header_details = self.parse_header(header) + + self.gene_details[header_details["gene_id"]].append( + { + "gene_start": header_details["low_coord"], + "gene_end": header_details["high_coord"], + "strand": header_details["S"], + } + ) + + except KeyError: + raise SystemExit("*headersMap.tsv file could not be parsed.") + except FileNotFoundError: + raise NoRerunFile diff -Nru busco-4.1.4/src/busco/busco_tools/prodigal.py busco-5.0.0/src/busco/busco_tools/prodigal.py --- busco-4.1.4/src/busco/busco_tools/prodigal.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/prodigal.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,285 @@ +from busco.busco_tools.base import BaseRunner, NoGenesError +import os +import re +from collections import defaultdict +from busco.BuscoLogger import BuscoLogger +from busco.BuscoLogger import LogDecorator as log +from Bio import SeqIO +import shutil +import numpy as np +from configparser import NoOptionError +import subprocess + +logger = BuscoLogger.get_logger(__name__) + + +class ProdigalRunner(BaseRunner): + + name = "prodigal" + cmd = "prodigal" + + _gc_run_results = defaultdict(dict) + + def __init__(self): + super().__init__() + self._output_folder = os.path.join(self.main_out, "prodigal_output") + self._pred_genes_dir = os.path.join(self._output_folder, "predicted_genes") + self._tmp_path = os.path.join(self._pred_genes_dir, "tmp") + self.cpus = 1 + self.create_dirs([self._pred_genes_dir, self._tmp_path]) + + # Get genetic_code from dataset.cfg file + # bacteria/archaea=11; Entomoplasmatales,Mycoplasmatales=4 + try: + self._genetic_code = self.config.get( + "prodigal", "prodigal_genetic_code" + ).split(",") + except NoOptionError: + self._genetic_code = ["11"] + + # Set the ambiguous coding density range + try: 
+ self._cd_upper = ( + float(self.config.get("prodigal", "ambiguous_cd_range_upper")) + if len(self._genetic_code) > 1 + else 0 + ) + except NoOptionError: + raise SystemExit( + "Dataset config file does not contain required information. Please upgrade datasets." + ) + + self.current_gc = None + self._current_run_mode = None + self._tmp_name = None + + self.output_faa = os.path.join(self._pred_genes_dir, "predicted.faa") + self._output_fna = os.path.join(self._pred_genes_dir, "predicted.fna") + self.sequences_aa = {} + self.sequences_nt = {} + self.gene_details = {} + + self._input_length = self._get_genome_length() + self._run_mode = ["single", "meta"] if self._input_length > 100000 else ["meta"] + + self.init_checkpoint_file() + self.run_number += 1 + + @property + def output_folder(self): + return self._output_folder + + def check_tool_dependencies(self): + pass + + def generate_job_args(self): + yield + + @log( + "Genetic code {} selected as optimal", + logger, + attr_name="current_gc", + on_func_exit=True, + ) + def run(self): + """ + 1) If genome length > 100000 run in "single" mode, then "meta" mode if there are no gene predictions. Otherwise + just run in "meta" mode. This is based on the recommendations in the Prodigal docs. + 2) Run once using genetic code 11. This can be overridden if the user includes a spceific genetic code in the + config file. + 3) Check the genome coding density. If the coding density is above the ambiguous range (typically 0.73-0.8) + then continue with the current genetic code. The ambiguous range was determined based on analysis done by Mose + Manni. Otherwise run again on the next genetic code specified. + 4) If the next run still has a genetic density within the ambiguous range, read the stdout log files (formerly + the GFF files) and extract the scores assigned to each gene prediction. Whichever genetic code yields the + greatest mean score is selected. 
+ :return: + """ + super().run() + tmp_files = [] + + for ix, m in enumerate(self._run_mode): + self._current_run_mode = m + for g in self._genetic_code: + self.current_gc = g + + file_id = os.path.join( + self._tmp_path, + "prodigal_mode_{0}_code_{1}".format( + self._current_run_mode, self.current_gc + ), + ) + self._tmp_name = "{}.faa".format(file_id) + self.logfile_path_out = "{}_out.log".format(file_id) + self.logfile_path_err = "err".join( + self.logfile_path_out.rsplit("out", 1) + ) # Replace only the last occurence of "out" substring + self._gc_run_results[self.current_gc].update( + {"tmp_name": self._tmp_name, "log_file": self.logfile_path_out} + ) + + if os.path.exists( + self._tmp_name + ): # Check to see if has already been run with these parameters + coding_density = self._gc_run_results[g]["cd"] + else: + logger.info( + "Running Prodigal with genetic code {} in {} mode".format( + self.current_gc, self._current_run_mode + ) + ) + self.total = 1 + self.run_jobs() + coding_length = self._get_coding_length(self.logfile_path_out) + coding_density = coding_length / self._input_length + self._gc_run_results[self.current_gc].update({"cd": coding_density}) + + logger.debug("Coding density is {}".format(coding_density)) + tmp_files.append(self._gc_run_results[self.current_gc]["tmp_name"]) + + # if the coding density is above the ambiguous range, then continue with these parameters + if coding_density >= self._cd_upper: + break + + # If output files from both runs in "single" mode are empty, run again in "meta" mode, else raise Exception. 
+            if not any([os.stat(tmp_file).st_size > 0 for tmp_file in tmp_files]):
+                if ix + 1 == len(self._run_mode):
+                    raise NoGenesError("Prodigal")
+                else:
+                    continue
+
+            # if only one genetic code to consider, proceed with it
+            # if there is more than one possible set of parameters, decide which to use
+            self.current_gc = (
+                self._select_best_gc() if len(tmp_files) > 1 else self._genetic_code[0]
+            )
+
+            selected_logfile = self._gc_run_results[self.current_gc]["log_file"]
+            selected_tmpfile = self._gc_run_results[self.current_gc]["tmp_name"]
+
+            self._organize_prodigal_files(selected_tmpfile, selected_logfile)
+            self.get_gene_details()
+            self._gc_run_results[self.current_gc].update(
+                {
+                    "seqs_aa": self.sequences_aa,
+                    "seqs_nt": self.sequences_nt,
+                    "gene_details": self.gene_details,
+                }
+            )
+            break
+
+        return
+
+    def configure_job(self, *args):
+        tmp_name_nt = self._tmp_name.rpartition(".faa")[0] + ".fna"
+
+        prodigal_job = self.create_job()
+        prodigal_job.add_parameter("-p")
+        prodigal_job.add_parameter("%s" % self._current_run_mode)
+        prodigal_job.add_parameter("-f")
+        prodigal_job.add_parameter("gff")
+        prodigal_job.add_parameter("-g")
+        prodigal_job.add_parameter("%s" % self.current_gc)
+        prodigal_job.add_parameter("-a")
+        prodigal_job.add_parameter("%s" % self._tmp_name)
+        prodigal_job.add_parameter("-d")
+        prodigal_job.add_parameter("%s" % tmp_name_nt)
+        prodigal_job.add_parameter("-i")
+        prodigal_job.add_parameter("%s" % self.input_file)
+
+        return prodigal_job
+
+    def get_gene_details(self):
+        self.gene_details = defaultdict(list)
+
+        with open(self._output_fna, "r") as f:  # "rU" mode was removed in Python 3.11; plain "r" is equivalent
+            for record in SeqIO.parse(f, "fasta"):
+                gene_name = record.id
+                self.sequences_nt[gene_name] = record
+                description_parts = record.description.split()
+                gene_start = int(description_parts[2])
+                gene_end = int(description_parts[4])
+                strand = "+" if int(description_parts[6]) == 1 else "-"
+                self.gene_details[gene_name].append(
+                    {"gene_start": gene_start, "gene_end": gene_end, "strand": 
strand}
+                )
+
+        with open(self.output_faa, "r") as f:  # "rU" mode was removed in Python 3.11; plain "r" is equivalent
+            for record in SeqIO.parse(f, "fasta"):
+                self.sequences_aa[record.id] = record
+
+        return
+
+    @staticmethod
+    def _get_coding_length(out_logfile):
+        total_coding_length = 0
+        with open(out_logfile, "r") as f:
+            for line in f:
+                if not line.startswith("#"):
+                    try:
+                        start = int(line.split("\t")[3])
+                        stop = int(line.split("\t")[4])
+                        total_coding_length += stop - start
+                    except IndexError:
+                        continue
+                    except ValueError:
+                        continue
+        return total_coding_length
+
+    def _get_genome_length(self):
+        length_seqs = 0
+        for line in open(self.input_file):
+            if not line.startswith(">"):
+                length_seqs += len(line)
+        return length_seqs
+
+    def _get_mean_score(self, gc):
+        logfile = self._gc_run_results[gc]["log_file"]
+        scores = []
+        with open(logfile, "r") as f:
+            for line in f:
+                try:
+                    score = re.search(";score=(.+?);", line).group(1)
+                    scores.append(float(score))
+                except AttributeError:
+                    continue
+        mean_score = sum(scores) / len(scores) if scores else 0  # guard: a log with no score entries must not raise ZeroDivisionError
+        return mean_score
+
+    def _organize_prodigal_files(self, tmp_file, tmp_logfile):
+
+        shutil.copy(tmp_file, self.output_faa)
+        shutil.copy(tmp_file.rpartition(".faa")[0] + ".fna", self._output_fna)
+
+        # copy selected log files from tmp/ to logs/
+        new_logname = os.path.join(self.log_folder, "prodigal_out.log")
+        shutil.copy(tmp_logfile, new_logname)
+        shutil.copy(
+            tmp_logfile.rpartition("_out.log")[0] + "_err.log",
+            new_logname.rpartition("_out.log")[0] + "_err.log",
+        )
+        return
+
+    def _select_best_gc(self):
+        gcs, cds = zip(
+            *[[gc, self._gc_run_results[gc]["cd"]] for gc in self._genetic_code]
+        )
+        sort_order = np.argsort(np.array(cds))[::-1]
+        gcs_sorted = np.array(gcs)[sort_order]
+        cds_sorted = np.array(cds)[sort_order]
+        if abs(cds_sorted[0] - cds_sorted[1]) <= 0.05:
+            mean_score1 = self._get_mean_score(gcs_sorted[0])
+            mean_score2 = self._get_mean_score(gcs_sorted[1])
+            gc = gcs_sorted[int(mean_score2 > mean_score1)]
+        else:
+            gc = gcs_sorted[0]
+        return gc
+
+    def 
get_version(self): + prodigal_version = subprocess.check_output( + [self.cmd, "-v"], stderr=subprocess.STDOUT, shell=False + ) + prodigal_version = prodigal_version.decode("utf-8") + prodigal_version = prodigal_version.split("\n")[1].split(":")[0] + prodigal_version = prodigal_version.replace("Prodigal V", "") + return prodigal_version diff -Nru busco-4.1.4/src/busco/busco_tools/sepp.py busco-5.0.0/src/busco/busco_tools/sepp.py --- busco-4.1.4/src/busco/busco_tools/sepp.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/sepp.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,84 @@ +from busco.busco_tools.base import BaseRunner +import os +from busco.BuscoLogger import BuscoLogger +import shutil +import subprocess + +logger = BuscoLogger.get_logger(__name__) + + +class SEPPRunner(BaseRunner): + + name = "sepp" + cmd = "run_sepp.py" + + def __init__(self): + super().__init__() + self._output_folder = os.path.join( + self.main_out, "auto_lineage", self.lineage_results_dir + ) + self.placement_folder = os.path.join(self._output_folder, "placement_files") + self._tmp_folder = os.path.join(self._output_folder, "sepp_tmp_files") + self.datasets_version = self.config.get("busco_run", "datasets_version") + self.create_dirs([self._output_folder, self.placement_folder, self._tmp_folder]) + + self.init_checkpoint_file() + + def configure_runner( + self, tree_nwk_file, tree_metadata_file, supermatrix_file, downloader + ): + self.run_number += 1 + self.tree_nwk_file = tree_nwk_file + self.tree_metadata_file = tree_metadata_file + self.supermatrix_file = supermatrix_file + self.downloader = downloader + + def cleanup(self): + shutil.rmtree(self._tmp_folder) + + def generate_job_args(self): + yield + + def run(self): + super().run() + self.total = 1 + self.run_jobs() + + def configure_job(self, *args): + sepp_job = self.create_job() + sepp_job.add_parameter("--cpu") + sepp_job.add_parameter(str(self.cpus)) + sepp_job.add_parameter("--outdir") + 
sepp_job.add_parameter(self.placement_folder) + sepp_job.add_parameter("-t") + sepp_job.add_parameter(self.tree_nwk_file) + sepp_job.add_parameter("-r") + sepp_job.add_parameter(self.tree_metadata_file) + sepp_job.add_parameter("-a") + sepp_job.add_parameter(self.supermatrix_file) + sepp_job.add_parameter("-f") + sepp_job.add_parameter( + os.path.join(self.placement_folder, "marker_genes.fasta") + ) + sepp_job.add_parameter("-F") + sepp_job.add_parameter("15") + sepp_job.add_parameter("-m") + sepp_job.add_parameter("amino") + sepp_job.add_parameter("-p") + sepp_job.add_parameter(self._tmp_folder) + return sepp_job + + def check_tool_dependencies(self): + pass + + def get_version(self): + sepp_version = subprocess.check_output( + [self.cmd, "-v"], stderr=subprocess.STDOUT, shell=False + ) + sepp_version = sepp_version.decode("utf-8") + sepp_version = sepp_version.strip().split(" ")[1] + return sepp_version + + @property + def output_folder(self): + return self._output_folder diff -Nru busco-4.1.4/src/busco/busco_tools/Toolset.py busco-5.0.0/src/busco/busco_tools/Toolset.py --- busco-4.1.4/src/busco/busco_tools/Toolset.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/src/busco/busco_tools/Toolset.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: Toolset + :synopsis: the interface to OS enables to run executables / scripts + in external processes +.. versionadded:: 3.0.0 +.. versionchanged:: 4.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. 
+ +""" +import os +import subprocess +from subprocess import TimeoutExpired +from multiprocessing import Process, Pool, Value, Lock +import time +from abc import ABCMeta, abstractmethod +from busco.BuscoLogger import BuscoLogger, ToolLogger +from busco.BuscoLogger import LogDecorator as log +from busco.BuscoLogger import StreamLogger +import logging + +logger = BuscoLogger.get_logger(__name__) + + +class Job(Process): + """ + Build and executes one work item in an external process + """ + + def __init__( + self, tool_name, cmd, job_outlogger, job_errlogger, timeout, cwd, **kwargs + ): + """ + :param name: a name of an executable / script ("a tool") to be run + :type cmd: list + :param thread_id: an int id for the thread + :type thread_id: int + """ + # initialize parent + super().__init__() + + self.tool_name = tool_name + self.cmd_line = [cmd] + self.job_outlogger = job_outlogger + self.job_errlogger = job_errlogger + self.timeout = timeout + self.cwd = cwd + self.kwargs = kwargs + + def add_parameter(self, parameter): + """ + Append parameter to the command line + :parameter: a parameter + :type parameter: str + """ + self.cmd_line.append(parameter) + + @log("cmd call: {}", logger, attr_name="cmd_line", apply="join", debug=True) + def run(self): + """ + Start external process and block the current thread's execution + till the process' run is over + """ + with StreamLogger(logging.DEBUG, self.job_outlogger, **self.kwargs) as out: + with StreamLogger(logging.ERROR, self.job_errlogger) as err: + try: + # Stick with Popen(), communicate() and wait() instead of just run() to ensure compatibility with + # Python versions < 3.5. 
+ p = subprocess.Popen( + self.cmd_line, shell=False, stdout=out, stderr=err, cwd=self.cwd + ) + p.wait(self.timeout) + except TimeoutExpired: + p.kill() + logger.warning( + "The following job was killed as it was taking too long (>1hr) to " + "complete.\n{}".format(" ".join(self.cmd_line)) + ) + + self.job_outlogger._file_hdlr.close() + self.job_outlogger.removeHandler(self.job_outlogger._file_hdlr) + self.job_errlogger._file_hdlr.close() + self.job_errlogger.removeHandler(self.job_errlogger._file_hdlr) + with cnt.get_lock(): + cnt.value += 1 + + +class ToolException(Exception): + """ + Module-specific exception + """ + + def __init__(self, value): + self.value = value + + def __str__(self): + return self.value + + +class Tool(metaclass=ABCMeta): + """ + Collection of utility methods used by all tools + """ + + def __init__(self): + """ + Initialize job list for a tool + :param name: the name of the tool to execute + :type name: str + :param config: initialized instance of ConfigParser + :type config: configparser.ConfigParser + """ + if self.name == "augustus": + self.kwargs = {"augustus_out": True} + self.timeout = 3600 + else: + self.kwargs = {} + self.timeout = None + self.jobs_to_run = [] + self.jobs_running = [] + self.nb_done = 0 + self.total = 0 + self.cpus = None + self.chunksize = None + self.cwd = os.getcwd() + + @abstractmethod + def configure_job(self): + pass + + @abstractmethod + def generate_job_args(self): + pass + + @property + @abstractmethod + def name(self): + raise NotImplementedError + + @abstractmethod + def write_checkpoint_file(self): + pass + + def create_job(self): + """ + Create one work item + """ + self.tool_outlogger = ToolLogger(self.logfile_path_out) + self.tool_errlogger = ToolLogger(self.logfile_path_err) + job = Job( + self.name, + self.cmd[:], + self.tool_outlogger, + self.tool_errlogger, + self.timeout, + self.cwd, + **self.kwargs + ) + self.jobs_to_run.append(job) + # if self.count_jobs_created: + # self.total += 1 + return 
job + + def remove_job(self, job): + """ + Remove one work item + :param job: the Job to remove + :type job: Job + """ + self.jobs_to_run.remove(job) + + def log_jobs_to_run(self): + logger.info( + "Running {} job(s) on {}, starting at {}".format( + self.total, self.name, time.strftime("%m/%d/%Y %H:%M:%S") + ) + ) + return + + @log("No jobs to run on {}", logger, attr_name="name", iswarn=True) + def log_no_jobs(self): + return + + def run_jobs(self): + if self.total > 0: + self.log_jobs_to_run() + else: + self.log_no_jobs() + return + + if ( + self.cpus is None + ): # todo: need a different way to ensure self.cpus is nonzero number. + raise SystemExit("Number of CPUs not specified.") + + with Pool( + self.cpus, initializer=type(self).init_globals, initargs=(Value("i", 0),) + ) as job_pool: + job_pool.map( + self.run_job, self.generate_job_args(), chunksize=self.chunksize + ) + self.write_checkpoint_file() + + def run_job(self, args): + args = ( + (args,) if isinstance(args, str) else tuple(args or (args,)) + ) # Ensure args are tuples that can be unpacked. 
If no args, args=None, which is falsy, and this evaluates to (None,) + job = self.configure_job(*args) + job.run() + self.nb_done = cnt.value + if ( + self.nb_done == self.total + or int(self.nb_done % float(self.total / 10)) == 0 + ): + self._track_progress() + + @log( + "[{0}]\t{1} of {2} task(s) completed", + logger, + attr_name=["name", "nb_done", "total"], + on_func_exit=True, + ) + def _track_progress(self): + return + + @classmethod + def init_globals(cls, counter): + """Counter code adapted from the answer here: https://stackoverflow.com/a/53621343/4844311""" + global cnt + cnt = counter diff -Nru busco-4.1.4/src/busco/BuscoTools.py busco-5.0.0/src/busco/BuscoTools.py --- busco-4.1.4/src/busco/BuscoTools.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/BuscoTools.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,2359 +0,0 @@ -import os -import re -from collections import defaultdict -import busco -from busco.BuscoLogger import BuscoLogger -from busco.BuscoLogger import LogDecorator as log -from busco.Toolset import Tool -from busco.BuscoConfig import BuscoConfig, BuscoConfigMain -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord -import shutil -import csv -import numpy as np -from shutil import which -from abc import ABCMeta, abstractmethod -from configparser import NoOptionError -import subprocess -from busco.BuscoConfig import BuscoConfigAuto -import time - -# todo: docstrings -logger = BuscoLogger.get_logger(__name__) - - -class ToolException(Exception): - """ - Module-specific exception - """ - def __init__(self, value): - self.value = value - - def __str__(self): - return self.value - - -class BaseRunner(Tool, metaclass=ABCMeta): - - config = None - - def __init__(self): - super().__init__() - self.run_number = 0 - self.input_file = self.config.get("busco_run", "in") - self.main_out = self.config.get("busco_run", "main_out") - self.working_dir = (os.path.join(self.main_out, "auto_lineage") - if 
isinstance(self.config, BuscoConfigAuto) - else self.main_out) - self.lineage_results_dir = self.config.get("busco_run", "lineage_results_dir") - self.run_folder = os.path.join(self.working_dir, self.lineage_results_dir) - self.log_folder = os.path.join(self.main_out, "logs") - self.cpus = self.config.getint("busco_run", "cpu") - self.lineage_dataset = self.config.get("busco_run", "lineage_dataset") - self.domain = self.config.get("busco_run", "domain") - - if not self.check_tool_available(): - raise ToolException("{} tool cannot be found. Please check the 'path' and 'command' parameters " - "provided in the config file. Do not include the command in the " - "path!".format(self.name)) - self.version = self.get_version() - self.check_tool_dependencies() - - self.checkpoint_file = None - - self.logfile_path_out = os.path.join(self.config.get("busco_run", "main_out"), "logs", - "{}_out.log".format(self.name)) - self.logfile_path_err = self.logfile_path_out.rpartition("_out.log")[0] + "_err.log" - - def init_checkpoint_file(self): - self.checkpoint_file = os.path.join(self.output_folder, ".checkpoint") - - def write_checkpoint_file(self): - with open(self.checkpoint_file, "a") as cpt_file: - cpt_file.write("Tool: {}\n".format(self.name)) - cpt_file.write("Version: {}\n".format(self.version)) - cpt_file.write("Run: {}\n".format(self.run_number)) - cpt_file.write("Time: {}\n".format(time.strftime('%m/%d/%Y %H:%M:%S'))) - cpt_file.write("Completed {} jobs\n\n".format(self.total)) - - def check_previous_completed_run(self): - if not os.path.exists(self.checkpoint_file): - return False - else: - with open(self.checkpoint_file, "r") as cpt_file: - lines = cpt_file.readlines() - tool_names = [s.strip().split(": ")[1] for s in lines[0::6]] - tool_versions = [s.strip().split(": ")[1] for s in lines[1::6]] - tool_run_numbers = [s.strip().split(": ")[1] for s in lines[2::6]] - try: - start_search = 0 - while True: - tool_ind = tool_names.index(self.name, start_search) - if 
str(self.version) != str(tool_versions[tool_ind]): - logger.warning("A previous run used {} version {}. " - "The restarted run is using {} version " - "{}".format(self.name, tool_versions[tool_ind], self.name, self.version)) - if int(tool_run_numbers[tool_ind]) == int(self.run_number): - return True - elif int(tool_run_numbers[tool_ind]) < int(self.run_number): - start_search = tool_ind + 1 - else: - raise SystemExit("Something went wrong. Information for {} run {} missing but " - "information for run {} found.".format(self.name, self.run_number, - tool_run_numbers[tool_ind])) - - except ValueError: - return False - - except TypeError: - logger.warning("Unable to parse {} file. Restart mode not available.".format(self.checkpoint_file)) - - @abstractmethod - def check_tool_dependencies(self): - pass - - @abstractmethod - def configure_job(self, *args): - pass - - @property - @abstractmethod - def output_folder(self): - raise NotImplementedError - - @property - @abstractmethod - def name(self): - raise NotImplementedError - - @abstractmethod - def run(self): - if self.version is not None: - logger.debug("Tool: {}".format(self.name)) - logger.debug("Version: {}".format(self.version)) - - @staticmethod - def create_dirs(dirnames): - """ - Create all required directories - - :param dirnames: list of paths already constructed - :return: - """ - if isinstance(dirnames, str): - os.makedirs(dirnames, exist_ok=True) - elif isinstance(dirnames, list): - for d in dirnames: - os.makedirs(d, exist_ok=True) - else: - raise TypeError("'dirnames' should be either a str or a list") - - def check_tool_available(self): - """ - Check tool's availability. - 1. The section ['name'] is available in the config - 2. This section contains keys 'path' and 'command' - 3. 
The string resulted from concatenation of values of these two keys - represents the full path to the command - - :return: True if the tool can be run, False if it is not the case - :rtype: bool - """ - if not self.config.has_section(self.name): - raise ToolException("Section for the tool [{}] is not present in the config file".format(self.name)) - - if not self.config.has_option(self.name, 'path'): - raise ToolException("Key \'path\' in the section [{}] is not present in the config file".format(self.name)) - - if self.config.has_option(self.name, 'command'): - executable = self.config.get(self.name, 'command') - else: - executable = self.name - - self.cmd = os.path.join(self.config.get(self.name, 'path'), executable) - - return which(self.cmd) is not None # True if tool available - - @abstractmethod - def get_version(self): - return - - -class ProdigalRunner(BaseRunner): - - name = "prodigal" - - _gc_run_results = defaultdict(dict) - - def __init__(self): - super().__init__() - self._output_folder = os.path.join(self.main_out, "prodigal_output") - self._pred_genes_dir = os.path.join(self._output_folder, "predicted_genes") - self._tmp_path = os.path.join(self._pred_genes_dir, "tmp") - self.cpus = 1 - self.create_dirs([self._pred_genes_dir, self._tmp_path]) - - # Get genetic_code from dataset.cfg file - # bacteria/archaea=11; Entomoplasmatales,Mycoplasmatales=4 - try: - self._genetic_code = self.config.get("prodigal", "prodigal_genetic_code").split(",") - except NoOptionError: - self._genetic_code = ["11"] - - # Set the ambiguous coding density range - try: - self._cd_upper = float(self.config.get("prodigal", "ambiguous_cd_range_upper")) \ - if len(self._genetic_code) > 1 else 0 - except NoOptionError: - raise SystemExit("Dataset config file does not contain required information. 
Please upgrade datasets.") - - self.current_gc = None - self._current_run_mode = None - self._tmp_name = None - - self.output_faa = os.path.join(self._pred_genes_dir, "predicted.faa") - self._output_fna = os.path.join(self._pred_genes_dir, "predicted.fna") - self.sequences_aa = {} - self.sequences_nt = {} - self.gene_details = {} - - self._input_length = self._get_genome_length() - self._run_mode = ["single", "meta"] if self._input_length > 100000 else ["meta"] - - self.init_checkpoint_file() - self.run_number += 1 - - @property - def output_folder(self): - return self._output_folder - - def check_tool_dependencies(self): - pass - - def generate_job_args(self): - yield - - @log("Genetic code {} selected as optimal", logger, attr_name="current_gc", on_func_exit=True) - def run(self): - """ - 1) If genome length > 100000 run in "single" mode, then "meta" mode if there are no gene predictions. Otherwise - just run in "meta" mode. This is based on the recommendations in the Prodigal docs. - 2) Run once using genetic code 11. This can be overridden if the user includes a spceific genetic code in the - config file. - 3) Check the genome coding density. If the coding density is above the ambiguous range (typically 0.73-0.8) - then continue with the current genetic code. The ambiguous range was determined based on analysis done by Mose - Manni. Otherwise run again on the next genetic code specified. - 4) If the next run still has a genetic density within the ambiguous range, read the stdout log files (formerly - the GFF files) and extract the scores assigned to each gene prediction. Whichever genetic code yields the - greatest mean score is selected. 
- :return: - """ - super().run() - tmp_files = [] - - for ix, m in enumerate(self._run_mode): - self._current_run_mode = m - for g in self._genetic_code: - self.current_gc = g - - file_id = os.path.join(self._tmp_path, - "prodigal_mode_{0}_code_{1}".format(self._current_run_mode, self.current_gc)) - self._tmp_name = "{}.faa".format(file_id) - self.logfile_path_out = "{}_out.log".format(file_id) - self.logfile_path_err = "err".join( - self.logfile_path_out.rsplit("out", 1)) # Replace only the last occurence of "out" substring - self._gc_run_results[self.current_gc].update({"tmp_name": self._tmp_name, - "log_file": self.logfile_path_out}) - - if os.path.exists(self._tmp_name): # Check to see if has already been run with these parameters - coding_density = self._gc_run_results[g]["cd"] - else: - logger.info("Running Prodigal with genetic code {} in {} mode".format(self.current_gc, - self._current_run_mode)) - self.total = 1 - self.run_jobs() - coding_length = self._get_coding_length(self.logfile_path_out) - coding_density = coding_length / self._input_length - self._gc_run_results[self.current_gc].update({"cd": coding_density}) - - logger.debug("Coding density is {}".format(coding_density)) - tmp_files.append(self._gc_run_results[self.current_gc]["tmp_name"]) - - # if the coding density is above the ambiguous range, then continue with these parameters - if coding_density >= self._cd_upper: - break - - # If output files from both runs in "single" mode are empty, run again in "meta" mode, else raise Exception. 
- if not any([os.stat(tmp_file).st_size > 0 for tmp_file in tmp_files]): - if ix + 1 == len(self._run_mode): - raise NoGenesError("Prodigal") - else: - continue - - # if only one genetic code to consider, proceed with it - # if there is more than one possible set of parameters, decide which to use - self.current_gc = self._select_best_gc() if len(tmp_files) > 1 else self._genetic_code[0] - - selected_logfile = self._gc_run_results[self.current_gc]["log_file"] - selected_tmpfile = self._gc_run_results[self.current_gc]["tmp_name"] - - self._organize_prodigal_files(selected_tmpfile, selected_logfile) - self.get_gene_details() - self._gc_run_results[self.current_gc].update({"seqs_aa": self.sequences_aa, "seqs_nt": self.sequences_nt, - "gene_details": self.gene_details}) - break - - return - - def configure_job(self, *args): - tmp_name_nt = self._tmp_name.rpartition(".faa")[0] + ".fna" - - prodigal_job = self.create_job() - prodigal_job.add_parameter("-p") - prodigal_job.add_parameter("%s" % self._current_run_mode) - prodigal_job.add_parameter("-f") - prodigal_job.add_parameter("gff") - prodigal_job.add_parameter("-g") - prodigal_job.add_parameter("%s" % self.current_gc) - prodigal_job.add_parameter("-a") - prodigal_job.add_parameter("%s" % self._tmp_name) - prodigal_job.add_parameter("-d") - prodigal_job.add_parameter("%s" % tmp_name_nt) - prodigal_job.add_parameter("-i") - prodigal_job.add_parameter("%s" % self.input_file) - - return prodigal_job - - def get_gene_details(self): - self.gene_details = defaultdict(list) - - with open(self._output_fna, "rU") as f: - for record in SeqIO.parse(f, "fasta"): - gene_name = record.id - self.sequences_nt[gene_name] = record - gene_start = int(record.description.split()[2]) - gene_end = int(record.description.split()[4]) - self.gene_details[gene_name].append({"gene_start": gene_start, "gene_end": gene_end}) - - with open(self.output_faa, "rU") as f: - for record in SeqIO.parse(f, "fasta"): - self.sequences_aa[record.id] = record 
- - return - - @staticmethod - def _get_coding_length(out_logfile): - total_coding_length = 0 - with open(out_logfile, "r") as f: - for line in f: - if not line.startswith('#'): - try: - start = int(line.split('\t')[3]) - stop = int(line.split('\t')[4]) - total_coding_length += (stop-start) - except IndexError: - continue - except ValueError: - continue - return total_coding_length - - def _get_genome_length(self): - length_seqs = 0 - for line in open(self.input_file): - if not line.startswith(">"): - length_seqs += len(line) - return length_seqs - - def _get_mean_score(self, gc): - logfile = self._gc_run_results[gc]["log_file"] - scores = [] - with open(logfile, "r") as f: - for line in f: - try: - score = re.search(";score=(.+?);", line).group(1) - scores.append(float(score)) - except AttributeError: - continue - mean_score = (sum(scores) / len(scores)) - return mean_score - - def _organize_prodigal_files(self, tmp_file, tmp_logfile): - - shutil.copy(tmp_file, self.output_faa) - shutil.copy(tmp_file.rpartition(".faa")[0] + ".fna", self._output_fna) - - # copy selected log files from tmp/ to logs/ - new_logname = os.path.join(self.log_folder, "prodigal_out.log") - shutil.copy(tmp_logfile, new_logname) - shutil.copy(tmp_logfile.rpartition("_out.log")[0] + "_err.log", - new_logname.rpartition("_out.log")[0] + "_err.log") - return - - def _select_best_gc(self): - gcs, cds = zip(*[[gc, self._gc_run_results[gc]["cd"]] for gc in self._genetic_code]) - sort_order = np.argsort(np.array(cds))[::-1] - gcs_sorted = np.array(gcs)[sort_order] - cds_sorted = np.array(cds)[sort_order] - if abs(cds_sorted[0] - cds_sorted[1]) <= 0.05: - mean_score1 = self._get_mean_score(gcs_sorted[0]) - mean_score2 = self._get_mean_score(gcs_sorted[1]) - gc = gcs_sorted[int(mean_score2 > mean_score1)] - else: - gc = gcs_sorted[0] - return gc - - def get_version(self): - prodigal_version = subprocess.check_output([self.cmd, "-v"], stderr=subprocess.STDOUT, shell=False) - prodigal_version = 
prodigal_version.decode("utf-8") - prodigal_version = prodigal_version.split("\n")[1].split(":")[0] - prodigal_version = prodigal_version.replace("Prodigal V", "") - return prodigal_version - - -class NoGenesError(Exception): - - def __init__(self, gene_predictor): - self.gene_predictor = gene_predictor - - -class HMMERRunner(BaseRunner): - - name = "hmmsearch" - - def __init__(self): - super().__init__() - self._hmmer_output_folder = os.path.join(self.run_folder, "hmmer_output") - self.datasets_version = self.config.get("busco_run", "datasets_version") - self.dataset_creation_date = self.config.get("busco_run", "creation_date") - self.dataset_nb_species = self.config.get("busco_run", "number_of_species") - self.dataset_nb_buscos = self.config.get("busco_run", "number_of_BUSCOs") - self.domain = self.config.get("busco_run", "domain") - - self.single_copy_sequences_folder = os.path.join(self.run_folder, "busco_sequences", - "single_copy_busco_sequences") - self.multi_copy_sequences_folder = os.path.join(self.run_folder, "busco_sequences", - "multi_copy_busco_sequences") - self.fragmented_sequences_folder = os.path.join(self.run_folder, "busco_sequences", - "fragmented_busco_sequences") - self.short_summary_file = os.path.join(self.run_folder, "short_summary.txt") - self.cutoff_dict = {} - self.extra_columns = False - self.log_count = 0 # Dummy variable used to skip logging for intermediate eukaryote pipeline results. 
- self.one_line_summary = None - - # to be initialized before run time - self.input_sequences = None - self.busco_ids = None - self.mode = None - self.gene_details = None - self.results_dir = None - - self.create_dirs([self._hmmer_output_folder, self.single_copy_sequences_folder, - self.multi_copy_sequences_folder, self.fragmented_sequences_folder]) - if self.domain == "eukaryota": - self.initial_results_dir = os.path.join(self._hmmer_output_folder, "initial_run_results") - self.rerun_results_dir = os.path.join(self._hmmer_output_folder, "rerun_results") - self.create_dirs([self.initial_results_dir, self.rerun_results_dir]) - - self.init_checkpoint_file() - - def configure_runner(self, input_sequences, busco_ids, mode, gene_details): - self.run_number += 1 - self.input_sequences = input_sequences - self.busco_ids = busco_ids - self.mode = mode - self.single_copy_buscos = {} - self.multi_copy_buscos = {} - self.fragmented_buscos = {} - self.is_complete = {} - self.is_fragment = {} - self.is_very_large = {} - self.matched_bitscores = {} - self.matched_genes_complete = {} - self.matched_genes_vlarge = {} - self.matched_genes_fragment = {} - self._already_used_genes = set() - self.hmmer_results_lines = [] - self.missing_buscos = [] - self.gene_details = gene_details - if len(self.cutoff_dict) == 0: - self.load_buscos() - - if self.domain == "eukaryota": - if self.run_number == 1: - self.results_dir = self.initial_results_dir - elif self.run_number == 2: - self.results_dir = self.rerun_results_dir - else: - raise ValueError("HMMER should not be run more than twice in the same Run instance.") - else: - self.results_dir = self._hmmer_output_folder - # gene_details can only be None for proteins mode. 
In the other modes the gene locations are written to a file - # after the coordinates are loaded from this attribute - - def configure_job(self, busco_id, seq_filename, output_filename): - - hmmer_job = self.create_job() - hmmer_job.add_parameter("--domtblout") - hmmer_job.add_parameter(os.path.join(self.results_dir, output_filename)) - hmmer_job.add_parameter("--cpu") - hmmer_job.add_parameter("1") - hmmer_job.add_parameter(os.path.join(self.lineage_dataset, "hmms", "{}.hmm".format(busco_id))) - hmmer_job.add_parameter(seq_filename) - return hmmer_job - - def generate_job_args(self): - for busco_id in self.busco_ids: - if busco_id in self.cutoff_dict: - if isinstance(self.input_sequences, str): - output_filename = "{}.out".format(busco_id) - yield busco_id, self.input_sequences, output_filename - elif isinstance(self.input_sequences, list): - input_files = [f for f in self.input_sequences if os.path.basename(f).startswith(busco_id)] - for seq_filename in input_files: - filename_parts = os.path.basename(seq_filename).rpartition(".faa") - output_filename = filename_parts[0] + ".out" + filename_parts[-1] - yield busco_id, seq_filename, output_filename - - @property - def output_folder(self): - return self._hmmer_output_folder - - def load_buscos(self): - """ - Load all BUSCOs for the lineage, along with their cutoff lengths and scores. - :return: - """ - self.cutoff_dict = defaultdict(dict) - self._load_length() - self._load_score() - self.cutoff_dict = dict(self.cutoff_dict) - return - - def run(self): - """ - Create a HMMER job for each BUSCO. Each job searches the input sequence file for matches for the BUSCO gene. 
- :return: - """ - super().run() - self.total = self._count_jobs() - self.run_jobs() - - def _count_jobs(self): - n = 0 - for busco_id in self.busco_ids: - if busco_id in self.cutoff_dict: - if isinstance(self.input_sequences, str): - n += 1 - elif isinstance(self.input_sequences, list): - input_files = [f for f in self.input_sequences if os.path.basename(f).startswith(busco_id)] - n += len(input_files) - return n - - def get_version(self): - """ - check the Tool has the correct version - :raises SystemExit: if the version is not correct - """ - hmmer_version = subprocess.check_output([self.cmd, "-h"], stderr=subprocess.STDOUT, shell=False) - hmmer_version = hmmer_version.decode("utf-8") - try: - hmmer_version = hmmer_version.split("\n")[1].split()[2] - hmmer_version = float(hmmer_version[:3]) - except ValueError: - # to avoid a crash with a super old version - hmmer_version = hmmer_version.split("\n")[1].split()[1] - hmmer_version = float(hmmer_version[:3]) - finally: - return hmmer_version - - def check_tool_dependencies(self): - """ - check dependencies on tools - :raises SystemExit: if a Tool version is not supported - """ - # check hmm version - if not self.version >= BuscoConfig.HMMER_VERSION: - raise SystemExit( - "HMMer version detected is not supported, please use HMMer v.{} +".format(BuscoConfig.HMMER_VERSION)) - return - - def process_output(self): - # Re-initialize dictionaries as defaultdicts - necessary because defaultdicts are not picklable and so they - # cannot appear in the __init__ when using multiprocessing within the class - self.matched_genes_complete = defaultdict(list) - self.matched_genes_vlarge = defaultdict(list) - self.matched_genes_fragment = defaultdict(list) - self.matched_bitscores = defaultdict(lambda: defaultdict(list)) - self.is_complete = defaultdict(lambda: defaultdict(list)) # dict of a dict of lists of dicts - self.is_fragment = defaultdict(lambda: defaultdict(list)) - self.is_very_large = defaultdict(lambda: 
defaultdict(list)) - - self._load_matched_genes() - self._filter() - self._consolidate_busco_lists() - - self.matched_bitscores = dict(self.matched_bitscores) - self.matched_genes_complete = dict(self.matched_genes_complete) - self.matched_genes_vlarge = dict(self.matched_genes_vlarge) - self.matched_genes_fragment = dict(self.matched_genes_fragment) - self.is_complete = dict(self.is_complete) - self.is_fragment = dict(self.is_fragment) - self.is_very_large = dict(self.is_very_large) - return - - @staticmethod - def _get_matched_lengths(nested_dict): - """ - For each entry in a nested dictionary, return a dict with the total lengths of all gene matches for each entry. - :param nested_dict: - :type nested_dict: - :return: - :rtype: - """ - total_len = defaultdict(int) - for entry in nested_dict: - for hit in nested_dict[entry]: - total_len[entry] += hit[1] - hit[0] - return total_len - - def _parse_hmmer_output(self, filename, busco_query): - """ - Read and parse HMMER output file. - :param filename: Name of HMMER output file - :param busco_query: Basename of file, used to identify BUSCO - :type filename: str - :type busco_query: str - :return: Dictionary of (gene_id, total_matched_length) pairs - :rtype: dict - """ - matched_lengths = defaultdict(int) - - with open(filename, "r") as f: - - # Read HMMER output file - for line in f: - if line.startswith("#"): - continue - else: - try: - line = line.strip().split() - gene_id = line[0] - bit_score = float(line[7]) - hmm_start = int(line[15]) - hmm_end = int(line[16]) - - # Store bitscore matches for each gene match. If match below cutoff, discard. 
- if bit_score >= float(self.cutoff_dict[busco_query]["score"]): - # todo: introduce upper bound - consult to see what a reasonable value would be - self.matched_bitscores[busco_query][gene_id].append(bit_score) - else: - continue - - matched_lengths[gene_id] += (hmm_end - hmm_start) - - except IndexError as e: - SystemExit(e, "Cannot parse HMMER output file {}".format(filename)) - return matched_lengths - - def _sort_matches(self, matched_lengths, busco_query): - """ - The HMMER gene matches are sorted into "complete", "v_large" and "fragmented" matches based on a comparison - with the cutoff value specified in the dataset cutoff_scores file - :param matched_lengths: dict of (gene_id, total_matched_length) pairs - :param busco_query: BUSCO identifier - :type matched_lengths: dict - :type busco_query: str - :return: busco_complete, busco_vlarge, busco_fragment - three dictionaries of the form - {gene_id: [{"bitscore": float, "length": int}, {...}, ...], ...} - :rtype: dict - """ - busco_complete = defaultdict(list) - busco_vlarge = defaultdict(list) - busco_fragment = defaultdict(list) - - # Determine whether matched gene represents a complete, very_large or fragment of a BUSCO - for gene_id, size in matched_lengths.items(): - - # Kind of like a z-score, but it is compared with a cutoff value, not a mean - zeta = (self.cutoff_dict[busco_query]["length"] - size) \ - / self.cutoff_dict[busco_query]["sigma"] - - # gene match can only be either complete, v_large or fragment - if -2 <= zeta <= 2: - busco_type = busco_complete - match_type = self.matched_genes_complete - elif zeta < -2: - busco_type = busco_vlarge - match_type = self.matched_genes_vlarge - else: - busco_type = busco_fragment - match_type = self.matched_genes_fragment - - # Add information about match to dict - busco_type[gene_id].append(dict({"bitscore": max(self.matched_bitscores[busco_query][gene_id]), - "length": matched_lengths[gene_id]})) - # Reference which busco_queries are associated with each 
gene match - match_type[gene_id].append(busco_query) - - return busco_complete, busco_vlarge, busco_fragment - - def _load_matched_genes(self): - """ - Load all gene matches from HMMER output and sort into dictionaries depending on match quality - (complete, v_large, fragment). - :return: - """ - if self.run_number == 1: - hmmer_results_files = sorted([os.path.join(self.results_dir, f) for f in os.listdir(self.results_dir)]) - elif self.run_number == 2: - hmmer_initial_run_files = [os.path.join(self.initial_results_dir, f) - for f in os.listdir(self.initial_results_dir)] - hmmer_rerun_files = [os.path.join(self.rerun_results_dir, f) - for f in os.listdir(self.rerun_results_dir)] - hmmer_results_files = sorted(hmmer_initial_run_files + hmmer_rerun_files) - else: - raise ValueError("HMMER should not be run more than twice in the same Run instance.") - - for filename in hmmer_results_files: - busco_query = str(os.path.basename(filename).split(".")[0]) - matched_lengths = self._parse_hmmer_output(filename, busco_query) - busco_complete, busco_vlarge, busco_fragment = self._sort_matches(matched_lengths, busco_query) - - # Add all information for this busco_id to the full dictionary - if len(busco_complete) > 0: - self.is_complete[busco_query].update(busco_complete) - if len(busco_vlarge) > 0: - self.is_very_large[busco_query].update(busco_vlarge) - if len(busco_fragment) > 0: - self.is_fragment[busco_query].update(busco_fragment) - - return - - def _update_used_gene_set(self, busco_dict): - """ - Update set of already used genes to prevent processing the same gene twice. 
- :param busco_dict: One of [self.is_complete, self.is_very_large, self.is_fragment] - :type busco_dict: dict - :return: - """ - for entries in busco_dict.values(): - for gene_id in entries: - self._already_used_genes.add(gene_id) - return - - def _remove_lower_ranked_duplicates(self, busco_dict): - """ - Remove any genes and/or busco matches from input dictionary if they have previously been assigned to a better - quality match. - :param busco_dict: one of [self.is_very_large, self.is_fragment] - :type busco_dict: dict - :return: - """ - # Determine which match ranks to worry about - if busco_dict == self.is_very_large: - higher_rank_buscos = self.is_complete.keys() - matched_genes = self.matched_genes_vlarge - elif busco_dict == self.is_fragment: - higher_rank_buscos = list(self.is_complete.keys()) + list(self.is_very_large.keys()) - matched_genes = self.matched_genes_fragment - else: - raise SystemExit("Unrecognized dictionary of BUSCOs.") - - for busco_id in list(busco_dict.keys()): - matches = busco_dict[busco_id] - # Remove any buscos that appear in higher ranking dictionaries - if busco_id in higher_rank_buscos: - busco_dict.pop(busco_id) - for gene_id in matches: - matched_genes[gene_id] = [x for x in matched_genes[gene_id] if x != busco_id] # Remove all occurences of busco_id - if len(matched_genes[gene_id]) == 0: - matched_genes.pop(gene_id) - continue - - # Remove any genes that have previously been processed under a different and higher ranking busco match - for gene_id in list(matches.keys()): - if gene_id in self._already_used_genes: - busco_dict[busco_id].pop(gene_id) - matched_genes[gene_id] = [x for x in matched_genes[gene_id] if x != busco_id] # Remove all occurences of busco_id - if len(busco_dict[busco_id]) == 0: - busco_dict.pop(busco_id) - if len(matched_genes[gene_id]) == 0: - matched_genes.pop(gene_id) - - return - - def _remove_duplicates(self): - """ - Remove duplicate gene matches of lesser importance, i.e. 
keep the complete ones, then the very large ones and - finally the fragments. - Also remove duplicate BUSCO ID matches of lower importance. - Then search for any duplicate gene matches within the same rank for different BUSCOs and keep only the highest - scoring gene match. - :return: - """ - self._update_used_gene_set(self.is_complete) - self._remove_lower_ranked_duplicates(self.is_very_large) - self._update_used_gene_set(self.is_very_large) - self._remove_lower_ranked_duplicates(self.is_fragment) - self._remove_remaining_duplicate_matches(self.is_complete) - self._remove_remaining_duplicate_matches(self.is_very_large) - self._remove_remaining_duplicate_matches(self.is_fragment) - return - - def _remove_remaining_duplicate_matches(self, busco_dict): - """ - For any genes matched under more than one BUSCO, keep only the highest scoring match in the input dictionary. - :param busco_dict: one of [self.is_complete, self.is_very_large, self.is_fragment] - :type busco_dict: dict - :return: - """ - # For a given input dictionary {busco_id: gene_ids}, make sure we are using the corresponding dictionary - # {gene_id: busco_matches} - if busco_dict == self.is_complete: - matched_genes = self.matched_genes_complete - elif busco_dict == self.is_very_large: - matched_genes = self.matched_genes_vlarge - elif busco_dict == self.is_fragment: - matched_genes = self.matched_genes_fragment - else: - raise SystemExit("Unrecognized dictionary of BUSCOs.") - - # Keep the best scoring gene if gene is matched by more than one busco with the same match rank - for gene_id, buscos in matched_genes.items(): - if len(buscos) > 1: - busco_bitscores = [] - busco_matches = [] - for busco in buscos: - matches = busco_dict[busco][gene_id] - for match in matches: - bitscore = match["bitscore"] - busco_bitscores.append(bitscore) - busco_matches.append(busco) - - if len(set(buscos)) == 1: # If only one busco is matched twice (initial run and rerun), don't remove it - continue - best_match_ind = 
max(range(len(busco_bitscores)), key=busco_bitscores.__getitem__) - buscos = [x for x in buscos if x != busco_matches[best_match_ind]] - # Remove lower scoring duplicates from dictionary. - # Note for future development: the matched_genes dictionary is not updated in this method when - # duplicates are removed from busco_dict - for duplicate in list(set(buscos)): - # Use set to account for any duplicate entries (matched in both initial run and rerun) - busco_dict[duplicate].pop(gene_id) - if len(busco_dict[duplicate]) == 0: - busco_dict.pop(duplicate) - return - - def _remove_low_scoring_matches(self, busco_dict): - """ - Go through input dictionary and remove any gene matches that score less than 85% of the top gene match score - for each BUSCO. - :param busco_dict: one of [self.is_complete, self.is_very_large, self.is_fragment] - :type busco_dict: dict - :return: - """ - empty_buscos = [] - - # For each busco, keep only matches within 85% of top bitscore match for that busco - for busco_id, matches in busco_dict.items(): - if len(matches) > 1: - _, max_bitscore = self._get_best_scoring_match(matches) - # Go through all matches again, removing any below the threshold - for gene_id in list(matches.keys()): - match_info = matches[gene_id] - matches_to_remove = [] - for m, match in enumerate(match_info): - if match["bitscore"] < 0.85*max_bitscore: - matches_to_remove.append(m) - - # Remove dict from list of dicts. 
Safe way to delete without risking list size changing during - # iteration - for ind in sorted(matches_to_remove, reverse=True): - del match_info[ind] - - # Record dictionary address of empty gene records - if len(busco_dict[busco_id][gene_id]) == 0: - empty_buscos.append((busco_id, gene_id)) - - # Safe way to delete empty records without risking dictionary size changing while iterating - for item in empty_buscos: - busco_id, gene_id = item - busco_dict[busco_id].pop(gene_id) - - return - - @staticmethod - def _get_best_scoring_match(gene_matches): - """ - Find the highest bitscore in all gene matches. - :param gene_matches: dictionary of the form - {gene_id: [{"bitscore": float, "length": int}, {"bitscore": float, "length": int}, ...], ...} - :type gene_matches: dict - :return: best_match_gene, best_match_bitscore - :rtype: str, float - """ - match_scores = [] - match_genes = [] - for gene_id, matches in gene_matches.items(): - for match in matches: - bitscore = match["bitscore"] - match_scores.append(bitscore) - match_genes.append(gene_id) - best_match_ind = max(range(len(match_scores)), key=match_scores.__getitem__) - best_match_gene = match_genes[best_match_ind] - best_match_bitscore = match_scores[best_match_ind] - return best_match_gene, best_match_bitscore - - def _filter(self): - """ - Remove all duplicate matches and any matches below 85% of the top match for each BUSCO. - :return: - """ - self._remove_duplicates() - self._remove_low_scoring_matches(self.is_complete) - self._remove_low_scoring_matches(self.is_very_large) - self._remove_low_scoring_matches(self.is_fragment) - return - - def _consolidate_busco_lists(self): - """ - Sort BUSCO matches into single-copy, multi-copy and fragments. - Only the highest scoring fragment for each BUSCO is kept. 
- :return: - """ - for busco_dict in [self.is_complete, self.is_very_large]: - for busco_id, gene_matches in busco_dict.items(): - if len(gene_matches) == 1: - self.single_copy_buscos[busco_id] = busco_dict[busco_id] - else: - self.multi_copy_buscos[busco_id] = busco_dict[busco_id] - - for busco_id, gene_matches in self.is_fragment.items(): - if len(gene_matches) > 1: - best_fragment, _ = self._get_best_scoring_match(gene_matches) - self.fragmented_buscos[busco_id] = {best_fragment: self.is_fragment[busco_id][best_fragment]} - else: - self.fragmented_buscos[busco_id] = gene_matches - return - - def load_links_info(self): - links_info = defaultdict(dict) - links_file = os.path.join(self.lineage_dataset, "links_to_{}.txt".format(self.datasets_version.upper())) - if os.path.exists(links_file): - with open(links_file, newline='') as f: - contents = csv.reader(f, delimiter="\t") - for row in contents: - busco_id, description, link = row - links_info[busco_id]["description"] = description - links_info[busco_id]["link"] = link - return links_info - - def _format_output_lines(self, busco_dict, label): - """ - Format BUSCO matches from input dictionary into output lines for writing to a file. 
- :param busco_dict: one of [self.single_copy_buscos, self.multi_copy_buscos, self.fragmented_buscos] - :type busco_dict: dict - :return: output_lines - :rtype: list - """ - output_lines = [] - - links_info = self.load_links_info() - - for busco, matches in busco_dict.items(): - for gene_id, match_info in matches.items(): - for m, match in enumerate(match_info): - bit_score = match["bitscore"] - match_length = match["length"] - - if self.mode == "proteins" or self.mode == "transcriptome": - try: - desc = links_info[busco]["description"] - link = links_info[busco]["link"] - self.extra_columns = True - output_lines.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(busco, label, gene_id, bit_score, - match_length, link, desc)) - except KeyError: - output_lines.append("{}\t{}\t{}\t{}\t{}\n".format(busco, label, gene_id, bit_score, - match_length)) - elif self.mode == "genome": - scaffold = self.gene_details[gene_id][m] - location_pattern = ":{}-{}".format(scaffold["gene_start"], scaffold["gene_end"]) - if gene_id.endswith(location_pattern): - gene_id = gene_id.replace(location_pattern, "") - try: - desc = links_info[busco]["description"] - link = links_info[busco]["link"] - self.extra_columns = True - output_lines.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( - busco, label, gene_id, scaffold["gene_start"], scaffold["gene_end"], bit_score, - match_length, link, desc)) - except KeyError: - output_lines.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( - busco, label, gene_id, scaffold["gene_start"], scaffold["gene_end"], bit_score, - match_length)) - return output_lines - - def _create_output_content(self): - """ - Format output for all BUSCO matches. 
- :return: output_lines - :rtype: list - """ - output_lines = [] - dict_labels = {"Complete": self.single_copy_buscos, - "Duplicated": self.multi_copy_buscos, - "Fragmented": self.fragmented_buscos} - for label, busco_dict in dict_labels.items(): - output_lines += self._format_output_lines(busco_dict, label) - - return output_lines - - def _list_missing_buscos(self): - """ - Create a list of all BUSCOs that are missing after processing the HMMER output. - :return: output_lines, missing_buscos - :rtype: list, list - """ - output_lines = [] - for busco_group in self.cutoff_dict: - if not any(busco_group in d for d in [self.is_complete, self.is_very_large, self.is_fragment]): - output_lines.append("{}\tMissing\n".format(busco_group)) - self.missing_buscos.append(busco_group) - - if len(self.missing_buscos) == len(self.cutoff_dict): - logger.warning("BUSCO did not find any match. Make sure to check the log files if this is unexpected.") - - return output_lines, self.missing_buscos - - def _load_length(self): - """ - This function loads the length cutoffs file - :raises SystemExit: if the lengths_cutoff file cannot be read - """ - lengths_cutoff_file = os.path.join(self.lineage_dataset, "lengths_cutoff") - try: - with open(lengths_cutoff_file, "r") as f: - for line in f: - line = line.strip().split() - try: - taxid = line[0] - sd = float(line[2]) - length = float(line[3]) - - self.cutoff_dict[taxid]["sigma"] = sd - # there is an arthropod profile with sigma 0 - # that causes a crash on divisions - if sd == 0.0: - self.cutoff_dict[taxid]["sigma"] = 1 - self.cutoff_dict[taxid]["length"] = length - except IndexError as e: - raise SystemExit(e, "Error parsing the lengths_cutoff file.") - except IOError: - raise SystemExit("Impossible to read the lengths in {}".format(os.path.join(lengths_cutoff_file))) - return - - def _load_score(self): - """ - This function loads the score cutoffs file - :raises SystemExit: if the scores_cutoff file cannot be read - """ - 
scores_cutoff_file = os.path.join(self.lineage_dataset, "scores_cutoff") - try: - # open target scores file - with open(scores_cutoff_file, "r") as f: - for line in f: - line = line.strip().split() - try: - taxid = line[0] - score = float(line[1]) - self.cutoff_dict[taxid]["score"] = score - except IndexError as e: - raise SystemExit(e, "Error parsing the scores_cutoff file.") - except IOError: - raise SystemExit("Impossible to read the scores in {}".format(scores_cutoff_file)) - return - - def write_buscos_to_file(self, sequences_aa, sequences_nt=None): - """ - Write BUSCO matching sequences to output fasta files. Each sequence is printed in a separate file and both - nucleotide and amino acid versions are created. - :param sequences_aa: dict - :param sequences_nt: dict - :return: - """ - for busco_type in ["single_copy", "multi_copy", "fragmented"]: - if busco_type == "single_copy": - output_dir = self.single_copy_sequences_folder - busco_matches = self.single_copy_buscos - elif busco_type == "multi_copy": - output_dir = self.multi_copy_sequences_folder - busco_matches = self.multi_copy_buscos - elif busco_type == "fragmented": - output_dir = self.fragmented_sequences_folder - busco_matches = self.fragmented_buscos - - for busco, gene_matches in busco_matches.items(): - try: - aa_seqs, nt_seqs = zip(*[(sequences_aa[gene_id], sequences_nt[gene_id]) - for gene_id in gene_matches]) - with open(os.path.join(output_dir, "{}.fna".format(busco)), "w") as f2: - SeqIO.write(nt_seqs, f2, "fasta") - except TypeError: - aa_seqs = [sequences_aa[gene_id] for gene_id in gene_matches] - with open(os.path.join(output_dir, "{}.faa".format(busco)), "w") as f1: - SeqIO.write(aa_seqs, f1, "fasta") - - def write_hmmer_results(self): - """ - Create two output files: one with information on all BUSCOs for the given dataset and the other with a list of - all BUSCOs that were not found. 
- :return: - """ - - with open(os.path.join(self.run_folder, "full_table.tsv"), "w") as f_out: - - output_lines = self._create_output_content() - self._write_output_header(f_out) - - with open(os.path.join(self.run_folder, "missing_busco_list.tsv"), "w") as miss_out: - - self._write_output_header(miss_out, missing_list=True) - - # todo: move to calculate busco percentages - missing_buscos_lines, missing_buscos = self._list_missing_buscos() - output_lines += missing_buscos_lines - - for missing_busco in sorted(missing_buscos): - miss_out.write("{}\n".format(missing_busco)) - - sorted_output_lines = self._sort_lines(output_lines) - for busco in sorted_output_lines: - f_out.write(busco) - return - - @staticmethod - def _sort_lines(lines): - sorted_lines = sorted(lines, key=lambda x: int(x.split("\t")[0].split("at")[0])) - return sorted_lines - - def produce_hmmer_summary(self): - single_copy, multi_copy, only_fragments, total_buscos = self._get_busco_percentages() - - self.hmmer_results_lines.append("***** Results: *****\n\n") - self.one_line_summary = "C:{}%[S:{}%,D:{}%],F:{}%,M:{}%,n:{}\t{}\n".format( - round(self.s_percent + self.d_percent, 1), self.s_percent, self.d_percent, - self.f_percent, abs(round(100 - self.s_percent - self.d_percent - self.f_percent, 1)), total_buscos, " ") - self.hmmer_results_lines.append(self.one_line_summary) - self.hmmer_results_lines.append("{}\tComplete BUSCOs (C)\t\t\t{}\n".format(single_copy + multi_copy, " ")) - self.hmmer_results_lines.append("{}\tComplete and single-copy BUSCOs (S)\t{}\n".format(single_copy, " ")) - self.hmmer_results_lines.append("{}\tComplete and duplicated BUSCOs (D)\t{}\n".format(multi_copy, " ")) - self.hmmer_results_lines.append("{}\tFragmented BUSCOs (F)\t\t\t{}\n".format(only_fragments, " ")) - self.hmmer_results_lines.append("{}\tMissing BUSCOs (M)\t\t\t{}\n".format( - total_buscos - single_copy - multi_copy - only_fragments, " ")) - self.hmmer_results_lines.append("{}\tTotal BUSCO groups 
searched\t\t{}\n".format(total_buscos, " ")) - - if isinstance(self.config, BuscoConfigAuto): - self._one_line_hmmer_summary() - elif self.domain == "eukaryota" and self.log_count == 0: - self.log_count += 1 - self._produce_full_hmmer_summary_debug() - else: - self._one_line_hmmer_summary() - - with open(self.short_summary_file, "w") as summary_file: - - self._write_output_header(summary_file, no_table_header=True) - summary_file.write("# Summarized benchmarking in BUSCO notation for file {}\n" - "# BUSCO was run in mode: {}\n\n".format(self.input_file, self.mode)) - - for line in self.hmmer_results_lines: - summary_file.write("\t{}".format(line)) - - if self.config.getboolean("busco_run", "auto-lineage") and isinstance(self.config, BuscoConfigMain) \ - and hasattr(self.config, "placement_files"): - summary_file.write("\nPlacement file versions:\n") - for placement_file in self.config.placement_files: - summary_file.write("{}\n".format(placement_file)) - - return - - @log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True) - def _produce_full_hmmer_summary(self): - return - - @log("{}", logger, attr_name="hmmer_results_lines", apply="join", on_func_exit=True, debug=True) - def _produce_full_hmmer_summary_debug(self): - return - - @log("{}", logger, attr_name="one_line_summary", on_func_exit=True) - def _one_line_hmmer_summary(self): - self.one_line_summary = "Results:\t{}".format(self.one_line_summary) - return - - def _write_output_header(self, file_object, missing_list=False, no_table_header=False): - """ - Write a standardized file header containing information on the BUSCO run. 
- :param file_object: Opened file object ready for writing - :type file_object: file - :return: - """ - file_object.write("# BUSCO version is: {} \n" - "# The lineage dataset is: {} (Creation date: {}, number of species: {}, number of BUSCOs: {}" - ")\n".format(busco.__version__, os.path.basename(self.lineage_dataset), - self.dataset_creation_date, self.dataset_nb_species, self.dataset_nb_buscos)) - # if isinstance(self._config, BuscoConfigMain): # todo: wait until rerun command properly implemented again - # file_object.write("# To reproduce this run: {}\n#\n".format(self._rerun_cmd)) - - if no_table_header: - pass - elif missing_list: - file_object.write("# Busco id\n") - elif self.mode == "proteins" or self.mode == "transcriptome": - if self.extra_columns: - file_object.write("# Busco id\tStatus\tSequence\tScore\tLength\tOrthoDB url\tDescription\n") - else: - file_object.write("# Busco id\tStatus\tSequence\tScore\tLength\n") - elif self.mode == "genome": - if self.extra_columns: - file_object.write( - "# Busco id\tStatus\tSequence\tGene Start\tGene End\tScore\tLength\tOrthoDB url\tDescription\n") - else: - file_object.write("# Busco id\tStatus\tSequence\tGene Start\tGene End\tScore\tLength\n") - - return - - def _get_busco_percentages(self): - self.single_copy = len(self.single_copy_buscos) # int - self.multi_copy = len(self.multi_copy_buscos) # int - self.only_fragments = len(self.fragmented_buscos) # int - self.total_buscos = len(self.cutoff_dict) - - # Get percentage of each kind of BUSCO match - self.s_percent = abs(round((self.single_copy / self.total_buscos) * 100, 1)) - self.d_percent = abs(round((self.multi_copy / self.total_buscos) * 100, 1)) - self.f_percent = abs(round((self.only_fragments / self.total_buscos) * 100, 1)) - - return self.single_copy, self.multi_copy, self.only_fragments, self.total_buscos - - -class MKBLASTRunner(BaseRunner): - - name = "makeblastdb" - - def __init__(self): - super().__init__() - self.db_path = 
os.path.join(self.config.get("busco_run", "main_out"), "blast_db") - self.output_db = os.path.join(self.db_path, os.path.basename(self.input_file)) - self.create_dirs(self.db_path) - self.total = 1 - self.init_checkpoint_file() - self.run_number += 1 - - @log("Creating BLAST database with input file", logger) - def configure_job(self, *args): - mkblast_job = self.create_job() - mkblast_job.add_parameter("-in") - mkblast_job.add_parameter(self.input_file) - mkblast_job.add_parameter("-dbtype") - mkblast_job.add_parameter("nucl") - mkblast_job.add_parameter("-out") - mkblast_job.add_parameter(self.output_db) - return mkblast_job - - def run(self): - super().run() - if os.path.exists(self.db_path) and len(os.listdir(self.db_path)) > 0: - return - - self.run_jobs() - - def generate_job_args(self): - yield - - def get_version(self): - mkblastdb_version_call = subprocess.check_output([self.cmd, "-version"], stderr=subprocess.STDOUT, shell=False) - mkblastdb_version = ".".join(mkblastdb_version_call.decode("utf-8").split("\n")[0].split()[1].rsplit(".")) - - return mkblastdb_version - - def check_tool_dependencies(self): - pass - - @property - def output_folder(self): - return self.db_path - - -class TBLASTNRunner(BaseRunner): - - name = "tblastn" - - MAX_FLANK = 20000 - - def __init__(self): - self.coords = {} - super().__init__() - self._output_folder = os.path.join(self.run_folder, "blast_output") - self.output_seqs = os.path.join(self._output_folder, "sequences") - self.create_dirs([self._output_folder, self.output_seqs]) - self.total = 1 - - self.e_v_cutoff = self.config.getfloat("busco_run", "evalue") - self.region_limit = self.config.getint("busco_run", "limit") - self.flank = self._define_flank() - - self.init_checkpoint_file() - - def configure_runner(self, blast_db, missing_and_frag_only, ancestral_variants, incomplete_buscos): - self.run_number += 1 - self.blast_db = blast_db - self.missing_and_frag_only = missing_and_frag_only - self.ancestral_variants = 
ancestral_variants - self.incomplete_buscos = incomplete_buscos - - self.ancestral_sfx = "_variants" if self.ancestral_variants else "" - self.ancestral_file = os.path.join(self.lineage_dataset, "ancestral{}".format(self.ancestral_sfx)) - self.query_file = os.path.join(self.lineage_dataset, "ancestral{}".format(self.ancestral_sfx)) - self.output_suffix = "_missing_and_frag_rerun" if self.missing_and_frag_only else "" - self.rerun_query_file = os.path.join(self._output_folder, - "ancestral{}{}".format(self.ancestral_sfx, self.output_suffix)) - if self.missing_and_frag_only and self.ancestral_variants: - self._extract_incomplete_buscos_ancestral() - - self.blast_filename = os.path.join(self._output_folder, "tblastn{}.tsv".format(self.output_suffix)) - self.coords_filename = os.path.join(self._output_folder, "coordinates{}.tsv".format(self.output_suffix)) - - def configure_job(self, *args): - tblastn_job = self.create_job() - tblastn_job.add_parameter("-evalue") - tblastn_job.add_parameter(str(self.e_v_cutoff)) - tblastn_job.add_parameter("-num_threads") - tblastn_job.add_parameter(str(self.cpus)) - tblastn_job.add_parameter("-query") - tblastn_job.add_parameter(self.query_file) - tblastn_job.add_parameter("-db") - tblastn_job.add_parameter(self.blast_db) - tblastn_job.add_parameter("-out") - tblastn_job.add_parameter(self.blast_filename) - tblastn_job.add_parameter("-outfmt") - tblastn_job.add_parameter("7") - return tblastn_job - - @property - def output_folder(self): - return self._output_folder - - def _define_flank(self): - """ - TODO: Add docstring - :return: - """ - try: - size = os.path.getsize(self.input_file) / 1000 # size in mb - flank = int(size / 50) # proportional flank size - # Ensure value is between 5000 and MAX_FLANK - flank = min(max(flank, 5000), type(self).MAX_FLANK) - except IOError: # Input data is only validated during run_analysis. This will catch any IO issues before that. 
- raise SystemExit("Impossible to read the fasta file {}".format(self.input_file)) - - return flank - - @log("Running a BLAST search for BUSCOs against created database", logger) - def run(self): - super().run() - self.run_jobs() - self._check_output() - return - - def check_tool_dependencies(self): - if ".".join(self.version.split(".")[:-1]) not in ["2.2", "2.3"] and self.version != "2.10.1+": - # Known problems with multithreading on BLAST 2.4-2.10.0. - logger.warning("You are using BLAST version {}. This is known to yield inconsistent results when " - "multithreading. BLAST will run on a single core as a result. For performance improvement, " - "please upgrade to BLAST 2.10.1+.".format(self.version)) - self.cpus = 1 - - def get_version(self): - tblastn_version_call = subprocess.check_output([self.cmd, "-version"], stderr=subprocess.STDOUT, shell=False) - tblastn_version = ".".join(tblastn_version_call.decode("utf-8").split("\n")[0].split()[1].rsplit(".")) - - return tblastn_version - - def generate_job_args(self): - yield - - def _check_output(self): - # check that blast worked - if not os.path.exists(self.blast_filename): - raise SystemExit("tblastn failed!") - - # check that the file is not truncated - with open(self.blast_filename, "r") as f: - try: - if "processed" not in f.readlines()[-1]: - raise SystemExit("tblastn has ended prematurely (the result file lacks the expected final line), " - "which will produce incomplete results in the next steps ! This problem likely " - "appeared in blast+ 2.4 and seems not fully fixed in 2.6. It happens only when " - "using multiple cores. You can use a single core (-c 1) or downgrade to " - "blast+ 2.2.x, a safe choice regarding this issue. 
See blast+ documentation for " - "more information.") - - except IndexError: - # if the tblastn result file is empty, for example in phase 2 - # if 100% was found in phase 1 - pass - return - - def _extract_incomplete_buscos_ancestral(self): - - logger.info("Extracting missing and fragmented buscos from the file {}...".format( - os.path.basename(self.ancestral_file))) - - matched_seqs = [] - busco_ids_retrieved = set() - with open(self.ancestral_file, "rU") as anc_file: - - for record in SeqIO.parse(anc_file, "fasta"): - if any(record.id.startswith(b) for b in self.incomplete_buscos): - # Remove the ancestral variant identifier ("_1" etc) so it matches all other BUSCO IDs. - # The identifier is still present in the "name" and "description" Sequence Record attributes. - logger.debug("Found ancestral proteins for {}".format(record.id)) - record.id = record.id.split("_")[0] - busco_ids_retrieved.add(record.id) - matched_seqs.append(record) - - unmatched_incomplete_buscos = list(set(self.incomplete_buscos) - set(busco_ids_retrieved)) - if len(unmatched_incomplete_buscos) > 0: - logger.debug("The BUSCO ID(s) {} were not found in the file {}".format( - unmatched_incomplete_buscos, os.path.basename(self.ancestral_file))) - - self.query_file = self.rerun_query_file - with open(self.query_file, "w") as out_file: # Create new query file for second tblastn run - SeqIO.write(matched_seqs, out_file, "fasta") - - return - - def _get_all_boundaries(self, locations): - sorted_locs = sorted(locations, key=lambda x: int(x[0])) - all_boundaries = [sorted_locs[0]] - for loc in sorted_locs[1:]: - overlap, boundary = self._get_overlap(all_boundaries[-1], loc) - if overlap > 0: - all_boundaries[-1] = boundary - else: - all_boundaries.append(boundary) - return all_boundaries - - def get_coordinates(self): - self.coords = self._parse_blast_output() - if self.ancestral_variants: - self.coords = self._select_busco_variants() - self._prune() - return - - def _get_largest_regions(self, 
candidate_contigs, coords, busco_group): - size_lists = [] - - for contig in candidate_contigs: - potential_locations = coords[busco_group][contig]["busco_coords"] - final_regions = self._get_all_boundaries(potential_locations) - - # Get sum of all potential match sizes for a contig - size_lists.append(self._sum_all_region_sizes(final_regions)) - - return size_lists - - @staticmethod - def _get_overlap(a, b): - """ - This function checks whether two regions overlap and returns the length of the overlap region along with the - boundaries of both regions combined as a [start, stop] list. - - :param a: first region, start and end - :type a: list - :param b: second region, start and end - :type b: list - :returns: overlap, boundary - :rtype: int, list - """ - a_start, a_end = a - b_start, b_end = b - overlap = min(a_end, b_end) - max(a_start, b_start) - if overlap > 0: - boundary = [min(a_start, b_start), max(a_end, b_end)] - elif b_start > a_start: - boundary = b - else: - boundary = a - return max(0, overlap), boundary - - def _parse_blast_output(self): - """ - Read the Blast output - """ - coords = defaultdict(lambda: defaultdict(defaultdict)) # dict of busco_id -> contig_id -> {info} - with open(self.blast_filename, "r") as blast_file: - for line in blast_file: - if line.startswith("#"): - continue - else: - try: - line = line.strip().split() - busco_name = line[0] - contig_id = line[1] - busco_start = int(line[6]) - busco_end = int(line[7]) - contig_start = int(line[8]) - contig_end = int(line[9]) - blast_eval = float(line[10]) - except (IndexError, ValueError): - continue - - # for minus-strand genes, invert coordinates for convenience - if contig_end < contig_start: - contig_end, contig_start = contig_start, contig_end - - # Add all matches to dictionary. The top matches are selected out later. 
- if contig_id not in coords[busco_name]: - coords[busco_name][contig_id] = {"contig_start": contig_start, "contig_end": contig_end, - "busco_coords": [[busco_start, busco_end]], - "blast_eval": blast_eval} - - elif contig_id in coords[busco_name]: # i.e. if the same gene matched the busco more than once. - # now update coordinates - coords = self._update_coordinates(coords, busco_name, contig_id, busco_start, busco_end, - contig_start, contig_end, blast_eval) - - return dict(coords) - - def _select_busco_variants(self): - """ - Filter contig matches to prevent multiple BUSCO variants matching the same contig. - The current behaviour combines all contig matches for all BUSCO variants, as long as the contig matches are - different. There is an open question over whether or not we should only return the contig matches for a single - BUSCO variant instead of all of them combined. This should only be an issue for the Transcriptome mode. - :return: - """ - selected_coords = defaultdict(lambda: defaultdict(defaultdict)) - for busco_name, contigs in self.coords.items(): - busco_basename = busco_name.split("_")[0] - if busco_basename in selected_coords: - for contig_id in contigs: - if contig_id in selected_coords[busco_basename]: - if contigs[contig_id]["blast_eval"] < selected_coords[busco_basename][contig_id]["blast_eval"]: - selected_coords[busco_basename][contig_id] = contigs[contig_id] - else: - selected_coords[busco_basename][contig_id] = contigs[contig_id] - else: - selected_coords[busco_basename] = contigs - - return selected_coords - - def _prune(self): - for busco_name, contigs in self.coords.items(): - if len(contigs) > self.region_limit: - # Sort by blast eval, then isolate smallest values leaving just "region_limit" number of contigs per - # busco_name - contigs_to_remove = sorted( - contigs, key=lambda contig: contigs[contig]["blast_eval"])[self.region_limit:] - for c in contigs_to_remove: - self.coords[busco_name].pop(c) - return - - @staticmethod - def 
_sum_all_region_sizes(deck): - """ - Sum all interval sizes in input list - :param deck: - :type deck: list - :return: - :rtype: int - """ - total = 0 - for entry in deck: - total += entry[1] - entry[0] - return total - - @staticmethod - def _update_coordinates(coords, busco_name, contig, busco_start, busco_end, contig_start, contig_end, blast_eval): - """ - If a contig match starts or ends withing 50 kb of a previous match, extend the recorded start and end positions - of the contig match, and record the start/end locations of the busco match. - If the contig match is entirely within a previous match, just record the start/end locations of the busco match. - If the match is outside 50 kb of a previous match, ignore it. The tblastn output file ranks matches in order of - bitscore (inverse order of eval) so these subsequent matches at different locations are guaranteed not to be - better than the ones already recorded for that contig. - :param coords: # todo: fill in details - :param busco_name: - :param contig: - :param busco_start: - :param busco_end: - :param contig_start: - :param contig_end: - :param blast_eval: - :return: - """ - append_busco_coords = False - - # Check if contig starts before and within 50kb of current position - if 0 <= coords[busco_name][contig]["contig_start"] - contig_start <= 50000: - coords[busco_name][contig]["contig_start"] = contig_start - append_busco_coords = True - - # Check if contig ends after and within 50 kbs of current position - if 0 <= contig_end - coords[busco_name][contig]["contig_end"] <= 50000: - coords[busco_name][contig]["contig_end"] = contig_end - append_busco_coords = True - # Else, check if contig starts inside current coordinates - elif coords[busco_name][contig]["contig_end"] >= contig_start >= coords[busco_name][contig]["contig_start"]: - # If contig ends inside current coordinates, just add alignment positions to list - if contig_end <= coords[busco_name][contig]["contig_end"]: - append_busco_coords = True - - 
# If contig ends after current coordinates, extend contig end - else: - coords[busco_name][contig]["contig_end"] = contig_end - append_busco_coords = True - - # moved to its own "if" statement to avoid multiple appends from the "if" statements above - if append_busco_coords: - coords[busco_name][contig]["busco_coords"].append([busco_start, busco_end]) - - if blast_eval < coords[busco_name][contig]["blast_eval"]: - coords[busco_name][contig]["blast_eval"] = blast_eval - - return coords - - def filter_best_matches(self): - - # Get a list of all start and stop positions of possible busco locations, merging overlapping regions - for busco_group in self.coords: - candidate_contigs = list(self.coords[busco_group].keys()) - size_lists = self._get_largest_regions(candidate_contigs, self.coords, busco_group) - max_size = max(size_lists) # Get largest match size for a busco group - # Include all location matches for a busco as long as they are within 70% of the maximum size match - size_cutoff = int(0.7 * max_size) - for c, contig_name in enumerate(candidate_contigs): - if size_lists[c] < size_cutoff: - self.coords[busco_group].pop(contig_name) - return - - def write_coordinates_to_file(self): - - with open(self.coords_filename, "w") as out: - for busco_group, contig_matches in self.coords.items(): - for contig_name in contig_matches: - self.coords[busco_group][contig_name]["contig_start"] = \ - max(int(self.coords[busco_group][contig_name]["contig_start"]) - self.flank, 0) - contig_start = self.coords[busco_group][contig_name]["contig_start"] - self.coords[busco_group][contig_name]["contig_end"] += self.flank - contig_end = int(self.coords[busco_group][contig_name]["contig_end"]) - out.write("{}\t{}\t{}\t{}\n".format(busco_group, contig_name, contig_start, contig_end)) - return - - def write_contigs(self): - # Extract all contig identifiers - contig_names = [] - for contig_info in self.coords.values(): - for contig in contig_info: - contig_names.append(contig) - - # Write 
sequences that match contig ids - with open(self.input_file, "rU") as f: - for record in SeqIO.parse(f, "fasta"): - if record.id in list(set(contig_names)): - with open(os.path.join(self.output_seqs, "{}.temp".format(record.id)), "w") as out: - SeqIO.write(record, out, "fasta") - return - - -class AugustusParsingError(Exception): - - def __init__(self): - pass - - -class AugustusRunner(BaseRunner): - - ACCEPTED_PARAMETERS = ["strand", "genemodel", "singlestrand", "hintsfile", "extrinsicCfgFile", "maxDNAPieceSize", - "protein", "introns", "start", "stop", "cds", "AUGUSTUS_CONFIG_PATH", - "alternatives-from-evidence", "alternatives-from-sampling", "sample", "minexonintronprob", - "minmeanexonintronprob", "maxtracks", "gff3", "UTR", "outfile", "noInFrameStop", - "noprediction", "contentmodels", "translation_table", "temperature", "proteinprofile", - "progress", "predictionStart", "predictionEnd", "uniqueGeneId"] - - name = "augustus" - - def __init__(self): - self.gene_details = None - self._augustus_config_path = os.environ.get("AUGUSTUS_CONFIG_PATH") - self.config.set("busco_run", "augustus_config_path", self._augustus_config_path) - self._target_species = self.config.get("busco_run", "augustus_species") - super().__init__() - self._output_folder = os.path.join(self.run_folder, "augustus_output") - self.tmp_dir = os.path.join(self._output_folder, "tmp") - self.extracted_prot_dir = os.path.join(self._output_folder, "extracted_proteins") - self.err_logfile = os.path.join(self.log_folder, "augustus_err.log") - - try: - self.extra_params = self.config.get("busco_run", "augustus_parameters").replace(',', ' ') - except NoOptionError: - self.extra_params = "" - self.chunksize = 10 - - self.gff_dir = os.path.join(self._output_folder, "gff") - self.err_logfiles = [] - self.any_gene_found = False - self.param_keys = [] - self.param_values = [] - - self.create_dirs([self.extracted_prot_dir, self.gff_dir]) - - self.init_checkpoint_file() - - def configure_runner(self, 
seqs_path, coords, sequences_aa, sequences_nt, rerun): - self.run_number += 1 - - # Placed here to allow reconfiguration for rerun - self._target_species = self.config.get("busco_run", "augustus_species") - - self.check_tool_dependencies() - self.gene_details = defaultdict(list) - self.output_sequences = [] - - self.seqs_path = seqs_path - self.coords = coords - self.run_num = 2 if rerun else 1 - - self.sequences_aa = sequences_aa - self.sequences_nt = sequences_nt - - self.pred_genes_dir = os.path.join(self._output_folder, "predicted_genes_rerun") if rerun \ - else os.path.join(self._output_folder, "predicted_genes") - - # self.tmp_dir placed here to allow it to be recreated during reconfiguration for rerun - self.create_dirs([self.pred_genes_dir, self.tmp_dir]) - - @property - def output_folder(self): - return self._output_folder - - def check_tool_dependencies(self): - """ - check dependencies on files and folders - properly configured. - :raises SystemExit: if Augustus config path is not writable or - not set at all - :raises SystemExit: if Augustus config path does not contain - the needed species - present - """ - try: - augustus_species_dir = os.path.join(self._augustus_config_path, "species") - if not os.access(augustus_species_dir, os.W_OK): - raise SystemExit("Cannot write to Augustus species folder, please make sure you have write " - "permissions to {}".format(augustus_species_dir)) - - except TypeError: - raise SystemExit( - "The environment variable AUGUSTUS_CONFIG_PATH is not set") - - if not os.path.exists(os.path.join(augustus_species_dir, self._target_species)): - # Exclude the case where this is a restarted run and the retraining parameters have already been moved. 
- if self.config.getboolean("busco_run", "restart") and self.run_number == 2 and \ - os.path.exists(os.path.join(self._output_folder, "retraining_parameters", self._target_species)): - pass - else: - raise SystemExit( - "Impossible to locate the species \"{0}\" in Augustus species folder" - " ({1}), check that AUGUSTUS_CONFIG_PATH is properly set" - " and contains this species. \n\t\tSee the help if you want " - "to provide an alternative species".format(self._target_species, augustus_species_dir)) - - @log("Running Augustus prediction using {} as species:", logger, attr_name="_target_species") - def run(self): - super().run() - if self.extra_params: - logger.info("Additional parameters for Augustus are {}: ".format(self.extra_params)) - self.param_keys, self.param_values = self.parse_parameters() - - self.total = self._count_jobs() - self.run_jobs() - - def process_output(self): - logger.info("Extracting predicted proteins...") - files = [f for f in sorted(os.listdir(self.pred_genes_dir)) if any(busco_id in f for busco_id in self.coords)] - for filename in files: - self._extract_genes_from_augustus_output(filename) - - if not self.any_gene_found and self.run_num == 1: - raise NoGenesError("Augustus") - - self.gene_details = dict(self.gene_details) - - self._merge_stderr_logs() - self._remove_individual_err_logs() - - return - - def _count_jobs(self): - n = 0 - for busco_group, contigs in self.coords.items(): - for _ in contigs: - n += 1 - return n - - def sort_jobs(self): - jobs_size_info = [] - for busco_group, contigs in self.coords.items(): - - for contig_name, contig_info in contigs.items(): - contig_start = contig_info["contig_start"] - contig_end = contig_info["contig_end"] - pred_size = int(contig_end) - int(contig_start) - jobs_size_info.append({"busco_group": busco_group, - "contig_name": contig_name, - "contig_start": contig_start, - "contig_end": contig_end, - "pred_size": pred_size}) - job_sizes = [item["pred_size"] for item in jobs_size_info] - 
new_job_order = np.argsort(job_sizes)[::-1] - ordered_jobs = [jobs_size_info[i] for i in new_job_order] - return ordered_jobs - - def generate_job_args(self): - contig_ordinal_inds = defaultdict(int) - njobs = 0 - - ordered_jobs = self.sort_jobs() - - for job_info in ordered_jobs: - contig_name = job_info["contig_name"] - busco_group = job_info["busco_group"] - contig_start = job_info["contig_start"] - contig_end = job_info["contig_end"] - contig_tmp_file = "{}.temp".format(contig_name[:100]) # Avoid very long filenames - contig_ordinal_inds[busco_group] += 1 - output_index = contig_ordinal_inds[busco_group] - out_filename = os.path.join(self.pred_genes_dir, "{}.out.{}".format(busco_group, output_index)) - njobs += 1 - - yield busco_group, contig_tmp_file, contig_start, contig_end, out_filename - - @log("Additional parameters for Augustus are {}: ", logger, attr_name="_target_species") - def parse_parameters(self): - accepted_keys = [] - accepted_values = [] - if self.extra_params: - self.extra_params = self.extra_params.strip("\" \'") - try: - if self.extra_params.startswith("--"): - key_val_pairs = self.extra_params.split(" --") - for kv in key_val_pairs: - key_vals = kv.strip("- ").split("=") - if len(key_vals) == 2: - key, val = key_vals - if key in type(self).ACCEPTED_PARAMETERS: - accepted_keys.append(key.strip()) - accepted_values.append(val.strip()) - else: - logger.warning("{} is not an accepted parameter for Augustus.".format(key)) - else: - raise AugustusParsingError - else: - raise AugustusParsingError - except AugustusParsingError: - logger.warning( - "Augustus parameters are not correctly formatted. Please enter them as follows: " - "\"--param1=value1 --param2=value2\" etc. 
Proceeding without additional parameters.") - return [], [] - return accepted_keys, accepted_values - - def _merge_stderr_logs(self): - with open(self.err_logfile, "a") as f: - for err_logfile in self.err_logfiles: - with open(err_logfile, "r") as g: - content = g.readlines() - f.writelines(content) - return - - def _remove_individual_err_logs(self): - shutil.rmtree(self.tmp_dir) - return - - def get_version(self): # todo: need to handle all possible exceptions - augustus_help_output = subprocess.check_output([self.cmd, "--version"], stderr=subprocess.STDOUT, shell=False) - augustus_help_output = augustus_help_output.decode("utf-8") - s = augustus_help_output.split("\n")[0] - augustus_version = s[s.find("(") + 1:s.find(")")] - return augustus_version - - def configure_job(self, busco_group, contig_tmp_file, contig_start, contig_end, out_filename): - # Augustus does not provide an option to write to an output file, so have to change the pipe target from the - # log file to the desired output file - self.logfile_path_out = out_filename - err_logfile = os.path.join(self.tmp_dir, os.path.basename(out_filename.rpartition(".out")[0] + ".err")) - self.logfile_path_err = err_logfile - self.err_logfiles.append(err_logfile) - - augustus_job = self.create_job() - augustus_job.add_parameter("--codingseq=1") - augustus_job.add_parameter("--proteinprofile={}".format(os.path.join(self.lineage_dataset, - "prfl", - "{}.prfl".format(busco_group)))) - augustus_job.add_parameter("--predictionStart={}".format(contig_start)) - augustus_job.add_parameter("--predictionEnd={}".format(contig_end)) - augustus_job.add_parameter("--species={}".format(self._target_species)) - for k, key in enumerate(self.param_keys): - augustus_job.add_parameter("--{}={}".format(key, self.param_values[k])) - augustus_job.add_parameter(os.path.join(self.seqs_path, contig_tmp_file)) - return augustus_job - - def _extract_genes_from_augustus_output(self, filename): - # todo: consider parallelizing this and other 
parsing functions - - gene_id = None - gene_info = [] - sequences_aa = [] - sequences_nt = [] - gene_found = False - completed_record = False - - with open(os.path.join(self.pred_genes_dir, filename), "r", encoding="utf-8") as f: - # utf-8 encoding needed to handle the umlaut in the third line of the file. - gene_info_section = False - nt_sequence_section = False - aa_sequence_section = False - nt_sequence_parts = [] - aa_sequence_parts = [] - - for line in f: - - if aa_sequence_section: - if "]" in line: - line = line.strip().lstrip("# ").rstrip("]") - aa_sequence_parts.append(line) - aa_sequence_section = False - completed_record = True - if gene_id is not None: - aa_sequence = "".join(aa_sequence_parts) - nt_sequence = "".join(nt_sequence_parts) - seq_record_aa = SeqRecord(Seq(aa_sequence.upper()), id=gene_id) - seq_record_nt = SeqRecord(Seq(nt_sequence.upper()), id=gene_id) - sequences_aa.append(seq_record_aa) - sequences_nt.append(seq_record_nt) - aa_sequence_parts = [] - nt_sequence_parts = [] - gene_id = None - continue - - else: - line = line.strip().lstrip("# ").rstrip("]") - aa_sequence_parts.append(line) - continue - - if line.startswith("# protein"): - nt_sequence_section = False - aa_sequence_section = True - if "]" in line: - line = line.strip().rstrip("]").split("[") - aa_sequence_parts.append(line[1]) - aa_sequence_section = False - completed_record = True - if gene_id is not None: - aa_sequence = "".join(aa_sequence_parts) - nt_sequence = "".join(nt_sequence_parts) - seq_record_aa = SeqRecord(Seq(aa_sequence.upper()), id=gene_id) - seq_record_nt = SeqRecord(Seq(nt_sequence.upper()), id=gene_id) - sequences_aa.append(seq_record_aa) - sequences_nt.append(seq_record_nt) - aa_sequence_parts = [] - nt_sequence_parts = [] - gene_id = None - else: - line = line.strip().rstrip("]").split("[") - aa_sequence_parts.append(line[1]) - continue - - if nt_sequence_section: - line = line.strip().lstrip("# ").rstrip("]") - nt_sequence_parts.append(line) - continue 
- - if line.startswith("# coding sequence"): - gene_info = [] - gene_info_section = False - nt_sequence_section = True - line = line.strip().rstrip("]").split("[") # Extract sequence part of line - nt_sequence_parts.append(line[1]) - continue - - if gene_info_section: - line = line.strip().split() - seq_name = line[0] - gene_start = line[3] - gene_end = line[4] - if not gene_id: - gene_id = "{}:{}-{}".format(seq_name, gene_start, gene_end) - self.gene_details[gene_id].append({"gene_start": gene_start, "gene_end": gene_end}) - gene_info.append("\t".join(line)) - continue - - if line.startswith("# start gene"): - gene_found = True - self.any_gene_found = True - gene_info_section = True - completed_record = False - continue - - if gene_found and not completed_record: - logger.warning("Augustus output file {} truncated".format(filename)) - - self.sequences_aa.update({record.id: record for record in sequences_aa}) - self.sequences_nt.update({record.id: record for record in sequences_nt}) - if gene_found: - self._write_sequences_to_file(filename, sequences_nt, sequences_aa) - - return - - def make_gff_files(self, single_copy_buscos): - - for b in single_copy_buscos: - gene_info = [] - busco_files = [f for f in os.listdir(self.pred_genes_dir) if f.startswith(b)] - gff_filename = os.path.join(self.gff_dir, "{}.gff".format(b)) - single_copy_busco_gene = list(single_copy_buscos[b].keys())[0] - gene_id_parts = single_copy_busco_gene.split(":") - if len(gene_id_parts) > 2: # if a ":" is present in the gene id, we don't want to break it up - gene_id_parts = [":".join(gene_id_parts[:-1]), gene_id_parts[-1]] - single_copy_busco_gene_id = gene_id_parts[0] - single_copy_busco_gene_start_coord, single_copy_busco_gene_end_coord = gene_id_parts[1].split("-") - gene_found = False - for filename in busco_files: - match_number = filename.split(".")[-1] - with open(os.path.join(self.pred_genes_dir, filename), "r", encoding="utf-8") as f: - gene_info_section = False - for line in f: - if 
gene_info_section and line.startswith("# coding sequence"): - with open(gff_filename, "a") as g: - g.write("\n".join(gene_info) + "\n") - gene_info = [] - break - - if line.startswith("# start gene"): - gene_info_section = True - continue - - if gene_info_section: - line = line.strip().split() - seq_name = line[0] - gene_start = line[3] - gene_end = line[4] - if gene_found or (seq_name == single_copy_busco_gene_id - and gene_start == single_copy_busco_gene_start_coord - and gene_end == single_copy_busco_gene_end_coord): - gene_found = True - gene_id_info = line[-1] - line[-1] = self.edit_gene_identifier(gene_id_info, match_number) - if len(line) == 12: - gene_id_info_2 = line[-3] - line[-3] = self.edit_gene_identifier(gene_id_info_2, match_number) - gene_info.append("\t".join(line)) - else: - gene_info_section = False - continue - if gene_found: - break - if not gene_found: - raise SystemExit("Unable to find single copy BUSCO gene in Augustus output.") - - return - - def edit_gene_identifier(self, orig_str, match_num): - modified_str = re.sub(r"g([0-9])", r"r{}.m{}.g\1".format(self.run_num, match_num), orig_str) - return modified_str - - def _write_sequences_to_file(self, filename, sequences_nt, sequences_aa): - - filename_parts = filename.rpartition(".out") - output_fna = os.path.join(self.extracted_prot_dir, filename_parts[0] + ".fna" + filename_parts[-1]) - output_faa = os.path.join(self.extracted_prot_dir, filename_parts[0] + ".faa" + filename_parts[-1]) - self.output_sequences.append(output_faa) - - with open(output_fna, "w") as out_fna: - SeqIO.write(sequences_nt, out_fna, "fasta") - with open(output_faa, "w") as out_faa: - SeqIO.write(sequences_aa, out_faa, "fasta") - - return - - def move_retraining_parameters(self): - """ - This function moves retraining parameters from augustus species folder - to the run folder - """ - augustus_species_path = os.path.join(self._augustus_config_path, "species", self._target_species) - if 
os.path.exists(augustus_species_path): - new_path = os.path.join(self._output_folder, "retraining_parameters", self._target_species) - shutil.move(augustus_species_path, new_path) - elif self.config.getboolean("busco_run", "restart") and \ - os.path.exists(os.path.join(self._output_folder, "retraining_parameters", self._target_species)): - pass - else: - logger.warning("Augustus did not produce a retrained species folder.") - return - - -class GFF2GBRunner(BaseRunner): - - name = "gff2gbSmallDNA.pl" - - def __init__(self): - super().__init__() - self._output_folder = os.path.join(self.run_folder, "augustus_output") - self.gff_folder = os.path.join(self._output_folder, "gff") - self.gb_folder = os.path.join(self._output_folder, "gb") - self.create_dirs([self.gff_folder, self.gb_folder]) - - self.init_checkpoint_file() - - def configure_runner(self, single_copy_buscos): - self.run_number += 1 - self.single_copy_buscos = single_copy_buscos - - def run(self): - super().run() - self.total = self._count_jobs() - self.run_jobs() - - def _count_jobs(self): - n = len(self.single_copy_buscos) - return n - - def generate_job_args(self): - for busco_id in self.single_copy_buscos: - yield busco_id - - def configure_job(self, busco_id): - gff2_gb_small_dna_pl_job = self.create_job() - gff2_gb_small_dna_pl_job.add_parameter(os.path.join(self.gff_folder, "{}.gff".format(busco_id))) - gff2_gb_small_dna_pl_job.add_parameter(self.input_file) - gff2_gb_small_dna_pl_job.add_parameter("1000") - gff2_gb_small_dna_pl_job.add_parameter(os.path.join(self.gb_folder, "{}.raw.gb".format(busco_id))) - return gff2_gb_small_dna_pl_job - - def check_tool_dependencies(self): - pass - - def get_version(self): - return - - @property - def output_folder(self): - return self._output_folder - - -class NewSpeciesRunner(BaseRunner): - - name = "new_species.pl" - - def __init__(self): - super().__init__() - self._output_folder = os.path.join(self.run_folder, "augustus_output") - self.new_species_name = 
"BUSCO_{}".format(os.path.basename(self.main_out)) - self.init_checkpoint_file() - self.run_number += 1 - - def run(self): - super().run() - self.total = 1 - self.run_jobs() - - def configure_job(self, *args): - - new_species_pl_job = self.create_job() - # bacteria clade needs to be flagged as "prokaryotic" - if self.domain == "prokaryota": - new_species_pl_job.add_parameter("--prokaryotic") - new_species_pl_job.add_parameter("--species={}".format(os.path.basename(self.new_species_name))) - return new_species_pl_job - - def check_tool_dependencies(self): - pass - - def generate_job_args(self): - yield - - def get_version(self): - return - - @property - def output_folder(self): - return self._output_folder - - -class ETrainingRunner(BaseRunner): - - name = "etraining" - - def __init__(self): - super().__init__() - self._output_folder = os.path.join(self.run_folder, "augustus_output") - self._gb_folder = os.path.join(self._output_folder, "gb") - self.augustus_config_path = self.config.get("busco_run", "augustus_config_path") - self._training_file = os.path.join(self._output_folder, "training_set.db") - - self.init_checkpoint_file() - - def configure_runner(self, new_species_name): - self.run_number += 1 - self.new_species_name = new_species_name - self._merge_gb_files() - - def run(self): - super().run() - self.total = 1 - self.run_jobs() - self._validate_run() - - def check_tool_dependencies(self): - pass - - def generate_job_args(self): - yield - - def _merge_gb_files(self): - """Concatenate all GB files into one large file""" - with open(self._training_file, "w") as outfile: - for fname in os.listdir(self._gb_folder): - with open(os.path.join(self._gb_folder, fname), "r") as infile: - outfile.writelines(infile.readlines()) - return - - def _validate_run(self): - species_filepath = os.path.join(self.augustus_config_path, "species", self.new_species_name) - if os.path.exists(species_filepath) and any("exon_probs" in f for f in os.listdir(species_filepath)): - return 
- else: - SystemExit("Retraining did not complete correctly. Check your Augustus config path environment variable.") - - def configure_job(self, *args): - etraining_job = self.create_job() - etraining_job.add_parameter("--species={}".format(self.new_species_name)) - etraining_job.add_parameter(os.path.join(self.run_folder, "augustus_output", "training_set.db")) - return etraining_job - - def get_version(self): - return - - @property - def output_folder(self): - return self._output_folder - - -class OptimizeAugustusRunner(BaseRunner): - - name = "optimize_augustus.pl" - - def __init__(self): - super().__init__() - self._output_folder = None - self.training_set_db = None - self.new_species_name = None - - def configure_runner(self, output_folder, new_species_name): - self.run_number += 1 - self._output_folder = output_folder - self.training_set_db = os.path.join(self._output_folder, "training_set.db") - self.new_species_name = new_species_name - - self.init_checkpoint_file() - - def configure_job(self, *args): - optimize_augustus_pl_job = self.create_job() - optimize_augustus_pl_job.add_parameter("--cpus={}".format(self.cpus)) - optimize_augustus_pl_job.add_parameter("--species={}".format(self.new_species_name)) - optimize_augustus_pl_job.add_parameter(self.training_set_db) - return optimize_augustus_pl_job - - def run(self): - super().run() - self.total = 1 - self.run_jobs() - - def generate_job_args(self): - yield - - def check_tool_dependencies(self): - pass - - def get_version(self): - return - - @property - def output_folder(self): - return self._output_folder - - -class SEPPRunner(BaseRunner): - - name = "sepp" - - def __init__(self): - super().__init__() - self._output_folder = os.path.join(self.main_out, "auto_lineage", self.lineage_results_dir) - self.placement_folder = os.path.join(self._output_folder, "placement_files") - self.datasets_version = self.config.get("busco_run", "datasets_version") - - self.init_checkpoint_file() - - def configure_runner(self, 
tree_nwk_file, tree_metadata_file, supermatrix_file, downloader): - self.run_number += 1 - self.tree_nwk_file = tree_nwk_file - self.tree_metadata_file = tree_metadata_file - self.supermatrix_file = supermatrix_file - self.downloader = downloader - - def generate_job_args(self): - yield - - def run(self): - super().run() - self.total = 1 - self.run_jobs() - - def configure_job(self, *args): - sepp_job = self.create_job() - sepp_job.add_parameter("--cpu") - sepp_job.add_parameter(str(self.cpus)) - sepp_job.add_parameter("--outdir") - sepp_job.add_parameter(self.placement_folder) - sepp_job.add_parameter("-t") - sepp_job.add_parameter(self.tree_nwk_file) - sepp_job.add_parameter("-r") - sepp_job.add_parameter(self.tree_metadata_file) - sepp_job.add_parameter("-a") - sepp_job.add_parameter(self.supermatrix_file) - sepp_job.add_parameter("-f") - sepp_job.add_parameter(os.path.join(self.placement_folder, "marker_genes.fasta")) - sepp_job.add_parameter("-F") - sepp_job.add_parameter("15") - sepp_job.add_parameter("-m") - sepp_job.add_parameter("amino") - return sepp_job - - def check_tool_dependencies(self): - pass - - def get_version(self): - sepp_version = subprocess.check_output([self.cmd, "-v"], stderr=subprocess.STDOUT, shell=False) - sepp_version = sepp_version.decode("utf-8") - sepp_version = sepp_version.strip().split(" ")[1] - return sepp_version - - @property - def output_folder(self): - return self._output_folder diff -Nru busco-4.1.4/src/busco/ConfigManager.py busco-5.0.0/src/busco/ConfigManager.py --- busco-4.1.4/src/busco/ConfigManager.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/ConfigManager.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,16 +1,26 @@ +#!/usr/bin/env python3 +# coding: utf-8 +""" +.. module:: ConfigManager + :synopsis: manage setup for BUSCO run configuration +.. versionadded:: 3.0.0 +.. versionchanged:: 5.0.0 + +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) +Licensed under the MIT license. See LICENSE.md file. 
+ +""" + from busco.AutoLineage import AutoSelectLineage from busco.BuscoConfig import BuscoConfigMain from busco.BuscoLogger import BuscoLogger from busco.BuscoLogger import LogDecorator as log import os -import sys logger = BuscoLogger.get_logger(__name__) -# todo: finalize config file class BuscoConfigManager: - def __init__(self, params): self.params = params self.config_file = None @@ -21,52 +31,70 @@ @log("Getting config file", logger, debug=True) def get_config_file(self): """ - Check for BUSCO config file specified as a command line argument; - if not present check if defined as an environment variable; - if not present use default config file. - :return config: A BuscoConfig object containing all the required configuration parameters - """ + Check for BUSCO config file specified as a command line argument; + if not present check if defined as an environment variable; + if not present use default config file. + :return config: A BuscoConfig object containing all the required configuration parameters + """ try: self.config_file = self.params["config_file"] if self.config_file is not None: return except KeyError: pass - if os.environ.get("BUSCO_CONFIG_FILE") and os.access(os.environ.get("BUSCO_CONFIG_FILE"), os.R_OK): + if os.environ.get("BUSCO_CONFIG_FILE") and os.access( + os.environ.get("BUSCO_CONFIG_FILE"), os.R_OK + ): self.config_file = os.environ.get("BUSCO_CONFIG_FILE") else: - raise SystemExit("Please specify a BUSCO config file using either " - "(i) an environment variable by entering 'export BUSCO_CONFIG_FILE=/path/to/config.ini' " - "or (ii) using the command line flag --config /path/to/config.ini") + self.config_file = "local environment" return self.config_file @log("Configuring BUSCO with {}", logger, attr_name="config_file") - def load_busco_config(self, clargs): - self.config = BuscoConfigMain(self.config_file, self.params, clargs) + def load_busco_config(self, *args): + self.config = BuscoConfigMain(self.config_file, self.params) + 
self.config.configure() self.config.validate() if not self.config.check_lineage_present(): - if not self.config.getboolean("busco_run", "auto-lineage") and not self.config.getboolean("busco_run", "auto-lineage-prok"):# and not self.config.getboolean("busco_run", "auto-lineage-euk"): - logger.warning("Running Auto Lineage Selector as no lineage dataset was specified. This will take a " - "little longer than normal. If you know what lineage dataset you want to use, please " - "specify this in the config file or using the -l (--lineage-dataset) flag in the " - "command line.") + if not self.config.getboolean( + "busco_run", "auto-lineage" + ) and not self.config.getboolean( + "busco_run", "auto-lineage-prok" + ): # and not self.config.getboolean("busco_run", "auto-lineage-euk"): + logger.warning( + "Running Auto Lineage Selector as no lineage dataset was specified. This will take a " + "little longer than normal. If you know what lineage dataset you want to use, please " + "specify this in the config file or using the -l (--lineage-dataset) flag in the " + "command line." + ) self.config.set("busco_run", "auto-lineage", "True") lineage_dataset_fullpath = self.auto_select_lineage() # full path self.config.set("busco_run", "lineage_dataset", lineage_dataset_fullpath) lineage_dataset = os.path.basename(lineage_dataset_fullpath) # base name else: - if self.config.getboolean("busco_run", "auto-lineage") or self.config.getboolean("busco_run", "auto-lineage-prok"):# or self.config.getboolean("busco_run", "auto-lineage-euk"): - logger.warning("You have selected auto-lineage but you have also provided a lineage dataset. " - "BUSCO will proceed with the specified dataset. 
" - "To run auto-lineage do not specify a dataset.") + if self.config.getboolean( + "busco_run", "auto-lineage" + ) or self.config.getboolean( + "busco_run", "auto-lineage-prok" + ): # or self.config.getboolean("busco_run", "auto-lineage-euk"): + logger.warning( + "You have selected auto-lineage but you have also provided a lineage dataset. " + "BUSCO will proceed with the specified dataset. " + "To run auto-lineage do not specify a dataset." + ) self.config.set("busco_run", "auto-lineage", "False") self.config.set("busco_run", "auto-lineage-prok", "False") self.config.set("busco_run", "auto-lineage-euk", "False") - lineage_dataset = self.config.get("busco_run", "lineage_dataset") # full path - - self.config.set_results_dirname(lineage_dataset) # function always only uses basename - self.config.download_lineage_file(lineage_dataset) # full path will return, base name will attempt download - # Todo: clean up error messages + lineage_dataset = self.config.get( + "busco_run", "lineage_dataset" + ) # full path + + self.config.set_results_dirname( + lineage_dataset + ) # function always only uses basename + self.config.download_lineage_file( + lineage_dataset + ) # full path will return, base name will attempt download self.config.load_dataset_config() return diff -Nru busco-4.1.4/src/busco/GeneSetAnalysis.py busco-5.0.0/src/busco/GeneSetAnalysis.py --- busco-4.1.4/src/busco/GeneSetAnalysis.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/GeneSetAnalysis.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 -""" -.. module:: GeneSetAnalysis - :synopsis: GeneSetAnalysis implements genome analysis specifics -.. versionadded:: 3.0.0 -.. versionchanged:: 3.0.0 - -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) -Licensed under the MIT license. See LICENSE.md file. 
- -""" -from busco.BuscoAnalysis import BuscoAnalysis -from busco.BuscoLogger import BuscoLogger -from busco.Analysis import ProteinAnalysis -from Bio import SeqIO - -logger = BuscoLogger.get_logger(__name__) - - -class GeneSetAnalysis(ProteinAnalysis, BuscoAnalysis): - """ - This class runs a BUSCO analysis on a gene set. - """ - _mode = 'proteins' - - def __init__(self): - """ - Initialize an instance. - :param params: Values of all parameters that have to be defined - :type params: PipeConfig - """ - super().__init__() - self.sequences_aa = {record.id: record for record in list(SeqIO.parse(self._input_file, "fasta"))} - - def cleanup(self): - super().cleanup() - - def run_analysis(self): - """ - This function calls all needed steps for running the analysis. - """ - super().run_analysis() - self.run_hmmer(self._input_file) - self.hmmer_runner.write_buscos_to_file(self.sequences_aa) - # if self._tarzip: - # self._run_tarzip_hmmer_output() - return diff -Nru busco-4.1.4/src/busco/GenomeAnalysis.py busco-5.0.0/src/busco/GenomeAnalysis.py --- busco-4.1.4/src/busco/GenomeAnalysis.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/GenomeAnalysis.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -""" -.. module:: GenomeAnalysis - :synopsis: GenomeAnalysis implements genome analysis specifics -.. versionadded:: 3.0.0 -.. versionchanged:: 3.0.0 - -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) -Licensed under the MIT license. See LICENSE.md file. 
- -""" -from busco.BuscoAnalysis import BuscoAnalysis -from busco.Analysis import NucleotideAnalysis -from busco.BuscoTools import ProdigalRunner, AugustusRunner, GFF2GBRunner, NewSpeciesRunner, ETrainingRunner, \ - OptimizeAugustusRunner, NoGenesError -import os -import shutil -from busco.BuscoLogger import BuscoLogger -from busco.BuscoLogger import LogDecorator as log -import time -from abc import ABCMeta, abstractmethod -from configparser import NoOptionError - -logger = BuscoLogger.get_logger(__name__) - - -class GenomeAnalysis(NucleotideAnalysis, BuscoAnalysis, metaclass=ABCMeta): - - _mode = "genome" - - def __init__(self): - super().__init__() - - @abstractmethod - def run_analysis(self): - super().run_analysis() - - def init_tools(self): - """ - Initialize tools needed for Genome Analysis. - :return: - """ - super().init_tools() - - # def _run_tarzip_augustus_output(self): # Todo: rewrite using tarfile - # """ - # This function tarzips results folder - # """ - # # augustus_output/predicted_genes - # - # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, - # "-zcf", "%saugustus_output/predicted_genes.tar.gz" % - # self.main_out, "predicted_genes", "--remove-files"], - # "bash", shell=False) - # # augustus_output/extracted_proteins - # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, - # "-zcf", "%saugustus_output/extracted_proteins.tar.gz" % - # self.main_out, "extracted_proteins", "--remove-files"], - # "bash", shell=False) - # # augustus_output/gb - # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, - # "-zcf", "%saugustus_output/gb.tar.gz" % self.main_out, "gb", "--remove-files"], - # "bash", shell=False) - # # augustus_output/gffs - # self._p_open(["tar", "-C", "%saugustus_output" % self.main_out, - # "-zcf", "%saugustus_output/gffs.tar.gz" % - # self.main_out, "gffs", "--remove-files"], "bash", shell=False) - # # single_copy_busco_sequences - # self._p_open(["tar", "-C", "%s" % self.main_out, "-zcf", - # 
"%ssingle_copy_busco_sequences.tar.gz" % self.main_out, - # "single_copy_busco_sequences", "--remove-files"], "bash", shell=False) - - # def set_rerun_busco_command(self, clargs): - # """ - # This function sets the command line to call to reproduce this run - # """ - # clargs.extend(["-sp", self._target_species]) - # super().set_rerun_busco_command(clargs) - - -class GenomeAnalysisProkaryotes(GenomeAnalysis): - """ - This class runs a BUSCO analysis on a genome. - """ - - def __init__(self): - """ - Initialize an instance. - """ - super().__init__() - self.prodigal_runner = None - - def cleanup(self): - super().cleanup() - - def run_analysis(self): - """ - This function calls all needed steps for running the analysis. - """ - super().run_analysis() - self._run_prodigal() - self.run_hmmer(self.prodigal_runner.output_faa) - self.hmmer_runner.write_buscos_to_file(self.sequences_aa, self.sequences_nt) - return - - def init_tools(self): - """ - Init the tools needed for the analysis - """ - super().init_tools() - self.prodigal_runner = ProdigalRunner() - - @log("***** Run Prodigal on input to predict and extract genes *****", logger) - def _run_prodigal(self): - """ - Run Prodigal on input file to detect genes. - :return: - """ - if self.restart and self.prodigal_runner.check_previous_completed_run(): - logger.info("Skipping Prodigal run as it has already completed") - self.prodigal_runner.get_gene_details() - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.prodigal_runner.run() - self.gene_details = self.prodigal_runner.gene_details - self.sequences_nt = self.prodigal_runner.sequences_nt - self.sequences_aa = self.prodigal_runner.sequences_aa - - return - - -class GenomeAnalysisEukaryotes(GenomeAnalysis): - """ - This class runs a BUSCO analysis on a eukaryote genome. 
- """ - def __init__(self): - super().__init__() - - self._long = self.config.getboolean("busco_run", "long") - try: - self._target_species = self.config.get("busco_run", "augustus_species") - except KeyError: - raise SystemExit("Something went wrong. Eukaryota datasets should specify an augustus species.") - try: - self._augustus_parameters = self.config.get("busco_run", "augustus_parameters").replace(',', ' ') - except NoOptionError: - self._augustus_parameters = "" - self.mkblast_runner = None - self.tblastn_runner = None - self.augustus_runner = None - self.gff2gb_runner = None - self.new_species_runner = None - self.etraining_runner = None - self.optimize_augustus_runner = None - - self.sequences_nt = {} - self.sequences_aa = {} - self.gene_details = {} - - def cleanup(self): - """ - This function cleans temporary files - """ - try: - augustus_tmp = self.augustus_runner.tmp_dir # Should be already done if AugustusRunner ran correctly - if os.path.exists(augustus_tmp): - shutil.rmtree(augustus_tmp) - except OSError: - pass - try: - if self._target_species.startswith("BUSCO"): - self.augustus_runner.move_retraining_parameters() - except OSError: - pass - super().cleanup() - - def init_tools(self): - """ - Initialize all required tools for Genome Eukaryote Analysis: - MKBlast, TBlastn, Augustus and Augustus scripts: GFF2GBSmallDNA, new_species, etraining - :return: - """ - super().init_tools() - - self.augustus_runner = AugustusRunner() - self.gff2gb_runner = GFF2GBRunner() - self.new_species_runner = NewSpeciesRunner() - self.etraining_runner = ETrainingRunner() - - if self._long: - self.optimize_augustus_runner = OptimizeAugustusRunner() - - return - - def run_analysis(self): - """This function calls all needed steps for running the analysis.""" - super().run_analysis() - self._run_mkblast() - self._run_tblastn() - self._run_augustus(self.tblastn_runner.coords) - self.gene_details = self.augustus_runner.gene_details - 
self.run_hmmer(self.augustus_runner.output_sequences) - self._rerun_analysis() - - def _rerun_augustus(self, coords): - missing_and_fragmented_buscos = self.hmmer_runner.missing_buscos + list( - self.hmmer_runner.fragmented_buscos.keys()) - logger.info("Re-running Augustus with the new metaparameters, number of target BUSCOs: {}".format( - len(missing_and_fragmented_buscos))) - missing_and_fragmented_coords = {busco: coords[busco] for busco in coords if busco in - missing_and_fragmented_buscos} - logger.debug('Trained species folder is {}'.format(self._target_species)) - self._run_augustus(missing_and_fragmented_coords, rerun=True) - return - - @log("Starting second step of analysis. The gene predictor Augustus is retrained using the results from the " - "initial run to yield more accurate results.", logger) - def _rerun_analysis(self): - - self.augustus_runner.make_gff_files(self.hmmer_runner.single_copy_buscos) - self._run_tblastn(missing_and_frag_only=True, ancestral_variants=self._has_variants_file) - self._run_gff2gb() - self._run_new_species() - self.config.set("busco_run", "augustus_species", self.new_species_runner.new_species_name) - self._target_species = self.new_species_runner.new_species_name - self._run_etraining() - - if self._long: - self._run_optimize_augustus(self.new_species_runner.new_species_name) - self._run_etraining() - - try: - self._rerun_augustus(self.tblastn_runner.coords) - self.gene_details.update(self.augustus_runner.gene_details) - self.run_hmmer(self.augustus_runner.output_sequences) - self.hmmer_runner.write_buscos_to_file(self.sequences_aa, self.sequences_nt) - except NoGenesError: - logger.warning("No genes found on Augustus rerun.") - - # if self._tarzip: # todo: zip folders with a lot of output - # self._run_tarzip_augustus_output() - # self._run_tarzip_hmmer_output() - # remove the checkpoint, run is done - # self._set_checkpoint() - return - - @log("Running Augustus gene predictor on BLAST search results.", logger) - def 
_run_augustus(self, coords, rerun=False): - self.augustus_runner.configure_runner(self.tblastn_runner.output_seqs, coords, self.sequences_aa, - self.sequences_nt, rerun) - - if self.restart and self.augustus_runner.check_previous_completed_run(): - run = "2nd" if rerun else "1st" - logger.info("Skipping {} augustus run as output already processed".format(run)) - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.augustus_runner.run() - self.augustus_runner.process_output() - self.sequences_nt = self.augustus_runner.sequences_nt - self.sequences_aa = self.augustus_runner.sequences_aa - - def _run_etraining(self): - """Train on new training set (complete single copy buscos)""" - self.etraining_runner.configure_runner(self.new_species_runner.new_species_name) - if self.restart and self.etraining_runner.check_previous_completed_run(): - logger.info("Skipping etraining as it has already been done") - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.etraining_runner.run() - return - - @log("Converting predicted genes to short genbank files", logger) - def _run_gff2gb(self): - self.gff2gb_runner.configure_runner(self.hmmer_runner.single_copy_buscos) - if self.restart and self.gff2gb_runner.check_previous_completed_run(): - logger.info("Skipping gff2gb conversion as it has already been done") - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - self.gff2gb_runner.run() - return - - @log("All files converted to short genbank files, now training Augustus using Single-Copy Complete BUSCOs", logger) - def _run_new_species(self): - """Create new species config file from template""" - if self.restart and self.new_species_runner.check_previous_completed_run(): - logger.info("Skipping new species creation as it has already been done") - else: - self.restart = False - self.config.set("busco_run", "restart", str(self.restart)) - 
self.new_species_runner.run() - return - - def _run_optimize_augustus(self, new_species_name): - """ long mode (--long) option - runs all the Augustus optimization scripts (adds ~1 day of runtime)""" - logger.warning("Optimizing augustus metaparameters, this may take a very long time, started at {}".format( - time.strftime("%m/%d/%Y %H:%M:%S"))) - self.optimize_augustus_runner.configure_runner(self.augustus_runner.output_folder, new_species_name) - self.optimize_augustus_runner.run() - return - - # def set_rerun_busco_command(self, clargs): - # """ - # This function sets the command line to call to reproduce this run - # """ - # clargs.extend(["-sp", self._target_species]) - # if self._augustus_parameters: - # clargs.extend(["--augustus_parameters", "\"%s\"" % self._augustus_parameters]) - # super().set_rerun_busco_command(clargs) diff -Nru busco-4.1.4/src/busco/__init__.py busco-5.0.0/src/busco/__init__.py --- busco-4.1.4/src/busco/__init__.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/__init__.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,17 +1,33 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ .. package:: busco :synopsis: BUSCO - Benchmarking Universal Single-Copy Orthologs. -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. 
""" from ._version import __version__ as version -__all__ = ["Actions", "Analysis", "AutoLineage", "BuscoAnalysis","BuscoConfig", "BuscoDownloadManager", "BuscoLogger", - "BuscoPlacer", "BuscoRunner", "BuscoTools", "ConfigManager", "GeneSetAnalysis", "GenomeAnalysis", "Toolset", - "TranscriptomeAnalysis", "BuscoConfig", "BuscoPlacer"] -__version__ = version +__all__ = [ + "Actions", + "Analysis", + "AutoLineage", + "BuscoAnalysis", + "BuscoConfig", + "BuscoDownloadManager", + "BuscoLogger", + "BuscoPlacer", + "BuscoRunner", + "BuscoTools", + "ConfigManager", + "GeneSetAnalysis", + "GenomeAnalysis", + "Toolset", + "TranscriptomeAnalysis", + "BuscoConfig", + "BuscoPlacer", +] +__version__ = version diff -Nru busco-4.1.4/src/busco/run_BUSCO.py busco-5.0.0/src/busco/run_BUSCO.py --- busco-4.1.4/src/busco/run_BUSCO.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/run_BUSCO.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,18 +1,19 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ .. module:: run_BUSCO - :synopsis: BUSCO - Benchmarking Universal Single-Copy Orthologs. + :synopsis: .. versionadded:: 3.0.0 -.. versionchanged:: 4.0.beta1 +.. versionchanged:: 5.0.0 +BUSCO - Benchmarking Universal Single-Copy Orthologs. This is the BUSCO main script. To get help, ``busco -h``. See also the user guide. And visit our website ``_ -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. 
""" @@ -22,15 +23,16 @@ import sys import argparse import os +import shutil from argparse import RawTextHelpFormatter import busco from busco.BuscoLogger import BuscoLogger from busco.BuscoLogger import LogDecorator as log from busco.ConfigManager import BuscoConfigManager -from busco.BuscoConfig import BuscoConfigMain -from busco.Toolset import ToolException +from busco.busco_tools.Toolset import ToolException from busco.BuscoRunner import BuscoRunner from busco.Actions import ListLineagesAction, CleanHelpAction, CleanVersionAction +from busco.ConfigManager import BuscoConfigMain logger = BuscoLogger.get_logger(__name__) @@ -42,116 +44,259 @@ :rtype: dict """ - # todo: keyword arg order parser = argparse.ArgumentParser( - description='Welcome to BUSCO %s: the Benchmarking Universal Single-Copy Ortholog assessment tool.\n' - 'For more detailed usage information, please review the README file provided with ' - 'this distribution and the BUSCO user guide.' % busco.__version__, - usage='busco -i [SEQUENCE_FILE] -l [LINEAGE] -o [OUTPUT_NAME] -m [MODE] [OTHER OPTIONS]', - formatter_class=RawTextHelpFormatter, add_help=False) - - optional = parser.add_argument_group('optional arguments') - - optional.add_argument( - '-i', '--in', dest='in', required=False, metavar='FASTA FILE', help='Input sequence file in FASTA format. ' - 'Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set.') - - optional.add_argument( - '-c', '--cpu', dest='cpu', required=False, metavar='N', help='Specify the number (N=integer) ' - 'of threads/cores to use.') - optional.add_argument( - '-o', '--out', dest='out', required=False, metavar='OUTPUT', - help='Give your analysis run a recognisable short name. ' - 'Output folders and files will be labelled with this name. 
WARNING: do not provide a path') - - optional.add_argument( - '--out_path', dest='out_path', required=False, metavar='OUTPUT_PATH', - help='Optional location for results folder, excluding results folder name. ' - 'Default is current working directory.') - - optional.add_argument( - '-e', '--evalue', dest='evalue', required=False, metavar='N', type=float, - help='E-value cutoff for BLAST searches. ' - 'Allowed formats, 0.001 or 1e-03 (Default: %.0e)' % BuscoConfigMain.DEFAULT_ARGS_VALUES['evalue']) - - optional.add_argument( - '-m', '--mode', dest='mode', required=False, metavar='MODE', - help='Specify which BUSCO analysis mode to run.\n' - 'There are three valid modes:\n- geno or genome, for genome assemblies (DNA)\n- tran or ' - 'transcriptome, ' - 'for transcriptome assemblies (DNA)\n- prot or proteins, for annotated gene sets (protein)') - - optional.add_argument( - '-l', '--lineage_dataset', dest='lineage_dataset', required=False, metavar='LINEAGE', - help='Specify the name of the BUSCO lineage to be used.') - - optional.add_argument( - '-f', '--force', action='store_true', required=False, dest='force', - help='Force rewriting of existing files. 
' - 'Must be used when output files with the provided name already exist.') - - optional.add_argument( - '-r', '--restart', action='store_true', required=False, dest='restart', - help='Continue a run that had already partially completed.') - - optional.add_argument( - '--limit', dest='limit', metavar='REGION_LIMIT', required=False, - type=int, help='How many candidate regions (contig or transcript) to consider per BUSCO (default: %s)' - % str(BuscoConfigMain.DEFAULT_ARGS_VALUES['limit'])) - - optional.add_argument( - '--long', action='store_true', required=False, dest='long', - help='Optimization mode Augustus ' - 'self-training (Default: Off) adds considerably to the run time, ' - 'but can improve results for some non-model organisms') - - optional.add_argument( - '-q', '--quiet', dest='quiet', required=False, help='Disable the info logs, displays only errors', - action="store_true") - - optional.add_argument('--augustus_parameters', dest='augustus_parameters', required=False, - help="Pass additional arguments to Augustus. All arguments should be contained within a " - "single pair of quotation marks, separated by commas. E.g. 
\'--param1=1,--param2=2\'") - - optional.add_argument('--augustus_species', dest='augustus_species', required=False, - help="Specify a species for Augustus training.") - - # optional.add_argument( - # '-z', '--tarzip', dest='tarzip', required=False, help='Tarzip the output folders likely to ' - # 'contain thousands of files', - # action="store_true") - - optional.add_argument( - '--auto-lineage', dest='auto-lineage', action="store_true", required=False, - help='Run auto-lineage to find optimum lineage path') - - optional.add_argument( - '--auto-lineage-prok', dest='auto-lineage-prok', action="store_true", required=False, - help='Run auto-lineage just on non-eukaryote trees to find optimum lineage path') - - optional.add_argument( - '--auto-lineage-euk', dest='auto-lineage-euk', action="store_true", required=False, - help='Run auto-placement just on eukaryote tree to find optimum lineage path') - - optional.add_argument( - '--update-data', dest='update-data', action="store_true", required=False, - help='Download and replace with last versions all lineages datasets and files necessary' - ' to their automated selection') - - optional.add_argument( - '--offline', dest='offline', action="store_true", required=False, - help='To indicate that BUSCO cannot attempt to download files') - - optional.add_argument( - '--config', dest='config_file', required=False, help='Provide a config file') - - optional.add_argument('-v', '--version', action=CleanVersionAction, help="Show this version and exit", - version='BUSCO %s' % busco.__version__) - - optional.add_argument('-h', '--help', action=CleanHelpAction, help="Show this help message and exit") - - optional.add_argument('--list-datasets', action=ListLineagesAction, - help="Print the list of available BUSCO datasets") + description="Welcome to BUSCO %s: the Benchmarking Universal Single-Copy Ortholog assessment tool.\n" + "For more detailed usage information, please review the README file provided with " + "this distribution and 
the BUSCO user guide." % busco.__version__, + usage="busco -i [SEQUENCE_FILE] -l [LINEAGE] -o [OUTPUT_NAME] -m [MODE] [OTHER OPTIONS]", + formatter_class=RawTextHelpFormatter, + add_help=False, + ) + + optional = parser.add_argument_group("optional arguments") + + optional.add_argument( + "-i", + "--in", + dest="in", + required=False, + metavar="FASTA FILE", + help="Input sequence file in FASTA format. " + "Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set.", + ) + + optional.add_argument( + "-o", + "--out", + dest="out", + required=False, + metavar="OUTPUT", + help="Give your analysis run a recognisable short name. " + "Output folders and files will be labelled with this name. WARNING: do not provide a path", + ) + + optional.add_argument( + "-m", + "--mode", + dest="mode", + required=False, + metavar="MODE", + help="Specify which BUSCO analysis mode to run.\n" + "There are three valid modes:\n- geno or genome, for genome assemblies (DNA)\n- tran or " + "transcriptome, " + "for transcriptome assemblies (DNA)\n- prot or proteins, for annotated gene sets (protein)", + ) + + optional.add_argument( + "-l", + "--lineage_dataset", + dest="lineage_dataset", + required=False, + metavar="LINEAGE", + help="Specify the name of the BUSCO lineage to be used.", + ) + + optional.add_argument( + "--auto-lineage", + dest="auto-lineage", + action="store_true", + required=False, + help="Run auto-lineage to find optimum lineage path", + ) + + optional.add_argument( + "--auto-lineage-prok", + dest="auto-lineage-prok", + action="store_true", + required=False, + help="Run auto-lineage just on non-eukaryote trees to find optimum lineage path", + ) + + optional.add_argument( + "--auto-lineage-euk", + dest="auto-lineage-euk", + action="store_true", + required=False, + help="Run auto-placement just on eukaryote tree to find optimum lineage path", + ) + + optional.add_argument( + "-c", + "--cpu", + dest="cpu", + required=False, + metavar="N", 
+ help="Specify the number (N=integer) " "of threads/cores to use.", + ) + + optional.add_argument( + "-f", + "--force", + action="store_true", + required=False, + dest="force", + help="Force rewriting of existing files. " + "Must be used when output files with the provided name already exist.", + ) + + optional.add_argument( + "-r", + "--restart", + action="store_true", + required=False, + dest="restart", + help="Continue a run that had already partially completed.", + ) + + optional.add_argument( + "-q", + "--quiet", + dest="quiet", + required=False, + help="Disable the info logs, displays only errors", + action="store_true", + ) + + optional.add_argument( + "--out_path", + dest="out_path", + required=False, + metavar="OUTPUT_PATH", + help="Optional location for results folder, excluding results folder name. " + "Default is current working directory.", + ) + + optional.add_argument( + "--download_path", + dest="download_path", + required=False, + help="Specify local filepath for storing BUSCO dataset downloads", + ) + + optional.add_argument( + "--datasets_version", + dest="datasets_version", + required=False, + help="Specify the version of BUSCO datasets, e.g. odb10", + ) + + optional.add_argument( + "--download_base_url", + dest="download_base_url", + required=False, + help="Set the url to the remote BUSCO dataset location", + ) + + optional.add_argument( + "--update-data", + dest="update-data", + action="store_true", + required=False, + help="Download and replace with last versions all lineages datasets and files necessary" + " to their automated selection", + ) + + optional.add_argument( + "--offline", + dest="offline", + action="store_true", + required=False, + help="To indicate that BUSCO cannot attempt to download files", + ) + + optional.add_argument( + "--metaeuk_parameters", + dest="metaeuk_parameters", + required=False, + help="Pass additional arguments to Metaeuk for the first run. 
All arguments should be " + "contained within a single pair of quotation marks, separated by commas. " + 'E.g. "--param1=1,--param2=2"', + ) + + optional.add_argument( + "--metaeuk_rerun_parameters", + dest="metaeuk_rerun_parameters", + required=False, + help="Pass additional arguments to Metaeuk for the second run. All arguments should be " + "contained within a single pair of quotation marks, separated by commas. " + 'E.g. "--param1=1,--param2=2"', + ) + + optional.add_argument( + "-e", + "--evalue", + dest="evalue", + required=False, + metavar="N", + type=float, + help="E-value cutoff for BLAST searches. " + "Allowed formats, 0.001 or 1e-03 (Default: %.0e)" + % BuscoConfigMain.DEFAULT_ARGS_VALUES["evalue"], + ) + + optional.add_argument( + "--limit", + dest="limit", + metavar="REGION_LIMIT", + required=False, + type=int, + help="How many candidate regions (contig or transcript) to consider per BUSCO (default: %s)" + % str(BuscoConfigMain.DEFAULT_ARGS_VALUES["limit"]), + ) + + optional.add_argument( + "--augustus", + dest="use_augustus", + action="store_true", + required=False, + help="Use augustus gene predictor for eukaryote runs", + ) + + optional.add_argument( + "--augustus_parameters", + dest="augustus_parameters", + required=False, + help="Pass additional arguments to Augustus. All arguments should be contained within a " + 'single pair of quotation marks, separated by commas. E.g. 
"--param1=1,--param2=2"', + ) + + optional.add_argument( + "--augustus_species", + dest="augustus_species", + required=False, + help="Specify a species for Augustus training.", + ) + + optional.add_argument( + "--long", + action="store_true", + required=False, + dest="long", + help="Optimization Augustus self-training mode (Default: Off); adds considerably to the run " + "time, but can improve results for some non-model organisms", + ) + + optional.add_argument( + "--config", dest="config_file", required=False, help="Provide a config file" + ) + + optional.add_argument( + "-v", + "--version", + action=CleanVersionAction, + help="Show this version and exit", + version="BUSCO %s" % busco.__version__, + ) + + optional.add_argument( + "-h", "--help", action=CleanHelpAction, help="Show this help message and exit" + ) + + optional.add_argument( + "--list-datasets", + action=ListLineagesAction, + help="Print the list of available BUSCO datasets", + ) return vars(parser.parse_args()) @@ -168,8 +313,12 @@ run_BUSCO(params) -@log('***** Start a BUSCO v{} analysis, current time: {} *****'.format(busco.__version__, - time.strftime('%m/%d/%Y %H:%M:%S')), logger) +@log( + "***** Start a BUSCO v{} analysis, current time: {} *****".format( + busco.__version__, time.strftime("%m/%d/%Y %H:%M:%S") + ), + logger, +) def run_BUSCO(params): start_time = time.time() @@ -182,8 +331,11 @@ lineage_basename = os.path.basename(config.get("busco_run", "lineage_dataset")) main_out_folder = config.get("busco_run", "main_out") - lineage_results_folder = os.path.join(main_out_folder, "auto_lineage", - config.get("busco_run", "lineage_results_dir")) + lineage_results_folder = os.path.join( + main_out_folder, + "auto_lineage", + config.get("busco_run", "lineage_results_dir"), + ) if config.getboolean("busco_run", "auto-lineage"): if lineage_basename.startswith(("bacteria", "archaea", "eukaryota")): @@ -192,8 +344,9 @@ # It is possible that the following lineages were arrived at either by the 
Prodigal genetic code shortcut # or by BuscoPlacer. If the former, the run will have already been completed. If the latter it still needs # to be done. - elif lineage_basename.startswith(("mollicutes", "mycoplasmatales", "entomoplasmatales")) and \ - os.path.exists(lineage_results_folder): + elif lineage_basename.startswith( + ("mollicutes", "mycoplasmatales", "entomoplasmatales") + ) and os.path.exists(lineage_results_folder): busco_run = config_manager.runner else: busco_run = BuscoRunner(config) @@ -201,13 +354,21 @@ busco_run = BuscoRunner(config) if os.path.exists(lineage_results_folder): - os.rename(lineage_results_folder, os.path.join(main_out_folder, - config.get("busco_run", "lineage_results_dir"))) + new_dest = os.path.join( + main_out_folder, config.get("busco_run", "lineage_results_dir") + ) + if os.path.exists( + new_dest + ): # New dest would only exist if this is a rerun of a previously completed run + shutil.rmtree(new_dest) + os.rename(lineage_results_folder, new_dest) else: busco_run.run_analysis() - BuscoRunner.final_results.append(busco_run.analysis.hmmer_runner.hmmer_results_lines) + BuscoRunner.final_results.append( + busco_run.analysis.hmmer_runner.hmmer_results_lines + ) BuscoRunner.results_datasets.append(lineage_basename) - busco_run.finish(time.time()-start_time) + busco_run.finish(time.time() - start_time) except ToolException as e: logger.error(e) @@ -216,10 +377,11 @@ except SystemExit as se: logger.error(se) logger.debug(se, exc_info=True) - logger.error('BUSCO analysis failed !') + logger.error("BUSCO analysis failed !") logger.error( - "Check the logs, read the user guide, and check the BUSCO issue board on " - "https://gitlab.com/ezlab/busco/issues") + "Check the logs, read the user guide (https://busco.ezlab.org/busco_userguide.html), " + "and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues" + ) try: BuscoRunner.move_log_file(config) except NameError: @@ -232,13 +394,18 @@ raise SystemExit(1) except 
KeyboardInterrupt: - logger.exception('A signal was sent to kill the process. \nBUSCO analysis failed !') + logger.exception( + "A signal was sent to kill the process. \nBUSCO analysis failed !" + ) raise SystemExit(1) except BaseException: exc_type, exc_value, exc_traceback = sys.exc_info() - logger.critical("Unhandled exception occurred:\n{}\n".format( - "".join(traceback.format_exception(exc_type, exc_value, exc_traceback)))) + logger.critical( + "Unhandled exception occurred:\n{}\n".format( + "".join(traceback.format_exception(exc_type, exc_value, exc_traceback)) + ) + ) raise SystemExit(1) diff -Nru busco-4.1.4/src/busco/Toolset.py busco-5.0.0/src/busco/Toolset.py --- busco-4.1.4/src/busco/Toolset.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/Toolset.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,209 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -""" -.. module:: Toolset - :synopsis: the interface to OS enables to run executables / scripts - in external processes -.. versionadded:: 3.0.0 -.. versionchanged:: 4.0.0 - -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) -Licensed under the MIT license. See LICENSE.md file. 
- -""" -import os -import subprocess -from subprocess import TimeoutExpired -# import threading -from multiprocessing import Process, Pool, Value, Lock -import time -from shutil import which -from abc import ABCMeta, abstractmethod -from busco.BuscoLogger import BuscoLogger, ToolLogger -from busco.BuscoLogger import LogDecorator as log -from busco.BuscoLogger import StreamLogger -import logging - -logger = BuscoLogger.get_logger(__name__) - -class Job(Process):#threading.Thread): - """ - Build and executes one work item in an external process - """ - - def __init__(self, tool_name, cmd, job_outlogger, job_errlogger, timeout, **kwargs): - """ - :param name: a name of an executable / script ("a tool") to be run - :type cmd: list - :param thread_id: an int id for the thread - :type thread_id: int - """ - # initialize parent - super().__init__() - - self.tool_name = tool_name - self.cmd_line = [cmd] - self.job_outlogger = job_outlogger - self.job_errlogger = job_errlogger - self.timeout = timeout - self.kwargs = kwargs - - def add_parameter(self, parameter): - """ - Append parameter to the command line - :parameter: a parameter - :type parameter: str - """ - self.cmd_line.append(parameter) - - @log('cmd call: {}', logger, attr_name='cmd_line', apply='join', debug=True) - def run(self): - """ - Start external process and block the current thread's execution - till the process' run is over - """ - with StreamLogger(logging.DEBUG, self.job_outlogger, **self.kwargs) as out: # kwargs only provided to out to capture augustus stdout - with StreamLogger(logging.ERROR, self.job_errlogger) as err: - try: - # Stick with Popen(), communicate() and wait() instead of just run() to ensure compatibility with - # Python versions < 3.5. 
- p = subprocess.Popen(self.cmd_line, shell=False, stdout=out, stderr=err) - p.wait(self.timeout) - except TimeoutExpired: - p.kill() - logger.warning("The following job was killed as it was taking too long (>1hr) to " - "complete.\n{}".format(" ".join(self.cmd_line))) - - self.job_outlogger._file_hdlr.close() - self.job_outlogger.removeHandler(self.job_outlogger._file_hdlr) - self.job_errlogger._file_hdlr.close() - self.job_errlogger.removeHandler(self.job_errlogger._file_hdlr) - with cnt.get_lock(): - cnt.value += 1 - -class ToolException(Exception): - """ - Module-specific exception - """ - def __init__(self, value): - self.value = value - - def __str__(self): - return self.value - - -class Tool(metaclass=ABCMeta): - """ - Collection of utility methods used by all tools - """ - - def __init__(self): - """ - Initialize job list for a tool - :param name: the name of the tool to execute - :type name: str - :param config: initialized instance of ConfigParser - :type config: configparser.ConfigParser - """ - - self.cmd = None - # self.name = name - # if not self.check_tool_available(): - # raise ToolException("{} tool cannot be found. Please check the 'path' and 'command' parameters " - # "provided in the config file. 
Do not include the command in the path!".format(self.name)) - if self.name == "augustus": - self.kwargs = {"augustus_out": True} - self.timeout = 3600 - else: - self.kwargs = {} - self.timeout = None - self.jobs_to_run = [] - self.jobs_running = [] - self.nb_done = 0 - self.total = 0 - self.cpus = None - self.chunksize = None - # self.count_jobs_created = True - # self.logged_header = False - - # self.logfile_path_out = os.path.join(self.config.get("busco_run", "main_out"), "logs", "{}_out.log".format(self.name)) - # self.logfile_path_err = self.logfile_path_out.replace('_out.log', '_err.log') - - @abstractmethod - def configure_job(self): - pass - - @abstractmethod - def generate_job_args(self): - pass - - @property - @abstractmethod - def name(self): - raise NotImplementedError - - @abstractmethod - def write_checkpoint_file(self): - pass - - def create_job(self): - """ - Create one work item - """ - self.tool_outlogger = ToolLogger(self.logfile_path_out) - self.tool_errlogger = ToolLogger(self.logfile_path_err) - job = Job(self.name, self.cmd[:], self.tool_outlogger, self.tool_errlogger, self.timeout, **self.kwargs) - self.jobs_to_run.append(job) - # if self.count_jobs_created: - # self.total += 1 - return job - - def remove_job(self, job): - """ - Remove one work item - :param job: the Job to remove - :type job: Job - """ - self.jobs_to_run.remove(job) - - def log_jobs_to_run(self): - logger.info("Running {} job(s) on {}, starting at {}".format(self.total, self.name, - time.strftime('%m/%d/%Y %H:%M:%S'))) - return - - @log("No jobs to run on {}", logger, attr_name="name", iswarn=True) - def log_no_jobs(self): - return - - def run_jobs(self): - if self.total > 0: - self.log_jobs_to_run() - else: - self.log_no_jobs() - return - - if self.cpus is None: # todo: need a different way to ensure self.cpus is nonzero number. 
- raise SystemExit("Number of CPUs not specified.") - - with Pool(self.cpus, initializer=type(self).init_globals, initargs=(Value('i', 0),)) as job_pool: - job_pool.map(self.run_job, self.generate_job_args(), chunksize=self.chunksize) - self.write_checkpoint_file() - - def run_job(self, args): - args = (args,) if isinstance(args, str) else tuple(args or (args,)) # Ensure args are tuples that can be unpacked. If no args, args=None, which is falsy, and this evaluates to (None,) - job = self.configure_job(*args) - job.run() - self.nb_done = cnt.value - if (self.nb_done == self.total or int( - self.nb_done % float(self.total / 10)) == 0): - self._track_progress() - - @log('[{0}]\t{1} of {2} task(s) completed', logger, attr_name=['name', 'nb_done', 'total'], on_func_exit=True) - def _track_progress(self): - return - - @classmethod - def init_globals(cls, counter): - """Counter code adapted from the answer here: https://stackoverflow.com/a/53621343/4844311""" - global cnt - cnt = counter diff -Nru busco-4.1.4/src/busco/TranscriptomeAnalysis.py busco-5.0.0/src/busco/TranscriptomeAnalysis.py --- busco-4.1.4/src/busco/TranscriptomeAnalysis.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/TranscriptomeAnalysis.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,152 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 -""" -.. module:: TranscriptomeAnalysis - :synopsis:TranscriptomeAnalysis implements genome analysis specifics -.. versionadded:: 3.0.0 -.. versionchanged:: 3.0.0 - -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) -Licensed under the MIT license. See LICENSE.md file. 
- -""" -import os -from busco.BuscoAnalysis import BuscoAnalysis -from busco.BuscoLogger import BuscoLogger -from busco.BuscoLogger import LogDecorator as log -from Bio.Seq import reverse_complement, translate -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord -from busco.Analysis import NucleotideAnalysis - - -logger = BuscoLogger.get_logger(__name__) - -# todo: catch multiple buscos on one transcript - - -class TranscriptomeAnalysis(NucleotideAnalysis, BuscoAnalysis): - """ - Analysis on a transcriptome. - """ - - _mode = "transcriptome" - - def __init__(self): - """ - Initialize an instance. - """ - super().__init__() - - def run_analysis(self): - """ - This function calls all needed steps for running the analysis. - """ - - super().run_analysis() - - # if self._restart: # todo: reimplement restart mode - # checkpoint = self.get_checkpoint(reset_random_suffix=True) - # logger.warning("Restarting an uncompleted run") - # else: - # checkpoint = 0 # all steps will be done - # if checkpoint < 1: - - self._run_mkblast() - self._run_tblastn(ancestral_variants=self._has_variants_file) - - protein_seq_files = self._translate_seqs(self.tblastn_runner.coords) - - self.run_hmmer(protein_seq_files) - # Note BUSCO matches are not written to file, as we have not yet developed a suitable protocol for - # Transcriptomes - # if self._tarzip: - # self._run_tarzip_hmmer_output() - # self._run_tarzip_translated_proteins() - return - - def init_tools(self): - super().init_tools() - - def cleanup(self): - """ - This function cleans temporary files. 
- """ - super().cleanup() - - @staticmethod - def six_frame_translation(seq): - """ - Gets the sixframe translation for the provided sequence - :param seq: the sequence to be translated - :type seq: str - :return: the six translated sequences - :rtype: list - """ - descriptions = {1: "orig_seq_frame_1", - 2: "orig_seq_frame_2", - 3: "orig_seq_frame_3", - -1: "rev_comp_frame_1", - -2: "rev_comp_frame_2", - -3: "rev_comp_frame_3"} - - # Based on code excerpt from https://biopython.org/DIST/docs/api/Bio.SeqUtils-pysrc.html#six_frame_translations - anti = reverse_complement(seq) - translated_seqs = {} - for i in range(3): - fragment_length = 3 * ((len(seq) - i) // 3) - translated_seqs[descriptions[i+1]] = (translate(seq[i:i + fragment_length], stop_symbol="X")) - translated_seqs[descriptions[-(i+1)]] = (translate(anti[i:i + fragment_length], stop_symbol="X")) - return translated_seqs - - @staticmethod - def _reformats_seq_id(seq_id): - """ - This function reformats the sequence id to its original values - :param seq_id: the seq id to reformats - :type seq_id: str - :return: the reformatted seq_id - :rtype: str - """ - return "_".join(seq_id.split("_")[:-1]) - - @log("Translating candidate transcripts", logger) - def _translate_seqs(self, coords): - - translated_proteins_dir = os.path.join(self.main_out, "translated_proteins") - if not os.path.exists(translated_proteins_dir): - os.makedirs(translated_proteins_dir) - - contig_names = [] - for contig_info in coords.values(): - for contig in contig_info: - contig_names.append(contig) - - protein_seq_files = [] - for busco_id, contig_info in coords.items(): - output_filename = os.path.join(translated_proteins_dir, "{}.faa".format(busco_id)) - protein_seq_files.append(output_filename) - translated_records = [] - for contig_name in contig_info: - tmp_filename = os.path.join(self.tblastn_runner.output_seqs, "{}.temp".format( - contig_name[:100])) # Avoid very long filenames - for record in SeqIO.parse(tmp_filename, "fasta"): # 
These files will only ever have one sequence, - # but BioPython examples always parse them in an iterator. - translated_seqs = self.six_frame_translation(record.seq) - for desc_id in translated_seqs: # There are six possible translated sequences - prot_seq = translated_seqs[desc_id] - translated_records.append(SeqRecord(prot_seq, id=record.id, description=desc_id)) - - with open(output_filename, "w") as out_faa: - SeqIO.write(translated_records, out_faa, "fasta") - - return protein_seq_files - - # def _run_tarzip_translated_proteins(self): - # """ - # This function tarzips results folder - # """ - # # translated_proteins # Todo: rewrite with tarfile module - # self._p_open(["tar", "-C", "%s" % self.mainout, "-zcf", - # "%stranslated_proteins.tar.gz" % self.mainout, "translated_proteins", "--remove-files"], "bash", - # shell=False) diff -Nru busco-4.1.4/src/busco/_version.py busco-5.0.0/src/busco/_version.py --- busco-4.1.4/src/busco/_version.py 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/src/busco/_version.py 2021-01-26 11:28:47.000000000 +0000 @@ -1,9 +1,9 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 """ -Copyright (c) 2016-2020, Evgeny Zdobnov (ez@ezlab.org) +Copyright (c) 2016-2021, Evgeny Zdobnov (ez@ezlab.org) Licensed under the MIT license. See LICENSE.md file. 
""" -__version__ = "4.1.4" +__version__ = "5.0.0" diff -Nru busco-4.1.4/test_data/bacteria/expected_log.txt busco-5.0.0/test_data/bacteria/expected_log.txt --- busco-4.1.4/test_data/bacteria/expected_log.txt 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/test_data/bacteria/expected_log.txt 2021-01-26 11:28:47.000000000 +0000 @@ -1,7 +1,7 @@ -INFO: ***** Start a BUSCO v4.1.3 analysis, current time: 07/01/2020 18:43:08 ***** -INFO: Configuring BUSCO with /busco/config/config.ini +INFO: ***** Start a BUSCO v5.0.0 analysis, current time: 01/25/2021 20:10:29 ***** +INFO: Configuring BUSCO with local environment INFO: Mode is genome -INFO: Input file is genome.fna +INFO: Input file is /busco_wd/genome.fna INFO: Downloading information on latest versions of BUSCO data... WARNING: Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line. INFO: No lineage specified. Running lineage auto selector. @@ -10,16 +10,16 @@ This process runs BUSCO on the generic lineage datasets for the domains archaea, bacteria and eukaryota. Once the optimal domain is selected, BUSCO automatically attempts to find the most appropriate BUSCO dataset to use based on phylogenetic placement. --auto-lineage-euk and --auto-lineage-prok are also available if you know your input assembly is, or is not, an eukaryote. See the user guide for more information. A reminder: Busco evaluations are valid when an appropriate dataset is used, i.e., the dataset belongs to the lineage of the species to test. Because of overlapping markers/spurious matches among domains, busco matches in another domain do not necessarily mean that your genome/proteome contains sequences from this domain. However, a high busco score in multiple domains might help you identify possible contaminations. 
-INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/archaea_odb10.2019-01-04.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/archaea_odb10.2020-03-06.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/archaea_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2019-01-04) +INFO: Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2020-03-06) INFO: ***** Run Prodigal on input to predict and extract genes ***** INFO: Running Prodigal with genetic code 11 in single mode -INFO: Running 1 job(s) on prodigal, starting at 07/01/2020 18:43:09 +INFO: Running 1 job(s) on prodigal, starting at 01/25/2021 20:10:30 INFO: [prodigal] 1 of 1 task(s) completed INFO: Genetic code 11 selected as optimal INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 194 job(s) on hmmsearch, starting at 07/01/2020 18:43:10 +INFO: Running 194 job(s) on hmmsearch, starting at 01/25/2021 20:10:32 INFO: [hmmsearch] 20 of 194 task(s) completed INFO: [hmmsearch] 39 of 194 task(s) completed INFO: [hmmsearch] 59 of 194 task(s) completed @@ -32,13 +32,13 @@ INFO: [hmmsearch] 194 of 194 task(s) completed INFO: Results: C:5.2%[S:5.2%,D:0.0%],F:1.5%,M:93.3%,n:194 -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2019-06-26.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/bacteria_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2019-06-26) +INFO: Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06) INFO: ***** Run Prodigal on input to predict and extract genes ***** INFO: Genetic code 11 selected as optimal INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 124 job(s) on hmmsearch, starting at 07/01/2020 18:43:13 +INFO: Running 124 job(s) on hmmsearch, starting at 
01/25/2021 20:10:34 INFO: [hmmsearch] 13 of 124 task(s) completed INFO: [hmmsearch] 25 of 124 task(s) completed INFO: [hmmsearch] 38 of 124 task(s) completed @@ -51,90 +51,63 @@ INFO: [hmmsearch] 124 of 124 task(s) completed INFO: Results: C:21.0%[S:21.0%,D:0.0%],F:0.8%,M:78.2%,n:124 -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/eukaryota_odb10.2019-11-20.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/eukaryota_odb10.2020-09-10.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/eukaryota_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2019-11-20) -INFO: Running 1 job(s) on makeblastdb, starting at 07/01/2020 18:43:16 -INFO: Creating BLAST database with input file -INFO: [makeblastdb] 1 of 1 task(s) completed -INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 18:43:16 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Running Augustus gene predictor on BLAST search results. -INFO: Running Augustus prediction using fly as species: -INFO: Running 10 job(s) on augustus, starting at 07/01/2020 18:43:18 -INFO: [augustus] 1 of 10 task(s) completed -INFO: [augustus] 2 of 10 task(s) completed -INFO: [augustus] 3 of 10 task(s) completed -INFO: [augustus] 4 of 10 task(s) completed -INFO: [augustus] 5 of 10 task(s) completed -INFO: [augustus] 6 of 10 task(s) completed -INFO: [augustus] 7 of 10 task(s) completed -INFO: [augustus] 8 of 10 task(s) completed -INFO: [augustus] 9 of 10 task(s) completed -INFO: [augustus] 10 of 10 task(s) completed -INFO: Extracting predicted proteins... 
+INFO: Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10) +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 20:10:43 +INFO: [metaeuk] 1 of 1 task(s) completed INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 4 job(s) on hmmsearch, starting at 07/01/2020 18:43:51 -INFO: [hmmsearch] 1 of 4 task(s) completed -INFO: [hmmsearch] 2 of 4 task(s) completed -INFO: [hmmsearch] 3 of 4 task(s) completed -INFO: [hmmsearch] 4 of 4 task(s) completed -WARNING: BUSCO did not find any match. Make sure to check the log files if this is unexpected. -INFO: Results: C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:255 - -INFO: Starting second step of analysis. The gene predictor Augustus is retrained using the results from the initial run to yield more accurate results. -INFO: Extracting missing and fragmented buscos from the file ancestral_variants... -INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 18:43:52 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Converting predicted genes to short genbank files -WARNING: No jobs to run on gff2gbSmallDNA.pl -INFO: All files converted to short genbank files, now training Augustus using Single-Copy Complete BUSCOs -INFO: Running 1 job(s) on new_species.pl, starting at 07/01/2020 18:44:07 -INFO: [new_species.pl] 1 of 1 task(s) completed -INFO: Running 1 job(s) on etraining, starting at 07/01/2020 18:44:08 -INFO: [etraining] 1 of 1 task(s) completed -INFO: Re-running Augustus with the new metaparameters, number of target BUSCOs: 255 -INFO: Running Augustus gene predictor on BLAST search results. 
-INFO: Running Augustus prediction using BUSCO_test_bacteria as species: -INFO: Running 14 job(s) on augustus, starting at 07/01/2020 18:44:08 -INFO: [augustus] 2 of 14 task(s) completed -INFO: [augustus] 3 of 14 task(s) completed -INFO: [augustus] 5 of 14 task(s) completed -INFO: [augustus] 6 of 14 task(s) completed -INFO: [augustus] 7 of 14 task(s) completed -INFO: [augustus] 9 of 14 task(s) completed -INFO: [augustus] 10 of 14 task(s) completed -INFO: [augustus] 12 of 14 task(s) completed -INFO: [augustus] 13 of 14 task(s) completed -INFO: [augustus] 14 of 14 task(s) completed -INFO: Extracting predicted proteins... +INFO: Running 255 job(s) on hmmsearch, starting at 01/25/2021 20:11:02 +INFO: [hmmsearch] 26 of 255 task(s) completed +INFO: [hmmsearch] 51 of 255 task(s) completed +INFO: [hmmsearch] 51 of 255 task(s) completed +INFO: [hmmsearch] 77 of 255 task(s) completed +INFO: [hmmsearch] 102 of 255 task(s) completed +INFO: [hmmsearch] 128 of 255 task(s) completed +INFO: [hmmsearch] 153 of 255 task(s) completed +INFO: [hmmsearch] 179 of 255 task(s) completed +INFO: [hmmsearch] 204 of 255 task(s) completed +INFO: [hmmsearch] 230 of 255 task(s) completed +INFO: [hmmsearch] 255 of 255 task(s) completed +INFO: Results: C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255 + +INFO: Extracting missing and fragmented buscos from the file refseq_db.faa... +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 20:11:21 +INFO: [metaeuk] 1 of 1 task(s) completed INFO: ***** Run HMMER on gene sequences ***** -WARNING: No jobs to run on hmmsearch -WARNING: BUSCO did not find any match. Make sure to check the log files if this is unexpected. 
-INFO: Results: C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:255 +INFO: Running 252 job(s) on hmmsearch, starting at 01/25/2021 20:11:48 +INFO: [hmmsearch] 26 of 252 task(s) completed +INFO: [hmmsearch] 51 of 252 task(s) completed +INFO: [hmmsearch] 76 of 252 task(s) completed +INFO: [hmmsearch] 101 of 252 task(s) completed +INFO: [hmmsearch] 126 of 252 task(s) completed +INFO: [hmmsearch] 152 of 252 task(s) completed +INFO: [hmmsearch] 177 of 252 task(s) completed +INFO: [hmmsearch] 202 of 252 task(s) completed +INFO: [hmmsearch] 227 of 252 task(s) completed +INFO: [hmmsearch] 252 of 252 task(s) completed +INFO: Validating exons and removing overlapping matches +INFO: Results: C:1.2%[S:1.2%,D:0.0%],F:0.0%,M:98.8%,n:255 -WARNING: Augustus did not produce a retrained species folder. INFO: bacteria_odb10 selected INFO: ***** Searching tree for chosen lineage to find best taxonomic match ***** INFO: Extract markers... -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/list_of_reference_markers.bacteria_odb10.2019-12-16.txt.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/tree.bacteria_odb10.2019-12-16.nwk.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree.bacteria_odb10.2019-12-16.nwk.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/tree.bacteria_odb10.2019-12-16.nwk.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file 
'/busco_wd/busco_downloads/placement_files/tree_metadata.bacteria_odb10.2019-12-16.txt.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/supermatrix.aln.bacteria_odb10.2019-12-16.faa.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.bacteria_odb10.2019-12-16.txt.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxid-lineage.bacteria_odb10.2019-12-16.txt.tar.gz' INFO: Place the markers on the reference tree... -INFO: Running 1 job(s) on sepp, starting at 07/01/2020 18:44:10 +INFO: Running 1 job(s) on sepp, starting at 01/25/2021 20:11:51 INFO: [sepp] 1 of 1 task(s) completed INFO: Not enough markers were placed on the tree (11). Root lineage bacteria is kept INFO: @@ -150,15 +123,11 @@ |97 Missing BUSCOs (M) | |124 Total BUSCO groups searched | -------------------------------------------------- -INFO: BUSCO analysis done with WARNING(s). Total running time: 127 seconds +INFO: BUSCO analysis done with WARNING(s). 
Total running time: 129 seconds ***** Summary of warnings: ***** WARNING:busco.ConfigManager Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line. -WARNING:busco.BuscoTools BUSCO did not find any match. Make sure to check the log files if this is unexpected. -WARNING:busco.Toolset No jobs to run on gff2gbSmallDNA.pl -WARNING:busco.Toolset No jobs to run on hmmsearch -WARNING:busco.BuscoTools BUSCO did not find any match. Make sure to check the log files if this is unexpected. -WARNING:busco.BuscoTools Augustus did not produce a retrained species folder. INFO: Results written in /busco_wd/test_bacteria +INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html diff -Nru busco-4.1.4/test_data/eukaryota/expected_log.txt busco-5.0.0/test_data/eukaryota/expected_log.txt --- busco-4.1.4/test_data/eukaryota/expected_log.txt 2020-10-01 14:11:36.000000000 +0000 +++ busco-5.0.0/test_data/eukaryota/expected_log.txt 2021-01-26 11:28:47.000000000 +0000 @@ -1,7 +1,7 @@ -INFO: ***** Start a BUSCO v4.1.3 analysis, current time: 07/01/2020 17:20:41 ***** -INFO: Configuring BUSCO with /busco/config/config.ini +INFO: ***** Start a BUSCO v5.0.0 analysis, current time: 01/25/2021 19:52:12 ***** +INFO: Configuring BUSCO with local environment INFO: Mode is genome -INFO: Input file is genome.fna +INFO: Input file is /busco_wd/genome.fna INFO: Downloading information on latest versions of BUSCO data... WARNING: Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line. INFO: No lineage specified. 
Running lineage auto selector. @@ -10,17 +10,16 @@ This process runs BUSCO on the generic lineage datasets for the domains archaea, bacteria and eukaryota. Once the optimal domain is selected, BUSCO automatically attempts to find the most appropriate BUSCO dataset to use based on phylogenetic placement. --auto-lineage-euk and --auto-lineage-prok are also available if you know your input assembly is, or is not, an eukaryote. See the user guide for more information. A reminder: Busco evaluations are valid when an appropriate dataset is used, i.e., the dataset belongs to the lineage of the species to test. Because of overlapping markers/spurious matches among domains, busco matches in another domain do not necessarily mean that your genome/proteome contains sequences from this domain. However, a high busco score in multiple domains might help you identify possible contaminations. -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/archaea_odb10.2019-01-04.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/archaea_odb10.2020-03-06.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/archaea_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2019-01-04) +INFO: Running BUSCO using lineage dataset archaea_odb10 (prokaryota, 2020-03-06) INFO: ***** Run Prodigal on input to predict and extract genes ***** INFO: Running Prodigal with genetic code 11 in single mode -INFO: Running 1 job(s) on prodigal, starting at 07/01/2020 17:20:42 +INFO: Running 1 job(s) on prodigal, starting at 01/25/2021 19:52:13 INFO: [prodigal] 1 of 1 task(s) completed INFO: Genetic code 11 selected as optimal INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 194 job(s) on hmmsearch, starting at 07/01/2020 17:20:42 -INFO: [hmmsearch] 20 of 194 task(s) completed +INFO: Running 194 job(s) on hmmsearch, starting at 01/25/2021 19:52:13 INFO: [hmmsearch] 39 of 194 task(s) completed INFO: [hmmsearch] 59 
of 194 task(s) completed INFO: [hmmsearch] 78 of 194 task(s) completed @@ -28,23 +27,23 @@ INFO: [hmmsearch] 117 of 194 task(s) completed INFO: [hmmsearch] 136 of 194 task(s) completed INFO: [hmmsearch] 156 of 194 task(s) completed -INFO: [hmmsearch] 156 of 194 task(s) completed INFO: [hmmsearch] 175 of 194 task(s) completed INFO: [hmmsearch] 194 of 194 task(s) completed INFO: Results: C:1.0%[S:1.0%,D:0.0%],F:0.5%,M:98.5%,n:194 -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2019-06-26.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/bacteria_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2019-06-26) +INFO: Running BUSCO using lineage dataset bacteria_odb10 (prokaryota, 2020-03-06) INFO: ***** Run Prodigal on input to predict and extract genes ***** INFO: Genetic code 11 selected as optimal INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 124 job(s) on hmmsearch, starting at 07/01/2020 17:20:45 +INFO: Running 124 job(s) on hmmsearch, starting at 01/25/2021 19:52:16 INFO: [hmmsearch] 13 of 124 task(s) completed INFO: [hmmsearch] 25 of 124 task(s) completed INFO: [hmmsearch] 38 of 124 task(s) completed INFO: [hmmsearch] 50 of 124 task(s) completed INFO: [hmmsearch] 63 of 124 task(s) completed +INFO: [hmmsearch] 63 of 124 task(s) completed INFO: [hmmsearch] 75 of 124 task(s) completed INFO: [hmmsearch] 87 of 124 task(s) completed INFO: [hmmsearch] 100 of 124 task(s) completed @@ -53,209 +52,110 @@ WARNING: BUSCO did not find any match. Make sure to check the log files if this is unexpected. 
INFO: Results: C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:124 -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/eukaryota_odb10.2019-11-20.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/eukaryota_odb10.2020-09-10.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/eukaryota_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2019-11-20) -INFO: Running 1 job(s) on makeblastdb, starting at 07/01/2020 17:20:48 -INFO: Creating BLAST database with input file -INFO: [makeblastdb] 1 of 1 task(s) completed -INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 17:20:48 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Running Augustus gene predictor on BLAST search results. -INFO: Running Augustus prediction using fly as species: -INFO: Running 52 job(s) on augustus, starting at 07/01/2020 17:20:48 -INFO: [augustus] 6 of 52 task(s) completed -INFO: [augustus] 11 of 52 task(s) completed -INFO: [augustus] 16 of 52 task(s) completed -INFO: [augustus] 21 of 52 task(s) completed -INFO: [augustus] 27 of 52 task(s) completed -INFO: [augustus] 32 of 52 task(s) completed -INFO: [augustus] 37 of 52 task(s) completed -INFO: [augustus] 42 of 52 task(s) completed -INFO: [augustus] 47 of 52 task(s) completed -INFO: [augustus] 52 of 52 task(s) completed -INFO: Extracting predicted proteins... 
-INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 50 job(s) on hmmsearch, starting at 07/01/2020 17:21:44 -INFO: [hmmsearch] 5 of 50 task(s) completed -INFO: [hmmsearch] 10 of 50 task(s) completed -INFO: [hmmsearch] 15 of 50 task(s) completed -INFO: [hmmsearch] 20 of 50 task(s) completed -INFO: [hmmsearch] 25 of 50 task(s) completed -INFO: [hmmsearch] 30 of 50 task(s) completed -INFO: [hmmsearch] 35 of 50 task(s) completed -INFO: [hmmsearch] 40 of 50 task(s) completed -INFO: [hmmsearch] 45 of 50 task(s) completed -INFO: [hmmsearch] 50 of 50 task(s) completed -INFO: Results: C:15.3%[S:15.3%,D:0.0%],F:1.2%,M:83.5%,n:255 - -INFO: Starting second step of analysis. The gene predictor Augustus is retrained using the results from the initial run to yield more accurate results. -INFO: Extracting missing and fragmented buscos from the file ancestral_variants... -INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 17:21:45 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Converting predicted genes to short genbank files -INFO: Running 39 job(s) on gff2gbSmallDNA.pl, starting at 07/01/2020 17:21:49 -INFO: [gff2gbSmallDNA.pl] 4 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 8 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 12 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 16 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 20 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 24 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 28 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 32 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 36 of 39 task(s) completed -INFO: [gff2gbSmallDNA.pl] 39 of 39 task(s) completed -INFO: All files converted to short genbank files, now training Augustus using Single-Copy Complete BUSCOs -INFO: Running 1 job(s) on new_species.pl, starting at 07/01/2020 17:21:49 -INFO: [new_species.pl] 1 of 1 task(s) completed -INFO: Running 1 job(s) on etraining, 
starting at 07/01/2020 17:21:50 -INFO: [etraining] 1 of 1 task(s) completed -INFO: Re-running Augustus with the new metaparameters, number of target BUSCOs: 216 -INFO: Running Augustus gene predictor on BLAST search results. -INFO: Running Augustus prediction using BUSCO_test_eukaryota as species: -INFO: Running 39 job(s) on augustus, starting at 07/01/2020 17:21:50 -INFO: [augustus] 4 of 39 task(s) completed -INFO: [augustus] 8 of 39 task(s) completed -INFO: [augustus] 12 of 39 task(s) completed -INFO: [augustus] 16 of 39 task(s) completed -INFO: [augustus] 20 of 39 task(s) completed -INFO: [augustus] 24 of 39 task(s) completed -INFO: [augustus] 28 of 39 task(s) completed -INFO: [augustus] 32 of 39 task(s) completed -INFO: [augustus] 36 of 39 task(s) completed -INFO: [augustus] 39 of 39 task(s) completed -INFO: Extracting predicted proteins... -INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 34 job(s) on hmmsearch, starting at 07/01/2020 17:22:01 -INFO: [hmmsearch] 4 of 34 task(s) completed -INFO: [hmmsearch] 7 of 34 task(s) completed -INFO: [hmmsearch] 11 of 34 task(s) completed -INFO: [hmmsearch] 14 of 34 task(s) completed -INFO: [hmmsearch] 17 of 34 task(s) completed -INFO: [hmmsearch] 21 of 34 task(s) completed -INFO: [hmmsearch] 24 of 34 task(s) completed -INFO: [hmmsearch] 28 of 34 task(s) completed -INFO: [hmmsearch] 31 of 34 task(s) completed -INFO: [hmmsearch] 34 of 34 task(s) completed -INFO: Results: C:18.8%[S:18.8%,D:0.0%],F:1.2%,M:80.0%,n:255 +INFO: Running BUSCO using lineage dataset eukaryota_odb10 (eukaryota, 2020-09-10) +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 19:52:25 +INFO: [metaeuk] 1 of 1 task(s) completed +INFO: ***** Run HMMER on gene sequences ***** +INFO: Running 255 job(s) on hmmsearch, starting at 01/25/2021 19:52:43 +INFO: [hmmsearch] 51 of 255 task(s) completed +INFO: [hmmsearch] 77 of 255 task(s) completed +INFO: [hmmsearch] 77 of 255 task(s) completed +INFO: [hmmsearch] 102 of 255 task(s) completed 
+INFO: [hmmsearch] 128 of 255 task(s) completed +INFO: [hmmsearch] 153 of 255 task(s) completed +INFO: [hmmsearch] 179 of 255 task(s) completed +INFO: [hmmsearch] 204 of 255 task(s) completed +INFO: [hmmsearch] 230 of 255 task(s) completed +INFO: [hmmsearch] 255 of 255 task(s) completed +INFO: Results: C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255 + +INFO: Extracting missing and fragmented buscos from the file refseq_db.faa... +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 19:53:02 +INFO: [metaeuk] 1 of 1 task(s) completed +INFO: ***** Run HMMER on gene sequences ***** +INFO: Running 206 job(s) on hmmsearch, starting at 01/25/2021 19:53:16 +INFO: [hmmsearch] 21 of 206 task(s) completed +INFO: [hmmsearch] 42 of 206 task(s) completed +INFO: [hmmsearch] 62 of 206 task(s) completed +INFO: [hmmsearch] 83 of 206 task(s) completed +INFO: [hmmsearch] 104 of 206 task(s) completed +INFO: [hmmsearch] 124 of 206 task(s) completed +INFO: [hmmsearch] 145 of 206 task(s) completed +INFO: [hmmsearch] 165 of 206 task(s) completed +INFO: [hmmsearch] 186 of 206 task(s) completed +INFO: [hmmsearch] 206 of 206 task(s) completed +INFO: Validating exons and removing overlapping matches +INFO: Results: C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255 INFO: eukaryota_odb10 selected INFO: ***** Searching tree for chosen lineage to find best taxonomic match ***** INFO: Extract markers... 
-INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/list_of_reference_markers.eukaryota_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/list_of_reference_markers.eukaryota_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/list_of_reference_markers.eukaryota_odb10.2019-12-16.txt.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/tree.eukaryota_odb10.2019-12-16.nwk.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree.eukaryota_odb10.2019-12-16.nwk.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/tree.eukaryota_odb10.2019-12-16.nwk.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/tree_metadata.eukaryota_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/tree_metadata.eukaryota_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/tree_metadata.eukaryota_odb10.2019-12-16.txt.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/supermatrix.aln.eukaryota_odb10.2019-12-16.faa.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/supermatrix.aln.eukaryota_odb10.2019-12-16.faa.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/supermatrix.aln.eukaryota_odb10.2019-12-16.faa.tar.gz' -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz' -INFO: 
Downloading file 'https://busco-data.ezlab.org/v4/data/placement_files/mapping_taxid-lineage.eukaryota_odb10.2019-12-16.txt.tar.gz' +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxid-lineage.eukaryota_odb10.2019-12-16.txt.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/placement_files/mapping_taxid-lineage.eukaryota_odb10.2019-12-16.txt.tar.gz' INFO: Place the markers on the reference tree... -INFO: Running 1 job(s) on sepp, starting at 07/01/2020 17:22:02 +INFO: Running 1 job(s) on sepp, starting at 01/25/2021 19:53:19 INFO: [sepp] 1 of 1 task(s) completed -INFO: Lineage saccharomycetes is selected, supported by 16 markers out of 17 -INFO: Downloading file 'https://busco-data.ezlab.org/v4/data/lineages/saccharomycetes_odb10.2019-11-20.tar.gz' +INFO: Lineage saccharomycetes is selected, supported by 18 markers out of 19 +INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz' INFO: Decompressing file '/busco_wd/busco_downloads/lineages/saccharomycetes_odb10.tar.gz' -INFO: Running BUSCO using lineage dataset saccharomycetes_odb10 (eukaryota, 2019-11-20) -INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 17:25:10 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Running Augustus gene predictor on BLAST search results. 
-INFO: Running Augustus prediction using aspergillus_nidulans as species: -INFO: Running 98 job(s) on augustus, starting at 07/01/2020 17:25:14 -INFO: [augustus] 10 of 98 task(s) completed -INFO: [augustus] 20 of 98 task(s) completed -INFO: [augustus] 30 of 98 task(s) completed -INFO: [augustus] 40 of 98 task(s) completed -INFO: [augustus] 50 of 98 task(s) completed -INFO: [augustus] 59 of 98 task(s) completed -INFO: [augustus] 69 of 98 task(s) completed -INFO: [augustus] 79 of 98 task(s) completed -INFO: [augustus] 89 of 98 task(s) completed -INFO: [augustus] 98 of 98 task(s) completed -INFO: Extracting predicted proteins... -INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 63 job(s) on hmmsearch, starting at 07/01/2020 17:25:54 -INFO: [hmmsearch] 7 of 63 task(s) completed -INFO: [hmmsearch] 13 of 63 task(s) completed -INFO: [hmmsearch] 19 of 63 task(s) completed -INFO: [hmmsearch] 26 of 63 task(s) completed -INFO: [hmmsearch] 32 of 63 task(s) completed -INFO: [hmmsearch] 38 of 63 task(s) completed -INFO: [hmmsearch] 45 of 63 task(s) completed -INFO: [hmmsearch] 51 of 63 task(s) completed -INFO: [hmmsearch] 57 of 63 task(s) completed -INFO: [hmmsearch] 63 of 63 task(s) completed -INFO: Starting second step of analysis. The gene predictor Augustus is retrained using the results from the initial run to yield more accurate results. -INFO: Extracting missing and fragmented buscos from the file ancestral_variants... 
-INFO: Running a BLAST search for BUSCOs against created database -INFO: Running 1 job(s) on tblastn, starting at 07/01/2020 17:26:02 -INFO: [tblastn] 1 of 1 task(s) completed -INFO: Converting predicted genes to short genbank files -INFO: Running 29 job(s) on gff2gbSmallDNA.pl, starting at 07/01/2020 17:27:08 -INFO: [gff2gbSmallDNA.pl] 3 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 6 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 9 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 12 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 15 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 18 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 21 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 24 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 27 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 29 of 29 task(s) completed -INFO: [gff2gbSmallDNA.pl] 29 of 29 task(s) completed -INFO: All files converted to short genbank files, now training Augustus using Single-Copy Complete BUSCOs -INFO: Running 1 job(s) on new_species.pl, starting at 07/01/2020 17:27:09 -INFO: [new_species.pl] 1 of 1 task(s) completed -INFO: Running 1 job(s) on etraining, starting at 07/01/2020 17:27:09 -INFO: [etraining] 1 of 1 task(s) completed -INFO: Re-running Augustus with the new metaparameters, number of target BUSCOs: 2108 -INFO: Running Augustus gene predictor on BLAST search results. 
-INFO: Running Augustus prediction using BUSCO_test_eukaryota as species: -INFO: Running 147 job(s) on augustus, starting at 07/01/2020 17:27:10 -INFO: [augustus] 15 of 147 task(s) completed -INFO: [augustus] 30 of 147 task(s) completed -INFO: [augustus] 45 of 147 task(s) completed -INFO: [augustus] 59 of 147 task(s) completed -INFO: [augustus] 74 of 147 task(s) completed -INFO: [augustus] 89 of 147 task(s) completed -INFO: [augustus] 103 of 147 task(s) completed -INFO: [augustus] 118 of 147 task(s) completed -INFO: [augustus] 133 of 147 task(s) completed -INFO: [augustus] 147 of 147 task(s) completed -INFO: Extracting predicted proteins... -INFO: ***** Run HMMER on gene sequences ***** -INFO: Running 140 job(s) on hmmsearch, starting at 07/01/2020 17:27:58 -INFO: [hmmsearch] 14 of 140 task(s) completed -INFO: [hmmsearch] 28 of 140 task(s) completed -INFO: [hmmsearch] 42 of 140 task(s) completed -INFO: [hmmsearch] 56 of 140 task(s) completed -INFO: [hmmsearch] 70 of 140 task(s) completed -INFO: [hmmsearch] 84 of 140 task(s) completed -INFO: [hmmsearch] 98 of 140 task(s) completed -INFO: [hmmsearch] 112 of 140 task(s) completed -INFO: [hmmsearch] 126 of 140 task(s) completed -INFO: [hmmsearch] 140 of 140 task(s) completed -INFO: Results: C:2.0%[S:2.0%,D:0.0%],F:0.3%,M:97.7%,n:2137 +INFO: Running BUSCO using lineage dataset saccharomycetes_odb10 (eukaryota, 2020-08-05) +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 19:55:14 +INFO: [metaeuk] 1 of 1 task(s) completed +INFO: ***** Run HMMER on gene sequences ***** +INFO: Running 2137 job(s) on hmmsearch, starting at 01/25/2021 19:55:17 +INFO: [hmmsearch] 214 of 2137 task(s) completed +INFO: [hmmsearch] 428 of 2137 task(s) completed +INFO: [hmmsearch] 642 of 2137 task(s) completed +INFO: [hmmsearch] 855 of 2137 task(s) completed +INFO: [hmmsearch] 1069 of 2137 task(s) completed +INFO: [hmmsearch] 1283 of 2137 task(s) completed +INFO: [hmmsearch] 1496 of 2137 task(s) completed +INFO: [hmmsearch] 1710 of 2137 
task(s) completed +INFO: [hmmsearch] 1924 of 2137 task(s) completed +INFO: [hmmsearch] 2137 of 2137 task(s) completed +INFO: Extracting missing and fragmented buscos from the file refseq_db.faa... +INFO: Running 1 job(s) on metaeuk, starting at 01/25/2021 19:55:34 +INFO: [metaeuk] 1 of 1 task(s) completed +INFO: ***** Run HMMER on gene sequences ***** +INFO: Running 2093 job(s) on hmmsearch, starting at 01/25/2021 19:55:37 +INFO: [hmmsearch] 210 of 2093 task(s) completed +INFO: [hmmsearch] 419 of 2093 task(s) completed +INFO: [hmmsearch] 628 of 2093 task(s) completed +INFO: [hmmsearch] 838 of 2093 task(s) completed +INFO: [hmmsearch] 1047 of 2093 task(s) completed +INFO: [hmmsearch] 1256 of 2093 task(s) completed +INFO: [hmmsearch] 1466 of 2093 task(s) completed +INFO: [hmmsearch] 1675 of 2093 task(s) completed +INFO: [hmmsearch] 1675 of 2093 task(s) completed +INFO: [hmmsearch] 1884 of 2093 task(s) completed +INFO: [hmmsearch] 2093 of 2093 task(s) completed +INFO: Validating exons and removing overlapping matches +INFO: Results: C:2.1%[S:2.1%,D:0.0%],F:0.0%,M:97.9%,n:2137 INFO: -------------------------------------------------- |Results from generic domain eukaryota_odb10 | -------------------------------------------------- - |C:18.8%[S:18.8%,D:0.0%],F:1.2%,M:80.0%,n:255 | - |48 Complete BUSCOs (C) | - |48 Complete and single-copy BUSCOs (S) | + |C:19.2%[S:19.2%,D:0.0%],F:0.8%,M:80.0%,n:255 | + |49 Complete BUSCOs (C) | + |49 Complete and single-copy BUSCOs (S) | |0 Complete and duplicated BUSCOs (D) | - |3 Fragmented BUSCOs (F) | + |2 Fragmented BUSCOs (F) | |204 Missing BUSCOs (M) | |255 Total BUSCO groups searched | -------------------------------------------------- @@ -263,19 +163,20 @@ -------------------------------------------------- |Results from dataset saccharomycetes_odb10 | -------------------------------------------------- - |C:2.0%[S:2.0%,D:0.0%],F:0.3%,M:97.7%,n:2137 | - |42 Complete BUSCOs (C) | - |42 Complete and single-copy BUSCOs (S) | + 
|C:2.1%[S:2.1%,D:0.0%],F:0.0%,M:97.9%,n:2137 | + |45 Complete BUSCOs (C) | + |45 Complete and single-copy BUSCOs (S) | |0 Complete and duplicated BUSCOs (D) | - |6 Fragmented BUSCOs (F) | - |2089 Missing BUSCOs (M) | + |1 Fragmented BUSCOs (F) | + |2091 Missing BUSCOs (M) | |2137 Total BUSCO groups searched | -------------------------------------------------- -INFO: BUSCO analysis done with WARNING(s). Total running time: 440 seconds +INFO: BUSCO analysis done with WARNING(s). Total running time: 216 seconds ***** Summary of warnings: ***** WARNING:busco.ConfigManager Running Auto Lineage Selector as no lineage dataset was specified. This will take a little longer than normal. If you know what lineage dataset you want to use, please specify this in the config file or using the -l (--lineage-dataset) flag in the command line. -WARNING:busco.BuscoTools BUSCO did not find any match. Make sure to check the log files if this is unexpected. +WARNING:busco.busco_tools.hmmer BUSCO did not find any match. Make sure to check the log files if this is unexpected. 
INFO: Results written in /busco_wd/test_eukaryota +INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html diff -Nru busco-4.1.4/tests/unittest_runner.py busco-5.0.0/tests/unittest_runner.py --- busco-4.1.4/tests/unittest_runner.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittest_runner.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,22 @@ +import unittest +from tests.unittests import run_BUSCO_unittests +from tests.unittests import ConfigManager_unittests +from tests.unittests import BuscoConfig_unittests +from tests.unittests import AutoLineage_unittests +from tests.unittests import GenomeAnalysis_unittests +import sys + +loader = unittest.TestLoader() +suite = unittest.TestSuite() + +suite.addTests(loader.loadTestsFromModule(run_BUSCO_unittests)) +suite.addTests(loader.loadTestsFromModule(ConfigManager_unittests)) +suite.addTests(loader.loadTestsFromModule(BuscoConfig_unittests)) +suite.addTests(loader.loadTestsFromModule(AutoLineage_unittests)) +suite.addTests(loader.loadTestsFromModule(GenomeAnalysis_unittests)) + +runner = unittest.TextTestRunner(verbosity=3) +result = runner.run(suite) + +ret = not result.wasSuccessful() +sys.exit(ret) diff -Nru busco-4.1.4/tests/unittests/AutoLineage_unittests.py busco-5.0.0/tests/unittests/AutoLineage_unittests.py --- busco-4.1.4/tests/unittests/AutoLineage_unittests.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittests/AutoLineage_unittests.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,312 @@ +import unittest +from unittest.mock import patch, call, Mock +from busco import AutoLineage, BuscoRunner + + +class TestAutoLineage(unittest.TestCase): + def setUp(self): + pass + + @patch("busco.BuscoConfig.BuscoConfigMain", autospec=True) + def test_init_autolineage(self, mock_config_main): + with self.assertLogs(AutoLineage.logger, 20): + AutoLineage.AutoSelectLineage(mock_config_main) + + 
@patch("busco.AutoLineage.AutoSelectLineage.virus_check", return_value=False) + @patch( + "busco.BuscoConfig.BuscoConfigMain.getboolean", + side_effect=[True, False, True, False, False], + ) + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.os") + @patch("busco.AutoLineage.BuscoRunner") + @patch("busco.AutoLineage.AutoSelectLineage.get_best_match_lineage") + @patch("busco.AutoLineage.AutoSelectLineage.run_lineages_list") + @patch("busco.BuscoConfig.BuscoConfigMain", autospec=True) + def test_run_auto_selector_lineage_lists_no_virus( + self, mock_config_main, mock_run_lineages_list, *args + ): + for _ in range(3): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + asl.selected_runner = Mock() + asl.selected_runner.analysis.hmmer_runner.single_copy_buscos = [ + 0 + ] # avoid SystemExit with empty HMMER results + asl.selected_runner.analysis.hmmer_runner.multi_copy_buscos = [0] + asl.selected_runner.analysis.hmmer_runner.fragmented_buscos = [0] + asl.run_auto_selector() + + calls = [ + call(["archaea", "bacteria"]), + call(["eukaryota"]), + call(["archaea", "bacteria", "eukaryota"]), + ] + mock_run_lineages_list.assert_has_calls(calls, any_order=True) + + @patch("busco.AutoLineage.logger.info") + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + @patch("busco.AutoLineage.BuscoConfigAuto", autospec=True) + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_run_lineages_initializes_BuscoConfigAuto( + self, mock_config_main, mock_config_auto, *args + ): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + test_lineages = ["a", "b", "c"] + test_dataset_version = "" + asl.dataset_version = test_dataset_version + asl.run_lineages_list(test_lineages) + calls = [ + call( + mock_config_main, "{}_{}".format(test_lineages[0], test_dataset_version) + ), + call( + mock_config_main, "{}_{}".format(test_lineages[1], test_dataset_version) + ), + call( + mock_config_main, "{}_{}".format(test_lineages[2], test_dataset_version) + ), + 
] + mock_config_auto.assert_has_calls(calls, any_order=True) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.BuscoConfigAuto", autospec=True) + @patch("busco.BuscoConfig.BuscoConfigMain") + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + def test_run_lineages_initializes_BuscoRunner( + self, mock_runner, mock_config_main, mock_config_auto, *args + ): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + test_lineages = ["a", "b", "c"] + test_dataset_version = "" + asl.dataset_version = test_dataset_version + asl.run_lineages_list(test_lineages) + mock_runner.assert_called_with(mock_config_auto.return_value) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.BuscoConfigAuto", autospec=True) + @patch("busco.BuscoConfig.BuscoConfigMain") + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + def test_run_lineages_runs_analysis(self, mock_runner, mock_config_main, *args): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + test_lineages = ["a", "b", "c"] + test_dataset_version = "" + asl.dataset_version = test_dataset_version + asl.run_lineages_list(test_lineages) + mock_runner.return_value.run_analysis.assert_called_with( + callback=asl.record_results + ) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.BuscoConfigAuto", autospec=True) + @patch("busco.BuscoConfig.BuscoConfigMain") + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + def test_run_lineages_returns_runners(self, mock_runner, mock_config_main, *args): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + test_lineages = ["a", "b", "c"] + test_dataset_version = "" + asl.dataset_version = test_dataset_version + runners = asl.run_lineages_list(test_lineages) + self.assertEqual( + runners, + [ + mock_runner.return_value, + mock_runner.return_value, + mock_runner.return_value, + ], + ) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.BuscoConfigAuto", 
autospec=True) + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_record_results_first_run(self, mock_config_main, *args): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + asl.record_results(0, 1, 2, 0, 1, 2) + self.assertGreater(len(asl.s_buscos), 0) + self.assertGreater(len(asl.d_buscos), 0) + self.assertGreater(len(asl.f_buscos), 0) + self.assertGreater(len(asl.s_percents), 0) + self.assertGreater(len(asl.d_percents), 0) + self.assertGreater(len(asl.f_percents), 0) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.BuscoConfigAuto", autospec=True) + @patch("__main__.AutoLineage_unittests.AutoLineage.BuscoRunner") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_record_results_multiple_runs(self, mock_config_main, *args): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + asl.record_results(0, 1, 2, 0, 1, 2) + asl.record_results(0, 1, 2, 0, 1, 2) + self.assertGreater(len(asl.s_buscos), 1) + self.assertGreater(len(asl.d_buscos), 1) + self.assertGreater(len(asl.f_buscos), 1) + self.assertGreater(len(asl.s_percents), 1) + self.assertGreater(len(asl.d_percents), 1) + self.assertGreater(len(asl.f_percents), 1) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_evaluate_single_runner(self, mock_config_main, *args): + runner1 = Mock() + runner1.analysis.hmmer_runner.single_copy = 1 + runner1.analysis.hmmer_runner.multi_copy = 1 + runner1.analysis.hmmer_runner.only_fragments = 1 + asl = AutoLineage.AutoSelectLineage(mock_config_main) + max_ind = asl.evaluate([runner1]) + self.assertEqual(max_ind, 0) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_evaluate_multiple_runners(self, mock_config_main, *args): + runner1 = Mock() + runner2 = Mock() + runner3 = Mock() + runner1.analysis.hmmer_runner.single_copy = 10 + runner1.analysis.hmmer_runner.multi_copy = 5 
+ runner2.analysis.hmmer_runner.single_copy = 15 + runner2.analysis.hmmer_runner.multi_copy = 5 + runner3.analysis.hmmer_runner.single_copy = 12 + runner3.analysis.hmmer_runner.multi_copy = 5 + asl = AutoLineage.AutoSelectLineage(mock_config_main) + max_ind = asl.evaluate([runner1, runner2, runner3]) + self.assertEqual(max_ind, 1) + + runner2.analysis.hmmer_runner.single_copy = 10 + runner2.analysis.hmmer_runner.multi_copy = 6 + runner3.analysis.hmmer_runner.single_copy = 10 + runner3.analysis.hmmer_runner.multi_copy = 7 + max_ind = asl.evaluate([runner1, runner2, runner3]) + self.assertEqual(max_ind, 2) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_evaluate_first_order_tiebreak(self, mock_config_main, *args): + runner1 = Mock() + runner2 = Mock() + runner3 = Mock() + runner1.analysis.hmmer_runner.single_copy = 10 + runner1.analysis.hmmer_runner.multi_copy = 5 + runner1.analysis.hmmer_runner.only_fragments = 1 + runner2.analysis.hmmer_runner.single_copy = 10 + runner2.analysis.hmmer_runner.multi_copy = 5 + runner2.analysis.hmmer_runner.only_fragments = 2 + runner3.analysis.hmmer_runner.single_copy = 12 + runner3.analysis.hmmer_runner.multi_copy = 0 + runner3.analysis.hmmer_runner.only_fragments = 3 + asl = AutoLineage.AutoSelectLineage(mock_config_main) + max_ind = asl.evaluate([runner1, runner2, runner3]) + self.assertEqual(max_ind, 1) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_evaluate_second_order_tiebreak(self, mock_config_main, *args): + runner1 = Mock() + runner2 = Mock() + runner3 = Mock() + runner4 = Mock() + runner1.analysis.hmmer_runner.single_copy = 10 + runner1.analysis.hmmer_runner.multi_copy = 5 + runner1.analysis.hmmer_runner.only_fragments = 1 + runner1.analysis.hmmer_runner.s_percent = 20 + runner2.analysis.hmmer_runner.single_copy = 10 + runner2.analysis.hmmer_runner.multi_copy = 5 + runner2.analysis.hmmer_runner.only_fragments = 2 + 
runner2.analysis.hmmer_runner.s_percent = 40 + runner3.analysis.hmmer_runner.single_copy = 12 + runner3.analysis.hmmer_runner.multi_copy = 0 + runner3.analysis.hmmer_runner.only_fragments = 3 + runner3.analysis.hmmer_runner.s_percent = 60 + runner4.analysis.hmmer_runner.single_copy = 14 + runner4.analysis.hmmer_runner.multi_copy = 1 + runner4.analysis.hmmer_runner.only_fragments = 2 + runner4.analysis.hmmer_runner.s_percent = 80 + asl = AutoLineage.AutoSelectLineage(mock_config_main) + max_ind = asl.evaluate([runner1, runner2, runner3, runner4]) + self.assertEqual(max_ind, 3) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain") + def test_evaluate_third_order_tiebreak(self, mock_config_main, *args): + runner1 = Mock() + runner2 = Mock() + runner3 = Mock() + runner4 = Mock() + runner1.analysis.hmmer_runner.single_copy = 10 + runner1.analysis.hmmer_runner.multi_copy = 5 + runner1.analysis.hmmer_runner.only_fragments = 1 + runner1.analysis.hmmer_runner.s_percent = 20 + runner2.analysis.hmmer_runner.single_copy = 10 + runner2.analysis.hmmer_runner.multi_copy = 5 + runner2.analysis.hmmer_runner.only_fragments = 2 + runner2.analysis.hmmer_runner.s_percent = 80 + runner3.analysis.hmmer_runner.single_copy = 12 + runner3.analysis.hmmer_runner.multi_copy = 0 + runner3.analysis.hmmer_runner.only_fragments = 3 + runner3.analysis.hmmer_runner.s_percent = 60 + runner4.analysis.hmmer_runner.single_copy = 14 + runner4.analysis.hmmer_runner.multi_copy = 1 + runner4.analysis.hmmer_runner.only_fragments = 2 + runner4.analysis.hmmer_runner.s_percent = 80 + asl = AutoLineage.AutoSelectLineage(mock_config_main) + with self.assertLogs(AutoLineage.logger, "WARNING"): + max_ind = asl.evaluate([runner1, runner2, runner3, runner4]) + self.assertEqual(max_ind, 1) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.AutoLineage.AutoSelectLineage.cleanup_disused_runs") + @patch("__main__.AutoLineage_unittests.BuscoRunner.BuscoRunner.mode_dict") + 
@patch("busco.BuscoConfig.BuscoConfigMain", autospec=True) + def test_get_best_match_lineage( + self, mock_config_main, fake_modedict, mock_cleanup, *args + ): + mock_config_main.get.side_effect = [None] + + mock_config1 = Mock() + mock_config2 = Mock() + mock_config3 = Mock() + mock_config1.get.side_effect = ["tran", None, "test1"] + mock_config2.get.side_effect = ["tran", None, "test2"] + mock_config3.get.side_effect = ["tran", None, "test3"] + + mock_analysis1 = Mock() + mock_analysis2 = Mock() + mock_analysis3 = Mock() + mock_analysis1.return_value.hmmer_runner.single_copy = 75 + mock_analysis2.return_value.hmmer_runner.single_copy = 85 + mock_analysis3.return_value.hmmer_runner.single_copy = 80 + mock_analysis1.return_value.hmmer_runner.multi_copy = 5 + mock_analysis2.return_value.hmmer_runner.multi_copy = 6 + mock_analysis3.return_value.hmmer_runner.multi_copy = 7 + + fake_modedict.__getitem__.side_effect = [ + mock_analysis1, + mock_analysis2, + mock_analysis3, + ] + + runner1 = BuscoRunner.BuscoRunner(mock_config1) + runner2 = BuscoRunner.BuscoRunner(mock_config2) + runner3 = BuscoRunner.BuscoRunner(mock_config3) + + asl = AutoLineage.AutoSelectLineage(mock_config_main) + asl.get_best_match_lineage([runner1, runner2, runner3]) + self.assertEqual(asl.best_match_lineage_dataset, "test2") + self.assertEqual(asl.selected_runner, runner2) + mock_cleanup.assert_called_with([runner1, runner3]) + + @patch("busco.AutoLineage.logger.info") + @patch("busco.BuscoConfig.BuscoConfigMain", autospec=True) + def test_cleanup_disused_runs(self, mock_config_main, *args): + asl = AutoLineage.AutoSelectLineage(mock_config_main) + mock_runner1 = Mock() + mock_runner2 = Mock() + mock_runner1.cleaned_up = False + mock_runner2.cleaned_up = True + asl.cleanup_disused_runs([mock_runner1, mock_runner2]) + mock_runner1.cleanup.assert_called() + mock_runner2.cleanup.assert_not_called() + + # Todo: add tests for get_lineage_datasets, 3_dataset check and busco_placer step + + def 
tearDown(self): + pass diff -Nru busco-4.1.4/tests/unittests/BuscoConfig_unittests.py busco-5.0.0/tests/unittests/BuscoConfig_unittests.py --- busco-4.1.4/tests/unittests/BuscoConfig_unittests.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittests/BuscoConfig_unittests.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,628 @@ +import unittest +from busco import BuscoConfig +import shutil +import os +from unittest.mock import Mock +from unittest.mock import patch +from pathlib import Path + + +class TestBuscoConfig(unittest.TestCase): + maxDiff = None + + def setUp(self): + self.maxDiff = None + self.base_config = "config/config.ini" + + self.params = { + "auto-lineage": False, + "auto-lineage-euk": False, + "auto-lineage-prok": False, + "config_file": None, + "cpu": None, + "evalue": None, + "force": False, + "help": "==SUPPRESS==", + "in": None, + "limit": None, + "lineage_dataset": None, + "list_datasets": "==SUPPRESS==", + "mode": None, + "offline": False, + "out": None, + "out_path": None, + "quiet": False, + "restart": False, + "metaeuk_parameters": None, + "metaeuk_rerun_parameters": None, + "use_augustus": False, + "augustus_parameters": None, + "augustus_species": None, + "long": False, + "datasets_version": None, + "download_base_url": None, + "download_path": None, + "update-data": False, + "version": "==SUPPRESS==", + } + + self.test_params = { + "in": "input_test", + "out": "output_test", + "mode": "mode_test", + } + + self.config_structure = { + "augustus": ["path", "command"], + "busco_run": [ + "in", + "out", + "out_path", + "mode", + "auto-lineage", + "auto-lineage-prok", + "auto-lineage-euk", + "cpu", + "force", + "restart", + "download_path", + "datasets_version", + "quiet", + "offline", + "long", + "augustus_parameters", + "augustus_species", + "download_base_url", + "lineage_dataset", + "update-data", + "metaeuk_parameters", + "metaeuk_rerun_parameters", + "evalue", + "limit", + "use_augustus", + ], + "etraining": ["path", 
"command"], + "gff2gbSmallDNA.pl": ["path", "command"], + "hmmsearch": ["path", "command"], + "makeblastdb": ["path", "command"], + "metaeuk": ["path", "command"], + "new_species.pl": ["path", "command"], + "optimize_augustus.pl": ["path", "command"], + "prodigal": ["path", "command"], + "sepp": ["path", "command"], + "tblastn": ["path", "command"], + } + + def test_read_config_file(self): + config = BuscoConfig.BaseConfig() + config.conf_file = self.base_config + config._load_config_file() + self.assertIn("busco_run", config.sections()) + + def test_read_config_file_ioerror(self): + with self.assertRaises(SystemExit): + config = BuscoConfig.BaseConfig() + config.conf_file = "/path/not/found" + config._load_config_file() + + def test_read_config_file_parseerror(self): + config_path = "tests/config_parseerror_test.ini" + test_config_contents = "in=input_file\n" + with open(config_path, "w") as f: + f.write(test_config_contents) + + with self.assertRaises(SystemExit): + config = BuscoConfig.BaseConfig() + config.conf_file = config_path + config._load_config_file() + os.remove(config_path) + + def test_read_config_file_duplicateerror(self): + config_path = "tests/config_duplicate_test.ini" + test_config_contents = "[busco_run]\n" "in=input_file\n" "in=input_file\n" + with open(config_path, "w") as f: + f.write(test_config_contents) + + with self.assertRaises(SystemExit): + config = BuscoConfig.BaseConfig() + config.conf_file = config_path + config._load_config_file() + os.remove(config_path) + + def test_config_update_args_bool(self): + update_params = { + "force": True, + "offline": True, + "quiet": True, + "restart": True, + } + config = BuscoConfig.BuscoConfigMain(self.base_config, update_params) + config.configure() + self.assertEqual( + update_params, + {key: config.getboolean("busco_run", key) for key in update_params.keys()}, + ) + + def test_config_update_args_nonbool(self): + update_params = { + "cpu": "10", + "evalue": "0.01", + "in": "input_file", + 
"limit": "1", + "lineage_dataset": "test", + "mode": "test", + "out": "test", + "out_path": "test", + } + config = BuscoConfig.BuscoConfigMain(self.base_config, update_params) + config.configure() + self.assertEqual( + update_params, + {key: config.get("busco_run", key) for key in update_params.keys()}, + ) + + def test_config_default_params(self): + correct_default_params = { + "auto-lineage": False, + "auto-lineage-euk": False, + "auto-lineage-prok": False, + "cpu": "1", + "datasets_version": "odb10", + "download_base_url": "https://busco-data.ezlab.org/v5/data/", + "download_path": os.path.join(os.getcwd(), "busco_downloads"), + "evalue": "0.001", + "force": False, + "limit": "3", + "long": False, + "offline": False, + "out_path": os.getcwd(), + "quiet": False, + "restart": False, + "update-data": False, + "use_augustus": False, + } + config = BuscoConfig.BuscoConfigMain(self.base_config, {}) + config.configure() + config_default_filled = { + key: config.get("busco_run", key) for key in correct_default_params + } + + self.assertEqual( + {key: str(val) for key, val in correct_default_params.items()}, + config_default_filled, + ) + + @patch( + "busco.BuscoConfig.BuscoConfigMain.getboolean", + side_effect=[True, False, False, True, False], + ) + def test_config_auto_lineage_settings(self, *args): + for _ in range(2): + config = BuscoConfig.BuscoConfigMain(self.base_config, {}) + config.configure() + self.assertEqual(config.get("busco_run", "auto-lineage"), "True") + + @patch("busco.BuscoConfig.BuscoConfigMain.getboolean", return_value=True) + def test_config_auto_lineage_both_selected_warning(self, *args): + with self.assertLogs(BuscoConfig.logger, "WARNING"): + config = BuscoConfig.BuscoConfigMain(self.base_config, {}) + config.configure() + self.assertEqual(config.get("busco_run", "auto-lineage-euk"), "False") + self.assertEqual(config.get("busco_run", "auto-lineage-prok"), "False") + + def test_mandatory_keys_check_log(self): + with 
self.assertLogs(BuscoConfig.logger, 20): + params_test = {"in": "input_file", "out": "output_name", "mode": "genome"} + config = BuscoConfig.BuscoConfigMain(self.base_config, params_test) + config.configure() + config._check_mandatory_keys_exist() + + def test_mandatory_keys_check_missing_param_in(self): + with self.assertRaises(SystemExit): + params_test = {"out": "output_name", "mode": "genome"} + config = BuscoConfig.BuscoConfigMain(self.base_config, params_test) + config.configure() + config._check_mandatory_keys_exist() + + def test_mandatory_keys_check_missing_param_mode(self): + with self.assertRaises(SystemExit): + params_test = {"in": "input_file", "out": "output_name"} + config = BuscoConfig.BuscoConfigMain(self.base_config, params_test) + config.configure() + config._check_mandatory_keys_exist() + + def test_mandatory_keys_check_missing_param_out(self): + with self.assertRaises(SystemExit): + params_test = {"in": "input_file", "mode": "genome"} + config = BuscoConfig.BuscoConfigMain(self.base_config, params_test) + config.configure() + config._check_mandatory_keys_exist() + + def test_previous_run_check_without_existing_run(self): + output_dir = os.path.join(os.getcwd(), self.test_params["out"]) + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + self.assertIsNone(config._check_no_previous_run()) + + def test_previous_run_check_with_existing_run_no_force(self): + previous_run_name = "test_busco_run_dir" + os.makedirs(previous_run_name, exist_ok=True) + self.test_params["out"] = previous_run_name + self.test_params["force"] = "False" + with self.assertRaises(SystemExit): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_no_previous_run() + shutil.rmtree(previous_run_name) + + def test_previous_run_check_with_existing_run_with_force_and_log(self): + previous_run_name = "test_busco_run_dir" 
+ os.makedirs(previous_run_name, exist_ok=True) + self.test_params["out"] = previous_run_name + self.test_params["force"] = "True" + with self.assertLogs(BuscoConfig.logger, 20): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_no_previous_run() + self.assertFalse(os.path.exists(previous_run_name)) + + try: # In case of test failure, remove tmp folder anyway + shutil.rmtree(previous_run_name) + except FileNotFoundError: + pass + + def test_previous_run_check_without_existing_run_and_restart(self): + self.test_params["restart"] = "True" + with self.assertLogs(BuscoConfig.logger, "WARNING"): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_no_previous_run() + self.assertEqual(config.getboolean("busco_run", "restart"), False) + + def test_previous_run_check_with_existing_run_and_restart(self): + previous_run_name = "test_busco_run_dir" + os.makedirs(previous_run_name, exist_ok=True) + self.test_params.update({"out": previous_run_name, "restart": True}) + with self.assertLogs(BuscoConfig.logger, "INFO"): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_no_previous_run() + self.assertEqual(config.getboolean("busco_run", "restart"), True) + shutil.rmtree(previous_run_name) + + def test_create_required_paths(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.main_out = os.path.join( + config.get("busco_run", "out_path"), config.get("busco_run", "out") + ) + config._create_required_paths() + output_dir = os.path.join(os.getcwd(), self.test_params["out"]) + self.assertTrue(os.path.exists(output_dir)) + shutil.rmtree(output_dir) + + def test_config_structure(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + self.assertEqual( + set(config.PERMITTED_OPTIONS), 
set(self.config_structure["busco_run"]) + ) + + def test_catch_disallowed_keys(self): + for section_name in self.config_structure: + with self.assertRaises(SystemExit): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.set(section_name, "forbidden_option", "forbidden_value") + config._check_allowed_keys() + + def test_out_value_check_invalid(self): + for str_format in ["/path/to/output", "output/"]: + self.test_params["out"] = str_format + with self.assertRaises(SystemExit): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_out_value() + + def test_out_value_check_valid(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + self.assertIsNone(config._check_out_value()) + + def test_limit_value_out_of_range(self): + for lim_val in [-1, 0, 25]: + self.test_params["limit"] = lim_val + with self.assertRaises(SystemExit): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_limit_value() + + def test_limit_value_within_range(self): + for lim_val in [1, 20]: + self.test_params["limit"] = lim_val + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + self.assertIsNone(config._check_limit_value()) + + def test_evalue_nondefault(self): + self.test_params["evalue"] = 1 + with self.assertLogs(BuscoConfig.logger, level="WARNING"): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_evalue() + + @patch("__main__.BuscoConfig_unittests.BuscoConfig.logger.warning") + def test_evalue_default(self, mock_logger): + self.test_params["evalue"] = 0.001 + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._check_evalue() + mock_logger.assert_not_called() + + def test_expand_all_paths_tilde(self): + config = 
BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.set("busco_run", "download_path", "~/test_download_path") + config._expand_all_paths() + self.assertEqual( + config.get("busco_run", "download_path"), + os.path.expanduser("~/test_download_path"), + ) + + def test_expand_all_paths_relative_path_current_dir(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.set("busco_run", "out_path", "./test_out_path") + config._expand_all_paths() + self.assertEqual( + config.get("busco_run", "out_path"), os.path.abspath("./test_out_path") + ) + + def test_expand_all_paths_relative_path_parent_dir(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.set("busco_run", "in", "../test_input_file") + config._expand_all_paths() + self.assertEqual( + config.get("busco_run", "in"), os.path.abspath("../test_input_file") + ) + + def test_expand_all_paths_hmmsearch(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.set("hmmsearch", "path", "~/test_hmmsearch_path") + config._expand_all_paths() + self.assertEqual( + config.get("hmmsearch", "path"), os.path.expanduser("~/test_hmmsearch_path") + ) + + def test_required_input_exists_true(self): + input_filename = "test_input_file" + Path(input_filename).touch() + self.test_params["in"] = input_filename + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + with self.assertLogs(BuscoConfig.logger, level="INFO"): + config._check_required_input_exists() + os.remove(input_filename) + + def test_required_input_exists_false(self): + input_filename = "test_input_file" + if os.path.exists(input_filename): + os.remove(input_filename) + self.test_params["in"] = input_filename + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + with 
self.assertRaises(SystemExit): + config._check_required_input_exists() + + @patch("__main__.BuscoConfig_unittests.BuscoConfig.BuscoDownloadManager") + def test_downloader_initialized(self, mock_downloader): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config._init_downloader() + mock_downloader.assert_called() + + @patch("__main__.BuscoConfig_unittests.BuscoConfig.PrettyLog") + def test_log_config(self, mock_pretty_log): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + with self.assertLogs(BuscoConfig.logger, level="DEBUG"): + config.log_config() + mock_pretty_log.assert_called() + + @patch.object(BuscoConfig.BuscoConfigMain, "log_config") + @patch.object(BuscoConfig.BuscoConfigMain, "_init_downloader") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_required_input_exists") + @patch.object(BuscoConfig.BuscoConfigMain, "_expand_all_paths") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_evalue") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_limit_value") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_out_value") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_allowed_keys") + @patch.object(BuscoConfig.BuscoConfigMain, "_create_required_paths") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_no_previous_run") + @patch.object(BuscoConfig.BuscoConfigMain, "_check_mandatory_keys_exist") + def test_validation( + self, + mock_check_mandatory_keys, + mock_check_no_previous_run, + mock_create_required_paths, + mock_check_allowed_keys, + mock_check_out_value, + mock_check_limit_value, + mock_check_evalue, + mock_expand_all_paths, + mock_check_input, + mock_init_downloader, + mock_log_config, + ): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.validate() + mock_check_mandatory_keys.assert_called() + mock_check_no_previous_run.assert_called() + mock_create_required_paths.assert_called() 
+ mock_check_allowed_keys.assert_called() + mock_check_out_value.assert_called() + mock_check_limit_value.assert_called() + mock_check_evalue.assert_called() + mock_expand_all_paths.assert_called() + mock_check_input.assert_called() + mock_init_downloader.assert_called() + mock_log_config.assert_called() + + def test_check_lineage_present_false(self): + try: + del self.test_params["lineage_dataset"] # just in case, probably redundant + except KeyError: + pass + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + self.assertFalse(config.check_lineage_present()) + + def test_check_lineage_present_true_with_dataset_version_correct(self): + self.test_params["lineage_dataset"] = "test_dataset_odb10" + self.test_params["datasets_version"] = "odb10" + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.check_lineage_present() + self.assertEqual( + config.get("busco_run", "datasets_version"), + self.test_params["datasets_version"], + ) + + def test_check_lineage_present_true_with_dataset_version_mismatch(self): + self.test_params["lineage_dataset"] = "test_dataset_odb10" + self.test_params["datasets_version"] = "odb11" + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + with self.assertLogs(BuscoConfig.logger, level="WARNING"): + config.check_lineage_present() + self.assertEqual( + config.get("busco_run", "datasets_version"), + self.test_params["lineage_dataset"].split("_")[-1], + ) + + def test_check_lineage_present_true_with_odb_missing(self): + self.test_params["lineage_dataset"] = "test_dataset" + self.test_params["datasets_version"] = "odb10" + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + config.check_lineage_present() + self.assertEqual( + config.get("busco_run", "lineage_dataset"), + "{}_{}".format( + self.test_params["lineage_dataset"], + self.test_params["datasets_version"], 
+ ), + ) + + def test_check_lineage_present_true_with_invalid_dataset_version(self): + self.test_params["lineage_dataset"] = "test_dataset" + self.test_params["datasets_version"] = "odb11" + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + with self.assertRaises(SystemExit): + config.check_lineage_present() + + def test_set_results_dirname(self): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + test_dataset_path = "/path/to/lineage_dataset" + with patch("busco.BuscoConfig.BuscoConfig.set"): + config.set_results_dirname(test_dataset_path) + config.set.assert_called_with( + "busco_run", "lineage_results_dir", "run_lineage_dataset" + ) + + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfig") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + def test_autoconfig_init_propagates_mainconfig(self, mock_propagate, *args): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + BuscoConfig.BuscoConfigAuto(config, None) + mock_propagate.assert_called_with(config) + + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfig") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + def test_autoconfig_init_sets_results_dirname(self, mock_set_dirname, *args): + BuscoConfig.BuscoConfigAuto(None, "lineage") + mock_set_dirname.assert_called_with("lineage") + + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + 
@patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfig") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + def test_autoconfig_init_creates_paths(self, mock_create_paths, *args): + BuscoConfig.BuscoConfigAuto(None, None) + mock_create_paths.assert_called() + + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + @patch("busco.BuscoConfig.BuscoConfig") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + def test_autoconfig_init_downloads_lineage(self, mock_download_lineage, *args): + BuscoConfig.BuscoConfigAuto(None, "lineage") + mock_download_lineage.assert_called_with("lineage") + + @patch("busco.BuscoConfig.BuscoConfig") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + def test_autoconfig_init_loads_lineage_config(self, mock_load_dataset, *args): + BuscoConfig.BuscoConfigAuto(None, None) + mock_load_dataset.assert_called() + + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + @patch("busco.BuscoConfig.BuscoConfig.__init__") + def test_autoconfig_init_calls_super(self, 
mock_config_parent, *args): + BuscoConfig.BuscoConfigAuto(None, None) + mock_config_parent.assert_called() + + @patch("busco.BuscoConfig.BuscoConfigAuto._create_required_paths") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + def test_propagate_config(self, *args): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.params) + config.configure() + config.downloader = Mock() + autoconfig = BuscoConfig.BuscoConfigAuto(config, "test") + autoconfig._propagate_config(config) + self.assertEqual(autoconfig, config) + + @patch("busco.BuscoConfig.BuscoConfigAuto.load_dataset_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.download_lineage_file") + @patch("busco.BuscoConfig.BuscoConfigAuto._propagate_config") + @patch("busco.BuscoConfig.BuscoConfigAuto.set_results_dirname") + @patch("busco.BuscoConfig.BuscoConfigAuto.get", return_value="test") + @patch("busco.BuscoConfig.BuscoConfig._create_required_paths") + def test_autolineage_create_path_method_calls_parent( + self, mock_create_paths, *args + ): + config = BuscoConfig.BuscoConfigMain(self.base_config, self.test_params) + config.configure() + BuscoConfig.BuscoConfigAuto(config, None) + mock_create_paths.assert_called_with("test/auto_lineage") + + def tearDown(self): + self.test_params = {} diff -Nru busco-4.1.4/tests/unittests/ConfigManager_unittests.py busco-5.0.0/tests/unittests/ConfigManager_unittests.py --- busco-4.1.4/tests/unittests/ConfigManager_unittests.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittests/ConfigManager_unittests.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,269 @@ +import unittest +from unittest.mock import patch, call, Mock +from busco import ConfigManager +import os +import importlib + + +class TestConfigManager(unittest.TestCase): + def setUp(self): + self.base_config = "config/config.ini" + self.params = {"config_file": "path/to/config.ini"} + + # In order to suppress 
logs, we need to patch the LogDecorator object that handles the log messages and the + # loggers from each module. The following code was adapted from a StackOverflow answer here: + # https://stackoverflow.com/a/37890916/4844311 + + # Do cleanup first so it is ready if an exception is raised + def kill_patches(): # Create a cleanup callback that undoes our patches + patch.stopall() # Stops all patches started with start() + importlib.reload( + ConfigManager + ) # Reload our UUT module which restores the original decorator + + self.addCleanup( + kill_patches + ) # We want to make sure this is run so we do this in addCleanup instead of tearDown + + # Now patch the decorator where the decorator is being imported from + patch( + "busco.BuscoLogger.LogDecorator", lambda *args, **kwargs: lambda f: f + ).start() # The lambda makes our decorator into a pass-thru. Also, don't forget to call start() + # HINT: if you're patching a decor with params use something like: + # lambda *x, **y: lambda f: f + importlib.reload( + ConfigManager + ) # Reloads the module which applies our patched decorator + + def test_get_config_from_params(self): + + config_manager = ConfigManager.BuscoConfigManager(self.params) + self.assertEqual(config_manager.config_file, self.params["config_file"]) + + def test_get_config_from_env(self): + os.environ["BUSCO_CONFIG_FILE"] = "path/to/config.ini" + with patch("os.access") as mockaccess: + mockaccess.return_value = True + config_manager = ConfigManager.BuscoConfigManager({}) + self.assertEqual( + config_manager.config_file, os.environ.get("BUSCO_CONFIG_FILE") + ) + + @patch("__main__.ConfigManager_unittests.ConfigManager.logger", autospec=True) + def test_config_validated(self, *args): + config_manager = ConfigManager.BuscoConfigManager(self.params) + with patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", + autospec=True, + ): + config_manager.load_busco_config() + config_manager.config.validate.assert_called() + + @patch( + 
"__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_log_warning_if_neither_lineage_nor_autolineage_specified( + self, mock_config_main, *args + ): + mock_config_main.return_value.check_lineage_present = lambda: False + mock_config_main.return_value.getboolean.side_effect = [ + False, + False, + True, + False, + True, + ] # These values comprise the three distinct logical paths into this part + # of the code: run1: False, False; run2: True, (shortcircuit); run3: False, True. + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + with self.assertLogs(ConfigManager.logger, "WARNING"): + config_manager.load_busco_config() + with patch("busco.ConfigManager.logger.warning") as mock_logger: + for _ in range(2): + config_manager.load_busco_config() + self.assertFalse(mock_logger.called) + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_log_warning_if_both_lineage_and_autolineage_specified( + self, mock_config_main, *args + ): + mock_config_main.return_value.check_lineage_present = lambda: True + mock_config_main.return_value.getboolean.side_effect = [ + True, + False, + True, + False, + False, + ] + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + for _ in range(2): + with self.assertLogs(ConfigManager.logger, "WARNING"): + config_manager.load_busco_config() + with patch("busco.ConfigManager.logger.warning") as mock_logger: + config_manager.load_busco_config() + self.assertFalse(mock_logger.called) + + patch("busco.ConfigManager.logger.warning") + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def 
test_config_updated_if_lineage_missing(self, mock_config_main, *args): + mock_config_main.return_value.check_lineage_present = lambda: False + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + config_manager.load_busco_config() + calls = [ + call("busco_run", "auto-lineage", "True"), + call("busco_run", "lineage_dataset", test_dataset_path), + ] + mock_config_main.return_value.set.assert_has_calls(calls, any_order=True) + + @patch("busco.ConfigManager.logger.warning") + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_config_updated_if_lineage_present(self, mock_config_main, *args): + mock_config_main.return_value.check_lineage_present = lambda: True + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + config_manager.load_busco_config() + calls = [ + call("busco_run", "auto-lineage", "False"), + call("busco_run", "auto-lineage-prok", "False"), + call("busco_run", "auto-lineage-euk", "False"), + ] + mock_config_main.return_value.set.assert_has_calls(calls, any_order=True) + + @patch("busco.ConfigManager.logger.warning") + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_update_dirname(self, mock_config_main, *args): + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + mock_config_main.return_value.get = lambda *args: test_dataset_path + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + config_manager.load_busco_config() + mock_config_main.return_value.set_results_dirname.assert_called_with( + test_dataset_path + ) + + 
@patch("busco.ConfigManager.logger.warning") + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_lineage_downloaded(self, mock_config_main, *args): + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + mock_config_main.return_value.get = lambda *args: test_dataset_path + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + config_manager.load_busco_config() + mock_config_main.return_value.download_lineage_file.assert_called_with( + test_dataset_path + ) + + @patch("busco.ConfigManager.logger.warning") + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_lineage_dataset_config_loaded(self, mock_config_main, *args): + config_manager = ConfigManager.BuscoConfigManager(self.params) + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", lambda: test_dataset_path + ): + config_manager.load_busco_config() + mock_config_main.return_value.load_dataset_config.assert_called() + + @patch("busco.ConfigManager.logger.warning") + @patch( + "__main__.ConfigManager_unittests.ConfigManager.BuscoConfigMain", autospec=True + ) + def test_run_auto_select_if_no_lineage(self, mock_config_main, *args): + config_manager = ConfigManager.BuscoConfigManager(self.params) + mock_config_main.return_value.check_lineage_present = lambda: False + test_dataset_path = "/path/to/lineage_dataset" + with patch.object( + config_manager, "auto_select_lineage", return_value=test_dataset_path + ): + config_manager.load_busco_config() + config_manager.auto_select_lineage.assert_called() + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.AutoSelectLineage", + autospec=True, + ) + def test_auto_select_lineage_call_function_initializes_asl(self, mock_asl): + mock_asl.return_value.best_match_lineage_dataset = Mock() + 
mock_asl.return_value.selected_runner = Mock() + config_manager = ConfigManager.BuscoConfigManager(self.params) + config_manager.auto_select_lineage() + mock_asl.assert_called() + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.AutoSelectLineage", + autospec=True, + ) + def test_auto_select_lineage_call_function_runs_asl(self, mock_asl): + mock_asl.return_value.best_match_lineage_dataset = Mock() + mock_asl.return_value.selected_runner = Mock() + config_manager = ConfigManager.BuscoConfigManager(self.params) + config_manager.auto_select_lineage() + mock_asl.return_value.run_auto_selector.assert_called() + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.AutoSelectLineage", + autospec=True, + ) + def test_auto_select_lineage_call_function_gets_lineage_dataset(self, mock_asl): + mock_asl.return_value.best_match_lineage_dataset = Mock() + mock_asl.return_value.selected_runner = Mock() + config_manager = ConfigManager.BuscoConfigManager(self.params) + config_manager.auto_select_lineage() + mock_asl.return_value.get_lineage_dataset.assert_called() + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.AutoSelectLineage", + autospec=True, + ) + def test_auto_select_lineage_call_function_returns_lineage_dataset(self, mock_asl): + config_manager = ConfigManager.BuscoConfigManager(self.params) + lineage_dataset = "best_match_dataset" + mock_asl.return_value.best_match_lineage_dataset = lineage_dataset + mock_asl.return_value.selected_runner = Mock() + retval = config_manager.auto_select_lineage() + self.assertEqual(retval, lineage_dataset) + + @patch( + "__main__.ConfigManager_unittests.ConfigManager.AutoSelectLineage", + autospec=True, + ) + def test_auto_select_lineage_call_function_selects_runner(self, mock_asl): + config_manager = ConfigManager.BuscoConfigManager(self.params) + mock_asl.return_value.best_match_lineage_dataset = Mock() + mock_asl.return_value.selected_runner = "test" + config_manager.auto_select_lineage() + 
self.assertEqual("test", config_manager.runner) + + def tearDown(self): + pass diff -Nru busco-4.1.4/tests/unittests/GenomeAnalysis_unittests.py busco-5.0.0/tests/unittests/GenomeAnalysis_unittests.py --- busco-4.1.4/tests/unittests/GenomeAnalysis_unittests.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittests/GenomeAnalysis_unittests.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,107 @@ +import unittest +from busco.analysis import GenomeAnalysis +from unittest.mock import patch, Mock + + +class TestConfigManager(unittest.TestCase): + def setUp(self) -> None: + pass + + # @patch('busco.analysis.GenomeAnalysis.BuscoAnalysis.config.get', return_value="test") + # @patch('busco.analysis.GenomeAnalysis.BuscoAnalysis.config.getboolean', return_value=True) + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config") + @patch("busco.analysis.BuscoAnalysis.os.path") + @patch("busco.analysis.GenomeAnalysis.NucleotideAnalysis.check_nucleotide_file") + def test_init_eukaryota_augustus_checks_filetype(self, mock_check_nucl_file, *args): + GenomeAnalysis.GenomeAnalysisEukaryotesAugustus() + mock_check_nucl_file.assert_called() + + @patch("busco.analysis.GenomeAnalysis.NucleotideAnalysis.__init__") + # @patch('busco.analysis.GenomeAnalysis.BuscoAnalysis.config.get') + # @patch('busco.analysis.GenomeAnalysis.BuscoAnalysis.config.getboolean') + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config") + @patch("busco.analysis.Analysis.logger.warning") + @patch("busco.analysis.GenomeAnalysis.OptimizeAugustusRunner") + @patch("busco.analysis.GenomeAnalysis.ETrainingRunner") + @patch("busco.analysis.GenomeAnalysis.NewSpeciesRunner") + @patch("busco.analysis.GenomeAnalysis.GFF2GBRunner") + @patch("busco.analysis.GenomeAnalysis.AugustusRunner") + @patch("busco.analysis.Analysis.TBLASTNRunner") + @patch("busco.analysis.Analysis.MKBLASTRunner") + @patch("busco.analysis.BuscoAnalysis.HMMERRunner") + def test_init_tools_eukaryota_augustus( + self, + mock_hmmer, + 
mock_mkblast, + mock_tblastn, + mock_augustus, + mock_gff2gb, + mock_new_species, + mock_etraining, + mock_optimize_augustus, + *args + ): + analysis = GenomeAnalysis.GenomeAnalysisEukaryotesAugustus() + analysis.init_tools() + mock_hmmer.assert_called() + mock_mkblast.assert_called() + mock_tblastn.assert_called() + mock_augustus.assert_called() + mock_gff2gb.assert_called() + mock_new_species.assert_called() + mock_etraining.assert_called() + mock_optimize_augustus.assert_called() + + @patch("busco.analysis.GenomeAnalysis.NucleotideAnalysis.__init__") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config.get") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config", autospec=True) + @patch("busco.analysis.GenomeAnalysis.MetaeukRunner") + @patch("busco.analysis.BuscoAnalysis.HMMERRunner") + def test_init_tools_eukaryota_metaeuk(self, mock_hmmer, mock_metaeuk, *args): + analysis = GenomeAnalysis.GenomeAnalysisEukaryotesMetaeuk() + analysis.init_tools() + mock_hmmer.assert_called() + mock_metaeuk.assert_called() + + @patch("busco.analysis.GenomeAnalysis.NucleotideAnalysis.__init__") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config.get") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config", autospec=True) + @patch("busco.analysis.GenomeAnalysis.ProdigalRunner") + @patch("busco.analysis.BuscoAnalysis.HMMERRunner") + def test_init_tools_prokaryota(self, mock_hmmer, mock_prodigal, *args): + analysis = GenomeAnalysis.GenomeAnalysisProkaryotes() + analysis.init_tools() + mock_hmmer.assert_called() + mock_prodigal.assert_called() + + @patch("busco.analysis.GenomeAnalysis.NucleotideAnalysis.__init__") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config.get") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.config", autospec=True) + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.run_analysis") + @patch("busco.analysis.GenomeAnalysis.BuscoAnalysis.run_hmmer") + 
@patch("busco.analysis.GenomeAnalysis.GenomeAnalysisEukaryotesMetaeuk._run_metaeuk") + def test_run_analysis_metaeuk(self, mock_run_metaeuk, mock_run_hmmer, *args): + analysis = GenomeAnalysis.GenomeAnalysisEukaryotesMetaeuk() + analysis.metaeuk_runner = Mock() + analysis.hmmer_runner = Mock(missing_buscos=[]) + analysis.hmmer_runner.fragmented_buscos.keys = Mock(return_value=[]) + analysis.gene_details = Mock(autospec=True) + analysis.sequences_aa = Mock(autospec=True) + analysis.sequences_nt = Mock(autospec=True) + analysis.run_analysis() + mock_run_metaeuk.assert_called() + mock_run_hmmer.assert_called() + + # @patch('busco.GenomeAnalysis.GenomeAnalysisEukaryotesAugustus._rerun_analysis') + # @patch('busco.GenomeAnalysis.GenomeAnalysisEukaryotesAugustus.run_hmmer') + # @patch('busco.GenomeAnalysis.GenomeAnalysisEukaryotesAugustus._run_augustus') + # @patch('busco.GenomeAnalysis.BLASTAnalysis._run_tblastn') + # @patch('busco.GenomeAnalysis.BLASTAnalysis._run_mkblast') + # mock_mkblast.assert_called() + # mock_tblastn.assert_called() + # mock_augustus.assert_called() + # mock_hmmer.assert_called() + # mock_rerun.assert_called() + + def tearDown(self) -> None: + pass diff -Nru busco-4.1.4/tests/unittests/run_BUSCO_unittests.py busco-5.0.0/tests/unittests/run_BUSCO_unittests.py --- busco-4.1.4/tests/unittests/run_BUSCO_unittests.py 1970-01-01 00:00:00.000000000 +0000 +++ busco-5.0.0/tests/unittests/run_BUSCO_unittests.py 2021-01-26 11:28:47.000000000 +0000 @@ -0,0 +1,239 @@ +import unittest +from busco import run_BUSCO +import sys +import io + + +class TestParams(unittest.TestCase): + def setUp(self): + self.maxDiff = None + pass + + def test_help_short(self): + args = ["-h"] + sys.argv[1:] = args + with self.assertRaises(SystemExit) as cm: + captured_output = io.StringIO() + sys.stdout = captured_output + try: + run_BUSCO._parse_args() + finally: + sys.stdout = sys.__stdout__ + self.assertEqual(cm.exception.code, 0) + + def test_help_long(self): + args = ["--help"] 
+ sys.argv[1:] = args + with self.assertRaises(SystemExit) as cm: + captured_output = io.StringIO() + sys.stdout = captured_output + try: + run_BUSCO._parse_args() + finally: + sys.stdout = sys.__stdout__ + self.assertEqual(cm.exception.code, 0) + + def test_version_short(self): + args = ["-v"] + sys.argv[1:] = args + with self.assertRaises(SystemExit) as cm: + captured_output = io.StringIO() + sys.stdout = captured_output + try: + run_BUSCO._parse_args() + finally: + sys.stdout = sys.__stdout__ + self.assertEqual(cm.exception.code, 0) + + def test_version_long(self): + args = ["--version"] + sys.argv[1:] = args + with self.assertRaises(SystemExit) as cm: + captured_output = io.StringIO() + sys.stdout = captured_output + try: + run_BUSCO._parse_args() + finally: + sys.stdout = sys.__stdout__ + self.assertEqual(cm.exception.code, 0) + + def test_cmdline_options_short_minimum(self): + params = run_BUSCO._parse_args() + correct_parse = { + "auto-lineage": False, + "auto-lineage-euk": False, + "auto-lineage-prok": False, + "config_file": None, + "cpu": None, + "evalue": None, + "force": False, + "help": "==SUPPRESS==", + "in": None, + "limit": None, + "lineage_dataset": None, + "list_datasets": "==SUPPRESS==", + "mode": None, + "offline": False, + "out": None, + "out_path": None, + "quiet": False, + "restart": False, + "metaeuk_parameters": None, + "metaeuk_rerun_parameters": None, + "use_augustus": False, + "augustus_parameters": None, + "augustus_species": None, + "long": False, + "datasets_version": None, + "download_base_url": None, + "download_path": None, + "update-data": False, + "version": "==SUPPRESS==", + } + self.assertDictEqual(params, correct_parse) + + def test_cmdline_options_all_short(self): + input_file = "input_file" + output_file = "output_file" + mode = "mode" + lineage_dataset = "lineage_dataset" + cpus = "cpus" + evalue = 0.1 + + arg_values = { + "-i": input_file, + "-o": output_file, + "-m": mode, + "-l": lineage_dataset, + "-c": cpus, + "-e": 
evalue, + } + flag_options = ["-f", "-q", "-r"] + command_str = " ".join( + [" ".join([key, str(value)]) for key, value in arg_values.items()] + + flag_options + ) + sys.argv[1:] = command_str.split(" ") + params = run_BUSCO._parse_args() + correct_parse = { + "auto-lineage": False, + "auto-lineage-euk": False, + "auto-lineage-prok": False, + "config_file": None, + "cpu": cpus, + "evalue": evalue, + "force": True, + "help": "==SUPPRESS==", + "in": input_file, + "limit": None, + "lineage_dataset": lineage_dataset, + "list_datasets": "==SUPPRESS==", + "mode": mode, + "offline": False, + "out": output_file, + "out_path": None, + "quiet": True, + "restart": True, + "metaeuk_parameters": None, + "metaeuk_rerun_parameters": None, + "use_augustus": False, + "augustus_parameters": None, + "augustus_species": None, + "long": False, + "datasets_version": None, + "download_base_url": None, + "download_path": None, + "update-data": False, + "version": "==SUPPRESS==", + } + self.assertDictEqual(params, correct_parse) + + def test_cmdline_options_all_long(self): + input_file = "input_file" + output_file = "output_file" + mode = "mode" + lineage_dataset = "lineage_dataset" + cpus = "cpus" + evalue = 0.1 + limit = 1 + augustus_parameters = "augustus_parameters" + augustus_species = "augustus_species" + config = "config" + out_path = "out_path" + download_path = "download_path" + datasets_version = "datasets_version" + download_base_url = "download_base_url" + metaeuk_parameters = "metaeuk_parameters" + metaeuk_rerun_parameters = "metaeuk_rerun_parameters" + + arg_values = { + "--in": input_file, + "--cpu": cpus, + "--out": output_file, + "--evalue": evalue, + "--mode": mode, + "--lineage_dataset": lineage_dataset, + "--limit": limit, + "--augustus_parameters": augustus_parameters, + "--augustus_species": augustus_species, + "--config": config, + "--out_path": out_path, + "--download_path": download_path, + "--datasets_version": datasets_version, + "--download_base_url": 
download_base_url, + "--metaeuk_parameters": metaeuk_parameters, + "--metaeuk_rerun_parameters": metaeuk_rerun_parameters, + } + flag_options = [ + "--force", + "--restart", + "--quiet", + "--long", + "--auto-lineage", + "--auto-lineage-prok", + "--auto-lineage-euk", + "--augustus", + "--update-data", + "--offline", + ] + command_str = " ".join( + [" ".join([key, str(value)]) for key, value in arg_values.items()] + + flag_options + ) + sys.argv[1:] = command_str.split(" ") + params = run_BUSCO._parse_args() + correct_parse = { + "augustus_parameters": augustus_parameters, + "augustus_species": augustus_species, + "metaeuk_parameters": metaeuk_parameters, + "metaeuk_rerun_parameters": metaeuk_rerun_parameters, + "datasets_version": datasets_version, + "download_base_url": download_base_url, + "download_path": download_path, + "auto-lineage": True, + "auto-lineage-euk": True, + "auto-lineage-prok": True, + "config_file": config, + "cpu": cpus, + "evalue": evalue, + "force": True, + "restart": True, + "use_augustus": True, + "help": "==SUPPRESS==", + "in": input_file, + "limit": limit, + "lineage_dataset": lineage_dataset, + "list_datasets": "==SUPPRESS==", + "long": True, + "mode": mode, + "offline": True, + "out": output_file, + "out_path": out_path, + "quiet": True, + "update-data": True, + "version": "==SUPPRESS==", + } + self.assertDictEqual(params, correct_parse) + + def tearDown(self): + sys.argv = [sys.argv[0]]