diff -Nru q2-feature-classifier-2023.7.0/.github/workflows/ci-dev.yaml q2-feature-classifier-2024.2.0/.github/workflows/ci-dev.yaml --- q2-feature-classifier-2023.7.0/.github/workflows/ci-dev.yaml 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/.github/workflows/ci-dev.yaml 2024-02-16 21:57:24.000000000 +0000 @@ -9,4 +9,4 @@ ci: uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev with: - distro: core \ No newline at end of file + distro: amplicon diff -Nru q2-feature-classifier-2023.7.0/README.md q2-feature-classifier-2024.2.0/README.md --- q2-feature-classifier-2023.7.0/README.md 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/README.md 2024-02-16 21:57:24.000000000 +0000 @@ -1,5 +1,5 @@ # q2-feature-classifier -![](https://github.com/qiime2/q2-feature-classifier/workflows/ci/badge.svg) +![](https://github.com/qiime2/q2-feature-classifier/workflows/ci-dev/badge.svg) This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org. \ No newline at end of file diff -Nru q2-feature-classifier-2023.7.0/ci/recipe/meta.yaml q2-feature-classifier-2024.2.0/ci/recipe/meta.yaml --- q2-feature-classifier-2023.7.0/ci/recipe/meta.yaml 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/ci/recipe/meta.yaml 2024-02-16 21:57:24.000000000 +0000 @@ -22,7 +22,7 @@ - joblib - scikit-bio {{ scikit_bio }} - biom-format {{ biom_format }} - - blast >=2.6.0 + - blast >=2.13.0 - vsearch - qiime2 {{ qiime2_epoch }}.* - q2-types {{ qiime2_epoch }}.* diff -Nru q2-feature-classifier-2023.7.0/debian/changelog q2-feature-classifier-2024.2.0/debian/changelog --- q2-feature-classifier-2023.7.0/debian/changelog 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/changelog 2024-02-18 14:45:11.000000000 +0000 @@ -1,3 +1,23 @@ +q2-feature-classifier (2024.2.0-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * Generate debian/control automatically to refresh version number + * Build-Depends: s/dh-python/dh-sequence-python3/ (routine-update) + * Regenerate debian/control from debian/control.in (routine-update) + * Autopkgtest for all supported Python3 versions + * Regenerate debian/control from debian/control.in (routine-update) + * Delaying true testing to autopkgtests since QIIME2 module cannot be + registered at build time + + -- Andreas Tille Sun, 18 Feb 2024 15:45:11 +0100 + +q2-feature-classifier (2023.7.0-2) UNRELEASED; urgency=medium + + * d/copyright: update years and debian packagers. + + -- Étienne Mollier Fri, 18 Aug 2023 15:23:36 +0200 + q2-feature-classifier (2023.7.0-1) unstable; urgency=medium * New upstream version diff -Nru q2-feature-classifier-2023.7.0/debian/control q2-feature-classifier-2024.2.0/debian/control --- q2-feature-classifier-2023.7.0/debian/control 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/control 2024-02-18 14:45:11.000000000 +0000 @@ -1,3 +1,4 @@ +# This file is autogenerated from control.in to update versioned dependencies Source: q2-feature-classifier Maintainer: Debian Med Packaging Team Uploaders: Liubov Chuprikova , @@ -5,7 +6,7 @@ Section: science Priority: optional Build-Depends: debhelper-compat (= 13), - dh-python, + dh-sequence-python3, python3, python3-setuptools, python3-pytest-cov, @@ -14,11 +15,11 @@ python3-skbio, ncbi-blast+, vsearch, - qiime (>= 2022.11.1), - q2-types (>= 2022.11.1), - q2-quality-control (>= 2022.11.1), - q2-taxa (>= 2022.11.1), - q2-feature-table (>= 2022.11.1) + qiime (>= 2024.2), + q2-types (>= 2024.2), + q2-quality-control (>= 2024.2), + q2-taxa (>= 2024.2), + q2-feature-table (>= 2024.2) Standards-Version: 4.6.2 Vcs-Browser: https://salsa.debian.org/med-team/q2-feature-classifier Vcs-Git: https://salsa.debian.org/med-team/q2-feature-classifier.git @@ -36,11 +37,11 @@ python3-biom-format, ncbi-blast+, vsearch, - qiime (>= 2022.11.1), - q2-types (>= 2022.11.1), - q2-quality-control (>= 2022.11.1), - q2-taxa (>= 2022.11.1), - q2-feature-table (>= 2022.11.1) + qiime (>= 2024.2), + q2-types (>= 2024.2), + q2-quality-control (>= 2024.2), + q2-taxa (>= 2024.2), + q2-feature-table (>= 2024.2) Description: QIIME 2 plugin supporting taxonomic classification QIIME 2 is a powerful, extensible, and decentralized microbiome analysis package with a focus on data and analysis transparency. QIIME 2 enables diff -Nru q2-feature-classifier-2023.7.0/debian/control.in q2-feature-classifier-2024.2.0/debian/control.in --- q2-feature-classifier-2023.7.0/debian/control.in 1970-01-01 00:00:00.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/control.in 2024-02-18 14:45:11.000000000 +0000 @@ -0,0 +1,65 @@ +Source: q2-feature-classifier +Maintainer: Debian Med Packaging Team +Uploaders: Liubov Chuprikova , + Étienne Mollier +Section: science +Priority: optional +Build-Depends: debhelper-compat (= 13), + dh-sequence-python3, + python3, + python3-setuptools, + python3-pytest-cov, + python3-sklearn, + python3-joblib, + python3-skbio, + ncbi-blast+, + vsearch, + qiime (>= @DEB_VERSION_UPSTREAM@), + q2-types (>= @DEB_VERSION_UPSTREAM@), + q2-quality-control (>= @DEB_VERSION_UPSTREAM@), + q2-taxa (>= @DEB_VERSION_UPSTREAM@), + q2-feature-table (>= @DEB_VERSION_UPSTREAM@) +Standards-Version: 4.6.2 +Vcs-Browser: https://salsa.debian.org/med-team/q2-feature-classifier +Vcs-Git: https://salsa.debian.org/med-team/q2-feature-classifier.git +Homepage: https://qiime2.org/ +Rules-Requires-Root: no + +Package: q2-feature-classifier +Architecture: all +Depends: ${shlibs:Depends}, + ${misc:Depends}, + ${python3:Depends}, + python3-sklearn, + python3-skbio, + python3-joblib, + python3-biom-format, + ncbi-blast+, + vsearch, + qiime (>= @DEB_VERSION_UPSTREAM@), + q2-types (>= @DEB_VERSION_UPSTREAM@), + q2-quality-control (>= @DEB_VERSION_UPSTREAM@), + q2-taxa (>= @DEB_VERSION_UPSTREAM@), + q2-feature-table (>= @DEB_VERSION_UPSTREAM@) +Description: QIIME 2 plugin supporting taxonomic classification + QIIME 2 is a powerful, extensible, and decentralized microbiome analysis + package with a focus on data and analysis transparency. QIIME 2 enables + researchers to start an analysis with raw DNA sequence data and finish with + publication-quality figures and statistical results. + Key features: + * Integrated and automatic tracking of data provenance + * Semantic type system + * Plugin system for extending microbiome analysis functionality + * Support for multiple types of user interfaces (e.g. API, command line, + graphical) + . + QIIME 2 is a complete redesign and rewrite of the QIIME 1 microbiome analysis + pipeline. QIIME 2 will address many of the limitations of QIIME 1, while + retaining the features that makes QIIME 1 a powerful and widely-used analysis + pipeline. + . + QIIME 2 currently supports an initial end-to-end microbiome analysis pipeline. + New functionality will regularly become available through QIIME 2 plugins. You + can view a list of plugins that are currently available on the QIIME 2 plugin + availability page. The future plugins page lists plugins that are being + developed. diff -Nru q2-feature-classifier-2023.7.0/debian/copyright q2-feature-classifier-2024.2.0/debian/copyright --- q2-feature-classifier-2023.7.0/debian/copyright 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/copyright 2024-02-18 14:45:11.000000000 +0000 @@ -3,12 +3,13 @@ Source: https://github.com/qiime2/q2-feature-classifier/releases Files: * -Copyright: 2016-2019 QIIME 2 development team +Copyright: 2016-2023 QIIME 2 development team License: BSD-3-clause Files: debian/* Copyright: 2019 Liubov Chuprikova - Andreas Tille + 2019-2023 Andreas Tille + 2021-2023 Étienne Mollier License: GPL-2+ License: BSD-3-clause diff -Nru q2-feature-classifier-2023.7.0/debian/rules q2-feature-classifier-2024.2.0/debian/rules --- q2-feature-classifier-2023.7.0/debian/rules 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/rules 2024-02-18 14:45:11.000000000 +0000 @@ -9,13 +9,22 @@ export PYBUILD_BEFORE_INSTALL=rm -rvf {build_dir}/q2-feature-classifier.egg-* \ {build_dir}/site.py {build_dir}/.coverage* {build_dir}/easy-install.pth +include /usr/share/dpkg/default.mk +VERSION_UPSTREAM=$(shell echo $(DEB_VERSION_UPSTREAM) | sed -e 's/\(20[0-9][0-9]\.[0-9]\+\)\..*/\1/') + +debian/control: debian/control.in + echo "# This file is autogenerated from control.in to update versioned dependencies" > $@ + sed -e"s/@DEB_VERSION_UPSTREAM@/$(VERSION_UPSTREAM)/g" $< >> $@ + %: - dh $@ --with python3 --buildsystem=pybuild + dh $@ --buildsystem=pybuild +#FIXME: Delaying true testing to autopkgtests since QIIME2 module cannot be registered +# at build time. override_dh_auto_test: ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) PYTHONPATH=$(CURDIR) \ - dh_auto_test -- -s custom --test-args="cd {build_dir}; py.test-3 --cov=q2_feature_classifier" + dh_auto_test -- -s custom --test-args="cd {build_dir}; py.test-3 --cov=q2_feature_classifier" || true endif override_dh_fixperms: diff -Nru q2-feature-classifier-2023.7.0/debian/tests/control q2-feature-classifier-2024.2.0/debian/tests/control --- q2-feature-classifier-2023.7.0/debian/tests/control 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/tests/control 2024-02-18 14:45:11.000000000 +0000 @@ -1,3 +1,3 @@ Tests: run-unit-test -Depends: @, python3-pytest-cov +Depends: @, python3-pytest-cov, python3-all Restrictions: allow-stderr, skip-not-installable diff -Nru q2-feature-classifier-2023.7.0/debian/tests/run-unit-test q2-feature-classifier-2024.2.0/debian/tests/run-unit-test --- q2-feature-classifier-2023.7.0/debian/tests/run-unit-test 2023-08-18 12:34:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/debian/tests/run-unit-test 2024-02-18 14:45:11.000000000 +0000 @@ -18,4 +18,7 @@ fi # Run build-time tests -py.test-3 --cov=${pkg} +for py in $(py3versions -s 2> /dev/null) +do + ${py} -m pytest -v --cov=${pkg} +done diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/__init__.py q2-feature-classifier-2024.2.0/q2_feature_classifier/__init__.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/__init__.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/__init__.py 2024-02-16 21:57:24.000000000 +0000 @@ -12,6 +12,7 @@ __version__ = get_versions()['version'] del get_versions +importlib.import_module('q2_feature_classifier.types') importlib.import_module('q2_feature_classifier.classifier') importlib.import_module('q2_feature_classifier._cutter') importlib.import_module('q2_feature_classifier._blast') diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/_blast.py q2-feature-classifier-2024.2.0/q2_feature_classifier/_blast.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/_blast.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/_blast.py 2024-02-16 21:57:24.000000000 +0000 @@ -6,12 +6,17 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import os +import warnings import subprocess import pandas as pd from q2_types.feature_data import ( FeatureData, Taxonomy, Sequence, DNAFASTAFormat, DNAIterator, BLAST6, BLAST6Format) -from qiime2.plugin import Int, Str, Float, Choices, Range, Bool +from .types import BLASTDBDirFmtV5, BLASTDB +from qiime2.plugin import ( + Int, Str, Float, Choices, Range, Bool, Threads, get_available_cores +) from .plugin_setup import plugin, citations from ._consensus_assignment import ( min_consensus_param, min_consensus_param_description, @@ -36,6 +41,7 @@ DEFAULTEVALUE = 0.001 DEFAULTMINCONSENSUS = 0.51 DEFAULTOUTPUTNOHITS = True +DEFAULTNUMTHREADS = 1 # NOTE FOR THE FUTURE: should this be called blastn? would it be possible to @@ -46,23 +52,43 @@ # expose different parameters etc? My feeling is let's call this `blast` for # now and then cross that bridge when we come to it. def blast(query: DNAFASTAFormat, - reference_reads: DNAFASTAFormat, + reference_reads: DNAFASTAFormat = None, + blastdb: BLASTDBDirFmtV5 = None, maxaccepts: int = DEFAULTMAXACCEPTS, perc_identity: float = DEFAULTPERCENTID, query_cov: float = DEFAULTQUERYCOV, strand: str = DEFAULTSTRAND, evalue: float = DEFAULTEVALUE, - output_no_hits: bool = DEFAULTOUTPUTNOHITS) -> pd.DataFrame: + output_no_hits: bool = DEFAULTOUTPUTNOHITS, + num_threads: int = DEFAULTNUMTHREADS) -> pd.DataFrame: + if num_threads == 0: + num_threads = get_available_cores() + + if reference_reads and blastdb: + raise ValueError('Only one reference_reads or blastdb artifact ' + 'can be provided as input. Choose one and try ' + 'again.') perc_identity = perc_identity * 100 query_cov = query_cov * 100 seqs_fp = str(query) - ref_fp = str(reference_reads) # TODO: generalize to support other blast types? output = BLAST6Format() cmd = ['blastn', '-query', seqs_fp, '-evalue', str(evalue), '-strand', - strand, '-outfmt', '6', '-subject', ref_fp, '-perc_identity', - str(perc_identity), '-qcov_hsp_perc', str(query_cov), + strand, '-outfmt', '6', '-perc_identity', str(perc_identity), + '-qcov_hsp_perc', str(query_cov), '-num_threads', str(num_threads), '-max_target_seqs', str(maxaccepts), '-out', str(output)] + if reference_reads: + cmd.extend(['-subject', str(reference_reads)]) + if num_threads > 1: + warnings.warn('The num_threads parameters is only compatible ' + 'when using a pre-indexed blastdb. The num_threads ' + 'is ignored when reference_reads are provided as ' + 'input.', UserWarning) + elif blastdb: + cmd.extend(['-db', os.path.join(blastdb.path, blastdb.get_basename())]) + else: + raise ValueError('Either reference_reads or a blastdb must be ' + 'provided as input.') _run_command(cmd) # load as dataframe to quickly validate (note: will fail now if empty) result = output.view(pd.DataFrame) @@ -76,19 +102,18 @@ if len(missing_ids) > 0: # we will mirror vsearch behavior and annotate unassigneds as '*' # and fill other columns with 0 values (np.nan makes format error). - missed = pd.DataFrame(columns=result.columns) - missed = missed.append( - [{'qseqid': i, 'sseqid': '*'} for i in missing_ids]).fillna(0) - # Do two separate appends to make sure that fillna does not alter - # any other contents from the original search results. - result = result.append(missed, ignore_index=True) + missed = pd.DataFrame( + [{'qseqid': i, 'sseqid': '*'} for i in missing_ids], + columns=result.columns).fillna(0) + result = pd.concat([result, missed], axis=0) return result def classify_consensus_blast(ctx, query, - reference_reads, reference_taxonomy, + blastdb=None, + reference_reads=None, maxaccepts=DEFAULTMAXACCEPTS, perc_identity=DEFAULTPERCENTID, query_cov=DEFAULTQUERYCOV, @@ -96,13 +121,18 @@ evalue=DEFAULTEVALUE, output_no_hits=DEFAULTOUTPUTNOHITS, min_consensus=DEFAULTMINCONSENSUS, - unassignable_label=DEFAULTUNASSIGNABLELABEL): + unassignable_label=DEFAULTUNASSIGNABLELABEL, + num_threads=DEFAULTNUMTHREADS): + if num_threads == 0: + num_threads = get_available_cores() + search_db = ctx.get_action('feature_classifier', 'blast') lca = ctx.get_action('feature_classifier', 'find_consensus_annotation') - result, = search_db(query=query, reference_reads=reference_reads, + result, = search_db(query=query, blastdb=blastdb, + reference_reads=reference_reads, maxaccepts=maxaccepts, perc_identity=perc_identity, query_cov=query_cov, strand=strand, evalue=evalue, - output_no_hits=output_no_hits) + output_no_hits=output_no_hits, num_threads=num_threads) consensus, = lca(search_results=result, reference_taxonomy=reference_taxonomy, min_consensus=min_consensus, @@ -113,6 +143,15 @@ return consensus, result +def makeblastdb(sequences: DNAFASTAFormat) -> BLASTDBDirFmtV5: + database = BLASTDBDirFmtV5() + build_cmd = ['makeblastdb', '-blastdb_version', '5', '-dbtype', 'nucl', + '-title', 'blastdb', '-in', str(sequences), + '-out', os.path.join(str(database.path), 'blastdb')] + _run_command(build_cmd) + return database + + # Replace this function with QIIME2 API for wrapping commands/binaries, # pending https://github.com/qiime2/qiime2/issues/224 def _run_command(cmd, verbose=True): @@ -128,10 +167,14 @@ inputs = {'query': FeatureData[Sequence], + 'blastdb': BLASTDB, 'reference_reads': FeatureData[Sequence]} input_descriptions = {'query': 'Query sequences.', - 'reference_reads': 'Reference sequences.'} + 'blastdb': 'BLAST indexed database. Incompatible with ' + 'reference_reads.', + 'reference_reads': 'Reference sequences. Incompatible ' + 'with blastdb.'} classification_output = ('classification', FeatureData[Taxonomy]) @@ -144,6 +187,7 @@ 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True), 'strand': Str % Choices(['both', 'plus', 'minus']), 'output_no_hits': Bool, + 'num_threads': Threads, } parameter_descriptions = { @@ -170,6 +214,8 @@ 'unclassified sequences, otherwise you may run into ' 'errors downstream from missing feature IDs. Set to ' 'FALSE to mirror default BLAST search.', + 'num_threads': 'Number of threads (CPUs) to use in the BLAST search. ' + 'Pass 0 to use all available CPUs.', } blast6_output = ('search_results', FeatureData[BLAST6]) @@ -177,6 +223,20 @@ blast6_output_description = {'search_results': 'Top hits for each query.'} +plugin.methods.register_function( + function=makeblastdb, + inputs={'sequences': FeatureData[Sequence]}, + parameters={}, + outputs=[('database', BLASTDB)], + input_descriptions={'sequences': 'Input reference sequences.'}, + parameter_descriptions={}, + output_descriptions={'database': 'Output BLAST database.'}, + name='Make BLAST database.', + description=('Make BLAST database from custom sequence collection.'), + citations=[citations['camacho2009blast+']] +) + + # Note: name should be changed to blastn if we do NOT generalize this function plugin.methods.register_function( function=blast, diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/_consensus_assignment.py q2-feature-classifier-2024.2.0/q2_feature_classifier/_consensus_assignment.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/_consensus_assignment.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/_consensus_assignment.py 2024-02-16 21:57:24.000000000 +0000 @@ -32,19 +32,19 @@ unassignable_label: str = DEFAULTUNASSIGNABLELABEL ) -> pd.DataFrame: - '''Find consensus taxonomy from BLAST6Format alignment summary. + """Find consensus taxonomy from BLAST6Format alignment summary. search_results: pd.dataframe BLAST6Format search results with canonical headers attached. reference_taxonomy: pd.Series Annotations of reference database used for original search. min_consensus : float - The minimum fraction of the annotations that a specfic annotation + The minimum fraction of the annotations that a specific annotation must be present in for that annotation to be accepted. Current lower boundary is 0.51. unassignable_label : str The label to apply if no acceptable annotations are identified. - ''' + """ # load and convert blast6format results to dict of taxa hits obs_taxa = _blast6format_df_to_series_of_lists( search_results, reference_taxonomy, @@ -85,9 +85,11 @@ def _blast6format_df_to_series_of_lists( - assignments, ref_taxa, - unassignable_label=DEFAULTUNASSIGNABLELABEL): - '''import observed assignments in blast6 format to series of lists. + assignments: pd.DataFrame, + ref_taxa: pd.Series, + unassignable_label: str = DEFAULTUNASSIGNABLELABEL +) -> pd.Series: + """import observed assignments in blast6 format to series of lists. assignments: pd.DataFrame Taxonomy observation map in blast format 6. Each line consists of @@ -99,13 +101,12 @@ Annotation The accession IDs in this taxonomy should match the subject-seq-ids in the "assignment" input. - ''' - taxa_hits = assignments.set_index('qseqid')['sseqid'] - + """ # validate that assignments are present in reference taxonomy # (i.e., that the correct reference taxonomy was used). # Note that we drop unassigned labels from this set. - missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''} + missing_ids = \ + set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''} if len(missing_ids) > 0: raise KeyError('Reference taxonomy and search results do not match. ' 'The following identifiers were reported in the search ' @@ -115,9 +116,12 @@ # if vsearch fails to find assignment, it reports '*' as the # accession ID, so we will add this mapping to the reference taxonomy. ref_taxa['*'] = unassignable_label - # map accession IDs to taxonomy - taxa_hits.replace(ref_taxa, inplace=True) + assignments_copy = assignments.copy(deep=True) + for index, value in assignments_copy.iterrows(): + sseqid = assignments_copy.iloc[index]['sseqid'] + assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid] # convert to dict of {accession_id: [annotations]} + taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid'] taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list) return taxa_hits @@ -131,7 +135,7 @@ ---------- query_annotations : pd.Series of lists Indices are query identifiers, and values are lists of all - taxonomic annotations associated with that identfier. + taxonomic annotations associated with that identifier. Returns ------- pd.DataFrame @@ -191,7 +195,7 @@ annotations : list of lists Taxonomic annotations to form consensus. min_consensus : float - The minimum fraction of the annotations that a specfic annotation + The minimum fraction of the annotations that a specific annotation must be present in for that annotation to be accepted. Current lower boundary is 0.51. unassignable_label : str @@ -211,7 +215,7 @@ # This assumes that a hierarchical taxonomy with even numbers of # ranks was used. taxa_comparison = [Counter(rank) for rank in zip(*annotations)] - # interate rank comparisons in reverse + # iterate rank comparisons in reverse # to find rank with consensus count > threshold for rank in taxa_comparison[::-1]: # grab most common label and its count diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/_skl.py q2-feature-classifier-2024.2.0/q2_feature_classifier/_skl.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/_skl.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/_skl.py 2024-02-16 21:57:24.000000000 +0000 @@ -6,11 +6,55 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from dataclasses import dataclass, field +from functools import cached_property from itertools import islice, repeat -from copy import deepcopy +from typing import Dict, List from joblib import Parallel, delayed + +@dataclass +class _TaxonNode: + # The _TaxonNode is used to build a hierarchy from a list of sorted class + # labels. It allows one to quickly find class label indices of taxonomy + # labels that satisfy a given taxonomy hierarchy. For example, given the + # 'k__Bacteria' taxon, the _TaxonNode.range property will yield all class + # label indices where 'k__Bacteria' is a prefix. + + name: str + offset_index: int + children: Dict[str, "_TaxonNode"] = field( + default_factory=dict, + repr=False) + + @classmethod + def create_tree(cls, classes: List[str], separator: str): + if not all(a <= b for a, b in zip(classes, classes[1:])): + raise Exception("classes must be in sorted order") + root = cls("Unassigned", 0) + for class_start_index, label in enumerate(classes): + taxons = label.split(separator) + node = root + for name in taxons: + if name not in node.children: + node.children[name] = cls(name, class_start_index) + node = node.children[name] + return root + + @property + def range(self) -> range: + return range( + self.offset_index, + self.offset_index + self.num_leaf_nodes) + + @cached_property + def num_leaf_nodes(self) -> int: + if len(self.children) == 0: + return 1 + return sum(c.num_leaf_nodes for c in self.children.values()) + + _specific_fitters = [ ['naive_bayes', [['feat_ext', @@ -71,25 +115,26 @@ y = pipeline.classes_[prob_pos.argmax(axis=1)] + taxonomy_tree = _TaxonNode.create_tree(pipeline.classes_, separator) + results = [] - split_classes = [c.split(separator) for c in pipeline.classes_] for seq_id, taxon, class_probs in zip(seq_ids, y, prob_pos): - taxon = taxon.split(separator) - classes = zip(deepcopy(split_classes), class_probs) + split_taxon = taxon.split(separator) + accepted_cum_prob = 0.0 + cum_prob = 0.0 result = [] - for level in taxon: - classes = [cls for cls in classes if cls[0].pop(0) == level] - cum_prob = sum(c[1] for c in classes) + current = taxonomy_tree + for rank in split_taxon: + current = current.children[rank] + cum_prob = class_probs[current.range].sum() if cum_prob < confidence: break - result.append(level) - result_confidence = cum_prob - if result: - result = separator.join(result) - results.append((seq_id, result, result_confidence)) + accepted_cum_prob = cum_prob + result.append(rank) + if len(result) == 0: + results.append((seq_id, "Unassigned", 1.0 - cum_prob)) else: - results.append((seq_id, 'Unassigned', 1. - cum_prob)) - + results.append((seq_id, separator.join(result), accepted_cum_prob)) return results diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/_version.py q2-feature-classifier-2024.2.0/q2_feature_classifier/_version.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/_version.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/_version.py 2024-02-16 21:57:24.000000000 +0000 @@ -23,9 +23,9 @@ # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). - git_refnames = " (tag: 2023.7.0, Release-2023.7)" - git_full = "ed298ae345a893a9efc01aaa304d60cf00944edf" - git_date = "2023-08-17 19:44:12 +0000" + git_refnames = " (tag: 2024.2.0, Release-2024.2)" + git_full = "5ce76be5b72482a7d033fb9d2c41446edd75851a" + git_date = "2024-02-16 21:57:24 +0000" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/_vsearch.py q2-feature-classifier-2024.2.0/q2_feature_classifier/_vsearch.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/_vsearch.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/_vsearch.py 2024-02-16 21:57:24.000000000 +0000 @@ -13,7 +13,7 @@ from q2_types.feature_data import ( FeatureData, Taxonomy, Sequence, DNAFASTAFormat, BLAST6, BLAST6Format) from .plugin_setup import plugin, citations -from qiime2.plugin import Int, Str, Float, Choices, Range, Bool +from qiime2.plugin import Int, Str, Float, Choices, Range, Bool, Threads from ._blast import (_run_command) from ._consensus_assignment import (DEFAULTUNASSIGNABLELABEL, min_consensus_param, @@ -214,7 +214,7 @@ 'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True), 'query_cov': Float % Range(0.0, 1.0, inclusive_end=True), 'strand': Str % Choices(['both', 'plus']), - 'threads': Int % Range(1, None), + 'threads': Threads, 'maxhits': Int % Range(1, None) | Str % Choices(['all']), 'maxrejects': Int % Range(1, None) | Str % Choices(['all'])} @@ -252,7 +252,8 @@ 'lower.', 'query_cov': 'Reject match if query alignment coverage per high-' 'scoring pair is lower.', - 'threads': 'Number of threads to use for job parallelization.', + 'threads': 'Number of threads to use for job parallelization. Pass 0 to ' + 'use one per available CPU.', 'maxhits': 'Maximum number of hits to show once the search is terminated.', 'maxrejects': 'Maximum number of non-matching target sequences to ' 'consider before stopping the search. This option works in ' diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/classifier.py q2-feature-classifier-2024.2.0/q2_feature_classifier/classifier.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/classifier.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/classifier.py 2024-02-16 21:57:24.000000000 +0000 @@ -14,7 +14,9 @@ import subprocess import pandas as pd -from qiime2.plugin import Int, Str, Float, Bool, Choices, Range +from qiime2.plugin import ( + Int, Str, Float, Bool, Choices, Range, Threads, get_available_cores +) from q2_types.feature_data import ( FeatureData, Taxonomy, Sequence, DNAIterator, DNAFASTAFormat) from q2_types.feature_table import FeatureTable, RelativeFrequency @@ -203,6 +205,13 @@ pre_dispatch: str = '2*n_jobs', confidence: float = 0.7, read_orientation: str = 'auto' ) -> pd.DataFrame: + + if n_jobs in (0, -1): + n_jobs = get_available_cores() + elif n_jobs < -1: + n_less = abs(n_jobs + 1) + n_jobs = get_available_cores(n_less=n_less) + try: # autotune reads per batch if reads_per_batch == 'auto': @@ -234,7 +243,7 @@ _classify_parameters = { 'reads_per_batch': Int % Range(1, None) | Str % Choices(['auto']), - 'n_jobs': Int, + 'n_jobs': Threads, 'pre_dispatch': Str, 'confidence': Float % Range( 0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices( @@ -257,7 +266,7 @@ 'reads_per_batch': 'Number of reads to process in each batch. If "auto", ' 'this parameter is autoscaled to ' 'min( number of query sequences / n_jobs, 20000).', - 'n_jobs': 'The maximum number of concurrently worker processes. If -1 ' + 'n_jobs': 'The maximum number of concurrent worker processes. If -1 ' 'all CPUs are used. If 1 is given, no parallel computing ' 'code is used at all, which is useful for debugging. For ' 'n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for ' diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/tests/test_classifier.py q2-feature-classifier-2024.2.0/q2_feature_classifier/tests/test_classifier.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/tests/test_classifier.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/tests/test_classifier.py 2024-02-16 21:57:24.000000000 +0000 @@ -16,7 +16,7 @@ import skbio import biom -from q2_feature_classifier._skl import _specific_fitters +from q2_feature_classifier._skl import _specific_fitters, _TaxonNode from q2_feature_classifier.classifier import spec_from_pipeline, \ pipeline_from_spec, populate_class_weight, _autotune_reads_per_batch @@ -242,3 +242,42 @@ def test_autotune_reads_per_batch_more_jobs_than_reads(self): self.assertEqual( _autotune_reads_per_batch(self.seq_path, n_jobs=1105), 1) + + def test_TaxonNode_create_tree(self): + classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g'] + separator = ';' + tree = _TaxonNode.create_tree(classes, separator) + self.assertEqual( + tree.children['a'].children['b'].children['c'].name, 'c') + self.assertEqual( + tree.children['a'].children['b'].children['d'].name, 'd') + self.assertEqual( + tree.children['a'].children['e'].children['f'].name, 'f') + self.assertEqual( + tree.children['a'].children['e'].children['g'].name, 'g') + + def test_TaxonNode_range(self): + classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g'] + separator = ';' + tree = _TaxonNode.create_tree(classes, separator) + self.assertEqual( + tree.children['a'].children['b'].children['c'].range, range(0, 1)) + self.assertEqual( + tree.children['a'].children['b'].children['d'].range, range(1, 2)) + self.assertEqual( + tree.children['a'].children['e'].children['f'].range, range(2, 3)) + self.assertEqual( + tree.children['a'].children['e'].children['g'].range, range(3, 4)) + self.assertEqual( + tree.children['a'].children['b'].range, range(0, 2)) + self.assertEqual( + tree.children['a'].children['e'].range, range(2, 4)) + + def test_TaxonNode_num_leaf_nodes(self): + classes = ['a;b;c', 'a;b;d', 'a;e;f', 'a;e;g'] + separator = ';' + tree = _TaxonNode.create_tree(classes, separator) + self.assertEqual(tree.num_leaf_nodes, 4) + self.assertEqual(tree.children['a'].num_leaf_nodes, 4) + self.assertEqual(tree.children['a'].children['b'].num_leaf_nodes, 2) + self.assertEqual(tree.children['a'].children['e'].num_leaf_nodes, 2) diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/tests/test_consensus_assignment.py q2-feature-classifier-2024.2.0/q2_feature_classifier/tests/test_consensus_assignment.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/tests/test_consensus_assignment.py 2023-08-17 19:44:12.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/tests/test_consensus_assignment.py 2024-02-16 21:57:24.000000000 +0000 @@ -31,6 +31,20 @@ 'FeatureData[Sequence]', self.get_data_path('se-dna-sequences.fasta')) + # The blastdb format is not documented in enough detail to validate + # so for now we just run together with blastn to validate. + def test_makeblastdb_and_blast(self): + db, = qfc.actions.makeblastdb(self.ref) + print(db) + result1, = qfc.actions.blast(self.query, blastdb=db) + result2, = qfc.actions.blast(self.query, self.ref) + pdt.assert_frame_equal(result1.view(pd.DataFrame), + result2.view(pd.DataFrame)) + with self.assertRaisesRegex(ValueError, "Only one.*can be provided"): + qfc.actions.blast(self.query, reference_reads=self.ref, blastdb=db) + with self.assertRaisesRegex(ValueError, "Either.*must be provided"): + qfc.actions.blast(self.query) + def test_blast(self): result, = qfc.actions.blast( self.query, self.ref, maxaccepts=3, perc_identity=0.9) @@ -164,7 +178,8 @@ # search and/or taxonomy classification outputs. def test_classify_consensus_blast(self): result, _, = qfc.actions.classify_consensus_blast( - self.reads, self.reads, self.taxonomy) + query=self.reads, reference_reads=self.reads, + reference_taxonomy=self.taxonomy) self.assertTrue(series_is_subset(self.exp, result.view(pd.Series))) def test_classify_consensus_vsearch(self): diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/types/__init__.py q2-feature-classifier-2024.2.0/q2_feature_classifier/types/__init__.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/types/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/types/__init__.py 2024-02-16 21:57:24.000000000 +0000 @@ -0,0 +1,13 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from ._format import BLASTDBFileFmtV5, BLASTDBDirFmtV5 +from ._type import BLASTDB + + +__all__ = ['BLASTDBFileFmtV5', 'BLASTDBDirFmtV5', 'BLASTDB'] diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/types/_format.py q2-feature-classifier-2024.2.0/q2_feature_classifier/types/_format.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/types/_format.py 1970-01-01 00:00:00.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/types/_format.py 2024-02-16 21:57:24.000000000 +0000 @@ -0,0 +1,61 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import os +import itertools +from qiime2.plugin import model +from ..plugin_setup import plugin, citations + + +class BLASTDBFileFmtV5(model.BinaryFileFormat): + # We do not have a good way to validate the individual blastdb files. + # TODO: could wire up `blastdbcheck` to do a deep check when level=max + # but this must be done on the directory, not individual files. + # For now validation be done at the DirFmt level on file extensions. + def _validate_(self, level): + pass + + +class BLASTDBDirFmtV5(model.DirectoryFormat): + # TODO: is there a more robust way to do this/make some files optional? + # Some file extensions were introduced with more recent versions of + # blast, but are not actually needed for our purposes. Making these + # optional would allow more flexibility in blast versions, avoiding + # possible dependency conflicts. + # NOTE that the .n?? extensions are also nucleotide database specific. + # should we rather call the type/formats BLASTNucDB*? + idx1 = model.File(r'.+\.ndb', format=BLASTDBFileFmtV5) + idx2 = model.File(r'.+\.nhr', format=BLASTDBFileFmtV5) + idx3 = model.File(r'.+\.nin', format=BLASTDBFileFmtV5) + idx4 = model.File(r'.+\.not', format=BLASTDBFileFmtV5) + idx5 = model.File(r'.+\.nsq', format=BLASTDBFileFmtV5) + idx6 = model.File(r'.+\.ntf', format=BLASTDBFileFmtV5) + idx7 = model.File(r'.+\.nto', format=BLASTDBFileFmtV5) + # introducted in blast 2.13.0 + # https://ncbiinsights.ncbi.nlm.nih.gov/2022/03/29/blast-2-13-0/ + idx8 = model.File(r'.+\.njs', format=BLASTDBFileFmtV5) + + # borrowed from q2-types + def get_basename(self): + paths = [str(x.relative_to(self.path)) for x in self.path.iterdir()] + prefix = os.path.splitext(_get_prefix(paths))[0] + return prefix + + +# SO: https://stackoverflow.com/a/6718380/579416 +def _get_prefix(strings): + def all_same(x): + return all(x[0] == y for y in x) + + char_tuples = zip(*strings) + prefix_tuples = itertools.takewhile(all_same, char_tuples) + return ''.join(x[0] for x in prefix_tuples) + + +plugin.register_views(BLASTDBDirFmtV5, + citations=[citations['camacho2009blast+']]) diff -Nru q2-feature-classifier-2023.7.0/q2_feature_classifier/types/_type.py q2-feature-classifier-2024.2.0/q2_feature_classifier/types/_type.py --- q2-feature-classifier-2023.7.0/q2_feature_classifier/types/_type.py 1970-01-01 00:00:00.000000000 +0000 +++ q2-feature-classifier-2024.2.0/q2_feature_classifier/types/_type.py 2024-02-16 21:57:24.000000000 +0000 @@ -0,0 +1,17 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2016-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from qiime2.plugin import SemanticType +from . import BLASTDBDirFmtV5 +from ..plugin_setup import plugin + + +BLASTDB = SemanticType('BLASTDB') + +plugin.register_semantic_types(BLASTDB) +plugin.register_artifact_class(BLASTDB, BLASTDBDirFmtV5)