diff -Nru atropos-1.1.24+dfsg/atropos/adapters/__init__.py atropos-1.1.29+dfsg/atropos/adapters/__init__.py --- atropos-1.1.24+dfsg/atropos/adapters/__init__.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/adapters/__init__.py 2020-12-07 03:57:01.000000000 +0000 @@ -14,13 +14,13 @@ from atropos.align import Match from atropos.io.seqio import ColorspaceSequence, FastaReader from atropos.util import ( - IUPAC_BASES, GC_BASES, MergingDict, NestedDict, CountingDict, Const, + IUPAC_BASES, GC_BASES, MergingDict, NestedDict, CountingDict, Const, reverse_complement, ALPHABETS) from atropos.util import colorspace as cs class AdapterType(object): """Struct for adapter type information. - + Args: name: Adapter type name. desc: Adapter type description. @@ -32,7 +32,7 @@ self.flags = flags[0] for i in range(1, len(flags)): self.flags |= flags[i] - + def asdict(self): """Returns AdapterType fields in a dict. """ @@ -80,7 +80,7 @@ class AdapterParser(object): """Factory for Adapter classes that all use the same parameters (error rate, indels etc.). The given **kwargs will be passed to the Adapter constructors. - + Args: colorspace: Whether the adapter sequences are in colorspace. cache: The AdapterCache to use for fetching/storing named adapters. @@ -90,23 +90,23 @@ self.cache = cache self.constructor_args = kwargs self.adapter_class = ColorspaceAdapter if colorspace else Adapter - + def parse(self, spec, cmdline_type='back'): """Parse an adapter specification not using ``file:`` notation and return an object of an appropriate Adapter class. The notation for anchored 5' and 3' adapters is supported. If the name parameter is None, then an attempt is made to extract the name from the specification (If spec is 'name=ADAPTER', name will be 'name'.) - + Args: spec: The adapter spec. name: The adapter name. If not provided, one is automatically generated. cmdline_type: describes which commandline parameter was used (``-a`` is 'back', ``-b`` is 'anywhere', and ``-g`` is 'front'). - + TODO: describe the adapter spec format - + Returns: An :class:`Adapter` instance. 
""" @@ -119,14 +119,14 @@ record.sequence, cmdline_type, name) else: yield self.parse_from_spec(spec, cmdline_type) - + def parse_from_spec(self, spec, cmdline_type='back', name=None): if cmdline_type not in ADAPTER_TYPES: raise ValueError( 'cmdline_type cannot be {0!r}'.format(cmdline_type)) orig_spec = spec where = ADAPTER_TYPES[cmdline_type].flags - + if name is None and spec is None: raise ValueError('Either name or spec must be given') elif name is None: @@ -136,15 +136,15 @@ elif spec is None: if self.cache and self.cache.has_name(name): spec = self.cache.get_for_name(name) - + if spec is None: raise ValueError('Name not found: {}'.format(name)) elif name is None: name, spec = _extract_name_from_spec(spec) - + if self.cache and name is not None: self.cache.add(name, spec) - + front_anchored, back_anchored = False, False if spec.startswith('^'): spec = spec[1:] @@ -152,9 +152,9 @@ if spec.endswith('$'): spec = spec[:-1] back_anchored = True - + sequence1, middle, sequence2 = spec.partition('...') - + if where == ANYWHERE: if front_anchored or back_anchored: raise ValueError("'anywhere' (-b) adapters may not be anchored") @@ -162,7 +162,7 @@ raise ValueError("'anywhere' (-b) adapters may not be linked") return self.adapter_class( sequence=spec, where=where, name=name, **self.constructor_args) - + assert where == FRONT or where == BACK if middle == '...': if not sequence1: @@ -185,11 +185,11 @@ # automatically anchor 5' adapter if -a is used if where == BACK: front_anchored = True - + return LinkedAdapter(sequence1, sequence2, name=name, front_anchored=front_anchored, back_anchored=back_anchored, **self.constructor_args) - + if front_anchored and back_anchored: raise ValueError( 'Trying to use both "^" and "$" in adapter specification ' @@ -202,20 +202,20 @@ if where == FRONT: raise ValueError("Cannot anchor 5' adapter at 3' end") where = SUFFIX - + return self.adapter_class( sequence=spec, where=where, name=name, **self.constructor_args) - + def parse_multi(self, back=None, anywhere=None, front=None): """Parse all three types of commandline options that can be used to specify adapters. back, anywhere and front are lists of strings, corresponding to the respective commandline types (-a, -b, -g). - + Args: back: Back-adapter specs. anywhere: Anywhere-adapter specs. front: Front-adapter specs. - + Return: A list of appropriate Adapter classes. """ @@ -231,7 +231,7 @@ class Adapter(object): """An adapter knows how to match itself to a read. In particular, it knows where it should be within the read and how to interpret wildcard characters. - + Args: sequence: The adapter sequence as string. Will be converted to uppercase. Also, Us will be converted to Ts. @@ -276,7 +276,7 @@ if isinstance(alphabet, str): alphabet = ALPHABETS[alphabet] alphabet.validate_string(sequence) - + self.debug = False self.name = _generate_adapter_name() if name is None else name self.sequence = sequence @@ -320,34 +320,34 @@ # When indels are disallowed, an entirely different algorithm # should be used. self.aligner.indel_cost = 100000 - + def __repr__(self): return ''.format(**vars(self)) - + def enable_debug(self): """Print out the dynamic programming matrix after matching a read to an adapter. """ self.debug = True self.aligner.enable_debug() - + def match_to(self, read): """Attempt to match this adapter to the given read. - + Args: read: A :class:`Sequence` instance. 
- + Returns: A :class:`Match` instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). """ read_seq = read.sequence.upper() - + # try to find an exact match first unless wildcards are allowed pos = -1 if not self.adapter_wildcards: @@ -359,13 +359,13 @@ pos = (len(read_seq) - len(self.sequence)) else: pos = read_seq.find(self.sequence) - + if pos >= 0: seqlen = len(self.sequence) return Match( 0, seqlen, pos, pos + seqlen, seqlen, 0, self._front_flag, self, read) - + # try approximate matching if not self.indels and self.where in (PREFIX, SUFFIX): if self.where == PREFIX: @@ -382,7 +382,7 @@ alignment = self.aligner.locate(read_seq) if self.debug: print(self.aligner.dpmatrix) # pragma: no cover - + if alignment: astart, astop, rstart, rstop, matches, errors = alignment size = astop - astart @@ -396,12 +396,12 @@ return Match( astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read) - + return None - + def _trimmed_anywhere(self, match): """Trims an adapter from either the front or back of sequence. - + Returns: A :class:`Sequence` instance: the trimmed read. """ @@ -409,10 +409,10 @@ return self._trimmed_front(match) else: return self._trimmed_back(match) - + def _trimmed_front(self, match): """Trims an adapter from the front of sequence. - + Returns: A :class:`Sequence` instance: the trimmed read. """ @@ -420,10 +420,10 @@ self.lengths_front[match.rstop] += 1 self.errors_front[match.rstop][match.errors] += 1 return match.read[match.rstop:] - + def _trimmed_back(self, match): """Trims an adapter from the back of sequence. - + Returns: A :class:`Sequence` instance: the trimmed read. """ @@ -434,12 +434,12 @@ adjacent_base = '' self.adjacent_bases[adjacent_base] += 1 return match.read[:match.rstart] - + def __len__(self): return len(self.sequence) - + def random_match_probabilities(self): - """Estimate probabilities that this adapter matches a random sequence. + """Estimate probabilities that this adapter matches a random sequence. Indels are not taken into account. Returns: @@ -451,12 +451,12 @@ seq = self.sequence[::-1] else: seq = self.sequence - + #matches = 0 base_probs = (self.gc_content / 2.0, (1 - self.gc_content) / 2.0) probabilities = [1.0] + ([0] * len(seq)) c_bases = frozenset(GC_BASES if self.adapter_wildcards else 'GC') - + # TODO: this doesn't work; need to figure out if RandomMatchProbability # can be used for this. #for idx, base in enumerate(seq, 1): @@ -464,32 +464,32 @@ # matches += 1 # probabilities[idx] = self.match_probability( # matches, idx, *base_probs) - + cur_p = 1.0 for idx, base in enumerate(seq, 1): cur_p *= base_probs[0 if base in c_bases else 1] probabilities[idx] = cur_p return probabilities - + def summarize(self): """Summarize the activities of this :class:`Adapter`. 
""" total_front = sum(self.lengths_front.values()) total_back = sum(self.lengths_back.values()) - + stats = MergingDict( adapter_class=self.__class__.__name__, total_front=total_front, total_back=total_back, total=total_front + total_back, match_probabilities=Const(self.random_match_probabilities())) - + where = self.where assert ( where in (ANYWHERE, LINKED) or (where in (BACK, SUFFIX) and total_front == 0) or (where in (FRONT, PREFIX) and total_back == 0)) - + stats["where"] = where_int_to_dict(where) stats["sequence"] = Const(self.sequence) stats["max_error_rate"] = Const(self.max_error_rate) @@ -501,12 +501,12 @@ stats["errors_back"] = self.errors_back if where in (BACK, SUFFIX): stats["adjacent_bases"] = self.adjacent_bases - + return stats class ColorspaceAdapter(Adapter): """An adapter for a colorspace sequence. - + Args: args, kwargs: Arguments to pass to :class:`Adapter` constructor. """ @@ -525,13 +525,13 @@ raise ValueError( "A 5' colorspace adapter needs to be given in nucleotide space") self.aligner.reference = self.sequence - + def match_to(self, read): """Attempt to match this adapter to the given read. - + Args: read: A :class:`Sequence` instance. - + Returns: A :class:`Match` instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, @@ -544,7 +544,7 @@ asequence = ( cs.ENCODE[read.primer + self.nucleotide_sequence[0:1]] + self.sequence) - + pos = 0 if read.sequence.startswith(asequence) else -1 if pos >= 0: match = Match( @@ -560,7 +560,7 @@ match = Match(*(alignment + (self._front_flag, self, read))) else: match = None - + if match is None: return None assert ( @@ -571,7 +571,7 @@ def _trimmed_front(self, match): """Trims an adapter from the front of sequence. - + Returns: A :class:`Sequence` instance: the trimmed read. """ @@ -596,7 +596,7 @@ def _trimmed_back(self, match): """Trims an adapter from the back of sequence. - + Returns: A :class:`Sequence` instance: the trimmed read. """ @@ -614,7 +614,7 @@ class LinkedMatch(object): """Represent a match of a LinkedAdapter. - + Args: front_match: The match to the front of the sequence. back_match: The match to the back of the sequence. @@ -625,7 +625,7 @@ self.back_match = back_match self.adapter = adapter assert front_match is not None - + def get_info_record(self): """Returns the info record for the either the back or forward match. """ @@ -636,7 +636,7 @@ class LinkedAdapter(object): """An adapter with linked front and back sequences. - + Args: front_sequence: Front adapter sequence. back_sequence: Back adapter sequence. @@ -653,7 +653,7 @@ where2 = SUFFIX if back_anchored else BACK self.front_anchored = front_anchored self.back_anchored = back_anchored - + # The following attributes are needed for the report self.where = LINKED self.name = _generate_adapter_name() if name is None else name @@ -661,20 +661,20 @@ front_sequence, where=where1, name=None, **kwargs) self.back_adapter = Adapter( back_sequence, where=where2, name=None, **kwargs) - + def enable_debug(self): """Enable debug on adapters. """ self.front_adapter.enable_debug() self.back_adapter.enable_debug() - + def match_to(self, read): """Match the linked adapters against the given read. If the 'front' adapter is not found, the 'back' adapter is not searched for. - + Args: read: A :class:`Sequence` instance. 
- + Returns: A :class:`Match` instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, @@ -692,10 +692,10 @@ def trimmed(self, match): """Returns the read trimmed with the front and/or back adapter trimmer(s). - + Args: match: The match to trim. - + Returns: The trimmed read. """ @@ -704,24 +704,24 @@ return self.back_adapter.trimmed(match.back_match) else: return front_trimmed - + def summarize(self): """Returns the summary dict for this adapter. """ total_front = sum(self.front_adapter.lengths_front.values()) total_back = sum(self.back_adapter.lengths_back.values()) - + stats = MergingDict( total_front=total_front, total_back=total_back, total=total_front + total_back) - + where = self.where assert ( where in (ANYWHERE, LINKED) or (where in (BACK, SUFFIX) and total_front == 0) or (where in (FRONT, PREFIX) and total_back == 0)) - + stats["where"] = where_int_to_dict(where) stats["front_sequence"] = Const(self.front_adapter.sequence) stats["front_match_probabilities"] = Const( @@ -741,12 +741,12 @@ stats["front_errors_back"] = self.front_adapter.errors_back stats["back_errors_front"] = self.back_adapter.errors_front stats["back_errors_back"] = self.back_adapter.errors_back - + return stats class AdapterCache(object): """Cache for known adapters. - + Args: path: Path to file where cache is written. auto_reverse_complement: Whether adapter reverse-complements should @@ -757,27 +757,32 @@ self.auto_reverse_complement = auto_reverse_complement if path and os.path.exists(path): with open(path, "rb") as cache: - self.seq_to_name, self.name_to_seq = pickle.load(cache) - else: - self.seq_to_name = {} - self.name_to_seq = {} - + try: + self.seq_to_name, self.name_to_seq = pickle.load(cache) + return + except: + # It is possible for the cache file to get corrupted - see #71 + pass + + self.seq_to_name = {} + self.name_to_seq = {} + @property def empty(self): """Whether the cache is empty. """ return len(self.seq_to_name) == 0 - + def save(self): """Save the cache to file. """ if self.path is not None: with open(self.path, "wb") as cache: pickle.dump((self.seq_to_name, self.name_to_seq), cache) - + def add(self, name, seq): """Add a sequence to the cache. - + Args: name: Adapter name. seq: Adapter sequence. @@ -785,16 +790,16 @@ self._add(name, seq) if self.auto_reverse_complement: self._add("{}_rc".format(name), reverse_complement(seq)) - + def _add(self, name, seq): if seq not in self.seq_to_name: self.seq_to_name[seq] = set() self.seq_to_name[seq].add(name) self.name_to_seq[name] = seq - + def load_from_file(self, path=DEFAULT_ADAPTERS_PATH): """Load cached data from a file. - + Args: path: Path from which to load. """ @@ -803,7 +808,7 @@ def load_from_url(self, url=DEFAULT_ADAPTERS_URL): """Load adapter data from a URL. - + Args: url: URL from which to load. """ @@ -816,10 +821,10 @@ if url.startswith("file:"): url = url[5:] return self.load_from_file(url) - + def load_from_fasta(self, fasta): """Load adapter data from a FASTA file. - + Args: fasta: FASTA file. """ @@ -836,7 +841,7 @@ if close: fasta.close() return num_records - + def load_default(self): """Tries to load from default URL first, then from default path. """ @@ -852,70 +857,70 @@ logging.getLogger().warning( "Error loading adapters from file %s; loading from file", DEFAULT_ADAPTERS_PATH) - + @property def names(self): """Sequence of adapter names. """ return list(self.name_to_seq.keys()) - + @property def sequences(self): """Sequence of adapter sequences. 
""" return list(self.seq_to_name.keys()) - + def iter_names(self): """Returns an iterator over adapter names. """ return self.name_to_seq.items() - + def iter_sequences(self): """Returns an iterator over adapter sequences. """ return self.seq_to_name.items() - + def has_name(self, name): """Returns whether this cache contains the specified name. - + Args: name: The adapter name. """ return name in self.name_to_seq - + def get_for_name(self, name): """Returns the sequence associated with a name. - + Args: name: The name to fetch. - + Returns: The sequence. """ return self.name_to_seq[name] - + def has_seq(self, seq): """Tests whether a sequence is in the cache. - + Args: seq: The sequence to check. - + Returns: True if the sequence is in the cache. """ return seq in self.seq_to_name - + def get_for_seq(self, seq): """Returns the name associated with a given sequence. - + Args: seq: The sequence to fetch. - + Returns: The name associated with the sequence. """ return list(self.seq_to_name[seq]) - + def summarize(self): """Returns a summary dict. Does *not* add sequence info. """ @@ -928,7 +933,7 @@ def parse_braces(sequence): """Replace all occurrences of ``x{n}`` (where x is any character) with n occurrences of x. Raise ValueError if the expression cannot be parsed. - + Examples: >>> parse_braces('TGA{5}CT') TGAAAAACT @@ -967,10 +972,10 @@ def _extract_name_from_spec(spec): """Parse an adapter specification given as 'name=adapt' into 'name' and 'adapt'. - + Args: spec: Adapter spec. - + Returns: (name, spec) """ diff -Nru atropos-1.1.24+dfsg/atropos/align/_align.pyx atropos-1.1.29+dfsg/atropos/align/_align.pyx --- atropos-1.1.24+dfsg/atropos/align/_align.pyx 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/align/_align.pyx 2020-12-07 03:57:01.000000000 +0000 @@ -206,7 +206,7 @@ self.indel_cost = indel_cost self.debug = False self._dpmatrix = None - + property min_overlap: def __get__(self): return self._min_overlap @@ -420,15 +420,15 @@ column[i].cost = cost column[i].origin = origin column[i].matches = matches - + if self.debug: with gil: for i in range(last + 1): self._dpmatrix.set_entry(i, j, column[i].cost) - + while last >= 0 and column[last].cost > k: last -= 1 - + # last can be -1 here, but will be incremented next. # TODO if last is -1, can we stop searching? if last < m: @@ -468,7 +468,7 @@ best.origin = column[i].origin best.ref_stop = i best.query_stop = n - + if best.cost == m + n: # best.cost was initialized with this value. 
# If it is unchanged, no alignment was found that has @@ -554,14 +554,19 @@ cdef int _min_overlap cdef int _num_cols cdef int _num_matches - + def __cinit__(self, double max_error_rate, int flags=SEMIGLOBAL, int min_overlap=1): self.max_error_rate = max_error_rate self.flags = flags self._min_overlap = min_overlap self._num_cols = 0 self._num_matches = 0 - + + # def __repr__(self): + # return f"MultiAligner" + def _resize_matrix(self, size): if size > self._num_cols: mem = <_Entry*> PyMem_Realloc(self.column, (size + 1) * sizeof(_Entry)) @@ -569,7 +574,7 @@ raise MemoryError() self.column = mem self._num_cols = size - + def _resize_matches(self, size): if size > self._num_matches: mem = <_Match*> PyMem_Realloc(self.match_array, (size + 1) * sizeof(_Match)) @@ -577,7 +582,7 @@ raise MemoryError() self.match_array = mem self._num_matches = size - + def locate(self, str reference, str query, int max_matches=100): """ locate(query) -> (refstart, refstop, querystart, querystop, matches, errors) @@ -593,22 +598,22 @@ The alignment itself is not returned. """ cdef int m = len(reference) - + self._resize_matrix(m) self._resize_matches(max_matches) - + cdef bytes reference_bytes = reference.encode('ascii') cdef char* s1 = reference_bytes - + cdef bytes query_bytes = query.encode('ascii') cdef char* s2 = query_bytes cdef int n = len(query) - + cdef _Match* match_array = self.match_array cdef int num_matches = 0 cdef int exact_match = -1 cdef int max_cost = m + n - + cdef _Entry* column = self.column cdef double max_error_rate = self.max_error_rate cdef bint start_in_ref = self.flags & START_WITHIN_SEQ1 @@ -667,19 +672,19 @@ for j in range(min_n + 1, max_n + 1): # remember first entry tmp_entry = column[0] - + # fill in first entry in this column if start_in_query: column[0].origin = j else: column[0].cost = j * OVERHANG_MULTIPLIER - + for i in range(1, last + 1): characters_equal = (s1[i-1] == s2[j-1]) - + # TODO: this is where we can do quality-based weighting # (i.e., add some transformation of Q, rather than 1) - + if characters_equal: # Characters match: This cannot be an indel. cost = tmp_entry.cost @@ -690,17 +695,17 @@ cost = tmp_entry.cost + 1 origin = tmp_entry.origin matches = tmp_entry.matches - + # remember current cell for next iteration tmp_entry = column[i] - + column[i].cost = cost column[i].origin = origin column[i].matches = matches - + while last >= 0 and column[last].cost > k: last -= 1 - + # last can be -1 here, but will be incremented next. # TODO if last is -1, can we stop searching? 
if last < m: @@ -711,17 +716,17 @@ cost = column[m].cost if cost > max_cost: continue - + length = m + min(column[m].origin, 0) if length >= self._min_overlap and cost <= length * max_error_rate: matches = column[m].matches - + match_array[num_matches].ref_stop = m match_array[num_matches].query_stop = j match_array[num_matches].cost = cost match_array[num_matches].origin = column[m].origin match_array[num_matches].matches = matches - + if cost == 0 and matches == m: # exact match, stop early exact_match = num_matches @@ -739,7 +744,7 @@ cost = column[i].cost if cost > max_cost: continue - + length = i + min(column[i].origin, 0) if length >= self._min_overlap and cost <= length * max_error_rate: # update best @@ -749,14 +754,14 @@ match_array[num_matches].origin = column[i].origin match_array[num_matches].matches = column[i].matches num_matches += 1 - + if num_matches == 0: result = None elif exact_match >= 0: result = [self._create_match(match_array[exact_match])] else: result = [self._create_match(match_array[i]) for i in range(num_matches)] - + return result def _create_match(self, _Match _match): diff -Nru atropos-1.1.24+dfsg/atropos/align/__init__.py atropos-1.1.29+dfsg/atropos/align/__init__.py --- atropos-1.1.24+dfsg/atropos/align/__init__.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/align/__init__.py 2020-12-07 03:57:01.000000000 +0000 @@ -91,7 +91,7 @@ #if self.adapter: # assert self.errors / self.length <= self.adapter.max_error_rate - def __str__(self): + def __repr__(self): return ( 'Match(astart={0}, astop={1}, rstart={2}, rstop={3}, matches={4}, ' 'errors={5})').format( @@ -232,6 +232,21 @@ START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2, min_insert_overlap) + # def __repr__(self) -> str: + # return f"InsertAligner" + def match_insert(self, seq1, seq2): """Use cutadapt aligner for insert and adapter matching. diff -Nru atropos-1.1.24+dfsg/atropos/commands/legacy_report.py atropos-1.1.29+dfsg/atropos/commands/legacy_report.py --- atropos-1.1.24+dfsg/atropos/commands/legacy_report.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/commands/legacy_report.py 2020-12-07 03:57:01.000000000 +0000 @@ -3,8 +3,10 @@ text-based reports. This will eventually be deprecated in favor of the Jinja2 and MultiQC reports. """ +from abc import ABCMeta, abstractmethod import math import textwrap + from atropos.io import open_output from atropos.util import truncate_string, weighted_median from .reports import BaseReportGenerator @@ -690,28 +692,25 @@ _print() print_stats_report(data, outfile) -def print_stats_report(data, outfile): - """Print stats. - Args: - data: The stats dict. - outfile: The output file. 
- """ - paired = 'read2' in data - max_count = data['read1']['counts'] - if paired: - max_count = max(max_count, data['read2']['counts']) - max_width = len(str(max_count)) - # add space for commas and column separation - max_width += (max_width // 3) + 1 +class StatsPrinter(metaclass=ABCMeta): + def __init__(self, data, outfile): + self._data = data + self._title_printer = TitlePrinter(outfile) + max_count = self._max_count() + max_width = len(str(max_count)) + # add space for commas and column separation + max_width += (max_width // 3) + 1 + self._printer = RowPrinter(outfile, (35, max_width)) + + @abstractmethod + def _max_count(self): + pass - _print_title = TitlePrinter(outfile) - _print = RowPrinter(outfile, (35, max_width)) - - def _print_histogram(title, hist1, hist2=None): - _print_title(title, level=2) + def _print_histogram(self, title, hist1, hist2=None): + self._title_printer(title, level=2) if hist1 is None: - _print("No Data") + self._printer("No Data") return if hist2: hist = ( @@ -720,118 +719,203 @@ else: hist = sorted(hist1.items(), key=lambda x: x[0]) for histbin in hist: - _print(*histbin) + self._printer(*histbin) - def _print_base_histogram(title, hist, extra_width=4, index_name='Pos'): - _print_title(title, level=2) + def _print_base_histogram(self, title, hist, extra_width=4, index_name='Pos'): + self._title_printer(title, level=2) if hist is None: - _print("No Data") + self._printer("No Data") return - _print( + self._printer( index_name, *hist['columns'], header=True, extra_width=extra_width) for pos, row in hist['rows'].items(): total_count = sum(row) base_pcts = ( round(count * 100 / total_count, 1) for count in row) - _print(pos, *base_pcts, extra_width=extra_width) + self._printer(pos, *base_pcts, extra_width=extra_width) - def _print_tile_histogram(title, hist): + def _print_tile_histogram(self, title, hist): if hist is None: - _print_title(title, level=2) - _print("No Data") + self._title_printer(title, level=2) + self._printer("No Data") return ncol = len(hist['columns']) max_tile_width = max( - 4, len(str(math.ceil(data['read1']['counts'] / ncol)))) + 1 - _print_base_histogram( + 4, len(str(math.ceil(self._data['read1']['counts'] / ncol)))) + 1 + self._print_base_histogram( title, hist, extra_width=max_tile_width, index_name='Tile') - def _print_tile_base_histogram(title, hist): + def _print_tile_base_histogram(self, title, hist): """Print a histogram of position x tile, with values as the median base quality. 
""" - _print_title(title, level=2) + self._title_printer(title, level=2) if hist is None: - _print("No Data") + self._printer("No Data") return quals = hist['columns'] tiles = hist['columns2'] ncol = len(tiles) max_tile_width = max( - 4, len(str(math.ceil(data['read1']['counts'] / ncol)))) + 1 - _print('Pos', *tiles, header=True, extra_width=max_tile_width) + 4, len(str(math.ceil(self._data['read1']['counts'] / ncol)))) + 1 + self._printer('Pos', *tiles, header=True, extra_width=max_tile_width) for pos, tiles in hist['rows'].items(): # compute the weighted median for each tile at each position - _print( + self._printer( pos, *( weighted_median(quals, tile_counts) for tile_counts in tiles.values()), extra_width=max_tile_width) - _print('', 'Read1', 'Read2', header=True) + @abstractmethod + def print_header(self): + pass + + @abstractmethod + def print_counts(self): + pass + + @abstractmethod + def print_histogram(self, title, key1, key2): + pass + + @abstractmethod + def print_tile_histograms(self, title, key): + pass + + @abstractmethod + def print_base_histograms(self, title, key): + pass + + @abstractmethod + def print_tile_base_histograms(self, title, key): + pass + + +class SingleEndStatsPrinter(StatsPrinter): + def _max_count(self): + return self._data['read1']['counts'] + + def print_header(self): + self._printer('', 'Read1', header=True) + + def print_counts(self): + self._printer("Reads:", self._data['read1']['counts']) + self._printer() + + def print_histogram(self, title, key1, key2): + if key1 in self._data['read1']: + self._print_histogram(title, self._data['read1'][key1][key2]) + self._printer() + + def print_tile_histograms(self, title, key): + if key in self._data['read1']: + self._print_tile_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + self._printer() + + def print_base_histograms(self, title, key): + if key in self._data['read1']: + self._print_base_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + self._printer() + + def print_tile_base_histograms(self, title, key): + if key in self._data['read1']: + self._print_tile_base_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + + +class PairedEndStatsPrinter(StatsPrinter): + def _max_count(self): + return max(self._data['read1']['counts'], self._data['read2']['counts']) + + def print_header(self): + self._printer('', 'Read1', 'Read2', header=True) + + def print_counts(self): + self._printer( + "Read pairs:", + self._data['read1']['counts'], + self._data['read2']['counts']) + self._printer() + + def print_histogram(self, title, key1, key2): + if key1 in self._data: + self._print_histogram( + title, + self._data['read1'][key1][key2], + self._data['read2'][key1][key2]) + self._printer() + + def print_tile_histograms(self, title, key): + if 'tile_sequence_qualities' in self._data['read1']: + self._print_tile_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + self._printer() + self._print_tile_histogram( + "Read 2 {}".format(title), + self._data['read2'][key]) + self._printer() + + def print_base_histograms(self, title, key): + if key in self._data: + self._print_base_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + self._printer() + self._print_base_histogram( + "Read 2 {}".format(title), + self._data['read2'][key]) + self._printer() + + def print_tile_base_histograms(self, title, key): + if key in self._data['read1']: + self._print_tile_base_histogram( + "Read 1 {}".format(title), + self._data['read1'][key]) + self._printer() + 
self._print_tile_base_histogram( + "Read 2 {}".format(title), + self._data['read2'][key]) + self._printer() - # Sequence-level stats - _print( - "Read pairs:" if paired else "Reads:", - data['read1']['counts'], - data['read2']['counts']) - _print() - _print_histogram( - "Sequence lengths:", - data['read1']['lengths']['hist'], - data['read2']['lengths']['hist']) - _print() - if 'qualities' in data['read1']: - _print_histogram( - "Sequence qualities:", - data['read1']['qualities']['hist'], - data['read2']['qualities']['hist']) - _print() - _print_histogram( - "Sequence GC content (%)", - data['read1']['gc']['hist'], - data['read2']['gc']['hist']) - _print() - if 'tile_sequence_qualities' in data['read1']: - _print_tile_histogram( - "Read 1 per-tile sequence qualities (%)", - data['read1']['tile_sequence_qualities']) - _print() - _print_tile_histogram( - "Read 2 per-tile sequence qualities (%)", - data['read2']['tile_sequence_qualities']) - _print() +def print_stats_report(data, outfile): + """Print stats. + + Args: + data: The stats dict. + outfile: The output file. + """ + paired = 'read2' in data + if paired: + printer = PairedEndStatsPrinter(data, outfile) + else: + printer = SingleEndStatsPrinter(data, outfile) + + printer.print_header() + + # Sequence-level stats + printer.print_counts() + printer.print_histogram("Sequence lengths:", "lengths", "hist") + printer.print_histogram("Sequence qualities:", "qualities", "hist") + printer.print_histogram("Sequence GC content (%)", "gc", "hist") + printer.print_tile_histograms( + "per-tile sequence qualities (%)", 'tile_sequence_qualities') # Base-level stats - if 'base_qualities' in data['read1']: - _print_base_histogram( - "Read 1 base qualities (%)", - data['read1']['base_qualities']) - _print() - _print_base_histogram( - "Read 2 base qualities (%)", - data['read2']['base_qualities']) - _print() - _print_base_histogram( - "Read 1 base composition (%)", - data['read1']['bases']) - _print() - _print_base_histogram( - "Read 2 base composition (%)", - data['read2']['bases']) - _print() - if 'tile_base_qualities' in data['read1']: - _print_tile_base_histogram( - "Read 1 per-tile base qualities (%)", - data['read1']['tile_base_qualities']) - _print() - _print_tile_base_histogram( - "Read 2 per-tile base qualities (%)", - data['read2']['tile_base_qualities']) - _print() + printer.print_base_histograms("base qualities (%)", 'base_qualities') + printer.print_base_histograms("base composition (%)", 'bases') + printer.print_tile_base_histograms( + "per-tile base qualities (%)", 'tile_base_qualities') + def sizeof(*x, seps=True, prec=1): """Returns the largest string size of all objects in x, where x is a diff -Nru atropos-1.1.24+dfsg/atropos/commands/multicore.py atropos-1.1.29+dfsg/atropos/commands/multicore.py --- atropos-1.1.24+dfsg/atropos/commands/multicore.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/commands/multicore.py 2020-12-07 03:57:01.000000000 +0000 @@ -5,6 +5,7 @@ from multiprocessing import Process, Value, Queue import os from queue import Empty, Full +import sys import time from atropos import AtroposError from atropos.util import run_interruptible @@ -13,9 +14,9 @@ """Max time to wait between retrying operations.""" # Control values -CONTROL_ACTIVE = 0 +CONTROL_ACTIVE = -1 """Controlled process should run normally.""" -CONTROL_ERROR = -1 +CONTROL_ERROR = -2 """Controlled process should exit.""" class MulticoreError(AtroposError): @@ -26,36 +27,36 @@ class Control(object): """Shared (long) value for passing 
control information between main and worker threads. - + Args: initial_value: Initial value of the shared control variable. """ def __init__(self, initial_value=CONTROL_ACTIVE): self.control = Value('l', initial_value) - + def check_value(self, value, lock=False): """Check that the current control value == `value`. - + Args: value: The value to check. lock: Whether to lock the shared variable before checking. - + Returns: True if the values are equal. """ return self.get_value(lock=lock) == value - + def check_value_positive(self, lock=False): - """Check that the current control value is positive. - + """Check that the current control value is >= 0. + Args: lock: Whether to lock the shared variable before checking. """ - return self.get_value(lock=lock) > 0 - + return self.get_value(lock=lock) >= 0 + def get_value(self, lock=True): """Returns the current control value. - + Args: lock: Whether to lock the shared variable before checking. """ @@ -64,10 +65,10 @@ return self.control.value else: return self.control.value - + def set_value(self, value): """Set the control value. The shared variable is always locked. - + Args: value: The value to set. """ @@ -78,7 +79,7 @@ """Queue for items with sequentially increasing priority. An item whose priority is below the current level is queued. Pop returns the item with the current priority and increments the current priority. - + Args: max_size: Maximum queue size; None == infinite. """ @@ -86,15 +87,15 @@ self.queue = {} self.max_size = max_size self.min_priority = None - + def push(self, priority, value): """Add an item to the queue with priority. - + Args: priority: An integer that determines the placement of `value` in the queue. Must be unique. value: The value to queue. - + Raises: Full if queue is full. """ @@ -105,10 +106,10 @@ self.queue[priority] = value if self.min_priority is None or priority < self.min_priority: self.min_priority = priority - + def pop(self): """Remove and return the item in the queue with lowest priority. - + Raises: Empty if the queue is empty. """ @@ -120,13 +121,13 @@ else: self.min_priority = min(self.queue.keys()) return value - + @property def full(self): """Whether the queue is full. """ return self.max_size and len(self.queue) >= self.max_size - + @property def empty(self): """Whether the queue is empty. @@ -140,11 +141,11 @@ def start(self, **kwargs): super().start(**kwargs) self.seen_batches = set() - + def process_batch(self, batch): self.seen_batches.add(batch[0]['index']) super().process_batch(batch) - + def finish(self, summary, worker=None): super().finish(summary, worker=worker) logging.getLogger().debug( @@ -154,7 +155,7 @@ class WorkerProcess(Process): """Parent class for worker processes that execute Pipelines. - + Args: index: A unique ID for the process. input_queue: Queue with batches of records to process. @@ -169,13 +170,13 @@ self.pipeline = pipeline self.summary_queue = summary_queue self.timeout = timeout - + def run(self): logging.getLogger().debug( "%s running under pid %d", self.name, os.getpid()) - + summary = {} - + def iter_batches(): """Dequeue and yield batches. """ @@ -185,7 +186,7 @@ wait_message="{} waiting on batch {{}}".format(self.name), timeout=self.timeout) yield batch - + def enqueue_summary(): """Enqueue a summary dict. 
""" @@ -196,10 +197,10 @@ self.name), timeout=self.timeout ) - + try: self.pipeline.start(worker=self) - + try: for batch in iter_batches(): if batch is None: @@ -210,19 +211,19 @@ self.pipeline.process_batch(batch) finally: self.pipeline.finish(summary, worker=self) - + logging.getLogger().debug("%s finished normally", self.name) except Exception as err: logging.getLogger().error( "Unexpected error in %s", self.name, exc_info=True) summary['exception'] = err - + logging.getLogger().debug("%s sending summary", self.name) enqueue_summary() class ParallelPipelineRunner(object): """Run a pipeline in parallel. - + Args: reader: A :class:`BatchReader`. pipeline: A :class:`Pipeline`. @@ -230,9 +231,11 @@ from command_runner. """ def __init__(self, command_runner, pipeline, threads=None): + self.threads = threads or command_runner.threads + if self.threads < 2: + raise ValueError("'threads' must be >= 2") self.command_runner = command_runner self.pipeline = pipeline - self.threads = threads or command_runner.threads self.timeout = max(command_runner.process_timeout, RETRY_INTERVAL) # Queue by which batches of reads are sent to worker processes self.input_queue = Queue(command_runner.read_queue_size) @@ -242,35 +245,35 @@ self.num_batches = None self.seen_summaries = None self.seen_batches = None - + def ensure_alive(self): """Callback when enqueue times out. """ ensure_processes(self.worker_processes) - + def after_enqueue(self): """Called after all batches are queued. """ pass - + def finish(self): """Called at the end of the run. """ pass - + def run(self): """Run the pipeline. - + Returns: The return code. """ retcode = run_interruptible(self) self.terminate(retcode) return retcode - + def terminate(self, retcode): """Terminates all processes. - + Args: retcode: The return code. """ @@ -278,33 +281,33 @@ logging.getLogger().debug("Exiting all processes") for process in self.worker_processes: kill(process, retcode, self.timeout) - + def __call__(self): # Start worker processes, reserve a thread for the reader process, # which we will get back after it completes worker_args = ( self.input_queue, self.pipeline, self.summary_queue, self.timeout) self.worker_processes = launch_workers(self.threads - 1, worker_args) - + self.num_batches = enqueue_all( self.command_runner.iterator(), self.input_queue, self.timeout, self.ensure_alive) - + logging.getLogger().debug( "Main loop complete; saw %d batches", self.num_batches) - + # Tell the worker processes no more input is coming enqueue_all( (None,) * self.threads, self.input_queue, self.timeout, self.ensure_alive) - + self.after_enqueue() - + # Now that the reader process is done, it essentially # frees up another thread to use for a worker self.worker_processes.extend( launch_workers(1, worker_args, offset=self.threads-1)) - + # Wait for all summaries to be available on queue def summary_timeout_callback(): """Ensure that workers are still alive. @@ -316,21 +319,21 @@ alive=False) except Exception as err: logging.getLogger().error(err) - + wait_on( self.summary_queue.full, wait_message="Waiting on worker summaries {}", timeout=self.timeout, wait=True, timeout_callback=summary_timeout_callback) - + # Process summary information from worker processes logging.getLogger().debug( "Processing summary information from worker processes") - + self.seen_summaries = set() self.seen_batches = set() - + def summary_fail_callback(): """Raises AtroposError with workers that did not report summaries. 
""" @@ -339,7 +342,7 @@ raise AtroposError( "Missing summaries from processes %s", ",".join(str(summ) for summ in missing_summaries)) - + for _ in range(1, self.threads+1): batch = dequeue( self.summary_queue, fail_callback=summary_fail_callback) @@ -359,7 +362,7 @@ self.seen_summaries.add(worker_index) self.seen_batches |= worker_batches self.command_runner.summary.merge(worker_summary) - + # Check if any batches were missed if self.num_batches > 0: missing_batches = ( @@ -368,7 +371,7 @@ raise AtroposError( "Workers did not process batches {}".format( ",".join(str(batch) for batch in missing_batches))) - + self.finish() def launch_workers(num_workers, args=(), offset=0, worker_class=WorkerProcess): @@ -397,7 +400,7 @@ condition, *args, wait_message="Waiting {}", timeout=None, fail_callback=None, wait=None, timeout_callback=None): """Wait on a condition to be non-False. - + Args: condition: Function that returns either False or a non-False value. args: Args with which to call `condition`. @@ -445,7 +448,7 @@ def wait_on_process(process, timeout, terminate=False): """Wait on a process to terminate. - + Args: process: The process on which to wait. timeout: Number of seconds to wait for process to terminate. @@ -464,7 +467,7 @@ queue, item, wait_message="Waiting to enqueue item {}", block_timeout=RETRY_INTERVAL, **kwargs): """Enqueue an item, using `wait_on` to wait while `queue` is full. - + Args: queue: The queue to which to add the item. item: The item to queue. @@ -485,13 +488,13 @@ def enqueue_all(iterable, queue, timeout, fail_callback): """Enqueue all items in `iterable`, using `wait_on` to wait while `queue` is full. - + Args: iterable: Iterable of items to queue. queue: The queue to which to add the item. timeout: Number of seconds to wait after each `queue.put` attempt. fail_callback: Function called (or Exception raised) after timeout. - + Returns: The number of items queued. """ diff -Nru atropos-1.1.24+dfsg/atropos/commands/qc/__init__.py atropos-1.1.29+dfsg/atropos/commands/qc/__init__.py --- atropos-1.1.24+dfsg/atropos/commands/qc/__init__.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/commands/qc/__init__.py 2020-12-07 03:57:01.000000000 +0000 @@ -16,15 +16,15 @@ self.read_statistics_class = read_statistics_class self.stats = {} self.stats_kwargs = kwargs - + def _get_stats(self, source): if source not in self.stats: self.stats[source] = self.read_statistics_class(**self.stats_kwargs) return self.stats[source] - + def handle_reads(self, context, read1, read2=None): self._get_stats(context['source']).collect(read1, read2) - + def finish(self, summary, **kwargs): super().finish(summary) summary['pre'] = dict( @@ -45,7 +45,7 @@ class CommandRunner(BaseCommandRunner): name = 'qc' - + def __call__(self): if self.paired: pipeline_class = PairedEndQcPipeline @@ -56,7 +56,7 @@ quality_base=self.quality_base) if self.stats: pipeline_args.update(self.stats) - + if self.threads is None: self.summary.update(mode='serial', threads=1) pipeline = pipeline_class(**pipeline_args) @@ -64,31 +64,27 @@ else: self.summary.update(mode='parallel', threads=self.threads) return self.run_parallel(pipeline_class, pipeline_args) - + def run_parallel(self, pipeline_class, pipeline_args): """Execute qc in parallel mode. - + Args: pipeline_class: Pipeline class to instantiate. pipeline_args: Arguments to pass to Pipeline constructor. - + Returns: The return code. 
""" from atropos.commands.multicore import ( ParallelPipelineMixin, ParallelPipelineRunner) - - logging.getLogger().debug( - "Starting atropos qc in parallel mode with threads=%d, timeout=%d", - self.threads, self.timeout) - - if self.threads < 2: - raise ValueError("'threads' must be >= 2") - - # Start worker processes, reserve a thread for the reader process, - # which we will get back after it completes + pipeline_class = type( - 'QcPipelineImpl', (ParallelPipelineMixin, pipeline_class)) + 'QcPipelineImpl', (ParallelPipelineMixin, pipeline_class), {}) pipeline = pipeline_class(**pipeline_args) runner = ParallelPipelineRunner(self, pipeline) + + logging.getLogger().debug( + "Starting atropos qc in parallel mode with threads=%d, timeout=%d", + runner.threads, runner.timeout) + return runner.run() diff -Nru atropos-1.1.24+dfsg/atropos/commands/trim/modifiers.py atropos-1.1.29+dfsg/atropos/commands/trim/modifiers.py --- atropos-1.1.24+dfsg/atropos/commands/trim/modifiers.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/commands/trim/modifiers.py 2020-12-07 03:57:01.000000000 +0000 @@ -24,13 +24,13 @@ """Modifier name. """ return self.__class__.__name__ - + @property def description(self): """Modifier description (for display). """ return getattr(self, 'display_str', self.name) - + def summarize(self): """Returns a summary of the modifier's activity as a dict. """ @@ -47,13 +47,13 @@ """ def __init__(self): self.trimmed_bases = 0 - + def __call__(self, read): raise NotImplementedError() - + def subseq(self, read, begin=0, end=None): """Returns a subsequence of a read. - + Args: read: The read to trim. begin: The first base of the subsequnce. @@ -65,10 +65,10 @@ return new_read else: return read - + def clip(self, read, front=0, back=0): """Returns a read with bases trimmed off the front/back. - + Args: read: The read to trim. front: The number of bases to trim from the front. @@ -80,7 +80,7 @@ return new_read else: return read - + def summarize(self): """Returns a summary dict. """ @@ -91,7 +91,7 @@ class AdapterCutter(Modifier): """Repeatedly find one of multiple adapters in reads. The number of times the search is repeated is specified by the times parameter. - + Args: adapters: List of Adapter objects. times: Number of times to trim. @@ -106,7 +106,7 @@ def _best_match(self, read): """Find the best matching adapter in the given read. - + Returns: Either a Match instance or None if there are no matches. """ @@ -115,7 +115,7 @@ match = adapter.match_to(read) if match is None: continue - + # the no. of matches determines which adapter fits best if best is None or match.matches > best.matches: best = match @@ -127,17 +127,17 @@ of Match instances is returned, which need to be applied consecutively to the read. The list is empty if there are no adapter matches. - + The read is converted to uppercase before it is compared to the adapter sequences. - + Cut found adapters from a single read. Return modified read. 
""" if len(read) == 0: return read - + matches = [] - + # try at most self.times times to remove an adapter trimmed_read = read for _ in range(self.times): @@ -147,16 +147,16 @@ break matches.append(match) trimmed_read = match.adapter.trimmed(match) - + if not matches: trimmed_read.match = None trimmed_read.match_info = None return trimmed_read - + if __debug__: assert len(trimmed_read) < len(read), \ "Trimmed read isn't shorter than original" - + if self.action == 'trim': # read is already trimmed, nothing to do pass @@ -175,17 +175,17 @@ # set masked sequence as sequence with original quality trimmed_read.sequence = masked_sequence trimmed_read.qualities = matches[0].read.qualities - + assert len(trimmed_read.sequence) == len(read) elif self.action is None: trimmed_read = read - + trimmed_read.match = matches[-1] trimmed_read.match_info = [match.get_info_record() for match in matches] - + self.with_adapters += 1 return trimmed_read - + def summarize(self): adapters_summary = OrderedDict() for adapter in self.adapters: @@ -200,7 +200,7 @@ class ErrorCorrectorMixin(object): """Provides a method for error correction. - + Args: mismatch_action: The action to take when a mismatch between the overlapping portions of read1 and read2 is encountered. Valid @@ -215,10 +215,10 @@ self.r2r1_min_qual_difference = -1 * min_qual_difference self.corrected_pairs = 0 self.corrected_bp = [0, 0] - + def correct_errors(self, read1, read2, insert_match, truncate_seqs=False): """Correct errors in overlapping reads. - + Args: read1: The read1 to correct. read2: The read2 to correct. @@ -231,7 +231,7 @@ # Do not attempt to correct an already corrected read if read1.corrected > 0 or read2.corrected > 0: return - + # read2 reverse-complement is the reference, read1 is the query r1_seq = list(read1.sequence) r2_seq = list(read2.sequence) @@ -265,7 +265,7 @@ r2_end = len2 - insert_match[0] r2_changed = 0 quals_equal = [] - + for i, j in zip( range(r1_start, r1_end), range(r2_end - 1, r2_start - 1, -1)): base1 = r1_seq[i] @@ -299,7 +299,7 @@ r1_changed += 1 elif self.mismatch_action == 'liberal': quals_equal.append((i, j, base1, base2)) - + if quals_equal: mean_qual1 = mean([ord(b) for b in r1_qual[r1_start:r1_end]]) mean_qual2 = mean([ord(b) for b in r2_qual[r2_start:r2_end]]) @@ -320,7 +320,7 @@ r1_seq[i] = base2 r1_qual[i] = r2_qual[j] r1_changed += 1 - + if r1_changed or r2_changed: self.corrected_pairs += 1 @@ -348,7 +348,7 @@ update_read( read2, r2_seq, r2_qual if has_quals else None, len2, 1, r2_changed) - + def summarize(self): """Returns a summary dict. """ @@ -359,7 +359,7 @@ class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin): """AdapterCutter that uses InsertAligner to first try to identify insert overlap before falling back to semi-global adapter alignment. - + Args: adapter1, adapter2: Adapters. 
action: Action to take on adapter match: trim, mask (replace adapter @@ -387,20 +387,20 @@ self.action = action self.symmetric = symmetric self.with_adapters = [0, 0] - + def __call__(self, read1, read2): read_lengths = [len(r) for r in (read1, read2)] if any(l < self.min_insert_len for l in read_lengths): return (read1, read2) - + match = self.aligner.match_insert(read1.sequence, read2.sequence) read1.insert_overlap = read2.insert_overlap = (match is not None) insert_match = None correct_errors = False - + if match: insert_match, adapter_match1, adapter_match2 = match - correct_errors = self.mismatch_action and insert_match[5] > 0 + correct_errors = self.mismatch_action is not None and insert_match[5] > 0 else: adapter_match1 = self.adapter1.match_to(read1) adapter_match2 = self.adapter2.match_to(read2) @@ -413,7 +413,7 @@ read_lengths[1] - adapter_match1.rstart, read_lengths[1], 0, adapter_match1.rstart) correct_errors = True - + # If exactly one of the two alignments failed and symmetric is True, # duplicate the good alignment if self.symmetric and sum( @@ -444,17 +444,17 @@ read_lengths[1] - adapter_match1.rstart, read_lengths[1], 0, adapter_match1.rstart) correct_errors = True - + if correct_errors: self.correct_errors(read1, read2, insert_match, truncate_seqs=True) - + return ( self.trim(read1, self.adapter1, adapter_match1, 0), self.trim(read2, self.adapter2, adapter_match2, 1)) - + def trim(self, read, adapter, match, read_idx): """Trim an adapter from a read. - + Args: read: The read to trim from. adapter: The Adapter to trim. @@ -465,17 +465,17 @@ read.match = None read.match_info = None return read - + match.adapter = adapter match.read = read match.front = False - + if self.action is None or match.rstart >= len(read): trimmed_read = read - + else: trimmed_read = adapter.trimmed(match) - + if self.action == 'mask': # add N from last modification masked_sequence = trimmed_read.sequence @@ -488,13 +488,13 @@ # This will happen as part of the refactoring to modify # Sequences in-place. pass - + trimmed_read.match = match trimmed_read.match_info = [match.get_info_record()] - + self.with_adapters[read_idx] += 1 return trimmed_read - + def summarize(self): """Returns a summary dict. """ @@ -511,12 +511,12 @@ class OverwriteRead(ReadPairModifier): """If one read is of significantly worse quality than the other, overwrite the poor quality read with the good-quality read. - + Computes the mean quality over the first `window_size` bases. If one read has mean quality < `worse_read_min_quality` and the other read has mean quality >= `better_read_min_quality`, replace the entire worse read with the better read. - + Args: worse_read_min_quality: The minimum quality below which a read is considered 'poor' @@ -534,7 +534,7 @@ self.window_size = window_size self.base = base self.summary_fn = summary_fn - + def __call__(self, read1, read2): if len(read1) < self.window_size or len(read2) < self.window_size: return (read1, read2) @@ -544,10 +544,10 @@ "lacking base qualities.") qual1 = list(quals2ints(read1.qualities[:self.window_size], self.base)) summ1 = self.summary_fn(qual1) - + qual2 = list(quals2ints(read2.qualities[:self.window_size], self.base)) summ2 = self.summary_fn(qual2) - + if ( summ1 < self.worse_read_min_quality and summ2 >= self.better_read_min_quality): @@ -559,7 +559,7 @@ summ1 >= self.better_read_min_quality): read1.corrected = 1 read2 = read1.reverse_complement() - + return (read1, read2) class UnconditionalCutter(Trimmer): @@ -567,26 +567,26 @@ from a read. 
Equivalent to MinCutter with count_trimmed=False and only_trimmed=False, but UnconditionalCutter is applied before adapter trimming. - + If the length is positive, the bases are removed from the beginning of the read. If the length is negative, the bases are removed from the end of the read. """ display_str = "Cut unconditionally" - + def __init__(self, lengths=None): super().__init__() self.front_length = self.back_length = 0 if lengths: self.front_length = sum(front for front in lengths if front > 0) self.back_length = sum(back for back in lengths if back < 0) - + def __call__(self, read): return self.clip(read, self.front_length, self.back_length) class MinCutter(Trimmer): """Ensure that a minimum number of bases have been trimmed off each end. - + Args: lengths: Sequence of bases to trim. Numbers > 0 are summed to the total bases trimmed from the front of the read, and numbers < 0 @@ -598,7 +598,7 @@ adapter-trimmed. """ display_str = "Cut conditionally" - + def __init__(self, lengths=None, count_trimmed=True, only_trimmed=False): super().__init__() self.front_length = self.back_length = 0 @@ -607,7 +607,7 @@ self.back_length = sum(back for back in lengths if back < 0) self.count_trimmed = count_trimmed self.only_trimmed = only_trimmed - + def __call__(self, read): trim_front = trim_back = True if self.only_trimmed: @@ -619,7 +619,7 @@ trim_back = False else: return read - + # TODO: distinguish between adapter trimming and other trimming. # For things like Methyl-Seq, we want to trim additional bases # after adapter trimming, but we want to count other post-adapter @@ -638,12 +638,12 @@ trimmed = read.clipped[offset+2] else: trimmed = read.clipped[offset] - + if is_front: return max(self.front_length - trimmed, 0) else: return min(trimmed + self.back_length, 0) - + return self.clip( read, to_trim(0, True) if trim_front else 0, @@ -655,7 +655,7 @@ def __init__(self, length_tag="length="): self.regex = re.compile(r"\b" + length_tag + r"[0-9]*\b") self.length_tag = length_tag - + def __call__(self, read): read = read[:] if read.name.find(self.length_tag) >= 0: @@ -668,7 +668,7 @@ """ def __init__(self, suffixes=None): self.suffixes = suffixes or [] - + def __call__(self, read): name = read.name for suffix in self.suffixes: @@ -684,7 +684,7 @@ def __init__(self, prefix="", suffix=""): self.prefix = prefix self.suffix = suffix - + def __call__(self, read): read = read[:] adapter_name = 'no_adapter' @@ -700,7 +700,7 @@ """ def __init__(self): self.double_encode_trans = str.maketrans('0123.', 'ACGTN') - + def __call__(self, read): read = read[:] read.sequence = read.sequence.translate(self.double_encode_trans) @@ -713,7 +713,7 @@ qbase = quality_base self.zero_cap_trans = str.maketrans( ''.join(map(chr, range(qbase))), chr(qbase) * qbase) - + def __call__(self, read): read = read[:] read.qualities = read.qualities.translate(self.zero_cap_trans) @@ -723,7 +723,7 @@ """Trims primer base from colorspace reads. """ display_str = "Primer-trimmed" - + def __call__(self, read): read = self.clip(read, 1) read.primer = '' @@ -733,12 +733,12 @@ """NextSeq-specific quality trimmer. """ display_str = "Quality trimmed (NextSeq)" - + def __init__(self, cutoff=0, base=33): super(NextseqQualityTrimmer, self).__init__() self.cutoff = cutoff self.base = base - + def __call__(self, read): if len(read) == 0: return read @@ -749,13 +749,13 @@ """Trim bases from the start/end of reads based on their qualities. 
""" display_str = "Quality-trimmed" - + def __init__(self, cutoff_front=0, cutoff_back=0, base=33): super(QualityTrimmer, self).__init__() self.cutoff_front = cutoff_front self.cutoff_back = cutoff_back self.base = base - + def __call__(self, read): if len(read) == 0: return read @@ -767,12 +767,12 @@ """Trims Ns from the 3' and 5' end of reads. """ display_str = "End Ns trimmed" - + def __init__(self): super(NEndTrimmer, self).__init__() self.start_trim = re.compile(r'^N+') self.end_trim = re.compile(r'N+$') - + def __call__(self, read): if len(read) == 0: return read @@ -789,7 +789,7 @@ reaction. """ display_str = "RRBS-trimmed" - + def __init__(self, trim_5p=0, trim_3p=2): super().__init__( (trim_5p, -1 * trim_3p), count_trimmed=False, only_trimmed=True) @@ -804,14 +804,14 @@ """ display_str = "Bisulfite-trimmed (Non-directional)" _regex = re.compile(r"^C[AG]A") - + def __init__(self, trim_5p=2, trim_3p=2, rrbs=False): self._non_directional_cutter = MinCutter( [trim_5p], count_trimmed=False, only_trimmed=False) self.rrbs = rrbs if rrbs: self._rrbs_cutter = RRBSTrimmer(trim_3p) - + def __call__(self, read): if len(read) == 0: return read @@ -821,7 +821,7 @@ elif self.rrbs: cutter = self._rrbs_cutter return cutter(read) if cutter else read - + def summarize(self): """Returns the number of trimmed bases. """ @@ -834,7 +834,7 @@ """EpiGnome reads are trimmed at least 6 bp on the 5' end. """ display_str = "Bisulfite-trimmed (EpiGnome/TruSeq)" - + def __init__(self): super().__init__((6,), count_trimmed=True, only_trimmed=False) @@ -843,16 +843,16 @@ trimmed off the end of read1 and the beginning of read2. """ display_str = "Bisulfite-trimmed (Swift)" - + def __init__(self, trim_5p1=0, trim_3p1=10, trim_5p2=10, trim_3p2=0): self._read1_cutter = MinCutter( (trim_5p1, -1 * trim_3p1), count_trimmed=False, only_trimmed=False) self._read2_cutter = MinCutter( (trim_5p2, -1 * trim_3p2), count_trimmed=False, only_trimmed=False) - + def __call__(self, read1, read2): return (self._read1_cutter(read1), self._read2_cutter(read2)) - + def summarize(self): return dict(bp_trimmed=( self._read1_cutter.trimmed_bases, @@ -868,19 +868,19 @@ ErrorCorrectorMixin.__init__(self, mismatch_action) self.min_overlap = int(min_overlap) if min_overlap > 1 else min_overlap self.error_rate = error_rate - + def __call__(self, read1, read2): len1 = len(read1.sequence) len2 = len(read2.sequence) min_overlap = self.min_overlap if min_overlap <= 1: min_overlap = max(2, round(self.min_overlap * min(len1, len2))) - + if len1 < min_overlap or len2 < min_overlap: return (read1, read2) - + insert_matched = read1.insert_overlap and read2.insert_overlap - + if insert_matched: # If we've already determined that there is an insert overlap # with a 3' overhang, we can constrain our alignment @@ -892,7 +892,7 @@ read2_rc = reverse_complement(read2.sequence) aligner = Aligner(read2_rc, self.error_rate, aflags) alignment = aligner.locate(read1.sequence) - + if alignment: r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment if matches >= min_overlap: @@ -900,7 +900,7 @@ # the InsertAligner if self.mismatch_action and errors > 0 and not insert_matched: self.correct_errors(read1, read2, alignment) - + if r2_start == 0 and r2_stop == len2: # r2 is fully contained in r1 pass @@ -924,10 +924,10 @@ "Invalid alignment while trying to merge read " "{}: {}".format( read1.name, ",".join(str(i) for i in alignment))) - + read1.merged = True read2 = None - + return (read1, read2) class Modifiers(object): @@ -936,20 +936,20 @@ def 
 class Modifiers(object):
@@ -936,20 +936,20 @@
     def __init__(self):
         self.modifiers = []
         self.modifier_indexes = {}
-    
+
     def add_modifier(self, mod_class, read=1|2, **kwargs):
         """Add a modifier of the specified type for one or both reads.
-        
+
         Args:
             mod_class: The type of modifier to add.
             read: Which reads this modifier should modify; 1, 2, or 3 (1|2).
             kwargs: Additional keyword arguments to the modifier constructor.
         """
         raise NotImplementedError()
-    
+
     def add_modifier_pair(self, mod_class, read1_args=None, read2_args=None):
         """Add a modifier for both reads in a pair.
-        
+
         Args:
             mod_class: The type of modifier to add. Cannot be a subclass of
                 ReadPairModifier.
@@ -957,7 +957,7 @@
                 modifier constructors.
         """
         raise NotImplementedError()
-    
+
     def _add_modifiers(self, mod_class, mods):
         idx = len(self.modifiers)
         self.modifiers.append(mods)
@@ -966,15 +966,15 @@
         else:
             self.modifier_indexes[mod_class] = [idx]
         return idx
-    
+
     def has_modifier(self, mod_class):
         """Returns True if a modifier of the specified type has been added.
         """
         return mod_class in self.modifier_indexes
-    
+
     def get_modifiers(self, mod_class=None, read=None):
         """Returns a list of modifiers that have been added.
-        
+
         Args:
             mod_class: Restrict modifiers to those of a certain type.
             read: Return only the modifiers for a given read (1 or 2)
@@ -985,10 +985,10 @@
             mods = [self.modifiers[i] for i in self.modifier_indexes[mod_class]]
         else:
             mods = []
-        
+
         if not (mods and read):
             return mods
-        
+
         read_mods = []
         for mod in mods:
             if isinstance(mod, ReadPairModifier):
@@ -996,11 +996,11 @@
             elif mod[read-1] is not None:
                 read_mods.append(mod[read-1])
         return read_mods
-    
+
     def get_adapters(self):
         """Returns the adapters from the AdapterCutter or InsertAdapterCutter
         modifier, if any.
-        
+
         Returns:
             A list [[read1_adapters], [read2_adapters]].
         """
@@ -1016,23 +1016,23 @@
             adapters[0] = [mod.adapter1]
             adapters[1] = [mod.adapter2]
         return adapters
-    
+
     def modify(self, read1, read2=None):
         """Apply registered modifiers to a read/pair.
-        
+
         Args:
             read1, read2: The reads to modify.
-        
+
         Returns:
             A tuple of modified reads (read1, read2).
         """
         raise NotImplementedError()
-    
+
     def summarize(self):
         """Returns a summary dict.
         """
         raise NotImplementedError()
-    
+
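Together, `_add_modifiers`, `has_modifier` and `get_modifiers` form a small registry: each registration is stored as a `[read1_mod, read2_mod]` pair (or a single `ReadPairModifier`), and `modifier_indexes` maps each modifier class to its positions in that list. A toy session against the `SingleEndModifiers` subclass that follows; illustration only, with a hypothetical `UppercaseReadModifier` and plain strings standing in for Atropos Sequence objects:

    class UppercaseReadModifier:
        # Hypothetical single-read modifier used only for this example.
        def __call__(self, read):
            return read.upper()

    mods = SingleEndModifiers()
    mods.add_modifier(UppercaseReadModifier, read=1)
    assert mods.has_modifier(UppercaseReadModifier)
    # Stored as a [read1_mod, None] pair, so the read-2 slot stays empty.
    assert mods.get_modifiers(UppercaseReadModifier, read=2) == []
    assert mods.modify("acgt") == ("ACGT",)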
""" @@ -1040,16 +1040,16 @@ if read != 1: raise ValueError("'read' must be 1 for single-end data") return self._add_modifiers(mod_class, [mod_class(**kwargs), None]) - + def add_modifier_pair(self, mod_class, read1_args=None, read2_args=None): if read1_args is not None: return self.add_modifier(mod_class, **read1_args) - + def modify(self, read1, read2=None): for mods in self.modifiers: read1 = mods[0](read1) return (read1,) - + def summarize(self): summary = {} for mods in self.modifiers: @@ -1066,7 +1066,7 @@ def __init__(self, paired): super().__init__() self.paired = paired - + def add_modifier(self, mod_class, read=1|2, **kwargs): if issubclass(mod_class, ReadPairModifier): if self.paired != "both" and read == 1|2: @@ -1083,7 +1083,7 @@ if not any(mods): return None return self._add_modifiers(mod_class, mods) - + def add_modifier_pair(self, mod_class, read1_args=None, read2_args=None): mods = [None, None] if read1_args is not None: @@ -1092,7 +1092,7 @@ mods[1] = mod_class(**read2_args) if any(mods): return self._add_modifiers(mod_class, mods) - + def modify(self, read1, read2=None): for mods in self.modifiers: if isinstance(mods, ReadPairModifier): @@ -1103,7 +1103,7 @@ if mods[1] is not None: read2 = mods[1](read2) return (read1, read2) - + def summarize(self): summary = {} for mods in self.modifiers: diff -Nru atropos-1.1.24+dfsg/atropos/_version.py atropos-1.1.29+dfsg/atropos/_version.py --- atropos-1.1.24+dfsg/atropos/_version.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/atropos/_version.py 2020-12-07 03:57:01.000000000 +0000 @@ -23,8 +23,8 @@ # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). - git_refnames = " (HEAD -> 1.1, tag: 1.1.24)" - git_full = "9281be92f0e52a14085841344a509f7808efcfe1" + git_refnames = " (HEAD -> 1.1, tag: 1.1.29)" + git_full = "99388da335616157d140bd385343f98626f0aa6e" keywords = {"refnames": git_refnames, "full": git_full} return keywords diff -Nru atropos-1.1.24+dfsg/CHANGES.md atropos-1.1.29+dfsg/CHANGES.md --- atropos-1.1.24+dfsg/CHANGES.md 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/CHANGES.md 2020-12-07 03:57:01.000000000 +0000 @@ -1,5 +1,25 @@ # Changes +v1.1.29 (2020.12.06) + +* Fix #122 - Atropos hangs on empty input files in multi-process mode + +v1.1.28 (2020.06.05) + +* Further fix for #100 + +v1.1.27 (2020.06.01) + +* Fix #100 - use correct property to determine timeout when running `qc` in parallel + +v1.1.26 (2020.05.04) + +* Fix #97 - handle single-end data in QC report + +v1.1.25 (2020.01.10) + +* Fix #71 - handle corrupted adapter cache file + v1.1.24 (2019.11.27) * Fix #87 - Python 3.8 compatibility, time.clock removed diff -Nru atropos-1.1.24+dfsg/debian/changelog atropos-1.1.29+dfsg/debian/changelog --- atropos-1.1.24+dfsg/debian/changelog 2020-11-19 17:30:46.000000000 +0000 +++ atropos-1.1.29+dfsg/debian/changelog 2020-12-09 21:29:55.000000000 +0000 @@ -1,14 +1,9 @@ -atropos (1.1.24+dfsg-1build2) hirsute; urgency=medium +atropos (1.1.29+dfsg-1) unstable; urgency=medium - * No-change rebuild to build with python3.9 as default. + * Team upload. + * New upstream version. - -- Matthias Klose Thu, 19 Nov 2020 18:30:46 +0100 - -atropos (1.1.24+dfsg-1build1) focal; urgency=medium - - * No-change rebuild to build with python3.8. 
diff -Nru atropos-1.1.24+dfsg/atropos/_version.py atropos-1.1.29+dfsg/atropos/_version.py
--- atropos-1.1.24+dfsg/atropos/_version.py	2019-11-27 17:44:28.000000000 +0000
+++ atropos-1.1.29+dfsg/atropos/_version.py	2020-12-07 03:57:01.000000000 +0000
@@ -23,8 +23,8 @@
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (HEAD -> 1.1, tag: 1.1.24)"
-    git_full = "9281be92f0e52a14085841344a509f7808efcfe1"
+    git_refnames = " (HEAD -> 1.1, tag: 1.1.29)"
+    git_full = "99388da335616157d140bd385343f98626f0aa6e"
     keywords = {"refnames": git_refnames, "full": git_full}
     return keywords
diff -Nru atropos-1.1.24+dfsg/CHANGES.md atropos-1.1.29+dfsg/CHANGES.md
--- atropos-1.1.24+dfsg/CHANGES.md	2019-11-27 17:44:28.000000000 +0000
+++ atropos-1.1.29+dfsg/CHANGES.md	2020-12-07 03:57:01.000000000 +0000
@@ -1,5 +1,25 @@
 # Changes
 
+v1.1.29 (2020.12.06)
+
+* Fix #122 - Atropos hangs on empty input files in multi-process mode
+
+v1.1.28 (2020.06.05)
+
+* Further fix for #100
+
+v1.1.27 (2020.06.01)
+
+* Fix #100 - use correct property to determine timeout when running `qc` in parallel
+
+v1.1.26 (2020.05.04)
+
+* Fix #97 - handle single-end data in QC report
+
+v1.1.25 (2020.01.10)
+
+* Fix #71 - handle corrupted adapter cache file
+
 v1.1.24 (2019.11.27)
 
 * Fix #87 - Python 3.8 compatibility, time.clock removed
diff -Nru atropos-1.1.24+dfsg/debian/changelog atropos-1.1.29+dfsg/debian/changelog
--- atropos-1.1.24+dfsg/debian/changelog	2020-11-19 17:30:46.000000000 +0000
+++ atropos-1.1.29+dfsg/debian/changelog	2020-12-09 21:29:55.000000000 +0000
@@ -1,14 +1,9 @@
-atropos (1.1.24+dfsg-1build2) hirsute; urgency=medium
+atropos (1.1.29+dfsg-1) unstable; urgency=medium
 
- * No-change rebuild to build with python3.9 as default.
+ * Team upload.
+ * New upstream version.
 
- -- Matthias Klose <doko@ubuntu.com>  Thu, 19 Nov 2020 18:30:46 +0100
-
-atropos (1.1.24+dfsg-1build1) focal; urgency=medium
-
- * No-change rebuild to build with python3.8.
-
- -- Matthias Klose <doko@ubuntu.com>  Sat, 25 Jan 2020 04:30:09 +0000
+ -- Steffen Moeller <moeller@debian.org>  Wed, 09 Dec 2020 22:29:55 +0100
 
 atropos (1.1.24+dfsg-1) unstable; urgency=medium
 
diff -Nru atropos-1.1.24+dfsg/debian/upstream/metadata atropos-1.1.29+dfsg/debian/upstream/metadata
--- atropos-1.1.24+dfsg/debian/upstream/metadata	2019-11-30 12:00:11.000000000 +0000
+++ atropos-1.1.29+dfsg/debian/upstream/metadata	2020-12-09 21:29:55.000000000 +0000
@@ -1,21 +1,23 @@
 Reference:
-  Author: John P. Didion and Marcel Martin and Francis S. Collins
-  Title: >
-    Atropos: specific, sensitive, and speedy trimming of sequencing reads
-  Journal: PeerJ
-  Year: 2017
-  Volume: 5
-  Pages: e3720
-  DOI: 10.7717/peerj.3720
-  PMID: 28875074
-  URL: https://peerj.com/articles/3720/
-  eprint: https://peerj.com/articles/3720.pdf
+  - Author: John P. Didion and Marcel Martin and Francis S. Collins
+    Title: >
+      Atropos: specific, sensitive, and speedy trimming of sequencing reads
+    Journal: PeerJ
+    Year: 2017
+    Volume: 5
+    Pages: e3720
+    DOI: 10.7717/peerj.3720
+    PMID: 28875074
+    URL: https://peerj.com/articles/3720/
+    eprint: https://peerj.com/articles/3720.pdf
 Registry:
  - Name: OMICtools
    Entry: OMICS_20869
  - Name: bio.tools
-   Entry: NA
+   Entry: atropos
  - Name: SciCrunch
    Entry: NA
  - Name: conda:bioconda
    Entry: atropos
+Bug-Database: https://github.com/jdidion/atropos/issues
+Bug-Submit: https://github.com/jdidion/atropos/issues/new
diff -Nru atropos-1.1.24+dfsg/debian/watch atropos-1.1.29+dfsg/debian/watch
--- atropos-1.1.24+dfsg/debian/watch	2019-11-30 12:00:11.000000000 +0000
+++ atropos-1.1.29+dfsg/debian/watch	2020-12-09 21:29:55.000000000 +0000
@@ -1,4 +1,8 @@
 version=4
+# To only be presented with version 1.x
+#opts="repacksuffix=+dfsg,uversionmangle=s/-alpha\./~a/,dversionmangle=auto,repack,compression=xz" \
+#  https://github.com/jdidion/atropos/releases .*/archive/v?(1[0-9.]*)@ARCHIVE_EXT@
+
 opts="repacksuffix=+dfsg,dversionmangle=auto,repack,compression=xz" \
   https://github.com/jdidion/atropos/releases .*/archive/v?@ANY_VERSION@@ARCHIVE_EXT@
diff -Nru atropos-1.1.24+dfsg/.gitignore atropos-1.1.29+dfsg/.gitignore
--- atropos-1.1.24+dfsg/.gitignore	2019-11-27 17:44:28.000000000 +0000
+++ atropos-1.1.29+dfsg/.gitignore	2020-12-07 03:57:01.000000000 +0000
@@ -3,6 +3,7 @@
 build/
 dist/
 .coverage
+coverage.xml
 *~
 .tox
 galaxy/package/
@@ -50,4 +51,5 @@
 *.ipynb
 *.tar
 .pytest_cache/
-venv/
\ No newline at end of file
+venv/
+.eggs
diff -Nru atropos-1.1.24+dfsg/README.md atropos-1.1.29+dfsg/README.md
--- atropos-1.1.24+dfsg/README.md	2019-11-27 17:44:28.000000000 +0000
+++ atropos-1.1.29+dfsg/README.md	2020-12-07 03:57:01.000000000 +0000
@@ -26,7 +26,9 @@
 First install dependencies:
 
 * Required
-  * Python 3.4.5+ (python 2.x is NOT supported)
+  * Python 3.4.5+
+    * Python 2.x is NOT supported
+    * Python 3.5.2 has a [bug](https://bugs.python.org/issue28387) that [has been reported to cause random SEGFAULTs](https://github.com/jdidion/atropos/issues/88)
   * Cython 0.25.2+ (`pip install Cython`)
 * Maybe python libraries
   * pytest (for running unit tests)
diff -Nru atropos-1.1.24+dfsg/setup.py atropos-1.1.29+dfsg/setup.py
--- atropos-1.1.24+dfsg/setup.py	2019-11-27 17:44:28.000000000 +0000
+++ atropos-1.1.29+dfsg/setup.py	2020-12-07 03:57:01.000000000 +0000
@@ -179,6 +179,9 @@
         "Programming Language :: Cython",
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6"
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
3.8", + "Programming Language :: Python :: 3.9" ] ) diff -Nru atropos-1.1.24+dfsg/tests/test_paired.py atropos-1.1.29+dfsg/tests/test_paired.py --- atropos-1.1.24+dfsg/tests/test_paired.py 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/tests/test_paired.py 2020-12-07 03:57:01.000000000 +0000 @@ -2,6 +2,7 @@ import os import shutil from unittest import TestCase +import pytest from pytest import raises from atropos.commands import execute_cli, get_command from tests.utils import ( @@ -606,3 +607,13 @@ expected2='issue68.2.fq', aligners=['insert'], ) + + @pytest.mark.timeout(10) + def test_issue122(self): + run_paired( + "--threads 2 --preserve-order --no-default-adapters -a TTAGACATAT -A CAGTGGAGTA", + in1="empty.fastq", + in2="empty.fastq", + expected1="empty.fastq", + expected2="empty.fastq", + ) \ No newline at end of file diff -Nru atropos-1.1.24+dfsg/.travis.yml atropos-1.1.29+dfsg/.travis.yml --- atropos-1.1.24+dfsg/.travis.yml 2019-11-27 17:44:28.000000000 +0000 +++ atropos-1.1.29+dfsg/.travis.yml 2020-12-07 03:57:01.000000000 +0000 @@ -1,4 +1,7 @@ sudo: false +arch: + - ppc64le + - amd64 dist: xenial language: python cache: @@ -10,6 +13,7 @@ - "3.6" - "3.7" - "3.8" + - "3.9" install: - pip install --upgrade pip wheel - pip install Cython