diff -Nru samtools-1.11/amplicon_stats.c samtools-1.13/amplicon_stats.c --- samtools-1.11/amplicon_stats.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/amplicon_stats.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield @@ -75,23 +75,49 @@ sam_global_args ga; uint32_t flag_require; uint32_t flag_filter; - bed_pair_list_t sites; int max_delta; // Used for matching read to amplicon primer loc int min_depth[MAX_DEPTH]; // Used for coverage; must be >= min_depth deep int use_sample_name; int max_amp; // Total number of amplicons int max_amp_len; // Maximum length of an individual amplicon - int64_t max_len; // Maximum reference length double depth_bin;// aggregate depth within this fraction int tlen_adj; // Adjust tlen by this amount, due to clip but no fixmate FILE *out_fp; char *argv; int tcoord_min_count; int tcoord_bin; + int multi_ref; } astats_args_t; +typedef struct { + int nseq; // total sequence count + int nfiltered; // sequence filtered + int nfailprimer;// count of sequences not matching the primer locations + + // Sizes of memory allocated below, to permit reset + int max_amp, max_amp_len, max_len; + + // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc + int64_t *nreads, *nreads2; // [max_amp] + double *nfull_reads; // [max_amp]; 0.5/read if paired. 
+ double *nrperc, *nrperc2; // [max_amp] + int64_t *nbases, *nbases2; // [max_amp] + int64_t *coverage; // [max_amp][max_amp_len] + double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH] + double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH]; + khash_t(tcoord) **tcoord; // [max_amp+1] + + // 0 is correct pair, 1 is incorrect pair, 2 is unidentified + int (*amp_dist)[3]; // [MAX_AMP][3]; + + int *depth_valid; // [max_len] + int *depth_all; // [max_len] + khash_t(qname) *qend; // queryname end, for overlap removal +} astats_t; + // We can have multiple primers for LEFT / RIGHT, so this // permits detection by any compatible combination. +// One reference: typedef struct { int64_t left[MAX_PRIMER_PER_AMPLICON]; int nleft; @@ -101,35 +127,45 @@ int64_t min_left, max_right; // outer dimensions } amplicon_t; -// Map positions to amplicon numbers. +// Multiple references, we have an array of amplicons_t - one per used ref. +// We have per reference local and global stats here, as some of the stats +// are coordinate based. However we report them combined together as a single +// list across all references. +// "namp" is the number of amplicons in this reference, but they're +// numbered first_amp to first_amp+namp-1 inclusively. +typedef struct { + int tid, namp; + int64_t len; + bed_entry_list_t *sites; + amplicon_t *amp; + astats_t *lstats, *gstats; // local (1 file) and global (all file) stats + const char *ref; // ref name (pointer to the bed hash table key) + int first_amp; // first amplicon number for this ref +} amplicons_t; + +// Reinitialised for each new reference/chromosome. +// Counts from 1 to namp, -1 for no match and 0 for ?. static int *pos2start = NULL; static int *pos2end = NULL; -static int64_t pos_lookup_len = 0; +static int pos2size = 0; // allocated size of pos2start/end // Lookup table to go from position to amplicon based on // read start / end. -// -// NB: Could do bed2amplicon and this code as bed2pos() in a single step. 
static int initialise_amp_pos_lookup(astats_args_t *args, - amplicon_t *amp, int namp, - int64_t max_len) { + amplicons_t *amps, + int ref) { int64_t i, j; + amplicon_t *amp = amps[ref].amp; + int64_t max_len = amps[ref].len; + int namp = amps[ref].namp; - if (!pos_lookup_len) { - pos_lookup_len = max_len; - if (!(pos2start = calloc(max_len+1, sizeof(*pos2start)))) + if (max_len+1 > pos2size) { + if (!(pos2start = realloc(pos2start, (max_len+1)*sizeof(*pos2start)))) return -1; - if (!(pos2end = calloc(max_len+1, sizeof(*pos2start)))) { + if (!(pos2end = realloc(pos2end, (max_len+1)*sizeof(*pos2end)))) return -1; - } - } else if (pos_lookup_len != max_len) { - fprintf(stderr, "[ampliconstats] error: " - "input files with differing reference length"); - return -1; - } else { - return 0; // already done + pos2size = max_len; } - for (i = 0; i < max_len; i++) pos2start[i] = pos2end[i] = -1; @@ -157,16 +193,11 @@ return 0; } -static void free_amp_pos_lookup(void) { - free(pos2start); - free(pos2end); -} - // Counts amplicons. // Assumption: input BED file alternates between LEFT and RIGHT primers // per amplicon, thus we can count the number based on the switching // orientation. 
-static int count_amplicon(bed_pair_list_t *sites) { +static int count_amplicon(bed_entry_list_t *sites) { int i, namp, last_rev = 0; for (i = namp = 0; i < sites->length; i++) { if (sites->bp[i].rev == 0 && last_rev) @@ -183,9 +214,9 @@ // // Returns right most amplicon position on success, // < 0 on error -static int64_t bed2amplicon(astats_args_t *args, - bed_pair_list_t *sites, - amplicon_t *amp, int *namp) { +static int64_t bed2amplicon(astats_args_t *args, bed_entry_list_t *sites, + amplicon_t *amp, int *namp, int do_title, + const char *ref, int first_amp) { int i, j; int64_t max_right = 0; FILE *ofp = args->out_fp; @@ -199,10 +230,15 @@ amp[0].min_right = INT64_MAX; amp[0].min_left = INT64_MAX; amp[0].max_right = 0; - fprintf(ofp, "# Amplicon locations from BED file.\n"); - fprintf(ofp, "# LEFT/RIGHT are - format and " - "comma-separated for alt-primers.\n"); - fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n"); + if (do_title) { + fprintf(ofp, "# Amplicon locations from BED file.\n"); + fprintf(ofp, "# LEFT/RIGHT are - format and " + "comma-separated for alt-primers.\n"); + if (args->multi_ref) + fprintf(ofp, "#\n# AMPLICON\tREF\tNUMBER\tLEFT\tRIGHT\n"); + else + fprintf(ofp, "#\n# AMPLICON\tNUMBER\tLEFT\tRIGHT\n"); + } for (i = j = 0; i < sites->length; i++) { if (i == 0 && sites->bp[i].rev != 0) { fprintf(stderr, "[ampliconstats] error: BED file should start" @@ -224,7 +260,10 @@ if (sites->bp[i].rev == 0) { if (i == 0 || last_rev) { if (j>0) fprintf(ofp, "\n"); - fprintf(ofp, "AMPLICON\t%d", j+1); + if (args->multi_ref) + fprintf(ofp, "AMPLICON\t%s\t%d", ref, j+1 + first_amp); + else + fprintf(ofp, "AMPLICON\t%d", j+1); } if (amp[j].nleft >= MAX_PRIMER_PER_AMPLICON) { print_error_errno("ampliconstats", @@ -296,33 +335,6 @@ return max_right; } -typedef struct { - int nseq; // total sequence count - int nfiltered; // sequence filtered - int nfailprimer;// count of sequences not matching the primer locations - - // Sizes of memory allocated below, to 
permit reset - int max_amp, max_amp_len, max_len; - - // Summary across all samples, sum(x) plus sum(x^2) for s.d. calc - int64_t *nreads, *nreads2; // [max_amp] - double *nfull_reads; // [max_amp]; 0.5/read if paired. - double *nrperc, *nrperc2; // [max_amp] - int64_t *nbases, *nbases2; // [max_amp] - int64_t *coverage; // [max_amp][max_amp_len] - double (*covered_perc)[MAX_DEPTH]; // [max_amp][MAX_DEPTH] - double (*covered_perc2)[MAX_DEPTH];// [max_amp][MAX_DEPTH]; - khash_t(tcoord) **tcoord; // [max_amp+1] - - // 0 is correct pair, 1 is incorrect pair, 2 is unidentified - int (*amp_dist)[3]; // [MAX_AMP][3]; - //int amp_pair[MAX_AMP][MAX_AMP]; // dotplot style - - int *depth_valid; // [max_len] - int *depth_all; // [max_len] - khash_t(qname) *qend; // queryname end, for overlap removal -} astats_t; - void stats_free(astats_t *st) { if (!st) return; @@ -410,7 +422,7 @@ return NULL; } -void stats_reset(astats_t *st) { +static void stats_reset(astats_t *st) { st->nseq = 0; st->nfiltered = 0; st->nfailprimer = 0; @@ -457,10 +469,25 @@ memset(st->amp_dist, 0, st->max_amp * sizeof(*st->amp_dist)); } -static int accumulate_stats(astats_t *stats, - astats_args_t *args, - amplicon_t *amp, int namp, +static void amp_stats_reset(amplicons_t *amps, int nref) { + int i; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + stats_reset(amps[i].lstats); + } +} + +static int accumulate_stats(astats_args_t *args, amplicons_t *amps, bam1_t *b) { + int ref = b->core.tid; + amplicon_t *amp = amps[ref].amp; + astats_t *stats = amps[ref].lstats; + int len = amps[ref].len; + + if (!stats) + return 0; + stats->nseq++; if ((b->core.flag & args->flag_require) != args->flag_require || (b->core.flag & args->flag_filter) != 0) { @@ -498,14 +525,19 @@ kh_value(stats->qend, k) = start | (end << 32); } } - for (i = mstart; i < end; i++) + for (i = mstart; i < end && i < len; i++) stats->depth_all[i]++; + if (i < end) { + print_error("ampliconstats", "record %s overhangs end of 
reference", + bam_get_qname(b)); + // But keep going, as it's harmless. + } // On single ended runs, eg ONT or PacBio, we just use the start/end // of the template to assign. int anum = (b->core.flag & BAM_FREVERSE) || !(b->core.flag & BAM_FPAIRED) - ? (end-1 >= 0 && end-1 < args->max_len ? pos2end[end-1] : -1) - : (start >= 0 && start < args->max_len ? pos2start[start] : -1); + ? (end-1 >= 0 && end-1 < len ? pos2end[end-1] : -1) + : (start >= 0 && start < len ? pos2start[start] : -1); // ivar sometimes soft-clips 100% of the bases. // This is essentially unmapped @@ -518,20 +550,24 @@ stats->nfailprimer++; if (anum >= 0) { - stats->nreads[anum]++; - // NB: ref bases rather than read bases - stats->nbases[anum] += - MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left); - - int64_t i; - if (start < 0) start = 0; - if (end > args->max_len) end = args->max_len; - - int64_t ostart = MAX(start, amp[anum].min_left-1); - int64_t oend = MIN(end, amp[anum].max_right); - int64_t offset = amp[anum].min_left-1; - for (i = ostart; i < oend; i++) - stats->coverage[anum*stats->max_amp_len + i-offset]++; + int64_t c = MIN(end,amp[anum].min_right+1) - MAX(start,amp[anum].max_left); + if (c > 0) { + stats->nreads[anum]++; + // NB: ref bases rather than read bases + stats->nbases[anum] += c; + + int64_t i; + if (start < 0) start = 0; + if (end > len) end = len; + + int64_t ostart = MAX(start, amp[anum].min_left-1); + int64_t oend = MIN(end, amp[anum].max_right); + int64_t offset = amp[anum].min_left-1; + for (i = ostart; i < oend; i++) + stats->coverage[anum*stats->max_amp_len + i-offset]++; + } else { + stats->nfailprimer++; + } } // Template length in terms of amplicon number to amplicon number. @@ -553,7 +589,7 @@ // average primer length (e.g. 50). t_end += b->core.isize > 0 ? -args->tlen_adj : +args->tlen_adj; - if (t_end > 0 && t_end < args->max_len && b->core.isize != 0) + if (t_end > 0 && t_end < len && b->core.isize != 0) oth_anum = (b->core.flag & BAM_FREVERSE) ? 
pos2start[t_end] : pos2end[t_end]; @@ -614,7 +650,7 @@ } // Append file local stats to global stats -int append_stats(astats_t *lstats, astats_t *gstats, int namp) { +int append_lstats(astats_t *lstats, astats_t *gstats, int namp, int all_nseq) { gstats->nseq += lstats->nseq; gstats->nfiltered += lstats->nfiltered; gstats->nfailprimer += lstats->nfailprimer; @@ -650,8 +686,7 @@ // To get mean & sd for amplicon read percentage, we need // to do the divisions here as nseq differs for each sample. - int nseq = lstats->nseq - lstats->nfiltered - lstats->nfailprimer; - double nrperc = nseq ? 100.0 * lstats->nreads[a] / nseq : 0; + double nrperc = all_nseq ? 100.0 * lstats->nreads[a] / all_nseq : 0; gstats->nrperc[a] += nrperc; gstats->nrperc2[a] += nrperc*nrperc; @@ -677,6 +712,26 @@ return 0; } +int append_stats(amplicons_t *amps, int nref) { + int i, r, all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = amps[r].lstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + if (append_lstats(amps[i].lstats, amps[i].gstats, amps[i].namp, + all_nseq) < 0) + return -1; + } + + return 0; +} + typedef struct { int32_t start, end; uint32_t freq; @@ -782,9 +837,9 @@ *np = k; } -int dump_stats(char type, char *name, astats_t *stats, astats_args_t *args, - amplicon_t *amp, int namp, int nfile) { - int i; +int dump_stats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref, int local) { + int i, r; FILE *ofp = args->out_fp; tcoord_t *tpos = NULL; size_t ntcoord = 0; @@ -792,51 +847,72 @@ // summary stats for this sample (or for all samples) fprintf(ofp, "# Summary stats.\n"); fprintf(ofp, "# Use 'grep ^%cSS | cut -f 2-' to extract this part.\n", type); - fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n", - type, name, stats->nseq); - fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n", - type, name, 
stats->nfiltered); - fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n", - type, name, stats->nfailprimer); - int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer; - fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n", - type, name, nmatch); - - int d = 0; - do { - // From first to last amplicon only, so not entire consensus. - // If contig length is known, maybe we want to add the missing - // count to < DEPTH figures? - int64_t start = 0, covered = 0, total = 0; - for (i = 0; i < namp; i++) { - int64_t j, offset = amp[i].min_left-1; - if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { - fprintf(stderr, "[ampliconstats] error: " - "Maximum amplicon length (%d) exceeded for '%s'\n", - stats->max_amp, name); - return -1; + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + int nmatch = stats->nseq - stats->nfiltered - stats->nfailprimer; + char *name_ref = malloc(strlen(name) + strlen(amps[r].ref) + 2); + if (!name_ref) + return -1; + if (args->multi_ref) + sprintf(name_ref, "%s\t%s", name, amps[r].ref); + else + sprintf(name_ref, "%s", name); + fprintf(ofp, "%cSS\t%s\traw total sequences:\t%d\n", + type, name_ref, stats->nseq); + fprintf(ofp, "%cSS\t%s\tfiltered sequences:\t%d\n", + type, name_ref, stats->nfiltered); + fprintf(ofp, "%cSS\t%s\tfailed primer match:\t%d\n", + type, name_ref, stats->nfailprimer); + fprintf(ofp, "%cSS\t%s\tmatching sequences:\t%d\n", + type, name_ref, nmatch); + + int d = 0; + do { + // From first to last amplicon only, so not entire consensus. + // If contig length is known, maybe we want to add the missing + // count to < DEPTH figures? 
+ int64_t start = 0, covered = 0, total = 0; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int64_t j, offset = amp[i].min_left-1; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + for (j = MAX(start, amp[i].max_left-1); + j < MAX(start, amp[i].min_right); j++) { + if (stats->coverage[i*stats->max_amp_len + j-offset] + >= args->min_depth[d]) + covered++; + total++; + } + start = MAX(start, amp[i].min_right); } - for (j = MAX(start, amp[i].max_left-1); - j < MAX(start, amp[i].min_right); j++) { - if (stats->coverage[i*stats->max_amp_len + j-offset] - >= args->min_depth[d]) - covered++; - total++; - } - start = MAX(start, amp[i].min_right); - } - fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%" - PRId64"\t%"PRId64"\n", type, name, - args->min_depth[d], args->min_depth[d], - total-covered, covered); - } while (++d < MAX_DEPTH && args->min_depth[d]); + fprintf(ofp, "%cSS\t%s\tconsensus depth count < %d and >= %d:\t%" + PRId64"\t%"PRId64"\n", type, name_ref, + args->min_depth[d], args->min_depth[d], + total-covered, covered); + } while (++d < MAX_DEPTH && args->min_depth[d]); + + free(name_ref); + } // Read count fprintf(ofp, "# Absolute matching read counts per amplicon.\n"); fprintf(ofp, "# Use 'grep ^%cREADS | cut -f 2-' to extract this part.\n", type); fprintf(ofp, "%cREADS\t%s", type, name); - for (i = 0; i < namp; i++) { - fprintf(ofp, "\t%"PRId64, stats->nreads[i]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%"PRId64, stats->nreads[i]); + } } fprintf(ofp, "\n"); @@ -844,23 +920,40 @@ // by the number we expect to cover), so +0.5 per read in pair. // A.k.a "usable depth" in the plots. 
fprintf(ofp, "%cVDEPTH\t%s", type, name); - for (i = 0; i < namp; i++) - fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) + fprintf(ofp, "\t%d", (int)stats->nfull_reads[i]); + } fprintf(ofp, "\n"); if (type == 'C') { // For combined we can compute mean & standard deviation too fprintf(ofp, "CREADS\tMEAN"); - for (i = 0; i < namp; i++) { - fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->nreads[i] / (double)nfile); + } } fprintf(ofp, "\n"); fprintf(ofp, "CREADS\tSTDDEV"); - for (i = 0; i < namp; i++) { - double n1 = stats->nreads[i]; - fprintf(ofp, "\t%.1f", sqrt(stats->nreads2[i]/(double)nfile - - (n1/nfile)*(n1/nfile))); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->nreads[i]; + fprintf(ofp, "\t%.1f", nfile > 1 && stats->nreads2[i] > 0 + ? sqrt(stats->nreads2[i]/(double)nfile + - (n1/nfile)*(n1/nfile)) + : 0); + } } fprintf(ofp, "\n"); } @@ -868,12 +961,24 @@ fprintf(ofp, "# Read percentage of distribution between amplicons.\n"); fprintf(ofp, "# Use 'grep ^%cRPERC | cut -f 2-' to extract this part.\n", type); fprintf(ofp, "%cRPERC\t%s", type, name); - for (i = 0; i < namp; i++) { - if (type == 'C') { - fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile); - } else { - int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; - fprintf(ofp, "\t%.3f", nseq ? 100.0 * stats->nreads[i] / nseq : 0); + int all_nseq = 0; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + all_nseq += stats->nseq - stats->nfiltered - stats->nfailprimer; + } + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + if (type == 'C') { + fprintf(ofp, "\t%.3f", (double)stats->nrperc[i] / nfile); + } else { + fprintf(ofp, "\t%.3f", + all_nseq ? 100.0 * stats->nreads[i] / all_nseq : 0); + } } } fprintf(ofp, "\n"); @@ -881,17 +986,27 @@ if (type == 'C') { // For combined we compute mean and standard deviation too fprintf(ofp, "CRPERC\tMEAN"); - for (i = 0; i < namp; i++) { - fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.3f", stats->nrperc[i] / nfile); + } } fprintf(ofp, "\n"); fprintf(ofp, "CRPERC\tSTDDEV"); - for (i = 0; i < namp; i++) { - // variance = SUM(X^2) - ((SUM(X)^2) / N) - double n1 = stats->nrperc[i]; - double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile); - fprintf(ofp, "\t%.3f", sqrt(v)); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // variance = SUM(X^2) - ((SUM(X)^2) / N) + double n1 = stats->nrperc[i]; + double v = stats->nrperc2[i]/nfile - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.3f", v>0?sqrt(v):0); + } } fprintf(ofp, "\n"); } @@ -900,30 +1015,48 @@ fprintf(ofp, "# Read depth per amplicon.\n"); fprintf(ofp, "# Use 'grep ^%cDEPTH | cut -f 2-' to extract this part.\n", type); fprintf(ofp, "%cDEPTH\t%s", type, name); - for (i = 0; i < namp; i++) { - int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; - int64_t alen = amp[i].min_right - amp[i].max_left+1; - fprintf(ofp, "\t%.1f", nseq ? 
stats->nbases[i] / (double)alen : 0); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen : 0); + } } fprintf(ofp, "\n"); if (type == 'C') { // For combined we can compute mean & standard deviation too - int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; fprintf(ofp, "CDEPTH\tMEAN"); - for (i = 0; i < namp; i++) { - int64_t alen = amp[i].min_right - amp[i].max_left+1; - fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + int nseq = stats->nseq - stats->nfiltered - stats->nfailprimer; + for (i = 0; i < amps[r].namp; i++) { + int64_t alen = amp[i].min_right - amp[i].max_left+1; + fprintf(ofp, "\t%.1f", nseq ? stats->nbases[i] / (double)alen / nfile : 0); + } } fprintf(ofp, "\n"); fprintf(ofp, "CDEPTH\tSTDDEV"); - for (i = 0; i < namp; i++) { - double alen = amp[i].min_right - amp[i].max_left+1; - double n1 = stats->nbases[i] / alen; - double v = stats->nbases2[i] / (alen*alen) /nfile - - (n1/nfile)*(n1/nfile); - fprintf(ofp, "\t%.1f", sqrt(v)); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + double alen = amp[i].min_right - amp[i].max_left+1; + double n1 = stats->nbases[i] / alen; + double v = stats->nbases2[i] / (alen*alen) /nfile + - (n1/nfile)*(n1/nfile); + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } } fprintf(ofp, "\n"); } @@ -935,23 +1068,30 @@ int d = 0; do { fprintf(ofp, "%cPCOV-%d\t%s", type, args->min_depth[d], name); - for (i = 0; i < namp; i++) { - int covered = 0; - if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { - fprintf(stderr, "[ampliconstats] error: " - "Maximum amplicon length (%d) exceeded for '%s'\n", - stats->max_amp, name); - return -1; - } - int64_t j, offset = amp[i].min_left-1; - for (j = amp[i].max_left-1; j < amp[i].min_right; j++) { - int apos = i*stats->max_amp_len + j-offset; - if (stats->coverage[apos] >= args->min_depth[d]) - covered++; + + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + amplicon_t *amp = amps[r].amp; + for (i = 0; i < amps[r].namp; i++) { + int covered = 0; + if (amp[i].min_right - amp[i].min_left > stats->max_amp_len) { + fprintf(stderr, "[ampliconstats] error: " + "Maximum amplicon length (%d) exceeded for '%s'\n", + stats->max_amp, name); + return -1; + } + int64_t j, offset = amp[i].min_left-1; + for (j = amp[i].max_left-1; j < amp[i].min_right; j++) { + int apos = i*stats->max_amp_len + j-offset; + if (stats->coverage[apos] >= args->min_depth[d]) + covered++; + } + int64_t alen = amp[i].min_right - amp[i].max_left+1; + stats->covered_perc[i][d] = 100.0 * covered / alen; + fprintf(ofp, "\t%.2f", 100.0 * covered / alen); } - int64_t alen = amp[i].min_right - amp[i].max_left+1; - stats->covered_perc[i][d] = 100.0 * covered / alen; - fprintf(ofp, "\t%.2f", 100.0 * covered / alen); } fprintf(ofp, "\n"); } while (++d < MAX_DEPTH && args->min_depth[d]); @@ -961,16 +1101,26 @@ int d = 0; do { fprintf(ofp, "CPCOV-%d\tMEAN", args->min_depth[d]); - for (i = 0; i < namp; i++) { - fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + fprintf(ofp, "\t%.1f", stats->covered_perc[i][d] / nfile); + } } fprintf(ofp, "\n"); fprintf(ofp, "CPCOV-%d\tSTDDEV", args->min_depth[d]); - for (i = 0; i < namp; i++) { - double n1 = stats->covered_perc[i][d] / nfile; - double v = stats->covered_perc2[i][d] / nfile - n1*n1; - fprintf(ofp, "\t%.1f", sqrt(v)); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + double n1 = stats->covered_perc[i][d] / nfile; + double v = stats->covered_perc2[i][d] / nfile - n1*n1; + fprintf(ofp, "\t%.1f", v>0?sqrt(v):0); + } } fprintf(ofp, "\n"); } while (++d < MAX_DEPTH && args->min_depth[d]); @@ -981,32 +1131,42 @@ fprintf(ofp, "# Depth per reference base for ALL data.\n"); fprintf(ofp, "# Use 'grep ^%cDP_ALL | cut -f 2-' to extract this part.\n", type); - fprintf(ofp, "%cDP_ALL\t%s\t", type, name); // Base depth in amplicons - for (i = 0; i < args->max_len; i++) { - // Basic run-length encoding provided all values are within - // +- depth_bin fraction of the mid-point. - int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j; - double dmid = (dmin + dmax)/2.0; - double low = dmid*(1-args->depth_bin); - double high = dmid*(1+args->depth_bin); - for (j = i+1; j < args->max_len; j++) { - int d = stats->depth_all[j]; - if (d < low || d > high) - break; - if (dmin > d) { - dmin = d; - dmid = (dmin + dmax)/2.0; - low = dmid*(1-args->depth_bin); - high = dmid*(1+args->depth_bin); - } else if (dmax < d) { - dmax = d; - dmid = (dmin + dmax)/2.0; - low = dmid*(1-args->depth_bin); - high = dmid*(1+args->depth_bin); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_ALL\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_ALL\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + // Basic run-length encoding provided all values are within + // +- depth_bin fraction of the mid-point. 
+ int dmin = stats->depth_all[i], dmax = stats->depth_all[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_all[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; } - fprintf(ofp, "%d,%d%c", (int)dmid, j-i, "\n\t"[j < args->max_len]); - i = j-1; + fprintf(ofp, "\n"); } // And depth for only reads matching to a single amplicon for full @@ -1014,80 +1174,96 @@ fprintf(ofp, "# Depth per reference base for full-length valid amplicon data.\n"); fprintf(ofp, "# Use 'grep ^%cDP_VALID | cut -f 2-' to extract this " "part.\n", type); - fprintf(ofp, "%cDP_VALID\t%s\t", type, name); // Base depth in amplicons - for (i = 0; i < args->max_len; i++) { - int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j; - double dmid = (dmin + dmax)/2.0; - double low = dmid*(1-args->depth_bin); - double high = dmid*(1+args->depth_bin); - for (j = i+1; j < args->max_len; j++) { - int d = stats->depth_valid[j]; - if (d < low || d > high) - break; - if (dmin > d) { - dmin = d; - dmid = (dmin + dmax)/2.0; - low = dmid*(1-args->depth_bin); - high = dmid*(1+args->depth_bin); - } else if (dmax < d) { - dmax = d; - dmid = (dmin + dmax)/2.0; - low = dmid*(1-args->depth_bin); - high = dmid*(1+args->depth_bin); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + if (args->multi_ref) + fprintf(ofp, "%cDP_VALID\t%s\t%s", type, name, amps[r].ref); + else + fprintf(ofp, "%cDP_VALID\t%s", type, name); + + for (i = 0; i < amps[r].len; i++) { + int dmin = stats->depth_valid[i], dmax = stats->depth_valid[i], j; + double dmid = (dmin + dmax)/2.0; + double low = dmid*(1-args->depth_bin); + double high = dmid*(1+args->depth_bin); + for (j = i+1; j < amps[r].len; j++) { + int d = stats->depth_valid[j]; + if (d < low || d > high) + break; + if (dmin > d) { + dmin = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } else if (dmax < d) { + dmax = d; + dmid = (dmin + dmax)/2.0; + low = dmid*(1-args->depth_bin); + high = dmid*(1+args->depth_bin); + } } + fprintf(ofp, "\t%d,%d", (int)dmid, j-i); + i = j-1; } - fprintf(ofp, "%d,%d%c", (int)dmid, j-i, "\n\t"[j < args->max_len]); - i = j-1; + fprintf(ofp, "\n"); } // TCOORD (start to end) distribution fprintf(ofp, "# Distribution of aligned template coordinates.\n"); fprintf(ofp, "# Use 'grep ^%cTCOORD | cut -f 2-' to extract this part.\n", type); - for (i = -1; i < namp; i++) { - if (ntcoord < kh_size(stats->tcoord[i+1])) { - ntcoord = kh_size(stats->tcoord[i+1]); - tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp)); - if (!tmp) { - free(tpos); - return -1; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0 - (nref==1); i < amps[r].namp; i++) { + if (ntcoord < kh_size(stats->tcoord[i+1])) { + ntcoord = kh_size(stats->tcoord[i+1]); + tcoord_t *tmp = realloc(tpos, ntcoord * sizeof(*tmp)); + if (!tmp) { + free(tpos); + return -1; + } + tpos = tmp; } - tpos = tmp; - } - khiter_t k; - size_t n = 0, j; - for (k = kh_begin(stats->tcoord[i+1]); - k != kh_end(stats->tcoord[i+1]); k++) { - if (!kh_exist(stats->tcoord[i+1], k) || - (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0) - continue; - // Key is start,end in 32-bit quantities. - // Yes this limits us to 4Gb references, but just how - // many primers are we planning on making? Not that many - // I hope. - tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff; - tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32; - - // Value is frequency (top 32-bits) and status (bottom 32). - tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff; - tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32; - n++; - } - - if (args->tcoord_bin > 1) - aggregate_tcoord(args, tpos, &n); - - fprintf(ofp, "%cTCOORD\t%s\t%d", type, name, i+1); // per amplicon - for (j = 0; j < n; j++) { - if (tpos[j].freq < args->tcoord_min_count) - continue; - fprintf(ofp, "\t%d,%d,%u,%u", - tpos[j].start, - tpos[j].end, - tpos[j].freq, - tpos[j].status); + khiter_t k; + size_t n = 0, j; + for (k = kh_begin(stats->tcoord[i+1]); + k != kh_end(stats->tcoord[i+1]); k++) { + if (!kh_exist(stats->tcoord[i+1], k) || + (kh_value(stats->tcoord[i+1], k) & 0xFFFFFFFF) == 0) + continue; + // Key is start,end in 32-bit quantities. + // Yes this limits us to 4Gb references, but just how + // many primers are we planning on making? Not that many + // I hope. + tpos[n].start = kh_key(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].end = kh_key(stats->tcoord[i+1], k)>>32; + + // Value is frequency (top 32-bits) and status (bottom 32). 
+ tpos[n].freq = kh_value(stats->tcoord[i+1], k)&0xffffffff; + tpos[n].status = kh_value(stats->tcoord[i+1], k)>>32; + n++; + } + + if (args->tcoord_bin > 1) + aggregate_tcoord(args, tpos, &n); + + fprintf(ofp, "%cTCOORD\t%s\t%d", type, name, + i+1+amps[r].first_amp); // per amplicon + for (j = 0; j < n; j++) { + if (tpos[j].freq < args->tcoord_min_count) + continue; + fprintf(ofp, "\t%d,%d,%u,%u", + tpos[j].start, + tpos[j].end, + tpos[j].freq, + tpos[j].status); + } + fprintf(ofp, "\n"); } - fprintf(ofp, "\n"); } @@ -1101,68 +1277,85 @@ fprintf(ofp, "# primers from different amplicon, and number with a position\n"); fprintf(ofp, "# not matching any valid amplicon primer site\n"); fprintf(ofp, "# Use 'grep ^%cAMP | cut -f 2-' to extract this part.\n", type); + fprintf(ofp, "%cAMP\t%s\t0", type, name); // all merged int amp_dist[3] = {0}; - for (i = 0; i < namp; i++) { // accumulate for all amps - amp_dist[0] += stats->amp_dist[i][0]; - amp_dist[1] += stats->amp_dist[i][1]; - amp_dist[2] += stats->amp_dist[i][2]; + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { // accumulate for all amps + amp_dist[0] += stats->amp_dist[i][0]; + amp_dist[1] += stats->amp_dist[i][1]; + amp_dist[2] += stats->amp_dist[i][2]; + } } fprintf(ofp, "\t%d\t%d\t%d\n", amp_dist[0], amp_dist[1], amp_dist[2]); - for (i = 0; i < namp; i++) { - fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1); // per amplicon - fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0], - stats->amp_dist[i][1], stats->amp_dist[i][2]); + for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; + astats_t *stats = local ? 
amps[r].lstats : amps[r].gstats; + for (i = 0; i < amps[r].namp; i++) { + // per amplicon + fprintf(ofp, "%cAMP\t%s\t%d", type, name, i+1+amps[r].first_amp); + fprintf(ofp, "\t%d\t%d\t%d\n", stats->amp_dist[i][0], + stats->amp_dist[i][1], stats->amp_dist[i][2]); + } } -// for (i = 0; i < namp; i++) { -// printf("%cAMP\t%s\t%d", type, name, i+1); // per amplicon -// for (j = 0; j < namp; j++) -// printf("\t%d", stats->amp_pair[i][j]); -// printf("\n"); -// } - free(tpos); return 0; } +int dump_lstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 1); +} + +int dump_gstats(astats_args_t *args, char type, char *name, int nfile, + amplicons_t *amps, int nref) { + return dump_stats(args, type, name, nfile, amps, nref, 0); +} + char const *get_sample_name(sam_hdr_t *header, char *RG) { kstring_t ks = {0}; sam_hdr_find_tag_id(header, "RG", RG?"ID":NULL, RG, "SM", &ks); return ks.s; } -int64_t get_ref_len(sam_hdr_t *header, char *SQ) { - int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0; - return tid >= 0 ? sam_hdr_tid2len(header, tid) : -1; +// Return maximum reference length (SQ is NULL) or the length +// of the specified reference in SQ. +int64_t get_ref_len(sam_hdr_t *header, const char *SQ) { + if (SQ) { + int tid = SQ ? sam_hdr_name2tid(header, SQ) : 0; + return tid >= 0 ? 
sam_hdr_tid2len(header, tid) : -1; + } else { + int nref = sam_hdr_nref(header), tid;; + int64_t len = 0; + for (tid = 0; tid < nref; tid++) { + int64_t rl = sam_hdr_tid2len(header, tid); + if (len < rl) + len = rl; + } + return len; + } } -static int amplicon_stats(astats_args_t *args, char **filev, int filec) { - int i, namp; +static int amplicon_stats(astats_args_t *args, + khash_t(bed_list_hash) *bed_hash, + char **filev, int filec) { + int i, ref = -1, ref_tid = -1, ret = -1, nref = 0; samFile *fp = NULL; sam_hdr_t *header = NULL; bam1_t *b = bam_init1(); FILE *ofp = args->out_fp; char sname_[8192], *sname = NULL; + amplicons_t *amps = NULL; - // Global stats across all samples - astats_t *gstats = NULL, *lstats = NULL; - - amplicon_t *amp = calloc(args->sites.length, sizeof(*amp)); - if (!amp) - return -1; - - namp = count_amplicon(&args->sites); - - fprintf(ofp, "# Summary statistics, used for scaling the plots.\n"); - fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version()); - fprintf(ofp, "SS\tCommand line: %s\n", args->argv); - fprintf(ofp, "SS\tNumber of amplicons:\t%d\n", namp); - fprintf(ofp, "SS\tNumber of files:\t%d\n", filec); - - // Report ref len from first file. - // Some minor duplication here. + // Report initial SS header. We gather data from the bed_hash entries + // as well as from the first SAM header (with the requirement that all + // headers should be compatible). if (filec) { if (!(fp = sam_open_format(filev[0], "r", &args->ga.in))) { print_error_errno("ampliconstats", @@ -1173,22 +1366,91 @@ if (!(header = sam_hdr_read(fp))) goto err; - // FIXME: permit other references to be specified. 
- if ((args->max_len = get_ref_len(header, NULL)) < 0) - goto err; - fprintf(ofp, "SS\tReference length:\t%"PRId64"\n", args->max_len); + if (!amps) { + amps = calloc(nref=sam_hdr_nref(header), sizeof(*amps)); + if (!amps) + goto err; + fprintf(ofp, "# Summary statistics, used for scaling the plots.\n"); + fprintf(ofp, "SS\tSamtools version: %s\n", samtools_version()); + fprintf(ofp, "SS\tCommand line: %s\n", args->argv); + fprintf(ofp, "SS\tNumber of files:\t%d\n", filec); + + // Note: order of hash entries will be different to order of + // BED file which may also differ to order of SQ headers. + // SQ header is canonical ordering (pos sorted file). + khiter_t k; + int bam_nref = sam_hdr_nref(header); + for (i = 0; i < bam_nref; i++) { + k = kh_get(bed_list_hash, bed_hash, + sam_hdr_tid2name(header, i)); + if (!kh_exist(bed_hash, k)) + continue; + + bed_entry_list_t *sites = &kh_value(bed_hash, k); + + ref = i; + amps[ref].ref = kh_key(bed_hash, k); + amps[ref].sites = sites; + amps[ref].namp = count_amplicon(sites); + amps[ref].amp = calloc(sites->length, + sizeof(*amps[ref].amp)); + if (!amps[ref].amp) + goto err; + if (args->multi_ref) + fprintf(ofp, "SS\tNumber of amplicons:\t%s\t%d\n", + kh_key(bed_hash, k), amps[ref].namp); + else + fprintf(ofp, "SS\tNumber of amplicons:\t%d\n", + amps[ref].namp); + + amps[ref].tid = ref; + if (ref_tid == -1) + ref_tid = ref; + + int64_t len = get_ref_len(header, kh_key(bed_hash, k)); + amps[ref].len = len; + if (args->multi_ref) + fprintf(ofp, "SS\tReference length:\t%s\t%"PRId64"\n", + kh_key(bed_hash, k), len); + else + fprintf(ofp, "SS\tReference length:\t%"PRId64"\n", + len); + + amps[ref].lstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + amps[ref].gstats = stats_alloc(len, args->max_amp, + args->max_amp_len); + if (!amps[ref].lstats || !amps[ref].gstats) + goto err; + } + } sam_hdr_destroy(header); + header = NULL; if (sam_close(fp) < 0) { fp = NULL; goto err; } + fp = NULL; } fprintf(ofp, "SS\tEnd of 
summary\n"); - if (bed2amplicon(args, &args->sites, amp, &namp) < 0) - goto err; + // Extract the bits of amplicon data we need from bed hash and turn + // it into a position-to-amplicon lookup table. + int offset = 0; + for (i = 0; i < nref; i++) { + if (!amps[i].sites) + continue; + + amps[i].first_amp = offset; + if (bed2amplicon(args, amps[i].sites, amps[i].amp, + &amps[i].namp, i==0, amps[i].ref, offset) < 0) + goto err; + offset += amps[i].namp; // cumulative amplicon number across refs + } + + // Now iterate over file contents, one at a time. for (i = 0; i < filec; i++) { char *nstart = filev[i]; @@ -1206,22 +1468,25 @@ if (!(header = sam_hdr_read(fp))) goto err; - if (args->use_sample_name) - sname = (char *)get_sample_name(header, NULL); - - // FIXME: permit other references to be specified. - if ((args->max_len = get_ref_len(header, NULL)) < 0) - goto err; - if (initialise_amp_pos_lookup(args, amp, namp, args->max_len) < 0) - goto err; - - if (!gstats) gstats = stats_alloc(args->max_len, args->max_amp, - args->max_amp_len); - if (!lstats) lstats = stats_alloc(args->max_len, args->max_amp, - args->max_amp_len); - if (!lstats || !gstats) + if (nref != sam_hdr_nref(header)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across input files"); goto err; + } + int r; + for (r = 0; r < nref; r++) { + if (!amps[r].ref || + strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 || + amps[r].len != sam_hdr_tid2len(header, r)) { + print_error_errno("ampliconstats", + "SAM headers are not consistent across " + "input files"); + goto err; + } + } + if (args->use_sample_name) + sname = (char *)get_sample_name(header, NULL); if (!sname) { sname = sname_; @@ -1239,11 +1504,21 @@ } // Stats local to this sample only - stats_reset(lstats); + amp_stats_reset(amps, nref); - int r; + int last_ref = -9; while ((r = sam_read1(fp, header, b)) >= 0) { - if (accumulate_stats(lstats, args, amp, namp, b) < 0) + // Other filter options useful here?
+ if (b->core.tid < 0) + continue; + + if (last_ref != b->core.tid) { + last_ref = b->core.tid; + if (initialise_amp_pos_lookup(args, amps, last_ref) < 0) + goto err; + } + + if (accumulate_stats(args, amps, b) < 0) goto err; } @@ -1261,10 +1536,10 @@ fp = NULL; header = NULL; - if (dump_stats('F', sname, lstats, args, amp, namp, filec) < 0) + if (dump_lstats(args, 'F', sname, filec, amps, nref) < 0) goto err; - if (append_stats(lstats, gstats, namp) < 0) + if (append_stats(amps, nref) < 0) goto err; if (sname && sname != sname_) @@ -1272,28 +1547,32 @@ sname = NULL; } - dump_stats('C', "COMBINED", gstats, args, amp, namp, filec); - - stats_free(lstats); - stats_free(gstats); - bam_destroy1(b); - free(amp); - free_amp_pos_lookup(); - - return 0; + if (dump_gstats(args, 'C', "COMBINED", filec, amps, nref) < 0) + goto err; + ret = 0; err: bam_destroy1(b); - if (header) - sam_hdr_destroy(header); - if (fp) - sam_close(fp); - free(amp); - free_amp_pos_lookup(); - if (sname && sname != sname_) - free(sname); + if (ret) { + if (header) + sam_hdr_destroy(header); + if (fp) + sam_close(fp); + } + for (i = 0; i < nref; i++) { + stats_free(amps[i].lstats); + stats_free(amps[i].gstats); + free(amps[i].amp); + } + free(amps); + free(pos2start); + free(pos2end); + if (ret) { + if (sname && sname != sname_) + free(sname); + } - return -1; + return ret; } static int usage(astats_args_t *args, FILE *fp, int exit_status) { @@ -1325,7 +1604,9 @@ fprintf(fp, " -c, --tcoord-min-count INT\n" " Minimum template start,end frequency for recording [%d]\n", TCOORD_MIN_COUNT); fprintf(fp, " -D, --depth-bin FRACTION\n" - " Merge FDP values within +/- FRACTION together.\n"); + " Merge FDP values within +/- FRACTION together\n"); + fprintf(fp, " -S, --single-ref\n" + " Force single-ref (<=1.12) output format\n"); sam_global_opt_help(fp, "I.--.@"); return exit_status; @@ -1336,7 +1617,7 @@ .ga = SAM_GLOBAL_ARGS_INIT, .flag_require = 0, .flag_filter = 0x10B04, - .sites = {NULL, 0, 0}, + //.sites 
= BED_LIST_INIT, .max_delta = 30, // large enough to cope with alt primers .min_depth = {1}, .use_sample_name = 0, @@ -1346,7 +1627,8 @@ .out_fp = stdout, .tcoord_min_count = TCOORD_MIN_COUNT, .tcoord_bin = 1, - .depth_bin = 0.01 + .depth_bin = 0.01, + .multi_ref = 1 }, oargs = args; static const struct option loptions[] = @@ -1365,11 +1647,12 @@ {"tcoord-min-count", required_argument, NULL, 'c'}, {"tcoord-bin", required_argument, NULL, 'b'}, {"depth-bin", required_argument, NULL, 'D'}, + {"single-ref", no_argument, NULL, 'S'}, {NULL, 0, NULL, 0} }; int opt; - while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:",loptions,NULL))>0 ) { + while ( (opt=getopt_long(argc,argv,"?hf:F:@:p:m:d:sa:l:t:o:c:b:D:S",loptions,NULL))>0 ) { switch (opt) { case 'f': args.flag_require = bam_str2flag(optarg); break; case 'F': @@ -1413,6 +1696,10 @@ } break; + case 'S': + args.multi_ref = 0; + break; + case '?': return usage(&oargs, stderr, EXIT_FAILURE); case 'h': return usage(&oargs, stdout, EXIT_SUCCESS); @@ -1428,24 +1715,40 @@ if (argc <= optind+1 && isatty(STDIN_FILENO)) return usage(&oargs, stderr, EXIT_FAILURE); - int64_t longest; - if (load_bed_file_pairs(argv[optind], 1, 0, &args.sites, &longest)) { + khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); + if (load_bed_file_multi_ref(argv[optind], 1, 0, bed_hash)) { print_error_errno("ampliconstats", "Could not read file \"%s\"", argv[optind]); return 1; + + } + + khiter_t k, ref_count = 0; + for (k = kh_begin(bed_hash); k != kh_end(bed_hash); k++) { + if (!kh_exist(bed_hash, k)) + continue; + ref_count++; + } + if (ref_count == 0) + return 1; + if (ref_count > 1 && args.multi_ref == 0) { + print_error("ampliconstats", + "Single-ref mode is not permitted for BED files\n" + "containing more than one reference."); + return 1; } args.argv = stringify_argv(argc, argv); int ret; if (argc == ++optind) { char *av = "-"; - ret = amplicon_stats(&args, &av, 1); + ret = amplicon_stats(&args, bed_hash, &av, 1); } else 
{ - ret = amplicon_stats(&args, &argv[optind], argc-optind); + ret = amplicon_stats(&args, bed_hash, &argv[optind], argc-optind); } - free(args.sites.bp); free(args.argv); + destroy_bed_hash(bed_hash); return ret; } diff -Nru samtools-1.11/bam2bcf_indel.c samtools-1.13/bam2bcf_indel.c --- samtools-1.11/bam2bcf_indel.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam2bcf_indel.c 2021-07-07 17:18:52.000000000 +0000 @@ -408,6 +408,10 @@ { // do realignment; this is the bottleneck const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; + if (qend < qbeg) { + fprintf(stderr, "Impossible data in bcf_call_gap_prep\n"); + exit(1); + } qq = calloc(qend - qbeg, 1); bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); if (bq) ++bq; // skip type diff -Nru samtools-1.11/bam2depth.c samtools-1.13/bam2depth.c --- samtools-1.11/bam2depth.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam2depth.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,11 @@ /* bam2depth.c -- depth subcommand. Copyright (C) 2011, 2012 Broad Institute. - Copyright (C) 2012-2016, 2018, 2019-2020 Genome Research Ltd. + Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd. + + Author: Heng Li (to 2020) + Author: James Bonfield (2021 rewrite) - Author: Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -41,372 +43,913 @@ #include "samtools.h" #include "bedidx.h" #include "sam_opts.h" +#include "htslib/khash.h" -typedef struct { // auxiliary data structure - samFile *fp; // the file handle - sam_hdr_t *hdr; // the file header - hts_itr_t *iter; // NULL if a region not specified - int min_mapQ, min_len; // mapQ filter; length filter - uint32_t flags; // read filtering flags -} aux_t; +// From bam_plcmd.c +int read_file_list(const char *file_list, int *n, char **argv[]); -// This function reads a BAM alignment from one BAM file. 
-static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup -{ - aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - int ret; - while (1) - { - ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); - if ( ret<0 ) break; - if ( b->core.flag & aux->flags ) continue; - if ( (int)b->core.qual < aux->min_mapQ ) continue; - if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; - break; +// We accumulate to hist[pos & (size-1)]. This is a ring-buffer. +// We track where we last got to in output and what the biggest value +// we've written to so far (in absolute unmasked coordinates) in +// "last_output" and "end_pos" respectively. +// For each new record we just flush anything we haven't written yet +// already, between "last_output" and this read's start position, and +// initialise any newly seen positions between "end_pos" and this read's +// end position. +typedef struct { + size_t size; + int **hist; // hist[nfiles][size] + hts_pos_t *end_pos; // end_pos[nfiles] + hts_pos_t last_output; + int last_ref; + int nfiles; + const char *ref; + kstring_t ks; + hts_pos_t beg, end; // limit to region + int tid; +} depth_hist; + +typedef struct { + int header; + int flag; + int min_qual; + int min_mqual; + int min_len; + int skip_del; + int all_pos; + int remove_overlaps; + FILE *out; + char *reg; + void *bed; +} depth_opt; + +static void zero_region(depth_opt *opt, depth_hist *dh, + const char *name, hts_pos_t start, hts_pos_t end) { + hts_pos_t i; + kstring_t *ks = &dh->ks; + + kputs(name, ks_clear(ks)); + kputc('\t', ks); + size_t cur_l = ks->l; + if (dh->beg >= 0 && start < dh->beg) + start = dh->beg; + if (dh->end >= 0 && end > dh->end) + end = dh->end; + + for (i = start; i < end; i++) { + // Could be optimised, but needs better API to skip to next + // bed region. 
+ if (opt->bed && bed_overlap(opt->bed, name, i, i+1) == 0) + continue; + + ks->l = cur_l; + kputll(i+1, ks); + int n; + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', ks); + kputc_('0', ks); + } + kputc('\n', ks); + fputs(ks->s, opt->out); } - return ret; + ks->l = cur_l; } -int read_file_list(const char *file_list,int *n,char **argv[]); +// A variation of bam_cigar2qlen which doesn't count soft-clips in to the +// equation. Basically it's the number of bases in query that are aligned +// in some way to the reference (including insertions, which are considered +// to be aligned by dint of being anchored either side). +hts_pos_t qlen_used(bam1_t *b) { + int n_cigar = b->core.n_cigar; + const uint32_t *cigar = bam_get_cigar(b); + + hts_pos_t l; + + if (b->core.l_qseq) { + // Known SEQ permits of short cut of l_qseq minus CSOFT_CLIPs. + // Full scan not needed, which helps on excessively long CIGARs. + l = b->core.l_qseq; + int kl, kr; + for (kl = 0; kl < n_cigar; kl++) + if (bam_cigar_op(cigar[kl]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kl]); + else + break; -static int usage() { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a output all positions (including zero depth)\n"); - fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"); - fprintf(stderr, " -b list of positions or regions\n"); - fprintf(stderr, " -X use customized index files\n"); - fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); - fprintf(stderr, " -H print a file header\n"); - fprintf(stderr, " -l read length threshold (ignore reads shorter than ) [0]\n"); - fprintf(stderr, " -d/-m maximum coverage depth [8000]. 
If 0, depth is set to the maximum\n" - " integer value, effectively removing any depth limit.\n"); // the htslib's default - fprintf(stderr, " -o FILE where to write output to [stdout]\n"); - fprintf(stderr, " -q base quality threshold [0]\n"); - fprintf(stderr, " -Q mapping quality threshold [0]\n"); - fprintf(stderr, " -r region\n"); - fprintf(stderr, " -g remove the specified flags from the set used to filter out reads\n"); - fprintf(stderr, " -G add the specified flags to the set used to filter out reads\n" - " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n"); - fprintf(stderr, " -J include reads with deletions in depth computation\n"); - fprintf(stderr, " -s for the overlapping section of a read pair, count only the bases\n" - " of a single read. This option requires raising the base quality\n" - " threshold to 1.\n"); - - sam_global_opt_help(stderr, "-.--.--."); - - fprintf(stderr, "\n"); - fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n"); - fprintf(stderr, "position, and coverage depth. Note that positions with zero coverage may be\n"); - fprintf(stderr, "omitted by default; see the -a option.\n"); - fprintf(stderr, "\n"); + for (kr = n_cigar-1; kr > kl; kr--) + if (bam_cigar_op(cigar[kr]) == BAM_CSOFT_CLIP) + l -= bam_cigar_oplen(cigar[kr]); + else + break; + } else { + // Unknown SEQ ("*") needs a full scan through the CIGAR string. + static int query[16] = { + //M I D N S H P = X B ? ? ? ? ? ? 
+ 1,1,0,0, 0,0,0,1, 1,0,0,0, 0,0,0,0 + }; + int k; + for (k = l = 0; k < n_cigar; k++) + if (query[bam_cigar_op(cigar[k])]) + l += bam_cigar_oplen(cigar[k]); + } + return l; - return EXIT_FAILURE; } -int main_depth(int argc, char *argv[]) -{ - int i, n, tid, reg_tid, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, has_index_file = 0; - hts_pos_t beg, end, pos = -1, last_pos = -1; - int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - char *file_list = NULL, **fn = NULL; - sam_hdr_t *h = NULL; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; - int last_tid = -1, ret; - int print_header = 0; - char *output_file = NULL; - FILE *file_out = stdout; - uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); - int tflags = 0; - int inc_del = 0; - int overlap_init = 0; +// Adds the depth for a single read to a depth_hist struct. +// For just one file, this is easy. We just have a circular buffer +// where we increment values for bits that overlap existing data +// and initialise values for coordinates which we're seeing for the first +// time. This is tracked by "end_pos" to know where we've got to. +// +// As the input is sorted, we can flush output from "last_output" to +// b->core.pos. +// +// With multiple files, we must feed data in sorted order as if all files +// are merged, but track depth per file. This also means "end_pos" is per +// file too, but "last_output" is global as it corresponds to rows printed. 
+static int add_depth(depth_opt *opt, depth_hist *dh, sam_hdr_t *h, bam1_t *b, + int overlap_clip, int file) { + hts_pos_t i; + size_t hmask = dh->size-1; + int n; + + if (!b || b->core.tid != dh->last_ref) { + // New ref + if (dh->last_ref >= 0) { + // do end + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + i = dh->last_output; + for (i = dh->last_output; nf; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; + } + if (!nf) + break; - sam_global_args ga = SAM_GLOBAL_ARGS_INIT; - static const struct option lopts[] = { - SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'), - { NULL, 0, NULL, 0 } - }; + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; - // parse the command line - while ((n = getopt_long(argc, argv, "r:b:Xq:Q:l:f:am:d:Ho:g:G:Js", lopts, NULL)) >= 0) { - switch (n) { - case 'l': min_len = atoi(optarg); break; // minimum query length - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': - bed = bed_read(optarg); // BED or position list file can be parsed now - if (!bed) { - print_error_errno("depth", "Could not read file \"%s\"", optarg); - return EXIT_FAILURE; + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? 
dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); } - break; - case 'X': has_index_file = 1; break; - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - case 'f': file_list = optarg; break; - case 'a': all++; break; - case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth - case 'H': print_header = 1; break; - case 'o': output_file = optarg; break; - case 'g': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos) { + // End of last ref + zero_region(opt, dh, + sam_hdr_tid2name(h, dh->last_ref), + i, sam_hdr_tid2len(h, dh->last_ref)); + } + dh->ks.l = cur_l; + } + + if (opt->all_pos > 1 && !opt->reg) { + // Any previous unused refs + int lr = dh->last_ref < 0 ? 0 : dh->last_ref+1; + int rr = b ? b->core.tid : sam_hdr_nref(h), r; + for (r = lr; r < rr; r++) + zero_region(opt, dh, + sam_hdr_tid2name(h, r), + 0, sam_hdr_tid2len(h, r)); + } + + if (!b) { + // we're just flushing to end of file + if (opt->all_pos && opt->reg && dh->last_ref < 0) + // -a or -aa without a single read being output yet + zero_region(opt, dh, sam_hdr_tid2name(h, dh->tid), dh->beg, + MIN(dh->end, sam_hdr_tid2len(h, dh->tid))); + + return 0; + } + + for (n = 0; dh->end_pos && n < dh->nfiles; n++) + dh->end_pos[n] = 0; + dh->last_output = dh->beg >= 0 + ? 
MAX(b->core.pos, dh->beg) + : b->core.pos; + dh->last_ref = b->core.tid; + dh->ref = sam_hdr_tid2name(h, b->core.tid); + kputs(dh->ref, ks_clear(&dh->ks)); + kputc('\t', &dh->ks); + + if (opt->all_pos) + // Start of ref + zero_region(opt, dh, dh->ref, 0, b->core.pos); + } else { + if (dh->last_output < b->core.pos) { + // Flush any depth outputs up to start of new read + size_t cur_l = dh->ks.l; + int nf = dh->nfiles; + for (i = dh->last_output; i < b->core.pos; i++) { + nf = 0; + for (n = 0; n < dh->nfiles; n++) { + if (i < dh->end_pos[n]) + nf++; } - flags &= ~tflags; - break; - case 'G': - tflags = bam_str2flag(optarg); - if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); - return 1; + if (!nf) + break; + + if (opt->bed && bed_overlap(opt->bed, dh->ref, i, i+1) == 0) + continue; + + dh->ks.l = cur_l; + kputll(i+1, &dh->ks); + for (n = 0; n < dh->nfiles; n++) { + kputc_('\t', &dh->ks); + int d = i < dh->end_pos[n] + ? dh->hist[n][i & hmask] + : 0; + kputuw(d, &dh->ks); + } + kputc('\n', &dh->ks); + fputs(dh->ks.s, opt->out); + } + if (opt->all_pos && i < b->core.pos) + // Hole in middle of ref + zero_region(opt, dh, dh->ref, i, b->core.pos); + + dh->ks.l = cur_l; + dh->last_output = b->core.pos; + } + } + + hts_pos_t end_pos = bam_endpos(b); // 0 based, 1 past end. + //printf("%d %d\n", (int)b->core.pos+1, (int)end_pos); + + if (b->core.tid < dh->last_ref || + (dh->last_ref == b->core.tid && end_pos < dh->last_output)) { + print_error_errno("depth", "Data is not position sorted"); + return -1; + } + + // If needed, grow the circular buffer. + if (end_pos+1 - b->core.pos >= dh->size) { + size_t old_size = dh->size; + size_t old_hmask = hmask; + while (end_pos+1 - b->core.pos >= dh->size) + dh->size = dh->size ? 
2*dh->size : 2048; + hmask = dh->size-1; + if (!dh->hist) { + dh->hist = calloc(dh->nfiles, sizeof(*dh->hist)); + dh->end_pos = calloc(dh->nfiles, sizeof(*dh->end_pos)); + if (!dh->hist || !dh->end_pos) + return -1; + } + for (n = 0; n < dh->nfiles; n++) { + int *hist = calloc(dh->size, sizeof(*dh->hist[n])); + if (!hist) + return -1; + + // Simple approach for now; copy over old histogram verbatim. + for (i = dh->last_output; i < dh->last_output + old_size; i++) + hist[i & hmask] = dh->hist[n][i & old_hmask]; + free(dh->hist[n]); + dh->hist[n] = hist; + } + } + + // Accumulate depth, based on CIGAR + uint32_t *cig = bam_get_cigar(b); + int ncig = b->core.n_cigar, j, k, spos = 0; + + // Zero new (previously unseen) coordinates so increment works later. + hts_pos_t end = MAX(dh->end_pos[file], b->core.pos); + if (end_pos > end && (end & hmask) < (end_pos & hmask)) { + memset(&dh->hist[file][end & hmask], 0, + sizeof(**dh->hist) * (end_pos - end)); + } else { + for (i = end; i < end_pos; i++) + dh->hist[file][i & hmask] = 0; + } + + i = b->core.pos; + uint8_t *qual = bam_get_qual(b); + int min_qual = opt->min_qual; + for (j = 0; j < ncig; j++) { + int op = bam_cigar_op(cig[j]); + int oplen = bam_cigar_oplen(cig[j]); + + switch (op) { + case BAM_CDEL: + case BAM_CREF_SKIP: + if (op != BAM_CDEL || opt->skip_del) { + // don't increment reference location + if (i + oplen >= dh->end_pos[file]) { + for (k = 0; k < oplen; k++, i++) { + if (i >= dh->end_pos[file]) + // redundant due to zero new elements above? + dh->hist[file][i & hmask] = 0; + } + } else { + i += oplen; + } + } else { // op == BAM_CDEL and we count them (-J option), + // We don't incr spos here, but we still use qual. + // This doesn't make much sense, but it's for compatibility + // with the old code. Arguably DEL shouldn't have a min + // qual and should always pass (as we've explicitly asked to + // include them). 
+ int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + k = overlap_clip - i; + i = overlap_clip; + } } - flags |= tflags; + + // Question: should we even check quality values for DEL? + // We've explicitly asked to include them, and the quality + // is wrong anyway (it's the neighbouring base). We do this + // for now for compatibility with the old depth command. + + if (spos < b->core.l_qseq) + for (; k < oplen; k++, i++) + hist[i & hmask]+=qual[spos]>=min_qual; + else + for (; k < oplen; k++, i++) + hist[i & hmask]++; + } + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + if ((i & hmask) < ((i+oplen) & hmask)) { + // Optimisation when not wrapping around + + // Unrolling doesn't help clang, but helps gcc, + // especially when not using -O3. + int *hist = &dh->hist[file][i & hmask]; + if (min_qual || overlap_clip) { + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + spos += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + hist += overlap_clip - i; + i = overlap_clip; + } + } + + // approx 50% of this func cpu time in this loop + for (; k < (oplen & ~7); k+=8) { + hist[k+0]+=qual[spos+0]>=min_qual; + hist[k+1]+=qual[spos+1]>=min_qual; + hist[k+2]+=qual[spos+2]>=min_qual; + hist[k+3]+=qual[spos+3]>=min_qual; + hist[k+4]+=qual[spos+4]>=min_qual; + hist[k+5]+=qual[spos+5]>=min_qual; + hist[k+6]+=qual[spos+6]>=min_qual; + hist[k+7]+=qual[spos+7]>=min_qual; + spos += 8; + } + } else { + // easier to vectorize when no min_qual + for (k = 0; k < (oplen & ~7); k+=8) { + hist[k+0]++; + hist[k+1]++; + hist[k+2]++; + hist[k+3]++; + hist[k+4]++; + hist[k+5]++; + hist[k+6]++; + hist[k+7]++; + } + spos += k; + } + for (; k < oplen && spos < b->core.l_qseq; k++, spos++) + hist[k]+=qual[spos]>=min_qual; + for (; k < oplen; k++, spos++) + hist[k]++; + i += oplen; + } else { + // 
Simple to understand case, but slower. + // We use this only for reads with wrap-around. + int *hist = dh->hist[file]; + k = 0; + if (overlap_clip) { + if (i+oplen < overlap_clip) { + i += oplen; + break; + } else if (i < overlap_clip) { + oplen -= overlap_clip - i; + spos += overlap_clip - i; + i = overlap_clip; + } + } + for (; k < oplen && spos < b->core.l_qseq; k++, i++, spos++) + hist[i & hmask]+=qual[spos]>=min_qual; + for (; k < oplen; k++, i++, spos++) + hist[i & hmask]++; + } + break; + + case BAM_CINS: + case BAM_CSOFT_CLIP: + spos += oplen; + break; + + case BAM_CPAD: + case BAM_CHARD_CLIP: + // ignore + break; + + default: + print_error("depth", "Unsupported cigar op '%d'", op); + return -1; + } + } + + if (dh->end >= 0 && end_pos > dh->end) + end_pos = dh->end; + if (dh->end_pos[file] < end_pos) + dh->end_pos[file] = end_pos; + + return 0; +} + +// Hash on name -> alignment end pos. This permits a naive overlap removal. +// Note it cannot analyse the overlapping sequence and qualities, so the +// interaction of basecalls/qualities and the -Q parameter cannot be +// applied here (unlike the full mpileup algorithm). +KHASH_MAP_INIT_STR(olap_hash, hts_pos_t) +typedef khash_t(olap_hash) olap_hash_t; + +static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, + samFile **fp, hts_itr_t **itr, sam_hdr_t **h) { + int ret = -1, err = 1, i; + olap_hash_t **overlaps = NULL; + depth_hist dh = {0}; + + // An array of bam structs, one per input file, to hold the next entry + bam1_t **b = calloc(nfiles, sizeof(*b)); + int *finished = calloc(nfiles, sizeof(*finished)), to_go = nfiles; + if (!b || !finished) + goto err; + + for (i = 0; i < nfiles; i++) + if (!(b[i] = bam_init1())) + goto err; + + // Do we need one overlap hash per file? Or shared? 
+ if (opt->remove_overlaps) { + if (!(overlaps = calloc(nfiles, sizeof(*overlaps)))) + return -1; + for (i = 0; i < nfiles; i++) { + if (!(overlaps[i] = kh_init(olap_hash))) + return -1; + } + } + + // Create the initial histogram + dh.nfiles = nfiles; + dh.size = 0; + dh.hist = NULL; + dh.last_ref = -99; + dh.end_pos = NULL; + dh.last_output = itr && itr[0] ? itr[0]->beg : 0; + ks_initialize(&dh.ks); + + // Clip results to region if specified + dh.beg = -1; + dh.end = -1; + dh.tid = 0; + if (itr && itr[0]) { + dh.tid = itr[0]->tid; + dh.beg = itr[0]->beg; + dh.end = itr[0]->end; + } + + if (opt->header) { + fprintf(opt->out, "#CHROM\tPOS"); + for (i = 0; i < nfiles; i++) + fprintf(opt->out, "\t%s", fn[i]); + fputc('\n', opt->out); + } + + // Populate first record per file + for (i = 0; i < nfiles; i++) { + for(;;) { + ret = itr && itr[i] + ? sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) + goto err; + if (ret == -1) { + to_go--; + finished[i] = 1; break; - case 'J': inc_del = 1; break; - case 's': - overlap_init = 1; - if (!baseQ) baseQ = 1; + } + + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + // Original samtools depth used the total sequence (l_qseq) + // including soft-clips. This doesn't feel like a useful metric + // to be filtering on. We now only count sequence bases that + // form the used part of the alignment. 
+ if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; + } + + break; + } + } + + // Loop through input files, merging in order so we're + // always adding the next record in sequence + while (to_go) { + // Find next record in file list + int best_tid = INT_MAX, best_file = 0; + hts_pos_t best_pos = HTS_POS_MAX; + + for (i = 0; i < nfiles; i++) { + if (finished[i]) + continue; + if (best_tid > b[i]->core.tid) { + best_tid = b[i]->core.tid; + best_pos = b[i]->core.pos; + best_file = i; + } else if (best_tid == b[i]->core.tid && + best_pos > b[i]->core.pos) { + best_pos = b[i]->core.pos; + best_file = i; + } + } + i = best_file; + + hts_pos_t clip = 0; + if (overlaps && (b[i]->core.flag & BAM_FPAIRED) && + !(b[i]->core.flag & BAM_FMUNMAP)) { + khiter_t k = kh_get(olap_hash, overlaps[i], bam_get_qname(b[i])); + if (k == kh_end(overlaps[i])) { + // not seen before + hts_pos_t endpos = bam_endpos(b[i]); + + // Don't add if mate location is known and can't overlap. + if (b[i]->core.mpos == -1 || + (b[i]->core.tid == b[i]->core.mtid && + b[i]->core.mpos <= endpos)) { + k = kh_put(olap_hash, overlaps[i], bam_get_qname(b[i]), + &ret); + if (ret < 0) + return -1; + kh_key(overlaps[i], k) = strdup(bam_get_qname(b[i])); + kh_value(overlaps[i], k) = endpos; + } + } else { + // seen before + clip = kh_value(overlaps[i], k); + free((char *)kh_key(overlaps[i], k)); + kh_del(olap_hash, overlaps[i], k); + } + } + + // Add the next merged BAM record to the depth plot + if ((ret = add_depth(opt, &dh, h[i], b[i], clip, i)) < 0) { + ret = -1; + goto err; + } + + // Populate next record from this file + for(;!finished[i];) { + ret = itr && itr[i] + ? 
sam_itr_next(fp[i], itr[i], b[i]) + : sam_read1(fp[i], h[i], b[i]); + if (ret < -1) { + ret = -1; + goto err; + } + if (ret == -1) { + to_go--; + finished[i] = 1; break; - default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': return usage(); + } + + if (b[i]->core.tid < 0) + continue; + if (b[i]->core.flag & opt->flag) + continue; + if (b[i]->core.qual < opt->min_mqual) + continue; + + if (opt->min_len) { + if (qlen_used(b[i]) < opt->min_len) + continue; + } + + break; } } - if (optind == argc && !file_list) - return usage(); - /* output file provided by user */ - if (output_file != NULL && strcmp(output_file,"-")!=0) { - file_out = fopen( output_file, "w" ); - if (file_out == NULL) { - print_error_errno("depth", "Cannot open \"%s\" for writing.", output_file); - return EXIT_FAILURE; + // Tidy up end. + ret = add_depth(opt, &dh, h[0], NULL, 0, 0); + err = 0; + + err: + if (ret == 0 && err) + ret = -1; + + for (i = 0; i < nfiles; i++) { + if (b[i]) + bam_destroy1(b[i]); + if (dh.hist && dh.hist[i]) + free(dh.hist[i]); + } + free(b); + free(finished); + ks_free(&dh.ks); + free(dh.hist); + free(dh.end_pos); + if (overlaps) { + khiter_t k; + for (i = 0; i < nfiles; i++) { + if (!overlaps[i]) + continue; + for (k = kh_begin(overlaps[i]); k < kh_end(overlaps[i]); k++) + if (kh_exist(overlaps[i], k)) + free((char *)kh_key(overlaps[i], k)); + kh_destroy(olap_hash, overlaps[i]); + } + free(overlaps); + } + + return ret; +} + +static void usage_exit(FILE *fp, int exit_status) +{ + fprintf(fp, "Usage: samtools depth [options] in.bam [in.bam ...]\n"); + fprintf(fp, "\nOptions:\n"); + fprintf(fp, " -a Output all positions (including zero depth)\n"); + fprintf(fp, " -a -a, -aa Output absolutely all positions, including unused ref seqs\n"); + fprintf(fp, " -r REG Specify a region in chr or chr:from-to syntax\n"); + fprintf(fp, " -b FILE Use bed FILE for list of regions\n"); + fprintf(fp, " -f FILE Specify list of input 
BAM/SAM/CRAM filenames\n"); + fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n"); + fprintf(fp, " -g INT Remove specified flags from default flag filter\n"); + fprintf(fp, " -G INT Add specified flags to the default flag filter\n"); + fprintf(fp, " -H Print a file header line\n"); + fprintf(fp, " -l INT Minimum read length [0]\n"); + fprintf(fp, " -o FILE Write output to FILE [stdout]\n"); + fprintf(fp, " -q INT Minimum base quality [0]\n"); + fprintf(fp, " -Q INT Minimum mapping quality [0]\n"); + fprintf(fp, " -H Print a file header\n"); + fprintf(fp, " -J Include reads with deletions in depth computation\n"); + fprintf(fp, " -s Do not count overlapping reads within a template\n"); + sam_global_opt_help(fp, "-.---@-."); + exit(exit_status); +} + +int main_depth(int argc, char *argv[]) +{ + int nfiles, i; + samFile **fp; + sam_hdr_t **header; + int c, has_index_file = 0; + char *file_list = NULL, **fn = NULL; + depth_opt opt = { + .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, + .min_qual = 0, + .min_mqual = 0, + .skip_del = 1, + .header = 0, + .min_len = 0, + .out = stdout, + .all_pos = 0, + .remove_overlaps = 0, + .reg = NULL, + .bed = NULL, + }; + + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'), + {NULL, 0, NULL, 0} + }; + + while ((c = getopt_long(argc, argv, "@:q:Q:JHd:m:l:g:G:o:ar:Xf:b:s", + lopts, NULL)) >= 0) { + switch (c) { + case 'a': + opt.all_pos++; + break; + + case 'b': + opt.bed = bed_read(optarg); + if (!opt.bed) { + print_error_errno("depth", "Could not read file \"%s\"", + optarg); + return 1; + } + break; + + case 'f': + file_list = optarg; + break; + + case 'd': + case 'm': + // depth limit - now ignored + break; + + case 'g': + opt.flag &= ~bam_str2flag(optarg); + break; + case 'G': + opt.flag |= bam_str2flag(optarg); + break; + + case 'l': + opt.min_len = atoi(optarg); + break; + + case 'H': + opt.header = 1; + 
break; + + case 'q': + opt.min_qual = atoi(optarg); + break; + case 'Q': + opt.min_mqual = atoi(optarg); + break; + + case 'J': + opt.skip_del = 0; + break; + + case 'o': + if (opt.out != stdout) + break; + opt.out = fopen(optarg, "w"); + if (!opt.out) { + print_error_errno("depth", "Cannot open \"%s\" for writing.", + optarg); + return EXIT_FAILURE; + } + break; + + case 'r': + opt.reg = optarg; + break; + + case 's': + opt.remove_overlaps = 1; + break; + + case 'X': + has_index_file = 1; + break; + + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': + usage_exit(stderr, EXIT_FAILURE); } } + if (argc < optind+1 && !file_list) { + if (argc == optind) + usage_exit(stdout, EXIT_SUCCESS); + else + usage_exit(stderr, EXIT_FAILURE); + } - // initialize the auxiliary data structures - if (file_list) - { + if (file_list) { if (has_index_file) { print_error("depth", "The -f option cannot be combined with -X"); return 1; } - if ( read_file_list(file_list,&nfiles,&fn) ) return EXIT_FAILURE; - n = nfiles; + if (read_file_list(file_list, &nfiles, &fn)) + return 1; argv = fn; + argc = nfiles; optind = 0; + } else { + nfiles = argc - optind; } - else if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind) % 2 != 0) { - fprintf(stderr, "Error: Odd number of filenames detected! 
Each BAM file should have an index file\n"); + + if (has_index_file) { + if (nfiles%1) { + print_error("depth", "-X needs one index specified per bam file"); return 1; } - n = (argc - optind) / 2; - } else { - n = argc - optind; + nfiles /= 2; + } + fp = malloc(nfiles * sizeof(*fp)); + header = malloc(nfiles * sizeof(*header)); + if (!fp || !header) { + print_error_errno("depth", "Out of memory"); + return 1; } - data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input - reg_tid = 0; beg = 0; end = HTS_POS_MAX; // set the default region - for (i = 0; i < n; ++i) { - int rf; - data[i] = calloc(1, sizeof(aux_t)); - data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM - if (data[i]->fp == NULL) { - print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; - if (baseQ) rf |= SAM_QUAL; - if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("depth", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); - status = EXIT_FAILURE; - goto depth_end; - } - if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("depth", "Failed to set CRAM_OPT_DECODE_MD value"); - status = EXIT_FAILURE; - goto depth_end; - } - data[i]->min_mapQ = mapQ; // set the mapQ filter - data[i]->min_len = min_len; // set the qlen filter - data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header - if (data[i]->hdr == NULL) { - print_error_errno("depth", "Couldn't read header for \"%s\"", - argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - if (reg) { // if a region is specified - hts_idx_t *idx = NULL; - // If index filename has not been specified, look in the BAM folder - if (has_index_file) { - idx = sam_index_load2(data[i]->fp, argv[optind+i], argv[optind+i+n]); // load the index - } else { - idx = sam_index_load(data[i]->fp, argv[optind+i]); - } - if (idx == NULL) { - 
print_error("depth", "can't load index for \"%s\"", argv[optind+i]); - status = EXIT_FAILURE; - goto depth_end; - } - data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator - hts_idx_destroy(idx); // the index is not needed any more; free the memory - if (data[i]->iter == NULL) { - print_error("depth", "can't parse region \"%s\"", reg); - status = EXIT_FAILURE; - goto depth_end; - } - } - data[i]->flags = flags; - } - if (print_header) { - fputs("#CHROM\tPOS", file_out); - for (i = 0; i < n; ++i) { - fputc('\t', file_out); - fputs(argv[optind+i], file_out); - } - fputc('\n', file_out); - } - h = data[0]->hdr; // easy access to the header of the 1st BAM - if (reg) { - beg = data[0]->iter->beg; // and to the parsed region coordinates - end = data[0]->iter->end; - reg_tid = data[0]->iter->tid; - } - - // the core multi-pileup loop - mplp = bam_mplp_init(n, read_bam, (void**)data); - if (overlap_init && bam_mplp_init_overlaps(mplp) < 0) { - print_error("depth", "failed to init overlap detection\n"); - status = EXIT_FAILURE; - goto depth_end; - } - if (0 < max_depth) - bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth - else if (!max_depth) - bam_mplp_set_maxcnt(mplp,INT_MAX); - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) - while ((ret=bam_mplp64_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip - if (tid >= sam_hdr_nref(h)) continue; // diff number of @SQ lines per file? - if (all) { - while (tid > last_tid) { - if (last_tid >= 0 && !reg) { - // Deal with remainder or entirety of last tid. - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - // Horribly inefficient, but the bed API is an obfuscated black box. 
- if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); - } - } - last_tid++; - last_pos = -1; - if (all < 2) - break; - } + hts_itr_t **itr = NULL; + if (opt.reg) { + itr = calloc(nfiles, sizeof(*itr)); + if (!itr) + return 1; + } - // Deal with missing portion of current tid - while (++last_pos < pos) { - if (last_pos < beg) continue; // out of range; skip - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); - } - - last_tid = tid; - last_pos = pos; - } - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, tid), pos, pos + 1) == 0) continue; - fputs(sam_hdr_tid2name(h, tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, pos+1); // a customized printf() would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know - if ((!inc_del && p->is_del) || p->is_refskip) ++m; // having dels or refskips at tid:pos - else if (p->qpos < p->b->core.l_qseq && - bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality - } - fprintf(file_out, "\t%d", n_plp[i] - m); // this the depth to output - } - fputc('\n', file_out); - } - if (ret < 0) status = EXIT_FAILURE; - free(n_plp); free(plp); - bam_mplp_destroy(mplp); - - if (all) { - // Handle terminating region - if (last_tid < 0 && reg) { - last_tid = reg_tid; - last_pos = beg-1; - } - if (pos < 0 && all > 1 && last_tid < 0 && !reg) { - last_tid = 0; - } - while (last_tid >= 0 && last_tid < 
sam_hdr_nref(h)) { - while (++last_pos < sam_hdr_tid2len(h, last_tid)) { - if (last_pos >= end) break; - if (bed && bed_overlap(bed, sam_hdr_tid2name(h, last_tid), last_pos, last_pos + 1) == 0) - continue; - fputs(sam_hdr_tid2name(h, last_tid), file_out); - fprintf(file_out, "\t%"PRIhts_pos, last_pos+1); - for (i = 0; i < n; i++) - fputc('\t', file_out), fputc('0', file_out); - fputc('\n', file_out); - } - last_tid++; - last_pos = -1; - if (all < 2 || reg) - break; + for (i = 0; i < nfiles; i++, optind++) { + fp[i] = sam_open_format(argv[optind], "r", &ga.in); + if (fp[i] == NULL) { + print_error_errno("depth", + "Cannot open input file \"%s\"", argv[optind]); + return 1; } - } -depth_end: - if (((file_out != stdout)? fclose(file_out) : fflush(file_out)) != 0) { - if (status == EXIT_SUCCESS) { - if (file_out != stdout) - print_error_errno("depth", "error on closing \"%s\"", output_file); - else - print_error_errno("depth", "error on flushing standard output"); - status = EXIT_FAILURE; + if (ga.nthreads > 0) + hts_set_threads(fp[i], ga.nthreads); + + if (hts_set_opt(fp[i], CRAM_OPT_REQUIRED_FIELDS, + SAM_FLAG | SAM_RNAME | SAM_POS | SAM_CIGAR + | (opt.remove_overlaps ? SAM_QNAME|SAM_RNEXT|SAM_PNEXT + : 0) + | (opt.min_mqual ? SAM_MAPQ : 0) + | (opt.min_len ? SAM_SEQ : 0) + | (opt.min_qual ? SAM_QUAL : 0))) { + fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); + return 1; + } + + if (hts_set_opt(fp[i], CRAM_OPT_DECODE_MD, 0)) { + fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); + return 1; + } + + // FIXME: what if headers differ? + header[i] = sam_hdr_read(fp[i]); + if (header == NULL) { + fprintf(stderr, "Failed to read header for \"%s\"\n", + argv[optind]); + return 1; + } + + if (opt.reg) { + hts_idx_t *idx = has_index_file + ? 
sam_index_load2(fp[i], argv[optind], argv[optind+nfiles]) + : sam_index_load(fp[i], argv[optind]); + if (!idx) { + print_error("depth", "cannot load index for \"%s\"", + argv[optind]); + return 1; + } + if (!(itr[i] = sam_itr_querys(idx, header[i], opt.reg))) { + print_error("depth", "cannot parse region \"%s\"", opt.reg); + return 1; + } + hts_idx_destroy(idx); } } - for (i = 0; i < n && data[i]; ++i) { - sam_hdr_destroy(data[i]->hdr); - if (data[i]->fp) sam_close(data[i]->fp); - hts_itr_destroy(data[i]->iter); - free(data[i]); - } - free(data); free(reg); - if (bed) bed_destroy(bed); - if ( file_list ) - { - for (i=0; i @@ -52,6 +52,7 @@ sam_global_args ga; htsThreadPool p; int uncompressed; + int overwrite_hdr_rg; }; struct state; @@ -173,6 +174,7 @@ " -r STRING @RG line text\n" " -R STRING ID of @RG line in existing header to use\n" " -u Output uncompressed data\n" + " -w Overwrite an existing @RG line\n" " --no-PG Do not add a PG line\n" ); sam_global_opt_help(fp, "..O..@.."); @@ -200,7 +202,7 @@ }; kstring_t rg_line = {0,0,NULL}; - while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:u", lopts, NULL)) >= 0) { + while ((n = getopt_long(argc, argv, "r:R:m:o:O:h@:uw", lopts, NULL)) >= 0) { switch (n) { case 'r': // Are we adding to existing rg line? @@ -240,6 +242,9 @@ case 'u': retval->uncompressed = 1; break; + case 'w': + retval->overwrite_hdr_rg = 1; + break; case '?': usage(stderr); free(retval); @@ -360,10 +365,20 @@ // Check does not already exist kstring_t hdr_line = { 0, 0, NULL }; if (sam_hdr_find_line_id(retval->output_header, "RG", "ID", opts->rg_id, &hdr_line) == 0) { - fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. 
Overwrite not yet implemented.\n"); - free(hdr_line.s); - return false; + if (opts->overwrite_hdr_rg) { + if(-1 == sam_hdr_remove_line_id(retval->output_header, "RG", "ID", opts->rg_id)) { + fprintf(stderr, "[init] Error removing the RG line with ID:%s from the output header.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } + } else { + fprintf(stderr, "[init] RG line with ID:%s already present in the header. Use -w to overwrite.\n", opts->rg_id); + ks_free(&hdr_line); + return false; + } } + ks_free(&hdr_line); + if (-1 == sam_hdr_add_lines(retval->output_header, opts->rg_line, strlen(opts->rg_line))) { fprintf(stderr, "[init] Error adding RG line with ID:%s to the output header.\n", opts->rg_id); return false; @@ -383,7 +398,7 @@ return false; } retval->rg_id = strdup(opts->rg_id); - free(hdr_line.s); + ks_free(&hdr_line); } else { kstring_t rg_id = { 0, 0, NULL }; if (sam_hdr_find_tag_id(retval->output_header, "RG", NULL, NULL, "ID", &rg_id) < 0) { diff -Nru samtools-1.11/bam_ampliconclip.c samtools-1.13/bam_ampliconclip.c --- samtools-1.11/bam_ampliconclip.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_ampliconclip.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,7 +1,7 @@ -/* bam_ampliconclip.c -- loads amplicons from a BED file and cuts reads +/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads from the 5' end. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Authors: Andrew Whitwham Rob Davies @@ -58,119 +58,176 @@ int unmapped; int oa_tag; int del_tag; + int tol; char *arg_list; char *stats_file; char *rejects_file; } cl_param_t; -static int bed_pair_sort(const void *av, const void *bv) { - bed_pair_t *a = (bed_pair_t *) av; - bed_pair_t *b = (bed_pair_t *) bv; +static int bed_entry_sort(const void *av, const void *bv) { + bed_entry_t *a = (bed_entry_t *) av; + bed_entry_t *b = (bed_entry_t *) bv; return a->right < b->right ? -1 : (a->right == b->right ? 
0 : 1); } -int load_bed_file_pairs(char *infile, int get_strand, int sort_by_pos, - bed_pair_list_t *pairs, int64_t *longest) { +int load_bed_file_multi_ref(char *infile, int get_strand, int sort_by_pos, khash_t(bed_list_hash) *bed_lists) { hFILE *fp; int line_count = 0, ret; int64_t left, right; kstring_t line = KS_INITIALIZE; - *longest = 0; + bed_entry_list_t *list; + khiter_t bed_itr; if ((fp = hopen(infile, "r")) == NULL) { - print_error_errno("ampliconclip", "unable to open file %s.", infile); + print_error_errno("amplicon", "unable to open file %s.", infile); return 1; } - pairs->size = 256; - - if ((pairs->bp = malloc(pairs->size * sizeof(bed_pair_t))) == NULL) { - fprintf(stderr, "[ampliconclip] error: unable to allocate memory for bed data.\n"); - ret = 1; - goto error; - } + char ref[1024]; while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) { line_count++; + int hret; + char strand; if (line.l == 0 || *line.s == '#') continue; if (strncmp(line.s, "track ", 6) == 0) continue; if (strncmp(line.s, "browser ", 8) == 0) continue; if (get_strand) { - char strand; - - if (sscanf(line.s, "%*s %"SCNd64" %"SCNd64" %*s %*s %c", &left, &right, &strand) != 3) { - fprintf(stderr, "[ampliconclip] error: bad bed file format in line %d of %s.\n", + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64" %*s %*s %c", + ref, &left, &right, &strand) != 4) { + fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s.\n" + "(N.B. ref/chrom name limited to 1023 characters.)\n", line_count, infile); ret = 1; goto error; } + } else { + if (sscanf(line.s, "%1023s %"SCNd64" %"SCNd64, + ref, &left, &right) != 3) { + fprintf(stderr, "[amplicon] error: bad bed file format in line %d of %s\n" + "(N.B. 
ref/chrom name limited to 1023 characters.)\n", + line_count, infile); + ret = 1; + goto error; + } + } - if (strand == '+') { - pairs->bp[pairs->length].rev = 0; - } else if (strand == '-') { - pairs->bp[pairs->length].rev = 1; - } else { - fprintf(stderr, "[ampliconclip] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n", - line_count, strand); + bed_itr = kh_get(bed_list_hash, bed_lists, ref); + + if (bed_itr == kh_end(bed_lists)) { // new ref entry + char *ref_name = strdup(ref); // need a copy for the hash key + + if (!ref_name) { + fprintf(stderr, "[amplicon] error: unable to allocate memory for ref name.\n"); ret = 1; goto error; } - } else { - if (sscanf(line.s, "%*s %"SCNd64" %"SCNd64, &left, &right) != 2) { - fprintf(stderr, "[ampliconclip] error: bad bed file format in line %d of %s", - line_count, infile); + + bed_itr = kh_put(bed_list_hash, bed_lists, ref_name, &hret); + + if (hret > 0) { + list = &kh_val(bed_lists, bed_itr); + + // initialise the new hash entry + list->longest = 0; + list->size = 0; + list->length = 0; + list->bp = NULL; + } else { + fprintf(stderr, "[amplicon] error: ref hashing failure.\n"); ret = 1; goto error; } + } else { // existing ref + list = &kh_val(bed_lists, bed_itr); } - if (pairs->length == pairs->size) { - bed_pair_t *tmp; + if (list->length == list->size) { + bed_entry_t *tmp; - pairs->size *= 2; + list->size += list->size / 2 + 256; - if ((tmp = realloc(pairs->bp, pairs->size * sizeof(bed_pair_t))) == NULL) { - fprintf(stderr, "[ampliconclip] error: unable to allocate more memory for bed data.\n"); + if ((tmp = realloc(list->bp, list->size * sizeof(bed_entry_t))) == NULL) { + fprintf(stderr, "[amplicon] error: unable to allocate more memory for bed data.\n"); + ret = 1; + goto error; + } + + list->bp = tmp; + } + + list->bp[list->length].left = left; + list->bp[list->length].right = right; + + if (get_strand) { + if (strand == '+') { + list->bp[list->length].rev = 0; + } else if (strand == '-') { 
+ list->bp[list->length].rev = 1; + } else { + fprintf(stderr, "[amplicon] error: bad strand value in line %d, expecting '+' or '-', found '%c'.\n", + line_count, strand); ret = 1; goto error; } - - pairs->bp = tmp; } - pairs->bp[pairs->length].left = left; - pairs->bp[pairs->length].right = right; - if (right - left > *longest) - *longest = right - left; + if (right - left > list->longest) + list->longest = right - left; - pairs->length++; + list->length++; } - if (sort_by_pos) - qsort(pairs->bp, pairs->length, sizeof(pairs->bp[0]), bed_pair_sort); + if (sort_by_pos) { + for (bed_itr = kh_begin(bed_lists); bed_itr != kh_end(bed_lists); ++bed_itr) { + if (kh_exist(bed_lists, bed_itr)) { + list = &kh_val(bed_lists, bed_itr); + qsort(list->bp, list->length, sizeof(list->bp[0]), bed_entry_sort); + } + } + } - if (pairs->length) + if (kh_size(bed_lists) > 0) {// any entries ret = 0; - else + } else { ret = 1; + } error: ks_free(&line); + if (hclose(fp) != 0) { - fprintf(stderr, "[ampliconclip] warning: failed to close %s", infile); + fprintf(stderr, "[amplicon] warning: failed to close %s", infile); } return ret; } -static int matching_clip_site(bed_pair_list_t *sites, hts_pos_t pos, - int is_rev, int use_strand, int64_t longest) { - int i, tol = 5, size; // may need this to be variable +void destroy_bed_hash(khash_t(bed_list_hash) *hash) { + khiter_t itr; + + for (itr = kh_begin(hash); itr != kh_end(hash); ++itr) { + if (kh_exist(hash, itr)) { + free(kh_val(hash, itr).bp); + free((char *)kh_key(hash, itr)); + kh_key(hash, itr) = NULL; + } + } + + kh_destroy(bed_list_hash, hash); +} + + +static int matching_clip_site(bed_entry_list_t *sites, hts_pos_t pos, + int is_rev, int use_strand, int64_t longest, + cl_param_t *param) { + int i, size; // may need this to be variable + int tol = param->tol; int l = 0, mid = sites->length / 2, r = sites->length; int pos_tol = is_rev ? (pos > tol ? 
pos - tol : 0) : pos; @@ -571,18 +628,19 @@ static int bam_clip(samFile *in, samFile *out, samFile *reject, char *bedfile, clipping_type clipping, cl_param_t *param) { - int ret = 1, r, exclude = 0, file_open = 0; + int ret = 1, r, file_open = 0; + bam_hdr_t *header = NULL; bam1_t *b = NULL, *b_tmp = NULL; long f_count = 0, r_count = 0, n_count = 0, l_count = 0, l_exclude = 0, b_count = 0; long filtered = 0, written = 0, failed = 0; - int64_t longest = 0; kstring_t str = KS_INITIALIZE; kstring_t oat = KS_INITIALIZE; - bed_pair_list_t sites = {NULL, 0, 0}; + bed_entry_list_t *sites; FILE *stats_fp = stderr; + khash_t(bed_list_hash) *bed_hash = kh_init(bed_list_hash); - if (load_bed_file_pairs(bedfile, param->use_strand, 1, &sites, &longest)) { + if (load_bed_file_multi_ref(bedfile, param->use_strand, 1, bed_hash)) { fprintf(stderr, "[ampliconclip] error: unable to load bed file.\n"); goto fail; } @@ -629,17 +687,36 @@ goto fail; } + int32_t last_tid = -1; + int ref_found = 0; + while ((r = sam_read1(in, header, b)) >= 0) { hts_pos_t pos; int is_rev; int p_size; int been_clipped = 0, filter = 0; + int exclude = (BAM_FUNMAP | BAM_FQCFAIL); + khiter_t itr; l_count++; - exclude |= (BAM_FUNMAP | BAM_FQCFAIL); + if (b->core.tid != last_tid) { + const char *ref_name; + + ref_found = 0; + last_tid = b->core.tid; - if (!(b->core.flag & exclude)) { + if ((ref_name = sam_hdr_tid2name(header, b->core.tid)) != NULL) { + itr = kh_get(bed_list_hash, bed_hash, ref_name); + + if (itr != kh_end(bed_hash)) { + sites = &kh_val(bed_hash, itr); + ref_found = 1; + } + } + } + + if (!(b->core.flag & exclude) && ref_found) { if (param->oa_tag) if (tag_original_data(b, &oat)) goto fail; @@ -653,7 +730,7 @@ is_rev = 0; } - if ((p_size = matching_clip_site(&sites, pos, is_rev, param->use_strand, longest))) { + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { if (is_rev) { if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) goto fail; @@ -698,7 
+775,7 @@ pos = b->core.pos; is_rev = 0; - if ((p_size = matching_clip_site(&sites, pos, is_rev, param->use_strand, longest))) { + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { if (bam_trim_left(b, b_tmp, p_size, clipping) != 0) goto fail; @@ -712,7 +789,7 @@ pos = bam_endpos(b); is_rev = 1; - if ((p_size = matching_clip_site(&sites, pos, is_rev, param->use_strand, longest))) { + if ((p_size = matching_clip_site(sites, pos, is_rev, param->use_strand, sites->longest, param))) { if (bam_trim_right(b, b_tmp, p_size, clipping) != 0) goto fail; @@ -832,7 +909,7 @@ ret = 0; fail: - free(sites.bp); + destroy_bed_hash(bed_hash); ks_free(&oat); sam_hdr_destroy(header); bam_destroy1(b); @@ -842,16 +919,16 @@ static void usage(void) { - fprintf(stderr, "Usage samtools ampliconclip -b bedfile -o \n\n"); + fprintf(stderr, "Usage: samtools ampliconclip -b BED file -o \n\n"); fprintf(stderr, "Option: \n"); - fprintf(stderr, " -b FILE bedfile of amplicons to be removed.\n"); + fprintf(stderr, " -b FILE BED file of regions (eg amplicon primers) to be removed.\n"); fprintf(stderr, " -o FILE output file name (default stdout).\n"); fprintf(stderr, " -f FILE write stats to file name (default stderr)\n"); fprintf(stderr, " -u Output uncompressed data\n"); - fprintf(stderr, " --soft-clip soft clip amplicons from reads (default)\n"); - fprintf(stderr, " --hard-clip hard clip amplicons from reads.\n"); - fprintf(stderr, " --both-ends clip on both ends.\n"); - fprintf(stderr, " --strand use strand data from bed file.\n"); + fprintf(stderr, " --soft-clip soft clip amplicon primers from reads (default)\n"); + fprintf(stderr, " --hard-clip hard clip amplicon primers from reads.\n"); + fprintf(stderr, " --both-ends clip on both 5' and 3' ends.\n"); + fprintf(stderr, " --strand use strand data from BED file to match read direction.\n"); fprintf(stderr, " --clipped only output clipped reads.\n"); fprintf(stderr, " --fail mark unclipped, mapped 
reads as QCFAIL.\n"); fprintf(stderr, " --filter-len INT do not output reads INT size or shorter.\n"); @@ -860,8 +937,11 @@ fprintf(stderr, " --rejects-file FILE file to write filtered reads.\n"); fprintf(stderr, " --original for clipped entries add an OA tag with original data.\n"); fprintf(stderr, " --keep-tag for clipped entries keep the old NM and MD tags.\n"); + fprintf(stderr, " --tolerance match region within this number of bases, default 5.\n"); fprintf(stderr, " --no-PG do not add an @PG line.\n"); sam_global_opt_help(stderr, "-.O..@-."); + fprintf(stderr, "\nAbout: Soft clips read alignments where they match BED file defined regions.\n" + "Default clipping is only on the 5' end.\n\n"); } @@ -873,7 +953,7 @@ htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL, *reject = NULL; clipping_type clipping = soft_clip; - cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, NULL, NULL, NULL}; + cl_param_t param = {1, 0, 0, 0, 0, -1, -1, 0, 0, 1, 5, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -890,6 +970,7 @@ {"rejects-file", required_argument, NULL, 1012}, {"original", no_argument, NULL, 1013}, {"keep-tag", no_argument, NULL, 1014}, + {"tolerance", required_argument, NULL, 1015}, {NULL, 0, NULL, 0} }; @@ -912,6 +993,7 @@ case 1012: param.rejects_file = optarg; break; case 1013: param.oa_tag = 1; break; case 1014: param.del_tag = 0; break; + case 1015: param.tol = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage(); exit(1); @@ -928,6 +1010,12 @@ return 1; } + if (param.tol < 0) { + fprintf(stderr, "[ampliconclip] warning: invalid tolerance of %d," + " reseting tolerance to default of 5.\n", param.tol); + param.tol = 5; + } + if ((in = sam_open_format(argv[optind], "rb", &ga.in)) == NULL) { print_error_errno("ampliconclip", "cannot open input file"); return 1; diff -Nru samtools-1.11/bam_ampliconclip.h 
samtools-1.13/bam_ampliconclip.h --- samtools-1.11/bam_ampliconclip.h 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_ampliconclip.h 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_ampliconclip.h -- shared functions between amplicon clip/stats - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield @@ -25,19 +25,30 @@ #ifndef BAM_AMPLICONCLIP_H #define BAM_AMPLICONCLIP_H +#include "htslib/khash.h" + typedef struct { int64_t left; int64_t right; int rev; -} bed_pair_t; +} bed_entry_t; typedef struct { - bed_pair_t *bp; + bed_entry_t *bp; + int64_t longest; int length; int size; -} bed_pair_list_t; +} bed_entry_list_t; + +KHASH_MAP_INIT_STR(bed_list_hash, bed_entry_list_t); + +#define BED_LIST_INIT {NULL, 0, 0, 0, {0}} + + +int load_bed_file_multi_ref(char *infile, int get_strand, + int sort_by_pos, khash_t(bed_list_hash) *bed_lists); + +void destroy_bed_hash(khash_t(bed_list_hash) *hash); -int load_bed_file_pairs(char *infile, int get_strand, int sort_by_pos, - bed_pair_list_t *pairs, int64_t *longest); #endif /* BAM_AMPLICONCLIP_H */ diff -Nru samtools-1.11/bam_cat.c samtools-1.13/bam_cat.c --- samtools-1.11/bam_cat.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_cat.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_cat.c -- efficiently concatenates bam files. - Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019 Genome Research Ltd. + Copyright (C) 2008-2009, 2011-2013, 2015-2017, 2019, 2021 Genome Research Ltd. Modified SAMtools work copyright (C) 2010 Illumina, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy @@ -270,22 +270,13 @@ // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { - cram_block *blk; - - if (cram_container_is_empty(in_c)) { - if (cram_write_container(out_c, c) != 0) - return -1; - + if (cram_container_is_empty(in_c)) { + cram_block *blk; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; - if (cram_write_block(out_c, blk) != 0) { - cram_free_block(blk); - return -1; - } cram_free_block(blk); cram_free_container(c); - continue; } @@ -297,6 +288,7 @@ cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; + cram_block *blk; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) @@ -467,7 +459,7 @@ char *outfn = 0; char **infns = NULL; // files to concatenate int infns_size = 0; - int c, ret = 0, no_pg = 0; + int c, ret = 0, no_pg = 0, usage = 0; samFile *in; sam_global_args ga; @@ -481,7 +473,7 @@ sam_global_args_init(&ga); - while ((c = getopt_long(argc, argv, "h:o:b:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:o:b:@:", lopts, NULL)) >= 0) { switch (c) { case 'h': { samFile *fph = sam_open(optarg, "r"); @@ -522,6 +514,8 @@ break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; } } @@ -539,7 +533,7 @@ } // Require at least one input file - if (infns_size + nargv_fns == 0) { + if (infns_size + nargv_fns == 0 || usage) { fprintf(stderr, "Usage: samtools cat [options] [... ]\n"); fprintf(stderr, " samtools cat [options] [... 
]\n\n"); fprintf(stderr, "Concatenate BAM or CRAM files, first those in , then those\non the command line.\n\n"); diff -Nru samtools-1.11/bam_fastq.c samtools-1.13/bam_fastq.c --- samtools-1.11/bam_fastq.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_fastq.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_fastq.c -- FASTA and FASTQ file generation - Copyright (C) 2009-2017, 2019 Genome Research Ltd. + Copyright (C) 2009-2017, 2019-2020 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -42,16 +42,11 @@ #include "samtools.h" #include "sam_opts.h" -#define taglist_free(p) -KLIST_INIT(ktaglist, char*, taglist_free) - #define DEFAULT_BARCODE_TAG "BC" #define DEFAULT_QUALITY_TAG "QT" #define INDEX_SEPARATOR "+" int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; -static const char *copied_tags[] = { "RG", "BC", "QT", NULL }; - static void bam2fq_usage(FILE *to, const char *command) { int fq = strcasecmp("fastq", command) == 0 || strcasecmp("bam2fq", command) == 0; @@ -60,64 +55,71 @@ fprintf(to, "\n" "Description:\n" -"Converts a SAM, BAM or CRAM into either FASTQ or FASTA format depending on the command invoked.\n" +"Converts a SAM, BAM or CRAM to %s format.\n" "\n" "Options:\n" -" -0 FILE write reads designated READ_OTHER to FILE\n" -" -1 FILE write reads designated READ1 to FILE\n" -" -2 FILE write reads designated READ2 to FILE\n" -" -o FILE write reads designated READ1 or READ2 to FILE\n" -" note: if a singleton file is specified with -s, only\n" -" paired reads will be written to the -1 and -2 files.\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -n don't append /1 and /2 to the read name\n" -" -N always append /1 and /2 to the read name\n"); 
+" -0 FILE write reads designated READ_OTHER to FILE\n" +" -1 FILE write reads designated READ1 to FILE\n" +" -2 FILE write reads designated READ2 to FILE\n" +" -o FILE write reads designated READ1 or READ2 to FILE\n" +" note: if a singleton file is specified with -s, only\n" +" paired reads will be written to the -1 and -2 files.\n" +" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x +" -F INT only include reads with none of the FLAGS in INT present [0x900]\n" // F&x == 0 +" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) +" -n don't append /1 and /2 to the read name\n" +" -N always append /1 and /2 to the read name\n", + fq ? "FASTQ" : "FASTA"); if (fq) fprintf(to, -" -O output quality in the OQ tag if present\n"); +" -O output quality in the OQ tag if present\n"); fprintf(to, -" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" -" -t copy RG, BC and QT tags to the %s header line\n", +" -s FILE write singleton reads designated READ1 or READ2 to FILE\n" +" -t copy RG, BC and QT tags to the %s header line\n", fq ? "FASTQ" : "FASTA"); fprintf(to, -" -T TAGLIST copy arbitrary tags to the %s header line\n", +" -T TAGLIST copy arbitrary tags to the %s header line\n", fq ? 
"FASTQ" : "FASTA"); if (fq) fprintf(to, -" -v INT default quality score if not given in file [1]\n" -" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" -" -c compression level [0..9] to use when creating gz or bgzf fastq files [1]\n" -" --i1 FILE write first index reads to FILE\n" -" --i2 FILE write second index reads to FILE\n" -" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n" -" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n" -" --index-format STR How to parse barcode and quality tags\n\n"); +" -v INT default quality score if not given in file [1]\n" +" -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG)\n" +" -c INT compression level [0..9] to use when writing bgzf files [1]\n" +" --i1 FILE write first index reads to FILE\n" +" --i2 FILE write second index reads to FILE\n" +" --barcode-tag TAG\n" +" Barcode tag [" DEFAULT_BARCODE_TAG "]\n" +" --quality-tag TAG\n" +" Quality tag [" DEFAULT_QUALITY_TAG "]\n" +" --index-format STR\n" +" How to parse barcode and quality tags\n\n"); sam_global_opt_help(to, "-.--.@-."); fprintf(to, "\n" -"The files will be automatically compressed if the file names have a .gz or .bgzf extension.\n" -"The input to this program must be collated by name. Run 'samtools collate' or 'samtools sort -n'.\n" +"The files will be automatically compressed if the file names have a .gz\n" +"or .bgzf extension. 
The input to this program must be collated by name.\n" +"Run 'samtools collate' or 'samtools sort -n' to achieve this.\n" "\n" "Reads are designated READ1 if FLAG READ1 is set and READ2 is not set.\n" "Reads are designated READ2 if FLAG READ1 is not set and READ2 is set.\n" -"Reads are designated READ_OTHER if FLAGs READ1 and READ2 are either both set\n" -"or both unset.\n" +"Otherwise reads are designated READ_OTHER (both flags set or both flags unset).\n" "Run 'samtools flags' for more information on flag codes and meanings.\n"); fprintf(to, "\n" -"The index-format string describes how to parse the barcode and quality tags, for example:\n" -" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n" -" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n" -"If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n" -"'read until the separator or end of tag', for example:\n" -" n*i* ignore the left part of the tag until the separator, then use the second part\n" -" of the tag as index 1\n"); +"The index-format string describes how to parse the barcode and quality tags.\n" +"It is made up of 'i' or 'n' followed by a length or '*'. 
For example:\n" +" i14i8 The first 14 characters are index 1, the next 8 are index 2\n" +" n8i14 Ignore the first 8 characters, and use the next 14 for index 1\n\n" +"If the tag contains a separator, then the numeric part can be replaced with\n" +"'*' to mean 'read until the separator or end of tag', for example:\n" +" i*i* Break the tag at the separator into index 1 and index 2\n" +" n*i* Ignore the left part of the tag until the separator,\n" +" then use the second part of the tag as index 1\n"); fprintf(to, "\n" "Examples:\n" -" To get just the paired reads in separate files, use:\n" -" samtools %s -1 paired1.%s -2 paired2.%s -0 /dev/null -s /dev/null -n in.bam\n" -"\n To get all non-supplementary/secondary reads in a single file, redirect the output:\n" +"To get just the paired reads in separate files, use:\n" +" samtools %s -1 pair1.%s -2 pair2.%s -0 /dev/null -s /dev/null -n in.bam\n" +"\nTo get all non-supplementary/secondary reads in a single file, redirect\n" +"the output:\n" " samtools %s in.bam > all_reads.%s\n", command, fq ? "fq" : "fa", fq ? "fq" : "fa", command, fq ? "fq" : "fa"); @@ -144,96 +146,20 @@ typedef struct bam2fq_state { samFile *fp; - BGZF *fpse; - BGZF *fpr[3]; - BGZF *fpi[2]; - BGZF *hstdout; + samFile *fpse; + samFile *fpr[3]; + samFile *fpi[3]; + samFile *hstdout; sam_hdr_t *h; bool has12, use_oq, copy_tags, illumina_tag; int flag_on, flag_off, flag_alloff; fastfile filetype; int def_qual; - klist_t(ktaglist) *taglist; char *index_sequence; char compression_level; htsThreadPool p; } bam2fq_state_t; -/* - * Get and decode the read from a BAM record. - * - * TODO: htslib really needs an interface for this. Consider this or perhaps - * bam_get_seq_str (current vs original orientation) and bam_get_qual_str - * functions as string formatted equivalents to bam_get_{seq,qual}? - */ - -/* - * Reverse a string in place. - * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux. 
- * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik - */ -static char *reverse(char *str) -{ - int i = strlen(str)-1,j=0; - char ch; - while (i>j) { - ch = str[i]; - str[i]= str[j]; - str[j] = ch; - i--; - j++; - } - return str; -} - -/* return the read, reverse complemented if necessary */ -static char *get_read(const bam1_t *rec) -{ - int len = rec->core.l_qseq + 1; - char *read = calloc(1, len); - char *seq = (char *)bam_get_seq(rec); - int n; - - if (!read) return NULL; - - for (n=0; n < rec->core.l_qseq; n++) { - if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]]; - else read[n] = seq_nt16_str[bam_seqi(seq,n)]; - } - if (rec->core.flag & BAM_FREVERSE) reverse(read); - return read; -} - -/* - * get and decode the quality from a BAM record - */ -static int get_quality(const bam1_t *rec, char **qual_out) -{ - char *quality = calloc(1, rec->core.l_qseq + 1); - char *q = (char *)bam_get_qual(rec); - int n; - - if (!quality) return -1; - - if (*q == '\xff') { - free(quality); - *qual_out = NULL; - return 0; - } - - for (n=0; n < rec->core.l_qseq; n++) { - quality[n] = q[n]+33; - } - if (rec->core.flag & BAM_FREVERSE) reverse(quality); - *qual_out = quality; - return 0; -} - -// -// End of htslib complaints -// - - static readpart which_readpart(const bam1_t *b) { if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) { @@ -245,269 +171,8 @@ } } -/* - * parse the length part from the index-format string - */ -static int getLength(char **s) -{ - int n = 0; - while (**s) { - if (**s == '*') { n=-1; (*s)++; break; } - if ( !isdigit(**s)) break; - n = n*10 + ((**s)-'0'); - (*s)++; - } - return n; -} - -static bool copy_tag(const char *tag, const bam1_t *rec, kstring_t *linebuf) -{ - if (kputc('\t', linebuf) < 0) - return false; - int ret = bam_aux_get_str(rec, tag, linebuf); - if (ret < 0) - return false; - else if (ret == 0) - linebuf->s[--linebuf->l] = 0; // no tag so undo \t again - - return true; 
-} - -static int insert_index_sequence_into_linebuf(char *index_sequence, kstring_t *linebuf, bam1_t *rec) -{ - if (!index_sequence) return 0; - - kstring_t new = {0,0,NULL}; - if (linebuf->s) { - char *s = strchr(linebuf->s, '\n'); - if (s) { - if (ks_resize(&new, linebuf->l + strlen(index_sequence) + 16) < 0) - return -1; - *s = 0; - kputs(linebuf->s, &new); - kputc(' ', &new); - readpart readpart = which_readpart(rec); - if (readpart == READ_1) kputc('1', &new); - else if (readpart == READ_2) kputc('2', &new); - else kputc('0', &new); - - kputc(':', &new); - if (rec->core.flag & BAM_FQCFAIL) kputc('Y', &new); - else kputc('N', &new); - - kputs(":0:", &new); - kputs(index_sequence, &new); - kputc('\n', &new); - kputs(s+1, &new); - free(ks_release(linebuf)); - linebuf->s = new.s; linebuf->l = new.l; linebuf->m = new.m; - } - } - return 0; -} - -static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int i; - - linebuf->l = 0; - // Write read name - if (kputc(state->filetype == FASTA? 
'>' : '@', linebuf) < 0) return false; - if (kputs(bam_get_qname(rec), linebuf) < 0) return false; - // Add the /1 /2 if requested - if (state->has12) { - readpart readpart = which_readpart(rec); - if (readpart == READ_1) { - if (kputs("/1", linebuf) < 0) return false; - } else if (readpart == READ_2) { - if (kputs("/2", linebuf) < 0) return false; - } - } - if (state->copy_tags) { - for (i = 0; copied_tags[i]; ++i) { - if (!copy_tag(copied_tags[i], rec, linebuf)) { - fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (state->taglist->size) { - kliter_t(ktaglist) *p; - for (p = kl_begin(state->taglist); p != kl_end(state->taglist); p = kl_next(p)) { - if (!copy_tag(kl_val(p), rec, linebuf)) { - fprintf(stderr, "Problem copying aux tags: [%s]\n", linebuf->s); - return false; - } - } - } - - if (kputc('\n', linebuf) < 0) return false; - if (kputs(seq, linebuf) < 0) return false; - if (kputc('\n', linebuf) < 0) return false; - - if (state->filetype == FASTQ) { - // Write quality - if (kputs("+\n", linebuf) < 0) return false; - if (qual && *qual) { - if (kputs(qual, linebuf) < 0) return false; - } else { - int len = strlen(seq); - if (ks_resize(linebuf, ks_len(linebuf) + len + 1) < 0) return false; - for (i = 0; i < len; ++i) { - kputc(33 + state->def_qual, linebuf); - } - } - if (kputc('\n', linebuf) < 0) return false; - } - return true; -} - -/* - * Create FASTQ lines from the barcode tag using the index-format - */ -static bool tags2fq(bam1_t *rec, bam2fq_state_t *state, const bam2fq_opts_t* opts) -{ - uint8_t *p; - char *ifmt = opts->index_format; - char *tag = NULL; - char *qual = NULL; - char *sub_tag = NULL; - char *sub_qual = NULL; - size_t tag_len; - int file_number = 0; - kstring_t linebuf = { 0, 0, NULL }; // Buffer - int index_segment = 0, same_read = 1; - - if (!ifmt) return true; - - // read barcode tag - p = bam_aux_get(rec,opts->barcode_tag); - if (p) tag = bam_aux2Z(p); - - if (!tag) return true; // 
there is no tag - - tag_len = strlen(tag); - sub_tag = calloc(1, tag_len + 1); - if (!sub_tag) goto fail; - sub_qual = calloc(1, tag_len + 1); - if (!sub_qual) goto fail; - - // read quality tag - p = bam_aux_get(rec, opts->quality_tag); - if (p) qual = bam_aux2Z(p); - - // Parse the index-format string - while (*ifmt) { - if (file_number > 1) break; // shouldn't happen if we've validated parameters correctly - char action = *ifmt; // should be 'i' or 'n' - ifmt++; // skip over action - int index_len = getLength(&ifmt); - int n = 0; - - if (index_len < 0) { - // read until separator - while (isalpha(*tag)) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - if (*tag) { // skip separator - tag++; - if (qual) qual++; - } - } else { - // read index_len characters - while (index_len-- && *tag) { - sub_tag[n] = *tag++; - if (qual) sub_qual[n] = *qual++; - n++; - } - } - sub_tag[n] = '\0'; - sub_qual[n] = '\0'; - index_segment++; - - if (action=='i' && *sub_tag) { - if (state->index_sequence) { - if (index_segment == 1) same_read = 0; - if (same_read) { - char *new_index_sequence = realloc(state->index_sequence, strlen(state->index_sequence) + strlen(sub_tag) + 2); - if (!new_index_sequence) goto fail; - state->index_sequence = new_index_sequence; - strcat(state->index_sequence, INDEX_SEPARATOR); - strcat(state->index_sequence, sub_tag); - } - } else { - state->index_sequence = strdup(sub_tag); // we're going to need this later... 
- } - if (!state->index_sequence) goto fail; - if (!make_fq_line(rec, sub_tag, sub_qual, &linebuf, state)) goto fail; - if (state->illumina_tag) { - if (insert_index_sequence_into_linebuf(sub_tag, &linebuf, rec) < 0) { - goto fail; - } - } - if (state->fpi[file_number]) { - if (bgzf_write(state->fpi[file_number++], linebuf.s, linebuf.l) < 0) - goto fail; - } - } - - } - - free(sub_qual); free(sub_tag); - free(linebuf.s); - return true; - - fail: - perror(__func__); - free(sub_qual); free(sub_tag); - free(linebuf.s); - return false; -} - -// Transform a bam1_t record into a string with the FASTQ representation of it -// @returns false for error, true for success -static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) -{ - int32_t qlen = b->core.l_qseq; - assert(qlen >= 0); - const uint8_t *oq = NULL; - char *qual = NULL; - - char *seq = get_read(b); - if (!seq) return false; - - if (state->use_oq) oq = bam_aux_get(b, "OQ"); - if (oq && *oq=='Z') { - qual = strdup(bam_aux2Z(oq)); - if (!qual) goto fail; - if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented - reverse(qual); - } - } else { - if (get_quality(b, &qual) < 0) goto fail; - } - - if (!make_fq_line(b, seq, qual, linebuf, state)) goto fail; - - free(qual); - free(seq); - return true; - - fail: - free(seq); - free(qual); - return false; -} - static void free_opts(bam2fq_opts_t *opts) { - free(opts->barcode_tag); - free(opts->quality_tag); - free(opts->index_format); - free(opts->extra_tags); free(opts); } @@ -545,13 +210,14 @@ {"quality-tag", required_argument, NULL, 'q'}, { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", lopts, NULL)) > 0) { + while ((c = getopt_long(argc, argv, "0:1:2:o:f:F:G:niNOs:c:tT:v:@:", + lopts, NULL)) > 0) { switch (c) { - case 'b': opts->barcode_tag = strdup(optarg); break; - case 'q': opts->quality_tag = strdup(optarg); break; + case 'b': opts->barcode_tag = optarg; break; + case 'q': 
opts->quality_tag = optarg; break; case 1 : opts->index_file[0] = optarg; break; case 2 : opts->index_file[1] = optarg; break; - case 3 : opts->index_format = strdup(optarg); break; + case 3 : opts->index_format = optarg; break; case '0': opts->fnr[0] = optarg; break; case '1': opts->fnr[1] = optarg; break; case '2': opts->fnr[2] = optarg; break; @@ -562,7 +228,8 @@ flag_off_set = 1; opts->flag_off = 0; } - opts->flag_off |= strtol(optarg, 0, 0); break; + opts->flag_off |= strtol(optarg, 0, 0); + break; case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break; case 'n': opts->has12 = false; break; case 'N': opts->has12always = true; break; @@ -570,13 +237,25 @@ case 's': opts->fnse = optarg; break; case 't': opts->copy_tags = true; break; case 'i': opts->illumina_tag = true; break; - case 'c': opts->compression_level = atoi(optarg); break; - case 'T': opts->extra_tags = strdup(optarg); break; + case 'c': + opts->compression_level = atoi(optarg); + if (opts->compression_level < 0) + opts->compression_level = 0; + if (opts->compression_level > 9) + opts->compression_level = 9; + break; + case 'T': opts->extra_tags = optarg; break; case 'v': opts->def_qual = atoi(optarg); break; - case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; + + case '?': + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; default: if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) { - bam2fq_usage(stderr, argv[0]); free_opts(opts); return false; + bam2fq_usage(stderr, argv[0]); + free_opts(opts); + return false; } break; } @@ -585,8 +264,8 @@ if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false; if (opts->has12always) opts->has12 = true; - if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG); - if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG); + if (!opts->barcode_tag) opts->barcode_tag = DEFAULT_BARCODE_TAG; + if (!opts->quality_tag) opts->quality_tag = DEFAULT_QUALITY_TAG; int nIndex = 0; if 
(opts->index_format) { @@ -631,7 +310,8 @@ } const char* type_str = argv[0]; - if (strcasecmp("fastq", type_str) == 0 || strcasecmp("bam2fq", type_str) == 0) { + if (strcasecmp("fastq", type_str) == 0 || + strcasecmp("bam2fq", type_str) == 0) { opts->filetype = FASTQ; } else if (strcasecmp("fasta", type_str) == 0) { opts->filetype = FASTA; @@ -659,34 +339,61 @@ return true; } -static BGZF *open_fqfile(char *filename, int c, htsThreadPool *tp) -{ - char mode[4] = "w"; - size_t len = strlen(filename); - - mode[2] = 0; mode[3] = 0; - if (len > 3 && strstr(filename + (len - 3),".gz")) { - mode[1] = 'g'; mode[2] = c+'0'; - } else if ((len > 4 && strstr(filename + (len - 4),".bgz")) - || (len > 5 && strstr(filename + (len - 5),".bgzf"))) { - mode[1] = c+'0'; - } else { - mode[1] = 'u'; +void set_sam_opts(samFile *fp, bam2fq_state_t *state, + const bam2fq_opts_t *opts) { + if (state->has12) + hts_set_opt(fp, FASTQ_OPT_RNUM, 1); + + if (state->illumina_tag) + hts_set_opt(fp, FASTQ_OPT_CASAVA, 1); + + hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag); + + kstring_t tag_list = {0,0}; + if (state->copy_tags) + kputs("RG,BC,QT", &tag_list); + if (opts->extra_tags) { + if (tag_list.l) + kputc(',', &tag_list); + kputs(opts->extra_tags, &tag_list); + } + if (tag_list.l) + hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s); + ks_free(&tag_list); +} + +// Open a file as normal or gzipped based on filename. +// Note we always use bgzf and don't bother to attempt non-blocked +// gzip streams. This is a departure from the old fastq code. 
+static samFile *sam_open_z(char *fn, char *mode, bam2fq_state_t *state) { + char modez[6]; + strcpy(modez, mode); + + size_t l = strlen(fn); + if ((l > 3 && strcmp(fn+l-3, ".gz") == 0) || + (l > 4 && strcmp(fn+l-4, ".bgz") == 0) || + (l > 5 && strcmp(fn+l-5, ".bgzf") == 0)) { + char m[3] = {'z', state->compression_level+'0', '\0'}; + strcat(modez, m); } - BGZF *fp = bgzf_open(filename,mode); + samFile *fp = sam_open(fn, modez); if (!fp) - return fp; - if (tp->pool && bgzf_thread_pool(fp, tp->pool, tp->qsize) < 0) { - bgzf_close(fp); return NULL; - } + + if (state->p.pool) + hts_set_thread_pool(fp, &state->p); + return fp; } static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out) { + char *mode = opts->filetype == FASTA ? "wF" : "wf"; + bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t)); + if (!state) + return false; state->flag_on = opts->flag_on; state->flag_off = opts->flag_off; state->flag_alloff = opts->flag_alloff; @@ -700,22 +407,6 @@ state->hstdout = NULL; state->compression_level = opts->compression_level; - state->taglist = kl_init(ktaglist); - if (opts->extra_tags) { - char *save_p; - char *s = strtok_r(opts->extra_tags, ",", &save_p); - while (s) { - if (strlen(s) != 2) { - fprintf(stderr, "Parsing extra tags - '%s' is not two characters\n", s); - free(state); - return false; - } - char **et = kl_pushp(ktaglist, state->taglist); - *et = s; - s = strtok_r(NULL, ",", &save_p); - } - } - state->fp = sam_open(opts->fn_input, "r"); if (state->fp == NULL) { print_error_errno("bam2fq","Cannot read file \"%s\"", opts->fn_input); @@ -747,12 +438,12 @@ return false; } if (opts->fnse) { - state->fpse = open_fqfile(opts->fnse, state->compression_level, &state->p); - if (state->fpse == NULL) { - print_error_errno("bam2fq", "Cannot write to singleton file \"%s\"", opts->fnse); + if (!(state->fpse = sam_open_z(opts->fnse, mode, state))) { + print_error_errno("bam2fq", "Cannot open singleton file \"%s\"", opts->fnse); free(state); return 
false; } + set_sam_opts(state->fpse, state, opts); } if (opts->ga.reference) { @@ -763,6 +454,7 @@ } } + // single, read1, read2 int i, j; for (i = 0; i < 3; ++i) { if (opts->fnr[i]) { @@ -770,28 +462,30 @@ if (opts->fnr[j] && strcmp(opts->fnr[j], opts->fnr[i]) == 0) break; if (j == i) { - state->fpr[i] = open_fqfile(opts->fnr[i], state->compression_level, &state->p); - if (state->fpr[i] == NULL) { - print_error_errno("bam2fq", "Cannot write to r%d file \"%s\"", + if (!(state->fpr[i] = sam_open_z(opts->fnr[i], mode, state))) { + print_error_errno("bam2fq", "Cannot open r%d file \"%s\"", i, opts->fnr[i]); free(state); return false; } + set_sam_opts(state->fpr[i], state, opts); } else { state->fpr[i] = state->fpr[j]; } } else { if (!state->hstdout) { - state->hstdout = bgzf_dopen(fileno(stdout), "wu"); - if (!state->hstdout) { + if (!(state->hstdout = sam_open_z("-", mode, state))) { print_error_errno("bam2fq", "Cannot open STDOUT"); free(state); return false; } + set_sam_opts(state->hstdout, state, opts); } state->fpr[i] = state->hstdout; } } + + // index 1, index 2 for (i = 0; i < 2; i++) { state->fpi[i] = NULL; if (opts->index_file[i]) { @@ -802,13 +496,14 @@ if (opts->index_file[j] && strcmp(opts->index_file[j], opts->index_file[i]) == 0) break; if (i == j) { - state->fpi[i] = open_fqfile(opts->index_file[i], state->compression_level, &state->p); - if (state->fpi[i] == NULL) { - print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", + if (!(state->fpi[i] = sam_open_z(opts->index_file[i], mode, + state))) { + print_error_errno("bam2fq", "Cannot open i%d file \"%s\"", i+1, opts->index_file[i]); free(state); return false; } + set_sam_opts(state->fpi[i], state, opts); } else if (j < 0) { state->fpi[i] = state->fpr[j+3]; } else { @@ -833,21 +528,25 @@ bool valid = true; sam_hdr_destroy(state->h); check_sam_close("bam2fq", state->fp, opts->fn_input, "file", status); - if (state->fpse && bgzf_close(state->fpse)) { print_error_errno("bam2fq", "Error closing 
singleton file \"%s\"", opts->fnse); valid = false; } + if (state->fpse && sam_close(state->fpse) < 0) { + print_error_errno("bam2fq", "Error closing singleton file \"%s\"", opts->fnse); + valid = false; + } + int i, j; for (i = 0; i < 3; ++i) { if (state->fpr[i] != state->hstdout) { for (j = 0; j < i; j++) if (state->fpr[i] == state->fpr[j]) break; - if (j == i && bgzf_close(state->fpr[i])) { + if (j == i && sam_close(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; } } } if (state->hstdout) { - if (bgzf_close(state->hstdout)) { + if (sam_close(state->hstdout) < 0) { print_error_errno("bam2fq", "Error closing STDOUT"); valid = false; } @@ -859,12 +558,11 @@ for (j -= 3; j >= 0 && j < i; j++) if (state->fpi[i] == state->fpi[j]) break; - if (j == i && state->fpi[i] && bgzf_close(state->fpi[i])) { + if (j == i && state->fpi[i] && sam_close(state->fpi[i]) < 0) { print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]); valid = false; } } - kl_destroy(ktaglist,state->taglist); free(state->index_sequence); if (state->p.pool) hts_tpool_destroy(state->p.pool); @@ -880,135 +578,300 @@ } +int write_index_rec(samFile *fp, bam1_t *b, bam2fq_state_t *state, + bam2fq_opts_t* opts, char *seq, int seq_len, + char *qual, int qual_len) { + if (!fp || !b || !seq_len) + return 0; + + int ret = -1; + bam1_t *b2 = bam_init1(); // FIXME: reuse + if (!b2) + return -1; + + size_t aux_len = b->data + b->l_data - bam_get_aux(b); + if (bam_set1(b2, b->core.l_qname, bam_get_qname(b), + (b->core.flag | BAM_FUNMAP) & ~BAM_FREVERSE, + -1, -1, 0, // refid, pos, mapq + 0, NULL, // cigar + -1, -1, 0, // rnext, pnext, tlen + seq_len, seq, qual, + aux_len) < 0) + goto err; + + uint8_t *q = bam_get_qual(b2); + if (qual) { + int i; + for (i = 0; i < seq_len; i++) + q[i] -= '!'; + } else { + memset(q, opts->def_qual, seq_len); + } + + memcpy(bam_get_aux(b2), bam_get_aux(b), aux_len); + b2->l_data += aux_len; + 
if (sam_write1(fp, state->h, b2) < 0) + goto err; + + ret = 0; + err: + if (b2) + bam_destroy1(b2); + return ret; +} + +int output_index(bam1_t *b1, bam1_t *b2, bam2fq_state_t *state, + bam2fq_opts_t* opts) { + bam1_t *b[2] = {b1, b2}; + + char *ifmt = opts->index_format; + if (!ifmt) + ifmt = "i*i*"; + + // Get seq / qual elements + char *bc = NULL, *qt = NULL; + if (b1) + bc = (char *)bam_aux_get(b1, opts->barcode_tag); + if (b2 && !bc) + bc = (char *)bam_aux_get(b2, opts->barcode_tag); + if (!bc) + return 0; + else + bc++; // skip Z + + if (b1) + qt = (char *)bam_aux_get(b1, opts->quality_tag); + if (b2 && !qt) + qt = (char *)bam_aux_get(b2, opts->quality_tag); + if (qt && strlen(bc) != strlen(qt)-1) + qt = NULL; + else if (qt) + qt++; + + int inum = 0; + while (inum < 2) { + char fc = *ifmt++; + if (!fc) + break; // ran out of index-format + + long len, rem = 0; + if (isdigit(*ifmt)) { + rem = len = strtol(ifmt, &ifmt, 10); + } else { + ifmt++; + len = 0; + } + + char *bc_end = bc, *qt_end = qt; + while (len ? *bc_end && rem-- : isalpha(*bc_end)) + bc_end++, qt_end += qt != NULL; + + switch (fc) { + case 'n': + // skip + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + break; + + case 'i': + if (write_index_rec(state->fpi[inum], b[inum], state, opts, + bc, bc_end-bc, qt, qt_end-qt) < 0) + return -1; + bc = bc_end + (len==0); + if (qt) + qt = qt_end + (len==0); + inum++; + break; + + default: + fprintf(stderr, "Unknown index-format code\n"); + return -1; + } + } + + return 0; +} + +static int flush_rec(bam2fq_state_t *state, bam2fq_opts_t* opts, + bam1_t *b[4], int score[3], int best[3], + int64_t *n_singletons) { + // Paired data, with 1 or 2 ends present. 
+ if (score[1] > 0 && score[2] > 0) { + // If CASAVA tag is required and barcode is only on R1, + // copy it to R2 + if (state->illumina_tag) { + char *tag; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->barcode_tag))) + if (bam_aux_update_str(b[best[2]], + opts->barcode_tag, + strlen(tag), tag+1) < 0) + goto err; + if ((tag = (char *)bam_aux_get(b[best[1]], + opts->quality_tag))) + if (bam_aux_update_str(b[best[2]], + opts->quality_tag, + strlen(tag), tag+1) < 0) + goto err; + + } + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + + if (output_index(b[best[1]], b[best[2]], state, opts) < 0) + goto err; + } else if (score[1] > 0 || score[2] > 0) { + if (state->fpse) { + // print whichever one exists to fpse + if (score[1] > 0) { + if (sam_write1(state->fpse, state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpse, state->h, b[best[2]]) < 0) + goto err; + } + ++(*n_singletons); + } else { + if (score[1] > 0) { + if (sam_write1(state->fpr[1], state->h, b[best[1]]) < 0) + goto err; + } else { + if (sam_write1(state->fpr[2], state->h, b[best[2]]) < 0) + goto err; + } + } + + if (output_index(score[1] > 0 ? b[best[1]] : NULL, + score[2] > 0 ? 
b[best[2]] : NULL, + state, opts) < 0) + goto err; + } + + if (score[0]) { // single ended data (neither READ1 nor READ2) + if (sam_write1(state->fpr[0], state->h, b[best[0]]) < 0) + goto err; + + if (output_index(b[best[0]], NULL, state, opts) < 0) + goto err; + } + + return 0; + + err: + return -1; +} + static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts) { int n; - bam1_t *records[3] = {NULL, NULL, NULL}; char *current_qname = NULL; int64_t n_reads = 0, n_singletons = 0; // Statistics - kstring_t linebuf[3] = {{0,0,NULL},{0,0,NULL},{0,0,NULL}}; int score[3]; int at_eof; - bool valid = true; - bam1_t* b = NULL; + bool valid = false; + int best[3] = {-1, -1, -1}; // map R0, R1, single to b[] indices; + // indexed by [readpart] + bam1_t *b[4]; // 3 readparts, plus current record - while (true) { - if (!b) - b = bam_init1(); - if (b == NULL) { + for (n = 0; n < 4; n++) { + if (!(b[n] = bam_init1())) { perror("[bam2fq_mainloop] Malloc error for bam record buffer."); - valid = false; - break; + return false; } - int res = sam_read1(state->fp, state->h, b); + } + + n = 0; + while (true) { + int res = sam_read1(state->fp, state->h, b[n]); if (res < -1) { fprintf(stderr, "[bam2fq_mainloop] Failed to read bam record.\n"); - valid = false; - break; + goto err; } at_eof = res < 0; - if (!at_eof && filter_it_out(b, state)) + if (!at_eof && filter_it_out(b[n], state)) continue; - if (!at_eof) ++n_reads; + if (!at_eof) { + ++n_reads; - if (at_eof || !current_qname || (strcmp(current_qname, bam_get_qname(b)) != 0)) { - if (current_qname) { - if (state->illumina_tag) { - for (n=0; valid && n<3; n++) { - if (!records[n]) continue; - if (insert_index_sequence_into_linebuf(state->index_sequence, &linebuf[n], records[n]) < 0) valid = false; - } - if (!valid) break; - } - free(state->index_sequence); state->index_sequence = NULL; - if (score[1] > 0 && score[2] > 0) { - // print linebuf[1] to fpr[1], linebuf[2] to fpr[2] - if (bgzf_write(state->fpr[1], linebuf[1].s, 
linebuf[1].l) < 0) { valid = false; break; } - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } else if (score[1] > 0 || score[2] > 0) { - if (state->fpse) { - // print whichever one exists to fpse - if (score[1] > 0) { - if (bgzf_write(state->fpse, linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpse, linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - ++n_singletons; - } else { - if (score[1] > 0) { - if (bgzf_write(state->fpr[1], linebuf[1].s, linebuf[1].l) < 0) { valid = false; break; } - } else { - if (bgzf_write(state->fpr[2], linebuf[2].s, linebuf[2].l) < 0) { valid = false; break; } - } - } - } - if (score[0]) { // TODO: check this - // print linebuf[0] to fpr[0] - if (bgzf_write(state->fpr[0], linebuf[0].s, linebuf[0].l) < 0) { valid = false; break; } - } + // Handle -O option: use OQ for qual + uint8_t *oq; + if (state->use_oq && (oq = bam_aux_get(b[n],"OQ")) && *oq == 'Z') { + int i, l = strlen((char *)++oq); + uint8_t *qual = bam_get_qual(b[n]); + for (i = 0; i < l && i < b[n]->core.l_qseq; i++) + qual[i] = oq[i] - '!'; } + } + if (at_eof + || !current_qname + || (strcmp(current_qname, bam_get_qname(b[n])) != 0)) { + // New name, so flush best examples of previous name. + if (current_qname) + if (flush_rec(state, opts, b, score, best, &n_singletons) < 0) + goto err; - free(current_qname); current_qname = NULL; + current_qname = bam_get_qname(b[n]); score[0] = score[1] = score[2] = 0; - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); records[n]=NULL; - } if (at_eof) { break; } - - current_qname = strdup(bam_get_qname(b)); - if (!current_qname) { valid = false; break; } } // Prefer a copy of the read that has base qualities - int b_score = bam_get_qual(b)[0] != 0xff? 
2 : 1; - readpart rp = which_readpart(b); - if (b_score > score[rp]) { - if (!tags2fq(b, state, opts)) { valid = false; break; } - if (records[rp]) bam_destroy1(records[rp]); - records[rp] = b; + int b_score = bam_get_qual(b[n])[0] != 0xff? 2 : 1; + readpart rp = which_readpart(b[n]); + if (score[rp] < b_score) { score[rp] = b_score; - b = NULL; - if(!bam1_to_fq(records[rp], &linebuf[rp], state)) { - fprintf(stderr, "[%s] Error converting read to FASTA/Q\n", __func__); - valid = false; break; - } + // Record b[n] slot for best copy of readpair and find a new + // slot for next bam read + best[rp] = n; + int used_slot[4] = {0}, i; + for (i = 0; i < 3; i++) + if (best[i] >= 0) + used_slot[best[i]] = 1; + for (i = 0; i < 4 && used_slot[i]; i++) + ; + n = i; } } + + valid = true; + err: if (!valid) - { - perror("[bam2fq_mainloop] Error writing to FASTx files."); - } - bam_destroy1(b); - for (n=0; n < 3; n++) { - bam_destroy1(records[n]); - } - free(current_qname); - free(linebuf[0].s); - free(linebuf[1].s); - free(linebuf[2].s); - fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", __func__, n_singletons); - fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", __func__, n_reads); + print_error_errno("bam2fq", "Error writing to FASTx files."); + + for (n = 0; n < 4; n++) + bam_destroy1(b[n]); + + fprintf(stderr, "[M::%s] discarded %" PRId64 " singletons\n", + __func__, n_singletons); + fprintf(stderr, "[M::%s] processed %" PRId64 " reads\n", + __func__, n_reads); return valid; } int main_bam2fq(int argc, char *argv[]) { - int status = EXIT_SUCCESS; + int status = EXIT_FAILURE; bam2fq_opts_t* opts = NULL; bam2fq_state_t* state = NULL; bool valid = parse_opts(argc, argv, &opts); if (!valid || opts == NULL) return valid ? 
EXIT_SUCCESS : EXIT_FAILURE; - if (!init_state(opts, &state)) return EXIT_FAILURE; + if (!init_state(opts, &state)) goto err; + + if (!bam2fq_mainloop(state,opts)) goto err; - if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE; + if (!destroy_state(opts, state, &status)) goto err; - if (!destroy_state(opts, state, &status)) return EXIT_FAILURE; + status = EXIT_SUCCESS; + err: sam_global_args_free(&opts->ga); free_opts(opts); diff -Nru samtools-1.11/bam_flags.c samtools-1.13/bam_flags.c --- samtools-1.11/bam_flags.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_flags.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_flags.c -- flags subcommand. - Copyright (C) 2013-2014 Genome Research Ltd. + Copyright (C) 2013-2014, 2021 Genome Research Ltd. Author: Petr Danecek @@ -32,38 +32,54 @@ #include #include #include +#include "samtools.h" -static void usage(void) +static void usage(FILE *fp) { - fprintf(stderr, "\n"); - fprintf(stderr, "About: Convert between textual and numeric flag representation\n"); - fprintf(stderr, "Usage: samtools flags INT|STR[,...]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Flags:\n"); - fprintf(stderr, "\t0x%x\tPAIRED .. paired-end (or multiple-segment) sequencing technology\n", BAM_FPAIRED); - fprintf(stderr, "\t0x%x\tPROPER_PAIR .. each segment properly aligned according to the aligner\n", BAM_FPROPER_PAIR); - fprintf(stderr, "\t0x%x\tUNMAP .. segment unmapped\n", BAM_FUNMAP); - fprintf(stderr, "\t0x%x\tMUNMAP .. next segment in the template unmapped\n", BAM_FMUNMAP); - fprintf(stderr, "\t0x%x\tREVERSE .. SEQ is reverse complemented\n", BAM_FREVERSE); - fprintf(stderr, "\t0x%x\tMREVERSE .. SEQ of the next segment in the template is reversed\n", BAM_FMREVERSE); - fprintf(stderr, "\t0x%x\tREAD1 .. the first segment in the template\n", BAM_FREAD1); - fprintf(stderr, "\t0x%x\tREAD2 .. the last segment in the template\n", BAM_FREAD2); - fprintf(stderr, "\t0x%x\tSECONDARY .. 
secondary alignment\n", BAM_FSECONDARY); - fprintf(stderr, "\t0x%x\tQCFAIL .. not passing quality controls\n", BAM_FQCFAIL); - fprintf(stderr, "\t0x%x\tDUP .. PCR or optical duplicate\n", BAM_FDUP); - fprintf(stderr, "\t0x%x\tSUPPLEMENTARY .. supplementary alignment\n", BAM_FSUPPLEMENTARY); - fprintf(stderr, "\n"); + static const struct { int bit; const char *desc; } *fl, flags[] = { + { BAM_FPAIRED, "paired-end / multiple-segment sequencing technology" }, + { BAM_FPROPER_PAIR, "each segment properly aligned according to aligner" }, + { BAM_FUNMAP, "segment unmapped" }, + { BAM_FMUNMAP, "next segment in the template unmapped" }, + { BAM_FREVERSE, "SEQ is reverse complemented" }, + { BAM_FMREVERSE, "SEQ of next segment in template is rev.complemented" }, + { BAM_FREAD1, "the first segment in the template" }, + { BAM_FREAD2, "the last segment in the template" }, + { BAM_FSECONDARY, "secondary alignment" }, + { BAM_FQCFAIL, "not passing quality controls or other filters" }, + { BAM_FDUP, "PCR or optical duplicate" }, + { BAM_FSUPPLEMENTARY, "supplementary alignment" }, + { 0, NULL } + }; + + fprintf(fp, +"About: Convert between textual and numeric flag representation\n" +"Usage: samtools flags FLAGS...\n" +"\n" +"Each FLAGS argument is either an INT (in decimal/hexadecimal/octal) representing\n" +"a combination of the following numeric flag values, or a comma-separated string\n" +"NAME,...,NAME representing a combination of the following flag names:\n" +"\n"); + for (fl = flags; fl->desc; fl++) { + char *name = bam_flag2str(fl->bit); + fprintf(fp, "%#6x %5d %-15s%s\n", fl->bit, fl->bit, name, fl->desc); + free(name); + } } int main_flags(int argc, char *argv[]) { - if ( argc!=2 ) usage(); - else + if ( argc < 2 ) { usage(stdout); return 0; } + + int i; + for (i = 1; i < argc; i++) { - int mask = bam_str2flag(argv[1]); - if ( mask<0 ) { fprintf(stderr,"Error: Could not parse \"%s\"\n", argv[1]); usage(); return 1; } - printf("0x%x\t%d\t%s\n", mask, mask, 
bam_flag2str(mask)); + int mask = bam_str2flag(argv[i]); + if ( mask<0 ) { print_error("flags", "Could not parse \"%s\"", argv[i]); usage(stderr); return 1; } + char *str = bam_flag2str(mask); + printf("0x%x\t%d\t%s\n", mask, mask, str); + free(str); } return 0; } diff -Nru samtools-1.11/bam.h samtools-1.13/bam.h --- samtools-1.11/bam.h 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam.h 2021-07-07 17:18:52.000000000 +0000 @@ -38,7 +38,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "1.11" +#define BAM_VERSION "1.13" #include #include @@ -77,7 +77,7 @@ #define BAM_OFHEX 1 #define BAM_OFSTR 2 -/*! @abstract defautl mask for pileup */ +/*! @abstract default mask for pileup */ #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) /*! @typedef diff -Nru samtools-1.11/bam_import.c samtools-1.13/bam_import.c --- samtools-1.11/bam_import.c 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/bam_import.c 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,487 @@ +/* bam_import -- Import of FASTQ files. + * + * samtools import -1 a_1.fq -2 a_2.fq --i1 a_i1.fq --i2 a_i2.fq + * samtools import a_1.fq a_2.fq + * samtools import a_interleaved.fq + * + * Copyright (C) 2020 Genome Research Ltd. + * + * Author: James Bonfield + */ + +/* +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + +// TODO: Store other non-aux comments; in new sam tag? + +#include +#include + +#include "htslib/sam.h" +#include "htslib/thread_pool.h" + +#include "samtools.h" +#include "sam_opts.h" + +static int usage(FILE *fp, int exit_status) { + fprintf(fp, "Usage: samtools import [options] [file.fastq ...]\n"); + fprintf(fp, "\n"); + fprintf(fp, "Options:\n"); + fprintf(fp, " -s FILE Read paired-ended data from single FILE\n"); + fprintf(fp, " -0 FILE Read single-ended data from FILE\n"); + fprintf(fp, " -1 FILE Read-1 from FILE\n"); + fprintf(fp, " -2 FILE Read-2 from FILE\n"); + fprintf(fp, " --i1 FILE Index-1 from FILE\n"); + fprintf(fp, " --i2 FILE Index-2 from FILE\n"); + fprintf(fp, " -i Parse CASAVA identifier\n"); + fprintf(fp, " --barcode-tag TAG\n"); + fprintf(fp, " Tag to use with barcode sequences [BC]\n"); + fprintf(fp, " --quality-tag TAG\n"); + fprintf(fp, " Tag to use with barcode qualities [QT]\n"); + fprintf(fp, " -r STRING Build up a complete @RG line\n"); + fprintf(fp, " -R STRING Add a simple RG line of \"@RG\\tID:STRING\"\n"); + fprintf(fp, " -T TAGLIST Parse tags in SAM format; list of '*' for all\n"); + fprintf(fp, " -o FILE Output to FILE instead of stdout\n"); + fprintf(fp, " -u Uncompressed output\n"); + fprintf(fp, " --order TAG Store Nth record count in TAG\n"); + fprintf(fp, "\n"); + sam_global_opt_help(fp, "-.O.-@--"); + + fprintf(fp, "\nA single fastq file will be interpreted as -s, -0 or -1 depending on\n"); + fprintf(fp, "file contents, and a pair of fastq files as \"-1 
FILE1 -2 FILE2\".\n"); + + return exit_status; +} + +// Order matters here as we want to read index elements before main +// sequences so on reading the seqs we can emit a fully annotated record. +enum fileno { + FQ_I1, FQ_I2, // index seqs for R1 and R2 + FQ_R0, // single file and unpaired data (single-ended tech). + FQ_R1, FQ_R2, // separate read1 and read2 files + FQ_SINGLE, // single file, but with read1 and/or read2 present. + FQ_END +}; + +typedef struct { + sam_global_args ga; + int no_pg; + char *fn[FQ_END], *fn_out; + int idx_both; // add index to READ2 too, not just READ1 + int casava; + char *barcode_seq; + char *barcode_qual; + char *aux; + char *rg; + char *rg_line; + char *order; + int compress_level; + htsThreadPool p; +} opts_t; + +// Append a sequence and quality string from a BAM record to a BC:Z and +// QT:Z style aux tag string. +static int append_index(kstring_t *s, kstring_t *q, bam1_t *b) { + char *sp, *qp; + if (ks_resize(s, s->l + b->core.l_qseq+1 +1) < 0) + return -1; + if (ks_resize(q, q->l + b->core.l_qseq+1 +1) < 0) + return -1; + + sp = s->s + s->l - (s->l > 0); + qp = q->s + q->l - (q->l > 0); + + if (s->l) + *sp++ = '-'; + + if (q->l) + *qp++ = ' '; + + int i; + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; i++) { + *sp++ = seq_nt16_str[bam_seqi(seq, i)]; + *qp++ = qual[i] + '!'; + } + *sp++ = 0; + *qp++ = 0; + + s->l = sp - s->s; + q->l = qp - q->s; + + return 0; +} + +static int import_fastq(int argc, char **argv, opts_t *opts) { + int i, n, ret = 0; + samFile *fp_in[FQ_END] = {NULL}; + bam1_t *b = bam_init1(); + int ids[FQ_END]; + samFile *fp_out = NULL; + sam_hdr_t *hdr_out = NULL; + kstring_t index_str = {0,0}; + kstring_t read_str = {0,0}; + char *rg = opts->rg; + kstring_t rg_line = {0,0}; + uint64_t read_num = 0; + kstring_t idx_seq = {0}; + kstring_t idx_qual = {0}; + + // Any additional arguments are assumed to be r1 r2, as a + // short cut. 
We support reading index tags out of those too (eg + // Illumina CASAVA format), but if we do that we lack the barcode + // quality string. + // + // We also consider a read name ending in /1 or /2 to be a single + // file containing interleaved fastq records for both ends. + // These will be labeled as fn[FQ_R1] but adjusted during reading. + if (argc == 1) + opts->fn[FQ_SINGLE] = argv[0]; + else + for (i = 0; i < 4; i++) + if (argc > i) + opts->fn[FQ_R1+i] = argv[i]; + + // Open all files + for (i = n = 0; i < FQ_END; i++) { + if (!opts->fn[i]) + continue; + fp_in[i] = sam_open_format(opts->fn[i], "r", &opts->ga.in); + if (!fp_in[i]) { + perror(opts->fn[i]); + ret = -1; + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_in[i], &opts->p); + ids[n++] = i; + + if (opts->casava) + hts_set_opt(fp_in[i], FASTQ_OPT_CASAVA, 1); + if (opts->barcode_seq) // for auto-CASAVA parsing + hts_set_opt(fp_in[i], FASTQ_OPT_BARCODE, opts->barcode_seq); + if (opts->aux) + hts_set_opt(fp_in[i], FASTQ_OPT_AUX, + *opts->aux == '*' || *opts->aux == '\0' + ? 
NULL : opts->aux); + + switch (i) { + case FQ_I1: + kputs("--i1 I1.fastq ", &read_str); + kputs("i*", &index_str); + break; + case FQ_I2: + kputs("--i2 I2.fastq ", &read_str); + kputs("i*", &index_str); + break; + + case FQ_R0: + kputs("-0 unpaired.fastq ", &read_str); + break; + + case FQ_R1: + kputs("-1 R1.fastq ", &read_str); + break; + + case FQ_R2: + kputs("-2 R2.fastq ", &read_str); + break; + + case FQ_SINGLE: + kputs("-N -o paired.fastq ", &read_str); + break; + + default: + ks_clear(&read_str); // not reversible + kputs("", &read_str); + } + } + if (n == 0) { + bam_destroy1(b); + return usage(stdout, EXIT_SUCCESS); + } + + char out_mode[10] = {'w', 0, 0}; + if (opts->compress_level != -1) + out_mode[1] = '0' + opts->compress_level; + sam_open_mode(out_mode+strlen(out_mode), opts->fn_out, NULL); + fp_out = sam_open_format(opts->fn_out, out_mode, &opts->ga.out); + if (!fp_out) { + perror(opts->fn_out); + goto err; + } + if (opts->p.pool) + hts_set_thread_pool(fp_out, &opts->p); + + // Create header + if (ks_len(&read_str)) { + char CO[2100]; + if (ks_len(&index_str)) + snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s " + "--index-format=\"%s\"\n", + ks_str(&read_str), ks_str(&index_str)); + else + snprintf(CO, sizeof(CO), "@CO\tReverse with: samtools fastq %s\n", + ks_str(&read_str)); + + hdr_out = sam_hdr_parse(strlen(CO), CO); + } else { + hdr_out = sam_hdr_init(); + } + + // Read group + if (opts->rg_line) { + if (*opts->rg_line != '@') + ksprintf(&rg_line, "@RG\t%s", opts->rg_line); + else + kputs(opts->rg_line, &rg_line); + } else if (opts->rg) { + ksprintf(&rg_line, "@RG\tID:%s", opts->rg); + } + + if (ks_len(&rg_line)) { + if (sam_hdr_add_lines(hdr_out, ks_str(&rg_line), 0) < 0) + goto err; + rg = strstr(ks_str(&rg_line), "\tID:"); + if (!rg) { + fprintf(stderr, "\"-r RG-LINE\" option contained no ID field\n"); + goto err; + } + rg += 4; + + i = 0; + while (rg[i] != '\t' && rg[i] != '\0') + i++; + rg[i] = 0; + } + + if ((ret = 
sam_hdr_write(fp_out, hdr_out)) < 0) + goto err; + + + // Interleave / combine from n files (ids[0..n-1]). + int res; + int eof = 0; + do { + idx_seq.l = idx_qual.l = 0; + for (i = 0; i < n; i++) { + if ((res = sam_read1(fp_in[ids[i]], NULL, b)) < 0) { + if (res == -1) { + eof++; + continue; + } else + break; + } + + // index + if (ids[i] == FQ_I1 || ids[i] == FQ_I2) { + if (append_index(&idx_seq, &idx_qual, b) < 0) { + res = -1; + break; + } + continue; + } + + // full read + if (idx_seq.l) { + if (opts->idx_both || ids[i] == FQ_SINGLE || + ids[i] == FQ_R0 || ids[i] == FQ_R1) { + if (bam_aux_append(b, opts->barcode_seq, 'Z', idx_seq.l, + (uint8_t *)idx_seq.s) || + bam_aux_append(b, opts->barcode_qual, 'Z', idx_qual.l, + (uint8_t *)idx_qual.s)) { + res = -1; + break; + } + } + } + + switch(ids[i]) { + case FQ_R0: + // unpaired; no flags to declare + break; + case FQ_SINGLE: + // paired (but don't know if R1 or R2) or unpaired. + // We rely on the /1 and /2 read suffix parsing in htslib + // to distinguish the two cases, or CASAVA tags if + // explicitly enabled. + break; + case FQ_R1: + if ((b->core.flag & (BAM_FREAD1 | BAM_FREAD2)) == 0) + b->core.flag |= BAM_FREAD1; + b->core.flag |= BAM_FPAIRED; + if (i+1 < n && ids[i+1] == FQ_R2) + b->core.flag |= BAM_FMUNMAP; + break; + case FQ_R2: + b->core.flag |= BAM_FPAIRED | BAM_FREAD2; + if (i > 0 && ids[i-1] == FQ_R1) + b->core.flag |= BAM_FMUNMAP; + break; + } + + if (rg) { + if (bam_aux_append(b, "RG", 'Z', strlen(rg)+1, + (uint8_t *)rg) < 0) { + ret = -1; + goto err; + } + } + + if (opts->order) { + if (bam_aux_update_int(b, opts->order, read_num++) < 0) { + ret = -1; + goto err; + } + } + + res = sam_write1(fp_out, hdr_out, b); + } + } while (res >= 0); + + if (res != -1) { + print_error("import", "truncated file. 
Aborting"); + ret = res; + goto err; + } + + if (eof != n) { + print_error("import", "input files with differing number of records"); + ret = -1; + goto err; + } + + // Close and return + ret = 0; +err: + bam_destroy1(b); + sam_hdr_destroy(hdr_out); + ks_free(&rg_line); + ks_free(&index_str); + ks_free(&read_str); + if (fp_out) { + if (sam_close(fp_out) < 0) { + perror(opts->fn_out); + ret |= -1; + } + } + for (i = 0; i < FQ_END; i++) { + if (fp_in[i] && sam_close(fp_in[i]) < 0) { + perror(opts->fn[i]); + ret |= -1; + } + } + ks_free(&idx_seq); + ks_free(&idx_qual); + + return ret; +} + +int main_import(int argc, char *argv[]) { + int c; + opts_t opts = { + .no_pg = 0, + .ga = SAM_GLOBAL_ARGS_INIT, + .fn = {NULL}, + .fn_out = "-", + .casava = 0, + .barcode_seq = "BC", + .barcode_qual = "QT", + .aux = NULL, + .rg = NULL, + .rg_line = NULL, + .order = NULL, + .compress_level = -1, + }; + kstring_t rg = {0}; + + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, '-', '@'), + {"no-PG", no_argument, NULL, 9}, + {"i1", required_argument, NULL, 1}, + {"i2", required_argument, NULL, 2}, + {"r1", required_argument, NULL, '1'}, + {"r2", required_argument, NULL, '2'}, + {"rg", required_argument, NULL, 'R'}, + {"rg-line", required_argument, NULL, 'r'}, + {"order", required_argument, NULL, 3}, + {"barcode-tag", required_argument, NULL, 4}, + {"quality-tag", required_argument, NULL, 5}, + { NULL, 0, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, "1:2:s:0:bhiT:r:R:o:O:u@:", lopts, NULL)) >= 0) { + switch (c) { + case 'b': opts.idx_both = 1; break; + case '0': opts.fn[FQ_R0] = optarg; break; + case '1': opts.fn[FQ_R1] = optarg; break; + case '2': opts.fn[FQ_R2] = optarg; break; + case 1: opts.fn[FQ_I1] = optarg; break; + case 2: opts.fn[FQ_I2] = optarg; break; + case 's': opts.fn[FQ_SINGLE] = optarg; break; + case 'o': opts.fn_out = optarg; break; + case 'i': opts.casava = 1; break; + case 4: opts.barcode_seq = optarg; break; + case 5: 
opts.barcode_qual = optarg; break; + case 'T': opts.aux = optarg; break; + case 'u': opts.compress_level = 0; break; + case 'R': opts.rg = optarg; break; + case 'r': + if (*optarg != '@' && ks_len(&rg) == 0) + kputs("@RG", &rg); + if (ks_len(&rg)) + kputc_('\t', &rg); + kputs(optarg, &rg); + opts.rg_line = rg.s; + break; + + case 9: opts.no_pg = 1; break; + case 3: opts.order = optarg; break; + + case 'h': return usage(stdout, EXIT_SUCCESS); + case '?': return usage(stderr, EXIT_FAILURE); + + default: + if (parse_sam_global_opt(c, optarg, lopts, &opts.ga) != 0) + return usage(stderr, EXIT_FAILURE); + break; + } + } + + if (opts.ga.nthreads > 0) { + if (!(opts.p.pool = hts_tpool_init(opts.ga.nthreads))) { + fprintf(stderr, "Failed to create thread pool\n"); + if (rg.s) + free(rg.s); + return -1;; + } + } + + int ret = import_fastq(argc-optind, argv+optind, &opts) ? 1 : 0; + + if (rg.s) + free(rg.s); + + if (opts.p.pool) + hts_tpool_destroy(opts.p.pool); + + return ret; +} diff -Nru samtools-1.11/bam_markdup.c samtools-1.13/bam_markdup.c --- samtools-1.11/bam_markdup.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_markdup.c 2021-07-07 17:18:52.000000000 +0000 @@ -62,6 +62,7 @@ int mode; int write_index; int include_fails; + int check_chain; char *stats_file; char *arg_list; char *out_fn; @@ -83,6 +84,7 @@ bam1_t *b; struct read_queue_s *duplicate; hts_pos_t pos; + int dup_checked; } read_queue_t; typedef struct { @@ -94,8 +96,23 @@ char type; } dup_map_t; +typedef struct { + bam1_t *b; + int64_t score; + int64_t mate_score; + long x; + long y; + int opt; + int xpos; +} check_t; +typedef struct { + check_t *c; + size_t size; + size_t length; +} check_list_t; + static khint32_t do_hash(unsigned char *key, khint32_t len); static khint_t hash_key(key_data_t key) { @@ -665,6 +682,7 @@ } +/* Get the position of the coordinates from the read name. 
*/ static inline int get_coordinate_positions(const char *qname, int *xpos, int *ypos) { int sep = 0; int pos = 0; @@ -693,6 +711,66 @@ return sep; } + +static int get_coordinates(const char *name, int *xpos_out, long *x_coord, long *y_coord, long *warnings) { + int ret = 1; + int seps, xpos = 0, ypos = 0; + long x = 0, y = 0; + char *end; + + seps = get_coordinate_positions(name, &xpos, &ypos); + + /* The most current Illumina read format at time of writing is: + @machine:run:flowcell:lane:tile:x:y:UMI or + @machine:run:flowcell:lane:tile:x:y + + Counting the separating colons gives us a quick format check. + Older name formats have fewer elements. + */ + + if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", name); + } + + return ret; + } + + x = strtol(name + xpos, &end, 10); + + if ((name + xpos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name); + } + + return ret; + } + + y = strtol(name + ypos, &end, 10); + + if ((name + ypos) == end) { + (*warnings)++; + + if (*warnings <= BMD_WARNING_MAX) { + fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name); + } + + return ret; + } + + *x_coord = x; + *y_coord = y; + *xpos_out = xpos; + ret = 0; + + return ret; +} + + /* Using the coordinates from the Illumina read name, see whether the duplicated read is close enough (set by max_dist) to the original to be counted as optical.*/ @@ -806,6 +884,59 @@ } +/* Using the coordinates from the Illumina read name, see whether the duplicated read is + close enough (set by max_dist) to the original to be counted as optical. + + This function needs the values from the first read to be already calculated. 
*/ + +static int optical_duplicate_partial(const char *name, const int oxpos, const long ox, const long oy, bam1_t *dup, check_t *c, long max_dist, long *warnings) { + int ret = 0; + char *duplicate; + int dxpos = 0; + long dx, dy; + + duplicate = bam_get_qname(dup); + + if (get_coordinates(duplicate, &dxpos, &dx, &dy, warnings)) { + return ret; + } + + if (strncmp(name, duplicate, oxpos - 1) == 0) { + // the initial parts match, look at the numbers + long xdiff, ydiff; + + if (ox > dx) { + xdiff = ox - dx; + } else { + xdiff = dx - ox; + } + + if (xdiff <= max_dist) { + // still might be optical + + if (oy > dy) { + ydiff = oy - dy; + } else { + ydiff = dy - oy; + } + + if (ydiff <= max_dist) ret = 1; + } + } + + c->x = dx; + c->y = dy; + c->xpos = dxpos; + + if (ret) { + c->opt = ret; + } + + return ret; +} + + +/* Mark the read as a duplicate and update the duplicate hash (if needed) */ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, long *optical, long *warn) { char dup_type = 0; @@ -814,7 +945,7 @@ dup->core.flag |= BAM_FDUP; if (param->tag) { - if (bam_aux_append(dup, "do", 'Z', strlen(bam_get_qname(ori)) + 1, (uint8_t*)bam_get_qname(ori))) { + if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) { fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); return -1; } @@ -822,12 +953,12 @@ if (param->opt_dist) { // mark optical duplicates if (optical_duplicate(ori, dup, param->opt_dist, warn)) { - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(dup, "dt", 3, "SQ"); dup_type = 'O'; (*optical)++; } else { // not an optical duplicate - bam_aux_append(dup, "dt", 'Z', 3, (const uint8_t *)"LB"); + bam_aux_update_str(dup, "dt", 3, "LB"); } } @@ -853,17 +984,12 @@ } +/* If the duplicate type has changed to optical then retag and duplicate hash. 
*/ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { int ret = 0; - uint8_t *data; - - // remove any existing dt tag - if ((data = bam_aux_get(b, "dt")) != NULL) { - bam_aux_del(b, data); - } - if (bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ")) { - fprintf(stderr, "[markdup] error: unable to append 'dt' tag.\n"); + if (bam_aux_update_str(b, "dt", 3, "SQ")) { + fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n"); ret = -1; } @@ -897,26 +1023,54 @@ } +/* Check all duplicates of the highest quality read (the "original") for consistency. Also + pre-calculate any values for use in check_duplicate_chain later. + Returns 0 on success, >0 on coordinate reading error (program can continue) or + <0 on an error (program should not continue). */ +static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, + check_list_t *list, long *warn, long *optical_single, long *optical_pair) { -/* - Where there is more than one duplicate go down the list and check for optical duplicates and change - do tags (where used) to point to original (non-duplicate) read. 
-*/ -static int duplicate_chain_check(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, - long *warn, long *optical_single, long *optical_pair) { int ret = 0; - read_queue_t *current = ori->duplicate; char *ori_name = bam_get_qname(ori->b); - int have_original = !(ori->b->core.flag & BAM_FDUP); - int ori_paired = (ori->b->core.flag & BAM_FPAIRED) && !(ori->b->core.flag & BAM_FMUNMAP); + read_queue_t *current = ori->duplicate; + int xpos; + long x, y; - if (!(param->tag || param->opt_dist)) - return ret; // nothing to do here + if (param->opt_dist) { + if ((ret = get_coordinates(ori_name, &xpos, &x, &y, warn))) { + return ret; + } + } + + list->length = 0; while (current) { - int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + check_t *c; + + if (list->length >= list->size) { + check_t *tmp; - if (param->tag && have_original) { + list->size *= 2; + + if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) { + fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n"); + return -1; + } + + list->c = tmp; + } + + c = &list->c[list->length]; + + c->b = current->b; + c->x = -1; + c->y = -1; + c->opt = 0; + c->score = 0; + c->mate_score = 0; + current->dup_checked = 1; + + if (param->tag) { uint8_t *data; // at this stage all duplicates should have a do tag @@ -926,10 +1080,8 @@ if (old_name) { if (strcmp(old_name, ori_name) != 0) { - bam_aux_del(current->b, data); - - if (bam_aux_append(current->b, "do", 'Z', strlen(ori_name) + 1, (uint8_t*)ori_name)) { - fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) { + fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n"); ret = -1; break; } @@ -943,118 +1095,226 @@ } if (param->opt_dist) { - int is_cur_opt = 0, is_ori_opt = 0; uint8_t *data; char *dup_type; + int is_opt = 0; + int current_paired = (current->b->core.flag & 
BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); - if ((data = bam_aux_get(ori->b, "dt"))) { + if ((data = bam_aux_get(current->b, "dt"))) { if ((dup_type = bam_aux2Z(data))) { if (strcmp(dup_type, "SQ") == 0) { - is_ori_opt = 1; + c->opt = 1; } } } - if ((data = bam_aux_get(current->b, "dt"))) { - if ((dup_type = bam_aux2Z(data))) { - if (strcmp(dup_type, "SQ") == 0) { - is_cur_opt = 1; - } + // need to run this to get the duplicates x and y scores + is_opt = optical_duplicate_partial(ori_name, xpos, x, y, current->b, c, param->opt_dist, warn); + + if (!c->opt && is_opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + break; + } + + c->opt = 1; + } + + c->score = calc_score(current->b); + + if (current_paired) { + if ((c->mate_score = get_mate_score(current->b)) == -1) { + fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + ret = -1; + break; } } + } - if (!(is_ori_opt && is_cur_opt)) { - // if both are already optical duplicates there is no need to check again, otherwise... + current = current->duplicate; + list->length++; + } - if (optical_duplicate(ori->b, current->b, param->opt_dist, warn)) { - // find out which one is the duplicate - int is_cur_dup = 0; - - if (have_original) { - // compared against an original, this is a dup. - is_cur_dup = 1; - } else if (ori_paired != current_paired) { - if (!current_paired) { - // current is single vs pair, this is a dup. - is_cur_dup = 1; - } - } else { - // do it by scores - int64_t ori_score, curr_score; + return ret; +} - if ((ori->b->core.flag & BAM_FQCFAIL) != (current->b->core.flag & BAM_FQCFAIL)) { - if (ori->b->core.flag & BAM_FQCFAIL) { - ori_score = 0; - curr_score = 1; - } else { - ori_score = 1; - curr_score = 0; - } - } else { - ori_score = calc_score(ori->b); - curr_score = calc_score(current->b); - if (current_paired) { - // they are pairs so add mate scores. 
- int64_t mate_tmp; - - if ((mate_tmp = get_mate_score(ori->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - ori_score += mate_tmp; - } +static int xcoord_sort(const void *a, const void *b) { + check_t *ac = (check_t *) a; + check_t *bc = (check_t *) b; - if ((mate_tmp = get_mate_score(current->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); - ret = -1; - break; - } else { - curr_score += mate_tmp; - } - } - } + return (ac->x - bc->x); +} - if (ori_score == curr_score) { - if (strcmp(bam_get_qname(current->b), ori_name) < 0) { - curr_score++; - } else { - curr_score--; - } - } - if (ori_score > curr_score) { - is_cur_dup = 1; - } +/* Check all the duplicates against each other to see if they are optical duplicates. */ +static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list, + long *warn, long *optical_single, long *optical_pair) { + int ret = 0; + size_t curr = 0; + + qsort(list->c, list->length, sizeof(list->c[0]), xcoord_sort); + + while (curr < list->length - 1) { + check_t *current = &list->c[curr]; + size_t count = curr; + char *cur_name = bam_get_qname(current->b); + int current_paired = (current->b->core.flag & BAM_FPAIRED) && !(current->b->core.flag & BAM_FMUNMAP); + + while (++count < list->length && (list->c[count].x - current->x <= param->opt_dist)) { + // while close enough along the x coordinate + check_t *chk = &list->c[count]; + + if (current->opt && chk->opt) + continue; + + // if both are already optical duplicates there is no need to check again, otherwise... 
+ + long ydiff; + + if (current->y > chk->y) { + ydiff = current->y - chk->y; + } else { + ydiff = chk->y - current->y; + } + + if (ydiff > param->opt_dist) + continue; + + // the numbers are right, check the names + if (strncmp(cur_name, bam_get_qname(chk->b), current->xpos - 1) != 0) + continue; + + // optical duplicates + int chk_dup = 0; + int chk_paired = (chk->b->core.flag & BAM_FPAIRED) && !(chk->b->core.flag & BAM_FMUNMAP); + + if (current_paired != chk_paired) { + if (!chk_paired) { + // chk is single vs pair, this is a dup. + chk_dup = 1; + } + } else { + // do it by scores + int64_t cur_score, chk_score; + + if ((current->b->core.flag & BAM_FQCFAIL) != (chk->b->core.flag & BAM_FQCFAIL)) { + if (current->b->core.flag & BAM_FQCFAIL) { + cur_score = 0; + chk_score = 1; + } else { + cur_score = 1; + chk_score = 0; } + } else { + cur_score = current->score; + chk_score = chk->score; - if (is_cur_dup) { - // the current is the optical duplicate - if (!is_cur_opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + if (current_paired) { + // they are pairs so add mate scores. 
+ chk_score += chk->mate_score; + cur_score += current->mate_score; + } + } + + if (cur_score == chk_score) { + if (strcmp(bam_get_qname(chk->b), cur_name) < 0) { + chk_score++; } else { - if (!is_ori_opt) { - if (optical_retag(param, dup_hash, ori->b, ori_paired, optical_single, optical_pair)) { - ret = -1; - break; - } - } + chk_score--; + } + } + + if (cur_score > chk_score) { + chk_dup = 1; + } + } + + if (chk_dup) { + // the duplicate is the optical duplicate + if (!chk->opt) { // only change if not already an optical duplicate + if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; + } + + chk->opt = 1; + } + } else { + if (!current->opt) { + if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + ret = -1; + goto fail; } + + current->opt = 1; } } } - current = current->duplicate; + curr++; + } + + fail: + return ret; +} + + +/* Where there is more than one duplicate go down the list and check for optical duplicates and change + do tags (where used) to point to original (non-duplicate) read. */ +static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list, + const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single, + long *optical_pair, const int check_range) { + int ret = 0; + kliter_t(read_queue) *rq; + + rq = kl_begin(read_buffer); + + while (rq != kl_end(read_buffer)) { + read_queue_t *in_read = &kl_val(rq); + + if (check_range) { + /* Just check against the moving window of reads based on coordinates and max read length. 
*/ + if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { + break; + } + } else { + // this is the last set of results and the end entry will be blank + if (!bam_get_qname(in_read->b)) { + break; + } + } + + if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain + + // check against the original for tagging and optical duplication + if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) { + if (ret < 0) { // real error + ret = -1; + break; + } else { // coordinate decoding error + ret = 0; + in_read->duplicate = NULL; + continue; + } + } + + // check the rest of the duplicates against each other for optical duplication + if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) { + ret = -1; + break; + } + + in_read->duplicate = NULL; + } + + rq = kl_next(rq); } return ret; } + /* Function to use when estimating library size. 
@@ -1155,6 +1415,7 @@ tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; + check_list_t dup_list = {NULL, 0, 0}; if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { fprintf(stderr, "[markdup] out of memory\n"); @@ -1215,10 +1476,24 @@ goto fail; } + if (param->check_chain && !(param->tag || param->opt_dist)) + param->check_chain = 0; + + if (param->check_chain) { + dup_list.size = 128; + dup_list.c = NULL; + + if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) { + fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n"); + goto fail; + } + } + reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; np_duplicate = np_opt_duplicate = 0; while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { + int dup_checked = 0; // do some basic coordinate order checks if (in_read->b->core.tid >= 0) { // -1 for unmapped reads @@ -1233,6 +1508,8 @@ prev_tid = in_read->b->core.tid; in_read->pair_key.single = 1; in_read->single_key.single = 0; + in_read->duplicate = NULL; + in_read->dup_checked = 0; reading++; @@ -1259,7 +1536,7 @@ // read must not be secondary, supplementary, unmapped or (possibly) failed QC if (!(in_read->b->core.flag & exclude)) { examined++; - in_read->duplicate = NULL; + // look at the pairs first if ((in_read->b->core.flag & BAM_FPAIRED) && !(in_read->b->core.flag & BAM_FMUNMAP)) { @@ -1302,17 +1579,15 @@ // scores more than one read of the pair bam1_t *dup = bp->p->b; - in_read->duplicate = bp->p; + if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) goto fail; single_dup++; - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - } } else { fprintf(stderr, "[markdup] error: single hashing failure.\n"); @@ -1329,8 +1604,7 @@ in_read->pair_key = pair_key; } else if (ret == 0) { int64_t 
old_score, new_score, tie_add = 0; - bam1_t *dup; - int check_chain = 0; + bam1_t *dup = NULL; bp = &kh_val(pair_hash, k); @@ -1371,29 +1645,48 @@ if (new_score + tie_add > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) { + + if (in_read->duplicate) { + read_queue_t *current = in_read->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = bp->p; + } else { + in_read->duplicate = bp->p; + } + } + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + if (in_read->duplicate) { + read_queue_t *current = bp->p->duplicate; + + while (current->duplicate) { + current = current->duplicate; + } + + current->duplicate = in_read->duplicate; + } + + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) goto fail; - if (check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - duplicate++; } else { fprintf(stderr, "[markdup] error: pair hashing failure.\n"); @@ -1403,7 +1696,6 @@ int ret; key_data_t single_key; in_hash_t *bp; - int check_chain = 0; make_single_key(&single_key, in_read->b); @@ -1422,29 +1714,20 @@ if ((bp->p->b->core.flag & BAM_FPAIRED) && !(bp->p->b->core.flag & BAM_FMUNMAP)) { // if matched against one of a pair just mark as duplicate - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; - } - - bp->p->duplicate = in_read; - - if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) - goto fail; + if (param->check_chain) { + if 
(bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } - if (check_chain) { - // check the new duplicate entry in the chain - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; + bp->p->duplicate = in_read; } - // check against the new original - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) goto fail; } else { int64_t old_score, new_score; - bam1_t *dup; + bam1_t *dup = NULL; old_score = calc_score(bp->p->b); new_score = calc_score(in_read->b); @@ -1453,32 +1736,26 @@ // to the single hash and mark the other as duplicate if (new_score > old_score) { // swap reads dup = bp->p->b; - in_read->duplicate = bp->p; + + if (param->check_chain) + in_read->duplicate = bp->p; + bp->p = in_read; } else { - if (bp->p->duplicate) { - in_read->duplicate = bp->p->duplicate; - check_chain = 1; + if (param->check_chain) { + if (bp->p->duplicate) { + in_read->duplicate = bp->p->duplicate; + } + + bp->p->duplicate = in_read; } - bp->p->duplicate = in_read; dup = in_read->b; } if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) goto fail; - - - if (check_chain) { - if (duplicate_chain_check(param, dup_hash, bp->p->duplicate, &opt_warnings, &single_optical, &optical)) - goto fail; - } - - if (duplicate_chain_check(param, dup_hash, bp->p, &opt_warnings, &single_optical, &optical)) - goto fail; - - - } + } single_dup++; } else { @@ -1502,6 +1779,22 @@ break; } + if (!dup_checked && param->check_chain) { + // check for multiple optical duplicates of the same original read + + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) { + fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + + dup_checked = 1; + } + + + if 
(param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) { + break; + } + if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { @@ -1552,6 +1845,14 @@ goto fail; } + // one last check + if (param->tag || param->opt_dist) { + if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) { + fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); + goto fail; + } + } + // write out the end of the list rq = kl_begin(read_buffer); while (rq != kl_end(read_buffer)) { @@ -1608,7 +1909,7 @@ np_duplicate++; if (param->tag && kh_val(dup_hash, k).name) { - if (bam_aux_append(b, "do", 'Z', strlen(kh_val(dup_hash, k).name) + 1, (uint8_t*)kh_val(dup_hash, k).name)) { + if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) { fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); goto fail; } @@ -1616,10 +1917,10 @@ if (param->opt_dist) { if (kh_val(dup_hash, k).type) { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"SQ"); + bam_aux_update_str(b, "dt", 3, "SQ"); np_opt_duplicate++; } else { - bam_aux_append(b, "dt", 'Z', 3, (const uint8_t *)"LB"); + bam_aux_update_str(b, "dt", 3, "LB"); } } } @@ -1705,6 +2006,9 @@ } } + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); kl_destroy(read_queue, read_buffer); @@ -1725,6 +2029,9 @@ } kh_destroy(duplicates, dup_hash); + if (param->check_chain && (param->tag || param->opt_dist)) + free(dup_list.c); + kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); sam_hdr_destroy(header); @@ -1747,9 +2054,11 @@ fprintf(stderr, " -m --mode TYPE Duplicate decision method for paired reads.\n" " TYPE = t measure positions based on template start/end 
(default).\n" " s measure positions based on sequence start.\n"); + fprintf(stderr, " -n Reduce optical duplicate accuracy (faster results with many duplicates).\n"); fprintf(stderr, " -u Output uncompressed data\n"); fprintf(stderr, " --include-fails Include quality check failed reads.\n"); fprintf(stderr, " --no-PG Do not add a PG line\n"); + fprintf(stderr, " --no-multi-dup Reduced duplicates of duplicates checking.\n"); fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." " Mainly for information and debugging.\n"); @@ -1770,17 +2079,18 @@ kstring_t tmpprefix = {0, 0, NULL}; struct stat st; unsigned int t; - md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL}; + md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), {"include-fails", no_argument, NULL, 1001}, {"no-PG", no_argument, NULL, 1002}, {"mode", required_argument, NULL, 'm'}, + {"no-multi-dup", no_argument, NULL, 1003}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:ncm:u", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rsl:StT:O:@:f:d:cm:u", lopts, NULL)) >= 0) { switch (c) { case 'r': param.remove_dups = 1; break; case 'l': param.max_length = atoi(optarg); break; @@ -1805,6 +2115,7 @@ case 'u': wmode[2] = '0'; break; case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; + case 1003: param.check_chain = 0; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); diff -Nru samtools-1.11/bam_mate.c samtools-1.13/bam_mate.c --- samtools-1.11/bam_mate.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_mate.c 2021-07-07 17:18:52.000000000 +0000 @@ -372,7 +372,7 @@ curr = 1 - curr; pre_end = cur_end; } - if (result < -1) goto fail; + if (result < -1) 
goto read_fail; if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped @@ -391,6 +391,10 @@ ks_free(&str); return 0; + read_fail: + print_error("fixmate", "Couldn't read from input file"); + goto fail; + write_fail: print_error_errno("fixmate", "Couldn't write to output file"); fail: diff -Nru samtools-1.11/bam_plcmd.c samtools-1.13/bam_plcmd.c --- samtools-1.11/bam_plcmd.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_plcmd.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_plcmd.c -- mpileup subcommand. - Copyright (C) 2008-2015, 2019-2020 Genome Research Ltd. + Copyright (C) 2008-2015, 2019-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -90,8 +90,10 @@ int del_len = -p->indel; if (p->indel > 0) { int len = bam_plp_insertion(p, ks, &del_len); - if (len < 0) + if (len < 0) { + print_error("mpileup", "bam_plp_insertion() failed"); return -1; + } putc('+', fp); printw(len, fp); if (bam_is_rev(p->b)) { char pad = rev_del ? 
'#' : '*'; @@ -916,6 +918,12 @@ } } + if (ret < 0) { + print_error("mpileup", "error reading from input file"); + ret = EXIT_FAILURE; + goto fail; + } + if (conf->all && !(conf->flag & MPLP_BCF)) { // Handle terminating region if (last_tid < 0 && conf->reg && conf->all > 1) { @@ -1096,9 +1104,9 @@ fprintf(fp, " -r, --region REG region in which pileup is generated\n" " -R, --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require); +" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n" +" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, " -x, --ignore-overlaps disable read-pair overlap detection\n" diff -Nru samtools-1.11/bam_sort.c samtools-1.13/bam_sort.c --- samtools-1.11/bam_sort.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_sort.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_sort.c -- sorting and merging. - Copyright (C) 2008-2019 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -724,6 +725,7 @@ // Get translated header lines and fill in map for @PG records pg_list = trans_rg_pg(false, translate, merge_pg, merged_hdr->pg_ids, tbl->pg_trans, NULL); + if (!pg_list) goto fail; // Fix-up PG: tags in the new @RG records and add to output if (finish_rg_pg(true, rg_list, tbl->pg_trans, &merged_hdr->out_rg)) @@ -1395,7 +1397,8 @@ static void merge_usage(FILE *to) { fprintf(to, -"Usage: samtools merge [-nurlf] [-h inh.sam] [-b ] [ ... ]\n" +"Usage: samtools merge [options] -o [options] ... \n" +" or: samtools merge [options] ... 
\n" "\n" "Options:\n" " -n Input files are sorted by read name\n" @@ -1403,6 +1406,7 @@ " -r Attach RG tag (inferred from file names)\n" " -u Uncompressed BAM output\n" " -f Overwrite the output BAM if exist\n" +" -o FILE Specify output file via option instead of argument\n" " -1 Compress level 1\n" " -l INT Compression level, from 0 to 9 [-1]\n" " -R STR Merge file in the specified region STR [all]\n" @@ -1421,7 +1425,7 @@ { int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0; char *fn_headers = NULL, *reg = NULL, mode[12]; - char *sort_tag = NULL, *arg_list = NULL; + char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL; long random_seed = (long)time(NULL); char** fn = NULL; char** fn_idx = NULL, *fn_bed = NULL; @@ -1440,12 +1444,13 @@ return 0; } - while ((c = getopt_long(argc, argv, "h:nru1R:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "h:nru1R:o:f@:l:cps:b:O:t:XL:", lopts, NULL)) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = optarg; break; case 'n': is_by_qname = 1; break; + case 'o': fnout = optarg; break; case 't': sort_tag = optarg; break; case '1': flag |= MERGE_LEVEL1; level = 1; break; case 'u': flag |= MERGE_UNCOMP; level = 0; break; @@ -1484,7 +1489,12 @@ case '?': merge_usage(stderr); return 1; } } - if ( argc - optind < 1 ) { + + if (fnout == NULL && argc - optind >= 1) { + fnout = argv[optind]; + optind++; + } + if (fnout == NULL) { print_error("merge", "You must at least specify the output file"); merge_usage(stderr); return 1; @@ -1496,11 +1506,10 @@ } hts_srand48(random_seed); - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); - if (fp != NULL) { - fclose(fp); - fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); + if (!(flag & MERGE_FORCE) && strcmp(fnout, "-") != 0) { + struct stat sbuf; + if (stat(fnout, &sbuf) == 0 && S_ISREG(sbuf.st_mode)) { + fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, fnout); ret = 1; goto end; } @@ -1508,26 +1517,26 @@ int nargcfiles = 0; if (has_index_file) { // Calculate # of input BAM files - if ((argc - optind - 1) % 2 != 0) { + if ((argc - optind) % 2 != 0) { fprintf(stderr, "Odd number of filenames detected! Each BAM file should have an index file\n"); ret = 1; goto end; } - nargcfiles = (argc - optind - 1) / 2; + nargcfiles = (argc - optind) / 2; } else { - nargcfiles = argc - optind - 1; + nargcfiles = argc - optind; } if (nargcfiles > 0) { // Add argc files to end of array fn = realloc(fn, (fn_size+nargcfiles) * sizeof(char*)); if (fn == NULL) { ret = 1; goto end; } - memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn+fn_size, argv + optind, nargcfiles * sizeof(char*)); if(has_index_file) { fn_idx = realloc(fn_idx, nargcfiles * sizeof(char*)); if (fn_idx == NULL) { ret = 1; goto end; } - memcpy(fn_idx+fn_size, argv + nargcfiles + (optind+1), nargcfiles * sizeof(char*)); + memcpy(fn_idx+fn_size, argv + nargcfiles + optind, nargcfiles * sizeof(char*)); } } if (fn_size+nargcfiles < 1) { @@ -1543,9 +1552,9 @@ goto end; } strcpy(mode, "wb"); - sam_open_mode(mode+1, argv[optind], NULL); + sam_open_mode(mode+1, fnout, NULL); if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? 
level : 9); - if (bam_merge_core2(is_by_qname, sort_tag, argv[optind], mode, fn_headers, + if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers, fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads, "merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0) ret = 1; @@ -2605,7 +2614,7 @@ " -o FILE Write final output to FILE rather than standard output\n" " -T PREFIX Write temporary files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n"); - sam_global_opt_help(fp, "-.O..@-."); + sam_global_opt_help(fp, "-.O..@.."); } static void complain_about_memory_setting(size_t max_mem) { diff -Nru samtools-1.11/bam_stat.c samtools-1.13/bam_stat.c --- samtools-1.11/bam_stat.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_stat.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bam_stat.c -- flagstat subcommand. - Copyright (C) 2009, 2011, 2013-2015, 2019 Genome Research Ltd. + Copyright (C) 2009, 2011, 2013-2015, 2019, 2021 Genome Research Ltd. Author: Heng Li @@ -42,32 +42,41 @@ long long n_dup[2]; long long n_diffchr[2], n_diffhigh[2]; long long n_secondary[2], n_supp[2]; + long long n_primary[2], n_pmapped[2], n_pdup[2]; } bam_flagstat_t; -#define flagstat_loop(s, c) do { \ - int w = ((c)->flag & BAM_FQCFAIL)? 
1 : 0; \ - ++(s)->n_reads[w]; \ - if ((c)->flag & BAM_FSECONDARY ) { \ - ++(s)->n_secondary[w]; \ - } else if ((c)->flag & BAM_FSUPPLEMENTARY ) { \ - ++(s)->n_supp[w]; \ - } else if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all[w]; \ - if (((c)->flag & BAM_FPROPER_PAIR) && !((c)->flag & BAM_FUNMAP) ) ++(s)->n_pair_good[w]; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ - if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map[w]; \ - if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr[w]; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ - } \ - } \ - } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) +inline static void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c) +{ + int w = (c->flag & BAM_FQCFAIL)? 1 : 0; + ++s->n_reads[w]; + if (c->flag & BAM_FSECONDARY ) { + ++s->n_secondary[w]; + } else if (c->flag & BAM_FSUPPLEMENTARY ) { + ++s->n_supp[w]; + } else { + ++s->n_primary[w]; + + if (c->flag & BAM_FPAIRED) { + ++s->n_pair_all[w]; + if ((c->flag & BAM_FPROPER_PAIR) && !(c->flag & BAM_FUNMAP) ) ++s->n_pair_good[w]; + if (c->flag & BAM_FREAD1) ++s->n_read1[w]; + if (c->flag & BAM_FREAD2) ++s->n_read2[w]; + if ((c->flag & BAM_FMUNMAP) && !(c->flag & BAM_FUNMAP)) ++s->n_sgltn[w]; + if (!(c->flag & BAM_FUNMAP) && !(c->flag & BAM_FMUNMAP)) { + ++s->n_pair_map[w]; + if (c->mtid != c->tid) { + ++s->n_diffchr[w]; + if (c->qual >= 5) ++s->n_diffhigh[w]; + } + } + } + + if (!(c->flag & BAM_FUNMAP)) ++s->n_pmapped[w]; + if (c->flag & BAM_FDUP) ++s->n_pdup[w]; + } + if (!(c->flag & BAM_FUNMAP)) ++s->n_mapped[w]; + if (c->flag & BAM_FDUP) ++s->n_dup[w]; +} bam_flagstat_t *bam_flagstat_core(samFile *fp, sam_hdr_t *h) { @@ -81,8 +90,10 @@ while ((ret = sam_read1(fp, h, b)) >= 0) flagstat_loop(s, c); bam_destroy1(b); - if (ret != -1) - 
fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + if (ret != -1) { + free(s); + return NULL; + } return s; } @@ -114,10 +125,13 @@ { char b0[16], b1[16]; printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld primary\n", s->n_primary[0], s->n_primary[1]); printf("%lld + %lld secondary\n", s->n_secondary[0], s->n_secondary[1]); printf("%lld + %lld supplementary\n", s->n_supp[0], s->n_supp[1]); printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld primary duplicates\n", s->n_pdup[0], s->n_pdup[1]); printf("%lld + %lld mapped (%s : %s)\n", s->n_mapped[0], s->n_mapped[1], percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + printf("%lld + %lld primary mapped (%s : %s)\n", s->n_pmapped[0], s->n_pmapped[1], percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); @@ -129,14 +143,18 @@ } static void out_fmt_json(bam_flagstat_t *s) { - char b0[16], b1[16]; + char b0[16], b1[16], p0[16], p1[16], pp0[16], pp1[16], s0[16], s1[16]; printf("{\n \"QC-passed reads\": { \n" " \"total\": %lld, \n" + " \"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": %lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": %lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -150,11 +168,15 @@ " }," "\n \"QC-failed reads\": { \n" " \"total\": %lld, \n" + " \"primary\": %lld, \n" " \"secondary\": %lld, \n" " \"supplementary\": %lld, \n" " \"duplicates\": %lld, \n" + " \"primary duplicates\": %lld, \n" " \"mapped\": 
%lld, \n" " \"mapped %%\": %s, \n" + " \"primary mapped\": %lld, \n" + " \"primary mapped %%\": %s, \n" " \"paired in sequencing\": %lld, \n" " \"read1\": %lld, \n" " \"read2\": %lld, \n" @@ -168,35 +190,43 @@ " }\n" "}\n", s->n_reads[0], + s->n_primary[0], s->n_secondary[0], s->n_supp[0], s->n_dup[0], + s->n_pdup[0], s->n_mapped[0], percent_json(b0, s->n_mapped[0], s->n_reads[0]), + s->n_pmapped[0], + percent_json(p0, s->n_pmapped[0], s->n_primary[0]), s->n_pair_all[0], s->n_read1[0], s->n_read2[0], s->n_pair_good[0], - percent_json(b0, s->n_pair_good[0], s->n_pair_all[0]), + percent_json(pp0, s->n_pair_good[0], s->n_pair_all[0]), s->n_pair_map[0], s->n_sgltn[0], - percent_json(b0, s->n_sgltn[0], s->n_pair_all[0]), + percent_json(s0, s->n_sgltn[0], s->n_pair_all[0]), s->n_diffchr[0], s->n_diffhigh[0], s->n_reads[1], + s->n_primary[1], s->n_secondary[1], s->n_supp[1], s->n_dup[1], + s->n_pdup[1], s->n_mapped[1], percent_json(b1, s->n_mapped[1], s->n_reads[1]), + s->n_pmapped[1], + percent_json(p1, s->n_pmapped[1], s->n_primary[1]), s->n_pair_all[1], s->n_read1[1], s->n_read2[1], s->n_pair_good[1], - percent_json(b1, s->n_pair_good[1], s->n_pair_all[1]), + percent_json(pp1, s->n_pair_good[1], s->n_pair_all[1]), s->n_pair_map[1], s->n_sgltn[1], - percent_json(b1, s->n_sgltn[1], s->n_pair_all[1]), + percent_json(s1, s->n_sgltn[1], s->n_pair_all[1]), s->n_diffchr[1], s->n_diffhigh[1] ); @@ -205,11 +235,15 @@ static void out_fmt_tsv(bam_flagstat_t *s) { char b0[16], b1[16]; printf("%lld\t%lld\ttotal (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld\t%lld\tprimary\n", s->n_primary[0], s->n_primary[1]); printf("%lld\t%lld\tsecondary\n", s->n_secondary[0], s->n_secondary[1]); printf("%lld\t%lld\tsupplementary\n", s->n_supp[0], s->n_supp[1]); printf("%lld\t%lld\tduplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld\t%lld\tprimary duplicates\n", s->n_pdup[0], s->n_pdup[1]); printf("%lld\t%lld\tmapped\n", s->n_mapped[0], s->n_mapped[1]); 
printf("%s\t%s\tmapped %%\n", percent(b0, s->n_mapped[0], s->n_reads[0]), percent(b1, s->n_mapped[1], s->n_reads[1])); + printf("%lld\t%lld\tprimary mapped\n", s->n_pmapped[0], s->n_pmapped[1]); + printf("%s\t%s\tprimary mapped %%\n", percent(b0, s->n_pmapped[0], s->n_primary[0]), percent(b1, s->n_pmapped[1], s->n_primary[1])); printf("%lld\t%lld\tpaired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); printf("%lld\t%lld\tread1\n", s->n_read1[0], s->n_read1[1]); printf("%lld\t%lld\tread2\n", s->n_read2[0], s->n_read2[1]); @@ -242,7 +276,7 @@ sam_hdr_t *header; bam_flagstat_t *s; const char *out_fmt = "default"; - int c; + int c, status = EXIT_SUCCESS; enum { INPUT_FMT_OPTION = CHAR_MAX+1, @@ -296,10 +330,17 @@ } s = bam_flagstat_core(fp, header); - output_fmt(s, out_fmt); - free(s); + if (s) { + output_fmt(s, out_fmt); + free(s); + } + else { + print_error("flagstat", "error reading from \"%s\"", argv[optind]); + status = EXIT_FAILURE; + } + sam_hdr_destroy(header); sam_close(fp); sam_global_args_free(&ga); - return 0; + return status; } diff -Nru samtools-1.11/bamtk.c samtools-1.13/bamtk.c --- samtools-1.11/bamtk.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bamtk.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2020 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. 
Author: Heng Li @@ -30,6 +30,7 @@ #include #include "htslib/hts.h" +#include "htslib/hfile.h" #include "samtools.h" #include "version.h" @@ -66,12 +67,76 @@ int fqidx_main(int argc, char *argv[]); int amplicon_clip_main(int argc, char *argv[]); int main_ampliconstats(int argc, char *argv[]); +int main_import(int argc, char *argv[]); const char *samtools_version() { return SAMTOOLS_VERSION; } +// These come out of the config.h file built by autoconf or Makefile +const char *samtools_feature_string(void) { + const char *fmt = + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef HAVE_CURSES + "curses=yes " +#else + "curses=no " +#endif + ; + + return fmt; +} + +static void long_version(void) { + printf("samtools %s\n" + "Using htslib %s\n" + "Copyright (C) 2021 Genome Research Ltd.\n", + samtools_version(), hts_version()); + + printf("\nSamtools compilation details:\n"); + printf(" Features: %s\n", samtools_feature_string()); + printf(" CC: %s\n", SAMTOOLS_CC); + printf(" CPPFLAGS: %s\n", SAMTOOLS_CPPFLAGS); + printf(" CFLAGS: %s\n", SAMTOOLS_CFLAGS); + printf(" LDFLAGS: %s\n", SAMTOOLS_LDFLAGS); + printf(" HTSDIR: %s\n", SAMTOOLS_HTSDIR); + printf(" LIBS: %s\n", SAMTOOLS_LIBS); + printf(" CURSES_LIB: %s\n", SAMTOOLS_CURSES_LIB); + + printf("\nHTSlib compilation details:\n"); + printf(" Features: %s\n", hts_feature_string()); + printf(" CC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + printf(" CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + printf(" CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + printf(" LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); + + // Plugins and schemes + printf("\nHTSlib URL scheme handlers present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hfile_list_plugins(plugins, &np) < 0) + return; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return; + + printf(" %s:\t", 
plugins[i]); + for (j = 0; j < nschemes; j++) + printf(" %s%c", sc_list[j], ",\n"[j+1==nschemes]); + } +} + static void usage(FILE *fp) { /* Please improve the grouping */ @@ -109,6 +174,7 @@ " quickcheck quickly check if SAM/BAM/CRAM file appears intact\n" " fastq converts a BAM to a FASTQ\n" " fasta converts a BAM to a FASTA\n" +" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -125,6 +191,10 @@ " tview text alignment viewer\n" " view SAM<->BAM<->CRAM conversion\n" " depad convert padded BAM to unpadded BAM\n" +"\n" +" -- Misc\n" +" help [cmd] display this help message or help for [cmd]\n" +" version detailed version information\n" "\n"); } @@ -140,11 +210,6 @@ int _CRT_glob = 0; #endif -static void bam_import_err(void) { - fprintf(stderr, "[main] \"samtools import\" has been removed. " - "Please use \"samtools view\" instead.\n"); -} - int main(int argc, char *argv[]) { #ifdef _WIN32 @@ -165,7 +230,7 @@ int ret = 0; if (strcmp(argv[1], "view") == 0) ret = main_samview(argc-1, argv+1); - else if (strcmp(argv[1], "import") == 0) { bam_import_err(); return 1; } + else if (strcmp(argv[1], "import") == 0) ret = main_import(argc-1, argv+1); else if (strcmp(argv[1], "mpileup") == 0) ret = bam_mpileup(argc-1, argv+1); else if (strcmp(argv[1], "merge") == 0) ret = bam_merge(argc-1, argv+1); else if (strcmp(argv[1], "sort") == 0) ret = bam_sort(argc-1, argv+1); @@ -210,12 +275,9 @@ } else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1); - else if (strcmp(argv[1], "--version") == 0) { - printf( -"samtools %s\n" -"Using htslib %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", - samtools_version(), hts_version()); + else if (strcmp(argv[1], "version") == 0 || \ + strcmp(argv[1], "--version") == 0) { + long_version(); } else if (strcmp(argv[1], "--version-only") == 0) { 
printf("%s+htslib-%s\n", samtools_version(), hts_version()); diff -Nru samtools-1.11/bam_tview.c samtools-1.13/bam_tview.c --- samtools-1.11/bam_tview.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bam_tview.c 2021-07-07 17:18:52.000000000 +0000 @@ -351,6 +351,7 @@ int base_draw_aln(tview_t *tv, int tid, hts_pos_t pos) { + int ret; assert(tv!=NULL); // reset tv->my_clear(tv); @@ -379,9 +380,14 @@ bam_lplbuf_reset(tv->lplbuf); hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol); bam1_t *b = bam_init1(); - while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv); + while ((ret = sam_itr_next(tv->fp, iter, b)) >= 0) tv_push_aln(b, tv); bam_destroy1(b); hts_itr_destroy(iter); + if (ret < -1) { + print_error("tview", "could not read from input file"); + exit(1); + } + bam_lplbuf_push(0, tv->lplbuf); while (tv->ccol < tv->mcol) { diff -Nru samtools-1.11/bedcov.c samtools-1.13/bedcov.c --- samtools-1.11/bedcov.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/bedcov.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,7 +1,7 @@ /* bedcov.c -- bedcov subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2014, 2018-2020 Genome Research Ltd. + Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd. 
Author: Heng Li @@ -72,7 +72,7 @@ kstream_t *ks; hts_idx_t **idx; aux_t **aux; - int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0; + int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0; int64_t *cnt, *pcov = NULL;; const bam_pileup1_t **plp; int usage = 0, has_index_file = 0; @@ -92,7 +92,7 @@ case 'g': tflags = bam_str2flag(optarg); if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); return 1; } flags &= ~tflags; @@ -100,7 +100,7 @@ case 'G': tflags = bam_str2flag(optarg); if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) { - print_error_errno("depth", "Flag value \"%s\" is not supported", optarg); + print_error("bedcov", "Flag value \"%s\" is not supported", optarg); return 1; } flags |= tflags; @@ -211,7 +211,7 @@ memset(cnt, 0, sizeof(*cnt) * n); if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n); - while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) if (pos >= beg && pos < end) { for (i = 0; i < n; ++i) { m = 0; @@ -226,6 +226,14 @@ if (min_depth >= 0 && pd >= min_depth) pcov[i]++; } } + + if (ret < 0) { + print_error("bedcov", "error reading from input file"); + status = 2; + bam_mplp_destroy(mplp); + break; + } + for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); @@ -242,6 +250,7 @@ bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); + status = 2; } free(n_plp); free(plp); ks_destroy(ks); @@ -259,5 +268,5 @@ free(aux); free(idx); free(str.s); sam_global_args_free(&ga); - return 0; + return status; } diff -Nru samtools-1.11/configure.ac samtools-1.13/configure.ac --- samtools-1.11/configure.ac 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/configure.ac 2021-07-07 17:18:52.000000000 +0000 @@ -34,7 +34,7 @@ m4_include([m4/ax_with_htslib.m4]) dnl Copyright 
notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2015,2017 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2015,2017, 2019 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) @@ -137,7 +137,7 @@ # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600" + CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; esac diff -Nru samtools-1.11/coverage.c samtools-1.13/coverage.c --- samtools-1.11/coverage.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/coverage.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,7 +1,7 @@ /* coverage.c -- samtools coverage subcommand Copyright (C) 2018,2019 Florian Breitwieser - Portions copyright (C) 2019 Genome Research Ltd. + Portions copyright (C) 2019-2021 Genome Research Ltd. Author: Florian P Breitwieser @@ -24,7 +24,7 @@ DEALINGS IN THE SOFTWARE. */ /* This program calculates coverage from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. + * simultaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * * gcc -g -O2 -Wall -o bamcov -D_MAIN_BAMCOV coverage.c -lhts -lz @@ -89,7 +89,7 @@ // LOWER ONE EIGHTH BLOCK … FULL BLOCK static const char *const BLOCK_CHARS8[8] = {"\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"}; // In some terminals / with some fonts not all UTF8 block characters are supported (e.g. Putty). 
Use only half and full block for those -static const char *const BLOCK_CHARS2[2] = {"\u2584", "\u2588"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #else @@ -100,7 +100,7 @@ "\xE2\x96\x81", "\xE2\x96\x82", "\xE2\x96\x83", "\xE2\x96\x84", "\xE2\x96\x85", "\xE2\x96\x86", "\xE2\x96\x87", "\xE2\x96\x88" }; -static const char *const BLOCK_CHARS2[2] = {"\xE2\x96\x84", "\xE2\x96\x88"}; +static const char *const BLOCK_CHARS2[2] = {".", ":"}; #endif @@ -117,6 +117,9 @@ " --rf required flags: skip reads with mask bits unset []\n" " --ff filter flags: skip reads with mask bits set \n" " [UNMAP,SECONDARY,QCFAIL,DUP]\n" + " -d, --depth INT maximum allowed coverage depth [1000000].\n" + " If 0, depth is set to the maximum integer value,\n" + " effectively removing any depth limit.\n" "Output options:\n" " -m, --histogram show histogram instead of tabular output\n" " -A, --ascii show only ASCII characters in histogram\n" @@ -235,7 +238,7 @@ } else { fprintf(file_out, ">%7.2f%% ", current_bin); } - fprintf(file_out, VERTICAL_LINE); + fprintf(file_out, full_utf ? VERTICAL_LINE : "|"); for (col = 0; col < hist_size; ++col) { // get the difference in eights, or halfs when full UTF8 is not supported int cur_val_diff = round(blockchar_len * (hist_data[col] - current_bin) / row_bin_size) - 1; @@ -248,7 +251,7 @@ fprintf(file_out, "%s", BLOCK_CHARS[cur_val_diff]); } } - fprintf(file_out, VERTICAL_LINE); + fprintf(file_out, full_utf ? 
VERTICAL_LINE : "|"); fputc(' ', file_out); switch (i) { case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break; @@ -287,7 +290,7 @@ int ret, tid = -1, old_tid = -1, pos, i, j; - int max_depth = 0; + int max_depth = 1000000; int opt_min_baseQ = 0; int opt_min_mapQ = 0; int opt_min_len = 0; @@ -324,7 +327,7 @@ {"incl-flags", required_argument, NULL, 1}, // require flag {"excl-flags", required_argument, NULL, 2}, // filter flag {"bam-list", required_argument, NULL, 'b'}, - {"min-read-len", required_argument, NULL, 'L'}, + {"min-read-len", required_argument, NULL, 'l'}, {"min-MQ", required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, @@ -336,13 +339,14 @@ {"n-bins", required_argument, NULL, 'w'}, {"region", required_argument, NULL, 'r'}, {"help", no_argument, NULL, 'h'}, + {"depth", required_argument, NULL, 'd'}, { NULL, 0, NULL, 0 } }; // parse the command line int c; opterr = 0; - while ((c = getopt_long(argc, argv, "Ao:L:q:Q:hHw:r:b:m", lopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "Ao:l:q:Q:hHw:r:b:md:", lopts, NULL)) != -1) { switch (c) { case 1: if ((required_flags = bam_str2flag(optarg)) < 0) { @@ -353,9 +357,10 @@ fprintf(stderr,"Could not parse --ff %s\n", optarg); return EXIT_FAILURE; }; break; case 'o': opt_output_file = optarg; opt_full_width = false; break; - case 'L': opt_min_len = atoi(optarg); break; + case 'l': opt_min_len = atoi(optarg); break; case 'q': opt_min_mapQ = atoi(optarg); break; case 'Q': opt_min_baseQ = atoi(optarg); break; + case 'd': max_depth = atoi(optarg); break; // maximum coverage depth case 'w': opt_n_bins = atoi(optarg); opt_full_width = false; opt_print_histogram = true; opt_print_tabular = false; break; @@ -441,7 +446,7 @@ data = (bam_aux_t **)calloc(n_bam_files, sizeof(bam_aux_t*)); // data[i] for the i-th BAM file if (!data) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed 
to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -450,7 +455,7 @@ int rf; data[i] = (bam_aux_t *) calloc(1, sizeof(bam_aux_t)); if (!data[i]) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -466,12 +471,12 @@ // Set CRAM options on file handle - returns 0 on success if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); + print_error("coverage", "Failed to set CRAM_OPT_REQUIRED_FIELDS value"); status = EXIT_FAILURE; goto coverage_end; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { - print_error_errno("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); + print_error("coverage", "Failed to set CRAM_OPT_DECODE_MD value"); status = EXIT_FAILURE; goto coverage_end; } @@ -497,7 +502,7 @@ data[i]->iter = sam_itr_querys(idx, data[i]->hdr, opt_reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { - print_error_errno("coverage", "Failed to parse region \"%s\"", opt_reg); + print_error("coverage", "Failed to parse region \"%s\". 
Check the region format or region name presence in the file \"%s\"", opt_reg, argv[optind+i]); status = EXIT_FAILURE; goto coverage_end; } @@ -511,7 +516,7 @@ int n_targets = sam_hdr_nref(h); stats = calloc(n_targets, sizeof(stats_aux_t)); if (!stats) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } @@ -548,7 +553,7 @@ n_plp = (int*) calloc(n_bam_files, sizeof(int*)); // n_plp[i] is the number of covering reads from the i-th BAM plp = (const bam_pileup1_t**) calloc(n_bam_files, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) if (!hist || !n_plp || !plp) { - print_error("coverage", "Failed to allocate memory"); + print_error_errno("coverage", "Failed to allocate memory"); status = EXIT_FAILURE; goto coverage_end; } diff -Nru samtools-1.11/cut_target.c samtools-1.13/cut_target.c --- samtools-1.11/cut_target.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/cut_target.c 2021-07-07 17:18:52.000000000 +0000 @@ -170,7 +170,7 @@ int main_cut_target(int argc, char *argv[]) { - int c, tid, pos, n, lasttid = -1, usage = 0; + int c, tid, pos, n, lasttid = -1, usage = 0, status = EXIT_SUCCESS; hts_pos_t l, max_l; const bam_pileup1_t *p; bam_plp_t plp; @@ -237,6 +237,12 @@ cns[pos] = gencns(&g, n, p); } process_cns(g.h, lasttid, l, cns); + + if (n < 0) { + print_error("targetcut", "error reading from \"%s\"", argv[optind]); + status = EXIT_FAILURE; + } + free(cns); sam_hdr_destroy(g.h); bam_plp_destroy(plp); @@ -247,5 +253,5 @@ errmod_destroy(g.em); free(g.bases); sam_global_args_free(&ga); - return 0; + return status; } diff -Nru samtools-1.11/debian/changelog samtools-1.13/debian/changelog --- samtools-1.11/debian/changelog 2020-10-01 14:42:22.000000000 +0000 +++ samtools-1.13/debian/changelog 2021-08-30 14:32:37.000000000 +0000 @@ -1,3 +1,24 @@ +samtools (1.13-2) unstable; urgency=medium + + [ Étienne Mollier ] + 
* d/control: bump B-D libhts-dev (>= 1.13~); builds otherwise break against + older htslib versions. + + [ Andreas Tille ] + * make the build reproducible (Thanks for the patch to Chris Lamb) + Closes: #993304 + + -- Andreas Tille Mon, 30 Aug 2021 16:32:37 +0200 + +samtools (1.13-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.6.0 (routine-update) + * watch file standard 4 (routine-update) + * d/control: add myself to uploaders. + + -- Étienne Mollier Fri, 27 Aug 2021 16:10:26 +0200 + samtools (1.11-1) unstable; urgency=medium * New upstream version diff -Nru samtools-1.11/debian/control samtools-1.13/debian/control --- samtools-1.11/debian/control 2020-10-01 14:42:22.000000000 +0000 +++ samtools-1.13/debian/control 2021-08-30 14:32:37.000000000 +0000 @@ -1,14 +1,15 @@ Source: samtools Maintainer: Debian Med Packaging Team Uploaders: Charles Plessy , - Andreas Tille + Andreas Tille , + Étienne Mollier Section: science Priority: optional Build-Depends: debhelper-compat (= 13), bash-completion, libio-pty-perl , libncurses5-dev, - libhts-dev (>= 1.11), + libhts-dev (>= 1.13~), zlib1g-dev, automake, autoconf-archive, @@ -17,10 +18,10 @@ liblzma-dev, libbz2-dev # tabix is needed for the regression tests. 
-Standards-Version: 4.5.0 +Standards-Version: 4.6.0 Vcs-Browser: https://salsa.debian.org/med-team/samtools Vcs-Git: https://salsa.debian.org/med-team/samtools.git -Homepage: http://www.htslib.org/ +Homepage: https://www.htslib.org/ Rules-Requires-Root: no Package: samtools diff -Nru samtools-1.11/debian/patches/reproducible.patch samtools-1.13/debian/patches/reproducible.patch --- samtools-1.11/debian/patches/reproducible.patch 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/debian/patches/reproducible.patch 2021-08-30 14:32:37.000000000 +0000 @@ -0,0 +1,15 @@ +Author: Chris Lamb +Last-Update: Mon, 30 Aug 2021 14:55:21 +0100 +Bug-Debian: https://bugs.debian.org/993304 +Description: make the build reproducible + +--- a/Makefile ++++ b/Makefile +@@ -130,6 +130,7 @@ version.h: + echo '#define SAMTOOLS_HTSDIR "$(HTSDIR)"' >> $@ + echo '#define SAMTOOLS_LIBS "$(LIBS)"' >> $@ + echo '#define SAMTOOLS_CURSES_LIB "$(CURSES_LIB)"' >> $@ ++ sed -i -e 's@$(CURDIR)@«BUILDPATH»@g' $@ + + print-version: + @echo $(PACKAGE_VERSION) diff -Nru samtools-1.11/debian/patches/series samtools-1.13/debian/patches/series --- samtools-1.11/debian/patches/series 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/debian/patches/series 2021-08-30 14:32:37.000000000 +0000 @@ -0,0 +1 @@ +reproducible.patch diff -Nru samtools-1.11/debian/upstream/metadata samtools-1.13/debian/upstream/metadata --- samtools-1.11/debian/upstream/metadata 2020-10-01 14:42:22.000000000 +0000 +++ samtools-1.13/debian/upstream/metadata 2021-08-30 14:32:37.000000000 +0000 @@ -2,28 +2,30 @@ Bug-Submit: https://github.com/samtools/samtools/issues/new Changelog: https://raw.github.com/samtools/samtools/HEAD/NEWS Reference: - author: > + - Author: > Heng Li and Bob Handsaker and Alec Wysoker and Tim Fennell and Jue Ruan and Nils Homer and Gabor Marth and Goncalo Abecasis and Richard Durbin and 1000 Genome Project Data Processing Subgroup - title: The Sequence Alignment/Map (SAM) Format and SAMtools - journal: 
Bioinformatics - volume: 25 - number: 16 - pages: 2078-2079 - doi: 10.1093/bioinformatics/btp352 - PMID: 19505943 - year: 2009 - URL: http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btp352v1 - eprint: http://bioinformatics.oxfordjournals.org/cgi/reprint/btp352v1.pdf + Title: The Sequence Alignment/Map (SAM) Format and SAMtools + Journal: Bioinformatics + Volume: 25 + Number: 16 + Pages: 2078-2079 + DOI: 10.1093/bioinformatics/btp352 + PMID: 19505943 + Year: 2009 + URL: http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btp352v1 + eprint: http://bioinformatics.oxfordjournals.org/cgi/reprint/btp352v1.pdf Repository: https://github.com/samtools/samtools.git Registry: -- Name: bio.tools - Entry: samtools -- Name: SciCrunch - Entry: SCR_002105 -- Name: OMICtools - Entry: OMICS_00090 -- Name: conda:bioconda - Entry: samtools + - Name: bio.tools + Entry: samtools + - Name: SciCrunch + Entry: SCR_002105 + - Name: OMICtools + Entry: OMICS_00090 + - Name: conda:bioconda + Entry: samtools + - Name: guix + Entry: samtools Repository-Browse: https://github.com/samtools/samtools diff -Nru samtools-1.11/debian/watch samtools-1.13/debian/watch --- samtools-1.11/debian/watch 2020-10-01 14:42:22.000000000 +0000 +++ samtools-1.13/debian/watch 2021-08-30 14:32:37.000000000 +0000 @@ -1,3 +1,3 @@ -version=3 +version=4 opts=filenamemangle=s/.+\/v?(\d\S*)\.tar\.gz/samtools-$1\.tar\.gz/,uversionmangle=s/-rc/~rc/ \ https://github.com/samtools/samtools/tags .*/v?(\d\S*)\.tar\.gz diff -Nru samtools-1.11/doc/samtools.1 samtools-1.13/doc/samtools.1 --- samtools-1.11/doc/samtools.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools \- Utilities for the Sequence Alignment/Map (SAM) format .\" @@ -67,6 +67,8 @@ .PP samtools depth 
aln.sorted.bam .PP +samtools ampliconstats primers.bed in.bam +.PP samtools mpileup -C50 -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam .PP samtools coverage aln.sorted.bam @@ -77,6 +79,8 @@ .PP samtools cat out.bam in1.bam in2.bam in3.bam .PP +samtools import input.fastq > output.bam +.PP samtools fastq input.bam > output.fastq .PP samtools fasta input.bam > output.fasta @@ -376,6 +380,21 @@ Computes the read depth at each position or region. +.TP \"-------- ampliconstats +.B ampliconstats +samtools ampliconstats +.RI [ options ] +.IR primers.bed +.IR in.sam | in.bam | in.cram [...] + +samtools ampliconstats collects statistics from one or more input +alignment files and produces tables in text format. The output can be +visualized graphically using plot-ampliconstats. + +The alignment files should have previously been clipped of primer +sequence, for example by \fBsamtools ampliconclip\fR and the sites of +these primers should be specified as a bed file in the arguments. + .TP \"-------- mpileup .B mpileup samtools mpileup @@ -468,6 +487,18 @@ .B reheader which enables fast BAM concatenation. +.TP \"-------- import +.B import +samtools import +.RI [ options ] +.IR in.fastq " [ ... ]" + +Converts one or more FASTQ files to unaligned SAM, BAM or CRAM. These +formats offer a richer capability of tracking sample meta-data via the +SAM header and per-read meta-data via the auxiliary tags. The +.B fastq +command may be used to reverse this conversion. + .TP \"-------- fastq fasta .B fastq/a samtools fastq @@ -756,6 +787,11 @@ Specifies the number of threads to use during encoding and/or decoding. For BAM this will be encoding only. In CRAM the threads are dynamically shared between encoder and decoder. +.TP +.BI filter= STRING +Apply filter STRING to all incoming records, rejecting any that do not +satisfy the expression. See the FILTER EXPRESSIONS section below for +specifics. 
.\" CRAM specific .TP .BI reference= fasta_file @@ -913,6 +949,133 @@ Look for MD5 in each element of the REF_PATH environment variable. .IP 4. 3 Look for a local file listed in the UR: header tag. + +.PP +.SH FILTER EXPRESSIONS +.PP +Filter expressions are used as an on-the-fly checking of incoming SAM, +BAM or CRAM records, discarding records that do not match the +specified expression. + +The language used is primarily C style, but with a few differences in +the precedence rules for bit operators and the inclusion of regular +expression matching. + +The operator precedence, from strongest binding to weakest, is: + +.TS +l lb l . +Grouping (, ) E.g. "(1+2)*3" +Values: literals, vars Numbers, strings and variables +Unary ops: +, -, !, ~ E.g. -10 +10, !10 (not), ~5 (bit not) +Math ops: *, /, % Multiply, division and (integer) modulo +Math ops: +, - Addition / subtraction +Bit-wise: & Integer AND +Bit-wise ^ Integer XOR +Bit-wise | Integer OR +Conditionals: >, >=, <, <= +Equality: ==, !=, =~, !~ =~ and !~ match regular expressions +Boolean: &&, || Logical AND / OR +.TE + +Expressions are computed using floating point mathematics, so "10 / 4" +evaluates to 2.5 rather than 2. They may be written as integers in +decimal or "0x" plus hexadecimal, and floating point with or without +exponents. However operations that require integers first do an +implicit type conversion, so "7.9 % 5" is 2 and "7.9 & 4.1" is +equivalent to "7 & 4", which is 4. Strings are always specified using +double quotes. To get a double quote in a string, use backslash. +Similarly a double backslash is used to get a literal backslash. For +example \fBab\\"c\\\\d\fR is the string \fBab"c\\d\fR. + +Comparison operators are evaluated as a match being 1 and a mismatch +being 0, thus "(2 > 1) + (3 < 5)" evaluates as 2. + +The variables are where the file format specifics are accessed from +the expression.
The variables correspond to SAM fields, for example +to find paired alignments with high mapping quality and a very large +insert size, we may use the expression "\fBmapq >= 30 && (tlen >= 100000 || tlen <= -100000)\fR". +Valid variable names and their data types are: + +.TS +lb l l . +flag int Combined FLAG field +flag.paired int Single bit, 0 or 1 +flag.proper_pair int Single bit, 0 or 2 +flag.unmap int Single bit, 0 or 4 +flag.munmap int Single bit, 0 or 8 +flag.reverse int Single bit, 0 or 16 +flag.mreverse int Single bit, 0 or 32 +flag.read1 int Single bit, 0 or 64 +flag.read2 int Single bit, 0 or 128 +flag.secondary int Single bit, 0 or 256 +flag.qcfail int Single bit, 0 or 512 +flag.dup int Single bit, 0 or 1024 +flag.supplementary int Single bit, 0 or 2048 +library string Library (LB header via RG) +mapq int Mapping quality +mpos int Synonym for pnext +mrefid int Mate reference number (0 based) +mrname string Synonym for rnext +ncigar int Number of cigar operations +pnext int Mate's alignment position (1-based) +pos int Alignment position (1-based) +qlen int Alignment length: no. query bases +qname string Query name +qual string Quality values (raw, 0 based) +refid int Integer reference number (0 based) +rlen int Alignment length: no. reference bases +rname string Reference name +rnext string Mate's reference name +seq string Sequence +tlen int Template length (insert size) +[XX] int / string XX tag value +.TE + +Flags are returned either as the whole flag value or by checking for a +single bit. Hence the filter expression \fBflag.dup\fR is +equivalent to \fBflag & 1024\fR. + +"qlen" and "rlen" are measured using the CIGAR string to count the +number of query (sequence) and reference bases consumed. Note "qlen" +may not exactly match the length of the "seq" field if the sequence is +"*". 
+ +Reference names may be matched either by their string forms ("rname" +and "mrname") or as the Nth \fB@SQ\fR line (counting from zero) as +stored in BAM using "tid" and "mtid" respectively. + +Auxiliary tags are described in square brackets and these expand to +either integer or string as defined by the tag itself (\fBXX:Z:\fIstring\fR or +\fBXX:i:\fIint\fR). For example \fB[NM]>=10\fR can be used to look +for alignments with many mismatches and \fB[RG]=~"grp[ABC]-"\fR will +match the read-group string. + +If no comparison is used with an auxiliary tag it is taken simply to +be a test for the existence of that tag. So "[NM]" will return any +record containing an NM tag, even if that tag is zero (\fBNM:i:0\fR). + +If you need to check specifically for a non-zero value then use \fB[NM] +&& [NM]!=0\fR. + +Some simple functions are available to operate on strings. These +treat the strings as arrays of bytes, permitting their length, +minimum, maximum and average values to be computed. + +.TS +lb l . +length Length of the string (excluding nul char) +min Minimum byte value in the string +max Maximum byte value in the string +avg Average byte value in the string +.TE + +Note that "avg" is a floating point value and it may be NAN for empty +strings. This means that "avg(qual)" does not produce an error for +records that have both seq and qual of "*". This value will fail any +conditional checks, so e.g. "avg(qual) > 20" works and will not report +these records. + .PP .SH ENVIRONMENT VARIABLES .PP @@ -954,7 +1117,7 @@ To avoid many files being stored in the same directory, REF_CACHE may be defined as a pattern using \fB%\fR\fInum\fR\fBs\fR to consume \fInum\fR -chracters of the MD5sum and \fB%s\fR to consume all remaining characters. +characters of the MD5sum and \fB%s\fR to consume all remaining characters. If REF_CACHE lacks \fB%s\fR then it will get an implicit \fB/%s\fR appended. 
To aid population of the REF_CACHE directory a script @@ -1014,6 +1177,7 @@ .SH SEE ALSO .IR samtools-addreplacerg (1), .IR samtools-ampliconclip (1), +.IR samtools-ampliconstats (1), .IR samtools-bedcov (1), .IR samtools-calmd (1), .IR samtools-cat (1), @@ -1030,6 +1194,7 @@ .IR samtools-flagstat (1), .IR samtools-fqidx (1), .IR samtools-idxstats (1), +.IR samtools-import (1), .IR samtools-index (1), .IR samtools-markdup (1), .IR samtools-merge (1), diff -Nru samtools-1.11/doc/samtools-addreplacerg.1 samtools-1.13/doc/samtools-addreplacerg.1 --- samtools-1.11/doc/samtools-addreplacerg.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-addreplacerg.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-addreplacerg 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-addreplacerg 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools addreplacerg \- adds or replaces read group tags .\" @@ -58,7 +58,8 @@ .SH DESCRIPTION .PP -Adds or replaces read group tags in a file. +Adds or replaces read group tags in a file. Also allows for adding and updating +@RG lines in the header. .SH OPTIONS .TP 8 @@ -87,6 +88,9 @@ .B -u Output uncompressed SAM, BAM or CRAM. .TP 8 +.B -w +Overwrite an existing @RG line, if a new one with the same ID value is provided. +.TP 8 .BI --no-PG Do not add a @PG line to the header of the output file. .TP 8 diff -Nru samtools-1.11/doc/samtools-ampliconclip.1 samtools-1.13/doc/samtools-ampliconclip.1 --- samtools-1.11/doc/samtools-ampliconclip.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-ampliconclip.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-ampliconclip 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-ampliconclip 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools ampliconclip \- clip reads using a BED file .\" -.\" Copyright (C) 2008-2011, 2013-2020 Genome Research Ltd. 
+.\" Copyright (C) 2008-2011, 2013-2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Andrew Whitwham @@ -62,6 +62,7 @@ .IR rejects.file ] .RB [ --original ] .RB [ --keep-tag ] +.RB [ --tolerance ] .RB [ --no-PG ] .RB [ -u ] .B -b @@ -69,8 +70,16 @@ .SH DESCRIPTION .PP -Clip reads in a SAM compatible file based on data from a BED file. By default -the reads are soft clipped and clip is only done from the 5' end. +Clips the ends of read alignments if they intersect with regions defined in a +BED file. While this tool was originally written for clipping read alignment +positions which correspond to amplicon primer locations it can also be used in +other contexts. + +BED file entries used are chrom, chromStart, chromEnd and, optionally, strand. +There is a default tolerance of 5 bases when matching chromStart and chromEnd +to alignments. + +By default the reads are soft clipped and clip is only done from the 5' end. Some things to be aware of. While ordering is not significant, adjustments to the left most mapping position (\fIPOS\fR) will mean that coordinate sorted @@ -84,7 +93,7 @@ .SH OPTIONS .TP 11 .BI "-b " FILE -BED file of amplicons to be removed. +BED file of regions (e.g. amplicon primers) to be removed. .TP .BI "-o " FILE Output file name (defaults to stdout). @@ -102,10 +111,11 @@ Hard clip reads. .TP .B --both-ends -Clip at both ends as opposed to just the 5' end. +Clip at both the 5' and the 3' ends where regions match. .TP .B --strand -Use strand entry from the BED file. +Use strand entry from the BED file to clip on the matching forward or reverse +alignment. .TP .B --clipped Only output clipped reads. Filter all others. @@ -133,6 +143,9 @@ .B --keep-tag In clipped reads, keep the possibly invalid NM and MD tags. By default these tags are deleted. .TP +.BI "--tolerance " INT +The amount of latitude given in matching regions to alignments. Default 5 bases. +.TP .B --no-PG Do not at a PG line to the header. 
diff -Nru samtools-1.11/doc/samtools-ampliconstats.1 samtools-1.13/doc/samtools-ampliconstats.1 --- samtools-1.11/doc/samtools-ampliconstats.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-ampliconstats.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-ampliconstats 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-ampliconstats 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools ampliconstats \- produces statistics from amplicon sequencing alignment file .\" -.\" Copyright (C) 2020 Genome Research Ltd. +.\" Copyright (C) 2020-2021 Genome Research Ltd. .\" .\" Author: James Bonfield .\" @@ -120,14 +120,17 @@ This section is once per file and includes summary information to be utilised for scaling of plots, for example the total number of amplicons and files present, tool version number, and command line -arguments. +arguments. The second column is the filename or "COMBINED". This is +followed by the reference name (unless single-ref mode is enabled), +and the summary statistic name and value. The AMPLICON section is a reformatting of the input BED file. Each -line consists of the amplicon number and the \fIstart\fR-\fIend\fR -coordinates of the left and right primers. Where multiple primers are -available these are comma separated, for example \fB10-30,15-40\fR in -the left primer column indicates two primers have been multiplex -together covering genome coords 10-30 inclusive and 14-40 inclusively. +line consists of the reference name (unless single-ref mode is +enabled), the amplicon number and the \fIstart\fR-\fIend\fR coordinates +of the left and right primers. Where multiple primers are available +these are comma separated, for example \fB10-30,15-40\fR in the left +primer column indicates two primers have been multiplexed together +covering genome coordinates 10-30 inclusive and 15-40 inclusive.
.SH CSS SECTION @@ -216,6 +219,12 @@ Th COORD data may indicate which primers are being utilised if there are alternates available for a given amplicon. +For COORD lines amplicon number 0 holds the frequency data for +reads that have not been assigned to any amplicon. That is, they +may lie within an amplicon, but they do not start or end at a known +primer location. It is not recorded for BED files containing multiple +references. + The FAMP / CAMP section is a simple count per amplicon of the number of templates coming from this amplicon. Templates are counted once per amplicon, but and like the FTCOORD field if a read-pair spans @@ -226,6 +235,9 @@ of templates where the other end has failed to be assigned to an amplicon. +Note FAMP / CAMP amplicon number 0 is the summation of data for all +amplicons (1 onwards). + .SH FDP_ALL / CDP_ALL / FDP_VALID / CDP_VALID section These are for depth plots per base rather than per amplicon. They @@ -245,12 +257,13 @@ reads or bases alone in the amplicon does not reveal the potential for non-uniformity of coverage. -The format of the data is a series of tab separated tuples consisting -of \fIdepth,length\fR. The length field is a basic form of run-length -encoding where all depth values within a specified fraction of each -other (e.g. >= (1-fract)*midpoint and <= (1+fract)*midpoint) are -combined into a single run. This fraction is controlled via the -\fB-D\fR option.. +The lines start with the type keyword, file / sample name, reference +name (unless single-ref mode is enabled), followed by a variable +number of tab separated tuples consisting of \fIdepth,length\fR. The +length field is a basic form of run-length encoding where all depth +values within a specified fraction of each other (e.g. >= +(1-fract)*midpoint and <= (1+fract)*midpoint) are combined into a +single run. This fraction is controlled via the \fB-D\fR option. .SH OPTIONS .TP 8 @@ -328,6 +341,14 @@ the SM field from the first @RG header line.
.TP +.B "-S, --single-ref" +Force the output format to match the older single-reference style +used in Samtools 1.12 and earlier. This removes the reference names +from the SS, AMPLICON, DP_ALL and DP_VALID sections. It cannot be +enabled if the input BED file has more than one reference present. +Note that plot-ampliconstats can process both output styles. + +.TP .BI "-t, --tlen-adjust " INT Adjust the TLEN field by +/- \fIINT\fR to compensate for primer clipping. This defaults to zero, but if the primers have been clipped and the diff -Nru samtools-1.11/doc/samtools-bedcov.1 samtools-1.13/doc/samtools-bedcov.1 --- samtools-1.11/doc/samtools-bedcov.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-bedcov.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-bedcov 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-bedcov 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools bedcov \- reports coverage over regions in a supplied BED file .\" diff -Nru samtools-1.11/doc/samtools-calmd.1 samtools-1.13/doc/samtools-calmd.1 --- samtools-1.11/doc/samtools-calmd.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-calmd.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-calmd 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-calmd 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools calmd \- calculates MD and NM tags .\" diff -Nru samtools-1.11/doc/samtools-cat.1 samtools-1.13/doc/samtools-cat.1 --- samtools-1.11/doc/samtools-cat.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-cat.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-cat 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-cat 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools cat \- concatenate files together .\" diff -Nru 
samtools-1.11/doc/samtools-collate.1 samtools-1.13/doc/samtools-collate.1 --- samtools-1.11/doc/samtools-collate.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-collate.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-collate 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-collate 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools collate \- shuffles and groups reads together by their names .\" diff -Nru samtools-1.11/doc/samtools-coverage.1 samtools-1.13/doc/samtools-coverage.1 --- samtools-1.11/doc/samtools-coverage.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-coverage.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-coverage 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-coverage 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools coverage \- produces a histogram or table of coverage per chromosome .\" -.\" Copyright (C) 2019 Genome Research Ltd. +.\" Copyright (C) 2019, 2021 Genome Research Ltd. .\" .\" Author: James Bonfield .\" @@ -88,6 +88,10 @@ .BI --ff,\ --excl-flags \ STR|INT Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP] +.TP +.BI -d,\ --depth \ INT +Maximum allowed coverage depth [1000000]. If 0, depth is set to the maximum +integer value effectively removing any depth limit. .PP Output options: @@ -97,7 +101,8 @@ Show histogram instead of tabular output. .TP .BI -A,\ --ascii -Show only ASCII characters in histogram. +Show only ASCII characters in histogram using colon and fullstop for +full and half height characters. .TP .BI -o,\ --output \ FILE Write output to FILE [stdout]. 
@@ -133,16 +138,16 @@ samtools coverage -A -w 32 -r chr1:1M-12M input.bam chr1 (249.25Mbp) -> 24.19% | # | Number of reads: 528695 -> 21.50% |## | (132000 filtered) -> 18.81% |## | Covered bases: 1.07Mbp -> 16.12% |## # | Percent covered: 9.727% -> 13.44% |## # # ## # #| Mean coverage: 3.5x -> 10.75% |## ## # ## # # #| Mean baseQ: 34.4 -> 8.06% |##### # ## # # # #| Mean mapQ: 55.8 -> 5.37% |##### ## ### # ##### #| -> 2.69% |##### ### ### ### #########| Histo bin width: 343.8Kbp -> 0.00% |############ ###################| Histo max bin: 26.873% +> 24.19% | . | Number of reads: 528695 +> 21.50% |:: | (132000 filtered) +> 18.81% |:: | Covered bases: 1.07Mbp +> 16.12% |:: : | Percent covered: 9.727% +> 13.44% |:: : . :: : :| Mean coverage: 3.5x +> 10.75% |:: :: : :: : : :| Mean baseQ: 34.4 +> 8.06% |::::: : :: : : : :| Mean mapQ: 55.8 +> 5.37% |::::: :: ::: : ::::: :| +> 2.69% |::::: ::: ::: ::: :::::::::| Histo bin width: 343.8Kbp +> 0.00% |:::::::::::. :::::::::::::::::::| Histo max bin: 26.873% 1.00M 4.44M 7.87M 12.00M .EE diff -Nru samtools-1.11/doc/samtools-depad.1 samtools-1.13/doc/samtools-depad.1 --- samtools-1.11/doc/samtools-depad.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-depad.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-depad 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-depad 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools depad \- convert padded BAM to unpadded BAM .\" diff -Nru samtools-1.11/doc/samtools-depth.1 samtools-1.13/doc/samtools-depth.1 --- samtools-1.11/doc/samtools-depth.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-depth.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-depth 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-depth 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools depth \- computes the read depth at each position or region 
.\" -.\" Copyright (C) 2008-2011, 2013-2020 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li @@ -78,23 +78,20 @@ the corresponding column. .TP .BI "-l " INT -.RI "Ignore reads shorter than " INT +.RI "Ignore reads shorter than " INT "." +This is the number of bases in the sequence, minus any soft clips. .TP .BI "-m, -d " INT -At a position, read at most -.I INT -reads per input file. This means figures greater than -.I INT -may be reported in the output. - -Setting this limit reduces the amount of memory and time needed to -process regions with very high coverage. Passing zero for this option -sets it to the highest possible value, effectively removing the depth -limit. [8000] - -Note that up to release 1.8, samtools would enforce a minimum value for -this option. This no longer happens and the limit is set exactly as -specified. +(Deprecated since 1.13) This option previously limited the depth to a maximum +value. It is still accepted as an option, but ignored. + +Note for single files, the behaviour of old +.B samtools depth -J -q0 -d +.I INT FILE +is identical to +.B samtools mpileup -A -Q0 -x -d +.I INT FILE +.B | cut -f 1,2,4 .TP .BI "-o " FILE .RI "Write output to " FILE ". Using \*(lq-\*(rq for " FILE @@ -110,7 +107,7 @@ Only report depth in specified region. .TP .B "-X" -If this option is set, it will allows user to specify customized index file location(s) if the data +If this option is set, it will allow the user to specify customized index file location(s) if the data folder does not contain any index file. Example usage: samtools depth [options] -X /data_folder/in1.bam [/data_folder/in2.bam [...]] /index_folder/index1.bai [/index_folder/index2.bai [...]] .TP .BI "-g " FLAGS @@ -136,14 +133,55 @@ Include reads with deletions in depth computation. .TP .B -s -For the overlapping section of a read pair, count only the bases of a single -read. 
This is accomplished by lowering to 0 the quality values of one read for -the span of the overlapping section. As a consequence, the algorithm will -consider only bases with a quality above 0. +For the overlapping section of a read pair, count only the bases of +the first read. Note this algorithm changed in 1.13 so the +results may differ slightly to older releases. + +.SH CAVEATS +It may appear that "samtools depth" is simply "samtools mpileup" with some +of the columns removed, and indeed earlier versions of this command +were just this. However both then and now there are subtle +differences in parameters which make the two not entirely comparable. +Differences, other than the obvious speed benefits, include: + +.IP o 2 +Deletions (CIGAR element "D") are counted by default in "depth". This +can be turned off with the \fB-J\fR option. "Mpileup" always ignores +these bases, and has no option to count them. + +.IP o 2 +Beware there are idiosyncrasies in option naming. Specifically +\fB-q\fR and \fB-Q\fR options have their meanings swapped between +"depth" and "mpileup". + +.IP o 2 +The removal of overlapping sequences (option \fB-s\fR) is on by +default in "mpileup" and off by default in "depth". Additionally the +overlap removal algorithm differs, giving subtle changes when Ns are +present in the sequence. Also any paired read is considered for overlap +removal by "depth", rather than only those with the properly-paired flag +set ("mpileup"). See above for a more detailed description. + +.IP o 2 +The default minimum quality value is 0 for "depth" and 13 for "mpileup". + +.IP o 2 +Specifying multiple BAMs will produce one depth column per file with +"depth", but these are merged in "mpileup". + +.IP o 2 +"Depth" doesn't have a maximum depth limit, while "mpileup" defaults +to a maximum of 8000. + +.IP o 2 +If a reference is specified to "mpileup" the BAQ algorithm will be +used to adjust quality values, although it can be disabled. "Depth" +never uses BAQ. 
+.EE .SH AUTHOR .PP -Written by Heng Li from the Sanger Institute. +Written by Heng Li and James Bonfield from the Sanger Institute. .SH SEE ALSO .IR samtools (1), diff -Nru samtools-1.11/doc/samtools-dict.1 samtools-1.13/doc/samtools-dict.1 --- samtools-1.11/doc/samtools-dict.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-dict.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-dict 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-dict 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools dict \- create a sequence dictionary file from a fasta file .\" @@ -59,7 +59,7 @@ Add an AN tag with the same value as the SN tag, except that a \(lqchr\(rq prefix is removed if SN has one or added if it does not. For mitochondria (i.e., when SN is \(lqM\(rq or \(lqMT\(rq, with or without a -\(lqchr\(rq prefix), also adds the remaining combiniations of \(lqchr/M/MT\(rq +\(lqchr\(rq prefix), also adds the remaining combinations of \(lqchr/M/MT\(rq to the AN tag. 
.TP .B -H,\ --no-header diff -Nru samtools-1.11/doc/samtools-faidx.1 samtools-1.13/doc/samtools-faidx.1 --- samtools-1.11/doc/samtools-faidx.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-faidx.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-faidx 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-faidx 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools faidx \- indexes or queries regions from a fasta file .\" diff -Nru samtools-1.11/doc/samtools-fasta.1 samtools-1.13/doc/samtools-fasta.1 --- samtools-1.11/doc/samtools-fasta.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-fasta.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-fasta 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-fasta 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools fasta / fastq \- converts a SAM/BAM/CRAM file to FASTA or FASTQ .\" @@ -94,7 +94,7 @@ .BR -1 ", " -2 ", " -o ", or " -0 options is used, in which case sequences for that category will be written to the specified file. -The same filename may be specified with multiple options, in which ase the +The same filename may be specified with multiple options, in which case the sequences will be multiplexed in order of occurrence. If a singleton file is specified using the @@ -267,7 +267,7 @@ .SH BUGS .IP o 2 -The way of specifying output files is far to complicated and easy to get wrong. +The way of specifying output files is far too complicated and easy to get wrong. 
.SH AUTHOR .PP @@ -278,5 +278,6 @@ .IR samtools (1), .IR samtools-faidx (1), .IR samtools-fqidx (1) +.IR samtools-import (1) .PP Samtools website: diff -Nru samtools-1.11/doc/samtools-fixmate.1 samtools-1.13/doc/samtools-fixmate.1 --- samtools-1.11/doc/samtools-fixmate.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-fixmate.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-fixmate 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-fixmate 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools fixmate \- fills in mate coordinates and insert size fields. .\" diff -Nru samtools-1.11/doc/samtools-flags.1 samtools-1.13/doc/samtools-flags.1 --- samtools-1.11/doc/samtools-flags.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-flags.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-flags 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-flags 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools flags \- convert between textual and numeric flag representation. .\" @@ -44,12 +44,17 @@ .SH SYNOPSIS .PP samtools flags -.IR INT | STR [,...] +.IR FLAGS ... .SH DESCRIPTION .PP Convert between textual and numeric flag representation. +Each \fIFLAGS\fP argument may be either an integer (in decimal, hexadecimal, +or octal) representing a combination of the listed numeric flag values, +or a comma-separated string \fINAME\fB,\fR...\fB,\fINAME\fR representing +a combination of the flag names listed below. + .B FLAGS: .TS rb l l . 
diff -Nru samtools-1.11/doc/samtools-flagstat.1 samtools-1.13/doc/samtools-flagstat.1 --- samtools-1.11/doc/samtools-flagstat.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-flagstat.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-flagstat 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-flagstat 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools flagstat \- counts the number of alignments for each FLAG type .\" -.\" Copyright (C) 2008-2011, 2013-2019 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2019, 2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li @@ -53,6 +53,9 @@ Provides counts for each of 13 categories based primarily on bit flags in the FLAG field. +Information on the meaning of the flags is given in the SAM specification +document <https://samtools.github.io/hts-specs/SAMv1.pdf>. + Each category in the output is broken down into QC pass and QC fail. In the default output format, these are presented as "#PASS + #FAIL" followed by a description of the category. @@ -70,6 +73,9 @@ .RS 18 .TP +primary +neither 0x100 nor 0x800 bit set +.TP secondary 0x100 bit set .TP @@ -79,9 +85,15 @@ duplicates 0x400 bit set .TP +primary duplicates +0x400 bit set and neither 0x100 nor 0x800 bit set +.TP mapped 0x4 bit not set .TP +primary mapped +0x4, 0x100 and 0x800 bits not set +.TP paired in sequencing 0x1 bit set .TP @@ -139,19 +151,27 @@ These contain the various categories listed above as names and the corresponding count as value. -For the default format, the -.BR mapped ", " "properly paired" ", and " singletons -show the count as a percentage of the total number of QC-passed or QC-failed +For the default format, +.BR mapped +shows the count as a percentage of the total number of QC-passed or QC-failed reads after the category name.
For example: .EX 32 + 0 mapped (94.12% : N/A) .EE +The +.BR "properly paired" " and " singletons +counts work in a similar way but the percentage is against the total number of +QC-passed and QC-failed pairs. The +.BR "primary mapped" +count is a percentage of the total number of QC-passed and QC-failed primary +reads. + In the .BR tsv " and " json formats, these percentages are listed in separate categories -.BR "mapped %" ", " "properly paired %" ", and " "singletons %" . +.BR "mapped %" ", " "primary mapped %" ", " "properly paired %" ", and " "singletons %" . If the percentage cannot be calculated (because the total is zero) then in the .BR default " and " tsv diff -Nru samtools-1.11/doc/samtools-fqidx.1 samtools-1.13/doc/samtools-fqidx.1 --- samtools-1.11/doc/samtools-fqidx.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-fqidx.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-fqidx 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-fqidx 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools fqidx \- Indexes or queries regions from a fastq file .\" diff -Nru samtools-1.11/doc/samtools-idxstats.1 samtools-1.13/doc/samtools-idxstats.1 --- samtools-1.11/doc/samtools-idxstats.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-idxstats.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-idxstats 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-idxstats 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools idxstats \- reports alignment summary statistics .\" diff -Nru samtools-1.11/doc/samtools-import.1 samtools-1.13/doc/samtools-import.1 --- samtools-1.11/doc/samtools-import.1 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/doc/samtools-import.1 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,221 @@ +'\" t +.TH samtools-import 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" +.SH NAME 
+samtools import \- converts FASTQ files to unmapped SAM/BAM/CRAM +.\" +.\" Copyright (C) 2020 Genome Research Ltd. +.\" +.\" Author: James Bonfield +.\" +.\" Permission is hereby granted, free of charge, to any person obtaining a +.\" copy of this software and associated documentation files (the "Software"), +.\" to deal in the Software without restriction, including without limitation +.\" the rights to use, copy, modify, merge, publish, distribute, sublicense, +.\" and/or sell copies of the Software, and to permit persons to whom the +.\" Software is furnished to do so, subject to the following conditions: +.\" +.\" The above copyright notice and this permission notice shall be included in +.\" all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +.\" IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +.\" THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +.\" LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +.\" DEALINGS IN THE SOFTWARE. +. +.\" For code blocks and examples (cf groff's Ultrix-specific man macros) +.de EX + +. in +\\$1 +. nf +. ft CR +.. +.de EE +. ft +. fi +. in + +.. +. +.SH SYNOPSIS +.PP +samtools import +.RI [ options ] +[ +.I fastq_file +\fR... ] + + +.SH DESCRIPTION +.PP + +Reads one or more FASTQ files and converts them to unmapped SAM, BAM +or CRAM. The input files may be automatically decompressed if they +have a .gz extension. + +The simplest usage in the absence of any other command line options is +to provide one or two input files. 
+ +If a single file is given, it will be interpreted as a single-ended +sequencing format unless the read names end with /1 and /2 in which +case they will be labelled as PAIRED with READ1 or READ2 BAM flags +set. If a pair of filenames are given they will be read from +alternately to produce an interleaved output file, also setting PAIRED +and READ1 / READ2 flags. + +The filenames may be explicitly labelled using \fB-1\fR and \fB-2\fR +for READ1 and READ2 data files, \fB-s\fR for an interleaved paired +file (or one half of a paired-end run), \fB-0\fR for unpaired data +and explicit index files specified with \fB--i1\fR and \fB--i2\fR. +These correspond to typical output produced by Illumina bcl2fastq and +match the output from \fBsamtools fastq\fR. The index files will set +both the \fBBC\fR barcode code and its associated \fBQT\fR quality tag. + +The Illumina CASAVA identifiers may also be processed when the \fB-i\fR +option is given. This tag will be processed for READ1 / READ2, +whether or not the read failed processing (QCFAIL flag), and the +barcode sequence which will be added to the \fBBC\fR tag. This can be +an alternative to explicitly specifying the index files, although note +that doing so will not fill out the barcode quality tag. + + +.SH OPTIONS +.TP 8 +.BI -s\ FILE +Import paired interleaved data from \fIFILE\fR. + +.TP 8 +.BI -0\ FILE +Import single-ended (unpaired) data from \fIFILE\fR. + +Operationally there is no difference between the \fB-s\fR and \fB-0\fR +options as given an interleaved file with /1 and /2 read name endings +both will correctly set the PAIRED, READ1 and READ2 flags, and given +data with no suffixes and no CASAVA identifiers being processed both will +leave the data as unpaired. However their inclusion here is for more +descriptive command lines and to improve the header comment describing +the samtools fastq decode command. + +.TP 8 +.BI -1\ FILE ,\ -2\ FILE +Import paired data from a pair of FILEs.
The BAM flag PAIRED will be +set, but not PROPER_PAIR as it has not been aligned. READ1 and READ2 +will be stored in their original, unmapped, orientation. + +.TP 8 +.BI --i1\ FILE ,\ --i2\ FILE +Specifies index barcodes associated with the \fB-1\fR and \fB-2\fR +files. These will be appended to READ1 and READ2 records in the +barcode (BC) and quality (QT) tags. + +.TP 8 +.B -i +Specifies that the Illumina CASAVA identifiers should be processed. +This may set the READ1, READ2 and QCFAIL flags and add a barcode tag. + +.TP +.BI --barcode-tag\ TAG +Changes the auxiliary tag used for barcode sequence. Defaults to BC. + +.TP +.BI --quality-tag\ TAG +Changes the auxiliary tag used for barcode quality. Defaults to QT. + +.TP +.BI -o\ FILE +Output to \fIFILE\fR. By default output will be written to stdout. + +.TP 8 +.BI --order\ TAG +When outputting a SAM record, also output an integer tag containing +the Nth record number. This may be useful if the data is to be sorted +or collated in some manner and we wish this to be reversible. In this +case the tag may be used with \fBsamtools sort -t TAG\fR to regenerate +the original input order. + +.TP 8 +.BI -r\ RG_line ,\ --rg-line\ RG_line +A complete \fB@RG\fR header line may be specified, with or without the +initial "@RG" component. If specified this will also use the ID field +from \fIRG_line\fR in each SAM record's RG auxiliary tag. + +If specified multiple times this appends to the RG line, automatically +adding tabs between invocations. + +.TP 8 +.BI -R\ RG_ID ,\ --rg\ RG_ID +This is a shorter form of the option above, equivalent to +\fB--rg-line ID:\fR\fIRG_ID\fR. +If both are specified then this option is ignored. + +.TP +.B -u +Output BAM or CRAM as uncompressed data. + +.TP 8 +.BI -T\ TAGLIST +This looks for any SAM-format auxiliary tags in the comment field of a fastq +read name. These must match the <TAG>:<TYPE>:<VALUE> +pattern as specified in the SAM specification.
\fITAGLIST\fR can be blank +or \fB*\fR to indicate all tags should be copied to the output, +otherwise it is a comma-separated list of tag types to include with +all others being discarded. + + +.SH EXAMPLES +Convert a single-ended fastq file to an unmapped CRAM. Both of these +commands perform the same action. + +.EX 4 +samtools import -0 in.fastq -o out.cram +samtools import in.fastq > out.cram +.EE + +Convert a pair of Illumina fastqs containing CASAVA identifiers to BAM, +adding the barcode information to the BC auxiliary tag. + +.EX 4 +samtools import -i -1 in_1.fastq -2 in_2.fastq -o out.bam +samtools import -i in_[12].fastq > out.bam +.EE + +Specify the read group. These commands are equivalent. + +.EX 4 +samtools import -r "$(echo -e 'ID:xyz\\tPL:ILLUMINA')" in.fq +samtools import -r "$(echo -e '@RG\\tID:xyz\\tPL:ILLUMINA')" in.fq +samtools import -r ID:xyz -r PL:ILLUMINA in.fq +.EE + +Create an unmapped BAM file from a set of 4 Illumina fastqs from +bcl2fastq, consisting of two read and two index tags. The CASAVA identifier +is used only for setting QC pass / failure status. + +.EX 4 +samtools import -i -1 R1.fq -2 R2.fq --i1 I1.fq --i2 I2.fq -o out.bam +.EE + +Convert a pair of CASAVA barcoded fastq files to unmapped CRAM with an +incremental record counter, then sort this by minimiser in order to +reduce file space. The reversal process is also shown using samtools +sort and samtools fastq. + +.EX 4 +samtools import -i in_1.fq in_2.fq --order ro -O bam,level=0 | \\ + samtools sort -@4 -M -o out.srt.cram - + +samtools sort -@4 -O bam -u -t ro out.srt.cram | \\ + samtools fastq -1 out_1.fq -2 out_2.fq -i --index-format "i*i*" +.EE + +.SH AUTHOR +.PP +Written by James Bonfield of the Wellcome Sanger Institute.
+ +.SH SEE ALSO +.IR samtools (1), +.IR samtools-fastq (1) +.PP +Samtools website: <http://www.htslib.org/> diff -Nru samtools-1.11/doc/samtools-index.1 samtools-1.13/doc/samtools-index.1 --- samtools-1.11/doc/samtools-index.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-index.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-index 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-index 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools index \- indexes SAM/BAM/CRAM files .\" -.\" Copyright (C) 2008-2011, 2013-2019 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2020 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li diff -Nru samtools-1.11/doc/samtools-markdup.1 samtools-1.13/doc/samtools-markdup.1 --- samtools-1.11/doc/samtools-markdup.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-markdup.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-markdup 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-markdup 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools markdup \- mark duplicate alignments in a coordinate sorted file .\" @@ -61,6 +61,7 @@ .RB [ --include-fails ] .RB [ --no-PG ] .RB [ -u ] +.RB [ --no-multi-dup ] .I in.algsort.bam out.bam .SH DESCRIPTION @@ -120,6 +121,14 @@ .B --include-fails Include quality checked failed reads. .TP +.B --no-multi-dup +Stop checking duplicates of duplicates for correctness. While still marking +reads as duplicates further checks to make sure all optical duplicates are found +are not carried out. Also operates on \fB-t\fR tagging where reads may be tagged +with a better quality read but not necessarily the best one. Using this option +can speed up duplicate marking when there are a great many duplicates for each +original read. +.TP .B --no-PG Do not add a PG line to the output file.
.TP diff -Nru samtools-1.11/doc/samtools-merge.1 samtools-1.13/doc/samtools-merge.1 --- samtools-1.11/doc/samtools-merge.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-merge.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-merge 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-merge 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME -samtools merge \- merges multiple sorted input files into a single output. +samtools merge \- merges multiple sorted files into a single file .\" -.\" Copyright (C) 2008-2011, 2013-2019 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2019, 2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li @@ -43,23 +43,30 @@ . .SH SYNOPSIS .PP -.B merge -samtools merge -.RB [ -nur1f ] -.RB [ -h -.IR inh.sam ] -.RB [ -t -.IR tag ] -.RB [ -R -.IR reg ] -.RB [ -b -.IR list "] " out.bam " " in1.bam " [" in2.bam " " in3.bam " ... " inN.bam ] +.B samtools merge +.RI [ options ] +.B -o +.I out.bam +.RI [ options ] +.IR in1.bam " ... " inN.bam +.PP +.B samtools merge +.RI [ options ] +.I out.bam +.IR in1.bam " ... " inN.bam .SH DESCRIPTION .PP Merge multiple sorted alignment files, producing a single sorted output file that contains all the input records and maintains the existing sort order. +The output file can be specified via \fB-o\fP as shown in the first synopsis. +Otherwise the first non-option filename argument is taken to be \fIout.bam\fP +rather than an input file, as in the second synopsis. +There is no default; to write to standard output (or to a pipe), use either +\(lq\fB-o -\fP\(rq or the equivalent using \(lq\fB-\fP\(rq as the first +filename argument. 
+ If .BR -h is specified the @SQ headers of input files will be merged into the specified header, otherwise they will be merged @@ -106,6 +113,14 @@ The input alignments are sorted by read names rather than by chromosomal coordinates .TP +.BI -o \ FILE +Write merged output to +.IR FILE , +specifying the filename via an option rather than as the first filename +argument. +When \fB-o\fP is used, all non-option filename arguments specify input +files to be merged. +.TP .B -t TAG The input alignments have been sorted by the value of TAG, then by either position or name (if \fB-n\fP is given). diff -Nru samtools-1.11/doc/samtools-mpileup.1 samtools-1.13/doc/samtools-mpileup.1 --- samtools-1.11/doc/samtools-mpileup.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-mpileup.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-mpileup 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-mpileup 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools mpileup \- produces "pileup" textual format from an alignment .\" @@ -234,7 +234,7 @@ below. .TP .BI -G,\ --exclude-RG \ FILE -Exclude reads from readgroups listed in FILE (one @RG-ID per line) +Exclude reads from read groups listed in FILE (one @RG-ID per line) .TP .BI -l,\ --positions \ FILE BED or position list file containing a list of regions or sites where @@ -264,10 +264,10 @@ Ignore RG tags. Treat all reads in one BAM as one sample. .TP .BI --rf,\ --incl-flags \ STR|INT -Required flags: skip reads with mask bits unset [null] +Required flags: include reads with any of the mask bits set [null] .TP .BI --ff,\ --excl-flags \ STR|INT -Filter flags: skip reads with mask bits set +Filter flags: skip reads with any of the mask bits set [UNMAP,SECONDARY,QCFAIL,DUP] .TP .B -x,\ --ignore-overlaps @@ -337,7 +337,7 @@ tag values. 
.TP .BI "--output-sep" \ CHAR -Specify a different separtor character for tag value lists, when those values +Specify a different separator character for tag value lists, when those values might contain one or more commas (\fB,\fR), which is the default list separator. This option only affects columns for two-letter tags like NM; standard fields like FLAG or QNAME will always be separated by commas. @@ -381,7 +381,7 @@ .B -B option. -It is possible to store pre-calculated BAQ values in a SAM BQ:Z tag. +It is possible to store precalculated BAQ values in a SAM BQ:Z tag. Samtools mpileup will use the precalculated values if it finds them. The .B -E diff -Nru samtools-1.11/doc/samtools-phase.1 samtools-1.13/doc/samtools-phase.1 --- samtools-1.11/doc/samtools-phase.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-phase.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,7 +1,7 @@ '\" t -.TH samtools-phase 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-phase 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME -samtools phase \- call and phase heterozygous SNPS +samtools phase \- call and phase heterozygous SNPs .\" .\" Copyright (C) 2008-2011, 2013-2018 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. 
diff -Nru samtools-1.11/doc/samtools-quickcheck.1 samtools-1.13/doc/samtools-quickcheck.1 --- samtools-1.11/doc/samtools-quickcheck.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-quickcheck.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-quickcheck 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-quickcheck 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools quickcheck \- a rapid sanity check on input files .\" diff -Nru samtools-1.11/doc/samtools-reheader.1 samtools-1.13/doc/samtools-reheader.1 --- samtools-1.11/doc/samtools-reheader.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-reheader.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-reheader 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-reheader 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools reheader \- replaces the header in the input file .\" diff -Nru samtools-1.11/doc/samtools-rmdup.1 samtools-1.13/doc/samtools-rmdup.1 --- samtools-1.11/doc/samtools-rmdup.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-rmdup.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-rmdup 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-rmdup 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools rmdup \- removes duplicate reads (obsolete) .\" diff -Nru samtools-1.11/doc/samtools-sort.1 samtools-1.13/doc/samtools-sort.1 --- samtools-1.11/doc/samtools-sort.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-sort.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-sort 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-sort 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools sort \- sorts SAM/BAM/CRAM files .\" -.\" Copyright (C) 2008-2011, 2013-2019 Genome Research Ltd. 
+.\" Copyright (C) 2008-2011, 2013-2020 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li diff -Nru samtools-1.11/doc/samtools-split.1 samtools-1.13/doc/samtools-split.1 --- samtools-1.11/doc/samtools-split.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-split.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-split 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-split 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools split \- splits a file by read group. .\" diff -Nru samtools-1.11/doc/samtools-stats.1 samtools-1.13/doc/samtools-stats.1 --- samtools-1.11/doc/samtools-stats.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-stats.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-stats 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-stats 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools stats \- produces comprehensive statistics from alignment file .\" -.\" Copyright (C) 2008-2011, 2013-2018 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2018, 2020-2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li @@ -110,6 +110,9 @@ Reads where PAIRED is set and either both READ1 and READ2 are set or neither is set are not counted in either category. .PP +Information on the meaning of the flags is given in the SAM specification +document <https://samtools.github.io/hts-specs/SAMv1.pdf>. + The CHK row contains distinct CRC32 checksums of read names, sequences and quality values. The checksums are computed per alignment record and summed, meaning the checksum does not change if the input file has @@ -121,7 +124,8 @@ .RS .B raw total sequences -- total number of reads in a file. Same number reported by +- total number of reads in a file, excluding supplementary and secondary reads. +Same number reported by .BR "samtools view -c".
.B filtered sequences @@ -171,6 +175,9 @@ .B non-primary alignments - number of secondary reads (flag 0x100 (256) set). +.B supplementary alignments +- number of supplementary reads (flag 0x800 (2048) set). + .B total length - number of processed bases from reads that are neither secondary nor supplementary (flags 0x100 (256) and 0x800 (2048) are not set). diff -Nru samtools-1.11/doc/samtools-targetcut.1 samtools-1.13/doc/samtools-targetcut.1 --- samtools-1.11/doc/samtools-targetcut.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-targetcut.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-targetcut 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-targetcut 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools targetcut \- cut fosmid regions (for fosmid pool only) .\" diff -Nru samtools-1.11/doc/samtools-tview.1 samtools-1.13/doc/samtools-tview.1 --- samtools-1.11/doc/samtools-tview.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-tview.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,5 +1,5 @@ '\" t -.TH samtools-tview 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-tview 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools tview \- display alignments in a curses-based interactive viewer. .\" @@ -43,8 +43,7 @@ . 
.SH SYNOPSIS .PP -.B tview -samtools tview +.B samtools tview .RB [ -p .IR chr:pos ] .RB [ -s diff -Nru samtools-1.11/doc/samtools-view.1 samtools-1.13/doc/samtools-view.1 --- samtools-1.11/doc/samtools-view.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/doc/samtools-view.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,9 +1,9 @@ '\" t -.TH samtools-view 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH samtools-view 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME samtools view \- views and converts SAM/BAM/CRAM files .\" -.\" Copyright (C) 2008-2011, 2013-2020 Genome Research Ltd. +.\" Copyright (C) 2008-2011, 2013-2021 Genome Research Ltd. .\" Portions copyright (C) 2010, 2011 Broad Institute. .\" .\" Author: Heng Li @@ -43,8 +43,7 @@ . .SH SYNOPSIS .PP -.B view -samtools view +.B samtools view .RI [ options ] .IR in.sam | in.bam | in.cram .RI [ region ...] @@ -87,6 +86,7 @@ The .BR -L , .BR -M , +.BR -N , .BR -r , .BR -R , .BR -d , @@ -103,9 +103,11 @@ alignments that match certain criteria. The -.B -x +.BR -x , +.BR -B , +.BR --add-flags , and -.B -B +.B --remove-flags options modify the data which is contained in each alignment. The @@ -156,27 +158,32 @@ .SH OPTIONS .TP 10 -.B -b +.BR -b ", " --bam Output in the BAM format. .TP -.B -C +.BR -C ", " --cram Output in the CRAM format (requires -T). .TP -.B -1 +.BR -1 ", " --fast Enable fast BAM compression (implies -b). .TP -.B -u +.BR -u ", " --uncompressed Output uncompressed BAM. This option saves time spent on compression/decompression and is thus preferred when the output is piped to another samtools command. .TP -.B -h +.BR -h ", " --with-header Include the header in the output. .TP -.B -H +.BR -H ", " --header-only Output the header only. .TP -.B -c +.B --no-header +When producing SAM format, output alignment records but not headers. +This is the default; the option can be used to reset the effect of +.BR -h / -H . 
+.TP +.BR -c ", " --count Instead of printing the alignments, only count them and print the total number. All filter options, such as .BR -f , @@ -185,14 +192,14 @@ .BR -q , are taken into account. .TP -.B -? +.BR -? ", " --help Output long help and exit immediately. .TP -.BI "-o " FILE +.BI "-o " FILE ", --output " FILE Output to .I FILE [stdout]. .TP -.BI "-U " FILE +.BI "-U " FILE ", --unoutput " FILE ", --output-unselected " FILE Write alignments that are .I not selected by the various filter options to @@ -201,7 +208,7 @@ .I regions specified) are written to either the output file or this file, but never both. .TP -.BI "-t " FILE +.BI "-t " FILE ", --fai-reference " FILE A tab-delimited .IR FILE . Each line must contain the reference name in the first column and the length of @@ -213,7 +220,7 @@ can be used as this .IR FILE . .TP -.BI "-T " FILE +.BI "-T " FILE ", --reference " FILE A FASTA format reference .IR FILE , optionally compressed by @@ -239,12 +246,12 @@ find the index, and may not be able to decode the file unless it can get the references it needs using a different method. .TP -.BI "-L " FILE +.BI "-L " FILE ", --target-file " FILE ", --targets-file " FILE Only output alignments overlapping the input BED .I FILE [null]. .TP -.B "-M " +.BR -M ", " --use-index Use the multi-region iterator on the union of a BED file and command-line region arguments. This avoids re-reading the same regions of files so can sometimes be much faster. Note this also removes @@ -254,7 +261,21 @@ .BR -L option. .TP -.BI "-r " STR +.BI "--region-file " FILE ", --regions-file " FILE +Use an index and multi-region iterator to only output alignments +overlapping the input BED +.IR FILE . +Equivalent to +.BI "-M -L " FILE +or +.B --use-index --target-file +.IR FILE . +.TP +.BI "-N " FILE ", --qname-file " FILE +Output only alignments with read names listed in +.IR FILE . +.TP +.BI "-r " STR ", --read-group " STR Output alignments in read group .I STR [null]. 
@@ -263,7 +284,7 @@ tag will also be output when using this option. This behaviour may change in a future release. .TP -.BI "-R " FILE +.BI "-R " FILE ", --read-group-file " FILE Output alignments in read groups listed in .I FILE [null]. @@ -272,87 +293,120 @@ tag will also be output when using this option. This behaviour may change in a future release. .TP -.BI "-d " STR:STR +.BI "-d " STR1[:STR2] ", --tag " STR1[:STR2] Only output alignments with tag -.I STR +.I STR1 and associated value -.I STR -[null]. +.IR STR2 , +which can be a string or an integer [null]. +The value can be omitted, in which case only the tag is considered. .TP -.BI "-D " STR:FILE +.BI "-D " STR:FILE ", --tag-file " STR:FILE Only output alignments with tag .I STR and associated values listed in .I FILE [null]. .TP -.BI "-q " INT +.BI "-q " INT ", --min-MQ " INT Skip alignments with MAPQ smaller than .I INT [0]. .TP -.BI "-l " STR +.BI "-l " STR ", --library " STR Only output alignments in library .I STR [null]. .TP -.BI "-m " INT +.BI "-m " INT ", --min-qlen " INT Only output alignments with number of CIGAR bases consuming query sequence \(>= .I INT [0] .TP -.BI "-f " INT +.BI "-e " STR ", --expr " STR +Only include alignments that match the filter expression \fISTR\fR. +The syntax for these expressions is described in the main samtools(1) man page +under the FILTER EXPRESSIONS heading. +.TP +.BI "-f " FLAG ", --require-flags " FLAG Only output alignments with all bits set in -.I INT +.I FLAG present in the FLAG field. -.I INT -can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) -or in octal by beginning with `0' (i.e. /^0[0-7]+/) [0]. +.I FLAG +can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), +in octal by beginning with `0' (i.e. /^0[0-7]+/), as a decimal number +not beginning with '0' or as a comma-separated list of flag names. + + +For a list of flag names see +.IR samtools-flags (1). 
.TP -.BI "-F " INT +.BI "-F " FLAG ", --excl-flags " FLAG ", --exclude-flags " FLAG Do not output alignments with any bits set in -.I INT +.I FLAG present in the FLAG field. -.I INT -can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) -or in octal by beginning with `0' (i.e. /^0[0-7]+/) [0]. +.I FLAG +can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), +in octal by beginning with `0' (i.e. /^0[0-7]+/), as a decimal number +not beginning with '0' or as a comma-separated list of flag names. .TP -.BI "-G " INT +.BI "-G " FLAG Do not output alignments with all bits set in .I INT present in the FLAG field. This is the opposite of \fI-f\fR such that \fI-f12 -G12\fR is the same as no filtering at all. -.I INT -can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) -or in octal by beginning with `0' (i.e. /^0[0-7]+/) [0]. +.I FLAG +can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), +in octal by beginning with `0' (i.e. /^0[0-7]+/), as a decimal number +not beginning with '0' or as a comma-separated list of flag names. .TP -.BI "-x " STR +.BI "-x " STR ", --remove-tag " STR Read tag to exclude from output (repeatable) [null] .TP -.B -B +.BR -B ", " --remove-B Collapse the backward CIGAR operation. .TP -.BI "-s " FLOAT -Output only a proportion of the input alignments. +.BI "--add-flags " FLAG +Adds flag(s) to read. +.I FLAG +can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), +in octal by beginning with `0' (i.e. /^0[0-7]+/), as a decimal number +not beginning with '0' or as a comma-separated list of flag names. +.TP +.BI "--remove-flags " FLAG +Remove flag(s) from read. +.I FLAG +is specified in the same way as with the +.B "--add-flags" +option. +.TP +.BI "--subsample " FLOAT +Output only a proportion of the input alignments, as specified by 0.0 \(<= +.I FLOAT +\(<= 1.0, which gives the fraction of templates/pairs to be kept. 
This subsampling acts in the same way on all of the alignment records in the same template or read pair, so it never keeps a read but not its mate. -.IP -The integer and fractional parts of the -.BI "-s " INT . FRAC -option are used separately: the part after the -decimal point sets the fraction of templates/pairs to be kept, -while the integer part is used as a seed that influences +.TP +.BI "--subsample-seed " INT +Subsampling seed used to influence .I which subset of reads is kept. -.IP .\" Reads are retained based on a score computed by hashing their QNAME .\" field and the seed value. When subsampling data that has previously been subsampled, be sure to use a different seed value from those used previously; otherwise more reads will be retained than expected. +[0] .TP -.BI "-@ " INT +.BI "-s " FLOAT +Subsampling shorthand option: +.BI "-s " INT . FRAC +is equivalent to +.BI "--subsample-seed " INT " --subsample +.RI 0. FRAC . +.TP +.BI "-@ " INT ", --threads " INT Number of BAM compression threads to use in addition to main thread [0]. .TP .B -S @@ -361,7 +415,7 @@ correct format is automatically detected by examining the first few characters of input. .TP -.B -X +.BR -X ", " --customized-index Include customized index file as a part of arguments. See .B EXAMPLES section for sample of usage. @@ -375,14 +429,14 @@ .B @SQ lines are present in the header: .EX 2 -samtools view -bS aln.sam > aln.bam +samtools view -bo aln.bam aln.sam .EE If .B @SQ lines are absent: .EX 2 samtools faidx ref.fa -samtools view -bt ref.fa.fai aln.sam > aln.bam +samtools view -bt ref.fa.fai -o aln.bam aln.sam .EE where .I ref.fa.fai @@ -393,7 +447,7 @@ .IP o 2 Convert a BAM file to a CRAM file using a local reference sequence. 
.EX 2 -samtools view -C -T ref.fa aln.bam > aln.cram +samtools view -C -T ref.fa -o aln.cram aln.bam .EE .IP o 2 @@ -444,6 +498,12 @@ samtools view -d RG:grp2 -o /data_folder/data.rg2_only.bam /data_folder/data.bam .EE +.IP o 2 +Remove the actions of samtools markdup. Clear the duplicate flag and remove the \fBdt\fR tag, keep the header. +.EX 2 +samtools view -h --remove-flags DUP -x dt -o /data_folder/dat.no_dup_markings.bam /data_folder/data.bam +.EE + .SH AUTHOR .PP Written by Heng Li from the Sanger Institute. diff -Nru samtools-1.11/INSTALL samtools-1.13/INSTALL --- samtools-1.11/INSTALL 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/INSTALL 2021-07-07 17:18:52.000000000 +0000 @@ -123,6 +123,9 @@ installation directory (i.e., it has 'include' and 'lib' subdirectories containing HTSlib headers and libraries). + (If the desired HTSlib source tree has been configured to build in a + separate build directory, DIR should refer to the build directory.) + By default, configure looks for an HTSlib source tree within or alongside the samtools source directory; if there are several likely candidates, you will have to choose one via this option. diff -Nru samtools-1.11/LICENSE samtools-1.13/LICENSE --- samtools-1.11/LICENSE 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/LICENSE 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ The MIT/Expat License -Copyright (C) 2008-2020 Genome Research Ltd. +Copyright (C) 2008-2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff -Nru samtools-1.11/m4/ax_with_htslib.m4 samtools-1.13/m4/ax_with_htslib.m4 --- samtools-1.11/m4/ax_with_htslib.m4 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/m4/ax_with_htslib.m4 2021-07-07 17:18:52.000000000 +0000 @@ -16,8 +16,8 @@ # # Different checks occur depending on the --with-htslib argument given: # -# With --with-htslib=DIR, checks whether DIR is a source tree or contains -# a working installation. +# With --with-htslib=DIR, checks whether DIR is a source tree (including +# a separate build tree) or contains a working installation. # By default, searches for a source tree (with a name matching htslib*) # within or alongside $srcdir. Produces AC_MSG_ERROR if there are # several equally-likely candidates. If there are none, checks for @@ -43,14 +43,14 @@ # # LICENSE # -# Copyright (C) 2015,2017 Genome Research Ltd +# Copyright (C) 2015, 2017, 2021 Genome Research Ltd # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. 
-#serial 1 +#serial 3 AC_DEFUN([AX_WITH_HTSLIB], [AC_ARG_WITH([htslib], @@ -105,6 +105,8 @@ HTSDIR=$with_htslib if test -f "$HTSDIR/hts.c" && test -f "$HTSDIR/htslib/hts.h"; then ax_cv_htslib_which=source + elif test -f "$HTSDIR/htslib_vars.mk"; then + ax_cv_htslib_which=build else ax_cv_htslib_which=install fi @@ -121,6 +123,15 @@ AC_CONFIG_SUBDIRS($HTSDIR) fi ;; +build) + ax_cv_htslib=yes + ax_cv_htslib_which=source + # Ensure this is fully expanded, as the caller might not be using @HTSDIR@ + HTSSRCDIR=$(sed -n 's:\$(HTSDIR):'$HTSDIR': + s/^HTSSRCDIR *= *//p' "$HTSDIR/htslib_vars.mk") + HTSLIB_CPPFLAGS="-I$HTSSRCDIR" + HTSLIB_LDFLAGS="-L$HTSDIR" + ;; system) AC_CHECK_HEADER([htslib/sam.h], [AC_CHECK_LIB(hts, hts_version, [ax_cv_htslib=yes], [ax_cv_htslib=no])], diff -Nru samtools-1.11/Makefile samtools-1.13/Makefile --- samtools-1.11/Makefile 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/Makefile 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ # Makefile for samtools, utilities for the Sequence Alignment/Map format. # -# Copyright (C) 2008-2020 Genome Research Ltd. +# Copyright (C) 2008-2021 Genome Research Ltd. # Portions copyright (C) 2010-2012 Broad Institute. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -46,7 +46,7 @@ faidx.o dict.o stats.o stats_isize.o bam_flags.o bam_split.o \ bam_tview.o bam_tview_curses.o bam_tview_html.o bam_lpileup.o \ bam_quickcheck.o bam_addrprg.o bam_markdup.o tmp_file.o \ - bam_ampliconclip.o amplicon_stats.o + bam_ampliconclip.o amplicon_stats.o bam_import.o LZ4OBJS = $(LZ4DIR)/lz4.o prefix = /usr/local @@ -76,11 +76,10 @@ MISC_SCRIPTS = \ misc/blast2sam.pl misc/bowtie2sam.pl misc/export2sam.pl \ - misc/interpolate_sam.pl misc/novo2sam.pl \ - misc/plot-bamstats misc/psl2sam.pl \ + misc/fasta-sanitize.pl misc/interpolate_sam.pl misc/novo2sam.pl \ + misc/plot-ampliconstats misc/plot-bamstats misc/psl2sam.pl \ misc/sam2vcf.pl misc/samtools.pl misc/seq_cache_populate.pl \ - misc/soap2sam.pl misc/wgsim_eval.pl misc/zoom2sam.pl \ - misc/plot-ampliconstats + misc/soap2sam.pl misc/wgsim_eval.pl misc/zoom2sam.pl TEST_PROGRAMS = \ test/merge/test_bam_translate \ @@ -124,6 +123,13 @@ # echo '#define SAMTOOLS_VERSION "`git describe --always --dirty`"' > $@ version.h: echo '#define SAMTOOLS_VERSION "$(PACKAGE_VERSION)"' > $@ + echo '#define SAMTOOLS_CC "$(CC)"' >> $@ + echo '#define SAMTOOLS_CPPFLAGS "$(CPPFLAGS)"' >> $@ + echo '#define SAMTOOLS_CFLAGS "$(CFLAGS)"' >> $@ + echo '#define SAMTOOLS_LDFLAGS "$(LDFLAGS)"' >> $@ + echo '#define SAMTOOLS_HTSDIR "$(HTSDIR)"' >> $@ + echo '#define SAMTOOLS_LIBS "$(LIBS)"' >> $@ + echo '#define SAMTOOLS_CURSES_LIB "$(CURSES_LIB)"' >> $@ print-version: @echo $(PACKAGE_VERSION) @@ -167,13 +173,14 @@ bam.o: bam.c config.h $(bam_h) $(htslib_kstring_h) bam2bcf.o: bam2bcf.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) bam2bcf_indel.o: bam2bcf_indel.c config.h $(htslib_hts_h) $(htslib_sam_h) $(bam2bcf_h) $(htslib_khash_h) $(htslib_ksort_h) -bam2depth.o: bam2depth.c config.h $(htslib_sam_h) $(samtools_h) $(bedidx_h) $(sam_opts_h) +bam2depth.o: bam2depth.c config.h $(htslib_sam_h) 
$(samtools_h) $(bedidx_h) $(sam_opts_h) $(htslib_khash_h) coverage.o: coverage.c config.h $(htslib_sam_h) $(htslib_hts_h) $(samtools_h) $(sam_opts_h) bam_addrprg.o: bam_addrprg.c config.h $(htslib_sam_h) $(htslib_kstring_h) $(samtools_h) $(htslib_thread_pool_h) $(sam_opts_h) bam_aux.o: bam_aux.c config.h $(bam_h) bam_cat.o: bam_cat.c config.h $(htslib_bgzf_h) $(htslib_sam_h) $(htslib_cram_h) $(htslib_kstring_h) $(samtools_h) $(sam_opts_h) bam_color.o: bam_color.c config.h $(htslib_sam_h) bam_fastq.o: bam_fastq.c config.h $(htslib_sam_h) $(htslib_klist_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(htslib_thread_pool_h) $(samtools_h) $(sam_opts_h) +bam_import.o: bam_import.c config.h $(htslib_sam_h) $(htslib_thread_pool_h) $(samtools_h) $(sam_opts_h) bam_index.o: bam_index.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_h) $(samtools_h) $(sam_opts_h) bam_lpileup.o: bam_lpileup.c config.h $(bam_plbuf_h) $(bam_lpileup_h) $(htslib_ksort_h) bam_mate.o: bam_mate.c config.h $(htslib_thread_pool_h) $(sam_opts_h) $(htslib_kstring_h) $(htslib_sam_h) $(samtools_h) @@ -190,9 +197,9 @@ bam_tview.o: bam_tview.c config.h $(bam_tview_h) $(htslib_faidx_h) $(htslib_sam_h) $(htslib_bgzf_h) $(samtools_h) $(sam_opts_h) bam_tview_curses.o: bam_tview_curses.c config.h $(bam_tview_h) bam_tview_html.o: bam_tview_html.c config.h $(bam_tview_h) -bam_flags.o: bam_flags.c config.h $(htslib_sam_h) +bam_flags.o: bam_flags.c config.h $(htslib_sam_h) $(samtools_h) bamshuf.o: bamshuf.c config.h $(htslib_sam_h) $(htslib_hts_h) $(htslib_ksort_h) $(samtools_h) $(htslib_thread_pool_h) $(sam_opts_h) $(htslib_khash_h) -bamtk.o: bamtk.c config.h $(htslib_hts_h) $(samtools_h) version.h +bamtk.o: bamtk.c config.h $(htslib_hts_h) $(htslib_hfile_h) $(samtools_h) version.h bedcov.o: bedcov.c config.h $(htslib_kstring_h) $(htslib_sam_h) $(htslib_thread_pool_h) $(samtools_h) $(sam_opts_h) $(htslib_kseq_h) bedidx.o: bedidx.c config.h $(bedidx_h) $(htslib_ksort_h) $(htslib_kseq_h) $(htslib_khash_h) 
cut_target.o: cut_target.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) $(samtools_h) $(sam_opts_h) @@ -203,7 +210,7 @@ sam.o: sam.c config.h $(htslib_faidx_h) $(sam_h) sam_opts.o: sam_opts.c config.h $(sam_opts_h) sam_utils.o: sam_utils.c config.h $(samtools_h) -sam_view.o: sam_view.c config.h $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_thread_pool_h) $(samtools_h) $(sam_opts_h) $(bedidx_h) +sam_view.o: sam_view.c config.h $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_thread_pool_h) $(htslib_hts_expr_h) $(samtools_h) $(sam_opts_h) $(bedidx_h) sample.o: sample.c config.h $(sample_h) $(htslib_khash_h) stats_isize.o: stats_isize.c config.h $(stats_isize_h) $(htslib_khash_h) stats.o: stats.c config.h $(htslib_faidx_h) $(htslib_sam_h) $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(samtools_h) $(htslib_khash_h) $(htslib_kstring_h) $(stats_isize_h) $(sam_opts_h) $(bedidx_h) @@ -276,6 +283,9 @@ test/test.o: test/test.c config.h $(htslib_sam_h) $(test_test_h) test/vcf-miniview.o: test/vcf-miniview.c config.h $(htslib_vcf_h) +# test HTSlib as well, where it is built alongside SAMtools + +check-all test-all: test-htslib test # misc programs @@ -335,6 +345,11 @@ clean-all: clean clean-htslib +distclean-all: distclean distclean-htslib + +mostlyclean-all: mostlyclean mostlyclean-htslib + +testclean-all: testclean testclean-htslib tags: ctags -f TAGS *.[ch] misc/*.[ch] @@ -343,5 +358,6 @@ force: -.PHONY: all check clean clean-all distclean force install -.PHONY: lib mostlyclean print-version tags test testclean +.PHONY: all check check-all clean clean-all distclean distclean-all force +.PHONY: install lib mostlyclean mostlyclean-all print-version tags +.PHONY: test test-all testclean testclean-all diff -Nru samtools-1.11/misc/export2sam.pl samtools-1.13/misc/export2sam.pl --- samtools-1.11/misc/export2sam.pl 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/misc/export2sam.pl 2021-07-07 17:18:52.000000000 +0000 
@@ -9,7 +9,7 @@ # # The MIT License # -# Copyright (c) 2008-2010 Genome Research Ltd. +# Copyright (c) 2008-2011 Genome Research Ltd. # Modifications Copyright (c) 2010 Illumina, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy diff -Nru samtools-1.11/misc/fasta-sanitize.pl samtools-1.13/misc/fasta-sanitize.pl --- samtools-1.11/misc/fasta-sanitize.pl 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/misc/fasta-sanitize.pl 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,97 @@ +#!/usr/bin/env perl +# +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +# Usage: fasta-sanitize.pl file.in > file.out +# Or via stdin, e.g. zcat file.in.gz | fasta-sanitize.pl > file.out +# +# Also supports and autodetects fastq. + +# This tool sanitizes the reference names as per the SAM specification. 
+# See SAM pull request https://github.com/samtools/hts-specs/pull/333 + +# It is important that this is run prior to aligning. This ensures the +# SAM file contains @SQ lines which adhere to the specification, and that +# VCF files produced from those SAM files also match the VCF specification. +# +# Furthermore, doing this early rather than via later with "samtools reheader" +# means that the fasta file matches the SAM file. +# Several samtools and bcftools sub-commands also require a fasta +# reference, which must match the primary SQ lines, plus it helps CRAM +# if using "view -T" (although it'll still work if using the +# recommended practice of an MD5sum based ref cache). + +# The regexp permitted is: +# [0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]* + +use strict; + +my $re = qr/^[0-9A-Za-z!#$%&+.\/:;?@^_|~-][0-9A-Za-z!#$%&*+.\/:;=?@^_|~-]*$/; + +my $fastq = 0; +my $in_qual = 0; +my $seq_len = 0; + +my $name_re = qr/^([>@])\s*(\S*)(.*)/; + +while (<>) { + # Name + if (/$name_re/ && !$in_qual) { + my ($prefix, $name, $other) = ($1,$2,$3); + $fastq = ($prefix eq "@") ? 
1 : 0; + + if ($name !~ /$re/) { + my ($l,$r)=($name=~/(.)(.*)/); + $l =~ tr/[0-9A-Za-z!#$%&+.\/:;?@^_|~\-]/_/c; + $r =~ tr/[0-9A-Za-z!#$%&*+.\/:;=?@^_|~\-]/_/c; + my $new_name = $l.$r; + + print STDERR "Renaming reference $name to $new_name\n"; + $name = $new_name; + $seq_len = 0; + } + + print "$prefix$name$other\n"; + next; + } + + if (!$in_qual) { + # FASTQ separator between seq and qual + if ($fastq && /^\+/) { + print; + $in_qual = 1; + next; + } + + # Seq + print; + chomp($_); + $seq_len += length($_); + } else { + # Qual + print; + chomp($_); + $in_qual = 0 if (($seq_len -= length($_)) <= 0); + } +} diff -Nru samtools-1.11/misc/plot-ampliconstats samtools-1.13/misc/plot-ampliconstats --- samtools-1.11/misc/plot-ampliconstats 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/misc/plot-ampliconstats 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2020 Genome Research Ltd. +# Copyright (C) 2020-2021 Genome Research Ltd. # # Author: James Bonfield # @@ -107,15 +107,33 @@ #----------------------------------------------------------------------------- # Load plot meta-data # This allows us to auto-scale plots. -my $namp = 0; +my $namp = 0; # total number +my %namp = (); # number per reference my $nfile = 0; -my $ref_len = 0; +my %ref_len = (); +my $multi_ref = 0; # Flag for multi-ref mode. Autodetected. 
+my $ref = "_"; +my %ref_start = (); # cumulative position in genome +my $total_len = 0; + while (<>) { chomp($_); my @F = split("\t", $_); - $namp = $F[2] if (/^SS\tNumber of amplicons/); + if (/^SS\tNumber of amplicons/) { + $multi_ref = 1 if (scalar(@F) > 3); + $ref = $F[2]; + } + if (/^SS\tNumber of amplicons/) { + $namp{$ref} = $F[2+$multi_ref]; + $namp += $namp{$ref}; + } $nfile = $F[2] if (/^SS\tNumber of files/); - $ref_len = $F[2] if (/^SS\tReference length/); + if (/^SS\tReference length/) { + $ref_len{$ref} = $F[2+$multi_ref]; + $ref_start{$ref} = $total_len; + $total_len += $ref_len{$ref}; + } + last if (/^SS\tEnd of summary/); } @@ -236,8 +254,9 @@ my %fh_cover_page; my %fh_cover_nfile; -my @amp_start; # coords for amplicons +my @amp_start; # coords for amplicons; cumulative across all refs my @amp_end; +my @amp2ref; # map amplicon number to ref name #----------------------------------------------------------------------------- # Parse stats file, writing to a number of outputs simultaneously @@ -251,19 +270,21 @@ # Amplicon coordinates if (/^AMPLICON/) { my $min_left=1e9; - foreach (split(",",$F[2])) { + $ref = $F[1] if $multi_ref; + foreach (split(",",$F[2+$multi_ref])) { /\d+-(\d+)/; $min_left=$1 if ($min_left > $1); } my $max_right=0; - foreach (split(",",$F[3])) { + foreach (split(",",$F[3+$multi_ref])) { /(\d+)-\d+/; $max_right=$1 if ($max_right < $1); } - $amp_start[$F[1]]=$min_left; - $amp_end[$F[1]]=$max_right; + $amp_start[$F[1+$multi_ref]]=$min_left + $ref_start{$ref}; + $amp_end[$F[1+$multi_ref]]=$max_right + $ref_start{$ref}; + $amp2ref[$F[1+$multi_ref]] = $multi_ref ? 
$F[1] : "_"; - $ref_len = $max_right if ($ref_len < $max_right); + $ref_len{$ref} = $max_right if ($ref_len{$ref} < $max_right); } # Heatmaps showing all files & all amplicons @@ -275,9 +296,18 @@ # Initialise gnuplot files as we create new pages #---------- Heatmap READS + # Input order is: + # TYPE file1 ref1 + # TYPE file1 ref2 + # TYPE file2 ref1 + # TYPE file2 ref2 + # + # Heatmaps combine file1/2 together, but the ref1/2 are interleaved. + # This means we need to load the file contents first so we can consume + # in another order. + $fh_reads_count++ if (/^FREADS/); - if ($fh_reads_count > $opts{page} && $nfile-$opts{page}*$fh_reads_page!=1) { - print STDERR "page $fh_reads_page\n"; + if (($fh_reads_count > $opts{page} || 0) && $nfile-$opts{page}*$fh_reads_page!=1) { if (defined($fh_reads)) { print $fh_reads "\nend\n"; close($fh_reads); @@ -411,11 +441,11 @@ } else { # Subsequent lines are per amplicon stats, # corrected for small sample size via amp_add. - print $fh_amp "\t", 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add}); + print $fh_amp "\t", + 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add}); } } - #---------- Heatmap READ-PERC, clipped and log $fh_rperc_count++ if (/^FRPERC/); if ($fh_rperc_count > $opts{page} @@ -541,12 +571,12 @@ # Graphs with merged file data (mean and SD) for all amplicons local $"="\n"; print $ch_reads "\$mean << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*MEAN/); - print $ch_reads "\$sd << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*STDDEV/); + print $ch_reads "\$sd << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*STDDEV/); print $ch_depth "\$mean << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*MEAN/); - print $ch_depth "\$sd << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*STDDEV/); - print $ch_rperc "@F[2..$#F]\nend\n" if (/^CRPERC.*(MEAN|STDDEV)/); - print $ch_amp 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add}),"\n" - if (/^CAMP\tCOMBINED\t[1-9]/); + print $ch_depth "\$sd << EOD\n@F[2..$#F]\nEOD\n\n" if (/^CDEPTH.*STDDEV/); + print 
$ch_rperc "@F[2..$#F]\nend\n" if (/^CRPERC.*(MEAN|STDDEV)/); + print $ch_amp 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add}),"\n" + if (/^CAMP\tCOMBINED\t[1-9]/); # These only occur once so we can open the file in situ rather # than keep appending to it. @@ -610,25 +640,35 @@ #push(@{$file{$F[1]}{keys}}, $F[0]); } if (/^FAMP\s\S+\s[1-9]*/) { - push(@{$file{$F[1]}{$F[0]}}, 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add})); + push(@{$file{$F[1]}{$F[0]}}, 100*($F[4]+$F[5])/($F[3]+$F[4]+$F[5]+$opts{amp_add})); } if (/^FTCOORD/) { - local $"="\t$F[2]\n"; - $_="@F[3..$#F]"; - s/,/\t/g; - push(@{$file{$F[1]}{$F[0]}}, "$_\t$F[2]") if ($_ ne ""); + $_ = ""; + foreach my $x (@F[3..$#F]) { + my @a = split(",", $x); + $a[0] += $ref_start{$amp2ref[$F[2]]}; + $a[1] += $ref_start{$amp2ref[$F[2]]}; + $_ .= join("\t", @a) . "\t$F[2]\n"; + } + s/\n$//; + push(@{$file{$F[1]}{$F[0]}}, "$_") if ($_ ne ""); } if (/^CTCOORD/) { - local $"="\t$F[2]\n"; - $_="@F[3..$#F]"; - s/,/\t/g; - push(@combined_coord, "$_\t$F[2]") if ($_ ne ""); + $_ = ""; + foreach my $x (@F[3..$#F]) { + my @a = split(",", $x); + $a[0] += $ref_start{$amp2ref[$F[2]]}; + $a[1] += $ref_start{$amp2ref[$F[2]]}; + $_ .= join("\t", @a) . 
"\t$F[2]\n"; + } + s/\n$//; + push(@combined_coord, "$_") if ($_ ne ""); } if(/^[FC]DP_(ALL|VALID)/) { local $"="\n"; - push(@{$file{$F[1]}{$F[0]}}, "@F[2..$#F]") if ($_ ne ""); + push(@{$file{$F[1]}{$F[0]}}, "@F[3..$#F]") if ($_ ne ""); } } @@ -808,7 +848,7 @@ set title "$fg: Template coordinate frequencies" unset key set xlabel "position" -set xrange [0:$ref_len] +set xrange [0:$total_len] set ylabel "frequency" set yrange [*:*] set logscale y @@ -856,7 +896,7 @@ set title "$fg: Template sizes" unset key set xlabel "position" -set xrange [0:$ref_len] +set xrange [0:$total_len] set ylabel "length" set yrange [10:10000] set logscale y @@ -911,7 +951,7 @@ set title "Template coordinate frequencies, all files" unset key set xlabel "position" -set xrange [0:$ref_len] +set xrange [0:$total_len] set ylabel "frequency" set yrange [*:*] set logscale y @@ -961,7 +1001,7 @@ set title "$fg: Template depth per base" set key below set xlabel "position" -set xrange [0:$ref_len] +set xrange [0:$total_len] set ylabel "depth" set yrange [1:$max_depth] set logscale y @@ -1025,7 +1065,7 @@ set title "Template depth per base, all files" set key below set xlabel "position" -set xrange [0:$ref_len] +set xrange [0:$total_len] set ylabel "depth" set yrange [1:$max_depth] set logscale y @@ -1211,7 +1251,6 @@ next unless /^FPCOV-(\d+)/; print $fh "@{$file{$f}{$_}}[2..$#{$file{$f}{$_}}]\nend\n"; } - #print $fh "@{$file{$f}{FRPERC}}[2..$#{$file{$f}{FRPERC}}]\nend\n"; close($fh); system("gnuplot", "$fn.gp") && die; diff -Nru samtools-1.11/misc/seq_cache_populate.pl samtools-1.13/misc/seq_cache_populate.pl --- samtools-1.11/misc/seq_cache_populate.pl 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/misc/seq_cache_populate.pl 2021-07-07 17:18:52.000000000 +0000 @@ -2,7 +2,7 @@ # The MIT License -# Copyright (c) 2014 Genome Research Ltd. +# Copyright (c) 2014, 2020 Genome Research Ltd. 
# Author: Rob Davies # Permission is hereby granted, free of charge, to any person obtaining a copy diff -Nru samtools-1.11/misc/wgsim.1 samtools-1.13/misc/wgsim.1 --- samtools-1.11/misc/wgsim.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/misc/wgsim.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,4 +1,4 @@ -.TH wgsim 1 "22 September 2020" "samtools-1.11" "Bioinformatics tools" +.TH wgsim 1 "7 July 2021" "samtools-1.13" "Bioinformatics tools" .SH NAME wgsim \- Whole-genome sequencing read simulator .SH SYNOPSIS diff -Nru samtools-1.11/NEWS samtools-1.13/NEWS --- samtools-1.11/NEWS 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/NEWS 2021-07-07 17:18:52.000000000 +0000 @@ -1,3 +1,184 @@ +Release 1.13 (7th July 2021) +---------------------------- + + * Fixed samtools view FILE REGION, mpileup -r REGION, coverage -r REGION and + other region queries: fixed bug introduced in 1.12, which led to region + queries producing very few reads for some queries (especially for larger + target regions) when unmapped reads were present. + Thanks to @vinimfava (#1451), @JingGuo1997 (#1457) and Ramprasad Neethiraj + (#1460) for reporting the respective issues. + + * Added options to set and clear flags to samtools view. Along with the + existing remove aux tags this gives the ability to remove mark duplicate + changes (part of #1358) + (#1441) + + * samtools view now has long option equivalents for most of its single-letter + options. Thanks to John Marshall. + (#1442) + + * A new tool, samtools import, has been added. It reads one or more FASTQ + files and converts them into unmapped SAM, BAM or CRAM. + (#1323) + + * Fixed samtools coverage error message when the target region name is not + present in the file header. Thanks to @Lyn16 for reporting it. + (#1462; fixes #1461) + + * Made samtools coverage ASCII mode produce true ASCII output. Previously it + would produce UTF-8 characters. 
+ (#1423; fixes #1419) + + * samtools coverage now allows setting the maximum depth, using the -d/--depth + option. Also, the default maximum depth has been set to 1000000. + (#1415; fixes #1395) + + * Complete rewrite of samtools depth. This means it is now considerably faster + and does not need a depth limit to avoid high memory usage. Results should + mostly be the same as the old command with the potential exception of overlap + removal. + (#1428; fixes #889, helps ameliorate #1411) + + * samtools flags now accepts any number of command line arguments, + allowing multiple SAM flag combinations to be converted at once. Thanks to + John Marshall. + (#1401, fixes #749) + + * samtools ampliconclip, ampliconstats and plot-ampliconstats now support + inputs that list more than one reference. + (#1410 and #1417; fixes #1396 and #1418) + + * samtools ampliconclip now accepts the --tolerance option, which allows the + user to set the number of bases within which a region is matched. The + default is 5. + (#1456) + + * Updated the documentation on samtools ampliconclip to be clearer about what + it does. From a suggestion by Nathan S Watson-Haigh. + (#1448) + + * Fixed negative depth values in ampliconstats output. + (#1400) + + * samtools addreplacerg now allows for updating (replacing) an existing + `@RG` line in the output header, if a new `@RG` line is provided in the + command line, via the -r argument. The update still requires the user's + approval, which can be given with the new -w option. Thanks to Chuang Yu. + (#1404) + + * Stopped samtools cat from outputting multiple CRAM EOF markers. + (#1422) + + * Three new counts have been added to samtools flagstat: primary, mapped + primary and duplicate primary. + (#1431; fixes #1382) + + * samtools merge now accepts a `-o FILE` option specifying the output file, + similarly to most other subcommands. 
The existing way of specifying it + (as the first non-option argument, alongside the input file arguments) + remains supported. Thanks to David McGaughey and John Marshall. + (#1434) + + * The way samtools merge checks for existing files has been changed + so that it does not hang when used on a named pipe. + (#1438; fixes #1437) + + * Updated documentation on mpileup to highlight the fact that the filtering + options on FLAGs work with ANY rules. + (#1447; fixes #1435) + + * samtools can now be configured to use a copy of HTSlib that has been set + up with separate build and source trees. When this is the case, the + `--with-htslib` configure option should be given the location of the HTSlib + build tree. (Note that samtools itself does not yet support out-of-tree + builds). Thanks to John Marshall. + (#1427; companion change to samtools/htslib#1277) + + +Release 1.12 (17th March 2021) +------------------------------ + + * The legacy samtools API (libbam.a, bam.h, sam.h, etc) has not been actively + maintained since 2015. It is deprecated and will be removed entirely in a + future SAMtools release. We recommend coding against the HTSlib API directly. + + * I/O errors and record parsing errors during the reading of SAM/BAM/CRAM + files are now always detected. Thanks to John Marshall (#1379; fixed #101) + + * New make targets have been added: check-all, test-all, distclean-all, + mostlyclean-all, testclean-all, which allow SAMtools installations to + call corresponding Makefile targets from embedded HTSlib installations. + + * samtools --version now displays a summary of the compilation details and + available features, including flags, used libraries and enabled plugins + from HTSlib. As an alias, `samtools version` can also be used. (#1371) + + * samtools stats now displays the number of supplementary reads in the + SN section. Also, supplementary reads are no longer considered when + splitting read pairs by orientation (inward, outward, other). 
(#1363) + + * samtools stats now counts only the filtered alignments that overlap + target regions, if any are specified. (#1363) + + * samtools view now accepts option -N, which takes a file containing + read names of interest. This allows the output of only the reads with + names contained in the given file. Thanks to Daniel Cameron. (#1324) + + * samtools view -d option now works without an associated tag value, which + allows it to output all the reads with the given tag. (#1339; fixed #1317) + + * samtools view -d and -D options now accept integer and single character + values associated with tags, not just strings. Thanks to `@dariome` and + Keiran Raine for the suggestions. (#1357, #1392) + + * samtools view now works with the filtering expressions introduced by HTSlib. + The filtering expression is passed to the program using the specific option + -e or the global long option --input-fmt-option. E.g. + `samtools view -e 'qname =~ "#49$" && mrefid != refid && refid != -1 && + mrefid != -1' align.bam` looks for records with query-name ending in `#49` + that have their mate aligned in a different chromosome. More details can be + found in the FILTER EXPRESSIONS section of the main man page. (#1346) + + * samtools markdup now benefits from an increase in performance in the + situation when a single read has tens or hundreds of thousands of duplicates. + Thanks to `@denriquez` for reporting the issue. (#1345; fixed #1325) + + * The documentation for samtools ampliconstats has been added to the samtools + man page. (#1351) + + * A new FASTA/FASTQ sanitizer script (`fasta-sanitize.pl`) was added, which + corrects the invalid characters in the reference names. (#1314) Thanks to + John Marshall for the installation fix. (#1353) + + * The CI scripts have been updated to recurse the HTSlib submodules when + cloning HTSlib, to accommodate the CRAM codecs, which now reside in the + htscodecs submodule.
(#1359) + + * The CI integrations now include Cirrus-CI rather than Travis. (#1335; #1365) + + * Updated the Windows image used by Appveyor to 'Visual Studio 2019'. + (#1333; fixed #1332) + + * Fixed a bug in samtools cat, which prevented the command from running in + multi-threaded mode. Thanks to Alex Leonard for reporting the issue. + (#1337; fixed #1336) + + * A couple of invalid CIGAR strings have been corrected in the test data. + (#1343) + + * The documentation for `samtools depth -s` has been improved. + Thanks to `@wulj2`. (#1355) + + * Fixed a `samtools merge` segmentation fault when it failed to merge + header `@PG` records. Thanks to John Marshall. (#1394; reported by + Kemin Zhou in #1393) + + * Ampliconclip and ampliconstats now guard against the BED file + containing more than one reference (chromosome) and fail when + found. Adding proper support for multiple references will appear + later. (#1398) + + Release 1.11 (22nd September 2020) ---------------------------------- diff -Nru samtools-1.11/phase.c samtools-1.13/phase.c --- samtools-1.11/phase.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/phase.c 2021-07-07 17:18:52.000000000 +0000 @@ -583,6 +583,7 @@ int main_phase(int argc, char *argv[]) { int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0, usage = 0; + int status = EXIT_SUCCESS; const bam_pileup1_t *plp; bam_plp_t iter; nseq_t *seqs; @@ -785,6 +786,12 @@ return 1; } } + + if (n < 0) { + print_error("phase", "error reading from '%s'", argv[optind]); + status = EXIT_FAILURE; + } + sam_hdr_destroy(g.fp_hdr); bam_plp_destroy(iter); sam_close(g.fp); @@ -809,5 +816,5 @@ } free(g.arg_list); sam_global_args_free(&ga); - return 0; + return status; } diff -Nru samtools-1.11/README samtools-1.13/README --- samtools-1.11/README 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/README 2021-07-07 17:18:52.000000000 +0000 @@ -9,7 +9,7 @@ The typical simple case of building Samtools using the HTSlib bundled within this Samtools release 
tarball is done as follows: - cd .../samtools-1.11 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.11 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.11 # Within the unpacked release directory + cd .../samtools-1.13 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ the source distribution instead of installing the package. In that case you can use: - cd .../samtools-1.11 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.11 + cd .../samtools-1.13 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.13 make all all-htslib It is possible to override the built-in search path using the HTS_PATH @@ -99,3 +99,28 @@ It is recommended that you perform your own rigorous tests for an entire pipeline if you wish to switch to one of the optimised zlib implementations. 
+ +Citing +====== + +Please cite this paper when using SAMtools for your publications: + +Twelve years of SAMtools and BCFtools +Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li +GigaScience, Volume 10, Issue 2, February 2021, giab008, https://doi.org/10.1093/gigascience/giab008 + +@article{10.1093/gigascience/giab008, + author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng}, + title = "{Twelve years of SAMtools and BCFtools}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods. The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines. Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed >1 million times via Bioconda.
The source code and documentation are available from https://www.htslib.org.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab008}, + url = {https://doi.org/10.1093/gigascience/giab008}, + note = {giab008}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf}, +} diff -Nru samtools-1.11/sam_view.c samtools-1.13/sam_view.c --- samtools-1.11/sam_view.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/sam_view.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2020 Genome Research Ltd. + Copyright (C) 2009-2021 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -37,20 +37,20 @@ #include "htslib/faidx.h" #include "htslib/khash.h" #include "htslib/thread_pool.h" +#include "htslib/hts_expr.h" #include "samtools.h" #include "sam_opts.h" #include "bedidx.h" -KHASH_SET_INIT_STR(rg) -KHASH_SET_INIT_STR(tv) +KHASH_SET_INIT_STR(str) -typedef khash_t(rg) *rghash_t; -typedef khash_t(tv) *tvhash_t; +typedef khash_t(str) *strhash_t; // This structure contains the settings for a samview run typedef struct samview_settings { - rghash_t rghash; - tvhash_t tvhash; + strhash_t rghash; + strhash_t rnhash; + strhash_t tvhash; int min_mapQ; int flag_on; int flag_off; @@ -65,6 +65,9 @@ char** remove_aux; int multi_region; char* tag; + hts_filter_t *filter; + int remove_flag; + int add_flag; } samview_settings_t; @@ -97,19 +100,39 @@ if (settings->rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { - khint_t k = kh_get(rg, settings->rghash, (char*)(s + 1)); + khint_t k = kh_get(str, settings->rghash, (char*)(s + 1)); if (k == kh_end(settings->rghash)) return 1; } } - if (settings->tvhash && settings->tag) { + if (settings->tag) { uint8_t *s = bam_aux_get(b, settings->tag); if (s) { - khint_t k = kh_get(tv, settings->tvhash, (char*)(s + 1)); - if (k == kh_end(settings->tvhash)) return 1; + if (settings->tvhash) { + char 
t[32], *val; + if (*s == 'i' || *s == 'I' || *s == 's' || *s == 'S' || *s == 'c' || *s == 'C') { + int ret = snprintf(t, 32, "%"PRId64, bam_aux2i(s)); + if (ret > 0) val = t; + else return 1; + } else if (*s == 'A') { + t[0] = *(s+1); + t[1] = 0; + val = t; + } else { + val = (char *)(s+1); + } + khint_t k = kh_get(str, settings->tvhash, val); + if (k == kh_end(settings->tvhash)) return 1; + } } else { return 1; } } + if (settings->rnhash) { + const char* rn = bam_get_qname(b); + if (!rn || kh_get(str, settings->rnhash, rn) == kh_end(settings->rnhash)) { + return 1; + } + } if (settings->library) { const char *p = bam_get_library((sam_hdr_t*)h, b); if (!p || strcmp(p, settings->library) != 0) return 1; @@ -123,11 +146,43 @@ } } } + + if (settings->filter && sam_passes_filter(h, b, settings->filter) < 1) + return 1; + return 0; } static int usage(FILE *fp, int exit_status, int is_long_help); +static int populate_lookup_from_file(const char *subcmd, strhash_t lookup, char *fn) +{ + FILE *fp; + char buf[1024]; + int ret = 0; + fp = fopen(fn, "r"); + if (fp == NULL) { + print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); + return -1; + } + + while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { + char *d = strdup(buf); + if (d != NULL) { + kh_put(str, lookup, d, &ret); + if (ret == 0) free(d); /* Duplicate */ + } else { + ret = -1; + } + } + if (ferror(fp)) ret = -1; + if (ret == -1) { + print_error_errno(subcmd, "failed to read \"%s\"", fn); + } + fclose(fp); + return (ret != -1) ? 
0 : -1; +} + static int add_read_group_single(const char *subcmd, samview_settings_t *settings, char *name) { char *d = strdup(name); @@ -136,11 +191,11 @@ if (d == NULL) goto err; if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); + settings->rghash = kh_init(str); if (settings->rghash == NULL) goto err; } - kh_put(rg, settings->rghash, d, &ret); + kh_put(str, settings->rghash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -151,40 +206,28 @@ return -1; } -static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +static int add_read_names_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; - if (settings->rghash == NULL) { - settings->rghash = kh_init(rg); - if (settings->rghash == NULL) { + if (settings->rnhash == NULL) { + settings->rnhash = kh_init(str); + if (settings->rnhash == NULL) { perror(NULL); return -1; } } + return populate_lookup_from_file(subcmd, settings->rnhash, fn); +} - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(rg, settings->rghash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; +static int add_read_groups_file(const char *subcmd, samview_settings_t *settings, char *fn) +{ + if (settings->rghash == NULL) { + settings->rghash = kh_init(str); + if (settings->rghash == NULL) { + perror(NULL); + return -1; } } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->rghash, fn); } static int add_tag_value_single(const char *subcmd, samview_settings_t *settings, char *name) @@ -195,11 +238,11 @@ if (d == NULL) goto err; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) goto err; } - kh_put(tv, settings->tvhash, d, &ret); + kh_put(str, settings->tvhash, d, &ret); if (ret == -1) goto err; if (ret == 0) free(d); /* Duplicate */ return 0; @@ -212,38 +255,14 @@ static int add_tag_values_file(const char *subcmd, samview_settings_t *settings, char *fn) { - FILE *fp; - char buf[1024]; - int ret = 0; if (settings->tvhash == NULL) { - settings->tvhash = kh_init(tv); + settings->tvhash = kh_init(str); if (settings->tvhash == NULL) { perror(NULL); return -1; } } - - fp = fopen(fn, "r"); - if (fp == NULL) { - print_error_errno(subcmd, "failed to open \"%s\" for reading", fn); - return -1; - } - - while (ret != -1 && !feof(fp) && fscanf(fp, "%1023s", buf) > 0) { - char *d = strdup(buf); - if (d != NULL) { - kh_put(tv, settings->tvhash, d, &ret); - if (ret == 0) free(d); /* Duplicate */ - } else { - ret = -1; - } - } - if (ferror(fp)) ret = -1; - if (ret == -1) { - print_error_errno(subcmd, "failed to read \"%s\"", fn); - } - fclose(fp); - return (ret != -1) ? 
0 : -1; + return populate_lookup_from_file(subcmd, settings->tvhash, fn); } static inline int check_sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b, const char *fname, int *retp) @@ -258,6 +277,18 @@ return r; } +static inline void change_flag(bam1_t *b, samview_settings_t *settings) +{ + if (settings->add_flag) + b->core.flag |= settings->add_flag; + + if (settings->remove_flag) + b->core.flag &= ~settings->remove_flag; +} + +// Make mnemonic distinct values for longoption-only options +#define LONGOPT(c) ((c) + 128) + int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0, has_index_file = 0, no_pg = 0; @@ -265,7 +296,7 @@ samFile *in = 0, *out = 0, *un_out=0; FILE *fp_out = NULL; sam_hdr_t *header = NULL; - char out_mode[5], out_un_mode[5], *out_format = ""; + char out_mode[6] = {0}, out_un_mode[6] = {0}, *out_format = ""; char *fn_in = 0, *fn_idx_in = 0, *fn_out = 0, *fn_fai = 0, *q, *fn_un_out = 0; char *fn_out_idx = NULL, *fn_un_out_idx = NULL, *arg_list = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -287,12 +318,59 @@ .library = NULL, .bed = NULL, .multi_region = 0, - .tag = NULL + .tag = NULL, + .filter = NULL, + .remove_flag = 0, + .add_flag = 0 }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), - {"no-PG", no_argument, NULL, 1}, + {"add-flags", required_argument, NULL, LONGOPT('a')}, + {"bam", no_argument, NULL, 'b'}, + {"count", no_argument, NULL, 'c'}, + {"cram", no_argument, NULL, 'C'}, + {"customised-index", no_argument, NULL, 'X'}, + {"customized-index", no_argument, NULL, 'X'}, + {"excl-flags", required_argument, NULL, 'F'}, + {"exclude-flags", required_argument, NULL, 'F'}, + {"expr", required_argument, NULL, 'e'}, + {"expression", required_argument, NULL, 'e'}, + {"fai-reference", required_argument, NULL, 't'}, + {"fast", no_argument, NULL, '1'}, + {"header-only", no_argument, NULL, 'H'}, + {"help", no_argument, 
NULL, LONGOPT('?')}, + {"library", required_argument, NULL, 'l'}, + {"min-mapq", required_argument, NULL, 'q'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-qlen", required_argument, NULL, 'm'}, + {"no-header", no_argument, NULL, LONGOPT('H')}, + {"no-PG", no_argument, NULL, LONGOPT('P')}, + {"output", required_argument, NULL, 'o'}, + {"output-unselected", required_argument, NULL, 'U'}, + {"QNAME-file", required_argument, NULL, 'N'}, + {"qname-file", required_argument, NULL, 'N'}, + {"read-group", required_argument, NULL, 'r'}, + {"read-group-file", required_argument, NULL, 'R'}, + {"readgroup", required_argument, NULL, 'r'}, + {"readgroup-file", required_argument, NULL, 'R'}, + {"region-file", required_argument, NULL, LONGOPT('L')}, + {"regions-file", required_argument, NULL, LONGOPT('L')}, + {"remove-B", no_argument, NULL, 'B'}, + {"remove-flags", required_argument, NULL, LONGOPT('r')}, + {"remove-tag", required_argument, NULL, 'x'}, + {"require-flags", required_argument, NULL, 'f'}, + {"subsample", required_argument, NULL, LONGOPT('s')}, + {"subsample-seed", required_argument, NULL, LONGOPT('S')}, + {"tag", required_argument, NULL, 'd'}, + {"tag-file", required_argument, NULL, 'D'}, + {"target-file", required_argument, NULL, 'L'}, + {"targets-file", required_argument, NULL, 'L'}, + {"uncompressed", no_argument, NULL, 'u'}, + {"unoutput", required_argument, NULL, 'U'}, + {"use-index", no_argument, NULL, 'M'}, + {"with-header", no_argument, NULL, 'h'}, { NULL, 0, NULL, 0 } }; @@ -309,16 +387,11 @@ opterr = 0; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:d:D:L:s:@:m:x:U:MX", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:", lopts, NULL)) >= 0) { switch (c) { case 's': - if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { - // Convert likely user input 0,1,2,... 
to pseudo-random - // values with more entropy and more bits set - srand(settings.subsam_seed); - settings.subsam_seed = rand(); - } + settings.subsam_seed = strtol(optarg, &q, 10); if (q && *q == '.') { settings.subsam_frac = strtod(q, &q); if (*q) ret = 1; @@ -331,6 +404,14 @@ goto view_end; } break; + case LONGOPT('s'): + settings.subsam_frac = strtod(optarg, &q); + if (*q || settings.subsam_frac < 0.0 || settings.subsam_frac > 1.0) { + print_error("view", "Incorrect sampling argument \"%s\"", optarg); + goto view_end; + } + break; + case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; @@ -339,16 +420,20 @@ case 't': fn_fai = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; + case LONGOPT('H'): is_header = is_header_only = 0; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; - case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; - case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break; + case 'f': settings.flag_on |= bam_str2flag(optarg); break; + case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; + case LONGOPT('L'): + settings.multi_region = 1; + // fall through case 'L': if ((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", "Could not read file \"%s\"", optarg); @@ -368,8 +453,14 @@ goto view_end; } break; + case 'N': + if (add_read_names_file("view", &settings, optarg) != 0) { + ret = 1; + goto view_end; + } + break; case 'd': - if (strlen(optarg) < 4 || optarg[2] != ':') { + if 
(strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) { print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); ret = 1; goto view_end; @@ -390,7 +481,8 @@ memcpy(settings.tag, optarg, 2); } - if (add_tag_value_single("view", &settings, optarg+3) != 0) { + if (strlen(optarg) > 3 && add_tag_value_single("view", &settings, optarg+3) != 0) { + print_error("view", "Could not add tag:value \"%s\"", optarg); ret = 1; goto view_end; } @@ -398,7 +490,7 @@ case 'D': // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX // path translation as described at: - // http://www.mingw.org/wiki/Posix_path_conversion + // http://www.mingw.org/wiki/Posix_path_conversion if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); ret = 1; @@ -429,6 +521,8 @@ //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ + case LONGOPT('?'): + return usage(stdout, EXIT_SUCCESS, 1); case '?': if (optopt == '?') { // '-?' 
appeared on command line return usage(stdout, EXIT_SUCCESS, 1); @@ -450,7 +544,7 @@ case 'x': { if (strlen(optarg) != 2) { - fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); + print_error("main_samview", "Error parsing -x auxiliary tags should be exactly two characters long."); return usage(stderr, EXIT_FAILURE, 0); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); @@ -458,7 +552,15 @@ } break; case 'M': settings.multi_region = 1; break; - case 1: no_pg = 1; break; + case LONGOPT('P'): no_pg = 1; break; + case 'e': + if (!(settings.filter = hts_filter_init(optarg))) { + print_error("main_samview", "Couldn't initialise filter"); + return 1; + } + break; + case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; + case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, 0); @@ -474,8 +576,7 @@ // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; - out_mode[2] = out_un_mode[2] = '\0'; - // out_(un_)mode now 1 or 2 bytes long, followed by nul. + // out_(un_)mode now 1, 2 or 3 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; @@ -486,6 +587,12 @@ print_error("view", "No input provided or missing option argument."); return usage(stderr, EXIT_FAILURE, 0); // potential memory leak... } + if (settings.subsam_seed != 0) { + // Convert likely user input 1,2,... to pseudo-random + // values with more entropy and more bits set + srand(settings.subsam_seed); + settings.subsam_seed = rand(); + } fn_in = (optind < argc)? 
argv[optind] : "-"; if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { @@ -651,7 +758,10 @@ // fetch alignments while ((result = sam_itr_multi_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -682,7 +792,10 @@ errno = 0; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -720,7 +833,10 @@ // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { - if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } + if (!is_count) { + change_flag(b, &settings); + if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; + } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } @@ -772,13 +888,19 @@ khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); - kh_destroy(rg, settings.rghash); + kh_destroy(str, settings.rghash); + } + if (settings.rnhash) { + khint_t k; + for (k = 0; k < kh_end(settings.rnhash); ++k) + if (kh_exist(settings.rnhash, k)) free((char*)kh_key(settings.rnhash, k)); + kh_destroy(str, settings.rnhash); } if (settings.tvhash) { khint_t k; for (k = 0; k < kh_end(settings.tvhash); ++k) if (kh_exist(settings.tvhash, k)) free((char*)kh_key(settings.tvhash, k)); 
- kh_destroy(tv, settings.tvhash); + kh_destroy(str, settings.tvhash); } if (settings.remove_aux_len) { free(settings.remove_aux); @@ -786,6 +908,8 @@ if (settings.tag) { free(settings.tag); } + if (settings.filter) + hts_filter_free(settings.filter); if (p.pool) hts_tpool_destroy(p.pool); @@ -805,47 +929,52 @@ "\n" "Usage: samtools view [options] || [region ...]\n" "\n" -"Options:\n" -// output options -" -b output BAM\n" -" -C output CRAM (requires -T)\n" -" -1 use fast BAM compression (implies -b)\n" -" -u uncompressed BAM output (implies -b)\n" -" -h include header in SAM output\n" -" -H print SAM header only (no alignments)\n" -" -c print only the count of matching records\n" -" -o FILE output file name [stdout]\n" -" -U FILE output reads not selected by filters to FILE [null]\n" -// extra input -" -t FILE FILE listing reference names and lengths (see long help) [null]\n" -" -X include customized index file\n" -// read filters -" -L FILE only include reads overlapping this BED FILE [null]\n" -" -r STR only include reads in read group STR [null]\n" -" -R FILE only include reads with read group listed in FILE [null]\n" -" -d STR:STR\n" -" only include reads with tag STR and associated value STR [null]\n" -" -D STR:FILE\n" -" only include reads with tag STR and associated values listed in\n" -" FILE [null]\n" -" -q INT only include reads with mapping quality >= INT [0]\n" -" -l STR only include reads in library STR [null]\n" -" -m INT only include reads with number of CIGAR operations consuming\n" -" query sequence >= INT [0]\n" -" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x -" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0 -" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x) -" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n" -" fraction of templates/read pairs to keep; INT part sets seed)\n" -" -M use the multi-region iterator 
(increases the speed, removes\n" -" duplicates and outputs the reads as they are ordered in the file)\n" -// read processing -" -x STR read tag to strip (repeatable) [null]\n" -" -B collapse the backward CIGAR operation\n" -// general options -" -? print long help, including note about region specification\n" -" -S ignored (input format is auto-detected)\n" -" --no-PG do not add a PG line\n"); +"Output options:\n" +" -b, --bam Output BAM\n" +" -C, --cram Output CRAM (requires -T)\n" +" -1, --fast Use fast BAM compression (implies --bam)\n" +" -u, --uncompressed Uncompressed BAM output (implies --bam)\n" +" -h, --with-header Include header in SAM output\n" +" -H, --header-only Print SAM header only (no alignments)\n" +" --no-header Print SAM alignment records only [default]\n" +" -c, --count Print only the count of matching records\n" +" -o, --output FILE Write output to FILE [standard output]\n" +" -U, --unoutput FILE, --output-unselected FILE\n" +" Output reads not selected by filters to FILE\n" +"Input options:\n" +" -t, --fai-reference FILE FILE listing reference names and lengths\n" +" -M, --use-index Use index and multi-region iterator for regions\n" +" --region[s]-file FILE Use index to include only reads overlapping FILE\n" +" -X, --customized-index Expect extra index file argument after \n" +"\n" +"Filtering options (Only include in output reads that...):\n" +" -L, --target[s]-file FILE ...overlap (BED) regions in FILE\n" +" -r, --read-group STR ...are in read group STR\n" +" -R, --read-group-file FILE ...are in a read group listed in FILE\n" +" -N, --qname-file FILE ...whose read name is listed in FILE\n" +" -d, --tag STR1[:STR2] ...have a tag STR1 (with associated value STR2)\n" +" -D, --tag-file STR:FILE ...have a tag STR whose value is listed in FILE\n" +" -q, --min-MQ INT ...have mapping quality >= INT\n" +" -l, --library STR ...are in library STR\n" +" -m, --min-qlen INT ...cover >= INT query bases (as measured via CIGAR)\n" +" -e, --expr STR ...match 
the filter expression STR\n" +" -f, --require-flags FLAG ...have all of the FLAGs present\n" // F&x == x +" -F, --excl[ude]-flags FLAG ...have none of the FLAGs present\n" // F&x == 0 +" -G FLAG EXCLUDE reads with all of the FLAGs present\n" // !(F&x == x) TODO long option +" --subsample FLOAT Keep only FLOAT fraction of templates/read pairs\n" +" --subsample-seed INT Influence WHICH reads are kept in subsampling [0]\n" +" -s INT.FRAC Same as --subsample 0.FRAC --subsample-seed INT\n" +"\n" +"Processing options:\n" +" --add-flags FLAG Add FLAGs to reads\n" +" --remove-flags FLAG Remove FLAGs from reads\n" +" -x, --remove-tag STR Strip tag STR from reads (option may be repeated)\n" +" -B, --remove-B Collapse the backward CIGAR operation\n" +"\n" +"General options:\n" +" -?, --help Print long help, including note about region specification\n" +" -S Ignored (input format is auto-detected)\n" +" --no-PG Do not add a PG line\n"); sam_global_opt_help(fp, "-.O.T@.."); fprintf(fp, "\n"); @@ -885,6 +1014,15 @@ "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" +"\n" +"7. Option `-M`/`--use-index` causes overlaps with `-L` BED file regions and\n" +" command-line region arguments to be computed using the multi-region iterator\n" +" and an index. This increases speed, omits duplicates, and outputs the reads\n" +" as they are ordered in the input SAM/BAM/CRAM file.\n" +"\n" +"8. Options `-L`/`--target[s]-file` and `--region[s]-file` may not be used\n" +" together. `--region[s]-file FILE` is simply equivalent to `-M -L FILE`,\n" +" so using both causes one of the specified BED files to be ignored.\n" "\n"); return exit_status; diff -Nru samtools-1.11/stats.c samtools-1.13/stats.c --- samtools-1.11/stats.c 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/stats.c 2021-07-07 17:18:52.000000000 +0000 @@ -1,6 +1,6 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. 
- Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek Author: Sam Nicholls @@ -210,7 +210,7 @@ uint64_t nbases_mapped_cigar; uint64_t nbases_trimmed; // bwa trimmed bases uint64_t nmismatches; - uint64_t nreads_QCfailed, nreads_secondary; + uint64_t nreads_QCfailed, nreads_secondary, nreads_supplementary; struct { uint32_t names, reads, quals; } checksum; @@ -1138,6 +1138,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pairs) { + if ( !is_in_regions(bam_line,stats) ) + return; if ( stats->rg_hash ) { const uint8_t *rg = bam_aux_get(bam_line, "RG"); @@ -1154,8 +1156,6 @@ stats->nreads_filtered++; return; } - if ( !is_in_regions(bam_line,stats) ) - return; if ( stats->info->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->info->filter_readlen ) return; @@ -1168,6 +1168,11 @@ return; } + if ( bam_line->core.flag & BAM_FSUPPLEMENTARY ) + { + stats->nreads_supplementary++; + } + // If line has no sequence cannot continue int seq_len = bam_line->core.l_qseq; if ( !seq_len ) return; @@ -1196,8 +1201,7 @@ // These stats should only be calculated for the original reads ignoring supplementary artificial reads // otherwise we'll accidentally double count - if ( IS_ORIGINAL(bam_line) ) - { + if ( IS_ORIGINAL(bam_line) ) { stats->read_lengths[read_len]++; if ( order == READ_ORDER_FIRST ) stats->read_lengths_1st[read_len]++; if ( order == READ_ORDER_LAST ) stats->read_lengths_2nd[read_len]++; @@ -1209,7 +1213,7 @@ count_indels(stats, bam_line); - if ( IS_PAIRED_AND_MAPPED(bam_line) ) + if ( IS_PAIRED_AND_MAPPED(bam_line) && IS_ORIGINAL(bam_line) ) { // The insert size is tricky, because for long inserts the libraries are // prepared differently and the pairs point in other direction. 
BWA does @@ -1504,7 +1508,7 @@ fprintf(to, "# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)\n"); fprintf(to, "CHK\t%08x\t%08x\t%08x\n", stats->checksum.names,stats->checksum.reads,stats->checksum.quals); fprintf(to, "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); - fprintf(to, "SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) + fprintf(to, "SN\traw total sequences:\t%ld\t# excluding supplementary and secondary reads\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); // not counting excluded seqs (and none of the below) fprintf(to, "SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); fprintf(to, "SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd+stats->nreads_other)); fprintf(to, "SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); @@ -1519,6 +1523,7 @@ fprintf(to, "SN\treads MQ0:\t%ld\t# mapped and MQ=0\n", (long)stats->nreads_mq0); fprintf(to, "SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); fprintf(to, "SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); + fprintf(to, "SN\tsupplementary alignments:\t%ld\n", (long)stats->nreads_supplementary); fprintf(to, "SN\ttotal length:\t%ld\t# ignores clipping\n", (long)stats->total_len); fprintf(to, "SN\ttotal first fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_1st); fprintf(to, "SN\ttotal last fragment length:\t%ld\t# ignores clipping\n", (long)stats->total_len_2nd); diff -Nru samtools-1.11/test/addrprg/5_editrg.sam.expected samtools-1.13/test/addrprg/5_editrg.sam.expected --- samtools-1.11/test/addrprg/5_editrg.sam.expected 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/test/addrprg/5_editrg.sam.expected 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,29 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:ref1 LN:45 +@SQ 
SN:ref2 LN:40 +@RG ID:1#8 CN:Sanger DS:Testing the editing code. +r001 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:1#8 +r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:1#8 +r002 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:1#8 +r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:1#8 +r003 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:1#8 +r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:1#8 +r004 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:1#8 +r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:1#8 +r003 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:1#8 +r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:1#8 +r001 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:1#8 +r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:1#8 +x1 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:1#8 +x7 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:1#8 +x2 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:1#8 +x8 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:1#8 +x3 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:1#8 +x9 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:1#8 +x4 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:1#8 +x10 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:1#8 +x5 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:1#8 +x11 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:1#8 +x6 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:1#8 +x12 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:1#8 +u1 4 * 0 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? 
RG:Z:1#8 diff -Nru samtools-1.11/test/ampliconclip/3_multi_ref_both_clip.expected.sam samtools-1.13/test/ampliconclip/3_multi_ref_both_clip.expected.sam --- samtools-1.11/test/ampliconclip/3_multi_ref_both_clip.expected.sam 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/test/ampliconclip/3_multi_ref_both_clip.expected.sam 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,22 @@ +@HD VN:1.5 SO:unknown +@SQ SN:vir1 LN:800 +@SQ SN:vir2 LN:900 +@RG ID:12345_1#2 +read_1 163 vir1 55 60 25S185M = 201 381 ACCCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTT CC AS:i:208 XS:i:0 MD:Z:1A208 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_2 99 vir1 55 60 24S186M = 201 375 ACCCCCCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT 8888ACFGGGGFFFGGGGGGGFGGGGGGGGGFGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGAFGGGGGGGGGGGGGGGGGCFFGGGFDGFFGGGGGGGGGGGGGGGGGGFCFGGGGGGGGAFFGEGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGGEGEGCFGGGGGGDGGGFGGGG AS:i:205 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCFGF CCCCCGGG MD:Z:205 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_3 161 vir1 132 60 210M = 201 279 ATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACG 
CCCCCGFGGGGGGFFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGFGGGFGGGDGGGG@FFEGGFGGEGGGGGGGGGGGGGGGGGFFGDFDGGGDGEG9F@FCFGFGGGGGGCFGGGGGCFAFBAFEGGGGFFFGFGFGGGGG::7GGECFCFGFCCDACGFCD:FC>EEGGGF:B39FBCF>GFF9GDC,GGFDGFGDFDGGGFFF7@7GFFGFFF?GDGFE9GFGGFFEGFCGGFDEDGFDGGGGGGFGFF9FGFGFGCF7@+7GGGGGGECEGFGGE8FEFFE9F9GGGCEFCGGFFGGGGFCGDGFEFDFEDF9GFDEGGGGCGGFCGDE@CCCC AS:i:205 XS:i:0 MD:Z:61T148 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:5S205M +read_3 81 vir1 201 60 185M25S = 132 -279 TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATG GGGGGGGGGGGGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGFCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCC AS:i:210 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCGGG CCCCCGGG MD:Z:210 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_4 97 vir1 407 60 86S124M = 517 320 CGAAATTACAGGTTCTCTCCTTGCTCTTCCTTTTCTTTTTCTCCTCCTTTTCTTCTTTCTTCTCCTCTTCCCTTCCCCCTCTTCCCGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGG 86---;=;=6;;;;CCFEFGFE6;;= AS:i:208 XS:i:0 MD:Z:1A208 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_2 99 vir1 55 60 24S186M = 201 375 ACCCCCCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT 8888ACFGGGGFFFGGGGGGGFGGGGGGGGGFGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGAFGGGGGGGGGGGGGGGGGCFFGGGFDGFFGGGGGGGGGGGGGGGGGGFCFGGGGGGGGAFFGEGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGGEGEGCFGGGGGGDGGGFGGGG AS:i:205 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCFGF CCCCCGGG MD:Z:205 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_3 161 vir1 132 60 210M = 201 
279 ATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACG CCCCCGFGGGGGGFFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGFGGGFGGGDGGGG@FFEGGFGGEGGGGGGGGGGGGGGGGGFFGDFDGGGDGEG9F@FCFGFGGGGGGCFGGGGGCFAFBAFEGGGGFFFGFGFGGGGG::7GGECFCFGFCCDACGFCD:FC>EEGGGF:B39FBCF>GFF9GDC,GGFDGFGDFDGGGFFF7@7GFFGFFF?GDGFE9GFGGFFEGFCGGFDEDGFDGGGGGGFGFF9FGFGFGCF7@+7GGGGGGECEGFGGE8FEFFE9F9GGGCEFCGGFFGGGGFCGDGFEFDFEDF9GFDEGGGGCGGFCGDE@CCCC AS:i:205 XS:i:0 MD:Z:61T148 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:5S205M +read_3 81 vir1 201 60 185M25S = 132 -279 TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATG GGGGGGGGGGGGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGFCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCC AS:i:210 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCGGG CCCCCGGG MD:Z:210 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_4 97 vir1 407 60 86S124M = 517 320 CGAAATTACAGGTTCTCTCCTTGCTCTTCCTTTTCTTTTTCTCCTCCTTTTCTTCTTTCTTCTCCTCTTCCCTTCCCCCTCTTCCCGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGG 86---;=;=6;;;;CCFEFGFE6;;= AS:i:208 XS:i:0 MD:Z:1A208 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_2 99 vir1 36 60 5S205M = 201 375 ACCCCCCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT 
8888ACFGGGGFFFGGGGGGGFGGGGGGGGGFGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGAFGGGGGGGGGGGGGGGGGCFFGGGFDGFFGGGGGGGGGGGGGGGGGGFCFGGGGGGGGAFFGEGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGGEGEGCFGGGGGGDGGGFGGGG AS:i:205 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCFGF CCCCCGGG MD:Z:205 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_3 161 vir1 132 60 210M = 201 279 ATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACG CCCCCGFGGGGGGFFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGFGGGFGGGDGGGG@FFEGGFGGEGGGGGGGGGGGGGGGGGFFGDFDGGGDGEG9F@FCFGFGGGGGGCFGGGGGCFAFBAFEGGGGFFFGFGFGGGGG::7GGECFCFGFCCDACGFCD:FC>EEGGGF:B39FBCF>GFF9GDC,GGFDGFGDFDGGGFFF7@7GFFGFFF?GDGFE9GFGGFFEGFCGGFDEDGFDGGGGGGFGFF9FGFGFGCF7@+7GGGGGGECEGFGGE8FEFFE9F9GGGCEFCGGFFGGGGFCGDGFEFDFEDF9GFDEGGGGCGGFCGDE@CCCC AS:i:205 XS:i:0 MD:Z:61T148 NM:i:1 RG:Z:12345_1#2 MQ:i:60 MC:Z:5S205M +read_3 81 vir1 201 60 210M = 132 -279 TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATG GGGGGGGGGGGGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGFCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCC AS:i:210 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCGGG CCCCCGGG MD:Z:210 NM:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:210M +read_4 97 vir1 407 60 86S124M = 517 320 CGAAATTACAGGTTCTCTCCTTGCTCTTCCTTTTCTTTTTCTCCTCCTTTTCTTCTTTCTTCTCCTCTTCCCTTCCCCCTCTTCCCGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGG 86---;=;=6;;;;CCFEFGFE6;;= AS:i:208 XS:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:185M25S +read_2 99 vir1 55 60 24S186M = 201 331 
ACCCCCCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT 8888ACFGGGGFFFGGGGGGGFGGGGGGGGGFGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGAFGGGGGGGGGGGGGGGGGCFFGGGFDGFFGGGGGGGGGGGGGGGGGGFCFGGGGGGGGAFFGEGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGFGGGEGEGCFGGGGGGDGGGFGGGG AS:i:205 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCFGF CCCCCGGG RG:Z:12345_1#2 MQ:i:60 MC:Z:185M25S +read_3 161 vir1 132 60 189M21S = 201 254 ATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACG CCCCCGFGGGGGGFFGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGFGGGFGGGDGGGG@FFEGGFGGEGGGGGGGGGGGGGGGGGFFGDFDGGGDGEG9F@FCFGFGGGGGGCFGGGGGCFAFBAFEGGGGFFFGFGFGGGGG::7GGECFCFGFCCDACGFCD:FC>EEGGGF:B39FBCF>GFF9GDC,GGFDGFGDFDGGGFFF7@7GFFGFFF?GDGFE9GFGGFFEGFCGGFDEDGFDGGGGGGFGFF9FGFGFGCF7@+7GGGGGGECEGFGGE8FEFFE9F9GGGCEFCGGFFGGGGFCGDGFEFDFEDF9GFDEGGGGCGGFCGDE@CCCC AS:i:205 XS:i:0 RG:Z:12345_1#2 MQ:i:60 MC:Z:24S186M +read_3 81 vir1 201 60 185M25S = 132 -254 TTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATG GGGGGGGGGGGGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGFCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCC AS:i:210 XS:i:0 BC:Z:TGGGCCTA-ACTCCTAC QT:Z:CCCCCGGG CCCCCGGG RG:Z:12345_1#2 MQ:i:60 MC:Z:189M21S +read_4 97 vir1 411 60 90S120M = 517 294 
CGAAATTACAGGTTCTCTCCTTGCTCTTCCTTTTCTTTTTCTCCTCCTTTTCTTCTTTCTTCTCCTCTTCCCTTCCCCCTCTTCCCGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGG 86---;=;=6;;;;CCFEFGFE6;;=- format and comma-separated for alt-primers. +# +# AMPLICON REF NUMBER LEFT RIGHT +AMPLICON vir1 1 31-54 386-410 +AMPLICON vir1 2 321-342 705-726 +AMPLICON vir2 3 11-35,21-40 301-330 +AMPLICON vir2 4 261-275 441-460 +AMPLICON vir2 5 401-420 851-880,856-882 +# Summary stats. +# Use 'grep ^FSS | cut -f 2-' to extract this part. +FSS mixed_clipped vir1 raw total sequences: 8 +FSS mixed_clipped vir1 filtered sequences: 0 +FSS mixed_clipped vir1 failed primer match: 2 +FSS mixed_clipped vir1 matching sequences: 6 +FSS mixed_clipped vir1 consensus depth count < 1 and >= 1: 132 517 +FSS mixed_clipped vir2 raw total sequences: 10 +FSS mixed_clipped vir2 filtered sequences: 0 +FSS mixed_clipped vir2 failed primer match: 0 +FSS mixed_clipped vir2 matching sequences: 10 +FSS mixed_clipped vir2 consensus depth count < 1 and >= 1: 140 669 +# Absolute matching read counts per amplicon. +# Use 'grep ^FREADS | cut -f 2-' to extract this part. +FREADS mixed_clipped 5 1 6 0 4 +FVDEPTH mixed_clipped 2 0 2 0 1 +# Read percentage of distribution between amplicons. +# Use 'grep ^FRPERC | cut -f 2-' to extract this part. +FRPERC mixed_clipped 31.250 6.250 37.500 0.000 25.000 +# Read depth per amplicon. +# Use 'grep ^FDEPTH | cut -f 2-' to extract this part. +FDEPTH mixed_clipped 2.8 0.5 3.9 0.0 1.7 +# Percentage coverage per amplicon +# Use 'grep ^FPCOV | cut -f 2-' to extract this part. +FPCOV-1 mixed_clipped 100.00 51.80 100.00 0.00 100.00 +# Depth per reference base for ALL data. +# Use 'grep ^FDP_ALL | cut -f 2-' to extract this part. +FDP_ALL mixed_clipped vir1 0,54 2,77 3,254 0,25 1,294 0,96 +FDP_ALL mixed_clipped vir2 0,35 2,5 3,69 2,100 3,91 0,120 2,141 1,161 2,128 0,50 +# Depth per reference base for full-length valid amplicon data. 
+# Use 'grep ^FDP_VALID | cut -f 2-' to extract this part. +FDP_VALID mixed_clipped vir1 0,54 2,331 0,415 +FDP_VALID mixed_clipped vir2 0,35 1,5 2,260 0,120 1,430 0,50 +# Distribution of aligned template coordinates. +# Use 'grep ^FTCOORD | cut -f 2-' to extract this part. +FTCOORD mixed_clipped 1 55,385,2,0 +FTCOORD mixed_clipped 2 +FTCOORD mixed_clipped 3 36,300,2,0 41,300,1,0 +FTCOORD mixed_clipped 4 +FTCOORD mixed_clipped 5 421,850,2,0 +# Classification of amplicon status. Columns are +# number with both primers from this amplicon, number with +# primers from different amplicon, and number with a position +# not matching any valid amplicon primer site +# Use 'grep ^FAMP | cut -f 2-' to extract this part. +FAMP mixed_clipped 0 8 0 2 +FAMP mixed_clipped 1 2 0 1 +FAMP mixed_clipped 2 0 0 1 +FAMP mixed_clipped 3 4 0 0 +FAMP mixed_clipped 4 0 0 0 +FAMP mixed_clipped 5 2 0 0 +# Summary stats. +# Use 'grep ^CSS | cut -f 2-' to extract this part. +CSS COMBINED vir1 raw total sequences: 8 +CSS COMBINED vir1 filtered sequences: 0 +CSS COMBINED vir1 failed primer match: 2 +CSS COMBINED vir1 matching sequences: 6 +CSS COMBINED vir1 consensus depth count < 1 and >= 1: 649 0 +CSS COMBINED vir2 raw total sequences: 10 +CSS COMBINED vir2 filtered sequences: 0 +CSS COMBINED vir2 failed primer match: 0 +CSS COMBINED vir2 matching sequences: 10 +CSS COMBINED vir2 consensus depth count < 1 and >= 1: 809 0 +# Absolute matching read counts per amplicon. +# Use 'grep ^CREADS | cut -f 2-' to extract this part. +CREADS COMBINED 5 1 6 0 4 +CVDEPTH COMBINED 2 0 2 0 1 +CREADS MEAN 5.0 1.0 6.0 0.0 4.0 +CREADS STDDEV 0.0 0.0 0.0 0.0 0.0 +# Read percentage of distribution between amplicons. +# Use 'grep ^CRPERC | cut -f 2-' to extract this part. +CRPERC COMBINED 31.250 6.250 37.500 0.000 25.000 +CRPERC MEAN 31.250 6.250 37.500 0.000 25.000 +CRPERC STDDEV 0.000 0.000 0.000 0.000 0.000 +# Read depth per amplicon. +# Use 'grep ^CDEPTH | cut -f 2-' to extract this part. 
+CDEPTH COMBINED 2.8 0.5 3.9 0.0 1.7 +CDEPTH MEAN 2.8 0.5 3.9 0.0 1.7 +CDEPTH STDDEV 0.0 0.0 0.0 0.0 0.0 +CPCOV-1 MEAN 100.0 51.8 100.0 0.0 100.0 +CPCOV-1 STDDEV 0.0 0.0 0.0 0.0 0.0 +# Depth per reference base for ALL data. +# Use 'grep ^CDP_ALL | cut -f 2-' to extract this part. +CDP_ALL COMBINED vir1 0,54 2,77 3,254 0,25 1,294 0,96 +CDP_ALL COMBINED vir2 0,35 2,5 3,69 2,100 3,91 0,120 2,141 1,161 2,128 0,50 +# Depth per reference base for full-length valid amplicon data. +# Use 'grep ^CDP_VALID | cut -f 2-' to extract this part. +CDP_VALID COMBINED vir1 0,54 2,331 0,415 +CDP_VALID COMBINED vir2 0,35 1,5 2,260 0,120 1,430 0,50 +# Distribution of aligned template coordinates. +# Use 'grep ^CTCOORD | cut -f 2-' to extract this part. +CTCOORD COMBINED 1 55,385,2,0 +CTCOORD COMBINED 2 +CTCOORD COMBINED 3 36,300,2,0 41,300,1,0 +CTCOORD COMBINED 4 +CTCOORD COMBINED 5 421,850,2,0 +# Classification of amplicon status. Columns are +# number with both primers from this amplicon, number with +# primers from different amplicon, and number with a position +# not matching any valid amplicon primer site +# Use 'grep ^CAMP | cut -f 2-' to extract this part. 
+CAMP COMBINED 0 8 0 2 +CAMP COMBINED 1 2 0 1 +CAMP COMBINED 2 0 0 1 +CAMP COMBINED 3 4 0 0 +CAMP COMBINED 4 0 0 0 +CAMP COMBINED 5 2 0 0 diff -Nru samtools-1.11/test/bam2fq/11.fa.expected samtools-1.13/test/bam2fq/11.fa.expected --- samtools-1.11/test/bam2fq/11.fa.expected 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/test/bam2fq/11.fa.expected 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,72 @@ +>ref1_grp1_p001/1 +CGAGCTCGGT +>ref1_grp1_p001/2 +GTCGACTCTA +>ref1_grp1_p002/1 +CTCGGTACCC +>ref1_grp1_p002/2 +GCAGGTCGAC +>ref1_grp1_p003/1 +GTACCCGGGG +>ref1_grp1_p003/2 +GCCTGCAGGT +>ref1_grp1_p004/1 +CCGGGGATCC +>ref1_grp1_p004/2 +GCATGCCTGC +>ref1_grp1_p005/1 +GGATCCTCTA +>ref1_grp1_p005/2 +GCTTGCATGC +>ref1_grp1_p006/1 +CCTCTAGAGT +>ref1_grp1_p006/2 +TCAAGCTTGC +>ref1_grp2_p001/1 +AGCTCGGTAC +>ref1_grp2_p001/2 +AGGTCGACTC +>ref1_grp2_p002/1 +CGGTACCCGG +>ref1_grp2_p002/2 +CTGCAGGTCG +>ref1_grp2_p003/1 +ACCCGGGGAT +>ref1_grp2_p003/2 +ATGCCTGCAG +>ref1_grp2_p004/1 +GGGGATCCTC +>ref1_grp2_p004/2 +TTGCATGCCT +>ref1_grp2_p005/1 +ATCCTCTAGA +>ref1_grp2_p005/2 +AAGCTTGCAT +>ref1_grp2_p006/1 +TCTAGAGTCG +>ref1_grp2_p006/2 +ACTCAAGCTT +>ref2_grp3_p001/1 +GTGACACTATAGAAT +>ref2_grp3_p001/2 +CTGTTTCCTGTGTGA +>ref2_grp3_p002/1 +CTGTTTCCTGTGTGA +>ref2_grp3_p002/2 +CGCCAAGCTATTTAG +>ref2_grp3_p003/1 +ACGTMRWSYKVHDBN +>ref2_grp3_p003/2 +ACGTMRWSYKVHDBN +>ref12_grp1_p001/1 +TGCAGGCATG +>ref12_grp1_p001/2 +CACTATAGAA +>ref12_grp2_p001/1 +CAAGCTTGAG +>ref12_grp2_p001/2 +ATTTAGGTGA +>unaligned_grp3_p001/1 +CACTCGTTCATGACG +>unaligned_grp3_p001/2 +GAAAGTGAGGAGGTG diff -Nru samtools-1.11/test/bam2fq/12.1.fq.expected samtools-1.13/test/bam2fq/12.1.fq.expected --- samtools-1.11/test/bam2fq/12.1.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/12.1.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -6,19 +6,19 @@ CTCGGTACCC + ########## -@ref1_grp1_p003 +@ref1_grp1_p003 1:N:0:0 GTACCCGGGG + %%%%%%%%%% -@ref1_grp1_p004 +@ref1_grp1_p004 1:N:0:0 
CCGGGGATCC + '''''''''' -@ref1_grp1_p005 +@ref1_grp1_p005 1:N:0:0 GGATCCTCTA + )))))))))) -@ref1_grp1_p006 +@ref1_grp1_p006 1:N:0:0 CCTCTAGAGT + ++++++++++ @@ -26,43 +26,43 @@ AGCTCGGTAC + """""""""" -@ref1_grp2_p002 +@ref1_grp2_p002 1:N:0:0 CGGTACCCGG + $$$$$$$$$$ -@ref1_grp2_p003 +@ref1_grp2_p003 1:N:0:0 ACCCGGGGAT + &&&&&&&&&& -@ref1_grp2_p004 +@ref1_grp2_p004 1:N:0:0 GGGGATCCTC + (((((((((( -@ref1_grp2_p005 +@ref1_grp2_p005 1:N:0:0 ATCCTCTAGA + ********** -@ref1_grp2_p006 +@ref1_grp2_p006 1:N:0:0 TCTAGAGTCG + ,,,,,,,,,, -@ref2_grp3_p001 +@ref2_grp3_p001 1:N:0:0 GTGACACTATAGAAT + ~~~~~~~~~~~~~~~ -@ref2_grp3_p002 +@ref2_grp3_p002 1:N:0:0 CTGTTTCCTGTGTGA + {{{{{{{{{{{{{{{ -@ref2_grp3_p003 +@ref2_grp3_p003 1:N:0:0 ACGTMRWSYKVHDBN + 0123456789abcd! -@ref12_grp1_p001 +@ref12_grp1_p001 1:N:0:0 TGCAGGCATG + AAAAAAAAAA -@ref12_grp2_p001 +@ref12_grp2_p001 1:N:0:0 CAAGCTTGAG + AAAAAAAAAA diff -Nru samtools-1.11/test/bam2fq/12.2.fq.expected samtools-1.13/test/bam2fq/12.2.fq.expected --- samtools-1.11/test/bam2fq/12.2.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/12.2.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -6,19 +6,19 @@ GCAGGTCGAC + ////////// -@ref1_grp1_p003 +@ref1_grp1_p003 2:N:0:0 GCCTGCAGGT + 1111111111 -@ref1_grp1_p004 +@ref1_grp1_p004 2:N:0:0 GCATGCCTGC + 3333333333 -@ref1_grp1_p005 +@ref1_grp1_p005 2:N:0:0 GCTTGCATGC + 5555555555 -@ref1_grp1_p006 +@ref1_grp1_p006 2:N:0:0 TCAAGCTTGC + 7777777777 @@ -26,43 +26,43 @@ AGGTCGACTC + .......... 
-@ref1_grp2_p002 +@ref1_grp2_p002 2:N:0:0 CTGCAGGTCG + 0000000000 -@ref1_grp2_p003 +@ref1_grp2_p003 2:N:0:0 ATGCCTGCAG + 2222222222 -@ref1_grp2_p004 +@ref1_grp2_p004 2:N:0:0 TTGCATGCCT + 4444444444 -@ref1_grp2_p005 +@ref1_grp2_p005 2:N:0:0 AAGCTTGCAT + 6666666666 -@ref1_grp2_p006 +@ref1_grp2_p006 2:N:0:0 ACTCAAGCTT + 8888888888 -@ref2_grp3_p001 +@ref2_grp3_p001 2:N:0:0 CTGTTTCCTGTGTGA + ||||||||||||||| -@ref2_grp3_p002 +@ref2_grp3_p002 2:N:0:0 CGCCAAGCTATTTAG + }}}}}}}}}}}}}}} -@ref2_grp3_p003 +@ref2_grp3_p003 2:N:0:0 ACGTMRWSYKVHDBN + 0123456789abcd! -@ref12_grp1_p001 +@ref12_grp1_p001 2:N:0:0 CACTATAGAA + BBBBBBBBBB -@ref12_grp2_p001 +@ref12_grp2_p001 2:N:0:0 ATTTAGGTGA + BBBBBBBBBB diff -Nru samtools-1.11/test/bam2fq/12.i.fq.expected samtools-1.13/test/bam2fq/12.i.fq.expected --- samtools-1.11/test/bam2fq/12.i.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/12.i.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,12 +1,12 @@ -@ref1_grp1_p001 1:N:0:AC +@ref1_grp1_p001 1:N:0:AC+GT AC + "" -@ref1_grp1_p002 1:N:0:AATT +@ref1_grp1_p002 1:N:0:AATT+CCGG AATT + """" -@ref1_grp2_p001 1:N:0:TG +@ref1_grp2_p001 1:N:0:TG+CA TG + ab diff -Nru samtools-1.11/test/bam2fq/12.s.fq.expected samtools-1.13/test/bam2fq/12.s.fq.expected --- samtools-1.11/test/bam2fq/12.s.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/12.s.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,8 +1,8 @@ -@ref1_grp2_p002a +@ref1_grp2_p002a 1:N:0:0 CGGTACCCGG + $$$$$$$$$$ -@unaligned_grp3_p001 +@unaligned_grp3_p001 1:N:0:0 CACTCGTTCATGACG + 0123456789abcde diff -Nru samtools-1.11/test/bam2fq/7.1.fq.expected samtools-1.13/test/bam2fq/7.1.fq.expected --- samtools-1.11/test/bam2fq/7.1.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/7.1.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,68 +1,68 @@ -@ref1_grp1_p001/1 RG:Z:grp1 BC:Z:AC-GT MD:Z:10 +@ref1_grp1_p001/1 MD:Z:10 RG:Z:grp1 BC:Z:AC-GT CGAGCTCGGT + 
!!!!!!!!!! -@ref1_grp1_p002/1 RG:Z:grp1 BC:Z:AATT+CCGG MD:Z:10 +@ref1_grp1_p002/1 MD:Z:10 RG:Z:grp1 BC:Z:AATT+CCGG CTCGGTACCC + ########## -@ref1_grp1_p003/1 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p003/1 MD:Z:10 RG:Z:grp1 GTACCCGGGG + %%%%%%%%%% -@ref1_grp1_p004/1 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p004/1 MD:Z:10 RG:Z:grp1 CCGGGGATCC + '''''''''' -@ref1_grp1_p005/1 RG:Z:grp1 MD:Z:10 ia:i:40000 +@ref1_grp1_p005/1 MD:Z:10 RG:Z:grp1 ia:i:40000 GGATCCTCTA + )))))))))) -@ref1_grp1_p006/1 RG:Z:grp1 MD:Z:10 ia:i:255 +@ref1_grp1_p006/1 MD:Z:10 RG:Z:grp1 ia:i:255 CCTCTAGAGT + ++++++++++ -@ref1_grp2_p001/1 RG:Z:grp2 BC:Z:TG+CA QT:Z:ab+cd MD:Z:8 +@ref1_grp2_p001/1 MD:Z:8 RG:Z:grp2 BC:Z:TG+CA QT:Z:ab+cd AGCTCGGTAC + """""""""" -@ref1_grp2_p002/1 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p002/1 MD:Z:10 RG:Z:grp2 CGGTACCCGG + $$$$$$$$$$ -@ref1_grp2_p003/1 RG:Z:grp2 MD:Z:10 ia:i:4294967295 +@ref1_grp2_p003/1 MD:Z:10 RG:Z:grp2 ia:i:4294967295 ACCCGGGGAT + &&&&&&&&&& -@ref1_grp2_p004/1 RG:Z:grp2 MD:Z:10 ia:i:-2147483648 +@ref1_grp2_p004/1 MD:Z:10 RG:Z:grp2 ia:i:-2147483648 GGGGATCCTC + (((((((((( -@ref1_grp2_p005/1 RG:Z:grp2 MD:Z:10 ia:i:-1000 +@ref1_grp2_p005/1 MD:Z:10 RG:Z:grp2 ia:i:-1000 ATCCTCTAGA + ********** -@ref1_grp2_p006/1 RG:Z:grp2 MD:Z:10 ia:i:-1 +@ref1_grp2_p006/1 MD:Z:10 RG:Z:grp2 ia:i:-1 TCTAGAGTCG + ,,,,,,,,,, -@ref2_grp3_p001/1 RG:Z:grp3 MD:Z:15 +@ref2_grp3_p001/1 MD:Z:15 RG:Z:grp3 GTGACACTATAGAAT + ~~~~~~~~~~~~~~~ -@ref2_grp3_p002/1 RG:Z:grp3 MD:Z:15 +@ref2_grp3_p002/1 MD:Z:15 RG:Z:grp3 CTGTTTCCTGTGTGA + {{{{{{{{{{{{{{{ -@ref2_grp3_p003/1 RG:Z:grp3 MD:Z:1T0T0C0T0A0T0A0G0T0G0T0C0A0C0 +@ref2_grp3_p003/1 MD:Z:1T0T0C0T0A0T0A0G0T0G0T0C0A0C0 RG:Z:grp3 ACGTMRWSYKVHDBN + 0123456789abcd! 
-@ref12_grp1_p001/1 RG:Z:grp1 MD:Z:10 +@ref12_grp1_p001/1 MD:Z:10 RG:Z:grp1 TGCAGGCATG + AAAAAAAAAA -@ref12_grp2_p001/1 RG:Z:grp2 MD:Z:10 +@ref12_grp2_p001/1 MD:Z:10 RG:Z:grp2 CAAGCTTGAG + AAAAAAAAAA diff -Nru samtools-1.11/test/bam2fq/7.2.fq.expected samtools-1.13/test/bam2fq/7.2.fq.expected --- samtools-1.11/test/bam2fq/7.2.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/7.2.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,68 +1,68 @@ -@ref1_grp1_p001/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p001/2 MD:Z:10 RG:Z:grp1 GTCGACTCTA + ---------- -@ref1_grp1_p002/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p002/2 MD:Z:10 RG:Z:grp1 GCAGGTCGAC + ////////// -@ref1_grp1_p003/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p003/2 MD:Z:10 RG:Z:grp1 GCCTGCAGGT + 1111111111 -@ref1_grp1_p004/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p004/2 MD:Z:10 RG:Z:grp1 GCATGCCTGC + 3333333333 -@ref1_grp1_p005/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p005/2 MD:Z:10 RG:Z:grp1 GCTTGCATGC + 5555555555 -@ref1_grp1_p006/2 RG:Z:grp1 MD:Z:10 +@ref1_grp1_p006/2 MD:Z:10 RG:Z:grp1 TCAAGCTTGC + 7777777777 -@ref1_grp2_p001/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p001/2 MD:Z:10 RG:Z:grp2 AGGTCGACTC + .......... 
-@ref1_grp2_p002/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p002/2 MD:Z:10 RG:Z:grp2 CTGCAGGTCG + 0000000000 -@ref1_grp2_p003/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p003/2 MD:Z:10 RG:Z:grp2 ATGCCTGCAG + 2222222222 -@ref1_grp2_p004/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p004/2 MD:Z:10 RG:Z:grp2 TTGCATGCCT + 4444444444 -@ref1_grp2_p005/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p005/2 MD:Z:10 RG:Z:grp2 AAGCTTGCAT + 6666666666 -@ref1_grp2_p006/2 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p006/2 MD:Z:10 RG:Z:grp2 ACTCAAGCTT + 8888888888 -@ref2_grp3_p001/2 RG:Z:grp3 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 +@ref2_grp3_p001/2 MD:Z:0T0A0A1C0A0T0G0G0T0C0A1A0G0 RG:Z:grp3 CTGTTTCCTGTGTGA + ||||||||||||||| -@ref2_grp3_p002/2 RG:Z:grp3 MD:Z:15 +@ref2_grp3_p002/2 MD:Z:15 RG:Z:grp3 CGCCAAGCTATTTAG + }}}}}}}}}}}}}}} -@ref2_grp3_p003/2 RG:Z:grp3 MD:Z:0A0T0T0C0T0A0T0A0G0T0G0T1A0C0 +@ref2_grp3_p003/2 MD:Z:0A0T0T0C0T0A0T0A0G0T0G0T1A0C0 RG:Z:grp3 ACGTMRWSYKVHDBN + 0123456789abcd! -@ref12_grp1_p001/2 RG:Z:grp1 MD:Z:10 +@ref12_grp1_p001/2 MD:Z:10 RG:Z:grp1 CACTATAGAA + BBBBBBBBBB -@ref12_grp2_p001/2 RG:Z:grp2 MD:Z:10 +@ref12_grp2_p001/2 MD:Z:10 RG:Z:grp2 ATTTAGGTGA + BBBBBBBBBB diff -Nru samtools-1.11/test/bam2fq/7.s.fq.expected samtools-1.13/test/bam2fq/7.s.fq.expected --- samtools-1.11/test/bam2fq/7.s.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/7.s.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,4 +1,4 @@ -@ref1_grp2_p002a/1 RG:Z:grp2 MD:Z:10 +@ref1_grp2_p002a/1 MD:Z:10 RG:Z:grp2 CGGTACCCGG + $$$$$$$$$$ diff -Nru samtools-1.11/test/bam2fq/8.1.fq.expected samtools-1.13/test/bam2fq/8.1.fq.expected --- samtools-1.11/test/bam2fq/8.1.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/8.1.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,68 +1,68 @@ -@ref1_grp1_p001 1:N:0:GT +@ref1_grp1_p001 1:N:0:ACGT CGAGCTCGGT + !!!!!!!!!! 
-@ref1_grp1_p002 1:N:0:TT +@ref1_grp1_p002 1:N:0:AATTCCGG CTCGGTACCC + ########## -@ref1_grp1_p003 +@ref1_grp1_p003 1:N:0:0 GTACCCGGGG + %%%%%%%%%% -@ref1_grp1_p004 +@ref1_grp1_p004 1:N:0:0 CCGGGGATCC + '''''''''' -@ref1_grp1_p005 +@ref1_grp1_p005 1:N:0:0 GGATCCTCTA + )))))))))) -@ref1_grp1_p006 +@ref1_grp1_p006 1:N:0:0 CCTCTAGAGT + ++++++++++ -@ref1_grp2_p001 1:N:0:CA +@ref1_grp2_p001 1:N:0:TGCA AGCTCGGTAC + """""""""" -@ref1_grp2_p002 +@ref1_grp2_p002 1:N:0:0 CGGTACCCGG + $$$$$$$$$$ -@ref1_grp2_p003 +@ref1_grp2_p003 1:N:0:0 ACCCGGGGAT + &&&&&&&&&& -@ref1_grp2_p004 +@ref1_grp2_p004 1:N:0:0 GGGGATCCTC + (((((((((( -@ref1_grp2_p005 +@ref1_grp2_p005 1:N:0:0 ATCCTCTAGA + ********** -@ref1_grp2_p006 +@ref1_grp2_p006 1:N:0:0 TCTAGAGTCG + ,,,,,,,,,, -@ref2_grp3_p001 +@ref2_grp3_p001 1:N:0:0 GTGACACTATAGAAT + ~~~~~~~~~~~~~~~ -@ref2_grp3_p002 +@ref2_grp3_p002 1:N:0:0 CTGTTTCCTGTGTGA + {{{{{{{{{{{{{{{ -@ref2_grp3_p003 +@ref2_grp3_p003 1:N:0:0 ACGTMRWSYKVHDBN + 0123456789abcd! -@ref12_grp1_p001 +@ref12_grp1_p001 1:N:0:0 TGCAGGCATG + AAAAAAAAAA -@ref12_grp2_p001 +@ref12_grp2_p001 1:N:0:0 CAAGCTTGAG + AAAAAAAAAA diff -Nru samtools-1.11/test/bam2fq/8.2.fq.expected samtools-1.13/test/bam2fq/8.2.fq.expected --- samtools-1.11/test/bam2fq/8.2.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/8.2.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,68 +1,68 @@ -@ref1_grp1_p001 2:N:0:GT +@ref1_grp1_p001 2:N:0:ACGT GTCGACTCTA + ---------- -@ref1_grp1_p002 2:N:0:TT +@ref1_grp1_p002 2:N:0:AATTCCGG GCAGGTCGAC + ////////// -@ref1_grp1_p003 +@ref1_grp1_p003 2:N:0:0 GCCTGCAGGT + 1111111111 -@ref1_grp1_p004 +@ref1_grp1_p004 2:N:0:0 GCATGCCTGC + 3333333333 -@ref1_grp1_p005 +@ref1_grp1_p005 2:N:0:0 GCTTGCATGC + 5555555555 -@ref1_grp1_p006 +@ref1_grp1_p006 2:N:0:0 TCAAGCTTGC + 7777777777 -@ref1_grp2_p001 2:N:0:CA +@ref1_grp2_p001 2:N:0:TGCA AGGTCGACTC + .......... 
-@ref1_grp2_p002 +@ref1_grp2_p002 2:N:0:0 CTGCAGGTCG + 0000000000 -@ref1_grp2_p003 +@ref1_grp2_p003 2:N:0:0 ATGCCTGCAG + 2222222222 -@ref1_grp2_p004 +@ref1_grp2_p004 2:N:0:0 TTGCATGCCT + 4444444444 -@ref1_grp2_p005 +@ref1_grp2_p005 2:N:0:0 AAGCTTGCAT + 6666666666 -@ref1_grp2_p006 +@ref1_grp2_p006 2:N:0:0 ACTCAAGCTT + 8888888888 -@ref2_grp3_p001 +@ref2_grp3_p001 2:N:0:0 CTGTTTCCTGTGTGA + ||||||||||||||| -@ref2_grp3_p002 +@ref2_grp3_p002 2:N:0:0 CGCCAAGCTATTTAG + }}}}}}}}}}}}}}} -@ref2_grp3_p003 +@ref2_grp3_p003 2:N:0:0 ACGTMRWSYKVHDBN + 0123456789abcd! -@ref12_grp1_p001 +@ref12_grp1_p001 2:N:0:0 CACTATAGAA + BBBBBBBBBB -@ref12_grp2_p001 +@ref12_grp2_p001 2:N:0:0 ATTTAGGTGA + BBBBBBBBBB diff -Nru samtools-1.11/test/bam2fq/8.i.fq.expected samtools-1.13/test/bam2fq/8.i.fq.expected --- samtools-1.11/test/bam2fq/8.i.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/8.i.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,12 +1,12 @@ -@ref1_grp1_p001 1:N:0:GT +@ref1_grp1_p001 1:N:0:ACGT GT + "" -@ref1_grp1_p002 1:N:0:TT +@ref1_grp1_p002 1:N:0:AATTCCGG TT + "" -@ref1_grp2_p001 1:N:0:CA +@ref1_grp2_p001 1:N:0:TGCA CA + cd diff -Nru samtools-1.11/test/bam2fq/8.s.fq.expected samtools-1.13/test/bam2fq/8.s.fq.expected --- samtools-1.11/test/bam2fq/8.s.fq.expected 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/bam2fq/8.s.fq.expected 2021-07-07 17:18:52.000000000 +0000 @@ -1,8 +1,8 @@ -@ref1_grp2_p002a +@ref1_grp2_p002a 1:N:0:0 CGGTACCCGG + $$$$$$$$$$ -@unaligned_grp3_p001 +@unaligned_grp3_p001 1:N:0:0 CACTCGTTCATGACG + 0123456789abcde diff -Nru samtools-1.11/test/dat/bam2fq.004.sam samtools-1.13/test/dat/bam2fq.004.sam --- samtools-1.11/test/dat/bam2fq.004.sam 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/bam2fq.004.sam 2021-07-07 17:18:52.000000000 +0000 @@ -38,7 +38,7 @@ ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 MD:Z:10 NM:i:0 RG:Z:grp1 ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT 
++++++++++ MD:Z:10 NM:i:0 RG:Z:grp1 ia:i:255 ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 MD:Z:10 NM:i:0 RG:Z:grp1 -ref1_grp2_p001 99 ref1 3 1 8M2S = 27 34 AGCTCGGTAC """""""""" MD:Z:8 NM:i:0 RG:Z:grp2 BC:Z:TGCA QT:Z:abcd H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE +ref1_grp2_p001 99 ref1 3 1 8M2S = 27 34 AGCTCGGTAC """""""""" MD:Z:8 NM:i:0 RG:Z:grp2 BC:Z:TGCA QT:Z:abcd H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE ref1_grp2_p001 2147 ref1 11 1 8H2M = 27 34 AC "" MD:Z:2 NM:i:0 RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... MD:Z:10 NM:i:0 RG:Z:grp2 ref1_grp2_p002a 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ MD:Z:10 NM:i:0 fa:f:6.022e+23 RG:Z:grp2 diff -Nru samtools-1.11/test/dat/bam2fq.010.sam samtools-1.13/test/dat/bam2fq.010.sam --- samtools-1.11/test/dat/bam2fq.010.sam 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/bam2fq.010.sam 2021-07-07 17:18:52.000000000 +0000 @@ -38,7 +38,7 @@ ref1_grp1_p005 147 ref1 41 20 10M = 17 -34 GCATGCAAGC 5555555555 MD:Z:10 NM:i:0 RG:Z:grp1 ref1_grp1_p006 99 ref1 21 10 10M = 45 34 CCTCTAGAGT ++++++++++ MD:Z:10 NM:i:0 RG:Z:grp1 ia:i:255 ref1_grp1_p006 147 ref1 45 22 10M = 21 -34 GCAAGCTTGA 7777777777 MD:Z:10 NM:i:0 RG:Z:grp1 -ref1_grp2_p001 99 ref1 3 1 8M2S = 27 34 AGCTCGGTAC """""""""" MD:Z:8 NM:i:0 RG:Z:grp2 BC:Z:TGCA QT:Z:abcd H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE +ref1_grp2_p001 99 ref1 3 1 8M2S = 27 34 AGCTCGGTAC """""""""" MD:Z:8 NM:i:0 RG:Z:grp2 BC:Z:TGCA QT:Z:abcd H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE ref1_grp2_p001 2147 ref1 11 1 8H2M = 27 34 AC "" MD:Z:2 NM:i:0 RG:Z:grp2 BC:Z:TGCA H0:i:1 aa:A:A ab:A:Z fa:f:6.67e-11 za:Z:!"$%^&*() ha:H:CAFE ref1_grp2_p001 147 ref1 27 13 10M = 3 -34 GAGTCGACCT .......... 
MD:Z:10 NM:i:0 RG:Z:grp2 ref1_grp2_p002a 99 ref1 7 3 10M = 31 34 CGGTACCCGG $$$$$$$$$$ MD:Z:10 NM:i:0 fa:f:6.022e+23 RG:Z:grp2 diff -Nru samtools-1.11/test/dat/mpileup.out.1 samtools-1.13/test/dat/mpileup.out.1 --- samtools-1.11/test/dat/mpileup.out.1 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/mpileup.out.1 2021-07-07 17:18:52.000000000 +0000 @@ -1,36 +1,36 @@ -17 100 C 9 ......... JLJMLLGG4 3 ... DmH 5 ..... HIJ?G -17 101 C 9 ......... HFLJKMH97 3 ... Aq5 5 ..... FEIEG -17 102 C 9 ......... FKKLLLKF5 3 ... GpA 5 ..... HHIJG -17 103 T 8 ........ KLLLMLKG 3 ... Fu; 5 ..... MIMHI -17 104 G 8 ........ ;HLGKLIH 2 .. Dp 5 ..... ICHCI -17 105 G 9 ........^]. HB6KKKG8; 3 ... Cj0 5 ..... GBCAH -17 106 G 10 .......... GMKKIKJE2> 2 .. @i 5 ..... GCF5J -17 107 C 9 ......... IKMKLJKI? 3 ... Gm; 5 ..... JGH?K -17 108 C 9 ......... HGKLLKLG? 3 ... Fq< 5 ..... IFGBK -17 109 T 9 ......... ILLOLJJFB 3 ... FvI 5 ..... JJMLL -17 110 G 9 ......... JJMMKLIHD 3 ... ?t? 5 ..... IDFII -17 111 G 10 .......... G77HJJBA/C 2 .. Cr 4 .... GCHK -17 112 C 9 ......... IKILKKLI? 3 ... 6t; 5 ..... J?IEJ -17 113 A 9 ......... CGIFIGJE? 2 .. Bs 5 ..... JBJJI -17 114 C 9 ......... IDGIJJIHB 3 ... Ar; 5 ..... GEDHI -17 115 C 10 .......... EJJILNGF/A 2 .. Ds 6 .....^]. HFHIH? -17 116 A 9 ......... DHGJJJIHF 2 .. Ev 7 ......, F8JGIGG -17 117 G 9 .$........ >BJIMIFJF 2 .. @t 7 ......, H>IFJDD -17 118 G 9 ......... BFKLJGF/A 1 . s 7 ......, A>CEECE -17 119 G 8 ........ F?HLKJGG 2 .. s1 7 ......, A1ABLCB -17 120 A 8 ........ FCIIKHGJ 2 .. w= 7 ......, E;I@GIE -17 121 G 8 ........ GFMMMG@H 2 .. u9 7 ......, AI 6 ....., EJ;LKK -17 124 T 9 .$.......^]. ?HEIE?/7> 3 ... q6F 6 ....., ?FBCGI -17 125 A 7 ....... DHICDGA 3 ... s6F 6 ....., CIBHFD -17 126 A 8 ........ JEKAG0GB 3 ... p6I 6 ....., GHEMKC -17 127 C 8 ........ JGGJG/DD 3 ... p@E 6 ....., ?GDHGI -17 128 A 8 ........ HJIIE8IE 3 ... rFG 6 .$...., >GGJHF -17 129 A 8 ........ HFKJD8IC 3 ... oCJ 5 ...., IIMKD -17 130 A 8 ......$.. 
IJKCE8JD 3 ... kHJ 5 ...., BEMKI -17 131 C 7 ....... JKKFGGG 3 ... jCF 5 ...., BDKHF -17 132 A 7 ....... JGKJGKF 3 .$.. nIF 5 ...., HGJGG +17 100 C 9 ......... JLJMLLGG4 3 .., DHm 5 ..... HIJ?G +17 101 C 9 ......... HFLJKMH97 3 .., A5q 5 ..... FEIEG +17 102 C 9 ......... FKKLLLKF5 3 .., GAp 5 ..... HHIJG +17 103 T 8 ........ KLLLMLKG 3 .., F;u 5 ..... MIMHI +17 104 G 8 ........ ;HLGKLIH 2 ., Dp 5 ..... ICHCI +17 105 G 9 ........^]. HB6KKKG8; 3 .., C0j 5 ..... GBCAH +17 106 G 10 .......... GMKKIKJE2> 2 ., @i 5 ..... GCF5J +17 107 C 9 ......... IKMKLJKI? 3 .., G;m 5 ..... JGH?K +17 108 C 9 ......... HGKLLKLG? 3 .., FBJIMIFJF 2 ., @t 7 ......, H>IFJDD +17 118 G 9 ......... BFKLJGF/A 1 , s 7 ......, A>CEECE +17 119 G 8 ........ F?HLKJGG 2 ., 1s 7 ......, A1ABLCB +17 120 A 8 ........ FCIIKHGJ 2 ., =w 7 ......, E;I@GIE +17 121 G 8 ........ GFMMMG@H 2 ., 9u 7 ......, AyI 6 ....., EJ;LKK +17 124 T 9 .$.......^]. ?HEIE?/7> 3 .,. 6qF 6 ....., ?FBCGI +17 125 A 7 ....... DHICDGA 3 .,. 6sF 6 ....., CIBHFD +17 126 A 8 ........ JEKAG0GB 3 .,. 6pI 6 ....., GHEMKC +17 127 C 8 ........ JGGJG/DD 3 .,. @pE 6 ....., ?GDHGI +17 128 A 8 ........ HJIIE8IE 3 .,. FrG 6 .$...., >GGJHF +17 129 A 8 ........ HFKJD8IC 3 .,. CoJ 5 ...., IIMKD +17 130 A 8 ......$.. IJKCE8JD 3 .,. HkJ 5 ...., BEMKI +17 131 C 7 ....... JKKFGGG 3 .,. CjF 5 ...., BDKHF +17 132 A 7 ....... JGKJGKF 3 .,. InF 5 ...., HGJGG 17 133 T 7 ....... GKHIEGE 3 .,. ELH 5 ...., EGJIK 17 134 C 7 ....... KLMKKJH 3 .,. IKI 5 ...., EGJKD 17 135 T 7 ....... JLKLHIG 3 .,. HJK 5 .$..., DLLJ? diff -Nru samtools-1.11/test/dat/mpileup.out.2 samtools-1.13/test/dat/mpileup.out.2 --- samtools-1.11/test/dat/mpileup.out.2 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/mpileup.out.2 2021-07-07 17:18:52.000000000 +0000 @@ -19,39 +19,39 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -17 100 . C <*> 0 . 
DP=18;I16=17,0,0,0,688,29762,0,0,958,55682,0,0,332,7446,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,189:9:0 0,9,108:3:0 0,15,134:5:0 -17 101 . C <*> 0 . DP=18;I16=17,0,0,0,650,27530,0,0,958,55682,0,0,331,7303,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,182:9:0 0,9,99:3:0 0,15,132:5:0 -17 102 . C <*> 0 . DP=18;I16=17,0,0,0,695,30453,0,0,958,55682,0,0,330,7178,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,188:9:0 0,9,111:3:0 0,15,139:5:0 -17 103 . T <*> 0 . DP=18;I16=16,0,0,0,692,31998,0,0,929,54841,0,0,323,7035,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,24,189:8:0 0,9,108:3:0 0,15,147:5:0 -17 104 . G <*> 0 . DP=18;I16=15,0,0,0,611,26723,0,0,900,54000,0,0,295,6259,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,24,178:8:0 0,6,89:2:0 0,15,133:5:0 -17 105 . G <*> 0 . DP=19;I16=17,0,0,0,604,23936,0,0,989,58441,0,0,317,6751,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,170:9:0 0,9,97:3:0 0,15,125:5:0 -17 106 . G <*> 0 . DP=19;I16=17,0,0,0,644,26574,0,0,989,58441,0,0,299,6093,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,30,190:10:0 0,6,85:2:0 0,15,124:5:0 -17 107 . C <*> 0 . DP=19;I16=17,0,0,0,694,30064,0,0,989,58441,0,0,313,6543,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,192:9:0 0,9,108:3:0 0,15,136:5:0 -17 108 . C <*> 0 . DP=19;I16=17,0,0,0,692,30148,0,0,989,58441,0,0,310,6420,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,190:9:0 0,9,108:3:0 0,15,135:5:0 -17 109 . T <*> 0 . DP=19;I16=17,0,0,0,741,34273,0,0,989,58441,0,0,307,6319,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,195:9:0 0,9,110:3:0 0,15,150:5:0 -17 110 . G <*> 0 . DP=19;I16=17,0,0,0,704,31276,0,0,989,58441,0,0,304,6240,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,194:9:0 0,9,104:3:0 0,15,136:5:0 -17 111 . G <*> 0 . DP=19;I16=16,0,0,0,584,24362,0,0,929,54841,0,0,272,5416,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,30,167:10:0 0,6,88:2:0 0,12,118:4:0 -17 112 . C <*> 0 . DP=19;I16=17,0,0,0,680,29854,0,0,989,58441,0,0,296,6052,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,191:9:0 0,9,95:3:0 0,15,135:5:0 -17 113 . A <*> 0 . DP=19;I16=16,0,0,0,645,28035,0,0,960,57600,0,0,266,5318,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,176:9:0 0,6,87:2:0 0,15,139:5:0 -17 114 . C <*> 0 . 
DP=19;I16=17,0,0,0,674,28788,0,0,989,58441,0,0,286,5856,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,27,182:9:0 0,9,103:3:0 0,15,133:5:0 -17 115 . C <*> 0 . DP=21;I16=18,0,0,0,708,30546,0,0,1049,62041,0,0,274,5490,0,0;QS=3,0;MQ0F=0 PL:DP:DV 0,30,189:10:0 0,6,89:2:0 0,18,147:6:0 -17 116 . A <*> 0 . DP=21;I16=17,1,0,0,727,31755,0,0,1049,62041,0,0,253,5079,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,183:9:0 0,6,90:2:0 0,21,175:7:0 -17 117 . G <*> 0 . DP=21;I16=17,1,0,0,712,30478,0,0,1049,62041,0,0,249,5019,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,183:9:0 0,6,85:2:0 0,21,177:7:0 -17 118 . G <*> 0 . DP=20;I16=16,1,0,0,636,26574,0,0,958,55682,0,0,266,5426,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,175:9:0 0,3,60:1:0 0,21,162:7:0 -17 119 . G <*> 0 . DP=19;I16=16,1,0,0,629,26439,0,0,958,55682,0,0,267,5553,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,175:8:0 0,6,73:2:0 0,21,160:7:0 -17 120 . A <*> 0 . DP=19;I16=16,1,0,0,672,29188,0,0,958,55682,0,0,264,5518,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,175:8:0 0,6,83:2:0 0,21,171:7:0 -17 121 . G <*> 0 . DP=19;I16=16,1,0,0,662,28460,0,0,958,55682,0,0,260,5454,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,181:8:0 0,6,80:2:0 0,21,168:7:0 -17 122 . C <*> 0 . DP=20;I16=17,1,0,0,716,31224,0,0,1018,59282,0,0,256,5410,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,181:8:0 0,9,99:3:0 0,21,178:7:0 -17 123 . T <*> 0 . DP=18;I16=15,1,0,0,661,29997,0,0,898,52082,0,0,255,5385,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,21,167:7:0 0,9,112:3:0 0,18,166:6:0 -17 124 . T <*> 0 . DP=19;I16=17,1,0,0,626,24802,0,0,987,56523,0,0,279,6003,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,154:9:0 0,9,104:3:0 0,18,154:6:0 -17 125 . A <*> 0 . DP=18;I16=15,1,0,0,611,25689,0,0,898,52082,0,0,254,5340,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,21,154:7:0 0,9,104:3:0 0,18,162:6:0 -17 126 . A <*> 0 . DP=18;I16=16,1,0,0,648,27366,0,0,927,52923,0,0,279,5947,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,162:8:0 0,9,107:3:0 0,18,174:6:0 -17 127 . C <*> 0 . 
DP=18;I16=16,1,0,0,646,26972,0,0,927,52923,0,0,279,5949,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,163:8:0 0,9,109:3:0 0,18,160:6:0 -17 128 . A <*> 0 . DP=18;I16=16,1,0,0,673,28797,0,0,927,52923,0,0,279,5971,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,169:8:0 0,9,111:3:0 0,18,162:6:0 -17 129 . A <*> 0 . DP=17;I16=15,1,0,0,645,27891,0,0,867,49323,0,0,280,6012,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,168:8:0 0,9,113:3:0 0,15,159:5:0 -17 130 . A <*> 0 . DP=17;I16=15,1,0,0,641,27295,0,0,867,49323,0,0,281,6071,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,169:8:0 0,9,113:3:0 0,15,152:5:0 -17 131 . C <*> 0 . DP=16;I16=14,1,0,0,606,25732,0,0,838,48482,0,0,256,5472,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,21,167:7:0 0,9,110:3:0 0,15,147:5:0 -17 132 . A <*> 0 . DP=16;I16=14,1,0,0,627,27579,0,0,838,48482,0,0,256,5514,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,21,169:7:0 0,9,110:3:0 0,15,151:5:0 +17 100 . C <*> 0 . DP=18;I16=16,1,0,0,688,29762,0,0,958,55682,0,0,332,7446,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,189:9:0 0,9,118:3:0 0,15,134:5:0 +17 101 . C <*> 0 . DP=18;I16=16,1,0,0,650,27530,0,0,958,55682,0,0,331,7303,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,182:9:0 0,9,108:3:0 0,15,132:5:0 +17 102 . C <*> 0 . DP=18;I16=16,1,0,0,695,30453,0,0,958,55682,0,0,330,7178,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,188:9:0 0,9,121:3:0 0,15,139:5:0 +17 103 . T <*> 0 . DP=18;I16=15,1,0,0,692,31998,0,0,929,54841,0,0,323,7035,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,189:8:0 0,9,118:3:0 0,15,147:5:0 +17 104 . G <*> 0 . DP=18;I16=14,1,0,0,611,26723,0,0,900,54000,0,0,295,6259,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,24,178:8:0 0,6,95:2:0 0,15,133:5:0 +17 105 . G <*> 0 . DP=19;I16=16,1,0,0,604,23936,0,0,989,58441,0,0,317,6751,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,170:9:0 0,9,106:3:0 0,15,125:5:0 +17 106 . G <*> 0 . DP=19;I16=16,1,0,0,644,26574,0,0,989,58441,0,0,299,6093,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,30,190:10:0 0,6,91:2:0 0,15,124:5:0 +17 107 . C <*> 0 . 
DP=19;I16=16,1,0,0,694,30064,0,0,989,58441,0,0,313,6543,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,192:9:0 0,9,119:3:0 0,15,136:5:0 +17 108 . C <*> 0 . DP=19;I16=16,1,0,0,692,30148,0,0,989,58441,0,0,311,6469,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,190:9:0 0,9,119:3:0 0,15,135:5:0 +17 109 . T <*> 0 . DP=19;I16=16,1,0,0,741,34273,0,0,989,58441,0,0,309,6415,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,195:9:0 0,9,120:3:0 0,15,150:5:0 +17 110 . G <*> 0 . DP=19;I16=16,1,0,0,704,31276,0,0,989,58441,0,0,307,6381,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,194:9:0 0,9,113:3:0 0,15,136:5:0 +17 111 . G <*> 0 . DP=19;I16=15,1,0,0,584,24362,0,0,929,54841,0,0,276,5600,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,30,167:10:0 0,6,94:2:0 0,12,118:4:0 +17 112 . C <*> 0 . DP=19;I16=16,1,0,0,680,29854,0,0,989,58441,0,0,301,6277,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,191:9:0 0,9,103:3:0 0,15,135:5:0 +17 113 . A <*> 0 . DP=19;I16=15,1,0,0,645,28035,0,0,960,57600,0,0,272,5582,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,176:9:0 0,6,93:2:0 0,15,139:5:0 +17 114 . C <*> 0 . DP=19;I16=16,1,0,0,674,28788,0,0,989,58441,0,0,293,6157,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,27,182:9:0 0,9,113:3:0 0,15,133:5:0 +17 115 . C <*> 0 . DP=21;I16=17,1,0,0,708,30546,0,0,1049,62041,0,0,282,5826,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,30,189:10:0 0,6,95:2:0 0,18,147:6:0 +17 116 . A <*> 0 . DP=21;I16=16,2,0,0,727,31755,0,0,1049,62041,0,0,262,5448,0,0;QS=3,0;MQSB=0.5;MQ0F=0 PL:DP:DV 0,27,183:9:0 0,6,96:2:0 0,21,175:7:0 +17 117 . G <*> 0 . DP=21;I16=16,2,0,0,712,30478,0,0,1049,62041,0,0,259,5419,0,0;QS=3,0;MQSB=0.5;MQ0F=0 PL:DP:DV 0,27,183:9:0 0,6,91:2:0 0,21,177:7:0 +17 118 . G <*> 0 . DP=20;I16=15,2,0,0,636,26574,0,0,958,55682,0,0,277,5855,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV 0,27,175:9:0 0,3,60:1:0 0,21,162:7:0 +17 119 . G <*> 0 . DP=19;I16=15,2,0,0,629,26439,0,0,958,55682,0,0,279,6009,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV 0,24,175:8:0 0,6,76:2:0 0,21,160:7:0 +17 120 . A <*> 0 . 
DP=19;I16=15,2,0,0,672,29188,0,0,958,55682,0,0,277,5999,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV 0,24,175:8:0 0,6,88:2:0 0,21,171:7:0 +17 121 . G <*> 0 . DP=19;I16=15,2,0,0,662,28460,0,0,958,55682,0,0,274,5958,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV 0,24,181:8:0 0,6,84:2:0 0,21,168:7:0 +17 122 . C <*> 0 . DP=20;I16=16,2,0,0,716,31224,0,0,1018,59282,0,0,271,5935,0,0;QS=3,0;MQSB=0.5625;MQ0F=0 PL:DP:DV 0,24,181:8:0 0,9,107:3:0 0,21,178:7:0 +17 123 . T <*> 0 . DP=18;I16=14,2,0,0,661,29997,0,0,898,52082,0,0,271,5929,0,0;QS=3,0;MQSB=0.571429;MQ0F=0 PL:DP:DV 0,21,167:7:0 0,9,123:3:0 0,18,166:6:0 +17 124 . T <*> 0 . DP=19;I16=16,2,0,0,626,24802,0,0,987,56523,0,0,296,6564,0,0;QS=3,0;MQSB=0.625;MQ0F=0 PL:DP:DV 0,27,154:9:0 0,9,114:3:0 0,18,154:6:0 +17 125 . A <*> 0 . DP=18;I16=14,2,0,0,611,25689,0,0,898,52082,0,0,272,5916,0,0;QS=3,0;MQSB=0.571429;MQ0F=0 PL:DP:DV 0,21,154:7:0 0,9,114:3:0 0,18,162:6:0 +17 126 . A <*> 0 . DP=18;I16=15,2,0,0,648,27366,0,0,927,52923,0,0,298,6536,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV 0,24,162:8:0 0,9,117:3:0 0,18,174:6:0 +17 127 . C <*> 0 . DP=18;I16=15,2,0,0,646,26972,0,0,927,52923,0,0,299,6549,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV 0,24,163:8:0 0,9,119:3:0 0,18,160:6:0 +17 128 . A <*> 0 . DP=18;I16=15,2,0,0,673,28797,0,0,927,52923,0,0,300,6580,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV 0,24,169:8:0 0,9,121:3:0 0,18,162:6:0 +17 129 . A <*> 0 . DP=17;I16=14,2,0,0,645,27891,0,0,867,49323,0,0,302,6628,0,0;QS=3,0;MQSB=0.642857;MQ0F=0 PL:DP:DV 0,24,168:8:0 0,9,124:3:0 0,15,159:5:0 +17 130 . A <*> 0 . DP=17;I16=14,2,0,0,641,27295,0,0,867,49323,0,0,304,6692,0,0;QS=3,0;MQSB=0.642857;MQ0F=0 PL:DP:DV 0,24,169:8:0 0,9,124:3:0 0,15,152:5:0 +17 131 . C <*> 0 . DP=16;I16=13,2,0,0,606,25732,0,0,838,48482,0,0,280,6096,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV 0,21,167:7:0 0,9,120:3:0 0,15,147:5:0 +17 132 . A <*> 0 . DP=16;I16=13,2,0,0,627,27579,0,0,838,48482,0,0,281,6139,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV 0,21,169:7:0 0,9,120:3:0 0,15,151:5:0 17 133 . 
T <*> 0 . DP=15;I16=13,2,0,0,584,22816,0,0,838,48482,0,0,282,6196,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV 0,21,163:7:0 0,9,105:3:0 0,15,150:5:0 17 134 . C <*> 0 . DP=15;I16=13,2,0,0,607,24653,0,0,838,48482,0,0,283,6267,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV 0,21,177:7:0 0,9,105:3:0 0,15,152:5:0 17 135 . T <*> 0 . DP=15;I16=13,2,0,0,600,24178,0,0,838,48482,0,0,284,6352,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV 0,21,173:7:0 0,9,106:3:0 0,15,156:5:0 @@ -363,48 +363,48 @@ 17 440 . A <*> 0 . DP=28;I16=14,14,0,0,1095,43251,0,0,1587,92523,0,0,561,12615,0,0;QS=3,0;MQSB=0.949591;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,115:3:0 0,27,247:9:0 17 441 . G <*> 0 . DP=29;I16=15,14,0,0,1068,40344,0,0,1647,96123,0,0,559,12605,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,104:3:0 0,27,198:9:0 17 442 . A <*> 0 . DP=29;I16=15,14,0,0,1091,41507,0,0,1647,96123,0,0,558,12620,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,112:3:0 0,27,233:9:0 -17 443 . A <*> 0 . DP=30;I16=15,14,0,0,1173,49439,0,0,1647,96123,0,0,557,12661,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,129:3:0 0,27,246:9:0 -17 444 . G <*> 0 . DP=29;I16=15,13,0,0,1095,44661,0,0,1587,92523,0,0,557,12727,0,0;QS=3,0;MQSB=0.968414;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,6,91:2:0 0,27,227:9:0 -17 445 . C <*> 0 . DP=30;I16=16,13,0,0,1100,43706,0,0,1647,96123,0,0,557,12817,0,0;QS=3,0;MQSB=0.976248;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,111:3:0 0,27,219:9:0 -17 446 . A <*> 0 . DP=30;I16=16,13,0,0,1107,44265,0,0,1647,96123,0,0,557,12881,0,0;QS=3,0;MQSB=0.976248;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,115:3:0 0,27,232:9:0 -17 447 . C <*> 0 . DP=29;I16=16,12,0,0,1108,45364,0,0,1618,95282,0,0,555,12817,0,0;QS=3,0;MQSB=0.856268;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,114:3:0 0,27,235:9:0 -17 448 . T <*> 0 . DP=29;I16=16,12,0,0,1125,47237,0,0,1618,95282,0,0,553,12773,0,0;QS=3,0;MQSB=0.856268;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,118:3:0 0,27,240:9:0 -17 449 . A <*> 0 . 
DP=28;I16=15,12,0,0,1091,45981,0,0,1558,91682,0,0,552,12748,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,90:2:0 0,27,245:9:0 -17 450 . G <*> 0 . DP=28;I16=15,12,0,0,1069,44603,0,0,1558,91682,0,0,551,12741,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,91:2:0 0,27,233:9:0 -17 451 . A <*> 0 . DP=28;I16=15,12,0,0,1021,41371,0,0,1558,91682,0,0,550,12752,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,93:2:0 0,27,244:9:0 -17 452 . A <*> 0 . DP=31;I16=18,11,0,0,1079,43353,0,0,1678,98882,0,0,530,12420,0,0;QS=3,0;MQSB=0.884952;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,110:3:0 0,24,225:8:0 -17 453 . A <*> 0 . DP=31;I16=17,11,0,0,1037,41069,0,0,1649,98041,0,0,508,11882,0,0;QS=3,0;MQSB=0.967085;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,111:3:0 0,21,221:7:0 -17 454 . A <*> 0 . DP=31;I16=18,12,0,0,1158,47028,0,0,1738,102482,0,0,554,12904,0,0;QS=3,0;MQSB=0.878946;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,113:3:0 0,30,255:10:0 -17 455 . T <*> 0 . DP=32;I16=17,13,0,0,1148,46574,0,0,1715,100251,0,0,550,12864,0,0;QS=3,0;MQSB=0.973855;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,113:3:0 0,33,255:11:0 -17 456 . G <*> 0 . DP=32;I16=17,13,0,0,1161,47287,0,0,1746,103010,0,0,534,12296,0,0;QS=3,0;MQSB=0.998031;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,116:3:0 0,30,245:10:0 -17 457 . C <*> 0 . DP=33;I16=19,13,0,0,1218,48642,0,0,1835,107451,0,0,563,12967,0,0;QS=3,0;MQSB=0.985204;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,118:3:0 0,33,255:11:0 -17 458 . A <*> 0 . DP=33;I16=19,13,0,0,1226,49034,0,0,1835,107451,0,0,568,12990,0,0;QS=3,0;MQSB=0.985204;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,111:3:0 0,33,255:11:0 -17 459 . T <*> 0 . DP=33;I16=18,13,0,0,1167,46981,0,0,1775,103851,0,0,565,12945,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,6,92:2:0 0,33,255:11:0 -17 460 . G <*> 0 . DP=32;I16=19,12,0,0,1219,50105,0,0,1775,103851,0,0,575,12929,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,116:3:0 0,30,255:10:0 -17 461 . T <*> 0 . 
DP=32;I16=19,12,0,0,1213,49819,0,0,1775,103851,0,0,577,12845,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,115:3:0 0,30,255:10:0 -17 462 . G <*> 0 . DP=32;I16=19,12,0,0,1190,48962,0,0,1775,103851,0,0,580,12792,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,119:4:0 0,30,241:10:0 -17 463 . G <*> 0 . DP=32;I16=19,12,0,0,1114,44214,0,0,1775,103851,0,0,584,12770,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,114:4:0 0,30,221:10:0 -17 464 . A <*> 0 . DP=32;I16=18,11,0,0,1100,43908,0,0,1686,99410,0,0,556,12106,0,0;QS=3,0;MQSB=0.99095;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,133:4:0 0,24,213:8:0 -17 465 . C <*> 0 . DP=33;I16=20,11,0,0,1191,48085,0,0,1775,103851,0,0,586,12786,0,0;QS=3,0;MQSB=0.996597;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,15,140:5:0 0,27,231:9:0 -17 466 . A <*> 0 . DP=34;I16=21,12,0,0,1293,53311,0,0,1895,111051,0,0,597,12897,0,0;QS=3,0;MQSB=0.995633;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,15,154:5:0 0,30,255:10:0 -17 467 . A <*> 0 . DP=34;I16=21,11,0,0,1256,51450,0,0,1835,107451,0,0,597,12891,0,0;QS=3,0;MQSB=0.998231;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,15,157:5:0 0,27,248:9:0 -17 468 . A <*> 0 . DP=35;I16=22,12,0,0,1274,51268,0,0,1955,114651,0,0,604,12904,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,154:5:0 0,30,251:10:0 -17 469 . A <*> 0 . DP=35;I16=22,12,0,0,1285,52989,0,0,1955,114651,0,0,608,12940,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,146:5:0 0,30,255:10:0 -17 470 . G <*> 0 . DP=35;I16=22,12,0,0,1281,51055,0,0,1955,114651,0,0,612,13016,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,148:5:0 0,30,238:10:0 -17 471 . T <*> 0 . DP=36;I16=22,11,0,0,1239,49021,0,0,1918,113282,0,0,599,12825,0,0;QS=3,0;MQSB=0.915545;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,150:5:0 0,27,232:9:0 -17 472 . T <*> 0 . DP=35;I16=21,12,0,0,1245,48915,0,0,1926,113810,0,0,595,12559,0,0;QS=3,0;MQSB=0.988858;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,153:5:0 0,27,237:9:0 -17 473 . G <*> 0 . 
DP=35;I16=21,12,0,0,1307,53473,0,0,1926,113810,0,0,599,12651,0,0;QS=3,0;MQSB=0.988858;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,141:5:0 0,27,249:9:0 -17 474 . G <*> 0 . DP=36;I16=22,12,0,0,1284,51708,0,0,1986,117410,0,0,602,12734,0,0;QS=3,0;MQSB=0.986937;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,131:5:0 0,30,255:10:0 -17 475 . G <*> 0 . DP=36;I16=23,12,0,0,1311,51609,0,0,2015,118251,0,0,631,13485,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,141:5:0 0,33,252:11:0 -17 476 . A <*> 0 . DP=36;I16=23,12,0,0,1312,52078,0,0,2015,118251,0,0,634,13606,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,157:5:0 0,33,255:11:0 -17 477 . T <*> 0 . DP=36;I16=23,12,0,0,1318,52668,0,0,2015,118251,0,0,637,13773,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,148:5:0 0,33,255:11:0 -17 478 . T <*> 0 . DP=38;I16=25,12,0,0,1338,51774,0,0,2135,125451,0,0,637,13833,0,0;QS=3,0;MQSB=0.999868;MQ0F=0 PL:DP:DV 0,60,255:20:0 0,18,154:6:0 0,33,255:11:0 -17 479 . A <*> 0 . DP=38;I16=25,12,0,0,1420,57788,0,0,2135,125451,0,0,639,13935,0,0;QS=3,0;MQSB=0.999868;MQ0F=0 PL:DP:DV 0,60,255:20:0 0,18,163:6:0 0,33,255:11:0 -17 480 . G <*> 0 . DP=37;I16=25,11,0,0,1438,60172,0,0,2075,121851,0,0,641,14029,0,0;QS=3,0;MQSB=0.999853;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,165:6:0 0,33,255:11:0 -17 481 . G <*> 0 . DP=37;I16=25,11,0,0,1392,55824,0,0,2075,121851,0,0,642,14112,0,0;QS=3,0;MQSB=0.999853;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,165:6:0 0,33,255:11:0 -17 482 . A <*> 0 . DP=37;I16=24,11,0,0,1352,55134,0,0,2015,118251,0,0,618,13608,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,143:5:0 0,33,255:11:0 -17 483 . G <*> 0 . DP=37;I16=24,12,0,0,1417,57747,0,0,2075,121851,0,0,642,14240,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,165:6:0 0,33,255:11:0 -17 484 . A <*> 0 . DP=36;I16=24,11,0,0,1340,53992,0,0,2015,118251,0,0,643,14281,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,18,168:6:0 0,33,255:11:0 +17 443 . A <*> 0 . 
DP=30;I16=14,15,0,0,1173,49439,0,0,1647,96123,0,0,532,12036,0,0;QS=3,0;MQSB=0.940868;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,130:3:0 0,27,246:9:0 +17 444 . G <*> 0 . DP=29;I16=14,14,0,0,1095,44661,0,0,1587,92523,0,0,533,12103,0,0;QS=3,0;MQSB=0.949591;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,6,98:2:0 0,27,227:9:0 +17 445 . C <*> 0 . DP=30;I16=15,14,0,0,1100,43706,0,0,1647,96123,0,0,534,12196,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,122:3:0 0,27,219:9:0 +17 446 . A <*> 0 . DP=30;I16=15,14,0,0,1107,44265,0,0,1647,96123,0,0,535,12265,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,127:3:0 0,27,232:9:0 +17 447 . C <*> 0 . DP=29;I16=15,13,0,0,1108,45364,0,0,1618,95282,0,0,534,12208,0,0;QS=3,0;MQSB=0.835846;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,125:3:0 0,27,235:9:0 +17 448 . T <*> 0 . DP=29;I16=15,13,0,0,1125,47237,0,0,1618,95282,0,0,533,12173,0,0;QS=3,0;MQSB=0.835846;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,130:3:0 0,27,240:9:0 +17 449 . A <*> 0 . DP=28;I16=14,13,0,0,1091,45981,0,0,1558,91682,0,0,533,12159,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,97:2:0 0,27,245:9:0 +17 450 . G <*> 0 . DP=28;I16=14,13,0,0,1069,44603,0,0,1558,91682,0,0,533,12165,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,98:2:0 0,27,233:9:0 +17 451 . A <*> 0 . DP=28;I16=14,13,0,0,1021,41371,0,0,1558,91682,0,0,533,12191,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,6,100:2:0 0,27,244:9:0 +17 452 . A <*> 0 . DP=31;I16=17,12,0,0,1079,43353,0,0,1678,98882,0,0,514,11876,0,0;QS=3,0;MQSB=0.868336;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,121:3:0 0,24,225:8:0 +17 453 . A <*> 0 . DP=31;I16=16,12,0,0,1037,41069,0,0,1649,98041,0,0,493,11357,0,0;QS=3,0;MQSB=0.96195;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,122:3:0 0,21,221:7:0 +17 454 . A <*> 0 . DP=31;I16=17,13,0,0,1158,47028,0,0,1738,102482,0,0,540,12400,0,0;QS=3,0;MQSB=0.862425;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,124:3:0 0,30,255:10:0 +17 455 . T <*> 0 . 
DP=32;I16=16,14,0,0,1148,46574,0,0,1715,100251,0,0,537,12383,0,0;QS=3,0;MQSB=0.958545;MQ0F=0 PL:DP:DV 0,48,255:16:0 0,9,124:3:0 0,33,255:11:0 +17 456 . G <*> 0 . DP=32;I16=16,14,0,0,1161,47287,0,0,1746,103010,0,0,522,11840,0,0;QS=3,0;MQSB=0.999784;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,9,128:3:0 0,30,245:10:0 +17 457 . C <*> 0 . DP=33;I16=18,14,0,0,1218,48642,0,0,1835,107451,0,0,552,12538,0,0;QS=3,0;MQSB=0.97436;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,131:3:0 0,33,255:11:0 +17 458 . A <*> 0 . DP=33;I16=18,14,0,0,1226,49034,0,0,1835,107451,0,0,558,12590,0,0;QS=3,0;MQSB=0.97436;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,122:3:0 0,33,255:11:0 +17 459 . T <*> 0 . DP=33;I16=17,14,0,0,1167,46981,0,0,1775,103851,0,0,556,12576,0,0;QS=3,0;MQSB=0.967263;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,6,99:2:0 0,33,255:11:0 +17 460 . G <*> 0 . DP=32;I16=18,13,0,0,1219,50105,0,0,1775,103851,0,0,568,12642,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,128:3:0 0,30,255:10:0 +17 461 . T <*> 0 . DP=32;I16=18,13,0,0,1213,49819,0,0,1775,103851,0,0,572,12640,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,9,127:3:0 0,30,255:10:0 +17 462 . G <*> 0 . DP=32;I16=18,13,0,0,1190,48962,0,0,1775,103851,0,0,577,12669,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,131:4:0 0,30,241:10:0 +17 463 . G <*> 0 . DP=32;I16=18,13,0,0,1114,44214,0,0,1775,103851,0,0,583,12729,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,126:4:0 0,30,221:10:0 +17 464 . A <*> 0 . DP=32;I16=17,12,0,0,1100,43908,0,0,1686,99410,0,0,557,12147,0,0;QS=3,0;MQSB=0.996086;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,12,148:4:0 0,24,213:8:0 +17 465 . C <*> 0 . DP=33;I16=19,12,0,0,1191,48085,0,0,1775,103851,0,0,589,12909,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,15,158:5:0 0,27,231:9:0 +17 466 . A <*> 0 . DP=34;I16=20,13,0,0,1293,53311,0,0,1895,111051,0,0,602,13102,0,0;QS=3,0;MQSB=0.989199;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,15,173:5:0 0,30,255:10:0 +17 467 . A <*> 0 . 
DP=34;I16=20,12,0,0,1256,51450,0,0,1835,107451,0,0,604,13178,0,0;QS=3,0;MQSB=0.993205;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,15,177:5:0 0,27,248:9:0 +17 468 . A <*> 0 . DP=35;I16=21,13,0,0,1274,51268,0,0,1955,114651,0,0,613,13273,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,174:5:0 0,30,251:10:0 +17 469 . A <*> 0 . DP=35;I16=21,13,0,0,1285,52989,0,0,1955,114651,0,0,618,13340,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,165:5:0 0,30,255:10:0 +17 470 . G <*> 0 . DP=35;I16=21,13,0,0,1281,51055,0,0,1955,114651,0,0,623,13445,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,167:5:0 0,30,238:10:0 +17 471 . T <*> 0 . DP=36;I16=21,12,0,0,1239,49021,0,0,1918,113282,0,0,611,13281,0,0;QS=3,0;MQSB=0.904077;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,169:5:0 0,27,232:9:0 +17 472 . T <*> 0 . DP=35;I16=20,13,0,0,1245,48915,0,0,1926,113810,0,0,608,13040,0,0;QS=3,0;MQSB=0.99391;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,172:5:0 0,27,237:9:0 +17 473 . G <*> 0 . DP=35;I16=20,13,0,0,1307,53473,0,0,1926,113810,0,0,613,13155,0,0;QS=3,0;MQSB=0.99391;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,159:5:0 0,27,249:9:0 +17 474 . G <*> 0 . DP=36;I16=21,13,0,0,1284,51708,0,0,1986,117410,0,0,617,13259,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,146:5:0 0,30,255:10:0 +17 475 . G <*> 0 . DP=36;I16=22,13,0,0,1311,51609,0,0,2015,118251,0,0,647,14029,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,159:5:0 0,33,252:11:0 +17 476 . A <*> 0 . DP=36;I16=22,13,0,0,1312,52078,0,0,2015,118251,0,0,651,14167,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,177:5:0 0,33,255:11:0 +17 477 . T <*> 0 . DP=36;I16=22,13,0,0,1318,52668,0,0,2015,118251,0,0,655,14349,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,167:5:0 0,33,255:11:0 +17 478 . T <*> 0 . DP=38;I16=24,13,0,0,1338,51774,0,0,2135,125451,0,0,656,14422,0,0;QS=3,0;MQSB=0.997978;MQ0F=0 PL:DP:DV 0,60,255:20:0 0,18,174:6:0 0,33,255:11:0 +17 479 . A <*> 0 . 
DP=38;I16=24,13,0,0,1420,57788,0,0,2135,125451,0,0,659,14535,0,0;QS=3,0;MQSB=0.997978;MQ0F=0 PL:DP:DV 0,60,255:20:0 0,18,184:6:0 0,33,255:11:0 +17 480 . G <*> 0 . DP=37;I16=24,12,0,0,1438,60172,0,0,2075,121851,0,0,662,14638,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,187:6:0 0,33,255:11:0 +17 481 . G <*> 0 . DP=37;I16=24,12,0,0,1392,55824,0,0,2075,121851,0,0,664,14728,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,187:6:0 0,33,255:11:0 +17 482 . A <*> 0 . DP=37;I16=23,12,0,0,1352,55134,0,0,2015,118251,0,0,641,14229,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,15,161:5:0 0,33,255:11:0 +17 483 . G <*> 0 . DP=37;I16=23,13,0,0,1417,57747,0,0,2075,121851,0,0,666,14864,0,0;QS=3,0;MQSB=0.996616;MQ0F=0 PL:DP:DV 0,57,255:19:0 0,18,187:6:0 0,33,255:11:0 +17 484 . A <*> 0 . DP=36;I16=23,12,0,0,1340,53992,0,0,2015,118251,0,0,668,14906,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,18,191:6:0 0,33,255:11:0 17 485 . G <*> 0 . DP=35;I16=23,12,0,0,1329,51411,0,0,2015,118251,0,0,669,14931,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV 0,54,255:18:0 0,18,160:6:0 0,33,255:11:0 17 486 . A <*> 0 . DP=34;I16=22,12,0,0,1311,51523,0,0,1955,114651,0,0,671,14989,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,18,173:6:0 0,33,255:11:0 17 487 . G <*> 0 . DP=34;I16=22,12,0,0,1306,50760,0,0,1955,114651,0,0,672,15030,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV 0,51,255:17:0 0,18,169:6:0 0,33,255:11:0 diff -Nru samtools-1.11/test/dat/mpileup.out.4 samtools-1.13/test/dat/mpileup.out.4 --- samtools-1.11/test/dat/mpileup.out.4 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/mpileup.out.4 2021-07-07 17:18:52.000000000 +0000 @@ -23,39 +23,39 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -17 100 . C <*> 0 . 
DP=18;DPR=17,0;I16=17,0,0,0,688,29762,0,0,958,55682,0,0,332,7446,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,189:9:0:0:9,0,0,0:9,0 0,9,108:3:0:0:3,0,0,0:3,0 0,15,134:5:0:0:5,0,0,0:5,0 -17 101 . C <*> 0 . DP=18;DPR=17,0;I16=17,0,0,0,650,27530,0,0,958,55682,0,0,331,7303,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,182:9:0:0:9,0,0,0:9,0 0,9,99:3:0:0:3,0,0,0:3,0 0,15,132:5:0:0:5,0,0,0:5,0 -17 102 . C <*> 0 . DP=18;DPR=17,0;I16=17,0,0,0,695,30453,0,0,958,55682,0,0,330,7178,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,188:9:0:0:9,0,0,0:9,0 0,9,111:3:0:0:3,0,0,0:3,0 0,15,139:5:0:0:5,0,0,0:5,0 -17 103 . T <*> 0 . DP=18;DPR=16,0;I16=16,0,0,0,692,31998,0,0,929,54841,0,0,323,7035,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,189:8:0:0:8,0,0,0:8,0 0,9,108:3:0:0:3,0,0,0:3,0 0,15,147:5:0:0:5,0,0,0:5,0 -17 104 . G <*> 0 . DP=18;DPR=15,0;I16=15,0,0,0,611,26723,0,0,900,54000,0,0,295,6259,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,178:8:0:0:8,0,0,0:8,0 0,6,89:2:0:0:2,0,0,0:2,0 0,15,133:5:0:0:5,0,0,0:5,0 -17 105 . G <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,604,23936,0,0,989,58441,0,0,317,6751,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,170:9:0:0:9,0,0,0:9,0 0,9,97:3:0:0:3,0,0,0:3,0 0,15,125:5:0:0:5,0,0,0:5,0 -17 106 . G <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,644,26574,0,0,989,58441,0,0,299,6093,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,190:10:0:0:10,0,0,0:10,0 0,6,85:2:0:0:2,0,0,0:2,0 0,15,124:5:0:0:5,0,0,0:5,0 -17 107 . C <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,694,30064,0,0,989,58441,0,0,313,6543,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,192:9:0:0:9,0,0,0:9,0 0,9,108:3:0:0:3,0,0,0:3,0 0,15,136:5:0:0:5,0,0,0:5,0 -17 108 . C <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,692,30148,0,0,989,58441,0,0,310,6420,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,190:9:0:0:9,0,0,0:9,0 0,9,108:3:0:0:3,0,0,0:3,0 0,15,135:5:0:0:5,0,0,0:5,0 -17 109 . T <*> 0 . 
DP=19;DPR=17,0;I16=17,0,0,0,741,34273,0,0,989,58441,0,0,307,6319,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,195:9:0:0:9,0,0,0:9,0 0,9,110:3:0:0:3,0,0,0:3,0 0,15,150:5:0:0:5,0,0,0:5,0 -17 110 . G <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,704,31276,0,0,989,58441,0,0,304,6240,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,194:9:0:0:9,0,0,0:9,0 0,9,104:3:0:0:3,0,0,0:3,0 0,15,136:5:0:0:5,0,0,0:5,0 -17 111 . G <*> 0 . DP=19;DPR=16,0;I16=16,0,0,0,584,24362,0,0,929,54841,0,0,272,5416,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,167:10:0:0:10,0,0,0:10,0 0,6,88:2:0:0:2,0,0,0:2,0 0,12,118:4:0:0:4,0,0,0:4,0 -17 112 . C <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,680,29854,0,0,989,58441,0,0,296,6052,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,191:9:0:0:9,0,0,0:9,0 0,9,95:3:0:0:3,0,0,0:3,0 0,15,135:5:0:0:5,0,0,0:5,0 -17 113 . A <*> 0 . DP=19;DPR=16,0;I16=16,0,0,0,645,28035,0,0,960,57600,0,0,266,5318,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,176:9:0:0:9,0,0,0:9,0 0,6,87:2:0:0:2,0,0,0:2,0 0,15,139:5:0:0:5,0,0,0:5,0 -17 114 . C <*> 0 . DP=19;DPR=17,0;I16=17,0,0,0,674,28788,0,0,989,58441,0,0,286,5856,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,182:9:0:0:9,0,0,0:9,0 0,9,103:3:0:0:3,0,0,0:3,0 0,15,133:5:0:0:5,0,0,0:5,0 -17 115 . C <*> 0 . DP=21;DPR=18,0;I16=18,0,0,0,708,30546,0,0,1049,62041,0,0,274,5490,0,0;QS=3,0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,189:10:0:0:10,0,0,0:10,0 0,6,89:2:0:0:2,0,0,0:2,0 0,18,147:6:0:0:6,0,0,0:6,0 -17 116 . A <*> 0 . DP=21;DPR=18,0;I16=17,1,0,0,727,31755,0,0,1049,62041,0,0,253,5079,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,183:9:0:0:9,0,0,0:9,0 0,6,90:2:0:0:2,0,0,0:2,0 0,21,175:7:0:0:6,1,0,0:7,0 -17 117 . G <*> 0 . DP=21;DPR=18,0;I16=17,1,0,0,712,30478,0,0,1049,62041,0,0,249,5019,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,183:9:0:0:9,0,0,0:9,0 0,6,85:2:0:0:2,0,0,0:2,0 0,21,177:7:0:0:6,1,0,0:7,0 -17 118 . G <*> 0 . 
DP=20;DPR=17,0;I16=16,1,0,0,636,26574,0,0,958,55682,0,0,266,5426,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,175:9:0:0:9,0,0,0:9,0 0,3,60:1:0:0:1,0,0,0:1,0 0,21,162:7:0:0:6,1,0,0:7,0 -17 119 . G <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,629,26439,0,0,958,55682,0,0,267,5553,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,175:8:0:0:8,0,0,0:8,0 0,6,73:2:0:0:2,0,0,0:2,0 0,21,160:7:0:0:6,1,0,0:7,0 -17 120 . A <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,672,29188,0,0,958,55682,0,0,264,5518,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,175:8:0:0:8,0,0,0:8,0 0,6,83:2:0:0:2,0,0,0:2,0 0,21,171:7:0:0:6,1,0,0:7,0 -17 121 . G <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,662,28460,0,0,958,55682,0,0,260,5454,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,181:8:0:0:8,0,0,0:8,0 0,6,80:2:0:0:2,0,0,0:2,0 0,21,168:7:0:0:6,1,0,0:7,0 -17 122 . C <*> 0 . DP=20;DPR=18,0;I16=17,1,0,0,716,31224,0,0,1018,59282,0,0,256,5410,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,181:8:0:0:8,0,0,0:8,0 0,9,99:3:0:0:3,0,0,0:3,0 0,21,178:7:0:0:6,1,0,0:7,0 -17 123 . T <*> 0 . DP=18;DPR=16,0;I16=15,1,0,0,661,29997,0,0,898,52082,0,0,255,5385,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,167:7:0:0:7,0,0,0:7,0 0,9,112:3:0:0:3,0,0,0:3,0 0,18,166:6:0:0:5,1,0,0:6,0 -17 124 . T <*> 0 . DP=19;DPR=18,0;I16=17,1,0,0,626,24802,0,0,987,56523,0,0,279,6003,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,154:9:0:0:9,0,0,0:9,0 0,9,104:3:0:0:3,0,0,0:3,0 0,18,154:6:0:0:5,1,0,0:6,0 -17 125 . A <*> 0 . DP=18;DPR=16,0;I16=15,1,0,0,611,25689,0,0,898,52082,0,0,254,5340,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,154:7:0:0:7,0,0,0:7,0 0,9,104:3:0:0:3,0,0,0:3,0 0,18,162:6:0:0:5,1,0,0:6,0 -17 126 . A <*> 0 . DP=18;DPR=17,0;I16=16,1,0,0,648,27366,0,0,927,52923,0,0,279,5947,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,162:8:0:0:8,0,0,0:8,0 0,9,107:3:0:0:3,0,0,0:3,0 0,18,174:6:0:0:5,1,0,0:6,0 -17 127 . C <*> 0 . 
DP=18;DPR=17,0;I16=16,1,0,0,646,26972,0,0,927,52923,0,0,279,5949,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,163:8:0:0:8,0,0,0:8,0 0,9,109:3:0:0:3,0,0,0:3,0 0,18,160:6:0:0:5,1,0,0:6,0 -17 128 . A <*> 0 . DP=18;DPR=17,0;I16=16,1,0,0,673,28797,0,0,927,52923,0,0,279,5971,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,169:8:0:0:8,0,0,0:8,0 0,9,111:3:0:0:3,0,0,0:3,0 0,18,162:6:0:0:5,1,0,0:6,0 -17 129 . A <*> 0 . DP=17;DPR=16,0;I16=15,1,0,0,645,27891,0,0,867,49323,0,0,280,6012,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,168:8:0:0:8,0,0,0:8,0 0,9,113:3:0:0:3,0,0,0:3,0 0,15,159:5:0:0:4,1,0,0:5,0 -17 130 . A <*> 0 . DP=17;DPR=16,0;I16=15,1,0,0,641,27295,0,0,867,49323,0,0,281,6071,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,169:8:0:0:8,0,0,0:8,0 0,9,113:3:0:0:3,0,0,0:3,0 0,15,152:5:0:0:4,1,0,0:5,0 -17 131 . C <*> 0 . DP=16;DPR=15,0;I16=14,1,0,0,606,25732,0,0,838,48482,0,0,256,5472,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,167:7:0:0:7,0,0,0:7,0 0,9,110:3:0:0:3,0,0,0:3,0 0,15,147:5:0:0:4,1,0,0:5,0 -17 132 . A <*> 0 . DP=16;DPR=15,0;I16=14,1,0,0,627,27579,0,0,838,48482,0,0,256,5514,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,169:7:0:0:7,0,0,0:7,0 0,9,110:3:0:0:3,0,0,0:3,0 0,15,151:5:0:0:4,1,0,0:5,0 +17 100 . C <*> 0 . DP=18;DPR=17,0;I16=16,1,0,0,688,29762,0,0,958,55682,0,0,332,7446,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,189:9:0:0:9,0,0,0:9,0 0,9,118:3:0:0:2,1,0,0:3,0 0,15,134:5:0:0:5,0,0,0:5,0 +17 101 . C <*> 0 . DP=18;DPR=17,0;I16=16,1,0,0,650,27530,0,0,958,55682,0,0,331,7303,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,182:9:0:0:9,0,0,0:9,0 0,9,108:3:0:0:2,1,0,0:3,0 0,15,132:5:0:0:5,0,0,0:5,0 +17 102 . C <*> 0 . DP=18;DPR=17,0;I16=16,1,0,0,695,30453,0,0,958,55682,0,0,330,7178,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,188:9:0:0:9,0,0,0:9,0 0,9,121:3:0:0:2,1,0,0:3,0 0,15,139:5:0:0:5,0,0,0:5,0 +17 103 . T <*> 0 . 
DP=18;DPR=16,0;I16=15,1,0,0,692,31998,0,0,929,54841,0,0,323,7035,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,189:8:0:0:8,0,0,0:8,0 0,9,118:3:0:0:2,1,0,0:3,0 0,15,147:5:0:0:5,0,0,0:5,0 +17 104 . G <*> 0 . DP=18;DPR=15,0;I16=14,1,0,0,611,26723,0,0,900,54000,0,0,295,6259,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,178:8:0:0:8,0,0,0:8,0 0,6,95:2:0:0:1,1,0,0:2,0 0,15,133:5:0:0:5,0,0,0:5,0 +17 105 . G <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,604,23936,0,0,989,58441,0,0,317,6751,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,170:9:0:0:9,0,0,0:9,0 0,9,106:3:0:0:2,1,0,0:3,0 0,15,125:5:0:0:5,0,0,0:5,0 +17 106 . G <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,644,26574,0,0,989,58441,0,0,299,6093,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,190:10:0:0:10,0,0,0:10,0 0,6,91:2:0:0:1,1,0,0:2,0 0,15,124:5:0:0:5,0,0,0:5,0 +17 107 . C <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,694,30064,0,0,989,58441,0,0,313,6543,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,192:9:0:0:9,0,0,0:9,0 0,9,119:3:0:0:2,1,0,0:3,0 0,15,136:5:0:0:5,0,0,0:5,0 +17 108 . C <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,692,30148,0,0,989,58441,0,0,311,6469,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,190:9:0:0:9,0,0,0:9,0 0,9,119:3:0:0:2,1,0,0:3,0 0,15,135:5:0:0:5,0,0,0:5,0 +17 109 . T <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,741,34273,0,0,989,58441,0,0,309,6415,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,195:9:0:0:9,0,0,0:9,0 0,9,120:3:0:0:2,1,0,0:3,0 0,15,150:5:0:0:5,0,0,0:5,0 +17 110 . G <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,704,31276,0,0,989,58441,0,0,307,6381,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,194:9:0:0:9,0,0,0:9,0 0,9,113:3:0:0:2,1,0,0:3,0 0,15,136:5:0:0:5,0,0,0:5,0 +17 111 . G <*> 0 . DP=19;DPR=16,0;I16=15,1,0,0,584,24362,0,0,929,54841,0,0,276,5600,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,167:10:0:0:10,0,0,0:10,0 0,6,94:2:0:0:1,1,0,0:2,0 0,12,118:4:0:0:4,0,0,0:4,0 +17 112 . C <*> 0 . 
DP=19;DPR=17,0;I16=16,1,0,0,680,29854,0,0,989,58441,0,0,301,6277,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,191:9:0:0:9,0,0,0:9,0 0,9,103:3:0:0:2,1,0,0:3,0 0,15,135:5:0:0:5,0,0,0:5,0 +17 113 . A <*> 0 . DP=19;DPR=16,0;I16=15,1,0,0,645,28035,0,0,960,57600,0,0,272,5582,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,176:9:0:0:9,0,0,0:9,0 0,6,93:2:0:0:1,1,0,0:2,0 0,15,139:5:0:0:5,0,0,0:5,0 +17 114 . C <*> 0 . DP=19;DPR=17,0;I16=16,1,0,0,674,28788,0,0,989,58441,0,0,293,6157,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,182:9:0:0:9,0,0,0:9,0 0,9,113:3:0:0:2,1,0,0:3,0 0,15,133:5:0:0:5,0,0,0:5,0 +17 115 . C <*> 0 . DP=21;DPR=18,0;I16=17,1,0,0,708,30546,0,0,1049,62041,0,0,282,5826,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,189:10:0:0:10,0,0,0:10,0 0,6,95:2:0:0:1,1,0,0:2,0 0,18,147:6:0:0:6,0,0,0:6,0 +17 116 . A <*> 0 . DP=21;DPR=18,0;I16=16,2,0,0,727,31755,0,0,1049,62041,0,0,262,5448,0,0;QS=3,0;MQSB=0.5;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,183:9:0:0:9,0,0,0:9,0 0,6,96:2:0:0:1,1,0,0:2,0 0,21,175:7:0:0:6,1,0,0:7,0 +17 117 . G <*> 0 . DP=21;DPR=18,0;I16=16,2,0,0,712,30478,0,0,1049,62041,0,0,259,5419,0,0;QS=3,0;MQSB=0.5;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,183:9:0:0:9,0,0,0:9,0 0,6,91:2:0:0:1,1,0,0:2,0 0,21,177:7:0:0:6,1,0,0:7,0 +17 118 . G <*> 0 . DP=20;DPR=17,0;I16=15,2,0,0,636,26574,0,0,958,55682,0,0,277,5855,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,175:9:0:0:9,0,0,0:9,0 0,3,60:1:0:0:0,1,0,0:1,0 0,21,162:7:0:0:6,1,0,0:7,0 +17 119 . G <*> 0 . DP=19;DPR=17,0;I16=15,2,0,0,629,26439,0,0,958,55682,0,0,279,6009,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,175:8:0:0:8,0,0,0:8,0 0,6,76:2:0:0:1,1,0,0:2,0 0,21,160:7:0:0:6,1,0,0:7,0 +17 120 . A <*> 0 . DP=19;DPR=17,0;I16=15,2,0,0,672,29188,0,0,958,55682,0,0,277,5999,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,175:8:0:0:8,0,0,0:8,0 0,6,88:2:0:0:1,1,0,0:2,0 0,21,171:7:0:0:6,1,0,0:7,0 +17 121 . G <*> 0 . 
DP=19;DPR=17,0;I16=15,2,0,0,662,28460,0,0,958,55682,0,0,274,5958,0,0;QS=3,0;MQSB=0.566667;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,181:8:0:0:8,0,0,0:8,0 0,6,84:2:0:0:1,1,0,0:2,0 0,21,168:7:0:0:6,1,0,0:7,0 +17 122 . C <*> 0 . DP=20;DPR=18,0;I16=16,2,0,0,716,31224,0,0,1018,59282,0,0,271,5935,0,0;QS=3,0;MQSB=0.5625;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,181:8:0:0:8,0,0,0:8,0 0,9,107:3:0:0:2,1,0,0:3,0 0,21,178:7:0:0:6,1,0,0:7,0 +17 123 . T <*> 0 . DP=18;DPR=16,0;I16=14,2,0,0,661,29997,0,0,898,52082,0,0,271,5929,0,0;QS=3,0;MQSB=0.571429;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,167:7:0:0:7,0,0,0:7,0 0,9,123:3:0:0:2,1,0,0:3,0 0,18,166:6:0:0:5,1,0,0:6,0 +17 124 . T <*> 0 . DP=19;DPR=18,0;I16=16,2,0,0,626,24802,0,0,987,56523,0,0,296,6564,0,0;QS=3,0;MQSB=0.625;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,27,154:9:0:0:9,0,0,0:9,0 0,9,114:3:0:0:2,1,0,0:3,0 0,18,154:6:0:0:5,1,0,0:6,0 +17 125 . A <*> 0 . DP=18;DPR=16,0;I16=14,2,0,0,611,25689,0,0,898,52082,0,0,272,5916,0,0;QS=3,0;MQSB=0.571429;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,154:7:0:0:7,0,0,0:7,0 0,9,114:3:0:0:2,1,0,0:3,0 0,18,162:6:0:0:5,1,0,0:6,0 +17 126 . A <*> 0 . DP=18;DPR=17,0;I16=15,2,0,0,648,27366,0,0,927,52923,0,0,298,6536,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,162:8:0:0:8,0,0,0:8,0 0,9,117:3:0:0:2,1,0,0:3,0 0,18,174:6:0:0:5,1,0,0:6,0 +17 127 . C <*> 0 . DP=18;DPR=17,0;I16=15,2,0,0,646,26972,0,0,927,52923,0,0,299,6549,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,163:8:0:0:8,0,0,0:8,0 0,9,119:3:0:0:2,1,0,0:3,0 0,18,160:6:0:0:5,1,0,0:6,0 +17 128 . A <*> 0 . DP=18;DPR=17,0;I16=15,2,0,0,673,28797,0,0,927,52923,0,0,300,6580,0,0;QS=3,0;MQSB=0.633333;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,169:8:0:0:8,0,0,0:8,0 0,9,121:3:0:0:2,1,0,0:3,0 0,18,162:6:0:0:5,1,0,0:6,0 +17 129 . A <*> 0 . DP=17;DPR=16,0;I16=14,2,0,0,645,27891,0,0,867,49323,0,0,302,6628,0,0;QS=3,0;MQSB=0.642857;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,168:8:0:0:8,0,0,0:8,0 0,9,124:3:0:0:2,1,0,0:3,0 0,15,159:5:0:0:4,1,0,0:5,0 +17 130 . A <*> 0 . 
DP=17;DPR=16,0;I16=14,2,0,0,641,27295,0,0,867,49323,0,0,304,6692,0,0;QS=3,0;MQSB=0.642857;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,24,169:8:0:0:8,0,0,0:8,0 0,9,124:3:0:0:2,1,0,0:3,0 0,15,152:5:0:0:4,1,0,0:5,0 +17 131 . C <*> 0 . DP=16;DPR=15,0;I16=13,2,0,0,606,25732,0,0,838,48482,0,0,280,6096,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,167:7:0:0:7,0,0,0:7,0 0,9,120:3:0:0:2,1,0,0:3,0 0,15,147:5:0:0:4,1,0,0:5,0 +17 132 . A <*> 0 . DP=16;DPR=15,0;I16=13,2,0,0,627,27579,0,0,838,48482,0,0,281,6139,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,169:7:0:0:7,0,0,0:7,0 0,9,120:3:0:0:2,1,0,0:3,0 0,15,151:5:0:0:4,1,0,0:5,0 17 133 . T <*> 0 . DP=15;DPR=15,0;I16=13,2,0,0,584,22816,0,0,838,48482,0,0,282,6196,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,163:7:0:0:7,0,0,0:7,0 0,9,105:3:0:0:2,1,0,0:3,0 0,15,150:5:0:0:4,1,0,0:5,0 17 134 . C <*> 0 . DP=15;DPR=15,0;I16=13,2,0,0,607,24653,0,0,838,48482,0,0,283,6267,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,177:7:0:0:7,0,0,0:7,0 0,9,105:3:0:0:2,1,0,0:3,0 0,15,152:5:0:0:4,1,0,0:5,0 17 135 . T <*> 0 . DP=15;DPR=15,0;I16=13,2,0,0,600,24178,0,0,838,48482,0,0,284,6352,0,0;QS=3,0;MQSB=0.576923;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,21,173:7:0:0:7,0,0,0:7,0 0,9,106:3:0:0:2,1,0,0:3,0 0,15,156:5:0:0:4,1,0,0:5,0 @@ -367,48 +367,48 @@ 17 440 . A <*> 0 . DP=28;DPR=28,0;I16=14,14,0,0,1095,43251,0,0,1587,92523,0,0,561,12615,0,0;QS=3,0;MQSB=0.949591;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:8,8,0,0:16,0 0,9,115:3:0:0:2,1,0,0:3,0 0,27,247:9:0:0:4,5,0,0:9,0 17 441 . G <*> 0 . DP=29;DPR=29,0;I16=15,14,0,0,1068,40344,0,0,1647,96123,0,0,559,12605,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,104:3:0:0:2,1,0,0:3,0 0,27,198:9:0:0:4,5,0,0:9,0 17 442 . A <*> 0 . DP=29;DPR=29,0;I16=15,14,0,0,1091,41507,0,0,1647,96123,0,0,558,12620,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,112:3:0:0:2,1,0,0:3,0 0,27,233:9:0:0:4,5,0,0:9,0 -17 443 . A <*> 0 . 
DP=30;DPR=29,0;I16=15,14,0,0,1173,49439,0,0,1647,96123,0,0,557,12661,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,129:3:0:0:2,1,0,0:3,0 0,27,246:9:0:0:4,5,0,0:9,0 -17 444 . G <*> 0 . DP=29;DPR=28,0;I16=15,13,0,0,1095,44661,0,0,1587,92523,0,0,557,12727,0,0;QS=3,0;MQSB=0.968414;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,6,91:2:0:0:2,0,0,0:2,0 0,27,227:9:0:0:4,5,0,0:9,0 -17 445 . C <*> 0 . DP=30;DPR=29,0;I16=16,13,0,0,1100,43706,0,0,1647,96123,0,0,557,12817,0,0;QS=3,0;MQSB=0.976248;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,111:3:0:0:3,0,0,0:3,0 0,27,219:9:0:0:4,5,0,0:9,0 -17 446 . A <*> 0 . DP=30;DPR=29,0;I16=16,13,0,0,1107,44265,0,0,1647,96123,0,0,557,12881,0,0;QS=3,0;MQSB=0.976248;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,115:3:0:0:3,0,0,0:3,0 0,27,232:9:0:0:4,5,0,0:9,0 -17 447 . C <*> 0 . DP=29;DPR=28,0;I16=16,12,0,0,1108,45364,0,0,1618,95282,0,0,555,12817,0,0;QS=3,0;MQSB=0.856268;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,114:3:0:0:3,0,0,0:3,0 0,27,235:9:0:0:4,5,0,0:9,0 -17 448 . T <*> 0 . DP=29;DPR=28,0;I16=16,12,0,0,1125,47237,0,0,1618,95282,0,0,553,12773,0,0;QS=3,0;MQSB=0.856268;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,118:3:0:0:3,0,0,0:3,0 0,27,240:9:0:0:4,5,0,0:9,0 -17 449 . A <*> 0 . DP=28;DPR=27,0;I16=15,12,0,0,1091,45981,0,0,1558,91682,0,0,552,12748,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,90:2:0:0:2,0,0,0:2,0 0,27,245:9:0:0:4,5,0,0:9,0 -17 450 . G <*> 0 . DP=28;DPR=27,0;I16=15,12,0,0,1069,44603,0,0,1558,91682,0,0,551,12741,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,91:2:0:0:2,0,0,0:2,0 0,27,233:9:0:0:4,5,0,0:9,0 -17 451 . A <*> 0 . DP=28;DPR=27,0;I16=15,12,0,0,1021,41371,0,0,1558,91682,0,0,550,12752,0,0;QS=3,0;MQSB=0.84246;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,93:2:0:0:2,0,0,0:2,0 0,27,244:9:0:0:4,5,0,0:9,0 -17 452 . A <*> 0 . 
DP=31;DPR=29,0;I16=18,11,0,0,1079,43353,0,0,1678,98882,0,0,530,12420,0,0;QS=3,0;MQSB=0.884952;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,110:3:0:0:3,0,0,0:3,0 0,24,225:8:0:0:4,4,0,0:8,0 -17 453 . A <*> 0 . DP=31;DPR=28,0;I16=17,11,0,0,1037,41069,0,0,1649,98041,0,0,508,11882,0,0;QS=3,0;MQSB=0.967085;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,111:3:0:0:3,0,0,0:3,0 0,21,221:7:0:0:3,4,0,0:7,0 -17 454 . A <*> 0 . DP=31;DPR=30,0;I16=18,12,0,0,1158,47028,0,0,1738,102482,0,0,554,12904,0,0;QS=3,0;MQSB=0.878946;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,9,113:3:0:0:3,0,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 455 . T <*> 0 . DP=32;DPR=30,0;I16=17,13,0,0,1148,46574,0,0,1715,100251,0,0,550,12864,0,0;QS=3,0;MQSB=0.973855;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,113:3:0:0:3,0,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 -17 456 . G <*> 0 . DP=32;DPR=30,0;I16=17,13,0,0,1161,47287,0,0,1746,103010,0,0,534,12296,0,0;QS=3,0;MQSB=0.998031;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,9,116:3:0:0:3,0,0,0:3,0 0,30,245:10:0:0:4,6,0,0:10,0 -17 457 . C <*> 0 . DP=33;DPR=32,0;I16=19,13,0,0,1218,48642,0,0,1835,107451,0,0,563,12967,0,0;QS=3,0;MQSB=0.985204;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,118:3:0:0:3,0,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 -17 458 . A <*> 0 . DP=33;DPR=32,0;I16=19,13,0,0,1226,49034,0,0,1835,107451,0,0,568,12990,0,0;QS=3,0;MQSB=0.985204;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,111:3:0:0:3,0,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 -17 459 . T <*> 0 . DP=33;DPR=31,0;I16=18,13,0,0,1167,46981,0,0,1775,103851,0,0,565,12945,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,6,92:2:0:0:2,0,0,0:2,0 0,33,255:11:0:0:5,6,0,0:11,0 -17 460 . G <*> 0 . 
DP=32;DPR=31,0;I16=19,12,0,0,1219,50105,0,0,1775,103851,0,0,575,12929,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,116:3:0:0:3,0,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 461 . T <*> 0 . DP=32;DPR=31,0;I16=19,12,0,0,1213,49819,0,0,1775,103851,0,0,577,12845,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,115:3:0:0:3,0,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 462 . G <*> 0 . DP=32;DPR=31,0;I16=19,12,0,0,1190,48962,0,0,1775,103851,0,0,580,12792,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,119:4:0:0:4,0,0,0:4,0 0,30,241:10:0:0:5,5,0,0:10,0 -17 463 . G <*> 0 . DP=32;DPR=31,0;I16=19,12,0,0,1114,44214,0,0,1775,103851,0,0,584,12770,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,114:4:0:0:4,0,0,0:4,0 0,30,221:10:0:0:5,5,0,0:10,0 -17 464 . A <*> 0 . DP=32;DPR=29,0;I16=18,11,0,0,1100,43908,0,0,1686,99410,0,0,556,12106,0,0;QS=3,0;MQSB=0.99095;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,133:4:0:0:4,0,0,0:4,0 0,24,213:8:0:0:4,4,0,0:8,0 -17 465 . C <*> 0 . DP=33;DPR=31,0;I16=20,11,0,0,1191,48085,0,0,1775,103851,0,0,586,12786,0,0;QS=3,0;MQSB=0.996597;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,15,140:5:0:0:5,0,0,0:5,0 0,27,231:9:0:0:5,4,0,0:9,0 -17 466 . A <*> 0 . DP=34;DPR=33,0;I16=21,12,0,0,1293,53311,0,0,1895,111051,0,0,597,12897,0,0;QS=3,0;MQSB=0.995633;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,15,154:5:0:0:5,0,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 467 . A <*> 0 . DP=34;DPR=32,0;I16=21,11,0,0,1256,51450,0,0,1835,107451,0,0,597,12891,0,0;QS=3,0;MQSB=0.998231;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,15,157:5:0:0:5,0,0,0:5,0 0,27,248:9:0:0:5,4,0,0:9,0 -17 468 . A <*> 0 . 
DP=35;DPR=34,0;I16=22,12,0,0,1274,51268,0,0,1955,114651,0,0,604,12904,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,154:5:0:0:5,0,0,0:5,0 0,30,251:10:0:0:5,5,0,0:10,0 -17 469 . A <*> 0 . DP=35;DPR=34,0;I16=22,12,0,0,1285,52989,0,0,1955,114651,0,0,608,12940,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,146:5:0:0:5,0,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 470 . G <*> 0 . DP=35;DPR=34,0;I16=22,12,0,0,1281,51055,0,0,1955,114651,0,0,612,13016,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,148:5:0:0:5,0,0,0:5,0 0,30,238:10:0:0:5,5,0,0:10,0 -17 471 . T <*> 0 . DP=36;DPR=33,0;I16=22,11,0,0,1239,49021,0,0,1918,113282,0,0,599,12825,0,0;QS=3,0;MQSB=0.915545;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,150:5:0:0:5,0,0,0:5,0 0,27,232:9:0:0:5,4,0,0:9,0 -17 472 . T <*> 0 . DP=35;DPR=33,0;I16=21,12,0,0,1245,48915,0,0,1926,113810,0,0,595,12559,0,0;QS=3,0;MQSB=0.988858;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,153:5:0:0:5,0,0,0:5,0 0,27,237:9:0:0:4,5,0,0:9,0 -17 473 . G <*> 0 . DP=35;DPR=33,0;I16=21,12,0,0,1307,53473,0,0,1926,113810,0,0,599,12651,0,0;QS=3,0;MQSB=0.988858;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,141:5:0:0:5,0,0,0:5,0 0,27,249:9:0:0:4,5,0,0:9,0 -17 474 . G <*> 0 . DP=36;DPR=34,0;I16=22,12,0,0,1284,51708,0,0,1986,117410,0,0,602,12734,0,0;QS=3,0;MQSB=0.986937;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,131:5:0:0:5,0,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 -17 475 . G <*> 0 . DP=36;DPR=35,0;I16=23,12,0,0,1311,51609,0,0,2015,118251,0,0,631,13485,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,141:5:0:0:5,0,0,0:5,0 0,33,252:11:0:0:6,5,0,0:11,0 -17 476 . A <*> 0 . 
DP=36;DPR=35,0;I16=23,12,0,0,1312,52078,0,0,2015,118251,0,0,634,13606,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,157:5:0:0:5,0,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 477 . T <*> 0 . DP=36;DPR=35,0;I16=23,12,0,0,1318,52668,0,0,2015,118251,0,0,637,13773,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,148:5:0:0:5,0,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 478 . T <*> 0 . DP=38;DPR=37,0;I16=25,12,0,0,1338,51774,0,0,2135,125451,0,0,637,13833,0,0;QS=3,0;MQSB=0.999868;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,60,255:20:0:0:13,7,0,0:20,0 0,18,154:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 479 . A <*> 0 . DP=38;DPR=37,0;I16=25,12,0,0,1420,57788,0,0,2135,125451,0,0,639,13935,0,0;QS=3,0;MQSB=0.999868;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,60,255:20:0:0:13,7,0,0:20,0 0,18,163:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 480 . G <*> 0 . DP=37;DPR=36,0;I16=25,11,0,0,1438,60172,0,0,2075,121851,0,0,641,14029,0,0;QS=3,0;MQSB=0.999853;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,18,165:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 481 . G <*> 0 . DP=37;DPR=36,0;I16=25,11,0,0,1392,55824,0,0,2075,121851,0,0,642,14112,0,0;QS=3,0;MQSB=0.999853;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,18,165:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 482 . A <*> 0 . DP=37;DPR=35,0;I16=24,11,0,0,1352,55134,0,0,2015,118251,0,0,618,13608,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,15,143:5:0:0:5,0,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 483 . G <*> 0 . DP=37;DPR=36,0;I16=24,12,0,0,1417,57747,0,0,2075,121851,0,0,642,14240,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,18,165:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 -17 484 . A <*> 0 . 
DP=36;DPR=35,0;I16=24,11,0,0,1340,53992,0,0,2015,118251,0,0,643,14281,0,0;QS=3,0;MQSB=1;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:12,6,0,0:18,0 0,18,168:6:0:0:6,0,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 443 . A <*> 0 . DP=30;DPR=29,0;I16=14,15,0,0,1173,49439,0,0,1647,96123,0,0,532,12036,0,0;QS=3,0;MQSB=0.940868;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,130:3:0:0:1,2,0,0:3,0 0,27,246:9:0:0:4,5,0,0:9,0 +17 444 . G <*> 0 . DP=29;DPR=28,0;I16=14,14,0,0,1095,44661,0,0,1587,92523,0,0,533,12103,0,0;QS=3,0;MQSB=0.949591;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,6,98:2:0:0:1,1,0,0:2,0 0,27,227:9:0:0:4,5,0,0:9,0 +17 445 . C <*> 0 . DP=30;DPR=29,0;I16=15,14,0,0,1100,43706,0,0,1647,96123,0,0,534,12196,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,122:3:0:0:2,1,0,0:3,0 0,27,219:9:0:0:4,5,0,0:9,0 +17 446 . A <*> 0 . DP=30;DPR=29,0;I16=15,14,0,0,1107,44265,0,0,1647,96123,0,0,535,12265,0,0;QS=3,0;MQSB=0.960561;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:9,8,0,0:17,0 0,9,127:3:0:0:2,1,0,0:3,0 0,27,232:9:0:0:4,5,0,0:9,0 +17 447 . C <*> 0 . DP=29;DPR=28,0;I16=15,13,0,0,1108,45364,0,0,1618,95282,0,0,534,12208,0,0;QS=3,0;MQSB=0.835846;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,125:3:0:0:2,1,0,0:3,0 0,27,235:9:0:0:4,5,0,0:9,0 +17 448 . T <*> 0 . DP=29;DPR=28,0;I16=15,13,0,0,1125,47237,0,0,1618,95282,0,0,533,12173,0,0;QS=3,0;MQSB=0.835846;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,130:3:0:0:2,1,0,0:3,0 0,27,240:9:0:0:4,5,0,0:9,0 +17 449 . A <*> 0 . DP=28;DPR=27,0;I16=14,13,0,0,1091,45981,0,0,1558,91682,0,0,533,12159,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,97:2:0:0:1,1,0,0:2,0 0,27,245:9:0:0:4,5,0,0:9,0 +17 450 . G <*> 0 . DP=28;DPR=27,0;I16=14,13,0,0,1069,44603,0,0,1558,91682,0,0,533,12165,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,98:2:0:0:1,1,0,0:2,0 0,27,233:9:0:0:4,5,0,0:9,0 +17 451 . 
A <*> 0 . DP=28;DPR=27,0;I16=14,13,0,0,1021,41371,0,0,1558,91682,0,0,533,12191,0,0;QS=3,0;MQSB=0.819567;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,6,100:2:0:0:1,1,0,0:2,0 0,27,244:9:0:0:4,5,0,0:9,0 +17 452 . A <*> 0 . DP=31;DPR=29,0;I16=17,12,0,0,1079,43353,0,0,1678,98882,0,0,514,11876,0,0;QS=3,0;MQSB=0.868336;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,121:3:0:0:2,1,0,0:3,0 0,24,225:8:0:0:4,4,0,0:8,0 +17 453 . A <*> 0 . DP=31;DPR=28,0;I16=16,12,0,0,1037,41069,0,0,1649,98041,0,0,493,11357,0,0;QS=3,0;MQSB=0.96195;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,122:3:0:0:2,1,0,0:3,0 0,21,221:7:0:0:3,4,0,0:7,0 +17 454 . A <*> 0 . DP=31;DPR=30,0;I16=17,13,0,0,1158,47028,0,0,1738,102482,0,0,540,12400,0,0;QS=3,0;MQSB=0.862425;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,9,124:3:0:0:2,1,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 455 . T <*> 0 . DP=32;DPR=30,0;I16=16,14,0,0,1148,46574,0,0,1715,100251,0,0,537,12383,0,0;QS=3,0;MQSB=0.958545;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,48,255:16:0:0:9,7,0,0:16,0 0,9,124:3:0:0:2,1,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 +17 456 . G <*> 0 . DP=32;DPR=30,0;I16=16,14,0,0,1161,47287,0,0,1746,103010,0,0,522,11840,0,0;QS=3,0;MQSB=0.999784;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,9,128:3:0:0:2,1,0,0:3,0 0,30,245:10:0:0:4,6,0,0:10,0 +17 457 . C <*> 0 . DP=33;DPR=32,0;I16=18,14,0,0,1218,48642,0,0,1835,107451,0,0,552,12538,0,0;QS=3,0;MQSB=0.97436;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,131:3:0:0:2,1,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 +17 458 . A <*> 0 . DP=33;DPR=32,0;I16=18,14,0,0,1226,49034,0,0,1835,107451,0,0,558,12590,0,0;QS=3,0;MQSB=0.97436;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,122:3:0:0:2,1,0,0:3,0 0,33,255:11:0:0:5,6,0,0:11,0 +17 459 . T <*> 0 . 
DP=33;DPR=31,0;I16=17,14,0,0,1167,46981,0,0,1775,103851,0,0,556,12576,0,0;QS=3,0;MQSB=0.967263;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,6,99:2:0:0:1,1,0,0:2,0 0,33,255:11:0:0:5,6,0,0:11,0 +17 460 . G <*> 0 . DP=32;DPR=31,0;I16=18,13,0,0,1219,50105,0,0,1775,103851,0,0,568,12642,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,128:3:0:0:2,1,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 461 . T <*> 0 . DP=32;DPR=31,0;I16=18,13,0,0,1213,49819,0,0,1775,103851,0,0,572,12640,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,9,127:3:0:0:2,1,0,0:3,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 462 . G <*> 0 . DP=32;DPR=31,0;I16=18,13,0,0,1190,48962,0,0,1775,103851,0,0,577,12669,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,131:4:0:0:3,1,0,0:4,0 0,30,241:10:0:0:5,5,0,0:10,0 +17 463 . G <*> 0 . DP=32;DPR=31,0;I16=18,13,0,0,1114,44214,0,0,1775,103851,0,0,583,12729,0,0;QS=3,0;MQSB=0.980167;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,126:4:0:0:3,1,0,0:4,0 0,30,221:10:0:0:5,5,0,0:10,0 +17 464 . A <*> 0 . DP=32;DPR=29,0;I16=17,12,0,0,1100,43908,0,0,1686,99410,0,0,557,12147,0,0;QS=3,0;MQSB=0.996086;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,12,148:4:0:0:3,1,0,0:4,0 0,24,213:8:0:0:4,4,0,0:8,0 +17 465 . C <*> 0 . DP=33;DPR=31,0;I16=19,12,0,0,1191,48085,0,0,1775,103851,0,0,589,12909,0,0;QS=3,0;MQSB=0.989977;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:10,7,0,0:17,0 0,15,158:5:0:0:4,1,0,0:5,0 0,27,231:9:0:0:5,4,0,0:9,0 +17 466 . A <*> 0 . DP=34;DPR=33,0;I16=20,13,0,0,1293,53311,0,0,1895,111051,0,0,602,13102,0,0;QS=3,0;MQSB=0.989199;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,15,173:5:0:0:4,1,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 467 . A <*> 0 . 
DP=34;DPR=32,0;I16=20,12,0,0,1256,51450,0,0,1835,107451,0,0,604,13178,0,0;QS=3,0;MQSB=0.993205;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:11,7,0,0:18,0 0,15,177:5:0:0:4,1,0,0:5,0 0,27,248:9:0:0:5,4,0,0:9,0 +17 468 . A <*> 0 . DP=35;DPR=34,0;I16=21,13,0,0,1274,51268,0,0,1955,114651,0,0,613,13273,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,174:5:0:0:4,1,0,0:5,0 0,30,251:10:0:0:5,5,0,0:10,0 +17 469 . A <*> 0 . DP=35;DPR=34,0;I16=21,13,0,0,1285,52989,0,0,1955,114651,0,0,618,13340,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,165:5:0:0:4,1,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 470 . G <*> 0 . DP=35;DPR=34,0;I16=21,13,0,0,1281,51055,0,0,1955,114651,0,0,623,13445,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,167:5:0:0:4,1,0,0:5,0 0,30,238:10:0:0:5,5,0,0:10,0 +17 471 . T <*> 0 . DP=36;DPR=33,0;I16=21,12,0,0,1239,49021,0,0,1918,113282,0,0,611,13281,0,0;QS=3,0;MQSB=0.904077;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,169:5:0:0:4,1,0,0:5,0 0,27,232:9:0:0:5,4,0,0:9,0 +17 472 . T <*> 0 . DP=35;DPR=33,0;I16=20,13,0,0,1245,48915,0,0,1926,113810,0,0,608,13040,0,0;QS=3,0;MQSB=0.99391;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,172:5:0:0:4,1,0,0:5,0 0,27,237:9:0:0:4,5,0,0:9,0 +17 473 . G <*> 0 . DP=35;DPR=33,0;I16=20,13,0,0,1307,53473,0,0,1926,113810,0,0,613,13155,0,0;QS=3,0;MQSB=0.99391;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,159:5:0:0:4,1,0,0:5,0 0,27,249:9:0:0:4,5,0,0:9,0 +17 474 . G <*> 0 . DP=36;DPR=34,0;I16=21,13,0,0,1284,51708,0,0,1986,117410,0,0,617,13259,0,0;QS=3,0;MQSB=0.992337;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,146:5:0:0:4,1,0,0:5,0 0,30,255:10:0:0:5,5,0,0:10,0 +17 475 . G <*> 0 . 
DP=36;DPR=35,0;I16=22,13,0,0,1311,51609,0,0,2015,118251,0,0,647,14029,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,159:5:0:0:4,1,0,0:5,0 0,33,252:11:0:0:6,5,0,0:11,0 +17 476 . A <*> 0 . DP=36;DPR=35,0;I16=22,13,0,0,1312,52078,0,0,2015,118251,0,0,651,14167,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,177:5:0:0:4,1,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 477 . T <*> 0 . DP=36;DPR=35,0;I16=22,13,0,0,1318,52668,0,0,2015,118251,0,0,655,14349,0,0;QS=3,0;MQSB=0.994769;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,15,167:5:0:0:4,1,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 478 . T <*> 0 . DP=38;DPR=37,0;I16=24,13,0,0,1338,51774,0,0,2135,125451,0,0,656,14422,0,0;QS=3,0;MQSB=0.997978;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,60,255:20:0:0:13,7,0,0:20,0 0,18,174:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 479 . A <*> 0 . DP=38;DPR=37,0;I16=24,13,0,0,1420,57788,0,0,2135,125451,0,0,659,14535,0,0;QS=3,0;MQSB=0.997978;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,60,255:20:0:0:13,7,0,0:20,0 0,18,184:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 480 . G <*> 0 . DP=37;DPR=36,0;I16=24,12,0,0,1438,60172,0,0,2075,121851,0,0,662,14638,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,18,187:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 481 . G <*> 0 . DP=37;DPR=36,0;I16=24,12,0,0,1392,55824,0,0,2075,121851,0,0,664,14728,0,0;QS=3,0;MQSB=0.999437;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,18,187:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 482 . A <*> 0 . DP=37;DPR=35,0;I16=23,12,0,0,1352,55134,0,0,2015,118251,0,0,641,14229,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:13,6,0,0:19,0 0,15,161:5:0:0:4,1,0,0:5,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 483 . G <*> 0 . 
DP=37;DPR=36,0;I16=23,13,0,0,1417,57747,0,0,2075,121851,0,0,666,14864,0,0;QS=3,0;MQSB=0.996616;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,57,255:19:0:0:12,7,0,0:19,0 0,18,187:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 +17 484 . A <*> 0 . DP=36;DPR=35,0;I16=23,12,0,0,1340,53992,0,0,2015,118251,0,0,668,14906,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:12,6,0,0:18,0 0,18,191:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 17 485 . G <*> 0 . DP=35;DPR=35,0;I16=23,12,0,0,1329,51411,0,0,2015,118251,0,0,669,14931,0,0;QS=3,0;MQSB=0.998642;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,54,255:18:0:0:12,6,0,0:18,0 0,18,160:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 17 486 . A <*> 0 . DP=34;DPR=34,0;I16=22,12,0,0,1311,51523,0,0,1955,114651,0,0,671,14989,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:11,6,0,0:17,0 0,18,173:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 17 487 . G <*> 0 . DP=34;DPR=34,0;I16=22,12,0,0,1306,50760,0,0,1955,114651,0,0,672,15030,0,0;QS=3,0;MQSB=0.997406;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,51,255:17:0:0:11,6,0,0:17,0 0,18,169:6:0:0:5,1,0,0:6,0 0,33,255:11:0:0:6,5,0,0:11,0 diff -Nru samtools-1.11/test/dat/mpileup.out.5 samtools-1.13/test/dat/mpileup.out.5 --- samtools-1.11/test/dat/mpileup.out.5 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/test/dat/mpileup.out.5 2021-07-07 17:18:52.000000000 +0000 @@ -1 +1 @@ -chr3 128814202 N 1 G ~ +chr3 128814202 N 1 g ~ diff -Nru samtools-1.11/test/dat/view.005.expected.sam samtools-1.13/test/dat/view.005.expected.sam --- samtools-1.11/test/dat/view.005.expected.sam 1970-01-01 00:00:00.000000000 +0000 +++ samtools-1.13/test/dat/view.005.expected.sam 2021-07-07 17:18:52.000000000 +0000 @@ -0,0 +1,12 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:ztr1 LN:200000000 +AW7_00001:1:0001:7902:59903 163 ztr1 122527090 4 132M19S = 122527410 469 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 
AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJFAJJJJJJJFJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJ-J--FJJJ7JJJ-FAJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ<-AJJJJJJJJ7F<<JGEH -17 3124 N 13 ccCcCccCcCccC IIKKmJJJLILJE -17 3125 N 13 ttTtTttTtTttT FFMIlGBDGHJIH -17 3126 N 13 ggGgGggGgGggG HHLDoJJF>JEAH -17 3127 N 13 ccCcCccCcCccC GIKKqKKEGJJCG -17 3128 N 13 ttTtTttTtTttT AFKElGIACJBII -17 3129 N 13 t$tTtTttTtTttT @FFHdGID5EEDD -17 3130 N 12 tTtTttTtTttT EHHfGHDGE;EE -17 3131 N 12 cCcCccCcCccC HIJrJJEBJLFI -17 3132 N 12 tTtTttTtTttT EIGoGH>GKIFH -17 3133 N 12 gGgGggGgGggG DHHrHHCDJLDH -17 3134 N 13 gGgGggGgGggG^]G DEGgIH8?ELCA> -17 3135 N 14 gGgGggGgGggGG^]g CBHqJJE7?KGH?E -17 3136 N 14 c$CcCccCcCccCCc ?FIpJIGEG;DD;B -17 3139 N 13 cCccCcCccCCcC BtFF:CHJBEFGE -17 3140 N 13 cCccCcCccCCcC ApHJJIKJ?FFBA -17 3141 N 13 cCccCcCccCCcC GrJIIEFLEIGGE -17 3142 N 13 aAaaAaAaaAAaA HrJGEG BuIGLBM=HIIGBA -17 3144 N 14 CccCcCccCCcCC^]C qJJ?HDMJDHKG.A -17 3145 N 14 AaaAaAaaAAaAAA iHIGMJMGCI -17 3155 N 14 GggGgGggGGgGGG lHIIMBJ=KIGJIK -17 3156 N 14 G$ggGgGggGGgGGG jJJHHFLBEHJEJEAHo +17 3127 N 13 ccCcccCcCccCc GIKKKKEGJJCGq +17 3128 N 13 ttTtttTtTttTt AFKEGIACJBIIl +17 3129 N 13 t$tTtttTtTttTt @FFHGID5EEDDd +17 3130 N 12 tTtttTtTttTt EHHGHDGE;EEf +17 3131 N 12 cCcccCcCccCc HIJJJEBJLFIr +17 3132 N 12 tTtttTtTttTt EIGGH>GKIFHo +17 3133 N 12 gGgggGgGggGg DHHHHCDJLDHr +17 3134 N 13 gGgggGgGggGg^]G DEGIH8?ELCAg> +17 3135 N 14 gGgggGgGggGgG^]g CBHJJE7?KGHq?E +17 3136 N 14 c$CcccCcCccCcCc ?FIJIGEG;DhD;B +17 3139 N 13 cccCcCccCcCcC BFF:CHJBEtFGE +17 3140 N 13 cccCcCccCcCcC AHJJIKJ?FpFBA +17 3141 N 13 cccCcCccCcCcC GJIIEFLEIrGGE +17 3142 N 13 aaaAaAaaAaAaA HJGEG BIGLBM=HIuIGBA +17 3144 N 14 ccCcCccCcCcCC^]C JJ?HDMJDqHKG.A +17 3145 N 14 aaAaAaaAaAaAAA HIGMJMGCI +17 3155 N 14 ggGgGggGgGgGGG HIIMBJ=KlIGJIK +17 3156 N 14 ggGgGggGgGgGGG JJHHFLBEjHJE6AIK:E:JK 17 3484 N 10 AaAAAaaAA^]A I?FKCD2GFD 17 3485 N 11 CccCCCccCCC CFKFJJIGHIC -17 3486 N 10 TttTTTtTTT K@GYMCGIKH -17 3487 N 11 CccCCCccCCC D2;kLCACIJH -17 3488 N 11 TttTTTttTTT 
IEA^L=E;HLI -17 3489 N 12 GggGGGggGGGg K9:nLEE7HLJC -17 3490 N 13 TttTTTttTTTt^]T D/JYJIEAFGGC: -17 3491 N 14 CccCCCccCCCCc^]c I>EeL:J1IKHC7C -17 3492 N 14 TttTTTttTTTTtt L/A]LF/.JKKE15 -17 3493 N 15 CccCCCc-1ncCCCc+1aCcc E3>cMK5;JLK9D5; -17 3494 N 16 AaaAAA#aAAAaAaa^]a JCHkKKF@IIIFC7=0 -17 3495 N 16 AaaAAAaaAAAaAaaa IFGlJEC -17 3504 N 16 AaaAAAaaAAAaAaaa DEImI4FFJJJCIDEF -17 3505 N 16 AaaAAAaaAAAaAaaa DFGjJ5FDFKJ1I?E? -17 3506 N 14 A$aaAAaaAAaAaa^]A BGI`;ICIIFIBC? -17 3507 N 15 ttTTttTTtTtttT^]t FIg@HHCGIGDCG@> -17 3508 N 16 ccCCccCCCcCcccCc JLr?IK0BKJHLIKCG -17 3509 N 17 aaAAAaaA-1NAAaAaaaAa CHeE;FC>JH:HF;FCC -17 3510 N 17 ccCCAcc*CCcCcccCc JLkIIJH.JJJEJJ:EH -17 3511 N 18 aaAAAaaAAAaAaaaAa^]A DH\G2EA.JI1H7FGA1B -17 3512 N 17 ccCCccCCcCcccCcCc HJoHHIFH6FBIHEGDF -17 3513 N 19 ccCCAccCCCcCcccCcCc HKa@.JFJJIAJ7KLDFDC -17 3514 N 18 aaAAAaaAAaAaaaAaAa FIgJ;GDAJ8HAHIBFC@ -17 3515 N 19 ttTTTttTTTtTtttTtTt FGiJIGG1GJ>H=GHDHF@ -17 3516 N 19 ttTTTttTTTTtttTtTt^]T FGfGGHHGIIEIHJDGBD@ -17 3517 N 20 ttTTTttTTTtTtttTtTtT @GoHBHB>HI2EIGKDHA>@ -17 3518 N 20 ttTTTttTTTtTtttTtTtT EGkCACCGJJEL:J1IKHeC7C +17 3492 N 14 TttTTttTTTtTtt L/ALF/.JKK]E15 +17 3493 N 15 CccCCc-1ncCCCcc+1aCcc E3>MK5;JLKc9D5; +17 3494 N 16 AaaAA#aAAAaaAaa^]a JCHKKF@IIIkFC7=0 +17 3495 N 16 AaaAAaaAAAaaAaaa IFGJEC +17 3504 N 16 AaaAAaaAAAaaAaaa DEII4FFJJJmCIDEF +17 3505 N 16 AaaAAaaAAAaaAaaa DFGJ5FDFKJj1I?E? +17 3506 N 14 A$aaAaaAAaaAaa^]A BGI;ICII`FIBC? 
+17 3507 N 15 ttTttTTttTtttT^]t FI@HHCGgIGDCG@> +17 3508 N 16 ccCccCCCccCcccCc JL?IK0BKrJHLIKCG +17 3509 N 17 aaAAaaA-1NAAaaAaaaAa CHE;FC>JHe:HF;FCC +17 3510 N 17 ccCAcc*CCccCcccCc JLIIJH.JJkJEJJ:EH +17 3511 N 18 aaAAaaAAAaaAaaaAa^]A DHG2EA.JI\1H7FGA1B +17 3512 N 17 ccCccCCccCcccCcCc HJHHIFHo6FBIHEGDF +17 3513 N 19 ccCAccCCCccCcccCcCc HK@.JFJJIaAJ7KLDFDC +17 3514 N 18 aaAAaaAAaaAaaaAaAa FIJ;GDAJg8HAHIBFC@ +17 3515 N 19 ttTTttTTTttTtttTtTt FGJIGG1GJi>H=GHDHF@ +17 3516 N 19 ttTTttTTTtTtttTtTt^]T FGGGHHGIIfEIHJDGBD@ +17 3517 N 20 ttTTttTTTttTtttTtTtT @GHBHB>HIo2EIGKDHA>@ +17 3518 N 20 ttTTttTTTttTtttTtTtT EGCACCGJJf # @@ -44,6 +44,7 @@ test_usage($opts, cmd=>'samtools'); test_view($opts); test_cat($opts); +test_import($opts); test_bam2fq($opts); test_bam2fq($opts, threads=>2); test_depad($opts); @@ -959,6 +960,7 @@ # now test subcommand usage as well foreach my $subcommand (@subcommands) { + next if ($subcommand =~ /^(help|version)$/); # Under msys the isatty function fails to recognise the terminal. # Skip these tests for now. 
next if ($^O =~ /^msys/ && $subcommand =~ /^(dict|sort|stats|view|fasta|fastq)$/); @@ -1088,6 +1090,7 @@ # $args->{flags_required} bits which must be set in flags (-f option) # $args->{flags_rejected} bits which must not be set in flags (-F option) # $args->{read_groups} hash of read groups to output (-r or -R) +# $args->{read_names} names of reads to output (-N) # $args->{tag} tag used for checking if reads match tag_values (-d or -D) # $args->{tag_values} hash of values assocated with tag to output (-d or -D) # $args->{libraries} hash of libraries to output (-l) @@ -1142,6 +1145,7 @@ my $flags_required = $args->{flags_required} || 0; my $flags_rejected = $args->{flags_rejected} || 0; my $read_groups = $args->{read_groups}; + my $read_names = $args->{read_names}; my $tag = $args->{tag}; my $tag_values = $args->{tag_values}; my $libraries = $args->{libraries}; @@ -1149,7 +1153,7 @@ my $strip_tags = $args->{strip_tags}; my $min_qlen = $args->{min_qlen} || 0; my $body_filter = ($flags_required || $flags_rejected - || $read_groups || $tag_values + || $read_groups || $read_names || $tag_values || $min_map_qual || $libraries || $region || $strip_tags || $min_qlen); my $lib_read_groups = $libraries ? {} : undef; @@ -1196,10 +1200,13 @@ if ($tag_values) { my $tag_value = ''; for my $i (11 .. 
$#sam) { - last if (($tag_value) = $sam[$i] =~ /^${tag}:Z:(.*)/); + last if (($tag_value) = $sam[$i] =~ /^${tag}:[ZiIsScCA]:(.*)/); } next if (!exists($tag_values->{$tag_value||""})); } + if ($read_names) { + next if (!exists($read_names->{$sam[0]})); + } if ($region) { my $in_range = 0; foreach my $r (@$region) { @@ -2057,6 +2064,11 @@ print $f "ACGT\nAATTCCGG\n" || die "Error writing to $fobc : $!\n"; close($f) || die "Error writing to $fobc : $!\n"; + # Read names file for -N test + my $forn = "$$opts{tmp}/view.001.forn"; + open($f, '>', $forn) || die "Couldn't open $forn : $!\n"; + print $f "ref1_grp1_p001\nunaligned_grp3_p001\nr008\nr009\n" || die "Error writing to $forn : $!\n"; + close($f) || die "Error writing to $forn : $!\n"; my @filter_tests = ( # [test_name, {filter_sam options}, [samtools options], expect_fail] @@ -2073,7 +2085,12 @@ ['-R', $fogn, '-r', 'grp2'], 0], ['rg_both2', { read_groups => { grp1 => 1, grp2 => 1, grp3 => 1 }}, ['-r', 'grp2', '-R', $fogn], 0], + # Read names + ['rn', { read_names => { 'unaligned_grp3_p001' => 1, 'ref1_grp1_p001' => 1, 'r008' => 1, 'r009' => 1 } }, + ['-N', $forn], 0], # Tag with values + ['tv_BC', { tag => 'BC', tag_values => { ACGT => 1, TGCA => 1, AATTCCGG => 1 }}, + ['-d', 'BC'], 0], ['tv_BC_TGCA', { tag => 'BC', tag_values => { TGCA => 1 }}, ['-d', 'BC:TGCA'], 0], ['tv_BC_fobc', { tag => 'BC', tag_values => { ACGT => 1, AATTCCGG => 1 }}, @@ -2092,6 +2109,10 @@ ['-D', "BClong:${fobc}"], 1], ['tv_d_different_tags', { tag => 'BC', tag_values => { ACGT => 1, grp2 => 1 }}, ['-d', 'BC:ACGT', '-d', 'RG:grp2' ], 1], + ['tv_NM_13', { tag => 'NM', tag_values => { 13 => 1 }}, + ['-d', 'NM:13'], 0], + ['tv_ab_z', { tag => 'ab', tag_values => { z => 2 }}, + ['-d', 'ab:z'], 0], # Libraries ['lib2', { libraries => { 'Library 2' => 1 }}, ['-l', 'Library 2'], 0], ['lib3', { libraries => { 'Library 3' => 1 }}, ['-l', 'Library 3'], 0], @@ -2113,6 +2134,22 @@ ['qlen11', { min_qlen => 11 }, ['-m', 11], 0], ['qlen15', { min_qlen => 
15 }, ['-m', 15], 0], ['qlen16', { min_qlen => 16 }, ['-m', 16], 0], + # Filter expressions + ['expr_rej128req2', { flags_rejected => 128, flags_required => 2 }, + ['-e', '!(flag & 128) && (flag & 2)'], 0], + # filter_sam also removes the header line, so cannot compare. + # ['expr_RG', { read_groups => {grp1 => 1, grp3 => 1}}, ['-e', '[RG]=~"^grp[13]$"'], 0], + ['expr_BC', { tag => 'BC', tag_values => { ACGT => 1, TGCA => 1, AATTCCGG => 1 }}, + ['-e', '[BC]'], 0], + ['expr_BC2', { tag => 'BC', tag_values => { ACGT => 1, AATTCCGG => 1 }}, + ['-e', '[BC] == "ACGT" || [BC] == "AATTCCGG"'], 0], + ['expr_mq50', { min_map_qual => 50 }, ['-e', 'mapq >= 50' ], 0], + ['expr_mq99', { min_map_qual => 99 }, ['-e', 'mapq >= 99' ], 0], + ['expr_mq100', { min_map_qual => 100 }, ['-e', 'mapq >= 100'], 0], + # TODO: add library to filter expression? It needs to go via RG. + # TODO: add cigar.qbase and cigar.rbase counts for consumes + # N bases of query and ref? Not the same as qlen/rlen as + # indels don't count the same. ); my @filter_inputs = ([SAM => $sam_with_ur], @@ -2426,6 +2463,18 @@ args => ['-h', $b_pg_sam], out => sprintf("%s.test%03d.sam", $out, $test), compare => $b_pg_expected); + + # unset flags and clear tags associated with duplication + $test++; + + my $dup_sam = "$$opts{path}/dat/view.005.sam"; + my $dup_expected = "$$opts{path}/dat/view.005.expected.sam"; + + run_view_test($opts, + msg=> "$test: Unset dup flag, remove dt and do tags", + args => ['-h', '--remove-flags', 'DUP', '-x', 'do', '-x', 'dt', '--no-PG', $dup_sam], + out => sprintf("%s.test%03d.sam", $out, $test), + compare => $dup_expected); } # cat SAM files in the same way as samtools cat does with BAMs @@ -2657,6 +2706,95 @@ close($in) || die "Error reading $sam_in : $!\n"; } +# Conversion of FASTQ to BAM. +# We use the bam2fq expected output to validate on. +# This permits round trip validation +# NB: This uses bam2fq/1.stdout.expected as a proxy for a blank filename. 
+sub test_import +{ + my ($opts, %args) = @_; + + # Just 1 end, as an unpaired read sample; eg as if ont or pacbio. + # -0 or implicit (lack of /1 /2 suffixes) via -s. + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"0.fq" => 'bam2fq/1.1.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -0 test/bam2fq/1.1.fq.expected | $$opts{bin}/samtools fastq -0 $$opts{path}/0.fq"); + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"0.fq" => 'bam2fq/1.1.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -s test/bam2fq/1.1.fq.expected | $$opts{bin}/samtools fastq -0 $$opts{path}/0.fq"); + + # Just 1 end, as half of a paired-end sample. Can be either explicit via + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"s.fq" => 'bam2fq/5.s.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -s test/bam2fq/5.s.fq.expected | $$opts{bin}/samtools fastq -s $$opts{path}/s.fq"); + + # Normal read 1 / read 2 + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/1.1.fq.expected', + "2.fq" => 'bam2fq/1.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import test/bam2fq/1.1.fq.expected test/bam2fq/1.2.fq.expected | $$opts{bin}/samtools fastq -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + + # Normal read 1 / read 2 but with /1 and /2 suffixes. 
+ test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/5.1.fq.expected', + "2.fq" => 'bam2fq/5.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import test/bam2fq/5.1.fq.expected test/bam2fq/5.2.fq.expected | $$opts{bin}/samtools fastq -N -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + + # Barcodes via CASAVA tags + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/12.1.fq.expected', + "2.fq" => 'bam2fq/12.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -i -1 test/bam2fq/12.1.fq.expected -2 test/bam2fq/12.2.fq.expected | $$opts{bin}/samtools fastq -i --index-format i*i* -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/12.1.fq.expected', + "2.fq" => 'bam2fq/12.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import --barcode-tag OX -i -1 test/bam2fq/12.1.fq.expected -2 test/bam2fq/12.2.fq.expected | $$opts{bin}/samtools fastq --barcode-tag OX -i --index-format i*i* -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + + # Barcodes via explicit aux tags; 6 + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/6.1.fq.expected', + "2.fq" => 'bam2fq/6.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -T \"\" -1 test/bam2fq/6.1.fq.expected -2 test/bam2fq/6.2.fq.expected | $$opts{bin}/samtools fastq -N -T RG,BC,QT -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + + # Other aux tags; 7 + test_cmd($opts, out=>'bam2fq/1.stdout.expected', + out_map=>{"1.fq" => 'bam2fq/7.1.fq.expected', + "2.fq" => 'bam2fq/7.2.fq.expected'}, + cmd=>"$$opts{bin}/samtools import -T \"*\" -1 test/bam2fq/7.1.fq.expected -2 test/bam2fq/7.2.fq.expected | $$opts{bin}/samtools fastq -N -T RG,BC,QT,MD,ia -1 $$opts{path}/1.fq -2 $$opts{path}/2.fq"); + + #------------------------ + # Plus our own test files, using bam2fq as source + + # Read-group + test_cmd($opts, out=>'import/1.expected.sam', + cmd=>"$$opts{bin}/samtools import test/bam2fq/1.1.fq.expected 
test/bam2fq/1.2.fq.expected -R rgid"); + test_cmd($opts, out=>'import/1.expected.sam', + cmd=>"$$opts{bin}/samtools import test/bam2fq/1.1.fq.expected test/bam2fq/1.2.fq.expected -r ID:rgid"); + test_cmd($opts, out=>'import/1.expected.sam', + cmd=>"$$opts{bin}/samtools import test/bam2fq/1.1.fq.expected test/bam2fq/1.2.fq.expected -r '\@RG\tID:rgid'"); + + + # Interleaved data + test_cmd($opts, out=>'import/2.expected.sam', + cmd=>"$$opts{bin}/samtools import test/import/2.interleaved.fq -T \"\""); + test_cmd($opts, out=>'import/2.expected.sam', + cmd=>"$$opts{bin}/samtools import test/import/3.interleaved.fq -i"); + + # Non aux-tag comments (we don't use these, but also shouldn't choke). + test_cmd($opts, out=>'import/4.expected.sam', + cmd=>"$$opts{bin}/samtools import test/import/4.aux.fq -T \"*\""); + test_cmd($opts, out=>'import/4.expected.sam', + cmd=>"$$opts{bin}/samtools import test/import/4.aux.fq -T \"\""); + test_cmd($opts, out=>'import/4.expected-XZ,XA,AA.sam', + cmd=>"$$opts{bin}/samtools import test/import/4.aux.fq -T XZ,XA,AA"); + + # Barcode files + test_cmd($opts, out=>'import/5-BC.expected.sam', + cmd=>"$$opts{bin}/samtools import --i1 test/import/5-i1.fq --i2 test/import/5-i2.fq --r1 test/import/5-r1.fq --r2 test/import/5-r2.fq"); + test_cmd($opts, out=>'import/5-OX.expected.sam', + cmd=>"$$opts{bin}/samtools import --i1 test/import/5-i1.fq --i2 test/import/5-i2.fq --r1 test/import/5-r1.fq --r2 test/import/5-r2.fq --barcode-tag OX --quality-tag BZ"); +} + sub test_bam2fq { my ($opts, %args) = @_; @@ -2746,6 +2884,7 @@ # Read 1/2 output, duplicate filename (-1 -2) test_cmd($opts, out=>'bam2fq/2.stdout.expected', out_map=>{'o.fq' => 'bam2fq/11.fq.expected'},cmd=>"$$opts{bin}/samtools fastq @$threads -N -1 $$opts{path}/o.fq -2 $$opts{path}/o.fq $$opts{path}/dat/bam2fq.001.sam"); + test_cmd($opts, out=>'bam2fq/2.stdout.expected', out_map=>{'o.fa' => 'bam2fq/11.fa.expected'},cmd=>"$$opts{bin}/samtools fasta @$threads -N -1 $$opts{path}/o.fa -2 
$$opts{path}/o.fa $$opts{path}/dat/bam2fq.001.sam"); # Read 1/2 output, single filename (-o) test_cmd($opts, out=>'bam2fq/2.stdout.expected', out_map=>{'o.fq' => 'bam2fq/11.fq.expected'},cmd=>"$$opts{bin}/samtools fastq @$threads -N -o $$opts{path}/o.fq $$opts{path}/dat/bam2fq.001.sam"); # Read 1/2 output, stdout and discard singletons/other @@ -2885,6 +3024,8 @@ test_cmd($opts,out=>'merge/6.merge.expected.sam', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools merge${threads} -cp -s 1 -O sam - $$opts{path}/dat/test_input_1_a.sam $$opts{path}/dat/test_input_1_b.sam"); # Merge 7 - ID and SN with regex in them test_cmd($opts,out=>'merge/7.merge.expected.sam', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools merge${threads} -s 1 -O sam - $$opts{path}/dat/test_input_1_a_regex.sam $$opts{path}/dat/test_input_1_b_regex.sam"); + # Merge 8 - Standard 3 file SAM merge, output file specified via option + test_cmd($opts,out=>'merge/2.merge.expected.sam', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools merge${threads} -o - -s 1 -O sam $$opts{path}/dat/test_input_1_a.sam $$opts{path}/dat/test_input_1_b.sam $$opts{path}/dat/test_input_1_c.sam"); # Sort inputs by PG, then merge system("$$opts{bin}/samtools sort -o $$opts{tmp}/merge.tag.1.bam -t PG -m 10M $$opts{path}/dat/test_input_1_b.sam") == 0 or die "failed to create sort BAM: $?"; @@ -3126,6 +3267,7 @@ test_cmd($opts,out=>'addrprg/4_fixup_norg.sam.expected', err=>'addrprg/4_fixup_norg.sam.expected.err', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools addreplacerg${threads} -O sam -r '\@RG\\tID:1#8\\tCN:SC' $$opts{path}/addrprg/4_fixup_norg.sam"); test_cmd($opts,out=>'addrprg/1_fixup.sam.expected', err=>'addrprg/1_fixup.sam.expected.err', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools addreplacerg${threads} -O sam -m overwrite_all -R '1#8' $$opts{path}/addrprg/1_fixup.sam"); test_cmd($opts,out=>'addrprg/4_fixup_norg.sam.expected', err=>'addrprg/4_fixup_norg.sam.expected.err', ignore_pg_header => 1, 
cmd=>"$$opts{bin}/samtools addreplacerg${threads} -O sam -r 'ID:1#8' -r 'CN:SC' $$opts{path}/addrprg/4_fixup_norg.sam"); + test_cmd($opts,out=>'addrprg/5_editrg.sam.expected', ignore_pg_header => 1, cmd=>"$$opts{bin}/samtools addreplacerg${threads} -O sam -w -r '\@RG\\tID:1#8\\tCN:Sanger\\tDS:Testing the editing code.' $$opts{path}/addrprg/1_fixup.sam"); } sub test_markdup @@ -3196,5 +3338,6 @@ "$$opts{path}/ampliconclip/2_both_clipped.expected.sam"); my $threads = exists($args{threads}) ? " -@ $args{threads}" : ""; - test_cmd($opts, out=>'ampliconstats/stats.expected.txt', cmd=>"$$opts{bin}/samtools ampliconstats${threads} -t 50 -d 1,20,100 $$opts{path}/ampliconclip/ac_test.bed @inputs | egrep -v 'Samtools version|Command line' | tee /tmp/out.txt"); + test_cmd($opts, out=>'ampliconstats/stats.expected.txt', cmd=>"$$opts{bin}/samtools ampliconstats${threads} -S -t 50 -d 1,20,100 $$opts{path}/ampliconclip/ac_test.bed @inputs | egrep -v 'Samtools version|Command line'"); + test_cmd($opts, out=>'ampliconstats/stats_mixed.expected.txt', cmd=>"$$opts{bin}/samtools ampliconstats${threads} -c 0 $$opts{path}/ampliconclip/multi_ref.bed $$opts{path}/ampliconstats/mixed_clipped.sam | egrep -v 'Samtools version|Command line'"); } diff -Nru samtools-1.11/version.sh samtools-1.13/version.sh --- samtools-1.11/version.sh 2020-09-22 12:15:55.000000000 +0000 +++ samtools-1.13/version.sh 2021-07-07 17:18:52.000000000 +0000 @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.11 +VERSION=1.13 # If we have a git clone, then check against the current tag if [ -e .git ]