diff -Nru htslib-1.6/.appveyor.yml htslib-1.7/.appveyor.yml --- htslib-1.6/.appveyor.yml 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/.appveyor.yml 2018-01-26 12:01:45.000000000 +0000 @@ -5,10 +5,6 @@ # branches to build branches: - # Whitelist - only: - - develop - # Blacklist except: - gh-pages diff -Nru htslib-1.6/bcf_sr_sort.c htslib-1.7/bcf_sr_sort.c --- htslib-1.6/bcf_sr_sort.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/bcf_sr_sort.c 2018-01-26 12:01:45.000000000 +0000 @@ -327,6 +327,20 @@ } return ret; } +int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) +{ + hts_expand(int,idx+1,srt->mactive,srt->active); + srt->nactive = 1; + srt->active[srt->nactive - 1] = idx; + return 0; +} +int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) +{ + hts_expand(int,idx+1,srt->mactive,srt->active); + srt->nactive++; + srt->active[srt->nactive - 1] = idx; + return 0; +} static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) { if ( !srt->grp_str2int ) @@ -357,11 +371,11 @@ memset(&grp,0,sizeof(grp_t)); // group VCFs into groups, each with a unique combination of variants in the duplicate lines - int ireader,ivar,irec,igrp,ivset; - for (ireader=0; ireadernreaders; ireader++) + int ireader,ivar,irec,igrp,ivset,iact; + for (ireader=0; ireadernreaders; ireader++) srt->vcf_buf[ireader].nrec = 0; + for (iact=0; iactnactive; iact++) { - srt->vcf_buf[ireader].nrec = 0; - + ireader = srt->active[iact]; bcf_sr_t *reader = &readers->readers[ireader]; int rid = bcf_hdr_name2id(reader->header, chr); grp.nvar = 0; @@ -546,23 +560,8 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) { int i,j; - if ( readers->nreaders == 1 ) - { - srt->nsr = 1; - if ( !srt->msr ) - { - srt->vcf_buf = (vcf_buf_t*) calloc(1,sizeof(vcf_buf_t)); // first time here - srt->msr = 1; - } - bcf_sr_t *reader = &readers->readers[0]; - assert( reader->buffer[1]->pos==min_pos ); - bcf1_t *tmp = reader->buffer[0]; - for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j]; - reader->buffer[ reader->nbuffer ] = tmp; - reader->nbuffer--; - readers->has_line[0] = 1; - return 1; - } + assert( srt->nactive>0 ); + if ( srt->nsr != readers->nreaders ) { srt->sr = readers; @@ -575,6 +574,19 @@ srt->nsr = readers->nreaders; srt->chr = NULL; } + if ( srt->nactive == 1 ) + { + if ( readers->nreaders>1 ) + memset(readers->has_line, 0, readers->nreaders*sizeof(*readers->has_line)); + bcf_sr_t *reader = &readers->readers[srt->active[0]]; + assert( reader->buffer[1]->pos==min_pos ); + bcf1_t *tmp = reader->buffer[0]; + for (j=1; j<=reader->nbuffer; j++) reader->buffer[j-1] = reader->buffer[j]; + reader->buffer[ reader->nbuffer ] = tmp; + reader->nbuffer--; + readers->has_line[srt->active[0]] = 1; + return 1; + } if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos); if ( !srt->vcf_buf[0].nrec ) return 0; @@ -629,6 +641,7 @@ } void bcf_sr_sort_destroy(sr_sort_t *srt) { + free(srt->active); if ( srt->var_str2int ) khash_str2int_destroy_free(srt->var_str2int); if ( srt->grp_str2int ) khash_str2int_destroy_free(srt->grp_str2int); int i; diff -Nru htslib-1.6/bcf_sr_sort.h htslib-1.7/bcf_sr_sort.h --- htslib-1.6/bcf_sr_sort.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/bcf_sr_sort.h 2018-01-26 12:01:45.000000000 +0000 @@ -92,11 +92,14 @@ const char *chr; int pos, nsr, msr; int pair; + int nactive, mactive, *active; // list of readers with lines at the current pos } sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos); +int bcf_sr_sort_set_active(sr_sort_t *srt, int i); +int bcf_sr_sort_add_active(sr_sort_t *srt, int i); void bcf_sr_sort_destroy(sr_sort_t *srt); void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i); diff -Nru htslib-1.6/bgzf.c htslib-1.7/bgzf.c --- htslib-1.6/bgzf.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/bgzf.c 2018-01-26 12:01:45.000000000 +0000 @@ -71,10 +71,16 @@ uint8_t *block; int64_t end_offset; } cache_t; + #include "htslib/khash.h" KHASH_MAP_INIT_INT64(cache, cache_t) #endif +struct bgzf_cache_t { + khash_t(cache) *h; + khint_t last_pos; +}; + #ifdef BGZF_MT typedef struct bgzf_job { @@ -215,7 +221,16 @@ fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b); fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1; #ifdef BGZF_CACHE - fp->cache = kh_init(cache); + if (!(fp->cache = malloc(sizeof(*fp->cache)))) { + free(fp); + return NULL; + } + if (!(fp->cache->h = kh_init(cache))) { + free(fp->cache); + free(fp); + return NULL; + } + fp->cache->last_pos = 0; #endif return fp; } @@ -524,11 +539,12 @@ static void free_cache(BGZF *fp) { khint_t k; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; if (fp->is_write) return; + khash_t(cache) *h = fp->cache->h; for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) free(kh_val(h, k).block); kh_destroy(cache, h); + free(fp->cache); } static int load_block_from_cache(BGZF *fp, int64_t block_address) @@ -536,15 +552,14 @@ khint_t k; cache_t *p; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; + khash_t(cache) *h = fp->cache->h; k = kh_get(cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; - // FIXME: why BGZF_MAX_BLOCK_SIZE and not p->size? - memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); + memcpy(fp->uncompressed_block, p->block, p->size); if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 ) { // todo: move the error up @@ -557,29 +572,48 @@ static void cache_block(BGZF *fp, int size) { int ret; - khint_t k; + khint_t k, k_orig; + uint8_t *block = NULL; cache_t *p; //fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address); - khash_t(cache) *h = (khash_t(cache)*)fp->cache; + khash_t(cache) *h = fp->cache->h; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; + if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { - /* A better way would be to remove the oldest block in the - * cache, but here we remove a random one for simplicity. This - * should not have a big impact on performance. */ - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) break; - if (k < kh_end(h)) { - free(kh_val(h, k).block); + /* Remove uniformly from any position in the hash by a simple + * round-robin approach. An alternative strategy would be to + * remove the least recently accessed block, but the round-robin + * removal is simpler and is not expected to have a big impact + * on performance */ + if (fp->cache->last_pos >= kh_end(h)) fp->cache->last_pos = kh_begin(h); + k_orig = k = fp->cache->last_pos; + if (++k >= kh_end(h)) k = kh_begin(h); + while (k != k_orig) { + if (kh_exist(h, k)) + break; + if (++k == kh_end(h)) + k = kh_begin(h); + } + fp->cache->last_pos = k; + + if (k != k_orig) { + block = kh_val(h, k).block; kh_del(cache, h, k); } + } else { + block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); } + if (!block) return; k = kh_put(cache, h, fp->block_address, &ret); - if (ret == 0) return; // if this happens, a bug! + if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen) + free(block); + return; + } p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; - p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); - memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); + p->block = block; + memcpy(p->block, fp->uncompressed_block, p->size); } #else static void free_cache(BGZF *fp) {} @@ -1489,7 +1523,7 @@ void bgzf_set_cache_size(BGZF *fp, int cache_size) { - if (fp) fp->cache_size = cache_size; + if (fp && fp->cache) fp->cache_size = cache_size; } int bgzf_check_EOF(BGZF *fp) { diff -Nru htslib-1.6/bgzip.c htslib-1.7/bgzip.c --- htslib-1.6/bgzip.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/bgzip.c 2018-01-26 12:01:45.000000000 +0000 @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2017 Genome Research Ltd. + Copyright (C) 2010, 2013-2018 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -130,7 +130,7 @@ case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2018 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': case '?': return bgzip_main_usage(); diff -Nru htslib-1.6/configure.ac htslib-1.7/configure.ac --- htslib-1.6/configure.ac 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/configure.ac 2018-01-26 12:01:45.000000000 +0000 @@ -29,6 +29,8 @@ AC_CONFIG_SRCDIR(hts.c) AC_CONFIG_HEADERS(config.h) +m4_include([m4/hts_prog_cc_warnings.m4]) + dnl Copyright notice to be copied into the generated configure script AC_COPYRIGHT([Portions copyright (C) 2016 Genome Research Ltd. @@ -49,6 +51,12 @@ AC_PROG_CC AC_PROG_RANLIB +dnl Turn on compiler warnings, if possible +HTS_PROG_CC_WARNINGS +dnl Flags to treat warnings as errors. These need to be applied to CFLAGS +dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) +HTS_PROG_CC_WERROR(hts_late_cflags) + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check dnl for pkg-config... @@ -229,9 +237,9 @@ if test "$enable_lzma" != no; then lzma_devel=ok - AC_CHECK_HEADER([lzma.h], [], [lzma_devel=missing], [;]) + AC_CHECK_HEADERS([lzma.h], [], [lzma_devel=header-missing], [;]) AC_CHECK_LIB([lzma], [lzma_easy_buffer_encode], [], [lzma_devel=missing]) - if test $lzma_devel != ok; then + if test $lzma_devel = missing; then AC_MSG_ERROR([liblzma development files not found The CRAM format may use LZMA2 compression, which is implemented in HTSlib @@ -354,6 +362,10 @@ if test "$s3" = enabled ; then AC_DEFINE([ENABLE_S3], 1, [Define if HTSlib should enable S3 support.]) fi + +dnl Apply value from HTS_PROG_CC_WERROR (if set) +AS_IF([test "x$hts_late_cflags" != x],[CFLAGS="$CFLAGS $hts_late_cflags"]) + AC_SUBST([s3]) AC_SUBST([CRYPTO_LIBS]) diff -Nru htslib-1.6/cram/cram_decode.c htslib-1.7/cram/cram_decode.c --- htslib-1.6/cram/cram_decode.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_decode.c 2018-01-26 12:01:45.000000000 +0000 @@ -1798,6 +1798,9 @@ } ref_pos += cr->len - seq_pos + 1; } + } else if (cr->ref_id >= 0) { + // So alignment end can be computed even when not decoding sequence + ref_pos += cr->len - seq_pos + 1; } if (ncigar+1 >= cigar_alloc) { @@ -1950,7 +1953,7 @@ cr->aux_size += out_sz + 3; } - + return r; } @@ -1961,7 +1964,7 @@ int32_t TL = 0; unsigned char *TN; uint32_t ds = c->comp_hdr->data_series; - + if (!(ds & (CRAM_TL|CRAM_aux))) { cr->aux = 0; cr->aux_size = 0; @@ -2938,7 +2941,8 @@ if (fd->range.refid != -2) { while (c->ref_seq_id != -2 && (c->ref_seq_id < fd->range.refid || - c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) { + (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid + && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) { if (0 != cram_seek(fd, c->length, SEEK_CUR)) return NULL; cram_free_container(fd->ctr); @@ -2948,8 +2952,10 @@ } while (c->length == 0); } - if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) + if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) { + fd->eof = 1; return NULL; + } } if (!(c->comp_hdr_block = cram_read_block(fd))) @@ -3225,7 +3231,7 @@ /* * Read the next cram record and convert it to a bam_seq_t struct. * - * Returns 0 on success + * Returns >= 0 success (number of bytes written to *bam) * -1 on EOF or failure (check fd->err) */ int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) { diff -Nru htslib-1.6/cram/cram_index.c htslib-1.7/cram/cram_index.c --- htslib-1.6/cram/cram_index.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_index.c 2018-01-26 12:01:45.000000000 +0000 @@ -345,7 +345,10 @@ * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * - * Returns the cram_index pointer on sucess + * Refid can also be any of the special HTS_IDX_ values. + * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR. + * + * Returns the cram_index pointer on success * NULL on failure */ cram_index *cram_index_query(cram_fd *fd, int refid, int pos, @@ -353,9 +356,34 @@ int i, j, k; cram_index *e; - if (refid+1 < 0 || refid+1 >= fd->index_sz) + switch(refid) { + case HTS_IDX_NONE: + case HTS_IDX_REST: + // fail, or already there, dealt with elsewhere. return NULL; + case HTS_IDX_NOCOOR: + refid = -1; + break; + + case HTS_IDX_START: { + int64_t min_idx = INT64_MAX; + for (i = 0, j = -1; i < fd->index_sz; i++) { + if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) { + min_idx = fd->index[i].e[0].offset; + j = i; + } + } + if (j < 0) + return NULL; + return fd->index[j].e; + } + + default: + if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz) + return NULL; + } + if (!from) from = &fd->index[refid+1]; @@ -406,6 +434,24 @@ return e; } +// Return the index entry for last slice on a specific reference. +cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { + int slice; + + if (refid+1 < 0 || refid+1 >= fd->index_sz) + return NULL; + + if (!from) + from = &fd->index[refid+1]; + + // Ref with nothing aligned against it. + if (!from->e) + return NULL; + + slice = fd->index[refid+1].nslice - 1; + + return &from->e[slice]; +} /* * Skips to a container overlapping the start coordinate listed in @@ -424,6 +470,9 @@ int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { cram_index *e; + if (r->refid == HTS_IDX_NONE) + return -2; + // Ideally use an index, so see if we have one. if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { if (0 != cram_seek(fd, e->offset, SEEK_SET)) @@ -434,6 +483,9 @@ return -2; } + if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) + fd->range.refid = -2; // special case in cram_next_slice + if (fd->ctr) { cram_free_container(fd->ctr); fd->ctr = NULL; diff -Nru htslib-1.6/cram/cram_index.h htslib-1.7/cram/cram_index.h --- htslib-1.6/cram/cram_index.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_index.h 2018-01-26 12:01:45.000000000 +0000 @@ -52,6 +52,7 @@ * NULL on failure */ cram_index *cram_index_query(cram_fd *fd, int refid, int pos, cram_index *frm); +cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from); /* * Skips to a container overlapping the start coordinate listed in diff -Nru htslib-1.6/cram/cram_io.c htslib-1.7/cram/cram_io.c --- htslib-1.6/cram/cram_io.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_io.c 2018-01-26 12:01:45.000000000 +0000 @@ -57,7 +57,11 @@ #include #endif #ifdef HAVE_LIBLZMA +#ifdef HAVE_LZMA_H #include +#else +#include "os/lzma_stub.h" +#endif #endif #include #include @@ -4129,8 +4133,9 @@ fd->ooc = 0; - if (hseek(fd->fp, offset, whence) >= 0) - return 0; + if (hseek(fd->fp, offset, whence) >= 0) { + return 0; + } if (!(whence == SEEK_CUR && offset >= 0)) return -1; diff -Nru htslib-1.6/cram/cram_io.h htslib-1.7/cram/cram_io.h --- htslib-1.6/cram/cram_io.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_io.h 2018-01-26 12:01:45.000000000 +0000 @@ -842,6 +842,8 @@ */ int cram_seek(cram_fd *fd, off_t offset, int whence); +int64_t cram_tell(cram_fd *fd); + /* * Flushes a CRAM file. * Useful for when writing to stdout without wishing to close the stream. diff -Nru htslib-1.6/cram/cram_samtools.c htslib-1.7/cram/cram_samtools.c --- htslib-1.6/cram/cram_samtools.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/cram_samtools.c 2018-01-26 12:01:45.000000000 +0000 @@ -124,7 +124,7 @@ else memset(cp, '\xff', len); - return 0; + return bam_len; } bam_hdr_t *cram_header_to_bam(SAM_hdr *h) { diff -Nru htslib-1.6/cram/mFILE.h htslib-1.7/cram/mFILE.h --- htslib-1.6/cram/mFILE.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/cram/mFILE.h 2018-01-26 12:01:45.000000000 +0000 @@ -81,7 +81,6 @@ void *mfsteal(mFILE *mf, size_t *size_out); char *mfgets(char *s, int size, mFILE *mf); int mfflush(mFILE *mf); -int mfprintf(mFILE *mf, char *fmt, ...); mFILE *mstdin(void); mFILE *mstdout(void); mFILE *mstderr(void); diff -Nru htslib-1.6/debian/changelog htslib-1.7/debian/changelog --- htslib-1.6/debian/changelog 2018-02-05 16:49:52.000000000 +0000 +++ htslib-1.7/debian/changelog 2018-02-14 12:13:18.000000000 +0000 @@ -1,8 +1,22 @@ -htslib (1.6-4build1) bionic; urgency=high +htslib (1.7-2) unstable; urgency=medium - * No change rebuild against openssl1.1. + * Move all needed files into htslib-test package + Closes: #890303 + * Document how to test the package + * Update symbols file - -- Dimitri John Ledkov Mon, 05 Feb 2018 16:49:52 +0000 + -- Andreas Tille Wed, 14 Feb 2018 13:13:18 +0100 + +htslib (1.7-1) unstable; urgency=medium + + * Team upload. + + * New upstream version. + * Standards-Version: 4.1.3 + + * win/* files no longer in upstream source tree, affecting -test package + + -- Steffen Moeller Mon, 12 Feb 2018 18:59:19 +0100 htslib (1.6-4) unstable; urgency=medium diff -Nru htslib-1.6/debian/control htslib-1.7/debian/control --- htslib-1.6/debian/control 2018-02-05 16:49:52.000000000 +0000 +++ htslib-1.7/debian/control 2018-02-14 12:13:18.000000000 +0000 @@ -1,6 +1,5 @@ Source: htslib -Maintainer: Ubuntu Developers -XSBC-Original-Maintainer: Debian Med Packaging Team +Maintainer: Debian Med Packaging Team Uploaders: Charles Plessy , Andreas Tille Section: science @@ -12,7 +11,7 @@ liblzma-dev, libssl-dev, zlib1g-dev -Standards-Version: 4.1.2 +Standards-Version: 4.1.3 Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/htslib.git Vcs-Git: https://anonscm.debian.org/git/debian-med/htslib.git -b debian/unstable Homepage: https://github.com/samtools/htslib diff -Nru htslib-1.6/debian/copyright htslib-1.7/debian/copyright --- htslib-1.6/debian/copyright 2017-12-21 15:02:00.000000000 +0000 +++ htslib-1.7/debian/copyright 2018-02-14 12:13:18.000000000 +0000 @@ -1,5 +1,5 @@ Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Source: https://github.com/samtools/htslib/archive/1.3.1.tar.gz +Source: https://github.com/samtools/htslib/releases Files: * Copyright: © 2012-2016 Genome Research Ltd. diff -Nru htslib-1.6/debian/htslib-test.install htslib-1.7/debian/htslib-test.install --- htslib-1.6/debian/htslib-test.install 2017-12-21 15:03:36.000000000 +0000 +++ htslib-1.7/debian/htslib-test.install 2018-02-14 12:13:18.000000000 +0000 @@ -1,9 +1,10 @@ -*.c /usr/share/htslib-test -*.h /usr/share/htslib-test -*.mk /usr/share/htslib-test -Makefile /usr/share/htslib-test -cram/*.c /usr/share/htslib-test/cram -cram/*.h /usr/share/htslib-test/cram -test/* /usr/share/htslib-test/test -win/*.c usr/share/htslib-test/win -win/*.h usr/share/htslib-test/win +*.c usr/share/htslib-test +*.h usr/share/htslib-test +*.mk usr/share/htslib-test +Makefile usr/share/htslib-test +cram/*.c usr/share/htslib-test/cram +cram/*.h usr/share/htslib-test/cram +test/* usr/share/htslib-test/test +os usr/share/htslib-test +version.sh usr/share/htslib-test +debian/tests/run-unit-test usr/share/doc/libhts-dev diff -Nru htslib-1.6/debian/libhts2.symbols htslib-1.7/debian/libhts2.symbols --- htslib-1.6/debian/libhts2.symbols 2017-12-21 15:02:00.000000000 +0000 +++ htslib-1.7/debian/libhts2.symbols 2018-02-14 12:13:18.000000000 +0000 @@ -14,6 +14,7 @@ bam_aux_update_str@Base 1.4.1 bam_cigar2qlen@Base 1.0 bam_cigar2rlen@Base 1.0 + bam_cigar2rqlens@Base 1.7 bam_construct_seq@Base 1.0 bam_copy1@Base 1.0 bam_destroy1@Base 1.0 @@ -142,10 +143,12 @@ bcf_sr_set_samples@Base 1.0 bcf_sr_set_targets@Base 1.0 bcf_sr_set_threads@Base 1.4.1 + bcf_sr_sort_add_active@Base 1.7 bcf_sr_sort_destroy@Base 1.4.1 bcf_sr_sort_init@Base 1.4.1 bcf_sr_sort_next@Base 1.4.1 bcf_sr_sort_remove_reader@Base 1.4.1 + bcf_sr_sort_set_active@Base 1.7 bcf_sr_strerror@Base 1.2.1 bcf_subset@Base 1.0 bcf_subset_format@Base 1.0 @@ -326,6 +329,7 @@ cram_huffman_encode_store@Base 1.0 cram_index_build@Base 1.0 cram_index_free@Base 1.0 + cram_index_last@Base 1.7 cram_index_load@Base 1.0 cram_index_query@Base 1.0 cram_load_reference@Base 1.0 @@ -403,9 +407,12 @@ hfile_destroy@Base 1.0 hfile_init@Base 1.0 hfile_init_fixed@Base 1.4.1 + hfile_mem_get_buffer@Base 1.7 + hfile_mem_steal_buffer@Base 1.7 hfile_oflags@Base 1.0 hfile_plugin_init_gcs@Base 1.4.1 hfile_plugin_init_libcurl@Base 1.3.1 + hfile_plugin_init_mem@Base 1.7 hfile_plugin_init_net@Base 1.3 hfile_plugin_init_s3@Base 1.4.1 hfile_set_blksize@Base 1.5 @@ -417,6 +424,7 @@ hopen@Base 1.0 hopen_htsget_redirect@Base 1.6 hopen_net@Base 1.0 + hopenv_mem@Base 1.7 hpeek@Base 1.0 hputc2@Base 1.0 hputs2@Base 1.0 @@ -454,13 +462,23 @@ hts_idx_seqnames@Base 1.0 hts_idx_set_meta@Base 1.0 hts_itr_destroy@Base 1.0 + hts_itr_multi_bam@Base 1.7 + hts_itr_multi_cram@Base 1.7 + hts_itr_multi_destroy@Base 1.7 + hts_itr_multi_next@Base 1.7 hts_itr_next@Base 1.0 + hts_itr_off@Base 1.7 hts_itr_query@Base 1.0 hts_itr_querys@Base 1.0 + hts_itr_regions@Base 1.7 + hts_json_alloc_token@Base 1.7 hts_json_fnext@Base 1.4.1 + hts_json_free_token@Base 1.7 hts_json_fskip_value@Base 1.4.1 hts_json_snext@Base 1.4.1 hts_json_sskip_value@Base 1.4.1 + hts_json_token_str@Base 1.7 + hts_json_token_type@Base 1.7 hts_log@Base 1.5 hts_lrand48@Base 1.6 hts_md5_destroy@Base 1.3 @@ -481,6 +499,7 @@ hts_readlines@Base 1.0 hts_readlist@Base 1.0 hts_realloc_or_die@Base 1.4.1 + hts_reglist_free@Base 1.7 hts_set_cache_size@Base 1.4.1 hts_set_fai_filename@Base 1.0 hts_set_log_level@Base 1.5 @@ -555,20 +574,28 @@ knet_seek@Base 1.0 kputd@Base 1.4.1 ks_combsort__off@Base 1.0 + ks_combsort__off_max@Base 1.7 ks_combsort_uint16_t@Base 1.4.1 ks_heapadjust__off@Base 1.0 + ks_heapadjust__off_max@Base 1.7 ks_heapadjust_uint16_t@Base 1.4.1 ks_heapmake__off@Base 1.0 + ks_heapmake__off_max@Base 1.7 ks_heapmake_uint16_t@Base 1.4.1 ks_heapsort__off@Base 1.0 + ks_heapsort__off_max@Base 1.7 ks_heapsort_uint16_t@Base 1.4.1 ks_introsort__off@Base 1.0 + ks_introsort__off_max@Base 1.7 ks_introsort_uint16_t@Base 1.4.1 ks_ksmall__off@Base 1.0 + ks_ksmall__off_max@Base 1.7 ks_ksmall_uint16_t@Base 1.4.1 ks_mergesort__off@Base 1.0 + ks_mergesort__off_max@Base 1.7 ks_mergesort_uint16_t@Base 1.4.1 ks_shuffle__off@Base 1.0 + ks_shuffle__off_max@Base 1.7 ks_shuffle_uint16_t@Base 1.4.1 ksplit_core@Base 1.0 ksprintf@Base 1.0 @@ -639,6 +666,7 @@ sam_hdr_add@Base 1.0 sam_hdr_add_PG@Base 1.0 sam_hdr_add_lines@Base 1.0 + sam_hdr_change_HD@Base 1.7 sam_hdr_decr_ref@Base 1.0 sam_hdr_dump@Base 1.0 sam_hdr_dup@Base 1.0 @@ -668,6 +696,7 @@ sam_index_load@Base 1.0 sam_itr_queryi@Base 1.0 sam_itr_querys@Base 1.0 + sam_itr_regions@Base 1.7 sam_open_mode@Base 1.0 sam_open_mode_opts@Base 1.3 sam_parse1@Base 1.0 diff -Nru htslib-1.6/debian/libhts-dev.docs htslib-1.7/debian/libhts-dev.docs --- htslib-1.6/debian/libhts-dev.docs 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/debian/libhts-dev.docs 2018-02-14 12:13:18.000000000 +0000 @@ -0,0 +1 @@ +debian/README.test diff -Nru htslib-1.6/debian/patches/series htslib-1.7/debian/patches/series --- htslib-1.6/debian/patches/series 2017-12-21 15:02:00.000000000 +0000 +++ htslib-1.7/debian/patches/series 2018-02-14 12:13:18.000000000 +0000 @@ -1,4 +1,4 @@ define_PATH_MAX.patch htslib-add-cram_to_bam.patch fPIC.patch -877670.patch +#877670.patch # applied by upstream diff -Nru htslib-1.6/debian/README.test htslib-1.7/debian/README.test --- htslib-1.6/debian/README.test 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/debian/README.test 2018-02-14 12:13:18.000000000 +0000 @@ -0,0 +1,10 @@ +Notes on how this package can be tested. +──────────────────────────────────────── + +To run the unit tests provided by the upstream authors of this package +you can install the package htslib-test and run + + sh /usr/share/doc/libhts-dev/run-unit-test + + -- Andreas Tille Wed, 14 Feb 2018 09:43:36 +0100 + diff -Nru htslib-1.6/errmod.c htslib-1.7/errmod.c --- htslib-1.6/errmod.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/errmod.c 2018-01-26 12:01:45.000000000 +0000 @@ -64,7 +64,7 @@ static void cal_coef(errmod_t *em, double depcorr, double eta) { int k, n, q; - long double sum, sum1; + double sum, sum1; double *lC; // initialize ->fk @@ -84,10 +84,11 @@ double le1 = log(1.0 - e); for (n = 1; n <= 255; ++n) { double *beta = em->beta + (q<<16|n<<8); - sum1 = sum = 0.0; - for (k = n; k >= 0; --k, sum1 = sum) { - sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); - beta[k] = -10. / M_LN10 * logl(sum1 / sum); + sum1 = lC[n<<8|n] + n*le; + beta[n] = HUGE_VAL; + for (k = n - 1; k >= 0; --k, sum1 = sum) { + sum = sum1 + log1p(exp(lC[n<<8|k] + k*le + (n-k)*le1 - sum1)); + beta[k] = -10. / M_LN10 * (sum1 - sum); } } } diff -Nru htslib-1.6/hfile.c htslib-1.7/hfile.c --- htslib-1.6/hfile.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hfile.c 2018-01-26 12:01:45.000000000 +0000 @@ -83,15 +83,15 @@ are empty. In all cases, the stream's file position indicator corresponds to the position pointed to by begin. -The above is the normal scenario of a mobile window. For in-memory streams, -a fixed (immobile) buffer can be used as the full contents without any separate -backend behind it. These always have at_eof set, offset set to 0, need no -read() method, and should just return EINVAL for seek(): +The above is the normal scenario of a mobile window. For in-memory +streams (eg via hfile_init_fixed) the buffer can be used as the full +contents without any separate backend behind it. These always have at_eof +set, offset set to 0, need no read() method, and should just return EINVAL +for seek(): abcdefghijkLMNOPQRSTUVWXYZ------ ^buffer ^begin ^end ^limit - -Use hfile_init_fixed() to create one of these. */ +*/ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) { @@ -138,6 +138,8 @@ return fp; } +static const struct hFILE_backend mem_backend; + void hfile_destroy(hFILE *fp) { int save = errno; @@ -404,7 +406,7 @@ { off_t curpos, pos; - if (writebuffer_is_nonempty(fp)) { + if (writebuffer_is_nonempty(fp) && fp->mobile) { int ret = flush_buffer(fp); if (ret < 0) return ret; } @@ -615,6 +617,56 @@ return NULL; } +// Loads the contents of filename to produced a read-only, in memory, +// immobile hfile. fp is the already opened file. We always close this +// input fp, irrespective of whether we error or whether we return a new +// immobile hfile. +static hFILE *hpreload(hFILE *fp) { + hFILE *mem_fp; + char *buf = NULL; + off_t buf_sz = 0, buf_a = 0, buf_inc = 8192, len; + + for (;;) { + if (buf_a - buf_sz < 5000) { + buf_a += buf_inc; + char *t = realloc(buf, buf_a); + if (!t) goto err; + buf = t; + if (buf_inc < 1000000) buf_inc *= 1.3; + } + len = hread(fp, buf+buf_sz, buf_a-buf_sz); + if (len > 0) + buf_sz += len; + else + break; + } + + if (len < 0) goto err; + mem_fp = hfile_init_fixed(sizeof(hFILE), "r", buf, buf_sz, buf_a); + if (!mem_fp) goto err; + mem_fp->backend = &mem_backend; + + if (hclose(fp) < 0) { + hclose_abruptly(mem_fp); + goto err; + } + return mem_fp; + + err: + free(buf); + hclose_abruptly(fp); + return NULL; +} + +static int is_preload_url_remote(const char *url){ + return hisremote(url + 8); // len("preload:") = 8 +} + +static hFILE *hopen_preload(const char *url, const char *mode){ + hFILE* fp = hopen(url + 8, mode); + return hpreload(fp); +} + hFILE *hdopen(int fd, const char *mode) { hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd)); @@ -711,6 +763,16 @@ return 0; } +static hFILE *create_hfile_mem(char* buffer, const char* mode, size_t buf_filled, size_t buf_size) +{ + hFILE_mem *fp = (hFILE_mem *) hfile_init_fixed(sizeof(hFILE_mem), mode, buffer, buf_filled, buf_size); + if (fp == NULL) + return NULL; + + fp->base.backend = &mem_backend; + return &fp->base; +} + static hFILE *hopen_mem(const char *url, const char *mode) { size_t length, size; @@ -734,13 +796,59 @@ if (buffer == NULL) return NULL; hts_decode_percent(buffer, &length, data); } + hFILE* hf; - hFILE_mem *fp = (hFILE_mem *) - hfile_init_fixed(sizeof (hFILE_mem), mode, buffer, length, size); - if (fp == NULL) { free(buffer); return NULL; } + if(!(hf = create_hfile_mem(buffer, mode, length, size))){ + free(buffer); + return NULL; + } - fp->base.backend = &mem_backend; - return &fp->base; + return hf; +} + +hFILE *hopenv_mem(const char *filename, const char *mode, va_list args) +{ + char* buffer = va_arg(args, char*); + size_t sz = va_arg(args, size_t); + va_end(args); + + hFILE* hf; + + if(!(hf = create_hfile_mem(buffer, mode, sz, sz))){ + free(buffer); + return NULL; + } + + return hf; +} + +char *hfile_mem_get_buffer(hFILE *file, size_t *length) { + if (file->backend != &mem_backend) { + errno = EINVAL; + return NULL; + } + + if (length) + *length = file->buffer - file->limit; + + return file->buffer; +} + +char *hfile_mem_steal_buffer(hFILE *file, size_t *length) { + char *buf = hfile_mem_get_buffer(file, length); + if (buf) + file->buffer = NULL; + return buf; +} + +int hfile_plugin_init_mem(struct hFILE_plugin *self) +{ + // mem files are declared remote so they work with a tabix index + static const struct hFILE_scheme_handler handler = + {NULL, hfile_always_remote, "mem", 2000 + 50, hopenv_mem}; + self->name = "mem"; + hfile_add_scheme_handler("mem", &handler); + return 0; } @@ -825,14 +933,17 @@ { static const struct hFILE_scheme_handler data = { hopen_mem, hfile_always_local, "built-in", 80 }, - file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 }; + file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 }, + preload = { hopen_preload, is_preload_url_remote, "built-in", 80 }; schemes = kh_init(scheme_string); if (schemes == NULL) abort(); hfile_add_scheme_handler("data", &data); hfile_add_scheme_handler("file", &file); + hfile_add_scheme_handler("preload", &preload); init_add_plugin(NULL, hfile_plugin_init_net, "knetfile"); + init_add_plugin(NULL, hfile_plugin_init_mem, "mem"); #ifdef ENABLE_PLUGINS struct hts_path_itr path; @@ -910,8 +1021,12 @@ { const struct hFILE_scheme_handler *handler = find_scheme_handler(fname); if (handler) { - if (strchr(mode, ':') == NULL) return handler->open(fname, mode); - else if (handler->priority >= 2000 && handler->vopen) { + if (strchr(mode, ':') == NULL + || handler->priority < 2000 + || handler->vopen == NULL) { + return handler->open(fname, mode); + } + else { hFILE *fp; va_list arg; va_start(arg, mode); @@ -919,7 +1034,6 @@ va_end(arg); return fp; } - else { errno = ENOTSUP; return NULL; } } else if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode); else return hopen_fd(fname, mode); diff -Nru htslib-1.6/hfile_internal.h htslib-1.7/hfile_internal.h --- htslib-1.6/hfile_internal.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hfile_internal.h 2018-01-26 12:01:45.000000000 +0000 @@ -29,6 +29,8 @@ #include "htslib/hfile.h" +#include "textutils_internal.h" + #ifdef __cplusplus extern "C" { #endif diff -Nru htslib-1.6/hfile_libcurl.c htslib-1.7/hfile_libcurl.c --- htslib-1.6/hfile_libcurl.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hfile_libcurl.c 2018-01-26 12:01:45.000000000 +0000 @@ -32,6 +32,7 @@ #ifndef _WIN32 # include #endif +#include #include "hfile_internal.h" #ifdef ENABLE_PLUGINS @@ -39,9 +40,31 @@ #endif #include "htslib/hts.h" // for hts_version() and hts_verbose #include "htslib/kstring.h" +#include "htslib/khash.h" #include +// Number of seconds to take off auth_token expiry, to allow for clock skew +// and slow servers +#define AUTH_REFRESH_EARLY_SECS 60 + +// Minimum number of bytes to skip when seeking forward. Seeks less than +// this will just read the data and throw it away. The optimal value +// depends on how long it takes to make a new connection compared +// to how fast the data arrives. +#define MIN_SEEK_FORWARD 1000000 + +typedef struct { + char *path; + char *token; + time_t expiry; + int failed; + pthread_mutex_t lock; +} auth_token; + +// For the authorization header cache +KHASH_MAP_INIT_STR(auth_map, auth_token *); + // Curl-compatible header linked list typedef struct { struct curl_slist *list; @@ -54,6 +77,12 @@ hdrlist extra; // List of headers from callback hts_httphdr_callback callback; // Callback to get more headers void *callback_data; // Data to pass to callback + auth_token *auth; // Authentication token + int auth_hdr_num; // Location of auth_token in hdrlist extra + // If -1, Authorization header is in fixed + // -2, it came from the callback + // -3, "auth_token_enabled", "false" + // passed to hopen() } http_headers; typedef struct { @@ -73,10 +102,17 @@ unsigned perform_again : 1; unsigned is_read : 1; // Opened in read mode unsigned can_seek : 1; // Can (attempt to) seek on this handle + unsigned is_recursive:1; // Opened by hfile_libcurl itself + unsigned tried_seek : 1; // At least one seek has been attempted int nrunning; http_headers headers; + off_t delayed_seek; // Location to seek to before reading + off_t last_offset; // Location we're seeking from } hFILE_libcurl; +static off_t libcurl_seek(hFILE *fpv, off_t offset, int whence); +static int restart_from_position(hFILE_libcurl *fp, off_t pos); + static int http_status_errno(int status) { if (status >= 500) @@ -198,22 +234,33 @@ } } - static struct { kstring_t useragent; CURLSH *share; - pthread_mutex_t lock; -} curl = { { 0, 0, NULL }, NULL, PTHREAD_MUTEX_INITIALIZER }; + char *auth_path; + khash_t(auth_map) *auth_map; + int allow_unencrypted_auth_header; + pthread_mutex_t auth_lock; + pthread_mutex_t share_lock; +} curl = { { 0, 0, NULL }, NULL, NULL, NULL, 0, PTHREAD_MUTEX_INITIALIZER, + PTHREAD_MUTEX_INITIALIZER }; static void share_lock(CURL *handle, curl_lock_data data, curl_lock_access access, void *userptr) { - pthread_mutex_lock(&curl.lock); + pthread_mutex_lock(&curl.share_lock); } static void share_unlock(CURL *handle, curl_lock_data data, void *userptr) { - pthread_mutex_unlock(&curl.lock); + pthread_mutex_unlock(&curl.share_lock); } +static void free_auth(auth_token *tok) { + if (!tok) return; + if (pthread_mutex_destroy(&tok->lock)) abort(); + free(tok->path); + free(tok->token); + free(tok); +} static void libcurl_exit() { @@ -223,6 +270,22 @@ free(curl.useragent.s); curl.useragent.l = curl.useragent.m = 0; curl.useragent.s = NULL; + free(curl.auth_path); + curl.auth_path = NULL; + + if (curl.auth_map) { + khiter_t i; + for (i = kh_begin(curl.auth_map); i != kh_end(curl.auth_map); ++i) { + if (kh_exist(curl.auth_map, i)) { + free_auth(kh_value(curl.auth_map, i)); + kh_key(curl.auth_map, i) = NULL; + kh_value(curl.auth_map, i) = NULL; + } + } + kh_destroy(auth_map, curl.auth_map); + curl.auth_map = NULL; + } + curl_global_cleanup(); } @@ -268,6 +331,10 @@ return 0; } +static inline int is_authorization(const char *hdr) { + return (strncasecmp("authorization:", hdr, 14) == 0); +} + static int add_callback_headers(hFILE_libcurl *fp) { char **hdrs = NULL, **hdr; @@ -289,11 +356,16 @@ } free_headers(&fp->headers.extra, 0); + if (fp->headers.auth_hdr_num > 0 || fp->headers.auth_hdr_num == -2) + fp->headers.auth_hdr_num = 0; // Just removed it... + // Convert to libcurl-suitable form for (hdr = hdrs; *hdr; hdr++) { if (append_header(&fp->headers.extra, *hdr, 0) < 0) { goto cleanup; } + if (is_authorization(*hdr) && !fp->headers.auth_hdr_num) + fp->headers.auth_hdr_num = -2; } for (hdr = hdrs; *hdr; hdr++) *hdr = NULL; @@ -312,6 +384,276 @@ return -1; } +/* + * Read an OAUTH2-style Bearer access token (see + * https://tools.ietf.org/html/rfc6750#section-4). + * Returns 'v' for valid; 'i' for invalid (token missing or wrong sort); + * '?' for a JSON parse error; 'm' if it runs out of memory. + */ +static int read_auth_json(auth_token *tok, hFILE *auth_fp) { + hts_json_token *t = hts_json_alloc_token(); + kstring_t str = {0, 0, NULL}; + char *token = NULL, *type = NULL, *expiry = NULL; + int ret = 'i'; + + if (!t) goto error; + + if ((ret = hts_json_fnext(auth_fp, t, &str)) != '{') goto error; + while (hts_json_fnext(auth_fp, t, &str) != '}') { + char *key; + if (hts_json_token_type(t) != 's') { + ret = '?'; + goto error; + } + key = hts_json_token_str(t); + if (!key) goto error; + if (strcmp(key, "access_token") == 0) { + if ((ret = hts_json_fnext(auth_fp, t, &str)) != 's') goto error; + token = ks_release(&str); + } else if (strcmp(key, "token_type") == 0) { + if ((ret = hts_json_fnext(auth_fp, t, &str)) != 's') goto error; + type = ks_release(&str); + } else if (strcmp(key, "expires_in") == 0) { + if ((ret = hts_json_fnext(auth_fp, t, &str)) != 'n') goto error; + expiry = ks_release(&str); + } else if (hts_json_fskip_value(auth_fp, '\0') != 'v') { + ret = '?'; + goto error; + } + } + + if (!token || (type && strcmp(type, "Bearer") != 0)) { + ret = 'i'; + goto error; + } + + ret = 'm'; + str.l = 0; + if (kputs("Authorization: Bearer ", &str) < 0) goto error; + if (kputs(token, &str) < 0) goto error; + free(tok->token); + tok->token = ks_release(&str); + if (expiry) { + long exp = strtol(expiry, NULL, 10); + if (exp < 0) exp = 0; + tok->expiry = time(NULL) + exp; + } else { + tok->expiry = 0; + } + ret = 'v'; + + error: + free(token); + free(type); + free(expiry); + free(str.s); + hts_json_free_token(t); + return ret; +} + +static int read_auth_plain(auth_token *tok, hFILE *auth_fp) { + kstring_t line = {0, 0, NULL}; + kstring_t token = {0, 0, NULL}; + const char *start, *end; + + if (kgetline(&line, (char * (*)(char *, int, void *)) hgets, auth_fp) < 0) goto error; + if (kputc('\0', &line) < 0) goto error; + + for (start = line.s; *start && isspace(*start); start++) {} + for (end = start; *end && !isspace(*end); end++) {} + + if (end > start) { + if (kputs("Authorization: Bearer ", &token) < 0) goto error; + if (kputsn(start, end - start, &token) < 0) goto error; + } + + free(tok->token); + tok->token = ks_release(&token); + tok->expiry = 0; + free(line.s); + return 0; + + error: + free(line.s); + free(token.s); + return -1; +} + +static int renew_auth_token(auth_token *tok, int *changed) { + hFILE *auth_fp = NULL; + char buffer[16]; + ssize_t len; + + *changed = 0; + if (tok->expiry == 0 || time(NULL) + AUTH_REFRESH_EARLY_SECS < tok->expiry) + return 0; // Still valid + + if (tok->failed) + return -1; + + *changed = 1; + auth_fp = hopen(tok->path, "rR"); + if (!auth_fp) { + // Not worried about missing files; other errors are bad. + if (errno != ENOENT) + goto fail; + + tok->expiry = 0; // Prevent retry + free(tok->token); // Just in case it was set + return 0; + } + + len = hpeek(auth_fp, buffer, sizeof(buffer)); + if (len < 0) + goto fail; + + if (memchr(buffer, '{', len) != NULL) { + if (read_auth_json(tok, auth_fp) != 'v') + goto fail; + } else { + if (read_auth_plain(tok, auth_fp) < 0) + goto fail; + } + + return hclose(auth_fp) < 0 ? -1 : 0; + + fail: + tok->failed = 1; + if (auth_fp) hclose_abruptly(auth_fp); + return -1; +} + +static int add_auth_header(hFILE_libcurl *fp) { + int changed = 0; + + if (fp->headers.auth_hdr_num < 0) + return 0; // Have an Authorization header from open or header callback + + if (!fp->headers.auth) + return 0; // Nothing to add + + pthread_mutex_lock(&fp->headers.auth->lock); + if (renew_auth_token(fp->headers.auth, &changed) < 0) + goto unlock_fail; + + if (!changed && fp->headers.auth_hdr_num > 0) { + pthread_mutex_unlock(&fp->headers.auth->lock); + return 0; + } + + if (fp->headers.auth_hdr_num > 0) { + // Had a previous header, so swap in the new one + char *header = fp->headers.auth->token; + char *header_copy = header ? strdup(header) : NULL; + int idx = fp->headers.auth_hdr_num - 1; + if (header && !header_copy) + goto unlock_fail; + + if (header_copy) { + free(fp->headers.extra.list[idx].data); + fp->headers.extra.list[idx].data = header_copy; + } else { + unsigned int j; + // More complicated case - need to get rid of the old header + // and tidy up linked lists + free(fp->headers.extra.list[idx].data); + for (j = idx + 1; j < fp->headers.extra.num; j++) { + fp->headers.extra.list[j - 1] = fp->headers.extra.list[j]; + fp->headers.extra.list[j - 1].next = &fp->headers.extra.list[j]; + } + fp->headers.extra.num--; + if (fp->headers.extra.num > 0) { + fp->headers.extra.list[fp->headers.extra.num-1].next = NULL; + } else if (fp->headers.fixed.num > 0) { + fp->headers.fixed.list[fp->headers.fixed.num - 1].next = NULL; + } + fp->headers.auth_hdr_num = 0; + } + } else if (fp->headers.auth->token) { + // Add new header and remember where it is + if (append_header(&fp->headers.extra, + fp->headers.auth->token, 1) < 0) { + goto unlock_fail; + } + fp->headers.auth_hdr_num = fp->headers.extra.num; + } + + pthread_mutex_unlock(&fp->headers.auth->lock); + return 0; + + unlock_fail: + pthread_mutex_unlock(&fp->headers.auth->lock); + return -1; +} + +static int get_auth_token(hFILE_libcurl *fp, const char *url) { + const char *host = NULL, *p, *q; + kstring_t name = {0, 0, NULL}; + size_t host_len = 0; + khiter_t idx; + auth_token *tok = NULL; + + // Nothing to do if: + // curl.auth_path has not been set + // fp was made by hfile_libcurl (e.g. auth_path is a http:// url) + // we already have an Authorization header + if (!curl.auth_path || fp->is_recursive || fp->headers.auth_hdr_num != 0) + return 0; + + // Insist on having a secure connection unless the user insists harder + if (!curl.allow_unencrypted_auth_header && strncmp(url, "https://", 8) != 0) + return 0; + + host = strstr(url, "://"); + if (host) { + host += 3; + host_len = strcspn(host, "/"); + } + + p = curl.auth_path; + while ((q = strstr(p, "%h")) != NULL) { + if (q - p > INT_MAX || host_len > INT_MAX) goto error; + if (kputsn_(p, q - p, &name) < 0) goto error; + if (kputsn_(host, host_len, &name) < 0) goto error; + p = q + 2; + } + if (kputs(p, &name) < 0) goto error; + + pthread_mutex_lock(&curl.auth_lock); + idx = kh_get(auth_map, curl.auth_map, name.s); + if (idx < kh_end(curl.auth_map)) { + tok = kh_value(curl.auth_map, idx); + } else { + tok = calloc(1, sizeof(*tok)); + if (tok && pthread_mutex_init(&tok->lock, NULL) != 0) { + free(tok); + tok = NULL; + } + if (tok) { + int ret = -1; + tok->path = ks_release(&name); + tok->token = NULL; + tok->expiry = 1; // Force refresh + idx = kh_put(auth_map, curl.auth_map, tok->path, &ret); + if (ret < 0) { + free_auth(tok); + tok = NULL; + } + kh_value(curl.auth_map, idx) = tok; + } + } + pthread_mutex_unlock(&curl.auth_lock); + + fp->headers.auth = tok; + free(name.s); + + return add_auth_header(fp); + + error: + free(name.s); + return -1; +} + static void process_messages(hFILE_libcurl *fp) { CURLMsg *msg; @@ -337,26 +679,29 @@ long timeout; CURLMcode errm; - FD_ZERO(&rd); - FD_ZERO(&wr); - FD_ZERO(&ex); - if (curl_multi_fdset(fp->multi, &rd, &wr, &ex, &maxfd) != CURLM_OK) - maxfd = -1, timeout = 1000; - else if (maxfd < 0) - timeout = 100; // as recommended by curl_multi_fdset(3) - else { - if (curl_multi_timeout(fp->multi, &timeout) != CURLM_OK) - timeout = 1000; - else if (timeout < 0) - timeout = 10000; // as recommended by curl_multi_timeout(3) - } + if (!fp->perform_again) { + FD_ZERO(&rd); + FD_ZERO(&wr); + FD_ZERO(&ex); + if (curl_multi_fdset(fp->multi, &rd, &wr, &ex, &maxfd) != CURLM_OK) + maxfd = -1, timeout = 1000; + else { + if (curl_multi_timeout(fp->multi, &timeout) != CURLM_OK) + timeout = 1000; + else if (timeout < 0) { + timeout = 10000; // as recommended by curl_multi_timeout(3) + } + } + if (maxfd < 0 && timeout > 100) + timeout = 100; // as recommended by curl_multi_fdset(3) - if (timeout > 0 && ! fp->perform_again) { - struct timeval tval; - tval.tv_sec = (timeout / 1000); - tval.tv_usec = (timeout % 1000) * 1000; + if (timeout > 0) { + struct timeval tval; + tval.tv_sec = (timeout / 1000); + tval.tv_usec = (timeout % 1000) * 1000; - if (select(maxfd + 1, &rd, &wr, &ex, &tval) < 0) return -1; + if (select(maxfd + 1, &rd, &wr, &ex, &tval) < 0) return -1; + } } errm = curl_multi_perform(fp->multi, &nrunning); @@ -387,18 +732,54 @@ { hFILE_libcurl *fp = (hFILE_libcurl *) fpv; char *buffer = (char *) bufferv; + off_t to_skip = -1; + ssize_t got = 0; CURLcode err; - fp->buffer.ptr.rd = buffer; - fp->buffer.len = nbytes; - fp->paused = 0; - err = curl_easy_pause(fp->easy, CURLPAUSE_CONT); - if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; } - - while (! fp->paused && ! fp->finished) - if (wait_perform(fp) < 0) return -1; + if (fp->delayed_seek >= 0) { + assert(fp->base.offset == fp->delayed_seek + && fp->base.begin == fp->base.buffer + && fp->base.end == fp->base.buffer); + + if (fp->last_offset >= 0 + && fp->delayed_seek > fp->last_offset + && fp->delayed_seek - fp->last_offset < MIN_SEEK_FORWARD) { + // If not seeking far, just read the data and throw it away. This + // is likely to be quicker than opening a new stream + to_skip = fp->delayed_seek - fp->last_offset; + } else { + if (restart_from_position(fp, fp->delayed_seek) < 0) { + return -1; + } + } + fp->delayed_seek = -1; + fp->last_offset = -1; + } - nbytes = fp->buffer.ptr.rd - buffer; + do { + fp->buffer.ptr.rd = buffer; + fp->buffer.len = nbytes; + fp->paused = 0; + err = curl_easy_pause(fp->easy, CURLPAUSE_CONT); + if (err != CURLE_OK) { errno = easy_errno(fp->easy, err); return -1; } + + while (! fp->paused && ! fp->finished) + if (wait_perform(fp) < 0) return -1; + + got = fp->buffer.ptr.rd - buffer; + + if (to_skip >= 0) { // Skipping over a small seek + if (got < to_skip) { // Need to skip more data + to_skip -= got; + } else { + got -= to_skip; + if (got > 0) { // If enough was skipped, return the rest + memmove(buffer, buffer + to_skip, got); + to_skip = -1; + } + } + } + } while (to_skip >= 0 && ! fp->finished); fp->buffer.ptr.rd = NULL; fp->buffer.len = 0; @@ -407,7 +788,7 @@ return -1; } - return nbytes; + return got; } static size_t send_callback(char *ptr, size_t size, size_t nmemb, void *fpv) @@ -458,9 +839,6 @@ static off_t libcurl_seek(hFILE *fpv, off_t offset, int whence) { hFILE_libcurl *fp = (hFILE_libcurl *) fpv; - hFILE_libcurl temp_fp; - CURLcode err; - CURLMcode errm; off_t origin, pos; if (!fp->is_read || !fp->can_seek) { @@ -494,6 +872,36 @@ pos = origin + offset; + if (fp->tried_seek) { + /* Seeking has worked at least once, so now we can delay doing + the actual work until the next read. This avoids lots of pointless + http or ftp reconnections if the caller does lots of seeks + without any intervening reads. */ + if (fp->delayed_seek < 0) { + fp->last_offset = fp->base.offset + (fp->base.end - fp->base.buffer); + } + fp->delayed_seek = pos; + return pos; + } + + if (restart_from_position(fp, pos) < 0) { + /* This value for errno may not be entirely true, but the caller may be + able to carry on with the existing handle. */ + errno = ESPIPE; + return -1; + } + + fp->tried_seek = 1; + return pos; +} + +static int restart_from_position(hFILE_libcurl *fp, off_t pos) { + hFILE_libcurl temp_fp; + CURLcode err; + CURLMcode errm; + int update_headers = 0; + int save_errno = 0; + // TODO If we seem to be doing random access, use CURLOPT_RANGE to do // limited reads (e.g. about a BAM block!) so seeking can reuse the // existing connection more often. @@ -503,10 +911,17 @@ // sent by now. if (fp->headers.callback) { - struct curl_slist *list; if (add_callback_headers(fp) != 0) return -1; - list = get_header_list(fp); + update_headers = 1; + } + if (fp->headers.auth_hdr_num > 0 && fp->headers.auth) { + if (add_auth_header(fp) != 0) + return -1; + update_headers = 1; + } + if (update_headers) { + struct curl_slist *list = get_header_list(fp); if (list) { err = curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, list); if (err != CURLE_OK) { @@ -521,10 +936,7 @@ a new request to the server, reading from the location that we want to seek to. If the new request works and returns the correct data, the original easy handle in *fp is closed and replaced with the new - one. If not, we close the new handle, leave *fp unchanged, set - errno to ESPIPE and return -1 so that the caller knows we can't seek. - This allows the caller to decide if it wants to continue reading from - fp, in the same way as it would if reading from a pipe. + one. If not, we close the new handle and leave *fp unchanged. */ memcpy(&temp_fp, fp, sizeof(temp_fp)); @@ -537,36 +949,52 @@ err = curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE,(curl_off_t)pos); err |= curl_easy_setopt(temp_fp.easy, CURLOPT_PRIVATE, &temp_fp); err |= curl_easy_setopt(temp_fp.easy, CURLOPT_WRITEDATA, &temp_fp); - if (err != CURLE_OK) + if (err != CURLE_OK) { + save_errno = easy_errno(temp_fp.easy, err); goto error; + } temp_fp.buffer.len = 0; // Ensures we only read the response headers temp_fp.paused = temp_fp.finished = 0; // fp->multi and temp_fp.multi are the same. errm = curl_multi_add_handle(fp->multi, temp_fp.easy); - if (errm != CURLM_OK) { errno = multi_errno(errm); return -1; } + if (errm != CURLM_OK) { + save_errno = multi_errno(errm); + goto error; + } temp_fp.nrunning = ++fp->nrunning; err = curl_easy_pause(temp_fp.easy, CURLPAUSE_CONT); - if (err != CURLE_OK) + if (err != CURLE_OK) { + save_errno = easy_errno(temp_fp.easy, err); goto error_remove; + } while (! temp_fp.paused && ! temp_fp.finished) - if (wait_perform(&temp_fp) < 0) goto error_remove; + if (wait_perform(&temp_fp) < 0) { + save_errno = errno; + goto error_remove; + } - if (temp_fp.finished && temp_fp.final_result != CURLE_OK) + if (temp_fp.finished && temp_fp.final_result != CURLE_OK) { + save_errno = easy_errno(temp_fp.easy, temp_fp.final_result); goto error_remove; + } // We've got a good response, close the original connection and // replace it with the new one. errm = curl_multi_remove_handle(fp->multi, fp->easy); if (errm != CURLM_OK) { + // Clean up as much as possible curl_easy_reset(temp_fp.easy); - curl_multi_remove_handle(fp->multi, temp_fp.easy); - errno = multi_errno(errm); - return -1; + if (curl_multi_remove_handle(fp->multi, temp_fp.easy) == CURLM_OK) { + fp->nrunning--; + curl_easy_cleanup(temp_fp.easy); + } + save_errno = multi_errno(errm); + goto early_error; } fp->nrunning--; @@ -575,7 +1003,7 @@ err = curl_easy_setopt(fp->easy, CURLOPT_WRITEDATA, fp); err |= curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp); if (err != CURLE_OK) { - int save_errno = easy_errno(fp->easy, err); + save_errno = easy_errno(fp->easy, err); curl_easy_reset(fp->easy); errno = save_errno; return -1; @@ -586,7 +1014,7 @@ fp->perform_again = temp_fp.perform_again; fp->final_result = temp_fp.final_result; - return pos; + return 0; error_remove: curl_easy_reset(temp_fp.easy); // Ensure no pointers to on-stack temp_fp @@ -600,9 +1028,8 @@ curl_easy_cleanup(temp_fp.easy); early_error: fp->can_seek = 0; // Don't try to seek again - /* This value for errno may not be entirely true, but the caller may be - able to carry on with the existing handle. */ - errno = ESPIPE; + if (save_errno) + errno = save_errno; return -1; } @@ -658,7 +1085,9 @@ const char *s; CURLcode err; CURLMcode errm; - int save; + int save, is_recursive; + + is_recursive = strchr(modes, 'R') != NULL; if ((s = strpbrk(modes, "rwa+")) != NULL) { mode = *s; @@ -676,12 +1105,16 @@ } else { memset(&fp->headers, 0, sizeof(fp->headers)); } + fp->file_size = -1; fp->buffer.ptr.rd = NULL; fp->buffer.len = 0; fp->final_result = (CURLcode) -1; fp->paused = fp->closing = fp->finished = fp->perform_again = 0; fp->can_seek = 1; + fp->tried_seek = 0; + fp->delayed_seek = fp->last_offset = -1; + fp->is_recursive = is_recursive; fp->nrunning = 0; fp->easy = NULL; @@ -694,6 +1127,10 @@ // Make a route to the hFILE_libcurl* given just a CURL* easy handle err = curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp); + // Avoid many repeated CWD calls with FTP, instead requesting the filename + // by full path (as done in knet, but not strictly compliant with RFC1738). + err |= curl_easy_setopt(fp->easy, CURLOPT_FTP_FILEMETHOD, CURLFTPMETHOD_NOCWD); + if (mode == 'r') { err |= curl_easy_setopt(fp->easy, CURLOPT_WRITEFUNCTION, recv_callback); err |= curl_easy_setopt(fp->easy, CURLOPT_WRITEDATA, fp); @@ -711,10 +1148,18 @@ err |= curl_easy_setopt(fp->easy, CURLOPT_SHARE, curl.share); err |= curl_easy_setopt(fp->easy, CURLOPT_URL, url); + { + char* env_curl_ca_bundle = getenv("CURL_CA_BUNDLE"); + if (env_curl_ca_bundle) { + err |= curl_easy_setopt(fp->easy, CURLOPT_CAINFO, env_curl_ca_bundle); + } + } err |= curl_easy_setopt(fp->easy, CURLOPT_USERAGENT, curl.useragent.s); if (fp->headers.callback) { if (add_callback_headers(fp) != 0) goto error; } + if (get_auth_token(fp, url) < 0) + goto error; if ((list = get_header_list(fp)) != NULL) err |= curl_easy_setopt(fp->easy, CURLOPT_HTTPHEADER, list); err |= curl_easy_setopt(fp->easy, CURLOPT_FOLLOWLOCATION, 1L); @@ -757,15 +1202,12 @@ save = errno; if (fp->easy) curl_easy_cleanup(fp->easy); if (fp->multi) curl_multi_cleanup(fp->multi); - free_headers(&fp->headers.fixed, 1); free_headers(&fp->headers.extra, 1); hfile_destroy((hFILE *) fp); errno = save; return NULL; early_error: - save = errno; - errno = save; return NULL; } @@ -784,6 +1226,8 @@ for (hdr = va_arg(args, const char **); *hdr; hdr++) { if (append_header(&headers->fixed, *hdr, 1) < 0) return -1; + if (is_authorization(*hdr)) + headers->auth_hdr_num = -1; } } else if (strcmp(argtype, "httphdr:l") == 0) { @@ -791,6 +1235,8 @@ while ((hdr = va_arg(args, const char *)) != NULL) { if (append_header(&headers->fixed, hdr, 1) < 0) return -1; + if (is_authorization(hdr)) + headers->auth_hdr_num = -1; } } else if (strcmp(argtype, "httphdr") == 0) { @@ -798,6 +1244,8 @@ if (hdr) { if (append_header(&headers->fixed, hdr, 1) < 0) return -1; + if (is_authorization(hdr)) + headers->auth_hdr_num = -1; } } else if (strcmp(argtype, "httphdr_callback") == 0) { @@ -812,6 +1260,11 @@ if (parse_va_list(headers, *args2) < 0) return -1; } } + else if (strcmp(argtype, "auth_token_enabled") == 0) { + const char *flag = va_arg(args, const char *); + if (strcmp(flag, "false") == 0) + headers->auth_hdr_num = -3; + } else { errno = EINVAL; return -1; } return 0; @@ -891,6 +1344,7 @@ #endif const curl_version_info_data *info; const char * const *protocol; + const char *auth; CURLcode err; CURLSHcode errsh; @@ -909,6 +1363,24 @@ return -1; } + if ((auth = getenv("HTS_AUTH_LOCATION")) != NULL) { + curl.auth_path = strdup(auth); + curl.auth_map = kh_init(auth_map); + if (!curl.auth_path || !curl.auth_map) { + int save_errno = errno; + free(curl.auth_path); + kh_destroy(auth_map, curl.auth_map); + curl_share_cleanup(curl.share); + curl_global_cleanup(); + errno = save_errno; + return -1; + } + } + if ((auth = getenv("HTS_ALLOW_UNENCRYPTED_AUTHORIZATION_HEADER")) != NULL + && strcmp(auth, "I understand the risks") == 0) { + curl.allow_unencrypted_auth_header = 1; + } + info = curl_version_info(CURLVERSION_NOW); ksprintf(&curl.useragent, "htslib/%s libcurl/%s", version, info->version); diff -Nru htslib-1.6/hfile_s3.c htslib-1.7/hfile_s3.c --- htslib-1.6/hfile_s3.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hfile_s3.c 2018-01-26 12:01:45.000000000 +0000 @@ -30,7 +30,6 @@ #include #include -#include "hts_internal.h" #include "hfile_internal.h" #ifdef ENABLE_PLUGINS #include "version.h" diff -Nru htslib-1.6/hts.c htslib-1.7/hts.c --- htslib-1.6/hts.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hts.c 2018-01-26 12:01:45.000000000 +0000 @@ -1260,6 +1260,7 @@ #define pair64_lt(a,b) ((a).u < (b).u) KSORT_INIT(_off, hts_pair64_t, pair64_lt) +KSORT_INIT(_off_max, hts_pair64_max_t, pair64_lt) typedef struct { int32_t m, n; @@ -1953,6 +1954,115 @@ return itr->bins.n; } +static inline int reg2intervals(hts_itr_multi_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) +{ + int l, t, s; + int b, e, i, j; + hts_pair64_max_t *off; + bidx_t *bidx; + khint_t k; + + if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) + return -1; + + s = min_shift + (n_lvls<<1) + n_lvls; + if (end >= 1LL<>s); e = t + (end>>s); + + for (i = b; i <= e; ++i) { + if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) { + bins_t *p = &kh_value(bidx, k); + + if (p->n) { + off = (hts_pair64_max_t*)realloc(iter->off, (iter->n_off + p->n) * sizeof(hts_pair64_max_t)); + if (!off) + return -2; + + iter->off = off; + for (j = 0; j < p->n; ++j) { + if (p->list[j].v > min_off && p->list[j].u < max_off) { + iter->off[iter->n_off].u = p->list[j].u; + iter->off[iter->n_off].v = p->list[j].v; + iter->off[iter->n_off].max = ((uint64_t)tid<<32) | (end+1); + iter->n_off++; + } + } + } + } + } + } + + return iter->n_off; +} + +static int compare_regions(const void *r1, const void *r2) { + hts_reglist_t *reg1 = (hts_reglist_t *)r1; + hts_reglist_t *reg2 = (hts_reglist_t *)r2; + + if (reg1->tid < 0 && reg2->tid >= 0) + return 1; + else if (reg1->tid >= 0 && reg2->tid < 0) + return -1; + else + return reg1->tid - reg2->tid; +} + +uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { + + int i; + bidx_t* bidx; + uint64_t off0 = (uint64_t) -1; + khint_t k; + switch (tid) { + case HTS_IDX_START: + // Find the smallest offset, note that sequence ids may not be ordered sequentially + for (i = 0; i < idx->n; i++) { + bidx = idx->bidx[i]; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k == kh_end(bidx)) + continue; + + if (off0 > kh_val(bidx, k).list[0].u) + off0 = kh_val(bidx, k).list[0].u; + } + if (off0 == (uint64_t) -1 && idx->n_no_coor) + off0 = 0; + // only no-coor reads in this bam + break; + case HTS_IDX_NOCOOR: + /* No-coor reads sort after all of the mapped reads. The position + is not stored in the index itself, so need to find the end + offset for the last mapped read. A loop is needed here in + case references at the end of the file have no mapped reads, + or sequence ids are not ordered sequentially. + See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */ + for (i = 0; i < idx->n; i++) { + bidx = idx->bidx[i]; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) { + if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) { + off0 = kh_val(bidx, k).list[0].v; + } + } + } + if (off0 == (uint64_t) -1 && idx->n_no_coor) + off0 = 0; + // only no-coor reads in this bam + break; + case HTS_IDX_REST: + off0 = 0; + break; + case HTS_IDX_NONE: + off0 = 0; + break; + } + + return off0; +} + hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) { int i, n_off, l, bin; @@ -1960,142 +2070,351 @@ khint_t k; bidx_t *bidx; uint64_t min_off, max_off; - hts_itr_t *iter = 0; - if (tid < 0) { - int finished0 = 0; - uint64_t off0 = (uint64_t)-1; - khint_t k; - switch (tid) { - case HTS_IDX_START: - // Find the smallest offset, note that sequence ids may not be ordered sequentially - for (i=0; in; i++) - { - bidx = idx->bidx[i]; - k = kh_get(bin, bidx, META_BIN(idx)); - if (k == kh_end(bidx)) continue; - if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u; - } - if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam - break; - - case HTS_IDX_NOCOOR: - /* No-coor reads sort after all of the mapped reads. The position - is not stored in the index itself, so need to find the end - offset for the last mapped read. A loop is needed here in - case references at the end of the file have no mapped reads, - or sequence ids are not ordered sequentially. - See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */ - for (i = 0; i < idx->n; i++) { - bidx = idx->bidx[i]; - k = kh_get(bin, bidx, META_BIN(idx)); - if (k != kh_end(bidx)) { - if (off0==(uint64_t)-1 || off0 < kh_val(bidx, k).list[0].v) { - off0 = kh_val(bidx, k).list[0].v; - } + hts_itr_t *iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); + if (iter) { + if (tid < 0) { + uint64_t off = hts_itr_off(idx, tid); + if (off != (uint64_t) -1) { + iter->read_rest = 1; + iter->curr_off = off; + iter->readrec = readrec; + if (tid == HTS_IDX_NONE) + iter->finished = 1; + } else { + free(iter); + iter = NULL; + } + } else { + if (beg < 0) beg = 0; + if (end < beg) return 0; + if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; + + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + iter->readrec = readrec; + + if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } + + // compute min_off + bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + do { + int first; + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx)) break; + first = (hts_bin_parent(bin)<<3) + 1; + if (bin > first) --bin; + else bin = hts_bin_parent(bin); + } while (bin); + if (bin == 0) k = kh_get(bin, bidx, bin); + min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; + + // compute max_off: a virtual offset from a bin to the right of end + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = (uint64_t)-1; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } + bin++; + } + + // retrieve bins + reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + + for (i = n_off = 0; i < iter->bins.n; ++i) + if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) + n_off += kh_value(bidx, k).n; + if (n_off == 0) { + // No overlapping bins means the iterator has already finished. + iter->finished = 1; + return iter; + } + off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); + for (i = n_off = 0; i < iter->bins.n; ++i) { + if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { + int j; + bins_t *p = &kh_value(bidx, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off && p->list[j].u < max_off) + off[n_off++] = p->list[j]; } } - if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam - break; - case HTS_IDX_REST: - off0 = 0; - break; + if (n_off == 0) { + free(off); + iter->finished = 1; + return iter; + } + ks_introsort(_off, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; + else off[++l] = off[i]; + } + n_off = l + 1; + iter->n_off = n_off; iter->off = off; + } + } + return iter; +} - case HTS_IDX_NONE: - finished0 = 1; - off0 = 0; - break; +hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter) +{ + int i, j, l, n_off = 0, bin; + hts_pair64_max_t *off = NULL; + khint_t k; + bidx_t *bidx; + uint64_t min_off, max_off, t_off = (uint64_t)-1; + int tid, beg, end; + hts_reglist_t *curr_reg; - default: - return 0; + if (iter) { + iter->i = -1; + for (i=0; in_reg; i++) { + + curr_reg = &iter->reg_list[i]; + tid = curr_reg->tid; + + if (tid < 0) { + t_off = hts_itr_off(idx, tid); + if (t_off != (uint64_t)-1) { + switch (tid) { + case HTS_IDX_NONE: + iter->finished = 1; + case HTS_IDX_START: + case HTS_IDX_REST: + iter->curr_off = t_off; + iter->n_reg = 0; + iter->reg_list = NULL; + iter->read_rest = 1; + return iter; + case HTS_IDX_NOCOOR: + iter->nocoor = 1; + iter->nocoor_off = t_off; + } + } + } else { + if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) + continue; + + for(j=0; jcount; j++) { + hts_pair32_t *curr_intv = &curr_reg->intervals[j]; + if (curr_intv->end < curr_intv->beg) + continue; + + beg = curr_intv->beg; + end = curr_intv->end; + + /* Compute 'min_off' by searching the lowest level bin containing 'beg'. + If the computed bin is not in the index, try the next bin to the + left, belonging to the same parent. If it is the first sibling bin, + try the parent bin. */ + bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + do { + int first; + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx)) break; + first = (hts_bin_parent(bin)<<3) + 1; + if (bin > first) --bin; + else bin = hts_bin_parent(bin); + } while (bin); + if (bin == 0) + k = kh_get(bin, bidx, bin); + min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; + + // compute max_off: a virtual offset from a bin to the right of end + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = (uint64_t)-1; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { + max_off = kh_val(bidx, k).loff; + break; + } + bin++; + } + + //convert coordinates to file offsets + reg2intervals(iter, idx, tid, beg, end, min_off, max_off, idx->min_shift, idx->n_lvls); + } + } } - if (off0 != (uint64_t)-1) { - iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); - iter->read_rest = 1; - iter->finished = finished0; - iter->curr_off = off0; - iter->readrec = readrec; - return iter; - } else return 0; + + off = iter->off; + n_off = iter->n_off; + + if (n_off) { + ks_introsort(_off_max, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v < off[i].v) { + off[++l] = off[i]; + } else { + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); + } + } + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) { + off[l].v = off[i].v; + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); + } else off[++l] = off[i]; + } + n_off = l + 1; + iter->n_off = n_off; iter->off = off; + } + + if(!n_off && !iter->nocoor) + iter->finished = 1; } + return iter; +} - if (beg < 0) beg = 0; - if (end < beg) return 0; - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0; - - iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); - iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; - iter->readrec = readrec; - - if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } - - // compute min_off - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); - do { - int first; - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx)) break; - first = (hts_bin_parent(bin)<<3) + 1; - if (bin > first) --bin; - else bin = hts_bin_parent(bin); - } while (bin); - if (bin == 0) k = kh_get(bin, bidx, bin); - min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; - - // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } - bin++; - } - - // retrieve bins - reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); - for (i = n_off = 0; i < iter->bins.n; ++i) - if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) - n_off += kh_value(bidx, k).n; - if (n_off == 0) { - // No overlapping bins means the iterator has already finished. - iter->finished = 1; - return iter; +hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + int tid, beg, end, i, j, l, n_off = 0; + hts_reglist_t *curr_reg; + hts_pair32_t *curr_intv; + hts_pair64_max_t *off = NULL; + cram_index *e = NULL; + + if (!cidx || !iter) + return NULL; + + iter->is_cram = 1; + iter->read_rest = 0; + iter->off = NULL; + iter->n_off = 0; + iter->curr_off = 0; + iter->i = -1; + + for (i=0; in_reg; i++) { + + curr_reg = &iter->reg_list[i]; + tid = curr_reg->tid; + + if (tid >= 0) { + off = (hts_pair64_max_t*)realloc(off, (n_off + curr_reg->count) * sizeof(hts_pair64_max_t)); + if (!off) + return NULL; + + for (j=0; j < curr_reg->count; j++) { + curr_intv = &curr_reg->intervals[j]; + if (curr_intv->end < curr_intv->beg) + continue; + + beg = curr_intv->beg; + end = curr_intv->end; + +/* First, fetch the container overlapping 'beg' and assign its file offset to u, then + * find the container overlapping 'end' and assing the relative end of the slice to v. + * The cram_ptell function will adjust with the container offset, which is not stored + * in the index. + */ + e = cram_index_query(cidx->cram, tid, beg+1, NULL); + if (e) { + off[n_off].u = e->offset; + + if (end == INT_MAX) { + e = cram_index_last(cidx->cram, tid, NULL); + } else { + e = cram_index_query(cidx->cram, tid, end+1, NULL); + } + + if (e) { + off[n_off].v = e->offset + e->slice + e->len; + off[n_off].max = (uint64_t)tid<<32 | end; + n_off++; + } else { + hts_log_warning("Could not set offset end for region %d(%s):%d-%d. Skipping", tid, curr_reg->reg, beg, end); + } + } else { + hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end); + } + } + } else { + switch (tid) { + case HTS_IDX_NOCOOR: + e = cram_index_query(cidx->cram, tid, 1, NULL); + if (e) { + iter->nocoor = 1; + iter->nocoor_off = e->offset; + } else { + hts_log_warning("No index entry for NOCOOR region"); + } + break; + case HTS_IDX_START: + e = cram_index_query(cidx->cram, tid, 1, NULL); + if (e) { + iter->read_rest = 1; + off = (hts_pair64_max_t*)realloc(off, sizeof(hts_pair64_max_t)); + off[0].u = e->offset; + off[0].v = 0; + off[0].max = 0; + n_off=1; + } else { + hts_log_warning("No index entries"); + } + break; + case HTS_IDX_REST: + break; + case HTS_IDX_NONE: + iter->finished = 1; + break; + default: + hts_log_error("Query with tid=%d not implemented for CRAM files", tid); + } + } } - off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t)); - for (i = n_off = 0; i < iter->bins.n; ++i) { - if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { - int j; - bins_t *p = &kh_value(bidx, k); - for (j = 0; j < p->n; ++j) - if (p->list[j].v > min_off && p->list[j].u < max_off) - off[n_off++] = p->list[j]; + + if (n_off) { + ks_introsort(_off_max, n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v < off[i].v) { + off[++l] = off[i]; + } else { + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); + } } + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + // merge adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) { + if (off[l].v>>16 == off[i].u>>16) { + off[l].v = off[i].v; + off[l].max = (off[i].max > off[l].max ? off[i].max : off[l].max); + } else off[++l] = off[i]; + } + n_off = l + 1; + iter->n_off = n_off; iter->off = off; } - if (n_off == 0) { - free(off); + + if(!n_off && !iter->nocoor) iter->finished = 1; - return iter; - } - ks_introsort(_off, n_off, off); - // resolve completely contained adjacent blocks - for (i = 1, l = 0; i < n_off; ++i) - if (off[l].v < off[i].v) off[++l] = off[i]; - n_off = l + 1; - // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing - for (i = 1; i < n_off; ++i) - if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; - // merge adjacent blocks - for (i = 1, l = 0; i < n_off; ++i) { - if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; - else off[++l] = off[i]; - } - n_off = l + 1; - iter->n_off = n_off; iter->off = off; + return iter; } @@ -2104,6 +2423,30 @@ if (iter) { free(iter->off); free(iter->bins.a); free(iter); } } +void hts_reglist_free(hts_reglist_t *reglist, int count) { + + int i; + if(reglist) { + for (i=0;ireg_list && iter->n_reg) + hts_reglist_free(iter->reg_list, iter->n_reg); + + if (iter->off && iter->n_off) + free(iter->off); + free(iter); + } +} + static inline long long push_digit(long long i, char c) { // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so @@ -2208,6 +2551,46 @@ return itr_query(idx, tid, beg, end, readrec); } +hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) { + + int i; + + if (!reglist) + return NULL; + + hts_itr_multi_t *itr = (hts_itr_multi_t*)calloc(1, sizeof(hts_itr_multi_t)); + if (itr) { + itr->n_reg = count; + itr->readrec = readrec; + itr->seek = seek; + itr->tell = tell; + itr->reg_list = reglist; + itr->finished = 0; + itr->nocoor = 0; + + + for (i = 0; i < itr->n_reg; i++) { + if (!strcmp(itr->reg_list[i].reg, ".")) { + itr->reg_list[i].tid = HTS_IDX_START; + continue; + } + + if (!strcmp(itr->reg_list[i].reg, "*")) { + itr->reg_list[i].tid = HTS_IDX_NOCOOR; + continue; + } + + itr->reg_list[i].tid = getid(hdr, reglist[i].reg); + if (itr->reg_list[i].tid < 0) + hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg); + } + + qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions); + itr_specific(idx, itr); + } + return itr; +} + int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { int ret, tid, beg, end; @@ -2251,6 +2634,110 @@ return ret; } +int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r) +{ + void *fp; + int ret, tid, beg, end, i, cr, ci; + hts_reglist_t *found_reg; + + if (iter == NULL || iter->finished) return -1; + + if (iter->is_cram) { + fp = fd->fp.cram; + } else { + fp = fd->fp.bgzf; + } + + if (iter->read_rest) { + if (iter->curr_off) { // seek to the start + if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { + return -1; + } + iter->curr_off = 0; // only seek once + } + + ret = iter->readrec(fp, fd, r, &tid, &beg, &end); + if (ret < 0) { + iter->finished = 1; + } + + iter->curr_tid = tid; + iter->curr_beg = beg; + iter->curr_end = end; + + return ret; + } + // A NULL iter->off should always be accompanied by iter->finished. + assert(iter->off != NULL || iter->nocoor != 0); + + for (;;) { + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) { // no more chunks, except NOCOORs + if (iter->nocoor) { + iter->read_rest = 1; + iter->curr_off = iter->nocoor_off; + + return hts_itr_multi_next(fd, iter, r); + } else { + ret = -1; break; + } + } + + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + if (iter->seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) { + return -1; + } + + iter->curr_off = iter->tell(fp); + } + ++iter->i; + } + + if ((ret = iter->readrec(fp, fd, r, &tid, &beg, &end)) >= 0) { + iter->curr_off = iter->tell(fp); + if (tid != iter->curr_tid) { + hts_reglist_t key; + key.tid = tid; + + found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list, iter->n_reg, sizeof(hts_reglist_t), compare_regions); + + if (!found_reg) + continue; + + iter->curr_reg = (found_reg - iter->reg_list); + iter->curr_tid = tid; + iter->curr_intv = 0; + } + + cr = iter->curr_reg; + ci = iter->curr_intv; + + + if (beg > iter->off[iter->i].max) { + iter->curr_off = iter->off[iter->i].v; + continue; + } + if (beg > iter->reg_list[cr].max_end) + continue; + + for (i = ci; i < iter->reg_list[cr].count; i++) { + if (end > iter->reg_list[cr].intervals[i].beg && iter->reg_list[cr].intervals[i].end > beg) { + iter->curr_beg = beg; + iter->curr_end = end; + iter->curr_intv = i; + + return ret; + } + } + } else { + break; // end of file or error + } + } + iter->finished = 1; + + return ret; +} + /********************** *** Retrieve index *** **********************/ diff -Nru htslib-1.6/htsfile.1 htslib-1.7/htsfile.1 --- htslib-1.6/htsfile.1 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htsfile.1 2018-01-26 12:01:45.000000000 +0000 @@ -1,4 +1,4 @@ -.TH htsfile 1 "28 September 2017" "htslib-1.6" "Bioinformatics tools" +.TH htsfile 1 "26 January 2018" "htslib-1.7" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff -Nru htslib-1.6/htsfile.c htslib-1.7/htsfile.c --- htslib-1.6/htsfile.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htsfile.c 2018-01-26 12:01:45.000000000 +0000 @@ -1,6 +1,6 @@ /* htsfile.c -- file identifier and minimal viewer. - Copyright (C) 2014-2017 Genome Research Ltd. + Copyright (C) 2014-2018 Genome Research Ltd. Author: John Marshall @@ -206,7 +206,7 @@ case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2017 Genome Research Ltd.\n", +"Copyright (C) 2018 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff -Nru htslib-1.6/hts_internal.h htslib-1.7/hts_internal.h --- htslib-1.6/hts_internal.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hts_internal.h 2018-01-26 12:01:45.000000000 +0000 @@ -28,121 +28,21 @@ #include "htslib/hts.h" +#include "textutils_internal.h" + #ifdef __cplusplus extern "C" { #endif struct hFILE; -/// Decode percent-encoded (URL-encoded) text -/** On input, _dest_ should be a buffer at least the same size as _s_, - and may be equal to _s_ to decode in place. On output, _dest_ will be - NUL-terminated and the number of characters written (not including the - NUL) is stored in _destlen_. -*/ -int hts_decode_percent(char *dest, size_t *destlen, const char *s); - -/// Return decoded data length given length of base64-encoded text -/** This gives an upper bound, as it overestimates by a byte or two when - the encoded text ends with (possibly omitted) `=` padding characters. -*/ -size_t hts_base64_decoded_length(size_t len); - -/// Decode base64-encoded data -/** On input, _dest_ should be a sufficient buffer (see `hts_base64_length()`), - and may be equal to _s_ to decode in place. On output, the number of - bytes writen is stored in _destlen_. -*/ -int hts_decode_base64(char *dest, size_t *destlen, const char *s); - - -/// Token structure returned by JSON lexing functions -/** Token types correspond to scalar JSON values and selected punctuation -as follows: - - `s` string - - `n` number - - `b` boolean literal - - `.` null literal - - `{`, `}`, `[`, `]` object and array delimiters - - `?` lexing error - - `\0` terminator at end of input -*/ -typedef struct hts_json_token { +struct hts_json_token { char type; ///< Token type char *str; ///< Value as a C string (filled in for all token types) // TODO Add other fields to fill in for particular data types, e.g. // int inum; // float fnum; -} hts_json_token; - -/// Read one JSON token from a file -/** @param str The input C string - @param state The input string state - @param token On return, filled in with the token read - @return The type of the token read - -On return, `token->str` points into the supplied input string, which -is modified by having token-terminating characters overwritten as NULs. -The `state` argument records the current position within `str` after each -`hts_json_snext()` call, and should be set to 0 before the first call. -*/ -char hts_json_snext(char *str, size_t *state, hts_json_token *token); - -/// Read and discard a complete JSON value from a string -/** @param str The input C string - @param state The input string state, as per `hts_json_snext()` - @param type If the first token of the value to be discarded has already - been read, provide its type; otherwise `'\0'` - @return One of `v` (success), `\0` (end of string), and `?` (lexing error) - -Skips a complete JSON value, which may be a single token or an entire object -or array. -*/ -char hts_json_sskip_value(char *str, size_t *state, char type); - -/// Read one JSON token from a file -/** @param fp The file stream - @param token On return, filled in with the token read - @param kstr Buffer used to store the token string returned - @return The type of the token read - -The `kstr` buffer is used to store the string value of the token read, -so `token->str` is only valid until the next time `hts_json_fnext()` is -called with the same `kstr` argument. -*/ -char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); - -/// Read and discard a complete JSON value from a file -/** @param fp The file stream - @param type If the first token of the value to be discarded has already - been read, provide its type; otherwise `'\0'` - @return One of `v` (success), `\0` (EOF), and `?` (lexing error) - -Skips a complete JSON value, which may be a single token or an entire object -or array. -*/ -char hts_json_fskip_value(struct hFILE *fp, char type); - - -// The functions operate on ints such as are returned by fgetc(), -// i.e., characters represented as unsigned-char-valued ints, or EOF. -// To operate on plain chars (and to avoid warnings on some platforms), -// technically one must cast to unsigned char everywhere (see CERT STR37-C) -// or less painfully use these *_c() functions that operate on plain chars -// (but not EOF, which must be considered separately where it is applicable). -// TODO We may eventually wish to implement these functions directly without -// using their equivalents, and thus make them immune to locales. -static inline int isalnum_c(char c) { return isalnum((unsigned char) c); } -static inline int isalpha_c(char c) { return isalpha((unsigned char) c); } -static inline int isdigit_c(char c) { return isdigit((unsigned char) c); } -static inline int isgraph_c(char c) { return isgraph((unsigned char) c); } -static inline int islower_c(char c) { return islower((unsigned char) c); } -static inline int isprint_c(char c) { return isprint((unsigned char) c); } -static inline int isspace_c(char c) { return isspace((unsigned char) c); } -static inline int isupper_c(char c) { return isupper((unsigned char) c); } -static inline char tolower_c(char c) { return tolower((unsigned char) c); } -static inline char toupper_c(char c) { return toupper((unsigned char) c); } - +}; struct cram_fd; @@ -161,7 +61,6 @@ // Entry point to hFILE_multipart backend. struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode); - struct hts_path_itr { kstring_t path, entry; void *dirv; // DIR * privately diff -Nru htslib-1.6/htslib/bgzf.h htslib-1.7/htslib/bgzf.h --- htslib-1.6/htslib/bgzf.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/bgzf.h 2018-01-26 12:01:45.000000000 +0000 @@ -55,6 +55,7 @@ struct hts_tpool; struct bgzf_mtaux_t; typedef struct __bgzidx_t bgzidx_t; +typedef struct bgzf_cache_t bgzf_cache_t; struct BGZF { // Reserved bits should be written as 0; read as "don't care" @@ -65,7 +66,7 @@ int block_length, block_clength, block_offset; int64_t block_address, uncompressed_address; void *uncompressed_block, *compressed_block; - void *cache; // a pointer to a hash table + bgzf_cache_t *cache; struct hFILE *fp; // actual file handle struct bgzf_mtaux_t *mt; // only used for multi-threading bgzidx_t *idx; // BGZF index @@ -194,7 +195,7 @@ /** * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent + * No interpretation of the value should be made, other than a subsequent * call to bgzf_seek can be used to position the file at the same point. * Return value is non-negative on success. */ diff -Nru htslib-1.6/htslib/hfile.h htslib-1.7/htslib/hfile.h --- htslib-1.6/htslib/hfile.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/hfile.h 2018-01-26 12:01:45.000000000 +0000 @@ -202,7 +202,7 @@ if (n > nbytes) n = nbytes; memcpy(buffer, fp->begin, n); fp->begin += n; - return (n == nbytes)? (ssize_t) n : hread2(fp, buffer, nbytes, n); + return (n == nbytes || !fp->mobile)? (ssize_t) n : hread2(fp, buffer, nbytes, n); } /// Write a character to the stream @@ -239,6 +239,14 @@ hwrite(hFILE *fp, const void *buffer, size_t nbytes) { extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); + extern int hfile_set_blksize(hFILE *fp, size_t bufsiz); + + if(!fp->mobile){ + if (fp->limit - fp->begin < nbytes){ + hfile_set_blksize(fp, fp->limit - fp->buffer + nbytes); + fp->end = fp->limit; + } + } size_t n = fp->limit - fp->begin; if (n > nbytes) n = nbytes; @@ -254,6 +262,24 @@ */ int hflush(hFILE *fp) HTS_RESULT_USED; +/// For hfile_mem: get the internal buffer and it's size from a hfile +/** @return buffer if successful, or NULL if an error occurred + +The buffer returned should not be freed as this will happen when the +hFILE is closed. +*/ +char *hfile_mem_get_buffer(hFILE *file, size_t *length); + +/// For hfile_mem: get the internal buffer and it's size from a hfile. +/** @return buffer if successful, or NULL if an error occurred + +This is similar to hfile_mem_get_buffer except that ownership of the +buffer is granted to the caller, who now has responsibility for freeing +it. From this point onwards, the hFILE should not be used for any +purpose other than closing. +*/ +char *hfile_mem_steal_buffer(hFILE *file, size_t *length); + #ifdef __cplusplus } #endif diff -Nru htslib-1.6/htslib/hts.h htslib-1.7/htslib/hts.h --- htslib-1.6/htslib/hts.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/hts.h 2018-01-26 12:01:45.000000000 +0000 @@ -511,10 +511,29 @@ typedef struct __hts_idx_t hts_idx_t; typedef struct { + uint32_t beg, end; +} hts_pair32_t; + +typedef struct { uint64_t u, v; } hts_pair64_t; +typedef struct { + uint64_t u, v; + uint64_t max; +} hts_pair64_max_t; + +typedef struct { + const char *reg; + int tid; + hts_pair32_t *intervals; + uint32_t count; + uint32_t min_beg, max_end; +} hts_reglist_t; + typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); +typedef int hts_seek_func(void *fp, int64_t offset, int where); +typedef int64_t hts_tell_func(void *fp); typedef struct { uint32_t read_rest:1, finished:1, is_cram:1, dummy:29; @@ -529,6 +548,24 @@ } bins; } hts_itr_t; +typedef struct { + int key; + uint64_t min_off, max_off; +} aux_key_t; + +typedef struct { + uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, dummy:28; + hts_reglist_t *reg_list; + int n_reg, i; + int curr_tid, curr_intv, curr_beg, curr_end, curr_reg; + hts_pair64_max_t *off; + int n_off; + uint64_t curr_off, nocoor_off; + hts_readrec_func *readrec; + hts_seek_func *seek; + hts_tell_func *tell; +} hts_itr_multi_t; + #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) #define hts_bin_parent(l) (((l) - 1) >> 3) @@ -637,6 +674,19 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED; const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values +/********************************** + * Iterator with multiple regions * + **********************************/ + +typedef hts_itr_multi_t *hts_itr_multi_query_func(const hts_idx_t *idx, hts_itr_multi_t *itr); +hts_itr_multi_t *hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_multi_t *iter); +hts_itr_multi_t *hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_multi_t *iter); +hts_itr_multi_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell); +int hts_itr_multi_next(htsFile *fd, hts_itr_multi_t *iter, void *r); +void hts_reglist_free(hts_reglist_t *reglist, int count); +void hts_itr_multi_destroy(hts_itr_multi_t *iter); + + /** * hts_file_type() - Convenience function to determine file type * DEPRECATED: This function has been replaced by hts_detect_format(). diff -Nru htslib-1.6/htslib/khash.h htslib-1.7/htslib/khash.h --- htslib-1.6/htslib/khash.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/khash.h 2018-01-26 12:01:45.000000000 +0000 @@ -594,7 +594,7 @@ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function - @abstract Instantiate a hash map containing 64-bit integer keys + @abstract Instantiate a hash set containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ @@ -610,7 +610,7 @@ typedef const char *kh_cstr_t; /*! @function - @abstract Instantiate a hash map containing const char* keys + @abstract Instantiate a hash set containing const char* keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_STR(name) \ diff -Nru htslib-1.6/htslib/ksort.h htslib-1.7/htslib/ksort.h --- htslib-1.6/htslib/ksort.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/ksort.h 2018-01-26 12:01:45.000000000 +0000 @@ -65,6 +65,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + // Use our own drand48() symbol (used by ks_shuffle) to avoid portability // problems on Windows. Don't include htslib/hts_os.h for this as it // may not get on with older attempts to fix this in code that includes @@ -289,4 +293,8 @@ #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) +#ifdef __cplusplus +} +#endif + #endif diff -Nru htslib-1.6/htslib/sam.h htslib-1.7/htslib/sam.h --- htslib-1.6/htslib/sam.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/sam.h 2018-01-26 12:01:45.000000000 +0000 @@ -34,6 +34,9 @@ extern "C" { #endif +/// Highest SAM format version supported by this library +#define SAM_FORMAT_VERSION "1.5" + /********************** *** SAM/BAM header *** **********************/ @@ -353,7 +356,10 @@ #define sam_itr_destroy(iter) hts_itr_destroy(iter) hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region); + hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount); + #define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp)) + #define sam_itr_multi_next(htsfp, itr, r) hts_itr_multi_next((htsfp), (itr), (r)) /*************** *** SAM I/O *** @@ -376,6 +382,7 @@ bam_hdr_t *sam_hdr_parse(int l_text, const char *text); bam_hdr_t *sam_hdr_read(samFile *fp); int sam_hdr_write(samFile *fp, const bam_hdr_t *h) HTS_RESULT_USED; + int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val); int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) HTS_RESULT_USED; int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED; diff -Nru htslib-1.6/htslib/thread_pool.h htslib-1.7/htslib/thread_pool.h --- htslib-1.6/htslib/thread_pool.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/htslib/thread_pool.h 2018-01-26 12:01:45.000000000 +0000 @@ -68,7 +68,7 @@ * growing too large and serial numbers to ensure sequential consumption of * the output. * - * The thread pool may have many hetergeneous tasks, each using its own + * The thread pool may have many heterogeneous tasks, each using its own * process mixed into the same thread pool. */ typedef struct hts_tpool_process hts_tpool_process; @@ -145,7 +145,7 @@ int hts_tpool_process_flush(hts_tpool_process *q); /* - * Resets a process to the intial state. + * Resets a process to the initial state. * * This removes any queued up input jobs, disables any notification of * new results/output, flushes what is left and then discards any diff -Nru htslib-1.6/hts_os.c htslib-1.7/hts_os.c --- htslib-1.6/hts_os.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/hts_os.c 2018-01-26 12:01:45.000000000 +0000 @@ -23,11 +23,14 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + // Windows (maybe more) lack a drand48 implementation. #ifndef HAVE_DRAND48 -#include "win/rand.c" +#include "os/rand.c" #else -void hts_srand48(long seed) { return srand48(seed); } +#include +void hts_srand48(long seed) { srand48(seed); } double hts_erand48(unsigned short xseed[3]) { return erand48(xseed); } double hts_drand48(void) { return drand48(); } double hts_lrand48(void) { return lrand48(); } @@ -36,5 +39,5 @@ // // On Windows when using the MSYS or Cygwin terminals, isatty fails // #ifdef _WIN32 // #define USE_FILEEXTD -// #include "win/iscygpty.c" +// #include "os/iscygpty.c" // #endif diff -Nru htslib-1.6/kstring.c htslib-1.7/kstring.c --- htslib-1.6/kstring.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/kstring.c 2018-01-26 12:01:45.000000000 +0000 @@ -61,7 +61,7 @@ return len; } - uint64_t i = d*10000000000; + uint64_t i = d*10000000000LL; // Correction for rounding - rather ugly // Optimised for small numbers. @@ -88,7 +88,7 @@ else if (d < 100000) i+=500000000; else - i+=5000000000; + i+=5000000000LL; do { *--cp = '0' + i%10; @@ -174,28 +174,29 @@ return l; } -char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) { - const char *p, *start; + const unsigned char *p, *start, *sep = (unsigned char *) sep_in; if (sep) { // set up the table - if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + if (str == 0 && aux->finished) return 0; // no need to set up if we have finished aux->finished = 0; - if (sep[1]) { + if (sep[0] && sep[1]) { aux->sep = -1; aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); } else aux->sep = sep[0]; } if (aux->finished) return 0; - else if (str) aux->p = str - 1, aux->finished = 0; + else if (str) start = (unsigned char *) str, aux->finished = 0; + else start = (unsigned char *) aux->p + 1; if (aux->sep < 0) { - for (p = start = aux->p + 1; *p; ++p) + for (p = start; *p; ++p) if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; } else { - for (p = start = aux->p + 1; *p; ++p) + for (p = start; *p; ++p) if (*p == aux->sep) break; } - aux->p = p; // end of token + aux->p = (const char *) p; // end of token if (*p == 0) aux->finished = 1; // no more tokens return (char*)start; } diff -Nru htslib-1.6/m4/hts_prog_cc_warnings.m4 htslib-1.7/m4/hts_prog_cc_warnings.m4 --- htslib-1.6/m4/hts_prog_cc_warnings.m4 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/m4/hts_prog_cc_warnings.m4 2018-01-26 12:01:45.000000000 +0000 @@ -0,0 +1,208 @@ +dnl @synopsis HTS_PROG_CC_WARNINGS([ANSI]) +dnl +dnl Derived from +dnl http://ac-archive.sourceforge.net/ac-archive/vl_prog_cc_warnings.html +dnl +dnl Enables a reasonable set of warnings for the C compiler. +dnl Optionally, if the first argument is nonempty, turns on flags which +dnl enforce and/or enable proper ANSI C if such are known with the +dnl compiler used. +dnl +dnl Currently this macro knows about GCC, Solaris C compiler, Digital +dnl Unix C compiler, C for AIX Compiler, HP-UX C compiler, IRIX C +dnl compiler, NEC SX-5 (Super-UX 10) C compiler, and Cray J90 (Unicos +dnl 10.0.0.8) C compiler. +dnl +dnl @category C +dnl @author Ville Laurikari +dnl Updated by Rob Davies for HTSlib +dnl @license AllPermissive +dnl Copying and distribution of this file, with or without modification, +dnl are permitted in any medium without royalty provided the copyright notice +dnl and this notice are preserved. Users of this software should generally +dnl follow the principles of the MIT License including its disclaimer. +dnl Original Copyright (c) Ville Laurikari 2002 +dnl Modifications Copyright (c) Genome Research Limited 2015,2017 + +AC_DEFUN([HTS_PROG_CC_WARNINGS], [ + AC_ARG_ENABLE([warnings], + [AS_HELP_STRING([--disable-warnings], [turn off compiler warnings])], + [], + [enable_warnings=yes]) + + AS_IF([test "x$enable_warnings" != xno],[ + AC_REQUIRE([AC_PROG_GREP]) + + ansi="$1" + AS_IF([test "x$ansi" = "x"], + [msg="for C compiler warning flags"], + [msg="for C compiler warning and ANSI conformance flags"]) + + AC_MSG_CHECKING($msg) + AC_CACHE_VAL(hts_cv_prog_cc_warnings, [dnl + hts_cv_prog_cc_warnings="" + AS_IF([test "x$CC" != "x"],[ + cat > conftest.c < /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-Wall"], + [hts_cv_prog_cc_warnings="-Wall -ansi -pedantic"]) + ], + # Sun Studio or Solaris C compiler + ["$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 && + "$CC" -c -v -Xc conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-v"], + [hts_cv_prog_cc_warnings="-v -Xc"]) + ], + # Digital Unix C compiler + ["$CC" -V 2>&1 | $GREP -i "Digital UNIX Compiler" > /dev/null 2>&1 && + "$CC" -c -verbose -w0 -warnprotos -std1 conftest.c > /dev/null 2>&1 && + test -f conftest.o], [dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos"], + [hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos -std1"]) + ], + # C for AIX Compiler + ["$CC" 2>&1 | $GREP -i "C for AIX Compiler" > /dev/null 2>&1 && + "$CC" -c -qlanglvl=ansi -qinfo=all conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd"], + [hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd -qlanglvl=ansi"]) + ], + # IRIX C compiler + ["$CC" -version 2>&1 | $GREP -i "MIPSpro Compilers" > /dev/null 2>&1 && + "$CC" -c -fullwarn -ansi -ansiE conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-fullwarn"], + [hts_cv_prog_cc_warnings="-fullwarn -ansi -ansiE"]) + ], + # HP-UX C compiler + [what "$CC" 2>&1 | $GREP -i "HP C Compiler" > /dev/null 2>&1 && + "$CC" -c -Aa +w1 conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="+w1"], + [hts_cv_prog_cc_warnings="+w1 -Aa"]) + ], + # The NEC SX series (Super-UX 10) C compiler + ["$CC" -V 2>&1 | $GREP "/SX" > /dev/null 2>&1 && + "$CC" -c -pvctl[,]fullmsg -Xc conftest.c > /dev/null 2>&1 && + test -f conftest.o],[ + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-pvctl[,]fullmsg"], + [hts_cv_prog_cc_warnings="-pvctl[,]fullmsg -Xc"]) + ], + # The Cray C compiler (Unicos) + ["$CC" -V 2>&1 | $GREP -i "Cray" > /dev/null 2>&1 && + "$CC" -c -h msglevel_2 conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + AS_IF([test "x$ansi" = "x"], + [hts_cv_prog_cc_warnings="-h#msglevel_2"], + [hts_cv_prog_cc_warnings="-h#msglevel_2,conform"]) + ], + # The Tiny C Compiler + ["$CC" -v 2>&1 | $GREP "tcc version" > /dev/null && + "$CC" -Wall -c conftest.c > /dev/null 2>&1 && + test -f conftest.o],[dnl + hts_cv_prog_cc_warnings="-Wall" + ]) + rm -f conftest.* + ]) + ]) + + AS_IF([test "x$hts_cv_prog_cc_warnings" != "x"],[ +dnl Print result, with underscores as spaces +ac_arg_result=`echo "$hts_cv_prog_cc_warnings" | tr '#' ' '` +AC_MSG_RESULT($ac_arg_result) + +dnl Add options to CFLAGS only if they are not already present +ac_arg_needed="" +for ac_arg in $hts_cv_prog_cc_warnings +do + ac_arg_sp=`echo "$ac_arg" | tr '#' ' '` + AS_CASE([" $CFLAGS "], +[*" $ac_arg_sp "*], [], +[ac_arg_needed="$ac_arg_all $ac_arg_sp"]) +done +CFLAGS="$ac_arg_needed $CFLAGS"],[dnl + AC_MSG_RESULT(unknown) + ]) + ]) +])dnl HTS_PROG_CC_WARNINGS + +# SYNOPSIS +# +# HTS_PROG_CC_WERROR(FLAGS_VAR) +# +# Set FLAGS_VAR to the flags needed to make the C compiler treat warnings +# as errors. + +AC_DEFUN([HTS_PROG_CC_WERROR], [ + AC_ARG_ENABLE([werror], + [AS_HELP_STRING([--enable-werror], [change warnings into errors, where supported])], + [enable_werror=yes], + []) + + AS_IF([test "x$enable_werror" = xyes],[ + AC_MSG_CHECKING([for C compiler flags to error on warnings]) + AC_CACHE_VAL(hts_cv_prog_cc_werror, [dnl + hts_cv_prog_cc_werror="" + AS_IF([test "x$CC" != "x"],[ + cat > conftest.c < /dev/null 2>&1 && + test -f conftest.o],[hts_cv_prog_cc_werror="-Werror"], + # Sun Studio or Solaris C compiler + ["$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 && + "$CC" -c -errwarn=%all conftest.c > /dev/null 2>&1 && + test -f conftest.o],[hts_cv_prog_cc_werror="-errwarn=%all"], + # The Tiny C Compiler + ["$CC" -v 2>&1 | $GREP "tcc version" > /dev/null && + "$CC" -Wall -c conftest.c > /dev/null 2>&1 && + test -f conftest.o],[hts_cv_prog_cc_werror="-Werror"] + dnl TODO: Add more compilers + ) + rm -f conftest.* + ]) + ]) + AS_IF([test "x$hts_cv_prog_cc_werror" != x],[ + AC_MSG_RESULT($hts_cv_prog_cc_werror) + AS_IF([test "x$1" != x],[eval AS_TR_SH([$1])="$hts_cv_prog_cc_werror"]) + ],[dnl + AC_MSG_RESULT(unknown) + ]) + ]) +])dnl HTS_PROG_CC_WERROR diff -Nru htslib-1.6/Makefile htslib-1.7/Makefile --- htslib-1.6/Makefile 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/Makefile 2018-01-26 12:01:45.000000000 +0000 @@ -175,8 +175,9 @@ cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) $(cram_sam_header_h) cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) cram/string_alloc.h $(htslib_khash_h) cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h -hfile_internal_h = hfile_internal.h $(htslib_hfile_h) -hts_internal_h = hts_internal.h $(htslib_hts_h) +hfile_internal_h = hfile_internal.h $(htslib_hfile_h) $(textutils_internal_h) +hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -195,6 +196,7 @@ echo '/* Default config.h generated by Makefile */' > $@ echo '#define HAVE_LIBBZ2 1' >> $@ echo '#define HAVE_LIBLZMA 1' >> $@ + echo '#define HAVE_LZMA_H 1' >> $@ echo '#define HAVE_FSEEKO 1' >> $@ echo '#define HAVE_DRAND48 1' >> $@ @@ -293,8 +295,9 @@ hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) -hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hts_internal_h) $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) +hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(hfile_internal_h) $(htslib_hfile_h) version.h $(hts_internal_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) +hts_os.o hts_os.pico: hts_os.c config.h os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_hts_endian_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(hts_internal_h) $(htslib_khash_h) @@ -317,7 +320,7 @@ cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/files.o cram/files.pico: cram/files.c config.h $(cram_misc_h) diff -Nru htslib-1.6/multipart.c htslib-1.7/multipart.c --- htslib-1.6/multipart.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/multipart.c 2018-01-26 12:01:45.000000000 +0000 @@ -83,8 +83,10 @@ (strlen(p->url) > 120)? "..." : ""); fp->currentfp = p->headers? - hopen(p->url, "r:", "httphdr:v", p->headers, NULL) - : hopen(p->url, "r"); + hopen(p->url, "r:", + "httphdr:v", p->headers, + "auth_token_enabled", "false", NULL) + : hopen(p->url, "r:", "auth_token_enabled", "false", NULL); if (fp->currentfp == NULL) return -1; } diff -Nru htslib-1.6/NEWS htslib-1.7/NEWS --- htslib-1.6/NEWS 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/NEWS 2018-01-26 12:01:45.000000000 +0000 @@ -1,3 +1,45 @@ +Noteworthy changes in release 1.7 (26th January 2018) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* BAM: HTSlib now supports BAMs which include CIGARs with more than + 65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8). + +* BCF/VCF: + - Removed the need for long double in pileup calculations. + - Sped up the synced reader in some situations. + - Bug fixing: removed memory leak in bcf_copy. + +* CRAM: + - Added support for HTS_IDX_START in cram iterators. + - Easier to build when lzma header files are absent. + - Bug fixing: a region query with REQUIRED_FIELDS option to + disable sequence retrieval now gives correct results. + - Bug fixing: stop queries to regions starting after the last + read on a chromosome from incorrectly reporting errors + (#651, #653; reported by Imran Haque and @egafni via pysam). + +* Multi-region iterator: The new structure takes a list of regions and + iterates over all, deduplicating reads in the process, and producing a + full list of file offset intervals. This is usually much faster than + repeatedly using the old single-region iterator on a series of regions. + +* Curl improvements: + - Add Bearer token support via HTS_AUTH_LOCATION env (#600). + - Use CURL_CA_BUNDLE environment variable to override the CA (#622; + thanks to Garret Kelly & David Alexander). + - Speed up (removal of excessive waiting) for both http(s) and ftp. + - Avoid repeatedly reconnecting by removal of unnecessary seeks. + - Bug fixing: double free when libcurl_open fails. + +* BGZF block caching, if enabled, now performs far better (#629; reported + by Ram Yalamanchili). + +* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas + Hickman). + +* Tidied up the drand48 support (intended for systems that do not + provide this function). + Noteworthy changes in release 1.6 (28th September 2017) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff -Nru htslib-1.6/os/lzma_stub.h htslib-1.7/os/lzma_stub.h --- htslib-1.6/os/lzma_stub.h 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/os/lzma_stub.h 2018-01-26 12:01:45.000000000 +0000 @@ -0,0 +1,85 @@ +#ifndef LZMA_STUB_H +#define LZMA_STUB_H + +/* Some platforms, notably macOS, ship a usable liblzma shared library but + do not ship any LZMA header files. The and header + files that come with the library contain the following statement: + + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + * + + Accordingly the following declarations have been copied and distilled + from and (primarily) and are sufficient + to compile cram/cram_io.c in the absence of proper LZMA headers. + + This file, lzma_stub.h, remains in the public domain. */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { LZMA_OK = 0, LZMA_STREAM_END = 1 } lzma_ret; +typedef enum { LZMA_RUN = 0, LZMA_FINISH = 3 } lzma_action; +typedef enum { LZMA_CHECK_CRC32 = 1 } lzma_check; +typedef enum { LZMA_RESERVED_ENUM = 0 } lzma_reserved_enum; + +struct lzma_allocator; +struct lzma_internal; + +typedef struct { + const uint8_t *next_in; + size_t avail_in; + uint64_t total_in; + + uint8_t *next_out; + size_t avail_out; + uint64_t total_out; + + const struct lzma_allocator *allocator; + struct lzma_internal *internal; + + void *reserved_ptr1; + void *reserved_ptr2; + void *reserved_ptr3; + void *reserved_ptr4; + uint64_t reserved_int1; + uint64_t reserved_int2; + size_t reserved_int3; + size_t reserved_int4; + lzma_reserved_enum reserved_enum1; + lzma_reserved_enum reserved_enum2; +} lzma_stream; + +#define LZMA_STREAM_INIT \ + { NULL, 0, 0, NULL, 0, 0, NULL, NULL, \ + NULL, NULL, NULL, NULL, 0, 0, 0, 0, \ + LZMA_RESERVED_ENUM, LZMA_RESERVED_ENUM } + +extern size_t lzma_stream_buffer_bound(size_t uncompressed_size); + +extern lzma_ret lzma_easy_buffer_encode( + uint32_t preset, lzma_check check, + const struct lzma_allocator *allocator, + const uint8_t *in, size_t in_size, + uint8_t *out, size_t *out_pos, size_t out_size); + +extern lzma_ret lzma_stream_decoder( + lzma_stream *strm, uint64_t memlimit, uint32_t flags); + +extern uint64_t lzma_easy_decoder_memusage(uint32_t preset); + +extern lzma_ret lzma_code(lzma_stream *strm, lzma_action action); + +extern void lzma_end(lzma_stream *strm); + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru htslib-1.6/os/rand.c htslib-1.7/os/rand.c --- htslib-1.6/os/rand.c 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/os/rand.c 2018-01-26 12:01:45.000000000 +0000 @@ -0,0 +1,97 @@ +/* rand.c -- drand48 implementation from the FreeBSD source tree. */ + +// This file is an amalgamation of the many small files in FreeBSD to do with +// drand48 and friends implementations. +// It comprises _rand48.c, rand48.h, srand48.c, drand48.c, erand48.c, lrand48.c + +/* + * Copyright (c) 1993 Martin Birgmeier + * All rights reserved. + * + * You may redistribute unmodified or modified versions of this source + * code provided that the above copyright notice and this and the + * following conditions are retained. + * + * This software is provided ``as is'', and comes with no warranties + * of any kind. I shall in no event be liable for anything that happens + * to anyone/anything when using this software. + */ + +//#include +//__FBSDID("$FreeBSD: src/lib/libc/gen/_rand48.c,v 1.2 2002/03/22 21:52:05 obrien Exp $"); + +#include + +#define RAND48_SEED_0 (0x330e) +#define RAND48_SEED_1 (0xabcd) +#define RAND48_SEED_2 (0x1234) +#define RAND48_MULT_0 (0xe66d) +#define RAND48_MULT_1 (0xdeec) +#define RAND48_MULT_2 (0x0005) +#define RAND48_ADD (0x000b) + +static unsigned short _rand48_seed[3] = { + RAND48_SEED_0, + RAND48_SEED_1, + RAND48_SEED_2 +}; +static unsigned short _rand48_mult[3] = { + RAND48_MULT_0, + RAND48_MULT_1, + RAND48_MULT_2 +}; +static unsigned short _rand48_add = RAND48_ADD; + +static void +_dorand48(unsigned short xseed[3]) +{ + unsigned long accu; + unsigned short temp[2]; + + accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0] + + (unsigned long) _rand48_add; + temp[0] = (unsigned short) accu; /* lower 16 bits */ + accu >>= sizeof(unsigned short) * 8; + accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1] + + (unsigned long) _rand48_mult[1] * (unsigned long) xseed[0]; + temp[1] = (unsigned short) accu; /* middle 16 bits */ + accu >>= sizeof(unsigned short) * 8; + accu += _rand48_mult[0] * xseed[2] + _rand48_mult[1] * xseed[1] + _rand48_mult[2] * xseed[0]; + xseed[0] = temp[0]; + xseed[1] = temp[1]; + xseed[2] = (unsigned short) accu; +} + +void +hts_srand48(long seed) +{ + _rand48_seed[0] = RAND48_SEED_0; + _rand48_seed[1] = (unsigned short) seed; + _rand48_seed[2] = (unsigned short) (seed >> 16); + _rand48_mult[0] = RAND48_MULT_0; + _rand48_mult[1] = RAND48_MULT_1; + _rand48_mult[2] = RAND48_MULT_2; + _rand48_add = RAND48_ADD; +} + +double +hts_erand48(unsigned short xseed[3]) +{ + _dorand48(xseed); + return ldexp((double) xseed[0], -48) + + ldexp((double) xseed[1], -32) + + ldexp((double) xseed[2], -16); +} + +double +hts_drand48(void) +{ + return hts_erand48(_rand48_seed); +} + +long +hts_lrand48(void) +{ + _dorand48(_rand48_seed); + return ((long) _rand48_seed[2] << 15) + ((long) _rand48_seed[1] >> 1); +} diff -Nru htslib-1.6/sam.c htslib-1.7/sam.c --- htslib-1.6/sam.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/sam.c 2018-01-26 12:01:45.000000000 +0000 @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2017 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2018 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -326,6 +326,18 @@ return bam_copy1(bdst, bsrc); } +void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, int *rlen, int *qlen) +{ + int k; + *rlen = *qlen = 0; + for (k = 0; k < n_cigar; ++k) { + int type = bam_cigar_type(bam_cigar_op(cigar[k])); + int len = bam_cigar_oplen(cigar[k]); + if (type & 1) *qlen += len; + if (type & 2) *rlen += len; + } +} + int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { int k, l; @@ -352,6 +364,49 @@ return b->core.pos + 1; } +static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG +{ + bam1_core_t *c = &b->core; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; + uint8_t *CG; + + // test where there is a real CIGAR in the CG tag to move + if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; + cigar0 = bam_get_cigar(b); + if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; + fake_bytes = c->n_cigar * 4; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag + if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + CG_len = le_to_u32(CG + 2); + if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length + + // move from the CG tag to the right position + cigar_st = (uint8_t*)cigar0 - b->data; + c->n_cigar = CG_len; + n_cigar4 = c->n_cigar * 4; + CG_st = CG - b->data - 2; + CG_en = CG_st + 8 + n_cigar4; + b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place + if (b->m_data < b->l_data) { + uint8_t *new_data; + uint32_t new_max = b->l_data; + kroundup32(new_max); + new_data = (uint8_t*)realloc(b->data, new_max); + if (new_data == 0) return -1; + b->m_data = new_max, b->data = new_data; + } + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room + memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR + if (ori_len > CG_en) // move data after the CG tag + memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) + if (recal_bin) + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)), 14, 5); + if (give_warning) + hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); + return 1; +} + static inline int aux_type2size(uint8_t type) { switch (type) { @@ -423,13 +478,20 @@ bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - - // Sanity check for broken CIGAR alignments - if (c->n_cigar > 0 && c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) - && bam_cigar2qlen(c->n_cigar, bam_get_cigar(b)) != c->l_qseq) { - hts_log_error("CIGAR and query sequence lengths differ for %s", - bam_get_qname(b)); + if (bam_tag2cigar(b, 0, 0) < 0) return -4; + + if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency + int rlen, qlen; + bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); + if ((b->core.flag & BAM_FUNMAP)) rlen=1; + b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5); + // Sanity check for broken CIGAR alignments + if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) { + hts_log_error("CIGAR and query sequence lengths differ for %s", + bam_get_qname(b)); + return -4; + } } return 4 + block_len; @@ -440,15 +502,12 @@ const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar >= 65536) { - hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b)); - errno = EOVERFLOW; - return -1; - } + if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; + else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; x[6] = c->mpos; @@ -464,7 +523,24 @@ } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally + if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag + uint8_t buf[8]; + uint32_t cigar_st, cigar_en, cigar[2]; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + cigar_en = cigar_st + c->n_cigar * 4; + cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; + cigar[1] = (uint32_t)bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)) << 4 | BAM_CREF_SKIP; + u32_to_le(cigar[0], buf); + u32_to_le(cigar[1], buf + 4); + if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: SN + if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + u32_to_le(c->n_cigar, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length + if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR + } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 4 + block_len : -1; } @@ -577,9 +653,73 @@ htsFile *fp = fpv; bam1_t *b = bv; int ret = cram_get_bam_seq(fp->fp.cram, &b); - return ret >= 0 - ? ret - : (cram_eof(fp->fp.cram) ? -1 : -2); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; + + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + + *tid = b->core.tid; + *beg = b->core.pos; + *end = bam_endpos(b); + + return ret; +} + +static int cram_pseek(void *fp, int64_t offset, int whence) +{ + cram_fd *fd = (cram_fd *)fp; + + if ((0 != cram_seek(fd, offset, SEEK_SET)) + && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) + return -1; + + if (fd->ctr) { + cram_free_container(fd->ctr); + fd->ctr = NULL; + fd->ooc = 0; + } + + return 0; +} + +/* + * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only + * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered + * container previously fetched. It was designed like this to integrate with the functionality + * of the iterator stepping logic. + */ + +static int64_t cram_ptell(void *fp) +{ + cram_fd *fd = (cram_fd *)fp; + cram_container *c; + int64_t ret = -1L; + + if (fd && fd->fp) { + ret = htell(fd->fp); + if ((c = fd->ctr) != NULL) { + ret -= ((c->curr_slice != c->max_slice || c->curr_rec != c->max_rec) ? c->offset + 1 : 0); + } + } + + return ret; +} + +static int bam_pseek(void *fp, int64_t offset, int whence) +{ + BGZF *fd = (BGZF *)fp; + + return bgzf_seek(fd, offset, whence); +} + +static int64_t bam_ptell(void *fp) +{ + BGZF *fd = (BGZF *)fp; + if (!fd) + return -1L; + + return bgzf_tell(fd); } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. @@ -591,9 +731,12 @@ case bam: return bam_read1(bgzfp, b); case cram: { int ret = cram_get_bam_seq(fp->fp.cram, &b); - return ret >= 0 - ? ret - : (cram_eof(fp->fp.cram) ? -1 : -2); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; + + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + return ret; } default: // TODO Need headers available to implement this for SAM files @@ -642,8 +785,8 @@ iter->bins.a = NULL; iter->readrec = readrec; - if (tid >= 0 || tid == HTS_IDX_NOCOOR) { - cram_range r = { tid == HTS_IDX_NOCOOR ? -1 : tid, beg+1, end }; + if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { + cram_range r = { tid, beg+1, end }; int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); iter->curr_off = 0; @@ -711,6 +854,17 @@ return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec); } +hts_itr_multi_t *sam_itr_regions(const hts_idx_t *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) +{ + const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; + if (cidx->fmt == HTS_FMT_CRAI) + return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, + hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); + else + return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, + hts_itr_multi_bam, bam_readrec, bam_pseek, bam_ptell); +} + /********************** *** SAM header I/O *** **********************/ @@ -936,6 +1090,78 @@ return 0; } +int sam_hdr_change_HD(bam_hdr_t *h, const char *key, const char *val) +{ + char *p, *q, *beg = NULL, *end = NULL, *newtext; + if (!h || !key) + return -1; + + if (h->l_text > 3) { + if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists + if ((p = strchr(h->text, '\n')) == 0) return -1; + *p = '\0'; // for strstr call + + char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; + + if ((q = strstr(h->text, tmp)) != 0) { // key exists + *p = '\n'; // change back + + // mark the key:val + beg = q; + for (q += 4; *q != '\n' && *q != '\t'; ++q); + end = q; + + if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) + && strlen(val) == end - beg - 4) + return 0; // val is the same, no need to change + + } else { + beg = end = p; + *p = '\n'; + } + } + } + if (beg == NULL) { // no @HD + if (h->l_text > UINT32_MAX - strlen(SAM_FORMAT_VERSION) - 9) + return -1; + h->l_text += strlen(SAM_FORMAT_VERSION) + 8; + if (val) { + if (h->l_text > UINT32_MAX - strlen(val) - 5) + return -1; + h->l_text += strlen(val) + 4; + } + newtext = (char*)malloc(h->l_text + 1); + if (!newtext) return -1; + + if (val) + snprintf(newtext, h->l_text + 1, + "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); + else + snprintf(newtext, h->l_text + 1, + "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); + } else { // has @HD but different or no key + h->l_text = (beg - h->text) + (h->text + h->l_text - end); + if (val) { + if (h->l_text > UINT32_MAX - strlen(val) - 5) + return -1; + h->l_text += strlen(val) + 4; + } + newtext = (char*)malloc(h->l_text + 1); + if (!newtext) return -1; + + if (val) { + snprintf(newtext, h->l_text + 1, "%.*s\t%s:%s%s", + (int) (beg - h->text), h->text, key, val, end); + } else { //delete key + snprintf(newtext, h->l_text + 1, "%.*s%s", + (int) (beg - h->text), h->text, end); + } + } + free(h->text); + h->text = newtext; + return 0; +} + /********************** *** SAM record I/O *** **********************/ @@ -1162,6 +1388,8 @@ } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; return 0; #undef _parse_warn @@ -1190,9 +1418,12 @@ case cram: { int ret = cram_get_bam_seq(fp->fp.cram, &b); - return ret >= 0 - ? ret - : (cram_eof(fp->fp.cram) ? -1 : -2); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; + + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + return ret; } case sam: { diff -Nru htslib-1.6/synced_bcf_reader.c htslib-1.7/synced_bcf_reader.c --- htslib-1.6/synced_bcf_reader.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/synced_bcf_reader.c 2018-01-26 12:01:45.000000000 +0000 @@ -598,7 +598,10 @@ { min_pos = files->readers[i].buffer[1]->pos; chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]); + bcf_sr_sort_set_active(&BCF_SR_AUX(files)->sort, i); } + else if ( min_pos==files->readers[i].buffer[1]->pos ) + bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); } if ( min_pos==INT_MAX ) { diff -Nru htslib-1.6/tabix.1 htslib-1.7/tabix.1 --- htslib-1.6/tabix.1 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/tabix.1 2018-01-26 12:01:45.000000000 +0000 @@ -1,4 +1,4 @@ -.TH tabix 1 "28 September 2017" "htslib-1.6" "Bioinformatics tools" +.TH tabix 1 "26 January 2018" "htslib-1.7" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility diff -Nru htslib-1.6/tabix.c htslib-1.7/tabix.c --- htslib-1.6/tabix.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/tabix.c 2018-01-26 12:01:45.000000000 +0000 @@ -1,7 +1,7 @@ /* tabix.c -- Generic indexer for TAB-delimited genome position files. Copyright (C) 2009-2011 Broad Institute. - Copyright (C) 2010-2012, 2014-2017 Genome Research Ltd. + Copyright (C) 2010-2012, 2014-2018 Genome Research Ltd. Author: Heng Li @@ -446,7 +446,7 @@ case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2018 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; default: return usage(); } diff -Nru htslib-1.6/test/hfile.c htslib-1.7/test/hfile.c --- htslib-1.6/test/hfile.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/test/hfile.c 2018-01-26 12:01:45.000000000 +0000 @@ -202,6 +202,47 @@ if ((c = hgetc(fin)) != EOF) fail("chars: hgetc (EOF) returned %d", c); if (hclose(fin) != 0) fail("hclose(test/hfile_chars.tmp) for reading"); + fin = hopen("preload:test/hfile_chars.tmp", "r"); + if (fin == NULL) fail("preloading \"test/hfile_chars.tmp\" for reading"); + for (i = 0; i < 256; i++) + if ((c = hgetc(fin)) != i) + fail("preloading chars: hgetc (%d = 0x%x) returned %d = 0x%x", i, i, c, c); + if ((c = hgetc(fin)) != EOF) fail("preloading chars: hgetc (EOF) returned %d", c); + if (hclose(fin) != 0) fail("preloading hclose(test/hfile_chars.tmp) for reading"); + + char* test_string = strdup("Test string"); + fin = hopen("mem:", "r:", test_string, 12); + if (fin == NULL) fail("hopen(\"mem:\", \"r:\", ...)"); + if (hread(fin, buffer, 12) != 12) + fail("hopen('mem:', 'r') failed read"); + if(strcmp(buffer, test_string) != 0) + fail("hopen('mem:', 'r') missread '%s' != '%s'", buffer, test_string); + char* internal_buf; + size_t interval_buf_len; + if((internal_buf = hfile_mem_get_buffer(fin, &interval_buf_len)) == NULL){ + fail("hopen('mem:', 'r') failed to get internal buffer"); + } + if (hclose(fin) != 0) fail("hclose mem for reading"); + + test_string = strdup("Test string"); + fin = hopen("mem:", "wr:", test_string, 12); + if (fin == NULL) fail("hopen(\"mem:\", \"w:\", ...)"); + if (hseek(fin, -1, SEEK_END) < 0) + fail("hopen('mem:', 'wr') failed seek"); + if (hwrite(fin, " extra", 7) != 7) + fail("hopen('mem:', 'wr') failed write"); + if (hseek(fin, 0, SEEK_SET) < 0) + fail("hopen('mem:', 'wr') failed seek"); + if (hread(fin, buffer, 18) != 18) + fail("hopen('mem:', 'wr') failed read"); + if (strcmp(buffer, "Test string extra") != 0) + fail("hopen('mem:', 'wr') misswrote '%s' != '%s'", buffer, "Test string extra"); + if((internal_buf = hfile_mem_steal_buffer(fin, &interval_buf_len)) == NULL){ + fail("hopen('mem:', 'wr') failed to get internal buffer"); + } + free(internal_buf); + if (hclose(fin) != 0) fail("hclose mem for writing"); + fin = hopen("data:,hello, world!%0A", "r"); if (fin == NULL) fail("hopen(\"data:...\")"); n = hread(fin, buffer, 300); diff -Nru htslib-1.6/test/hts_endian.c htslib-1.7/test/hts_endian.c --- htslib-1.6/test/hts_endian.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/test/hts_endian.c 2018-01-26 12:01:45.000000000 +0000 @@ -102,7 +102,7 @@ T64(0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1, 1), T64(0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 256, 256), T64(0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 65536, 65536), - T64(0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 4294967296, 4294967296), + T64(0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 4294967296LL, 4294967296ULL), T64(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f, 9223372036854775807LL, 9223372036854775807ULL), // Odd coding of signed result below avoids a compiler warning diff -Nru htslib-1.6/test/test_view.c htslib-1.7/test/test_view.c --- htslib-1.6/test/test_view.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/test/test_view.c 2018-01-26 12:01:45.000000000 +0000 @@ -30,6 +30,7 @@ #include #include #include +#include #include "cram/cram.h" @@ -58,8 +59,9 @@ int extra_hdr_nuls = 0; int benchmark = 0; int nthreads = 0; // shared pool + int multi_reg = 0; - while ((c = getopt(argc, argv, "DSIt:i:bCl:o:N:BZ:@:")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bCl:o:N:BZ:@:M")) >= 0) { switch (c) { case 'D': flag |= READ_CRAM; break; case 'S': flag |= READ_COMPRESSED; break; @@ -73,6 +75,7 @@ case 'N': nreads = atoi(optarg); break; case 'B': benchmark = 1; break; case 'Z': extra_hdr_nuls = atoi(optarg); break; + case 'M': multi_reg = 1; break; case '@': nthreads = atoi(optarg); break; } } @@ -92,8 +95,10 @@ fprintf(stderr, "-N: num_reads: limit the output to the first num_reads reads\n"); fprintf(stderr, "\n"); fprintf(stderr, "-B: enable benchmarking\n"); + fprintf(stderr, "-M: use hts_itr_multi iterator\n"); fprintf(stderr, "-Z hdr_nuls: append specified number of null bytes to the SAM header\n"); - fprintf(stderr, "-@ num_threads: use thread pool with specified number of threads\n"); + fprintf(stderr, "-@ num_threads: use thread pool with specified number of threads\n\n"); + fprintf(stderr, "The region list entries should be specified as 'reg:beg-end', with intervals of a region being disjunct and sorted by the starting coordinate.\n"); return 1; } strcpy(moder, "r"); @@ -185,13 +190,69 @@ fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } - for (i = optind + 1; i < argc; ++i) { - hts_itr_t *iter; - if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { - fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); - continue; + if (multi_reg) { + int reg_count = 0; + hts_reglist_t *reg_list = calloc(argc-(optind+1), sizeof(*reg_list)); + if (!reg_list) + return 1; + + // We need a public function somewhere to turn an array of region strings + // into a region list, but for testing this will suffice for now. + // Consider moving a derivation of this into htslib proper sometime. + for (i = optind + 1; i < argc; ++i) { + int j; + uint32_t beg, end; + char *cp = strrchr(argv[i], ':'); + if (cp) *cp = 0; + + for (j = 0; j < reg_count; j++) + if (strcmp(reg_list[j].reg, argv[i]) == 0) + break; + if (j == reg_count) { + reg_list[reg_count++].reg = argv[i]; + if (strcmp(".", argv[i]) == 0) { + reg_list[j].tid = HTS_IDX_START; + + } else if (strcmp("*", argv[i]) == 0) { + reg_list[j].tid = HTS_IDX_NOCOOR; + + } else { + int k; // need the header API here! + for (k = 0; k < h->n_targets; k++) + if (strcmp(h->target_name[k], argv[i]) == 0) + break; + if (k == h->n_targets) + return 1; + reg_list[j].tid = k; + reg_list[j].min_beg = h->target_len[k]; + reg_list[j].max_end = 0; + } + } + + hts_reglist_t *r = ®_list[j]; + r->intervals = realloc(r->intervals, ++r->count * sizeof(*r->intervals)); + if (!r->intervals) + return 1; + beg = 1; + end = r->tid >= 0 ? h->target_len[r->tid] : 0; + if (cp) { + *cp = 0; + // hts_parse_reg() is better, but awkward here + sscanf(cp+1, "%d-%d", &beg, &end); + } + r->intervals[r->count-1].beg = beg-1; // BED syntax + r->intervals[r->count-1].end = end; + + if (r->min_beg > beg) + r->min_beg = beg; + if (r->max_end < end) + r->max_end = end; } - while ((r = sam_itr_next(in, iter, b)) >= 0) { + + hts_itr_multi_t *iter = sam_itr_regions(idx, h, reg_list, reg_count); + if (!iter) + return 1; + while ((r = sam_itr_multi_next(in, iter, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; @@ -200,7 +261,25 @@ if (nreads && --nreads == 0) break; } - hts_itr_destroy(iter); + hts_itr_multi_destroy(iter); + } else { + for (i = optind + 1; i < argc; ++i) { + hts_itr_t *iter; + if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { + fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); + continue; + } + while ((r = sam_itr_next(in, iter, b)) >= 0) { + if (!benchmark && sam_write1(out, h, b) < 0) { + fprintf(stderr, "Error writing output.\n"); + exit_code = 1; + break; + } + if (nreads && --nreads == 0) + break; + } + hts_itr_destroy(iter); + } } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { diff -Nru htslib-1.6/textutils.c htslib-1.7/textutils.c --- htslib-1.6/textutils.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/textutils.c 2018-01-26 12:01:45.000000000 +0000 @@ -215,6 +215,22 @@ } } +hts_json_token * hts_json_alloc_token() { + return calloc(1, sizeof(hts_json_token)); +} + +char hts_json_token_type(hts_json_token *token) { + return token->type; +} + +void hts_json_free_token(hts_json_token *token) { + free(token); +} + +char *hts_json_token_str(hts_json_token *token) { + return token->str; +} + char hts_json_snext(char *str, size_t *state, hts_json_token *token) { char *s = &str[*state >> 2]; diff -Nru htslib-1.6/textutils_internal.h htslib-1.7/textutils_internal.h --- htslib-1.6/textutils_internal.h 1970-01-01 00:00:00.000000000 +0000 +++ htslib-1.7/textutils_internal.h 2018-01-26 12:01:45.000000000 +0000 @@ -0,0 +1,175 @@ +/* textutils_internal.h -- non-bioinformatics utility routines for text etc. + + Copyright (C) 2016,2018 Genome Research Ltd. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTSLIB_TEXTUTILS_INTERNAL_H +#define HTSLIB_TEXTUTILS_INTERNAL_H + +/* N.B. These interfaces may be used by plug-ins */ + +#include +#include +#include "htslib/kstring.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// Decode percent-encoded (URL-encoded) text +/** On input, _dest_ should be a buffer at least the same size as _s_, + and may be equal to _s_ to decode in place. On output, _dest_ will be + NUL-terminated and the number of characters written (not including the + NUL) is stored in _destlen_. +*/ +int hts_decode_percent(char *dest, size_t *destlen, const char *s); + +/// Return decoded data length given length of base64-encoded text +/** This gives an upper bound, as it overestimates by a byte or two when + the encoded text ends with (possibly omitted) `=` padding characters. +*/ +size_t hts_base64_decoded_length(size_t len); + +/// Decode base64-encoded data +/** On input, _dest_ should be a sufficient buffer (see `hts_base64_length()`), + and may be equal to _s_ to decode in place. On output, the number of + bytes writen is stored in _destlen_. +*/ +int hts_decode_base64(char *dest, size_t *destlen, const char *s); + +/// Token structure returned by JSON lexing functions +/** Structure is defined in hts_internal.h + */ + +typedef struct hts_json_token hts_json_token; + +/// Allocate an empty JSON token structure, for use with hts_json_* functions +/** @return An empty token on success; NULL on failure + */ +hts_json_token * hts_json_alloc_token(); + +/// Free a JSON token +void hts_json_free_token(hts_json_token *token); + +/// Accessor funtion to get JSON token type +/** @param token Pointer to JSON token + @return Character indicating the token type + +Token types correspond to scalar JSON values and selected punctuation +as follows: + - `s` string + - `n` number + - `b` boolean literal + - `.` null literal + - `{`, `}`, `[`, `]` object and array delimiters + - `?` lexing error + - `!` other errors (e.g. out of memory) + - `\0` terminator at end of input +*/ +char hts_json_token_type(hts_json_token *token); + +/// Accessor funtion to get JSON token in string form +/** @param token Pointer to JSON token + @return String representation of the JSON token; NULL if unset + +If the token was parsed from a string using hts_json_snext(), the return value +will point into the string passed as the first parameter to hts_json_snext(). +If the token was parsed from a file using hts_json_fnext(), the return value +will point at the kstring_t buffer passed as the third parameter to +hts_json_fnext(). In that case, the value will only be valid until the +next call to hts_json_fnext(). + */ +char *hts_json_token_str(hts_json_token *token); + +/// Read one JSON token from a string +/** @param str The input C string + @param state The input string state + @param token On return, filled in with the token read + @return The type of the token read + +On return, `token->str` points into the supplied input string, which +is modified by having token-terminating characters overwritten as NULs. +The `state` argument records the current position within `str` after each +`hts_json_snext()` call, and should be set to 0 before the first call. +*/ +char hts_json_snext(char *str, size_t *state, hts_json_token *token); + +/// Read and discard a complete JSON value from a string +/** @param str The input C string + @param state The input string state, as per `hts_json_snext()` + @param type If the first token of the value to be discarded has already + been read, provide its type; otherwise `'\0'` + @return One of `v` (success), `\0` (end of string), and `?` (lexing error) + +Skips a complete JSON value, which may be a single token or an entire object +or array. +*/ +char hts_json_sskip_value(char *str, size_t *state, char type); + +/// Read one JSON token from a file +/** @param fp The file stream + @param token On return, filled in with the token read + @param kstr Buffer used to store the token string returned + @return The type of the token read + +The `kstr` buffer is used to store the string value of the token read, +so `token->str` is only valid until the next time `hts_json_fnext()` is +called with the same `kstr` argument. +*/ +char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); + +/// Read and discard a complete JSON value from a file +/** @param fp The file stream + @param type If the first token of the value to be discarded has already + been read, provide its type; otherwise `'\0'` + @return One of `v` (success), `\0` (EOF), and `?` (lexing error) + +Skips a complete JSON value, which may be a single token or an entire object +or array. +*/ +char hts_json_fskip_value(struct hFILE *fp, char type); + + +// The functions operate on ints such as are returned by fgetc(), +// i.e., characters represented as unsigned-char-valued ints, or EOF. +// To operate on plain chars (and to avoid warnings on some platforms), +// technically one must cast to unsigned char everywhere (see CERT STR37-C) +// or less painfully use these *_c() functions that operate on plain chars +// (but not EOF, which must be considered separately where it is applicable). +// TODO We may eventually wish to implement these functions directly without +// using their equivalents, and thus make them immune to locales. +static inline int isalnum_c(char c) { return isalnum((unsigned char) c); } +static inline int isalpha_c(char c) { return isalpha((unsigned char) c); } +static inline int isdigit_c(char c) { return isdigit((unsigned char) c); } +static inline int isgraph_c(char c) { return isgraph((unsigned char) c); } +static inline int islower_c(char c) { return islower((unsigned char) c); } +static inline int isprint_c(char c) { return isprint((unsigned char) c); } +static inline int isspace_c(char c) { return isspace((unsigned char) c); } +static inline int isupper_c(char c) { return isupper((unsigned char) c); } +static inline char tolower_c(char c) { return tolower((unsigned char) c); } +static inline char toupper_c(char c) { return toupper((unsigned char) c); } + +#ifdef __cplusplus +} +#endif + +#endif diff -Nru htslib-1.6/thread_pool.c htslib-1.7/thread_pool.c --- htslib-1.6/thread_pool.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/thread_pool.c 2018-01-26 12:01:45.000000000 +0000 @@ -850,7 +850,7 @@ } /* - * Resets a process to the intial state. + * Resets a process to the initial state. * * This removes any queued up input jobs, disables any notification of * new results/output, flushes what is left and then discards any @@ -1084,7 +1084,7 @@ // Check for results. if ((r = hts_tpool_next_result(q))) { - printf("RESULT: %d\n", *(int *)r->data); + printf("RESULT: %d\n", *(int *)hts_tpool_result_data(r)); hts_tpool_delete_result(r, 1); } if (blk == -1) { @@ -1100,7 +1100,7 @@ hts_tpool_process_flush(q); while ((r = hts_tpool_next_result(q))) { - printf("RESULT: %d\n", *(int *)r->data); + printf("RESULT: %d\n", *(int *)hts_tpool_result_data(r)); hts_tpool_delete_result(r, 1); } @@ -1152,7 +1152,7 @@ // Consume all results until we find the end-of-job marker. for(;;) { hts_tpool_result *r = hts_tpool_next_result_wait(q); - int x = *(int *)r->data; + int x = *(int *)hts_tpool_result_data(r); hts_tpool_delete_result(r, 1); if (x == -1) break; @@ -1175,7 +1175,7 @@ /*----------------------------------------------------------------------------- * A simple pipeline test. - * We use a dediocated input thread that does the initial generation of job + * We use a dedicated input thread that does the initial generation of job * and dispatch, several execution steps running in a shared pool, and a * dedicated output thread that prints up the final result. It's key that our * pipeline execution stages can run independently and don't themselves have @@ -1253,7 +1253,7 @@ hts_tpool_result *r; while ((r = hts_tpool_next_result_wait(o->q1))) { - pipe_job *j = (pipe_job *)r->data; + pipe_job *j = (pipe_job *)hts_tpool_result_data(r); hts_tpool_delete_result(r, 0); if (hts_tpool_dispatch(j->o->p, j->o->q2, pipe_stage2, j) != 0) pthread_exit((void *)1); @@ -1279,7 +1279,7 @@ hts_tpool_result *r; while ((r = hts_tpool_next_result_wait(o->q2))) { - pipe_job *j = (pipe_job *)r->data; + pipe_job *j = (pipe_job *)hts_tpool_result_data(r); hts_tpool_delete_result(r, 0); if (hts_tpool_dispatch(j->o->p, j->o->q3, pipe_stage3, j) != 0) pthread_exit((void *)1); @@ -1303,7 +1303,7 @@ hts_tpool_result *r; while ((r = hts_tpool_next_result_wait(o->q3))) { - pipe_job *j = (pipe_job *)r->data; + pipe_job *j = (pipe_job *)hts_tpool_result_data(r); int eof = j->eof; printf("O %08x\n", j->x); hts_tpool_delete_result(r, 1); diff -Nru htslib-1.6/vcf.c htslib-1.7/vcf.c --- htslib-1.6/vcf.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/vcf.c 2018-01-26 12:01:45.000000000 +0000 @@ -1474,12 +1474,20 @@ dst->n_info = src->n_info; dst->n_allele = src->n_allele; dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample; - dst->shared.m = dst->shared.l = src->shared.l; - dst->shared.s = (char*) malloc(dst->shared.l); + if ( dst->shared.m < src->shared.l ) + { + dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l); + dst->shared.m = src->shared.l; + } + dst->shared.l = src->shared.l; memcpy(dst->shared.s,src->shared.s,dst->shared.l); - dst->indiv.m = dst->indiv.l = src->indiv.l; - dst->indiv.s = (char*) malloc(dst->indiv.l); + if ( dst->indiv.m < src->indiv.l ) + { + dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l); + dst->indiv.m = src->indiv.l; + } + dst->indiv.l = src->indiv.l; memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l); return dst; diff -Nru htslib-1.6/version.sh htslib-1.7/version.sh --- htslib-1.6/version.sh 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/version.sh 2018-01-26 12:01:45.000000000 +0000 @@ -1,7 +1,7 @@ #!/bin/sh # Master version, for use in tarballs or non-git source copies -VERSION=1.6 +VERSION=1.7 # If we have a git clone, then check against the current tag if [ -e .git ] diff -Nru htslib-1.6/win/rand.c htslib-1.7/win/rand.c --- htslib-1.6/win/rand.c 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/win/rand.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,98 +0,0 @@ -/* rand.c -- drand48 implementation from the FreeBSD source tree. */ - -// This file is an amalgamation of the many small files in FreeBSD to do with -// drand48 and friends implementations. -// It comprises _rand48.c, rand48.h, srand48.c, drand48.c, erand48.c, lrand48.c - -/* - * Copyright (c) 1993 Martin Birgmeier - * All rights reserved. - * - * You may redistribute unmodified or modified versions of this source - * code provided that the above copyright notice and this and the - * following conditions are retained. - * - * This software is provided ``as is'', and comes with no warranties - * of any kind. I shall in no event be liable for anything that happens - * to anyone/anything when using this software. - */ - -//#include -//__FBSDID("$FreeBSD: src/lib/libc/gen/_rand48.c,v 1.2 2002/03/22 21:52:05 obrien Exp $"); - -#include -#include "win/rand.h" - -#define RAND48_SEED_0 (0x330e) -#define RAND48_SEED_1 (0xabcd) -#define RAND48_SEED_2 (0x1234) -#define RAND48_MULT_0 (0xe66d) -#define RAND48_MULT_1 (0xdeec) -#define RAND48_MULT_2 (0x0005) -#define RAND48_ADD (0x000b) - -static unsigned short _rand48_seed[3] = { - RAND48_SEED_0, - RAND48_SEED_1, - RAND48_SEED_2 -}; -static unsigned short _rand48_mult[3] = { - RAND48_MULT_0, - RAND48_MULT_1, - RAND48_MULT_2 -}; -static unsigned short _rand48_add = RAND48_ADD; - -static void -_dorand48(unsigned short xseed[3]) -{ - unsigned long accu; - unsigned short temp[2]; - - accu = (unsigned long) _rand48_mult[0] * (unsigned long) xseed[0] + - (unsigned long) _rand48_add; - temp[0] = (unsigned short) accu; /* lower 16 bits */ - accu >>= sizeof(unsigned short) * 8; - accu += (unsigned long) _rand48_mult[0] * (unsigned long) xseed[1] + - (unsigned long) _rand48_mult[1] * (unsigned long) xseed[0]; - temp[1] = (unsigned short) accu; /* middle 16 bits */ - accu >>= sizeof(unsigned short) * 8; - accu += _rand48_mult[0] * xseed[2] + _rand48_mult[1] * xseed[1] + _rand48_mult[2] * xseed[0]; - xseed[0] = temp[0]; - xseed[1] = temp[1]; - xseed[2] = (unsigned short) accu; -} - -void -hts_srand48(long seed) -{ - _rand48_seed[0] = RAND48_SEED_0; - _rand48_seed[1] = (unsigned short) seed; - _rand48_seed[2] = (unsigned short) (seed >> 16); - _rand48_mult[0] = RAND48_MULT_0; - _rand48_mult[1] = RAND48_MULT_1; - _rand48_mult[2] = RAND48_MULT_2; - _rand48_add = RAND48_ADD; -} - -double -hts_erand48(unsigned short xseed[3]) -{ - _dorand48(xseed); - return ldexp((double) xseed[0], -48) + - ldexp((double) xseed[1], -32) + - ldexp((double) xseed[2], -16); -} - -double -hts_drand48(void) -{ - return hts_erand48(_rand48_seed); -} - -long -hts_lrand48(void) -{ - _dorand48(_rand48_seed); - return ((long) _rand48_seed[2] << 15) + ((long) _rand48_seed[1] >> 1); -} diff -Nru htslib-1.6/win/rand.h htslib-1.7/win/rand.h --- htslib-1.6/win/rand.h 2017-09-28 11:20:23.000000000 +0000 +++ htslib-1.7/win/rand.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -/* rand.h -- drand48 implementation from the FreeBSD source tree. */ - -/* - * Copyright (c) 1993 Martin Birgmeier - * All rights reserved. - * - * You may redistribute unmodified or modified versions of this source - * code provided that the above copyright notice and this and the - * following conditions are retained. - * - * This software is provided ``as is'', and comes with no warranties - * of any kind. I shall in no event be liable for anything that happens - * to anyone/anything when using this software. - */ - -#ifndef HTSLIB_HTS_RAND_H -#define HTSLIB_HTS_RAND_H - -void hts_srand48(long seed); -double hts_erand48(unsigned short xseed[3]); -double hts_drand48(void); -long hts_lrand48(void); - -#endif /* HTSLIB_HTS_RAND_H */