diff -Nru tophat-2.1.1+dfsg/debian/changelog tophat-2.1.1+dfsg1/debian/changelog --- tophat-2.1.1+dfsg/debian/changelog 2017-10-26 17:24:58.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/changelog 2017-12-06 15:13:31.000000000 +0000 @@ -1,8 +1,36 @@ -tophat (2.1.1+dfsg-3build1) bionic; urgency=medium +tophat (2.1.1+dfsg1-1) unstable; urgency=medium - * No-change rebuild for boost soname change. + * Team upload - -- Matthias Klose Thu, 26 Oct 2017 17:24:58 +0000 + * debian/upstream/metadata: + - Added references to registries + - yamllint cleanliness + + [ Nadiya Sitdykova ] + * add info about debian/tests/test_data + + [ Fabian Klötzl ] + * remove convenience copy of samtools. + Closes: #780816 + * adapt tophat script to recent samtools version + + [ Andreas Tille ] + * Remove unused license paragraph + * Standards-Version: 4.1.1 + * Really remove samtools code copy also from source tarball + * Versioned Depends: samtools (>= 1.5) + * d/watch: enable numbered dfsg-appendices + + -- Fabian Klötzl Wed, 06 Dec 2017 16:13:31 +0100 + +tophat (2.1.1+dfsg-4) unstable; urgency=medium + + * Build-Depends: bowtie + Architecture: any + Closes: #873866 + * Standards-Version: 4.1.0 (no changes needed) + * Autoreconf is default with debhelper 10 + + -- Andreas Tille Mon, 04 Sep 2017 13:36:19 +0200 tophat (2.1.1+dfsg-3) unstable; urgency=medium diff -Nru tophat-2.1.1+dfsg/debian/control tophat-2.1.1+dfsg1/debian/control --- tophat-2.1.1+dfsg/debian/control 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/control 2017-12-06 15:13:31.000000000 +0000 @@ -6,26 +6,26 @@ Section: science Priority: optional Build-Depends: debhelper (>= 10), - autotools-dev, libbam-dev, zlib1g-dev, - dh-autoreconf, python, seqan-dev (>= 1.4), libboost-system-dev, libboost-thread-dev, - help2man -Standards-Version: 4.0.0 + help2man, + bowtie +Standards-Version: 4.1.1 Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/tophat.git Vcs-Git: https://anonscm.debian.org/git/debian-med/tophat.git Homepage: http://ccb.jhu.edu/software/tophat Package: tophat -Architecture: amd64 kfreebsd-amd64 +Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, python, - bowtie2 | bowtie + bowtie2 | bowtie, + samtools (>= 1.5) Suggests: cufflinks Description: fast splice junction mapper for RNA-Seq reads TopHat aligns RNA-Seq reads to mammalian-sized genomes using the ultra diff -Nru tophat-2.1.1+dfsg/debian/copyright tophat-2.1.1+dfsg1/debian/copyright --- tophat-2.1.1+dfsg/debian/copyright 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/copyright 2017-12-06 15:13:31.000000000 +0000 @@ -1,4 +1,4 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: TopHat Upstream-Contact: Daehwan Kim Source: http://ccb.jhu.edu/software/tophat/index.shtml @@ -10,6 +10,7 @@ */src/SeqAn* */src/intervaltree* */src/sortedcontainers* + */src/samtools-0.1.18* Files: * Copyright: © 2003-2010 Cole Trapnell et al @@ -37,23 +38,11 @@ This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. -Files: src/samtools-0.1.18/* -Copyright: © 2008–2010, Genome Research Ltd. (GRL) - © 2003–2006, 2008–2010, by Heng Li - © 2008–2010 Broad Institute - © 2010 Illumina, Inc. -License: MIT - Files: debian/* Copyright: © 2011 Carlos Borroto © 2016 Alex Mestiashvili License: GPL-2+ -License: GAP - Copying and distribution of this file, with or without modification, are - permitted in any medium without royalty provided the copyright notice - and this notice are preserved. - License: Artistic This program is free software; you can redistribute it and/or modify it under the terms of the Artistic License, which comes with Perl. @@ -61,6 +50,11 @@ On Debian systems, the complete text of the Artistic License can be found in `/usr/share/common-licenses/Artistic'. +License: GAP + Copying and distribution of this file, with or without modification, are + permitted in any medium without royalty provided the copyright notice + and this notice are preserved. + License: GPL-2+ This package is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -77,22 +71,3 @@ . On Debian systems, the complete text of the GNU General Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". - -License: MIT - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - . - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - . - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. diff -Nru tophat-2.1.1+dfsg/debian/patches/fix-compatibility-with-recent-samtools.patch tophat-2.1.1+dfsg1/debian/patches/fix-compatibility-with-recent-samtools.patch --- tophat-2.1.1+dfsg/debian/patches/fix-compatibility-with-recent-samtools.patch 1970-01-01 00:00:00.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/patches/fix-compatibility-with-recent-samtools.patch 2017-12-06 15:13:31.000000000 +0000 @@ -0,0 +1,58 @@ +From: =?utf-8?q?Fabian_Kl=C3=B6tzl?= +Date: Wed, 6 Dec 2017 15:32:12 +0100 +Subject: fix compatibility with recent samtools + +--- + src/tophat.py | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/src/tophat.py b/src/tophat.py +index 1eed276..4b8bbb1 100755 +--- a/src/tophat.py ++++ b/src/tophat.py +@@ -182,7 +182,7 @@ use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only w + unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the + # unmapped reads into a compressed file + +-samtools_path = "samtools_0.1.18" ++samtools_path = "samtools" + bowtie_path = None + fail_str = "\t[FAILED]\n" + gtf_juncs = None #file name with junctions extracted from given GFF file +@@ -1162,7 +1162,7 @@ def nonzeroFile(filepath): + head_cmd = ["head", "-1"] + head = subprocess.Popen(head_cmd, stdin=samtools_view.stdout, stdout=subprocess.PIPE) + +- samtools_view.stdout.close() # as per http://bugs.python.org/issue7678 ++ # samtools_view.stdout.close() # as per http://bugs.python.org/issue7678 + output = head.communicate()[0][:-1] + + if len(output) > 0: +@@ -1569,7 +1569,7 @@ def get_samtools_version(): + def check_samtools(): + #th_log("Checking for Samtools") + global samtools_path +- samtools_path=prog_path("samtools_0.1.18") ++ samtools_path=prog_path("samtools") + #samtools_version_str, samtools_version_arr = get_samtools_version() + #if samtools_version_str == None: + if not samtools_path: +@@ -2744,16 +2744,16 @@ def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, + + if params.report_params.sort_bam: + pids = [0 for i in range(num_bam_parts)] +- sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] ++ sorted_bam_parts = ["%s%d_sorted.bam" % (alignments_output_filename, i) for i in range(num_bam_parts)] + #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] + #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] + for i in range(num_bam_parts): + bamsort_cmd = [samtools_path, + "sort", + bam_parts[i], ++ "-o", + sorted_bam_parts[i]] + +- sorted_bam_parts[i] += ".bam" + print >> run_log, " ".join(bamsort_cmd) + + if i + 1 < num_bam_parts: diff -Nru tophat-2.1.1+dfsg/debian/patches/hardening4samtools.patch tophat-2.1.1+dfsg1/debian/patches/hardening4samtools.patch --- tophat-2.1.1+dfsg/debian/patches/hardening4samtools.patch 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/patches/hardening4samtools.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -Description: Don't overwrite CFLAGS, add CPPFLAGS and LDFLAGS fot the embedded -copy of samtools -Author: Alex Mestiashvili ---- tophat.orig/src/samtools-0.1.18/Makefile -+++ tophat/src/samtools-0.1.18/Makefile -@@ -1,5 +1,5 @@ - CC= gcc --CFLAGS= -g -Wall -O2 #-m64 #-arch ppc -+ - DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0 - KNETFILE_O= knetfile.o - LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ -@@ -41,7 +41,7 @@ - $(AR) -csru $@ $(LOBJS) - - samtools_0.1.18:lib-recur $(AOBJS) -- $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf -lm -lz #$(LIBCURSES) -+ $(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf -lm -lz $(LDFLAGS) #$(LIBCURSES) - - razip:razip.o razf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz diff -Nru tophat-2.1.1+dfsg/debian/patches/remove-convenience-copy-of-samtools.patch tophat-2.1.1+dfsg1/debian/patches/remove-convenience-copy-of-samtools.patch --- tophat-2.1.1+dfsg/debian/patches/remove-convenience-copy-of-samtools.patch 1970-01-01 00:00:00.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/patches/remove-convenience-copy-of-samtools.patch 2017-12-06 15:13:31.000000000 +0000 @@ -0,0 +1,224 @@ +From: =?utf-8?q?Fabian_Kl=C3=B6tzl?= +Date: Mon, 4 Dec 2017 12:28:43 +0100 +Subject: remove convenience copy of samtools + +--- + configure.ac | 10 ---- + src/Makefile.am | 125 ++++++-------------------------------------------------- + 2 files changed + +diff --git a/src/Makefile.am b/src/Makefile.am +index e983fee..986e321 100644 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -16,90 +16,6 @@ sortedcontainers/sorteddict.py \ + sortedcontainers/sortedlist.py \ + sortedcontainers/sortedlistwithkey.py \ + sortedcontainers/sortedset.py \ +-samtools-0.1.18/AUTHORS \ +-samtools-0.1.18/COPYING \ +-samtools-0.1.18/ChangeLog \ +-samtools-0.1.18/INSTALL \ +-samtools-0.1.18/Makefile \ +-samtools-0.1.18/Makefile.mingw \ +-samtools-0.1.18/NEWS \ +-samtools-0.1.18/bam.c \ +-samtools-0.1.18/bam.h \ +-samtools-0.1.18/bam2bcf.c \ +-samtools-0.1.18/bam2bcf.h \ +-samtools-0.1.18/bam2bcf_indel.c \ +-samtools-0.1.18/bam2depth.c \ +-samtools-0.1.18/bam_aux.c \ +-samtools-0.1.18/bam_cat.c \ +-samtools-0.1.18/bam_color.c \ +-samtools-0.1.18/bam_endian.h \ +-samtools-0.1.18/bam_import.c \ +-samtools-0.1.18/bam_index.c \ +-samtools-0.1.18/bam_lpileup.c \ +-samtools-0.1.18/bam_mate.c \ +-samtools-0.1.18/bam_md.c \ +-samtools-0.1.18/bam_pileup.c \ +-samtools-0.1.18/bam_plcmd.c \ +-samtools-0.1.18/bam_reheader.c \ +-samtools-0.1.18/bam_rmdup.c \ +-samtools-0.1.18/bam_rmdupse.c \ +-samtools-0.1.18/bam_sort.c \ +-samtools-0.1.18/bam_stat.c \ +-samtools-0.1.18/bam_tview.c \ +-samtools-0.1.18/bamtk.c \ +-samtools-0.1.18/bedidx.c \ +-samtools-0.1.18/bgzf.c \ +-samtools-0.1.18/bgzf.h \ +-samtools-0.1.18/bgzip.c \ +-samtools-0.1.18/cut_target.c \ +-samtools-0.1.18/errmod.c \ +-samtools-0.1.18/errmod.h \ +-samtools-0.1.18/faidx.c \ +-samtools-0.1.18/faidx.h \ +-samtools-0.1.18/kaln.c \ +-samtools-0.1.18/kaln.h \ +-samtools-0.1.18/khash.h \ +-samtools-0.1.18/klist.h \ +-samtools-0.1.18/knetfile.c \ +-samtools-0.1.18/knetfile.h \ +-samtools-0.1.18/kprobaln.c \ +-samtools-0.1.18/kprobaln.h \ +-samtools-0.1.18/kseq.h \ +-samtools-0.1.18/ksort.h \ +-samtools-0.1.18/kstring.c \ +-samtools-0.1.18/kstring.h \ +-samtools-0.1.18/phase.c \ +-samtools-0.1.18/razf.c \ +-samtools-0.1.18/razf.h \ +-samtools-0.1.18/razip.c \ +-samtools-0.1.18/sam.c \ +-samtools-0.1.18/sam.h \ +-samtools-0.1.18/sam_header.c \ +-samtools-0.1.18/sam_header.h \ +-samtools-0.1.18/sam_view.c \ +-samtools-0.1.18/sample.c \ +-samtools-0.1.18/sample.h \ +-samtools-0.1.18/samtools.1 \ +-samtools-0.1.18/bcftools/Makefile \ +-samtools-0.1.18/bcftools/README \ +-samtools-0.1.18/bcftools/bcf.c \ +-samtools-0.1.18/bcftools/bcf.h \ +-samtools-0.1.18/bcftools/bcf.tex \ +-samtools-0.1.18/bcftools/bcf2qcall.c \ +-samtools-0.1.18/bcftools/bcfutils.c \ +-samtools-0.1.18/bcftools/call1.c \ +-samtools-0.1.18/bcftools/em.c \ +-samtools-0.1.18/bcftools/fet.c \ +-samtools-0.1.18/bcftools/index.c \ +-samtools-0.1.18/bcftools/kfunc.c \ +-samtools-0.1.18/bcftools/kmin.c \ +-samtools-0.1.18/bcftools/kmin.h \ +-samtools-0.1.18/bcftools/main.c \ +-samtools-0.1.18/bcftools/mut.c \ +-samtools-0.1.18/bcftools/prob1.c \ +-samtools-0.1.18/bcftools/prob1.h \ +-samtools-0.1.18/bcftools/vcf.c \ +-samtools-0.1.18/bcftools/vcfutils.pl \ + SeqAn-1.4.2/LICENSE \ + SeqAn-1.4.2/README.rst \ + SeqAn-1.4.2/seqan/align/align_base.h \ +@@ -683,17 +599,11 @@ SeqAn-1.4.2/seqan/system/system_sema.h \ + SeqAn-1.4.2/seqan/system/system_thread.h \ + SeqAn-1.4.2/seqan/version.h + +-SAMDIR = ./samtools-0.1.18 +-SAMLIB = libbam.a +-SAMPROG = samtools_0.1.18 + BAM_LIB = -lbam +-BAM_CPPFLAGS = -I$(SAMDIR) +-BAM_LDFLAGS = -L$(SAMDIR) + + #-- progs to be installed in $prefix/bin + + bin_PROGRAMS = \ +- $(SAMPROG) \ + prep_reads \ + gtf_to_fasta \ + fix_map_ordering \ +@@ -732,7 +642,7 @@ tophat: tophat.py + sed -e 's|__VERSION__|$(VERSION)|' tophat.py > tophat && chmod 755 tophat + + #-- tophat library for linking convienence +-noinst_LIBRARIES = $(SAMLIB) libgc.a libtophat.a ++noinst_LIBRARIES = libgc.a libtophat.a + + noinst_HEADERS = \ + reads.h \ +@@ -798,58 +708,49 @@ libgc_a_SOURCES = \ + + prep_reads_SOURCES = prep_reads.cpp + prep_reads_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) +-prep_reads_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++prep_reads_LDFLAGS = $(LDFLAGS) + + segment_juncs_SOURCES = segment_juncs.cpp + segment_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIBS) $(BOOST_SYSTEM_LIB) $(BAM_LIB) +-segment_juncs_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) $(BOOST_LDFLAGS) ++segment_juncs_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) + + long_spanning_reads_SOURCES = long_spanning_reads.cpp + long_spanning_reads_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIBS) $(BOOST_SYSTEM_LIB) $(BAM_LIB) +-long_spanning_reads_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) $(BOOST_LDFLAGS) ++long_spanning_reads_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) + + gtf_juncs_SOURCES = gtf_juncs.cpp + gtf_juncs_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) +-gtf_juncs_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++gtf_juncs_LDFLAGS = $(LDFLAGS) + + juncs_db_SOURCES = juncs_db.cpp + juncs_db_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) +-juncs_db_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++juncs_db_LDFLAGS = $(LDFLAGS) + + tophat_reports_SOURCES = tophat_reports.cpp + tophat_reports_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIBS) $(BOOST_SYSTEM_LIB) $(BAM_LIB) +-tophat_reports_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) $(BOOST_LDFLAGS) ++tophat_reports_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) + + fix_map_ordering_SOURCES = fix_map_ordering.cpp + fix_map_ordering_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) +-fix_map_ordering_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++fix_map_ordering_LDFLAGS = $(LDFLAGS) + + bam2fastx_SOURCES = bam2fastx.cpp + bam2fastx_LDADD = $(top_builddir)/src/libgc.a $(BAM_LIB) +-bam2fastx_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++bam2fastx_LDFLAGS = $(LDFLAGS) + + bam_merge_SOURCES = bam_merge.cpp + bam_merge_LDADD = $(top_builddir)/src/libtophat.a $(top_builddir)/src/libgc.a $(BAM_LIB) +-bam_merge_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++bam_merge_LDFLAGS = $(LDFLAGS) + + sam_juncs_SOURCES = sam_juncs.cpp + sam_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) +-sam_juncs_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++sam_juncs_LDFLAGS = $(LDFLAGS) + + map2gtf_SOURCES = map2gtf.cpp + map2gtf_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) +-map2gtf_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++map2gtf_LDFLAGS = $(LDFLAGS) + + gtf_to_fasta_SOURCES = GTFToFasta.cpp FastaTools.cpp + gtf_to_fasta_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) +-gtf_to_fasta_LDFLAGS = $(BAM_LDFLAGS) $(LDFLAGS) ++gtf_to_fasta_LDFLAGS = $(LDFLAGS) + +- +-libbam_a_SOURCES = +-samtools_0_1_18_SOURCES = +- +-$(SAMPROG): $(SAMLIB) +- +- +-$(SAMLIB): +- cd $(SAMDIR) && make $(SAMPROG) && cp $(SAMLIB) $(SAMPROG) .. + +diff --git a/configure.ac b/configure.ac +index dd5ac83..1a7f32b 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -38,16 +38,6 @@ if test -z "$BOOST_THREAD_LIBS"; then + fi + + +-# BAM related: +- ac_bam_path=samtools-0.1.18 +- BAM_LIB="-lbam" +- BAM_LDFLAGS="-L./$ac_bam_path" +- BAM_CPPFLAGS="-I./$ac_bam_path" +- AC_SUBST(BAM_CPPFLAGS) +- AC_SUBST(BAM_LDFLAGS) +- AC_SUBST(BAM_LIB) +- +- + # Checks for header files. + AC_CHECK_HEADERS([stdlib.h string.h unistd.h]) + diff -Nru tophat-2.1.1+dfsg/debian/patches/series tophat-2.1.1+dfsg1/debian/patches/series --- tophat-2.1.1+dfsg/debian/patches/series 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/patches/series 2017-12-06 15:13:31.000000000 +0000 @@ -1,4 +1,5 @@ -hardening4samtools.patch remove_3rd_party_code.patch fix_includes_path.patch fix-gcc6.patch +remove-convenience-copy-of-samtools.patch +fix-compatibility-with-recent-samtools.patch diff -Nru tophat-2.1.1+dfsg/debian/README.source tophat-2.1.1+dfsg1/debian/README.source --- tophat-2.1.1+dfsg/debian/README.source 1970-01-01 00:00:00.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/README.source 2017-12-06 15:13:31.000000000 +0000 @@ -0,0 +1,6 @@ +Tophat +========= + +The files in debian/tests/test_data was retrieved from +Johns Hopkins University CCB website. +URL: http://ccb.jhu.edu/software/tophat/downloads/test_data.tar.gz diff -Nru tophat-2.1.1+dfsg/debian/rules tophat-2.1.1+dfsg1/debian/rules --- tophat-2.1.1+dfsg/debian/rules 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/rules 2017-12-06 15:13:31.000000000 +0000 @@ -11,7 +11,7 @@ DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) %: - dh $@ --with autoreconf --no-parallel + dh $@ --no-parallel override_dh_auto_configure: dh_auto_configure -- --with-boost-libdir=/usr/lib/$(DEB_HOST_MULTIARCH) diff -Nru tophat-2.1.1+dfsg/debian/upstream/metadata tophat-2.1.1+dfsg1/debian/upstream/metadata --- tophat-2.1.1+dfsg/debian/upstream/metadata 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/upstream/metadata 2017-12-06 15:13:31.000000000 +0000 @@ -13,5 +13,13 @@ DOI: 10.1093/bioinformatics/btp120 PMID: 19289445 URL: http://bioinformatics.oxfordjournals.org/content/25/9/1105.short - eprint: http://bioinformatics.oxfordjournals.org/content/25/9/1105.full.pdf+html + eprint: "http://bioinformatics.oxfordjournals.org/content/\ + 25/9/1105.full.pdf+html" license: Open Access +Registry: + - Name: OMICtools + Entry: OMICS_01257 + - Name: bio.tools + Entry: tophat + - Name: RRID + Entry: SCR_013035 diff -Nru tophat-2.1.1+dfsg/debian/watch tophat-2.1.1+dfsg1/debian/watch --- tophat-2.1.1+dfsg/debian/watch 2017-07-15 01:15:50.000000000 +0000 +++ tophat-2.1.1+dfsg1/debian/watch 2017-12-06 15:13:31.000000000 +0000 @@ -1,3 +1,3 @@ version=3 -opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g" \ +opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg\d*//g" \ http://ccb.jhu.edu/software/tophat/downloads/tophat-([-0-9.]*\w?)\.tar\.gz diff -Nru tophat-2.1.1+dfsg/src/intervaltree/__init__.py tophat-2.1.1+dfsg1/src/intervaltree/__init__.py --- tophat-2.1.1+dfsg/src/intervaltree/__init__.py 2016-02-14 18:21:17.342079000 +0000 +++ tophat-2.1.1+dfsg1/src/intervaltree/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,22 +0,0 @@ -""" -intervaltree: A mutable, self-balancing interval tree for Python 2 and 3. -Queries may be by point, by range overlap, or by range envelopment. - -Root package. - -Copyright 2013-2015 Chaim-Leib Halbert - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from .interval import Interval -from .intervaltree import IntervalTree diff -Nru tophat-2.1.1+dfsg/src/intervaltree/interval.py tophat-2.1.1+dfsg1/src/intervaltree/interval.py --- tophat-2.1.1+dfsg/src/intervaltree/interval.py 2016-02-14 18:21:17.350079000 +0000 +++ tophat-2.1.1+dfsg1/src/intervaltree/interval.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,302 +0,0 @@ -""" -intervaltree: A mutable, self-balancing interval tree for Python 2 and 3. -Queries may be by point, by range overlap, or by range envelopment. - -Interval class - -Copyright 2013-2015 Chaim-Leib Halbert -Modifications copyright 2014 Konstantin Tretyakov - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from numbers import Number -from collections import namedtuple - - -# noinspection PyBroadException -class Interval(namedtuple('IntervalBase', ['begin', 'end', 'data'])): - __slots__ = () # Saves memory, avoiding the need to create __dict__ for each interval - - def __new__(cls, begin, end, data=None): - return super(Interval, cls).__new__(cls, begin, end, data) - - def overlaps(self, begin, end=None): - """ - Whether the interval overlaps the given point, range or Interval. - :param begin: beginning point of the range, or the point, or an Interval - :param end: end point of the range. Optional if not testing ranges. - :return: True or False - :rtype: bool - """ - if end is not None: - return ( - (begin <= self.begin < end) or - (begin < self.end <= end) or - (self.begin <= begin < self.end) or - (self.begin < end <= self.end) - ) - try: - return self.overlaps(begin.begin, begin.end) - except: - return self.contains_point(begin) - - def contains_point(self, p): - """ - Whether the Interval contains p. - :param p: a point - :return: True or False - :rtype: bool - """ - return self.begin <= p < self.end - - def range_matches(self, other): - """ - Whether the begins equal and the ends equal. Compare __eq__(). - :param other: Interval - :return: True or False - :rtype: bool - """ - return ( - self.begin == other.begin and - self.end == other.end - ) - - def contains_interval(self, other): - """ - Whether other is contained in this Interval. - :param other: Interval - :return: True or False - :rtype: bool - """ - return ( - self.begin <= other.begin and - self.end >= other.end - ) - - def distance_to(self, other): - """ - Returns the size of the gap between intervals, or 0 - if they touch or overlap. - :param other: Interval or point - :return: distance - :rtype: Number - """ - if self.overlaps(other): - return 0 - try: - if self.begin < other.begin: - return other.begin - self.end - else: - return self.begin - other.end - except: - if self.end < other: - return other - self.end - else: - return self.begin - other - - def is_null(self): - """ - Whether this equals the null interval. - :return: True if end <= begin else False - :rtype: bool - """ - return self.begin >= self.end - - def length(self): - """ - The distance covered by this Interval. - :return: length - :type: Number - """ - if self.is_null(): - return 0 - return self.end - self.begin - - def __hash__(self): - """ - Depends on begin and end only. - :return: hash - :rtype: Number - """ - return hash((self.begin, self.end)) - - def __eq__(self, other): - """ - Whether the begins equal, the ends equal, and the data fields - equal. Compare range_matches(). - :param other: Interval - :return: True or False - :rtype: bool - """ - return ( - self.begin == other.begin and - self.end == other.end and - self.data == other.data - ) - - def __cmp__(self, other): - """ - Tells whether other sorts before, after or equal to this - Interval. - - Sorting is by begins, then by ends, then by data fields. - - If data fields are not both sortable types, data fields are - compared alphabetically by type name. - :param other: Interval - :return: -1, 0, 1 - :rtype: int - """ - s = self[0:2] - try: - o = other[0:2] - except: - o = (other,) - if s != o: - return -1 if s < o else 1 - try: - if self.data == other.data: - return 0 - return -1 if self.data < other.data else 1 - except TypeError: - s = type(self.data).__name__ - o = type(other.data).__name__ - if s == o: - return 0 - return -1 if s < o else 1 - - def __lt__(self, other): - """ - Less than operator. Parrots __cmp__() - :param other: Interval or point - :return: True or False - :rtype: bool - """ - return self.__cmp__(other) < 0 - - def __gt__(self, other): - """ - Greater than operator. Parrots __cmp__() - :param other: Interval or point - :return: True or False - :rtype: bool - """ - return self.__cmp__(other) > 0 - - def _raise_if_null(self, other): - """ - :raises ValueError: if either self or other is a null Interval - """ - if self.is_null(): - raise ValueError("Cannot compare null Intervals!") - if hasattr(other, 'is_null') and other.is_null(): - raise ValueError("Cannot compare null Intervals!") - - def lt(self, other): - """ - Strictly less than. Returns True if no part of this Interval - extends higher than or into other. - :raises ValueError: if either self or other is a null Interval - :param other: Interval or point - :return: True or False - :rtype: bool - """ - self._raise_if_null(other) - return self.end <= getattr(other, 'begin', other) - - def le(self, other): - """ - Less than or overlaps. Returns True if no part of this Interval - extends higher than other. - :raises ValueError: if either self or other is a null Interval - :param other: Interval or point - :return: True or False - :rtype: bool - """ - self._raise_if_null(other) - return self.end <= getattr(other, 'end', other) - - def gt(self, other): - """ - Strictly greater than. Returns True if no part of this Interval - extends lower than or into other. - :raises ValueError: if either self or other is a null Interval - :param other: Interval or point - :return: True or False - :rtype: bool - """ - self._raise_if_null(other) - if hasattr(other, 'end'): - return self.begin >= other.end - else: - return self.begin > other - - def ge(self, other): - """ - Greater than or overlaps. Returns True if no part of this Interval - extends lower than other. - :raises ValueError: if either self or other is a null Interval - :param other: Interval or point - :return: True or False - :rtype: bool - """ - self._raise_if_null(other) - return self.begin >= getattr(other, 'begin', other) - - def _get_fields(self): - """ - Used by str, unicode, repr and __reduce__. - - Returns only the fields necessary to reconstruct the Interval. - :return: reconstruction info - :rtype: tuple - """ - if self.data is not None: - return self.begin, self.end, self.data - else: - return self.begin, self.end - - def __repr__(self): - """ - Executable string representation of this Interval. - :return: string representation - :rtype: str - """ - if isinstance(self.begin, Number): - s_begin = str(self.begin) - s_end = str(self.end) - else: - s_begin = repr(self.begin) - s_end = repr(self.end) - if self.data is None: - return "Interval({0}, {1})".format(s_begin, s_end) - else: - return "Interval({0}, {1}, {2})".format(s_begin, s_end, repr(self.data)) - - __str__ = __repr__ - - def copy(self): - """ - Shallow copy. - :return: copy of self - :rtype: Interval - """ - return Interval(self.begin, self.end, self.data) - - def __reduce__(self): - """ - For pickle-ing. - :return: pickle data - :rtype: tuple - """ - return Interval, self._get_fields() diff -Nru tophat-2.1.1+dfsg/src/intervaltree/intervaltree.py tophat-2.1.1+dfsg1/src/intervaltree/intervaltree.py --- tophat-2.1.1+dfsg/src/intervaltree/intervaltree.py 2016-02-14 18:21:17.352079000 +0000 +++ tophat-2.1.1+dfsg1/src/intervaltree/intervaltree.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,947 +0,0 @@ -""" -intervaltree: A mutable, self-balancing interval tree for Python 2 and 3. -Queries may be by point, by range overlap, or by range envelopment. - -Core logic. - -Copyright 2013-2015 Chaim-Leib Halbert -Modifications Copyright 2014 Konstantin Tretyakov - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from .interval import Interval -from .node import Node -from numbers import Number -import collections -from sortedcontainers import SortedDict -from warnings import warn - -try: - xrange # Python 2? -except NameError: - xrange = range - - -# noinspection PyBroadException -class IntervalTree(collections.MutableSet): - """ - A binary lookup tree of intervals. - The intervals contained in the tree are represented using ``Interval(a, b, data)`` objects. - Each such object represents a half-open interval ``[a, b)`` with optional data. - - Examples: - --------- - - Initialize a blank tree:: - - >>> tree = IntervalTree() - >>> tree - IntervalTree() - - Initialize a tree from an iterable set of Intervals in O(n * log n):: - - >>> tree = IntervalTree([Interval(-10, 10), Interval(-20.0, -10.0)]) - >>> tree - IntervalTree([Interval(-20.0, -10.0), Interval(-10, 10)]) - >>> len(tree) - 2 - - Note that this is a set, i.e. repeated intervals are ignored. However, - Intervals with different data fields are regarded as different:: - - >>> tree = IntervalTree([Interval(-10, 10), Interval(-10, 10), Interval(-10, 10, "x")]) - >>> tree - IntervalTree([Interval(-10, 10), Interval(-10, 10, 'x')]) - >>> len(tree) - 2 - - Insertions:: - >>> tree = IntervalTree() - >>> tree[0:1] = "data" - >>> tree.add(Interval(10, 20)) - >>> tree.addi(19.9, 20) - >>> tree - IntervalTree([Interval(0, 1, 'data'), Interval(10, 20), Interval(19.9, 20)]) - >>> tree.update([Interval(19.9, 20.1), Interval(20.1, 30)]) - >>> len(tree) - 5 - - Inserting the same Interval twice does nothing:: - >>> tree = IntervalTree() - >>> tree[-10:20] = "arbitrary data" - >>> tree[-10:20] = None # Note that this is also an insertion - >>> tree - IntervalTree([Interval(-10, 20), Interval(-10, 20, 'arbitrary data')]) - >>> tree[-10:20] = None # This won't change anything - >>> tree[-10:20] = "arbitrary data" # Neither will this - >>> len(tree) - 2 - - Deletions:: - >>> tree = IntervalTree(Interval(b, e) for b, e in [(-10, 10), (-20, -10), (10, 20)]) - >>> tree - IntervalTree([Interval(-20, -10), Interval(-10, 10), Interval(10, 20)]) - >>> tree.remove(Interval(-10, 10)) - >>> tree - IntervalTree([Interval(-20, -10), Interval(10, 20)]) - >>> tree.remove(Interval(-10, 10)) - Traceback (most recent call last): - ... - ValueError - >>> tree.discard(Interval(-10, 10)) # Same as remove, but no exception on failure - >>> tree - IntervalTree([Interval(-20, -10), Interval(10, 20)]) - - Delete intervals, overlapping a given point:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> tree.remove_overlap(1.1) - >>> tree - IntervalTree([Interval(-1.1, 1.1)]) - - Delete intervals, overlapping an interval:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> tree.remove_overlap(0, 0.5) - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - >>> tree.remove_overlap(1.7, 1.8) - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - >>> tree.remove_overlap(1.6, 1.6) # Null interval does nothing - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - >>> tree.remove_overlap(1.6, 1.5) # Ditto - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - - Delete intervals, enveloped in the range:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> tree.remove_envelop(-1.0, 1.5) - >>> tree - IntervalTree([Interval(-1.1, 1.1), Interval(0.5, 1.7)]) - >>> tree.remove_envelop(-1.1, 1.5) - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - >>> tree.remove_envelop(0.5, 1.5) - >>> tree - IntervalTree([Interval(0.5, 1.7)]) - >>> tree.remove_envelop(0.5, 1.7) - >>> tree - IntervalTree() - - Point/interval overlap queries:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> assert tree[-1.1] == set([Interval(-1.1, 1.1)]) - >>> assert tree.search(1.1) == set([Interval(-0.5, 1.5), Interval(0.5, 1.7)]) # Same as tree[1.1] - >>> assert tree[-0.5:0.5] == set([Interval(-0.5, 1.5), Interval(-1.1, 1.1)]) # Interval overlap query - >>> assert tree.search(1.5, 1.5) == set() # Same as tree[1.5:1.5] - >>> assert tree.search(1.5) == set([Interval(0.5, 1.7)]) # Same as tree[1.5] - - >>> assert tree.search(1.7, 1.8) == set() - - Envelop queries:: - - >>> assert tree.search(-0.5, 0.5, strict=True) == set() - >>> assert tree.search(-0.4, 1.7, strict=True) == set([Interval(0.5, 1.7)]) - - Membership queries:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> Interval(-0.5, 0.5) in tree - False - >>> Interval(-1.1, 1.1) in tree - True - >>> Interval(-1.1, 1.1, "x") in tree - False - >>> tree.overlaps(-1.1) - True - >>> tree.overlaps(1.7) - False - >>> tree.overlaps(1.7, 1.8) - False - >>> tree.overlaps(-1.2, -1.1) - False - >>> tree.overlaps(-1.2, -1.0) - True - - Sizing:: - - >>> tree = IntervalTree([Interval(-1.1, 1.1), Interval(-0.5, 1.5), Interval(0.5, 1.7)]) - >>> len(tree) - 3 - >>> tree.is_empty() - False - >>> IntervalTree().is_empty() - True - >>> not tree - False - >>> not IntervalTree() - True - >>> print(tree.begin()) # using print() because of floats in Python 2.6 - -1.1 - >>> print(tree.end()) # ditto - 1.7 - - Iteration:: - - >>> tree = IntervalTree([Interval(-11, 11), Interval(-5, 15), Interval(5, 17)]) - >>> [iv.begin for iv in sorted(tree)] - [-11, -5, 5] - >>> assert tree.items() == set([Interval(-5, 15), Interval(-11, 11), Interval(5, 17)]) - - Copy- and typecasting, pickling:: - - >>> tree0 = IntervalTree([Interval(0, 1, "x"), Interval(1, 2, ["x"])]) - >>> tree1 = IntervalTree(tree0) # Shares Interval objects - >>> tree2 = tree0.copy() # Shallow copy (same as above, as Intervals are singletons) - >>> import pickle - >>> tree3 = pickle.loads(pickle.dumps(tree0)) # Deep copy - >>> list(tree0[1])[0].data[0] = "y" # affects shallow copies, but not deep copies - >>> tree0 - IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])]) - >>> tree1 - IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])]) - >>> tree2 - IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['y'])]) - >>> tree3 - IntervalTree([Interval(0, 1, 'x'), Interval(1, 2, ['x'])]) - - Equality testing:: - - >>> IntervalTree([Interval(0, 1)]) == IntervalTree([Interval(0, 1)]) - True - >>> IntervalTree([Interval(0, 1)]) == IntervalTree([Interval(0, 1, "x")]) - False - """ - @classmethod - def from_tuples(cls, tups): - """ - Create a new IntervalTree from an iterable of 2- or 3-tuples, - where the tuple lists begin, end, and optionally data. - """ - ivs = [Interval(*t) for t in tups] - return IntervalTree(ivs) - - def __init__(self, intervals=None): - """ - Set up a tree. If intervals is provided, add all the intervals - to the tree. - - Completes in O(n*log n) time. - """ - intervals = set(intervals) if intervals is not None else set() - for iv in intervals: - if iv.is_null(): - raise ValueError( - "IntervalTree: Null Interval objects not allowed in IntervalTree:" - " {0}".format(iv) - ) - self.all_intervals = intervals - self.top_node = Node.from_intervals(self.all_intervals) - self.boundary_table = SortedDict() - for iv in self.all_intervals: - self._add_boundaries(iv) - - def copy(self): - """ - Construct a new IntervalTree using shallow copies of the - intervals in the source tree. - - Completes in O(n*log n) time. - :rtype: IntervalTree - """ - return IntervalTree(iv.copy() for iv in self) - - def _add_boundaries(self, interval): - """ - Records the boundaries of the interval in the boundary table. - """ - begin = interval.begin - end = interval.end - if begin in self.boundary_table: - self.boundary_table[begin] += 1 - else: - self.boundary_table[begin] = 1 - - if end in self.boundary_table: - self.boundary_table[end] += 1 - else: - self.boundary_table[end] = 1 - - def _remove_boundaries(self, interval): - """ - Removes the boundaries of the interval from the boundary table. - """ - begin = interval.begin - end = interval.end - if self.boundary_table[begin] == 1: - del self.boundary_table[begin] - else: - self.boundary_table[begin] -= 1 - - if self.boundary_table[end] == 1: - del self.boundary_table[end] - else: - self.boundary_table[end] -= 1 - - def add(self, interval): - """ - Adds an interval to the tree, if not already present. - - Completes in O(log n) time. - """ - if interval in self: - return - - if interval.is_null(): - raise ValueError( - "IntervalTree: Null Interval objects not allowed in IntervalTree:" - " {0}".format(interval) - ) - - if not self.top_node: - self.top_node = Node.from_interval(interval) - else: - self.top_node = self.top_node.add(interval) - self.all_intervals.add(interval) - self._add_boundaries(interval) - append = add - - def addi(self, begin, end, data=None): - """ - Shortcut for add(Interval(begin, end, data)). - - Completes in O(log n) time. - """ - return self.add(Interval(begin, end, data)) - appendi = addi - - def update(self, intervals): - """ - Given an iterable of intervals, add them to the tree. - - Completes in O(m*log(n+m), where m = number of intervals to - add. - """ - for iv in intervals: - self.add(iv) - - def extend(self, intervals): - """ - Deprecated: Replaced by update(). - """ - warn("IntervalTree.extend() has been deprecated. Consider using update() instead", DeprecationWarning) - self.update(intervals) - - def remove(self, interval): - """ - Removes an interval from the tree, if present. If not, raises - ValueError. - - Completes in O(log n) time. - """ - #self.verify() - if interval not in self: - #print(self.all_intervals) - raise ValueError - self.top_node = self.top_node.remove(interval) - self.all_intervals.remove(interval) - self._remove_boundaries(interval) - #self.verify() - - def removei(self, begin, end, data=None): - """ - Shortcut for remove(Interval(begin, end, data)). - - Completes in O(log n) time. - """ - return self.remove(Interval(begin, end, data)) - - def discard(self, interval): - """ - Removes an interval from the tree, if present. If not, does - nothing. - - Completes in O(log n) time. - """ - if interval not in self: - return - self.all_intervals.discard(interval) - self.top_node = self.top_node.discard(interval) - self._remove_boundaries(interval) - - def discardi(self, begin, end, data=None): - """ - Shortcut for discard(Interval(begin, end, data)). - - Completes in O(log n) time. - """ - return self.discard(Interval(begin, end, data)) - - def difference(self, other): - """ - Returns a new tree, comprising all intervals in self but not - in other. - """ - ivs = set() - for iv in self: - if iv not in other: - ivs.add(iv) - return IntervalTree(ivs) - - def difference_update(self, other): - """ - Removes all intervals in other from self. - """ - for iv in other: - self.discard(iv) - - def union(self, other): - """ - Returns a new tree, comprising all intervals from self - and other. - """ - return IntervalTree(set(self).union(other)) - - def intersection(self, other): - """ - Returns a new tree of all intervals common to both self and - other. - """ - ivs = set() - shorter, longer = sorted([self, other], key=len) - for iv in shorter: - if iv in longer: - ivs.add(iv) - return IntervalTree(ivs) - - def intersection_update(self, other): - """ - Removes intervals from self unless they also exist in other. - """ - for iv in self: - if iv not in other: - self.remove(iv) - - def symmetric_difference(self, other): - """ - Return a tree with elements only in self or other but not - both. - """ - if not isinstance(other, set): other = set(other) - me = set(self) - ivs = me - other + (other - me) - return IntervalTree(ivs) - - def symmetric_difference_update(self, other): - """ - Throws out all intervals except those only in self or other, - not both. - """ - other = set(other) - for iv in self: - if iv in other: - self.remove(iv) - other.remove(iv) - self.update(other) - - def remove_overlap(self, begin, end=None): - """ - Removes all intervals overlapping the given point or range. - - Completes in O((r+m)*log n) time, where: - * n = size of the tree - * m = number of matches - * r = size of the search range (this is 1 for a point) - """ - hitlist = self.search(begin, end) - for iv in hitlist: - self.remove(iv) - - def remove_envelop(self, begin, end): - """ - Removes all intervals completely enveloped in the given range. - - Completes in O((r+m)*log n) time, where: - * n = size of the tree - * m = number of matches - * r = size of the search range (this is 1 for a point) - """ - hitlist = self.search(begin, end, strict=True) - for iv in hitlist: - self.remove(iv) - - def chop(self, begin, end, datafunc=None): - """ - Like remove_envelop(), but trims back Intervals hanging into - the chopped area so that nothing overlaps. - """ - insertions = set() - begin_hits = [iv for iv in self[begin] if iv.begin < begin] - end_hits = [iv for iv in self[end] if iv.end > end] - - if datafunc: - for iv in begin_hits: - insertions.add(Interval(iv.begin, begin, datafunc(iv, True))) - for iv in end_hits: - insertions.add(Interval(end, iv.end, datafunc(iv, False))) - else: - for iv in begin_hits: - insertions.add(Interval(iv.begin, begin, iv.data)) - for iv in end_hits: - insertions.add(Interval(end, iv.end, iv.data)) - - self.remove_envelop(begin, end) - self.difference_update(begin_hits) - self.difference_update(end_hits) - self.update(insertions) - - def slice(self, point, datafunc=None): - """ - Split Intervals that overlap point into two new Intervals. if - specified, uses datafunc(interval, islower=True/False) to - set the data field of the new Intervals. - :param point: where to slice - :param datafunc(interval, isupper): callable returning a new - value for the interval's data field - """ - hitlist = set(iv for iv in self[point] if iv.begin < point) - insertions = set() - if datafunc: - for iv in hitlist: - insertions.add(Interval(iv.begin, point, datafunc(iv, True))) - insertions.add(Interval(point, iv.end, datafunc(iv, False))) - else: - for iv in hitlist: - insertions.add(Interval(iv.begin, point, iv.data)) - insertions.add(Interval(point, iv.end, iv.data)) - self.difference_update(hitlist) - self.update(insertions) - - def clear(self): - """ - Empties the tree. - - Completes in O(1) tine. - """ - self.__init__() - - def find_nested(self): - """ - Returns a dictionary mapping parent intervals to sets of - intervals overlapped by and contained in the parent. - - Completes in O(n^2) time. - :rtype: dict of [Interval, set of Interval] - """ - result = {} - - def add_if_nested(): - if parent.contains_interval(child): - if parent not in result: - result[parent] = set() - result[parent].add(child) - - long_ivs = sorted(self.all_intervals, key=Interval.length, reverse=True) - for i, parent in enumerate(long_ivs): - for child in long_ivs[i + 1:]: - add_if_nested() - return result - - def overlaps(self, begin, end=None): - """ - Returns whether some interval in the tree overlaps the given - point or range. - - Completes in O(r*log n) time, where r is the size of the - search range. - :rtype: bool - """ - if end is not None: - return self.overlaps_range(begin, end) - elif isinstance(begin, Number): - return self.overlaps_point(begin) - else: - return self.overlaps_range(begin.begin, begin.end) - - def overlaps_point(self, p): - """ - Returns whether some interval in the tree overlaps p. - - Completes in O(log n) time. - :rtype: bool - """ - if self.is_empty(): - return False - return bool(self.top_node.contains_point(p)) - - def overlaps_range(self, begin, end): - """ - Returns whether some interval in the tree overlaps the given - range. - - Completes in O(r*log n) time, where r is the range length and n - is the table size. - :rtype: bool - """ - if self.is_empty(): - return False - elif self.overlaps_point(begin): - return True - return any( - self.overlaps_point(bound) - for bound in self.boundary_table - if begin <= bound < end - ) - - def split_overlaps(self): - """ - Finds all intervals with overlapping ranges and splits them - along the range boundaries. - - Completes in worst-case O(n^2*log n) time (many interval - boundaries are inside many intervals), best-case O(n*log n) - time (small number of overlaps << n per interval). - """ - if not self: - return - if len(self.boundary_table) == 2: - return - - bounds = sorted(self.boundary_table) # get bound locations - - new_ivs = set() - for lbound, ubound in zip(bounds[:-1], bounds[1:]): - for iv in self[lbound]: - new_ivs.add(Interval(lbound, ubound, iv.data)) - - self.__init__(new_ivs) - - def items(self): - """ - Constructs and returns a set of all intervals in the tree. - - Completes in O(n) time. - :rtype: set of Interval - """ - return set(self.all_intervals) - - def is_empty(self): - """ - Returns whether the tree is empty. - - Completes in O(1) time. - :rtype: bool - """ - return 0 == len(self) - - def search(self, begin, end=None, strict=False): - """ - Returns a set of all intervals overlapping the given range. Or, - if strict is True, returns the set of all intervals fully - contained in the range [begin, end]. - - Completes in O(m + k*log n) time, where: - * n = size of the tree - * m = number of matches - * k = size of the search range (this is 1 for a point) - :rtype: set of Interval - """ - root = self.top_node - if not root: - return set() - if end is None: - try: - iv = begin - return self.search(iv.begin, iv.end, strict=strict) - except: - return root.search_point(begin, set()) - elif begin >= end: - return set() - else: - result = root.search_point(begin, set()) - - boundary_table = self.boundary_table - bound_begin = boundary_table.bisect_left(begin) - bound_end = boundary_table.bisect_left(end) # exclude final end bound - result.update(root.search_overlap( - # slice notation is slightly slower - boundary_table.iloc[index] for index in xrange(bound_begin, bound_end) - )) - - # TODO: improve strict search to use node info instead of less-efficient filtering - if strict: - result = set( - iv for iv in result - if iv.begin >= begin and iv.end <= end - ) - return result - - def begin(self): - """ - Returns the lower bound of the first interval in the tree. - - Completes in O(n) time. - :rtype: Number - """ - if not self.boundary_table: - return 0 - return min(self.boundary_table) - - def end(self): - """ - Returns the upper bound of the last interval in the tree. - - Completes in O(n) time. - :rtype: Number - """ - if not self.boundary_table: - return 0 - return max(self.boundary_table) - - def print_structure(self, tostring=False): - """ - ## FOR DEBUGGING ONLY ## - Pretty-prints the structure of the tree. - If tostring is true, prints nothing and returns a string. - :rtype: None or str - """ - if self.top_node: - return self.top_node.print_structure(tostring=tostring) - else: - result = "" - if not tostring: - print(result) - else: - return result - - def verify(self): - """ - ## FOR DEBUGGING ONLY ## - Checks the table to ensure that the invariants are held. - """ - if self.all_intervals: - ## top_node.all_children() == self.all_intervals - try: - assert self.top_node.all_children() == self.all_intervals - except AssertionError as e: - print( - 'Error: the tree and the membership set are out of sync!' - ) - tivs = set(self.top_node.all_children()) - print('top_node.all_children() - all_intervals:') - pprint(tivs - self.all_intervals) - print('all_intervals - top_node.all_children():') - pprint(self.all_intervals - tivs) - raise e - - ## All members are Intervals - for iv in self: - assert isinstance(iv, Interval), ( - "Error: Only Interval objects allowed in IntervalTree:" - " {0}".format(iv) - ) - - ## No null intervals - for iv in self: - assert not iv.is_null(), ( - "Error: Null Interval objects not allowed in IntervalTree:" - " {0}".format(iv) - ) - - ## Reconstruct boundary_table - bound_check = {} - for iv in self: - if iv.begin in bound_check: - bound_check[iv.begin] += 1 - else: - bound_check[iv.begin] = 1 - if iv.end in bound_check: - bound_check[iv.end] += 1 - else: - bound_check[iv.end] = 1 - - ## Reconstructed boundary table (bound_check) ==? boundary_table - assert set(self.boundary_table.keys()) == set(bound_check.keys()),\ - 'Error: boundary_table is out of sync with ' \ - 'the intervals in the tree!' - - # For efficiency reasons this should be iteritems in Py2, but we - # don't care much for efficiency in debug methods anyway. - for key, val in self.boundary_table.items(): - assert bound_check[key] == val, \ - 'Error: boundary_table[{0}] should be {1},' \ - ' but is {2}!'.format( - key, bound_check[key], val) - - ## Internal tree structure - self.top_node.verify(set()) - else: - ## Verify empty tree - assert not self.boundary_table, \ - "Error: boundary table should be empty!" - assert self.top_node is None, \ - "Error: top_node isn't None!" - - def score(self, full_report=False): - """ - Returns a number between 0 and 1, indicating how suboptimal the tree - is. The lower, the better. Roughly, this number represents the - fraction of flawed Intervals in the tree. - :rtype: float - """ - if len(self) <= 2: - return 0.0 - - n = len(self) - m = self.top_node.count_nodes() - - def s_center_score(): - """ - Returns a normalized score, indicating roughly how many times - intervals share s_center with other intervals. Output is full-scale - from 0 to 1. - :rtype: float - """ - raw = n - m - maximum = n - 1 - return raw / float(maximum) - - report = { - "depth": self.top_node.depth_score(n, m), - "s_center": s_center_score(), - } - cumulative = max(report.values()) - report["_cumulative"] = cumulative - if full_report: - return report - return cumulative - - def __getitem__(self, index): - """ - Returns a set of all intervals overlapping the given index or - slice. - - Completes in O(k * log(n) + m) time, where: - * n = size of the tree - * m = number of matches - * k = size of the search range (this is 1 for a point) - :rtype: set of Interval - """ - try: - start, stop = index.start, index.stop - if start is None: - start = self.begin() - if stop is None: - return set(self) - if stop is None: - stop = self.end() - return self.search(start, stop) - except AttributeError: - return self.search(index) - - def __setitem__(self, index, value): - """ - Adds a new interval to the tree. A shortcut for - add(Interval(index.start, index.stop, value)). - - If an identical Interval object with equal range and data - already exists, does nothing. - - Completes in O(log n) time. - """ - self.addi(index.start, index.stop, value) - - def __delitem__(self, point): - """ - Delete all items overlapping point. - """ - self.remove_overlap(point) - - def __contains__(self, item): - """ - Returns whether item exists as an Interval in the tree. - This method only returns True for exact matches; for - overlaps, see the overlaps() method. - - Completes in O(1) time. - :rtype: bool - """ - # Removed point-checking code; it might trick the user into - # thinking that this is O(1), which point-checking isn't. - #if isinstance(item, Interval): - return item in self.all_intervals - #else: - # return self.contains_point(item) - - def containsi(self, begin, end, data=None): - """ - Shortcut for (Interval(begin, end, data) in tree). - - Completes in O(1) time. - :rtype: bool - """ - return Interval(begin, end, data) in self - - def __iter__(self): - """ - Returns an iterator over all the intervals in the tree. - - Completes in O(1) time. - :rtype: collections.Iterable[Interval] - """ - return self.all_intervals.__iter__() - iter = __iter__ - - def __len__(self): - """ - Returns how many intervals are in the tree. - - Completes in O(1) time. - :rtype: int - """ - return len(self.all_intervals) - - def __eq__(self, other): - """ - Whether two IntervalTrees are equal. - - Completes in O(n) time if sizes are equal; O(1) time otherwise. - :rtype: bool - """ - return ( - isinstance(other, IntervalTree) and - self.all_intervals == other.all_intervals - ) - - def __repr__(self): - """ - :rtype: str - """ - ivs = sorted(self) - if not ivs: - return "IntervalTree()" - else: - return "IntervalTree({0})".format(ivs) - - __str__ = __repr__ - - def __reduce__(self): - """ - For pickle-ing. - :rtype: tuple - """ - return IntervalTree, (sorted(self.all_intervals),) diff -Nru tophat-2.1.1+dfsg/src/intervaltree/node.py tophat-2.1.1+dfsg1/src/intervaltree/node.py --- tophat-2.1.1+dfsg/src/intervaltree/node.py 2016-02-14 18:21:17.353079000 +0000 +++ tophat-2.1.1+dfsg1/src/intervaltree/node.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,593 +0,0 @@ -""" -intervaltree: A mutable, self-balancing interval tree for Python 2 and 3. -Queries may be by point, by range overlap, or by range envelopment. - -Core logic: internal tree nodes. - -Copyright 2013-2015 Chaim-Leib Halbert -Modifications Copyright 2014 Konstantin Tretyakov - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from operator import attrgetter -from math import floor, log - - -def l2(num): - """ - log base 2 - :rtype real - """ - return log(num, 2) - - -class Node(object): - def __init__(self, - x_center=None, - s_center=set(), - left_node=None, - right_node=None): - self.x_center = x_center - self.s_center = set(s_center) - self.left_node = left_node - self.right_node = right_node - self.depth = 0 # will be set when rotated - self.balance = 0 # ditto - self.rotate() - - @classmethod - def from_interval(cls, interval): - """ - :rtype : Node - """ - center = interval.begin - return Node(center, [interval]) - - @classmethod - def from_intervals(cls, intervals): - """ - :rtype : Node - """ - if not intervals: - return None - node = Node() - node = node.init_from_sorted(sorted(intervals)) - return node - - def init_from_sorted(self, intervals): - if not intervals: - return None - center_iv = intervals[len(intervals) // 2] - self.x_center = center_iv.begin - self.s_center = set() - s_left = [] - s_right = [] - for k in intervals: - if k.end <= self.x_center: - s_left.append(k) - elif k.begin > self.x_center: - s_right.append(k) - else: - self.s_center.add(k) - self.left_node = Node.from_intervals(s_left) - self.right_node = Node.from_intervals(s_right) - return self.rotate() - - def center_hit(self, interval): - """Returns whether interval overlaps self.x_center.""" - return interval.contains_point(self.x_center) - - def hit_branch(self, interval): - """ - Assuming not center_hit(interval), return which branch - (left=0, right=1) interval is in. - """ - return interval.begin > self.x_center - - def refresh_balance(self): - """ - Recalculate self.balance and self.depth based on child node values. - """ - left_depth = self.left_node.depth if self.left_node else 0 - right_depth = self.right_node.depth if self.right_node else 0 - self.depth = 1 + max(left_depth, right_depth) - self.balance = right_depth - left_depth - - def compute_depth(self): - """ - Recursively computes true depth of the subtree. Should only - be needed for debugging. Unless something is wrong, the - depth field should reflect the correct depth of the subtree. - """ - left_depth = self.left_node.compute_depth() if self.left_node else 0 - right_depth = self.right_node.compute_depth() if self.right_node else 0 - return 1 + max(left_depth, right_depth) - - def rotate(self): - """ - Does rotating, if necessary, to balance this node, and - returns the new top node. - """ - self.refresh_balance() - if abs(self.balance) < 2: - return self - # balance > 0 is the heavy side - my_heavy = self.balance > 0 - child_heavy = self[my_heavy].balance > 0 - if my_heavy == child_heavy or self[my_heavy].balance == 0: - ## Heavy sides same - # self save - # save -> 1 self - # 1 - # - ## Heavy side balanced - # self save save - # save -> 1 self -> 1 self.rot() - # 1 2 2 - return self.srotate() - else: - return self.drotate() - - def srotate(self): - """Single rotation. Assumes that balance is +-2.""" - # self save save - # save 3 -> 1 self -> 1 self.rot() - # 1 2 2 3 - # - # self save save - # 3 save -> self 1 -> self.rot() 1 - # 2 1 3 2 - - #assert(self.balance != 0) - heavy = self.balance > 0 - light = not heavy - save = self[heavy] - #print("srotate: bal={},{}".format(self.balance, save.balance)) - #self.print_structure() - self[heavy] = save[light] # 2 - #assert(save[light]) - save[light] = self.rotate() # Needed to ensure the 2 and 3 are balanced under new subnode - - # Some intervals may overlap both self.x_center and save.x_center - # Promote those to the new tip of the tree - promotees = [iv for iv in save[light].s_center if save.center_hit(iv)] - if promotees: - for iv in promotees: - save[light] = save[light].remove(iv) # may trigger pruning - # TODO: Use Node.add() here, to simplify future balancing improvements. - # For now, this is the same as augmenting save.s_center, but that may - # change. - save.s_center.update(promotees) - save.refresh_balance() - return save - - def drotate(self): - # First rotation - my_heavy = self.balance > 0 - self[my_heavy] = self[my_heavy].srotate() - self.refresh_balance() - - # Second rotation - result = self.srotate() - - return result - - def add(self, interval): - """ - Returns self after adding the interval and balancing. - """ - if self.center_hit(interval): - self.s_center.add(interval) - return self - else: - direction = self.hit_branch(interval) - if not self[direction]: - self[direction] = Node.from_interval(interval) - self.refresh_balance() - return self - else: - self[direction] = self[direction].add(interval) - return self.rotate() - - def remove(self, interval): - """ - Returns self after removing the interval and balancing. - - If interval is not present, raise ValueError. - """ - # since this is a list, called methods can set this to [1], - # making it true - done = [] - return self.remove_interval_helper(interval, done, should_raise_error=True) - - def discard(self, interval): - """ - Returns self after removing interval and balancing. - - If interval is not present, do nothing. - """ - done = [] - return self.remove_interval_helper(interval, done, should_raise_error=False) - - def remove_interval_helper(self, interval, done, should_raise_error): - """ - Returns self after removing interval and balancing. - If interval doesn't exist, raise ValueError. - - This method may set done to [1] to tell all callers that - rebalancing has completed. - - See Eternally Confuzzled's jsw_remove_r function (lines 1-32) - in his AVL tree article for reference. - """ - #trace = interval.begin == 347 and interval.end == 353 - #if trace: print('\nRemoving from {} interval {}'.format( - # self.x_center, interval)) - if self.center_hit(interval): - #if trace: print('Hit at {}'.format(self.x_center)) - if not should_raise_error and interval not in self.s_center: - done.append(1) - #if trace: print('Doing nothing.') - return self - try: - # raises error if interval not present - this is - # desired. - self.s_center.remove(interval) - except: - self.print_structure() - raise KeyError(interval) - if self.s_center: # keep this node - done.append(1) # no rebalancing necessary - #if trace: print('Removed, no rebalancing.') - return self - - # If we reach here, no intervals are left in self.s_center. - # So, prune self. - return self.prune() - else: # interval not in s_center - direction = self.hit_branch(interval) - - if not self[direction]: - if should_raise_error: - raise ValueError - done.append(1) - return self - - #if trace: - # print('Descending to {} branch'.format( - # ['left', 'right'][direction] - # )) - self[direction] = self[direction].remove_interval_helper(interval, done, should_raise_error) - - # Clean up - if not done: - #if trace: - # print('Rotating {}'.format(self.x_center)) - # self.print_structure() - return self.rotate() - return self - - def search_overlap(self, point_list): - """ - Returns all intervals that overlap the point_list. - """ - result = set() - for j in point_list: - self.search_point(j, result) - return result - - def search_point(self, point, result): - """ - Returns all intervals that contain point. - """ - for k in self.s_center: - if k.begin <= point < k.end: - result.add(k) - if point < self.x_center and self[0]: - return self[0].search_point(point, result) - elif point > self.x_center and self[1]: - return self[1].search_point(point, result) - return result - - def prune(self): - """ - On a subtree where the root node's s_center is empty, - return a new subtree with no empty s_centers. - """ - if not self[0] or not self[1]: # if I have an empty branch - direction = not self[0] # graft the other branch here - #if trace: - # print('Grafting {} branch'.format( - # 'right' if direction else 'left')) - - result = self[direction] - #if result: result.verify() - return result - else: - # Replace the root node with the greatest predecessor. - heir, self[0] = self[0].pop_greatest_child() - #if trace: - # print('Replacing {} with {}.'.format( - # self.x_center, heir.x_center - # )) - # print('Removed greatest predecessor:') - # self.print_structure() - - #if self[0]: self[0].verify() - #if self[1]: self[1].verify() - - # Set up the heir as the new root node - (heir[0], heir[1]) = (self[0], self[1]) - #if trace: print('Setting up the heir:') - #if trace: heir.print_structure() - - # popping the predecessor may have unbalanced this node; - # fix it - heir.refresh_balance() - heir = heir.rotate() - #heir.verify() - #if trace: print('Rotated the heir:') - #if trace: heir.print_structure() - return heir - - def pop_greatest_child(self): - """ - Used when pruning a node with both a left and a right branch. - Returns (greatest_child, node), where: - * greatest_child is a new node to replace the removed node. - * node is the subtree after: - - removing the greatest child - - balancing - - moving overlapping nodes into greatest_child - - Assumes that self.s_center is not empty. - - See Eternally Confuzzled's jsw_remove_r function (lines 34-54) - in his AVL tree article for reference. - """ - #print('Popping from {}'.format(self.x_center)) - if not self.right_node: # This node is the greatest child. - # To reduce the chances of an overlap with a parent, return - # a child node containing the smallest possible number of - # intervals, as close as possible to the maximum bound. - ivs = sorted(self.s_center, key=attrgetter('end', 'begin')) - max_iv = ivs.pop() - new_x_center = self.x_center - while ivs: - next_max_iv = ivs.pop() - if next_max_iv.end == max_iv.end: continue - new_x_center = max(new_x_center, next_max_iv.end) - def get_new_s_center(): - for iv in self.s_center: - if iv.contains_point(new_x_center): yield iv - - # Create a new node with the largest x_center possible. - child = Node.from_intervals(get_new_s_center()) - # [iv for iv in self.s_center if iv.contains_point(child_x_center)] - # ) - child.x_center = new_x_center - self.s_center -= child.s_center - - #print('Pop hit! Returning child = {}'.format( - # child.print_structure(tostring=True) - # )) - #assert not child[0] - #assert not child[1] - - if self.s_center: - #print(' and returning newnode = {}'.format( self )) - #self.verify() - return child, self - else: - #print(' and returning newnode = {}'.format( self[0] )) - #if self[0]: self[0].verify() - return child, self[0] # Rotate left child up - - else: - #print('Pop descent to {}'.format(self[1].x_center)) - (greatest_child, self[1]) = self[1].pop_greatest_child() - self.refresh_balance() - new_self = self.rotate() - - # Move any overlaps into greatest_child - for iv in set(new_self.s_center): - if iv.contains_point(greatest_child.x_center): - new_self.s_center.remove(iv) - greatest_child.add(iv) - - #print('Pop Returning child = {}'.format( - # greatest_child.print_structure(tostring=True) - # )) - if new_self.s_center: - #print('and returning newnode = {}'.format( - # new_self.print_structure(tostring=True) - # )) - #new_self.verify() - return greatest_child, new_self - else: - new_self = new_self.prune() - #print('and returning prune = {}'.format( - # new_self.print_structure(tostring=True) - # )) - #if new_self: new_self.verify() - return greatest_child, new_self - - def contains_point(self, p): - """ - Returns whether this node or a child overlaps p. - """ - for iv in self.s_center: - if iv.contains_point(p): - return True - branch = self[p > self.x_center] - return branch and branch.contains_point(p) - - def all_children(self): - return self.all_children_helper(set()) - - def all_children_helper(self, result): - result.update(self.s_center) - if self[0]: - self[0].all_children_helper(result) - if self[1]: - self[1].all_children_helper(result) - return result - - def verify(self, parents=set()): - """ - ## DEBUG ONLY ## - Recursively ensures that the invariants of an interval subtree - hold. - """ - assert(isinstance(self.s_center, set)) - - bal = self.balance - assert abs(bal) < 2, \ - "Error: Rotation should have happened, but didn't! \n{}".format( - self.print_structure(tostring=True) - ) - self.refresh_balance() - assert bal == self.balance, \ - "Error: self.balance not set correctly! \n{}".format( - self.print_structure(tostring=True) - ) - - assert self.s_center, \ - "Error: s_center is empty! \n{}".format( - self.print_structure(tostring=True) - ) - for iv in self.s_center: - assert hasattr(iv, 'begin') - assert hasattr(iv, 'end') - assert iv.begin < iv.end - assert iv.overlaps(self.x_center) - for parent in sorted(parents): - assert not iv.contains_point(parent), \ - "Error: Overlaps ancestor ({})! \n{}\n\n{}".format( - parent, iv, self.print_structure(tostring=True) - ) - if self[0]: - assert self[0].x_center < self.x_center, \ - "Error: Out-of-order left child! {}".format(self.x_center) - self[0].verify(parents.union([self.x_center])) - if self[1]: - assert self[1].x_center > self.x_center, \ - "Error: Out-of-order right child! {}".format(self.x_center) - self[1].verify(parents.union([self.x_center])) - - def __getitem__(self, index): - """ - Returns the left child if input is equivalent to False, or - the right side otherwise. - """ - if index: - return self.right_node - else: - return self.left_node - - def __setitem__(self, key, value): - """Sets the left (0) or right (1) child.""" - if key: - self.right_node = value - else: - self.left_node = value - - def __str__(self): - """ - Shows info about this node. - - Since Nodes are internal data structures not revealed to the - user, I'm not bothering to make this copy-paste-executable as a - constructor. - """ - return "Node<{0}, depth={1}, balance={2}>".format( - self.x_center, - self.depth, - self.balance - ) - #fieldcount = 'c_count,has_l,has_r = <{}, {}, {}>'.format( - # len(self.s_center), - # bool(self.left_node), - # bool(self.right_node) - #) - #fields = [self.x_center, self.balance, fieldcount] - #return "Node({}, b={}, {})".format(*fields) - - def count_nodes(self): - """ - Count the number of Nodes in this subtree. - :rtype: int - """ - count = 1 - if self.left_node: - count += self.left_node.count_nodes() - if self.right_node: - count += self.right_node.count_nodes() - return count - - def depth_score(self, n, m): - """ - Calculates flaws in balancing the tree. - :param n: size of tree - :param m: number of Nodes in tree - :rtype: real - """ - if n == 0: - return 0.0 - - # dopt is the optimal maximum depth of the tree - dopt = 1 + int(floor(l2(m))) - f = 1 / float(1 + n - dopt) - return f * self.depth_score_helper(1, dopt) - - def depth_score_helper(self, d, dopt): - """ - Gets a weighted count of the number of Intervals deeper than dopt. - :param d: current depth, starting from 0 - :param dopt: optimal maximum depth of a leaf Node - :rtype: real - """ - # di is how may levels deeper than optimal d is - di = d - dopt - if di > 0: - count = di * len(self.s_center) - else: - count = 0 - if self.right_node: - count += self.right_node.depth_score_helper(d + 1, dopt) - if self.left_node: - count += self.left_node.depth_score_helper(d + 1, dopt) - return count - - def print_structure(self, indent=0, tostring=False): - """ - For debugging. - """ - nl = '\n' - sp = indent * ' ' - - rlist = [str(self) + nl] - if self.s_center: - for iv in sorted(self.s_center): - rlist.append(sp + ' ' + repr(iv) + nl) - if self.left_node: - rlist.append(sp + '<: ') # no CR - rlist.append(self.left_node.print_structure(indent + 1, True)) - if self.right_node: - rlist.append(sp + '>: ') # no CR - rlist.append(self.right_node.print_structure(indent + 1, True)) - result = ''.join(rlist) - if tostring: - return result - else: - print(result) diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/AUTHORS tophat-2.1.1+dfsg1/src/samtools-0.1.18/AUTHORS --- tophat-2.1.1+dfsg/src/samtools-0.1.18/AUTHORS 2016-02-14 18:21:17.371079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/AUTHORS 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ -Heng Li from the Sanger Institute wrote most of the initial source codes -of SAMtools and various converters. - -Bob Handsaker from the Broad Institute is a major contributor to the -SAM/BAM specification. He designed and implemented the BGZF format, the -underlying indexable compression format for the BAM format. BGZF does -not support arithmetic between file offsets. - -Jue Ruan for the Beijing Genome Institute designed and implemented the -RAZF format, an alternative indexable compression format. RAZF supports -arithmetic between file offsets, at the cost of increased index file -size and the full compatibility with gzip. RAZF is optional and only -used in `faidx' for indexing RAZF compressed fasta files. - -Colin Hercus updated novo2sam.pl to support gapped alignment by -novoalign. - -Petr Danecek contributed the header parsing library sam_header.c and -sam2vcf.pl script and added knet support to the RAZF library. - diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf.c 2016-02-14 18:21:17.381079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,351 +0,0 @@ -#include -#include -#include "bam.h" -#include "kstring.h" -#include "bam2bcf.h" -#include "errmod.h" -#include "bcftools/bcf.h" - -extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); - -#define CALL_ETA 0.03f -#define CALL_MAX 256 -#define CALL_DEFTHETA 0.83f -#define DEF_MAPQ 20 - -#define CAP_DIST 25 - -bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) -{ - bcf_callaux_t *bca; - if (theta <= 0.) theta = CALL_DEFTHETA; - bca = calloc(1, sizeof(bcf_callaux_t)); - bca->capQ = 60; - bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; - bca->min_baseQ = min_baseQ; - bca->e = errmod_init(1. - theta); - bca->min_frac = 0.002; - bca->min_support = 1; - return bca; -} - -void bcf_call_destroy(bcf_callaux_t *bca) -{ - if (bca == 0) return; - errmod_destroy(bca->e); - free(bca->bases); free(bca->inscns); free(bca); -} -/* ref_base is the 4-bit representation of the reference base. It is - * negative if we are looking at an indel. */ -int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) -{ - static int *var_pos = NULL, nvar_pos = 0; - int i, n, ref4, is_indel, ori_depth = 0; - memset(r, 0, sizeof(bcf_callret1_t)); - if (ref_base >= 0) { - ref4 = bam_nt16_nt4_table[ref_base]; - is_indel = 0; - } else ref4 = 4, is_indel = 1; - if (_n == 0) return -1; - // enlarge the bases array if necessary - if (bca->max_bases < _n) { - bca->max_bases = _n; - kroundup32(bca->max_bases); - bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); - } - // fill the bases array - memset(r, 0, sizeof(bcf_callret1_t)); - for (i = n = 0; i < _n; ++i) { - const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; - // set base - if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - ++ori_depth; - baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality - seqQ = is_indel? (p->aux>>8&0xff) : 99; - if (q < bca->min_baseQ) continue; - if (q > seqQ) q = seqQ; - mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 - mapQ = mapQ < bca->capQ? mapQ : bca->capQ; - if (q > mapQ) q = mapQ; - if (q > 63) q = 63; - if (q < 4) q = 4; - if (!is_indel) { - b = bam1_seqi(bam1_seq(p->b), p->qpos); // base - b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base - is_diff = (ref4 < 4 && b == ref4)? 0 : 1; - } else { - b = p->aux>>16&0x3f; - is_diff = (b != 0); - } - bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; - // collect annotations - if (b < 4) r->qsum[b] += q; - ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)]; - min_dist = p->b->core.l_qseq - 1 - p->qpos; - if (min_dist > p->qpos) min_dist = p->qpos; - if (min_dist > CAP_DIST) min_dist = CAP_DIST; - r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; - r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; - r->anno[3<<2|is_diff<<1|0] += min_dist; - r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; - } - r->depth = n; r->ori_depth = ori_depth; - // glfgen - errmod_cal(bca->e, n, 5, bca->bases, r->p); - - // Calculate the Variant Distance Bias (make it optional?) - if ( nvar_pos < _n ) { - nvar_pos = _n; - var_pos = realloc(var_pos,sizeof(int)*nvar_pos); - } - int alt_dp=0, read_len=0; - for (i=0; i<_n; i++) { - const bam_pileup1_t *p = pl + i; - if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) - continue; - - var_pos[alt_dp] = p->qpos; - if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) - var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; - - alt_dp++; - read_len += p->b->core.l_qseq; - } - float mvd=0; - int j; - n=0; - for (i=0; imvd[0] = n ? mvd/n : 0; - r->mvd[1] = alt_dp; - r->mvd[2] = alt_dp ? read_len/alt_dp : 0; - - return r->depth; -} - - -void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) -{ - // Variant distance bias. Samples merged by means of DP-weighted average. - - float weight=0, tot_prob=0; - - int i; - for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); - } - else - { - // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. - if ( dp>5 ) - dp = 5; - float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); - float norm = 1.125*sqrt(2*3.14*sigma2); - float mu = read_len/2.9; - if ( mvd < mu ) - prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; - else - prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; - } - - //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); - tot_prob += prob*dp; - weight += dp; - } - tot_prob = weight ? tot_prob/weight : 1; - //fprintf(stderr,"prob=%f\n", tot_prob); - call->vdb = tot_prob; -} - -int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) -{ - int ref4, i, j, qsum[4]; - int64_t tmp; - if (ref_base >= 0) { - call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base]; - if (ref4 > 4) ref4 = 4; - } else call->ori_ref = -1, ref4 = 0; - // calculate qsum - memset(qsum, 0, 4 * sizeof(int)); - for (i = 0; i < n; ++i) - for (j = 0; j < 4; ++j) - qsum[j] += calls[i].qsum[j]; - for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; - // find the top 2 alleles - for (i = 1; i < 4; ++i) // insertion sort - for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j) - tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; - // set the reference allele and alternative allele(s) - for (i = 0; i < 5; ++i) call->a[i] = -1; - call->unseen = -1; - call->a[0] = ref4; - for (i = 3, j = 1; i >= 0; --i) { - if ((qsum[i]&3) != ref4) { - if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3; - else break; - } - } - if (ref_base >= 0) { // for SNPs, find the "unseen" base - if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) - call->unseen = j, call->a[j++] = qsum[i]&3; - call->n_alleles = j; - } else { - call->n_alleles = j; - if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything - } - // set the PL array - if (call->n < n) { - call->n = n; - call->PL = realloc(call->PL, 15 * n); - } - { - int x, g[15], z; - double sum_min = 0.; - x = call->n_alleles * (call->n_alleles + 1) / 2; - // get the possible genotypes - for (i = z = 0; i < call->n_alleles; ++i) - for (j = 0; j <= i; ++j) - g[z++] = call->a[j] * 5 + call->a[i]; - for (i = 0; i < n; ++i) { - uint8_t *PL = call->PL + x * i; - const bcf_callret1_t *r = calls + i; - float min = 1e37; - for (j = 0; j < x; ++j) - if (min > r->p[g[j]]) min = r->p[g[j]]; - sum_min += min; - for (j = 0; j < x; ++j) { - int y; - y = (int)(r->p[g[j]] - min + .499); - if (y > 255) y = 255; - PL[j] = y; - } - } -// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); - call->shift = (int)(sum_min + .499); - } - // combine annotations - memset(call->anno, 0, 16 * sizeof(int)); - for (i = call->depth = call->ori_depth = 0, tmp = 0; i < n; ++i) { - call->depth += calls[i].depth; - call->ori_depth += calls[i].ori_depth; - for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; - } - - calc_vdb(n, calls, call); - - return 0; -} - -int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, - const bcf_callaux_t *bca, const char *ref) -{ - extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - kstring_t s; - int i, j; - b->n_smpl = bc->n; - b->tid = tid; b->pos = pos; b->qual = 0; - s.s = b->str; s.m = b->m_str; s.l = 0; - kputc('\0', &s); - if (bc->ori_ref < 0) { // an indel - // write REF - kputc(ref[pos], &s); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s); - kputc('\0', &s); - // write ALT - kputc(ref[pos], &s); - for (i = 1; i < 4; ++i) { - if (bc->a[i] < 0) break; - if (i > 1) { - kputc(',', &s); kputc(ref[pos], &s); - } - if (bca->indel_types[bc->a[i]] < 0) { // deletion - for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j) - kputc(ref[pos+1+j], &s); - } else { // insertion; cannot be a reference unless a bug - char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; - for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) - kputc("ACGTN"[(int)inscns[j]], &s); - for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s); - } - } - kputc('\0', &s); - } else { // a SNP - kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s); - for (i = 1; i < 5; ++i) { - if (bc->a[i] < 0) break; - if (i > 1) kputc(',', &s); - kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s); - } - kputc('\0', &s); - } - kputc('\0', &s); - // INFO - if (bc->ori_ref < 0) kputs("INDEL;", &s); - kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); - for (i = 0; i < 16; ++i) { - if (i) kputc(',', &s); - kputw(bc->anno[i], &s); - } - if ( bc->vdb!=1 ) - { - ksprintf(&s, ";VDB=%.4f", bc->vdb); - } - kputc('\0', &s); - // FMT - kputs("PL", &s); - if (bcr) { - kputs(":DP", &s); - if (is_SP) kputs(":SP", &s); - } - kputc('\0', &s); - b->m_str = s.m; b->str = s.s; b->l_str = s.l; - bcf_sync(b); - memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); - if (bcr) { - uint16_t *dp = (uint16_t*)b->gi[1].data; - int32_t *sp = is_SP? b->gi[2].data : 0; - for (i = 0; i < bc->n; ++i) { - bcf_callret1_t *p = bcr + i; - dp[i] = p->depth < 0xffff? p->depth : 0xffff; - if (is_SP) { - if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2 - || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2) - { - sp[i] = 0; - } else { - double left, right, two; - int x; - kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two); - x = (int)(-4.343 * log(two) + .499); - if (x > 255) x = 255; - sp[i] = x; - } - } - } - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf.h 2016-02-14 18:21:17.382079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -#ifndef BAM2BCF_H -#define BAM2BCF_H - -#include -#include "errmod.h" -#include "bcftools/bcf.h" - -#define B2B_INDEL_NULL 10000 - -typedef struct __bcf_callaux_t { - int capQ, min_baseQ; - int openQ, extQ, tandemQ; // for indels - int min_support; // for collecting indel candidates - double min_frac; // for collecting indel candidates - // for internal uses - int max_bases; - int indel_types[4]; - int maxins, indelreg; - char *inscns; - uint16_t *bases; - errmod_t *e; - void *rghash; -} bcf_callaux_t; - -typedef struct { - int depth, ori_depth, qsum[4]; - int anno[16]; - float p[25]; - int mvd[3]; // mean variant distance, number of variant reads, average read length -} bcf_callret1_t; - -typedef struct { - int a[5]; // alleles: ref, alt, alt2, alt3 - int n, n_alleles, shift, ori_ref, unseen; - int anno[16], depth, ori_depth; - uint8_t *PL; - float vdb; // variant distance bias -} bcf_call_t; - -#ifdef __cplusplus -extern "C" { -#endif - - bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); - void bcf_call_destroy(bcf_callaux_t *bca); - int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); - int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); - int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, - const bcf_callaux_t *bca, const char *ref); - int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - const void *rghash); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf_indel.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf_indel.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2bcf_indel.c 2016-02-14 18:21:17.383079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2bcf_indel.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,475 +0,0 @@ -#include -#include -#include -#include "bam.h" -#include "bam2bcf.h" -#include "kaln.h" -#include "kprobaln.h" -#include "khash.h" -KHASH_SET_INIT_STR(rg) - -#include "ksort.h" -KSORT_INIT_GENERIC(uint32_t) - -#define MINUS_CONST 0x10000000 -#define INDEL_WINDOW_SIZE 50 - -void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) -{ - const char *s, *p, *q, *r, *t; - khash_t(rg) *hash; - if (list == 0 || hdtext == 0) return _hash; - if (_hash == 0) _hash = kh_init(rg); - hash = (khash_t(rg)*)_hash; - if ((s = strstr(hdtext, "@RG\t")) == 0) return hash; - do { - t = strstr(s + 4, "@RG\t"); // the next @RG - if ((p = strstr(s, "\tID:")) != 0) p += 4; - if ((q = strstr(s, "\tPL:")) != 0) q += 4; - if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present - int lp, lq; - char *x; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r); lp = r - p; - for (r = q; *r && *r != '\t' && *r != '\n'; ++r); lq = r - q; - x = calloc((lp > lq? lp : lq) + 1, 1); - for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r; - if (strstr(list, x)) { // insert ID to the hash table - khint_t k; - int ret; - for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r; - x[r-p] = 0; - k = kh_get(rg, hash, x); - if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret); - else free(x); - } else free(x); - } - s = t; - } while (s); - return hash; -} - -void bcf_call_del_rghash(void *_hash) -{ - khint_t k; - khash_t(rg) *hash = (khash_t(rg)*)_hash; - if (hash == 0) return; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) - free((char*)kh_key(hash, k)); - kh_destroy(rg, hash); -} - -static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) -{ - int k, x = c->pos, y = 0, last_y = 0; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (c->pos > tpos) return y; - if (x + l > tpos) { - *_tpos = tpos; - return y + (tpos - x); - } - x += l; y += l; - last_y = y; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - if (x + l > tpos) { - *_tpos = is_left? x : x + l; - return y; - } - x += l; - } - } - *_tpos = x; - return last_y; -} -// FIXME: check if the inserted sequence is consistent with the homopolymer run -// l is the relative gap length and l_run is the length of the homopolymer on the reference -static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) -{ - int q, qh; - q = bca->openQ + bca->extQ * (abs(l) - 1); - qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; - return q < qh? q : qh; -} - -static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) -{ - int i, j, max = 0, max_i = pos, score = 0; - l = abs(l); - for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { - if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; - else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; - if (score < 0) break; - if (max < score) max = score, max_i = i; - } - return max_i - pos; -} - -int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - const void *rghash) -{ - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; - int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref2, *query, **ref_sample; - khash_t(rg) *hash = (khash_t(rg)*)rghash; - if (ref == 0 || bca == 0) return -1; - // mark filtered reads - if (rghash) { - N = 0; - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - const uint8_t *rg = bam_aux_get(p->b, "RG"); - p->aux = 1; // filtered by default - if (rg) { - khint_t k = kh_get(rg, hash, (const char*)(rg + 1)); - if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered - } - } - } - if (N == 0) return -1; // no reads left - } - // determine if there is a gap - for (s = N = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) - if (plp[s][i].indel != 0) break; - if (i < n_plp[s]) break; - } - if (s == n) return -1; // there is no indel at this position. - for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads - { // find out how many types of indels are present - int m, n_alt = 0, n_tot = 0; - uint32_t *aux; - aux = calloc(N + 1, 4); - m = max_rd_len = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - const bam_pileup1_t *p = plp[s] + i; - if (rghash == 0 || p->aux == 0) { - ++n_tot; - if (p->indel != 0) { - ++n_alt; - aux[m++] = MINUS_CONST + p->indel; - } - } - j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - } - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip - free(aux); return -1; - } - if (n_types >= 64) { - free(aux); - if (bam_verbose >= 2) - fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); - return -1; - } - types = (int*)calloc(n_types, sizeof(int)); - t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; - free(aux); - for (t = 0; t < n_types; ++t) - if (types[t] == 0) break; - ref_type = t; // the index of the reference type (0) - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - /* The following block fixes a long-existing flaw in the INDEL - * calling model: the interference of nearby SNPs. However, it also - * reduces the power because sometimes, substitutions caused by - * indels are not distinguishable from true mutations. Multiple - * sequence realignment helps to increase the power. - */ - { // construct per-sample consensus - int L = right - left + 1, max_i, max2_i; - uint32_t *cns, max, max2; - char *ref0, *r; - ref_sample = calloc(n, sizeof(void*)); - cns = calloc(L, 4); - ref0 = calloc(L, 1); - for (i = 0; i < right - left; ++i) - ref0[i] = bam_nt16_table[(int)ref[i+left]]; - for (s = 0; s < n; ++s) { - r = ref_sample[s] = calloc(L, 1); - memset(cns, 0, sizeof(int) * L); - // collect ref and non-ref counts - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam1_cigar(b); - uint8_t *seq = bam1_seq(b); - int x = b->core.pos, y = 0; - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) - if (x + j >= left && x + j < right) - cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // determine the consensus - for (i = 0; i < right - left; ++i) r[i] = ref0[i]; - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; - } - if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; -// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); - } - free(ref0); free(cns); - } - { // the length of the homopolymer run around the current position - int c = bam_nt16_table[(int)ref[pos + 1]]; - if (c == 15) l_run = 1; - else { - for (i = pos + 2; ref[i]; ++i) - if (bam_nt16_table[(int)ref[i]] != c) break; - l_run = i; - for (i = pos; i >= 0; --i) - if (bam_nt16_table[(int)ref[i]] != c) break; - l_run -= i + 1; - } - } - // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 - if (max_ins > 0) { - int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int)); - // count the number of occurrences of each base at each position for each type of insertion - for (t = 0; t < n_types; ++t) { - if (types[t] > 0) { - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - if (p->indel == types[t]) { - uint8_t *seq = bam1_seq(p->b); - for (k = 1; k <= p->indel; ++k) { - int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; - if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c]; - } - } - } - } - } - } - // use the majority rule to construct the consensus - inscns = calloc(n_types * max_ins, 1); - for (t = 0; t < n_types; ++t) { - for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4]; - for (k = 0; k < 4; ++k) - if (ia[k] > max) - max = ia[k], max_k = k; - inscns[t*max_ins + j] = max? max_k : 4; - } - } - free(inscns_aux); - } - // compute the likelihood given each type of indel for each read - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); - ref2 = calloc(max_ref2, 1); - query = calloc(right - left + max_rd_len + max_ins + 2, 1); - score1 = calloc(N * n_types, sizeof(int)); - score2 = calloc(N * n_types, sizeof(int)); - bca->indelreg = 0; - for (t = 0; t < n_types; ++t) { - int l, ir; - kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; - apf1.bw = apf2.bw = abs(types[t]) + 3; - // compute indelreg - if (types[t] == 0) ir = 0; - else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); - else ir = est_indelreg(pos, ref, -types[t], 0); - if (ir > bca->indelreg) bca->indelreg = ir; -// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); - // realignment - for (s = K = 0; s < n; ++s) { - // write ref2 - for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - for (; j < right && ref[j]; ++j) - ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; - // align each read to ref2 - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, tbeg, tend, sc, kk; - uint8_t *seq = bam1_seq(p->b); - uint32_t *cigar = bam1_cigar(p->b); - if (p->b->core.flag&4) continue; // unmapped reads - // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. - for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; - if (kk < p->b->core.n_cigar) continue; - // FIXME: the following skips soft clips, but using them may be more sensitive. - // determine the start and end of sequences for alignment - qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); - qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); - if (types[t] < 0) { - int l = -types[t]; - tbeg = tbeg - l > left? tbeg - l : left; - } - // write the query sequence - for (l = qbeg; l < qend; ++l) - query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)]; - { // do realignment; this is the bottleneck - const uint8_t *qual = bam1_qual(p->b), *bq; - uint8_t *qq; - qq = calloc(qend - qbeg, 1); - bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); - if (bq) ++bq; // skip type - for (l = qbeg; l < qend; ++l) { - qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; - if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; - if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; - } - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below - if (l > 255) l = 255; - score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; - if (sc > 5) { - sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), - (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); - l = (int)(100. * sc / (qend - qbeg) + .499); - if (l > 255) l = 255; - score2[K*n_types + t] = sc<<8 | l; - } - free(qq); - } -/* - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) - fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); - fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); - fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); -*/ - } - } - } - free(ref2); free(query); - { // compute indelQ - int *sc, tmp, *sumq; - sc = alloca(n_types * sizeof(int)); - sumq = alloca(n_types * sizeof(int)); - memset(sumq, 0, sizeof(int) * n_types); - for (s = K = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - /* errmod_cal() assumes that if the call is wrong, the - * likelihoods of other events are equal. This is about - * right for substitutions, but is not desired for - * indels. To reuse errmod_cal(), I have to make - * compromise for multi-allelic indels. - */ - if ((sc[0]&0x3f) == ref_type) { - indelQ1 = (sc[1]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ1 = (sc[t]>>14) - (sc[0]>>14); - seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); - } - tmp = sc[0]>>6 & 0xff; - indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ - sct = &score2[K*n_types]; - for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sc[j] < sc[j-1]; --j) - tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; - if ((sc[0]&0x3f) == ref_type) { - indelQ2 = (sc[1]>>14) - (sc[0]>>14); - } else { - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sc[t]&0x3f) == ref_type) break; - indelQ2 = (sc[t]>>14) - (sc[0]>>14); - } - tmp = sc[0]>>6 & 0xff; - indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); - // pick the smaller between indelQ1 and indelQ2 - indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - if (indelQ > 255) indelQ = 255; - if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); - } - } - // determine bca->indel_types[] and bca->inscns - bca->maxins = max_ins; - bca->inscns = realloc(bca->inscns, bca->maxins * 4); - for (t = 0; t < n_types; ++t) - sumq[t] = sumq[t]<<6 | t; - for (t = 1; t < n_types; ++t) // insertion sort - for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) - tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; - for (t = 0; t < n_types; ++t) // look for the reference type - if ((sumq[t]&0x3f) == ref_type) break; - if (t) { // then move the reference type to the first - tmp = sumq[t]; - for (; t > 0; --t) sumq[t] = sumq[t-1]; - sumq[0] = tmp; - } - for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; - for (t = 0; t < 4 && t < n_types; ++t) { - bca->indel_types[t] = types[sumq[t]&0x3f]; - memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); - } - // update p->aux - for (s = n_alt = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - int x = types[p->aux>>16&0x3f]; - for (j = 0; j < 4; ++j) - if (x == bca->indel_types[j]) break; - p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); - if ((p->aux>>16&0x3f) > 0) ++n_alt; -// fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d q=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, bca->indel_types[p->aux>>16&63], p->aux&0xff, p->aux>>8&0xff); - } - } - } - free(score1); free(score2); - // free - for (i = 0; i < n; ++i) free(ref_sample[i]); - free(ref_sample); - free(types); free(inscns); - return n_alt > 0? 0 : -1; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2depth.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2depth.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam2depth.c 2016-02-14 18:21:17.384079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam2depth.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,112 +0,0 @@ -/* This program demonstrates how to generate pileup from multiple BAMs - * simutaneously, to achieve random access and to use the BED interface. - * To compile this program separately, you may: - * - * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz - */ -#include -#include -#include -#include -#include "bam.h" - -typedef struct { // auxiliary data structure - bamFile fp; // the file handler - bam_iter_t iter; // NULL if a region not specified - int min_mapQ; // mapQ filter -} aux_t; - -void *bed_read(const char *fn); // read a BED or position list file -void bed_destroy(void *_h); // destroy the BED data structure -int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps - -// This function reads a BAM alignment from one BAM file. -static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup -{ - aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); - if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; - return ret; -} - -#ifdef _MAIN_BAM2DEPTH -int main(int argc, char *argv[]) -#else -int main_depth(int argc, char *argv[]) -#endif -{ - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; - const bam_pileup1_t **plp; - char *reg = 0; // specified region - void *bed = 0; // BED data structure - bam_header_t *h = 0; // BAM header of the 1st input - aux_t **data; - bam_mplp_t mplp; - - // parse the command line - while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { - switch (n) { - case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header - case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now - case 'q': baseQ = atoi(optarg); break; // base quality threshold - case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold - } - } - if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); - return 1; - } - - // initialize the auxiliary data structures - n = argc - optind; // the number of BAMs on the command line - data = calloc(n, sizeof(void*)); // data[i] for the i-th input - beg = 0; end = 1<<30; tid = -1; // set the default region - for (i = 0; i < n; ++i) { - bam_header_t *htmp; - data[i] = calloc(1, sizeof(aux_t)); - data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM - data[i]->min_mapQ = mapQ; // set the mapQ filter - htmp = bam_header_read(data[i]->fp); // read the BAM header - if (i == 0) { - h = htmp; // keep the header of the 1st BAM - if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region - } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header - if (tid >= 0) { // if a region is specified and parsed successfully - bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index - data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator - bam_index_destroy(idx); // the index is not needed any more; phase out of the memory - } - } - - // the core multi-pileup loop - mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization - n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) - while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position - if (pos < beg || pos >= end) continue; // out of range; skip - if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip - fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster - for (i = 0; i < n; ++i) { // base level filters have to go here - int j, m = 0; - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know - if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos - else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality - } - printf("\t%d", n_plp[i] - m); // this the depth to output - } - putchar('\n'); - } - free(n_plp); free(plp); - bam_mplp_destroy(mplp); - - bam_header_destroy(h); - for (i = 0; i < n; ++i) { - bam_close(data[i]->fp); - if (data[i]->iter) bam_iter_destroy(data[i]->iter); - free(data[i]); - } - free(data); free(reg); - if (bed) bed_destroy(bed); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_aux.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_aux.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_aux.c 2016-02-14 18:21:17.385079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_aux.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,213 +0,0 @@ -#include -#include "bam.h" -#include "khash.h" -typedef char *str_p; -KHASH_MAP_INIT_STR(s, int) -KHASH_MAP_INIT_STR(r2l, str_p) - -void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) -{ - int ori_len = b->data_len; - b->data_len += 3 + len; - b->l_aux += 3 + len; - if (b->m_data < b->data_len) { - b->m_data = b->data_len; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; - b->data[ori_len + 2] = type; - memcpy(b->data + ori_len + 3, data, len); -} - -uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) -{ - return bam_aux_get(b, tag); -} - -#define __skip_tag(s) do { \ - int type = toupper(*(s)); \ - ++(s); \ - if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ - else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ - else (s) += bam_aux_type2size(type); \ - } while(0) - -uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) -{ - uint8_t *s; - int y = tag[0]<<8 | tag[1]; - s = bam1_aux(b); - while (s < b->data + b->data_len) { - int x = (int)s[0]<<8 | s[1]; - s += 2; - if (x == y) return s; - __skip_tag(s); - } - return 0; -} -// s MUST BE returned by bam_aux_get() -int bam_aux_del(bam1_t *b, uint8_t *s) -{ - uint8_t *p, *aux; - aux = bam1_aux(b); - p = s - 2; - __skip_tag(s); - memmove(p, s, b->l_aux - (s - aux)); - b->data_len -= s - p; - b->l_aux -= s - p; - return 0; -} - -int bam_aux_drop_other(bam1_t *b, uint8_t *s) -{ - if (s) { - uint8_t *p, *aux; - aux = bam1_aux(b); - p = s - 2; - __skip_tag(s); - memmove(aux, p, s - p); - b->data_len -= b->l_aux - (s - p); - b->l_aux = s - p; - } else { - b->data_len -= b->l_aux; - b->l_aux = 0; - } - return 0; -} - -void bam_init_header_hash(bam_header_t *header) -{ - if (header->hash == 0) { - int ret, i; - khiter_t iter; - khash_t(s) *h; - header->hash = h = kh_init(s); - for (i = 0; i < header->n_targets; ++i) { - iter = kh_put(s, h, header->target_name[i], &ret); - kh_value(h, iter) = i; - } - } -} - -void bam_destroy_header_hash(bam_header_t *header) -{ - if (header->hash) - kh_destroy(s, (khash_t(s)*)header->hash); -} - -int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) -{ - khint_t k; - khash_t(s) *h = (khash_t(s)*)header->hash; - k = kh_get(s, h, seq_name); - return k == kh_end(h)? -1 : kh_value(h, k); -} - -int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) -{ - char *s; - int i, l, k, name_end; - khiter_t iter; - khash_t(s) *h; - - bam_init_header_hash(header); - h = (khash_t(s)*)header->hash; - - *ref_id = *beg = *end = -1; - name_end = l = strlen(str); - s = (char*)malloc(l+1); - // remove space - for (i = k = 0; i < l; ++i) - if (!isspace(str[i])) s[k++] = str[i]; - s[k] = 0; l = k; - // determine the sequence name - for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end - if (i >= 0) name_end = i; - if (name_end < l) { // check if this is really the end - int n_hyphen = 0; - for (i = name_end + 1; i < l; ++i) { - if (s[i] == '-') ++n_hyphen; - else if (!isdigit(s[i]) && s[i] != ',') break; - } - if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name - s[name_end] = 0; - iter = kh_get(s, h, s); - if (iter == kh_end(h)) { // cannot find the sequence name - iter = kh_get(s, h, str); // try str as the name - if (iter == kh_end(h)) { - if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); - free(s); return -1; - } else s[name_end] = ':', name_end = l; - } - } else iter = kh_get(s, h, str); - *ref_id = kh_val(h, iter); - // parse the interval - if (name_end < l) { - for (i = k = name_end + 1; i < l; ++i) - if (s[i] != ',') s[k++] = s[i]; - s[k] = 0; - *beg = atoi(s + name_end + 1); - for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; - *end = i < k? atoi(s + i + 1) : 1<<29; - if (*beg > 0) --*beg; - } else *beg = 0, *end = 1<<29; - free(s); - return *beg <= *end? 0 : -1; -} - -int32_t bam_aux2i(const uint8_t *s) -{ - int type; - if (s == 0) return 0; - type = *s++; - if (type == 'c') return (int32_t)*(int8_t*)s; - else if (type == 'C') return (int32_t)*(uint8_t*)s; - else if (type == 's') return (int32_t)*(int16_t*)s; - else if (type == 'S') return (int32_t)*(uint16_t*)s; - else if (type == 'i' || type == 'I') return *(int32_t*)s; - else return 0; -} - -float bam_aux2f(const uint8_t *s) -{ - int type; - type = *s++; - if (s == 0) return 0.0; - if (type == 'f') return *(float*)s; - else return 0.0; -} - -double bam_aux2d(const uint8_t *s) -{ - int type; - type = *s++; - if (s == 0) return 0.0; - if (type == 'd') return *(double*)s; - else return 0.0; -} - -char bam_aux2A(const uint8_t *s) -{ - int type; - type = *s++; - if (s == 0) return 0; - if (type == 'A') return *(char*)s; - else return 0; -} - -char *bam_aux2Z(const uint8_t *s) -{ - int type; - type = *s++; - if (s == 0) return 0; - if (type == 'Z' || type == 'H') return (char*)s; - else return 0; -} - -#ifdef _WIN32 -double drand48() -{ - return (double)rand() / RAND_MAX; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam.c 2016-02-14 18:21:17.379079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,362 +0,0 @@ -#include -#include -#include -#include -#include "bam.h" -#include "bam_endian.h" -#include "kstring.h" -#include "sam_header.h" - -int bam_is_be = 0, bam_verbose = 2; -char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; - -/************************** - * CIGAR related routines * - **************************/ - -uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) -{ - uint32_t k, end; - end = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) - end += cigar[k] >> BAM_CIGAR_SHIFT; - } - return end; -} - -int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) -{ - uint32_t k; - int32_t l = 0; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) - l += cigar[k] >> BAM_CIGAR_SHIFT; - } - return l; -} - -/******************** - * BAM I/O routines * - ********************/ - -bam_header_t *bam_header_init() -{ - bam_is_be = bam_is_big_endian(); - return (bam_header_t*)calloc(1, sizeof(bam_header_t)); -} - -void bam_header_destroy(bam_header_t *header) -{ - int32_t i; - extern void bam_destroy_header_hash(bam_header_t *header); - if (header == 0) return; - if (header->target_name) { - for (i = 0; i < header->n_targets; ++i) - free(header->target_name[i]); - free(header->target_name); - free(header->target_len); - } - free(header->text); - if (header->dict) sam_header_free(header->dict); - if (header->rg2lib) sam_tbl_destroy(header->rg2lib); - bam_destroy_header_hash(header); - free(header); -} - -bam_header_t *bam_header_read(bamFile fp) -{ - bam_header_t *header; - char buf[4]; - int magic_len; - int32_t i = 1, name_len; - // check EOF - i = bgzf_check_EOF(fp); - if (i < 0) { - // If the file is a pipe, checking the EOF marker will *always* fail - // with ESPIPE. Suppress the error message in this case. - if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); - } - else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n"); - // read "BAM1" - magic_len = bam_read(fp, buf, 4); - if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { - fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); - return 0; - } - header = bam_header_init(); - // read plain text and the number of reference sequences - bam_read(fp, &header->l_text, 4); - if (bam_is_be) bam_swap_endian_4p(&header->l_text); - header->text = (char*)calloc(header->l_text + 1, 1); - bam_read(fp, header->text, header->l_text); - bam_read(fp, &header->n_targets, 4); - if (bam_is_be) bam_swap_endian_4p(&header->n_targets); - // read reference sequence names and lengths - header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); - header->target_len = (uint32_t*)calloc(header->n_targets, 4); - for (i = 0; i != header->n_targets; ++i) { - bam_read(fp, &name_len, 4); - if (bam_is_be) bam_swap_endian_4p(&name_len); - header->target_name[i] = (char*)calloc(name_len, 1); - bam_read(fp, header->target_name[i], name_len); - bam_read(fp, &header->target_len[i], 4); - if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); - } - return header; -} - -int bam_header_write(bamFile fp, const bam_header_t *header) -{ - char buf[4]; - int32_t i, name_len, x; - // write "BAM1" - strncpy(buf, "BAM\001", 4); - bam_write(fp, buf, 4); - // write plain text and the number of reference sequences - if (bam_is_be) { - x = bam_swap_endian_4(header->l_text); - bam_write(fp, &x, 4); - if (header->l_text) bam_write(fp, header->text, header->l_text); - x = bam_swap_endian_4(header->n_targets); - bam_write(fp, &x, 4); - } else { - bam_write(fp, &header->l_text, 4); - if (header->l_text) bam_write(fp, header->text, header->l_text); - bam_write(fp, &header->n_targets, 4); - } - // write sequence names and lengths - for (i = 0; i != header->n_targets; ++i) { - char *p = header->target_name[i]; - name_len = strlen(p) + 1; - if (bam_is_be) { - x = bam_swap_endian_4(name_len); - bam_write(fp, &x, 4); - } else bam_write(fp, &name_len, 4); - bam_write(fp, p, name_len); - if (bam_is_be) { - x = bam_swap_endian_4(header->target_len[i]); - bam_write(fp, &x, 4); - } else bam_write(fp, &header->target_len[i], 4); - } - bgzf_flush(fp); - return 0; -} - -static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) -{ - uint8_t *s; - uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); - s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; - for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); - while (s < data + data_len) { - uint8_t type; - s += 2; // skip key - type = toupper(*s); ++s; // skip type - if (type == 'C' || type == 'A') ++s; - else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } - else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } - else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } - else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } - else if (type == 'B') { - int32_t n, Bsize = bam_aux_type2size(*s); - memcpy(&n, s + 1, 4); - if (1 == Bsize) { - } else if (2 == Bsize) { - for (i = 0; i < n; i += 2) - bam_swap_endian_2p(s + 5 + i); - } else if (4 == Bsize) { - for (i = 0; i < n; i += 4) - bam_swap_endian_4p(s + 5 + i); - } - bam_swap_endian_4p(s+1); - } - } -} - -int bam_read1(bamFile fp, bam1_t *b) -{ - bam1_core_t *c = &b->core; - int32_t block_len, ret, i; - uint32_t x[8]; - - assert(BAM_CORE_SIZE == 32); - if ((ret = bam_read(fp, &block_len, 4)) != 4) { - if (ret == 0) return -1; // normal end-of-file - else return -2; // truncated - } - if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3; - if (bam_is_be) { - bam_swap_endian_4p(&block_len); - for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); - } - c->tid = x[0]; c->pos = x[1]; - c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; - c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; - c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; - b->data_len = block_len - BAM_CORE_SIZE; - if (b->m_data < b->data_len) { - b->m_data = b->data_len; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; - b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; - if (bam_is_be) swap_endian_data(c, b->data_len, b->data); - return 4 + block_len; -} - -inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data) -{ - uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y; - int i; - assert(BAM_CORE_SIZE == 32); - x[0] = c->tid; - x[1] = c->pos; - x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; - x[3] = (uint32_t)c->flag<<16 | c->n_cigar; - x[4] = c->l_qseq; - x[5] = c->mtid; - x[6] = c->mpos; - x[7] = c->isize; - bgzf_flush_try(fp, 4 + block_len); - if (bam_is_be) { - for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); - y = block_len; - bam_write(fp, bam_swap_endian_4p(&y), 4); - swap_endian_data(c, data_len, data); - } else bam_write(fp, &block_len, 4); - bam_write(fp, x, BAM_CORE_SIZE); - bam_write(fp, data, data_len); - if (bam_is_be) swap_endian_data(c, data_len, data); - return 4 + block_len; -} - -int bam_write1(bamFile fp, const bam1_t *b) -{ - return bam_write1_core(fp, &b->core, b->data_len, b->data); -} - -char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) -{ - uint8_t *s = bam1_seq(b), *t = bam1_qual(b); - int i; - const bam1_core_t *c = &b->core; - kstring_t str; - str.l = str.m = 0; str.s = 0; - - kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); - if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } - else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); - else { // BAM_OFSTR - for (i = 0; i < 16; ++i) - if ((c->flag & 1<tid < 0) kputsn("*\t", 2, &str); - else { - if (header) kputs(header->target_name[c->tid] , &str); - else kputw(c->tid, &str); - kputc('\t', &str); - } - kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); - if (c->n_cigar == 0) kputc('*', &str); - else { - for (i = 0; i < c->n_cigar; ++i) { - kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); - } - } - kputc('\t', &str); - if (c->mtid < 0) kputsn("*\t", 2, &str); - else if (c->mtid == c->tid) kputsn("=\t", 2, &str); - else { - if (header) kputs(header->target_name[c->mtid], &str); - else kputw(c->mtid, &str); - kputc('\t', &str); - } - kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); - if (c->l_qseq) { - for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); - kputc('\t', &str); - if (t[0] == 0xff) kputc('*', &str); - else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); - } else kputsn("*\t*", 3, &str); - s = bam1_aux(b); - while (s < b->data + b->data_len) { - uint8_t type, key[2]; - key[0] = s[0]; key[1] = s[1]; - s += 2; type = *s; ++s; - kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); - if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } - else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } - else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } - else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } - else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } - else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } - else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } - else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } - else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } - else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } - else if (type == 'B') { - uint8_t sub_type = *(s++); - int32_t n; - memcpy(&n, s, 4); - s += 4; // no point to the start of the array - kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing - for (i = 0; i < n; ++i) { - kputc(',', &str); - if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } - else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } - else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } - else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } - else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } - else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } - else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } - } - } - } - return str.s; -} - -char *bam_format1(const bam_header_t *header, const bam1_t *b) -{ - return bam_format1_core(header, b, BAM_OFDEC); -} - -void bam_view1(const bam_header_t *header, const bam1_t *b) -{ - char *s = bam_format1(header, b); - puts(s); - free(s); -} - -int bam_validate1(const bam_header_t *header, const bam1_t *b) -{ - char *s; - - if (b->core.tid < -1 || b->core.mtid < -1) return 0; - if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; - - if (b->data_len < b->core.l_qname) return 0; - s = memchr(bam1_qname(b), '\0', b->core.l_qname); - if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; - - // FIXME: Other fields could also be checked, especially the auxiliary data - - return 1; -} - -// FIXME: we should also check the LB tag associated with each alignment -const char *bam_get_library(bam_header_t *h, const bam1_t *b) -{ - const uint8_t *rg; - if (h->dict == 0) h->dict = sam_header_parse2(h->text); - if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB"); - rg = bam_aux_get(b, "RG"); - return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1)); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_cat.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_cat.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_cat.c 2016-02-14 18:21:17.386079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_cat.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,184 +0,0 @@ -/* - -bam_cat -- efficiently concatenates bam files - -bam_cat can be used to concatenate BAM files. Under special -circumstances, it can be used as an alternative to 'samtools merge' to -concatenate multiple sorted files into a single sorted file. For this -to work each file must be sorted, and the sorted files must be given -as command line arguments in order such that the final read in file i -is less than or equal to the first read in file i+1. - -This code is derived from the bam_reheader function in samtools 0.1.8 -and modified to perform concatenation by Chris Saunders on behalf of -Illumina. - - -########## License: - -The MIT License - -Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. -Modified SAMtools work copyright (c) 2010 Illumina, Inc. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -*/ - - -/* -makefile: -""" -CC=gcc -CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR) -LDFLAGS+=-L$(SAMTOOLS_DIR) -LDLIBS+=-lbam -lz - -all:bam_cat -""" -*/ - - -#include -#include -#include - -#include "bgzf.h" -#include "bam.h" - -#define BUF_SIZE 0x10000 - -#define GZIPID1 31 -#define GZIPID2 139 - -#define BGZF_EMPTY_BLOCK_SIZE 28 - - -int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) -{ - BGZF *fp; - FILE* fp_file; - uint8_t *buf; - uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; - const int es=BGZF_EMPTY_BLOCK_SIZE; - int i; - - fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); - if (fp == 0) { - fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); - return 1; - } - if (h) bam_header_write(fp, h); - - buf = (uint8_t*) malloc(BUF_SIZE); - for(i = 0; i < nfn; ++i){ - BGZF *in; - bam_header_t *old; - int len,j; - - in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r"); - if (in == 0) { - fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); - return -1; - } - if (in->open_mode != 'r') return -1; - - old = bam_header_read(in); - if (h == 0 && i == 0) bam_header_write(fp, old); - - if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); - } - - j=0; -#ifdef _USE_KNETFILE - fp_file=fp->x.fpw; - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { -#else - fp_file=fp->file; - while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { -#endif - if(len= 0) { - switch (c) { - case 'h': { - tamFile fph = sam_open(optarg); - if (fph == 0) { - fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); - return 1; - } - h = sam_header_read(fph); - sam_close(fph); - break; - } - case 'o': outfn = strdup(optarg); break; - } - } - if (argc - optind < 2) { - fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); - return 1; - } - ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); - free(outfn); - return ret; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_color.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_color.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_color.c 2016-02-14 18:21:17.386079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_color.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,127 +0,0 @@ -#include -#include "bam.h" - -/*! - @abstract Get the color encoding the previous and current base - @param b pointer to an alignment - @param i The i-th position, 0-based - @return color - - @discussion Returns 0 no color information is found. - */ -char bam_aux_getCSi(bam1_t *b, int i) -{ - uint8_t *c = bam_aux_get(b, "CS"); - char *cs = NULL; - - // return the base if the tag was not found - if(0 == c) return 0; - - cs = bam_aux2Z(c); - // adjust for strandedness and leading adaptor - if(bam1_strand(b)) i = strlen(cs) - 1 - i; - else i++; - return cs[i]; -} - -/*! - @abstract Get the color quality of the color encoding the previous and current base - @param b pointer to an alignment - @param i The i-th position, 0-based - @return color quality - - @discussion Returns 0 no color information is found. - */ -char bam_aux_getCQi(bam1_t *b, int i) -{ - uint8_t *c = bam_aux_get(b, "CQ"); - char *cq = NULL; - - // return the base if the tag was not found - if(0 == c) return 0; - - cq = bam_aux2Z(c); - // adjust for strandedness - if(bam1_strand(b)) i = strlen(cq) - 1 - i; - return cq[i]; -} - -char bam_aux_nt2int(char a) -{ - switch(toupper(a)) { - case 'A': - return 0; - break; - case 'C': - return 1; - break; - case 'G': - return 2; - break; - case 'T': - return 3; - break; - default: - return 4; - break; - } -} - -char bam_aux_ntnt2cs(char a, char b) -{ - a = bam_aux_nt2int(a); - b = bam_aux_nt2int(b); - if(4 == a || 4 == b) return '4'; - return "0123"[(int)(a ^ b)]; -} - -/*! - @abstract Get the color error profile at the give position - @param b pointer to an alignment - @return the original color if the color was an error, '-' (dash) otherwise - - @discussion Returns 0 no color information is found. - */ -char bam_aux_getCEi(bam1_t *b, int i) -{ - int cs_i; - uint8_t *c = bam_aux_get(b, "CS"); - char *cs = NULL; - char prev_b, cur_b; - char cur_color, cor_color; - - // return the base if the tag was not found - if(0 == c) return 0; - - cs = bam_aux2Z(c); - - // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { //reverse strand - cs_i = strlen(cs) - 1 - i; - // get current color - cur_color = cs[cs_i]; - // get previous base. Note: must rc adaptor - prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; - // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; - } - else { - cs_i=i+1; - // get current color - cur_color = cs[cs_i]; - // get previous base - prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; - // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; - } - - // corrected color - cor_color = bam_aux_ntnt2cs(prev_b, cur_b); - - if(cur_color == cor_color) { - return '-'; - } - else { - return cur_color; - } -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_endian.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_endian.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_endian.h 2016-02-14 18:21:17.387079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_endian.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -#ifndef BAM_ENDIAN_H -#define BAM_ENDIAN_H - -#include - -static inline int bam_is_big_endian() -{ - long one= 1; - return !(*((char *)(&one))); -} -static inline uint16_t bam_swap_endian_2(uint16_t v) -{ - return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); -} -static inline void *bam_swap_endian_2p(void *x) -{ - *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); - return x; -} -static inline uint32_t bam_swap_endian_4(uint32_t v) -{ - v = ((v & 0x0000FFFFU) << 16) | (v >> 16); - return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); -} -static inline void *bam_swap_endian_4p(void *x) -{ - *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); - return x; -} -static inline uint64_t bam_swap_endian_8(uint64_t v) -{ - v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); - v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); - return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); -} -static inline void *bam_swap_endian_8p(void *x) -{ - *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); - return x; -} - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam.h 2016-02-14 18:21:17.380079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,763 +0,0 @@ -/* The MIT License - - Copyright (c) 2008-2010 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef BAM_BAM_H -#define BAM_BAM_H - -/*! - @header - - BAM library provides I/O and various operations on manipulating files - in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to SAM, sorting, - merging, generating pileup, and quickly retrieval of reads overlapped - with a specified region. - - @copyright Genome Research Ltd. - */ - -#define BAM_VERSION "0.1.18 (r982:295)" - -#include -#include -#include -#include - -#ifndef BAM_LITE -#define BAM_VIRTUAL_OFFSET16 -#include "bgzf.h" -/*! @abstract BAM file handler */ -typedef BGZF *bamFile; -#define bam_open(fn, mode) bgzf_open(fn, mode) -#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) -#define bam_close(fp) bgzf_close(fp) -#define bam_read(fp, buf, size) bgzf_read(fp, buf, size) -#define bam_write(fp, buf, size) bgzf_write(fp, buf, size) -#define bam_tell(fp) bgzf_tell(fp) -#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) -#else -#define BAM_TRUE_OFFSET -#include -typedef gzFile bamFile; -#define bam_open(fn, mode) gzopen(fn, mode) -#define bam_dopen(fd, mode) gzdopen(fd, mode) -#define bam_close(fp) gzclose(fp) -#define bam_read(fp, buf, size) gzread(fp, buf, size) -/* no bam_write/bam_tell/bam_seek() here */ -#endif - -/*! @typedef - @abstract Structure for the alignment header. - @field n_targets number of reference sequences - @field target_name names of the reference sequences - @field target_len lengths of the referene sequences - @field dict header dictionary - @field hash hash table for fast name lookup - @field rg2lib hash table for @RG-ID -> LB lookup - @field l_text length of the plain text in the header - @field text plain text - - @discussion Field hash points to null by default. It is a private - member. - */ -typedef struct { - int32_t n_targets; - char **target_name; - uint32_t *target_len; - void *dict, *hash, *rg2lib; - size_t l_text, n_text; - char *text; -} bam_header_t; - -/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ -#define BAM_FPAIRED 1 -/*! @abstract the read is mapped in a proper pair */ -#define BAM_FPROPER_PAIR 2 -/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ -#define BAM_FUNMAP 4 -/*! @abstract the mate is unmapped */ -#define BAM_FMUNMAP 8 -/*! @abstract the read is mapped to the reverse strand */ -#define BAM_FREVERSE 16 -/*! @abstract the mate is mapped to the reverse strand */ -#define BAM_FMREVERSE 32 -/*! @abstract this is read1 */ -#define BAM_FREAD1 64 -/*! @abstract this is read2 */ -#define BAM_FREAD2 128 -/*! @abstract not primary alignment */ -#define BAM_FSECONDARY 256 -/*! @abstract QC failure */ -#define BAM_FQCFAIL 512 -/*! @abstract optical or PCR duplicate */ -#define BAM_FDUP 1024 - -#define BAM_OFDEC 0 -#define BAM_OFHEX 1 -#define BAM_OFSTR 2 - -/*! @abstract defautl mask for pileup */ -#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) - -#define BAM_CORE_SIZE sizeof(bam1_core_t) - -/** - * Describing how CIGAR operation/length is packed in a 32-bit integer. - */ -#define BAM_CIGAR_SHIFT 4 -#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) - -/* - CIGAR operations. - */ -/*! @abstract CIGAR: M = match or mismatch*/ -#define BAM_CMATCH 0 -/*! @abstract CIGAR: I = insertion to the reference */ -#define BAM_CINS 1 -/*! @abstract CIGAR: D = deletion from the reference */ -#define BAM_CDEL 2 -/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ -#define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: S = clip on the read with clipped sequence - present in qseq */ -#define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ -#define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: P = padding */ -#define BAM_CPAD 6 -/*! @abstract CIGAR: equals = match */ -#define BAM_CEQUAL 7 -/*! @abstract CIGAR: X = mismatch */ -#define BAM_CDIFF 8 - -/*! @typedef - @abstract Structure for core alignment information. - @field tid chromosome ID, defined by bam_header_t - @field pos 0-based leftmost coordinate - @field strand strand; 0 for forward and 1 otherwise - @field bin bin calculated by bam_reg2bin() - @field qual mapping quality - @field l_qname length of the query name - @field flag bitwise flag - @field n_cigar number of CIGAR operations - @field l_qseq length of the query sequence (read) - */ -typedef struct { - int32_t tid; - int32_t pos; - uint32_t bin:16, qual:8, l_qname:8; - uint32_t flag:16, n_cigar:16; - int32_t l_qseq; - int32_t mtid; - int32_t mpos; - int32_t isize; -} bam1_core_t; - -/*! @typedef - @abstract Structure for one alignment. - @field core core information about the alignment - @field l_aux length of auxiliary data - @field data_len current length of bam1_t::data - @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux - - @discussion Notes: - - 1. qname is zero tailing and core.l_qname includes the tailing '\0'. - 2. l_qseq is calculated from the total length of an alignment block - on reading or from CIGAR. - */ -typedef struct { - bam1_core_t core; - int l_aux, data_len, m_data; - uint8_t *data; -} bam1_t; - -typedef struct __bam_iter_t *bam_iter_t; - -#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) -#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) - -/*! @function - @abstract Get the CIGAR array - @param b pointer to an alignment - @return pointer to the CIGAR array - - @discussion In the CIGAR array, each element is a 32-bit integer. The - lower 4 bits gives a CIGAR operation and the higher 28 bits keep the - length of a CIGAR. - */ -#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) - -/*! @function - @abstract Get the name of the query - @param b pointer to an alignment - @return pointer to the name string, null terminated - */ -#define bam1_qname(b) ((char*)((b)->data)) - -/*! @function - @abstract Get query sequence - @param b pointer to an alignment - @return pointer to sequence - - @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, - 8 for T and 15 for N. Two bases are packed in one byte with the base - at the higher 4 bits having smaller coordinate on the read. It is - recommended to use bam1_seqi() macro to get the base. - */ -#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) - -/*! @function - @abstract Get query quality - @param b pointer to an alignment - @return pointer to quality string - */ -#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) - -/*! @function - @abstract Get a base on read - @param s Query sequence returned by bam1_seq() - @param i The i-th position, 0-based - @return 4-bit integer representing the base. - */ -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) - -/*! @function - @abstract Get query sequence and quality - @param b pointer to an alignment - @return pointer to the concatenated auxiliary data - */ -#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) - -#ifndef kroundup32 -/*! @function - @abstract Round an integer to the next closest power-2 integer. - @param x integer to be rounded (in place) - @discussion x will be modified. - */ -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -/*! - @abstract Whether the machine is big-endian; modified only in - bam_header_init(). - */ -extern int bam_is_be; - -/*! - @abstract Verbose level between 0 and 3; 0 is supposed to disable all - debugging information, though this may not have been implemented. - */ -extern int bam_verbose; - -/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ -extern unsigned char bam_nt16_table[256]; - -/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */ -extern char *bam_nt16_rev_table; - -extern char bam_nt16_nt4_table[]; - -#ifdef __cplusplus -extern "C" { -#endif - - /********************* - * Low-level SAM I/O * - *********************/ - - /*! @abstract TAM file handler */ - typedef struct __tamFile_t *tamFile; - - /*! - @abstract Open a SAM file for reading, either uncompressed or compressed by gzip/zlib. - @param fn SAM file name - @return SAM file handler - */ - tamFile sam_open(const char *fn); - - /*! - @abstract Close a SAM file handler - @param fp SAM file handler - */ - void sam_close(tamFile fp); - - /*! - @abstract Read one alignment from a SAM file handler - @param fp SAM file handler - @param header header information (ordered names of chromosomes) - @param b read alignment; all members in b will be updated - @return 0 if successful; otherwise negative - */ - int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b); - - /*! - @abstract Read header information from a TAB-delimited list file. - @param fn_list file name for the list - @return a pointer to the header structure - - @discussion Each line in this file consists of chromosome name and - the length of chromosome. - */ - bam_header_t *sam_header_read2(const char *fn_list); - - /*! - @abstract Read header from a SAM file (if present) - @param fp SAM file handler - @return pointer to header struct; 0 if no @SQ lines available - */ - bam_header_t *sam_header_read(tamFile fp); - - /*! - @abstract Parse @SQ lines a update a header struct - @param h pointer to the header struct to be updated - @return number of target sequences - - @discussion bam_header_t::{n_targets,target_len,target_name} will - be destroyed in the first place. - */ - int sam_header_parse(bam_header_t *h); - int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); - - /*! - @abstract Parse @RG lines a update a header struct - @param h pointer to the header struct to be updated - @return number of @RG lines - - @discussion bam_header_t::rg2lib will be destroyed in the first - place. - */ - int sam_header_parse_rg(bam_header_t *h); - -#define sam_write1(header, b) bam_view1(header, b) - - - /******************************** - * APIs for string dictionaries * - ********************************/ - - int bam_strmap_put(void *strmap, const char *rg, const char *lib); - const char *bam_strmap_get(const void *strmap, const char *rg); - void *bam_strmap_dup(const void*); - void *bam_strmap_init(); - void bam_strmap_destroy(void *strmap); - - - /********************* - * Low-level BAM I/O * - *********************/ - - /*! - @abstract Initialize a header structure. - @return the pointer to the header structure - - @discussion This function also modifies the global variable - bam_is_be. - */ - bam_header_t *bam_header_init(); - - /*! - @abstract Destroy a header structure. - @param header pointer to the header - */ - void bam_header_destroy(bam_header_t *header); - - /*! - @abstract Read a header structure from BAM. - @param fp BAM file handler, opened by bam_open() - @return pointer to the header structure - - @discussion The file position indicator must be placed at the - beginning of the file. Upon success, the position indicator will - be set at the start of the first alignment. - */ - bam_header_t *bam_header_read(bamFile fp); - - /*! - @abstract Write a header structure to BAM. - @param fp BAM file handler - @param header pointer to the header structure - @return always 0 currently - */ - int bam_header_write(bamFile fp, const bam_header_t *header); - - /*! - @abstract Read an alignment from BAM. - @param fp BAM file handler - @param b read alignment; all members are updated. - @return number of bytes read from the file - - @discussion The file position indicator must be - placed right before an alignment. Upon success, this function - will set the position indicator to the start of the next - alignment. This function is not affected by the machine - endianness. - */ - int bam_read1(bamFile fp, bam1_t *b); - - /*! - @abstract Write an alignment to BAM. - @param fp BAM file handler - @param c pointer to the bam1_core_t structure - @param data_len total length of variable size data related to - the alignment - @param data pointer to the concatenated data - @return number of bytes written to the file - - @discussion This function is not affected by the machine - endianness. - */ - int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data); - - /*! - @abstract Write an alignment to BAM. - @param fp BAM file handler - @param b alignment to write - @return number of bytes written to the file - - @abstract It is equivalent to: - bam_write1_core(fp, &b->core, b->data_len, b->data) - */ - int bam_write1(bamFile fp, const bam1_t *b); - - /*! @function - @abstract Initiate a pointer to bam1_t struct - */ -#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) - - /*! @function - @abstract Free the memory allocated for an alignment. - @param b pointer to an alignment - */ -#define bam_destroy1(b) do { \ - if (b) { free((b)->data); free(b); } \ - } while (0) - - /*! - @abstract Format a BAM record in the SAM format - @param header pointer to the header structure - @param b alignment to print - @return a pointer to the SAM string - */ - char *bam_format1(const bam_header_t *header, const bam1_t *b); - - char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); - - /*! - @abstract Check whether a BAM record is plausibly valid - @param header associated header structure, or NULL if unavailable - @param b alignment to validate - @return 0 if the alignment is invalid; non-zero otherwise - - @discussion Simple consistency check of some of the fields of the - alignment record. If the header is provided, several additional checks - are made. Not all fields are checked, so a non-zero result is not a - guarantee that the record is valid. However it is usually good enough - to detect when bam_seek() has been called with a virtual file offset - that is not the offset of an alignment record. - */ - int bam_validate1(const bam_header_t *header, const bam1_t *b); - - const char *bam_get_library(bam_header_t *header, const bam1_t *b); - - - /*************** - * pileup APIs * - ***************/ - - /*! @typedef - @abstract Structure for one alignment covering the pileup position. - @field b pointer to the alignment - @field qpos position of the read base at the pileup site, 0-based - @field indel indel length; 0 for no indel, positive for ins and negative for del - @field is_del 1 iff the base on the padded read is a deletion - @field level the level of the read in the "viewer" mode - - @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The - difference between the two functions is that the former does not - set bam_pileup1_t::level, while the later does. Level helps the - implementation of alignment viewers, but calculating this has some - overhead. - */ - typedef struct { - bam1_t *b; - int32_t qpos; - int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; - } bam_pileup1_t; - - typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); - - struct __bam_plp_t; - typedef struct __bam_plp_t *bam_plp_t; - - bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); - int bam_plp_push(bam_plp_t iter, const bam1_t *b); - const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); - const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); - void bam_plp_set_mask(bam_plp_t iter, int mask); - void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); - void bam_plp_reset(bam_plp_t iter); - void bam_plp_destroy(bam_plp_t iter); - - struct __bam_mplp_t; - typedef struct __bam_mplp_t *bam_mplp_t; - - bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); - void bam_mplp_destroy(bam_mplp_t iter); - void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); - int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); - - /*! @typedef - @abstract Type of function to be called by bam_plbuf_push(). - @param tid chromosome ID as is defined in the header - @param pos start coordinate of the alignment, 0-based - @param n number of elements in pl array - @param pl array of alignments - @param data user provided data - @discussion See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t. - */ - typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); - - typedef struct { - bam_plp_t iter; - bam_pileup_f func; - void *data; - } bam_plbuf_t; - - void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); - void bam_plbuf_reset(bam_plbuf_t *buf); - bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data); - void bam_plbuf_destroy(bam_plbuf_t *buf); - int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf); - - int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); - - struct __bam_lplbuf_t; - typedef struct __bam_lplbuf_t bam_lplbuf_t; - - void bam_lplbuf_reset(bam_lplbuf_t *buf); - - /*! @abstract bam_plbuf_init() equivalent with level calculated. */ - bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data); - - /*! @abstract bam_plbuf_destroy() equivalent with level calculated. */ - void bam_lplbuf_destroy(bam_lplbuf_t *tv); - - /*! @abstract bam_plbuf_push() equivalent with level calculated. */ - int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); - - - /********************* - * BAM indexing APIs * - *********************/ - - struct __bam_index_t; - typedef struct __bam_index_t bam_index_t; - - /*! - @abstract Build index for a BAM file. - @discussion Index file "fn.bai" will be created. - @param fn name of the BAM file - @return always 0 currently - */ - int bam_index_build(const char *fn); - - /*! - @abstract Load index from file "fn.bai". - @param fn name of the BAM file (NOT the index file) - @return pointer to the index structure - */ - bam_index_t *bam_index_load(const char *fn); - - /*! - @abstract Destroy an index structure. - @param idx pointer to the index structure - */ - void bam_index_destroy(bam_index_t *idx); - - /*! @typedef - @abstract Type of function to be called by bam_fetch(). - @param b the alignment - @param data user provided data - */ - typedef int (*bam_fetch_f)(const bam1_t *b, void *data); - - /*! - @abstract Retrieve the alignments that are overlapped with the - specified region. - - @discussion A user defined function will be called for each - retrieved alignment ordered by its start position. - - @param fp BAM file handler - @param idx pointer to the alignment index - @param tid chromosome ID as is defined in the header - @param beg start coordinate, 0-based - @param end end coordinate, 0-based - @param data user provided data (will be transferred to func) - @param func user defined function - */ - int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func); - - bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end); - int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b); - void bam_iter_destroy(bam_iter_t iter); - - /*! - @abstract Parse a region in the format: "chr2:100,000-200,000". - @discussion bam_header_t::hash will be initialized if empty. - @param header pointer to the header structure - @param str string to be parsed - @param ref_id the returned chromosome ID - @param begin the returned start coordinate - @param end the returned end coordinate - @return 0 on success; -1 on failure - */ - int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); - - - /************************** - * APIs for optional tags * - **************************/ - - /*! - @abstract Retrieve data of a tag - @param b pointer to an alignment struct - @param tag two-character tag to be retrieved - - @return pointer to the type and data. The first character is the - type that can be 'iIsScCdfAZH'. - - @discussion Use bam_aux2?() series to convert the returned data to - the corresponding type. - */ - uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); - - int32_t bam_aux2i(const uint8_t *s); - float bam_aux2f(const uint8_t *s); - double bam_aux2d(const uint8_t *s); - char bam_aux2A(const uint8_t *s); - char *bam_aux2Z(const uint8_t *s); - - int bam_aux_del(bam1_t *b, uint8_t *s); - void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); - uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() - - - /***************** - * Miscellaneous * - *****************/ - - /*! - @abstract Calculate the rightmost coordinate of an alignment on the - reference genome. - - @param c pointer to the bam1_core_t structure - @param cigar the corresponding CIGAR array (from bam1_t::cigar) - @return the rightmost coordinate, 0-based - */ - uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar); - - /*! - @abstract Calculate the length of the query sequence from CIGAR. - @param c pointer to the bam1_core_t structure - @param cigar the corresponding CIGAR array (from bam1_t::cigar) - @return length of the query sequence - */ - int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar); - -#ifdef __cplusplus -} -#endif - -/*! - @abstract Calculate the minimum bin that contains a region [beg,end). - @param beg start of the region, 0-based - @param end end of the region, 0-based - @return bin - */ -static inline int bam_reg2bin(uint32_t beg, uint32_t end) -{ - --end; - if (beg>>14 == end>>14) return 4681 + (beg>>14); - if (beg>>17 == end>>17) return 585 + (beg>>17); - if (beg>>20 == end>>20) return 73 + (beg>>20); - if (beg>>23 == end>>23) return 9 + (beg>>23); - if (beg>>26 == end>>26) return 1 + (beg>>26); - return 0; -} - -/*! - @abstract Copy an alignment - @param bdst destination alignment struct - @param bsrc source alignment struct - @return pointer to the destination alignment struct - */ -static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) -{ - uint8_t *data = bdst->data; - int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->data_len) { // double the capacity - m_data = bsrc->data_len; kroundup32(m_data); - data = (uint8_t*)realloc(data, m_data); - } - memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data - *bdst = *bsrc; // copy the rest - // restore the backup - bdst->m_data = m_data; - bdst->data = data; - return bdst; -} - -/*! - @abstract Duplicate an alignment - @param src source alignment struct - @return pointer to the destination alignment struct - */ -static inline bam1_t *bam_dup1(const bam1_t *src) -{ - bam1_t *b; - b = bam_init1(); - *b = *src; - b->m_data = b->data_len; - b->data = (uint8_t*)calloc(b->data_len, 1); - memcpy(b->data, src->data, b->data_len); - return b; -} - -static inline int bam_aux_type2size(int x) -{ - if (x == 'C' || x == 'c' || x == 'A') return 1; - else if (x == 'S' || x == 's') return 2; - else if (x == 'I' || x == 'i' || x == 'f') return 4; - else return 0; -} - - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_import.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_import.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_import.c 2016-02-14 18:21:17.388079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_import.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,485 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#ifdef _WIN32 -#include -#endif -#include "kstring.h" -#include "bam.h" -#include "sam_header.h" -#include "kseq.h" -#include "khash.h" - -KSTREAM_INIT(gzFile, gzread, 16384) -KHASH_MAP_INIT_STR(ref, uint64_t) - -void bam_init_header_hash(bam_header_t *header); -void bam_destroy_header_hash(bam_header_t *header); -int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); - -unsigned char bam_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; - -unsigned short bam_char2flag_table[256] = { - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0, - BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 -}; - -char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"; - -struct __tamFile_t { - gzFile fp; - kstream_t *ks; - kstring_t *str; - uint64_t n_lines; - int is_first; -}; - -char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only -{ - char **list = 0, *s; - int n = 0, dret, m = 0; - gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); - kstream_t *ks; - kstring_t *str; - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - ks = ks_init(fp); - while (ks_getuntil(ks, '\n', str, &dret) > 0) { - if (n == m) { - m = m? m << 1 : 16; - list = (char**)realloc(list, m * sizeof(char*)); - } - if (str->s[str->l-1] == '\r') - str->s[--str->l] = '\0'; - s = list[n++] = (char*)calloc(str->l + 1, 1); - strcpy(s, str->s); - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - *_n = n; - return list; -} - -static bam_header_t *hash2header(const kh_ref_t *hash) -{ - bam_header_t *header; - khiter_t k; - header = bam_header_init(); - header->n_targets = kh_size(hash); - header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); - header->target_len = (uint32_t*)calloc(kh_size(hash), 4); - for (k = kh_begin(hash); k != kh_end(hash); ++k) { - if (kh_exist(hash, k)) { - int i = (int)kh_value(hash, k); - header->target_name[i] = (char*)kh_key(hash, k); - header->target_len[i] = kh_value(hash, k)>>32; - } - } - bam_init_header_hash(header); - return header; -} -bam_header_t *sam_header_read2(const char *fn) -{ - bam_header_t *header; - int c, dret, ret, error = 0; - gzFile fp; - kstream_t *ks; - kstring_t *str; - kh_ref_t *hash; - khiter_t k; - if (fn == 0) return 0; - fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); - if (fp == 0) return 0; - hash = kh_init(ref); - ks = ks_init(fp); - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - while (ks_getuntil(ks, 0, str, &dret) > 0) { - char *s = strdup(str->s); - int len, i; - i = kh_size(hash); - ks_getuntil(ks, 0, str, &dret); - len = atoi(str->s); - k = kh_put(ref, hash, s, &ret); - if (ret == 0) { - fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s); - error = 1; - } - kh_value(hash, k) = (uint64_t)len<<32 | i; - if (dret != '\n') - while ((c = ks_getc(ks)) != '\n' && c != -1); - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); - if (error) return 0; - header = hash2header(hash); - kh_destroy(ref, hash); - return header; -} -static inline uint8_t *alloc_data(bam1_t *b, int size) -{ - if (b->m_data < size) { - b->m_data = size; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - return b->data; -} -static inline void parse_error(int64_t n_lines, const char * __restrict msg) -{ - fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg); - abort(); -} -static inline void append_text(bam_header_t *header, kstring_t *str) -{ - size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null - kroundup32(x); kroundup32(y); - if (x < y) - { - header->n_text = y; - header->text = (char*)realloc(header->text, y); - if ( !header->text ) - { - fprintf(stderr,"realloc failed to alloc %ld bytes\n", y); - abort(); - } - } - // Sanity check - if ( header->l_text+str->l+1 >= header->n_text ) - { - fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,header->n_text,x,y); - abort(); - } - strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. - header->l_text += str->l + 1; - header->text[header->l_text] = 0; -} - -int sam_header_parse(bam_header_t *h) -{ - char **tmp; - int i; - free(h->target_len); free(h->target_name); - h->n_targets = 0; h->target_len = 0; h->target_name = 0; - if (h->l_text < 3) return 0; - if (h->dict == 0) h->dict = sam_header_parse2(h->text); - tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets); - if (h->n_targets == 0) return 0; - h->target_name = calloc(h->n_targets, sizeof(void*)); - for (i = 0; i < h->n_targets; ++i) - h->target_name[i] = strdup(tmp[i]); - free(tmp); - tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets); - h->target_len = calloc(h->n_targets, 4); - for (i = 0; i < h->n_targets; ++i) - h->target_len[i] = atoi(tmp[i]); - free(tmp); - return h->n_targets; -} - -bam_header_t *sam_header_read(tamFile fp) -{ - int ret, dret; - bam_header_t *header = bam_header_init(); - kstring_t *str = fp->str; - while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header - str->s[str->l] = dret; // note that str->s is NOT null terminated!! - append_text(header, str); - if (dret != '\n') { - ret = ks_getuntil(fp->ks, '\n', str, &dret); - str->s[str->l] = '\n'; // NOT null terminated!! - append_text(header, str); - } - ++fp->n_lines; - } - sam_header_parse(header); - bam_init_header_hash(header); - fp->is_first = 1; - return header; -} - -int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) -{ - int ret, doff, doff0, dret, z = 0; - bam1_core_t *c = &b->core; - kstring_t *str = fp->str; - kstream_t *ks = fp->ks; - - if (fp->is_first) { - fp->is_first = 0; - ret = str->l; - } else { - do { // special consideration for empty lines - ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); - if (ret >= 0) z += str->l + 1; - } while (ret == 0); - } - if (ret < 0) return -1; - ++fp->n_lines; - doff = 0; - - { // name - c->l_qname = strlen(str->s) + 1; - memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); - doff += c->l_qname; - } - { // flag - long flag; - char *s; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; - flag = strtol((char*)str->s, &s, 0); - if (*s) { // not the end of the string - flag = 0; - for (s = str->s; *s; ++s) - flag |= bam_char2flag_table[(int)*s]; - } - c->flag = flag; - } - { // tid, pos, qual - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); - if (c->tid < 0 && strcmp(str->s, "*")) { - if (header->n_targets == 0) { - fprintf(stderr, "[sam_read1] missing header? Abort!\n"); - exit(1); - } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); - } - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0; - if (ret < 0) return -2; - } - { // cigar - char *s, *t; - int i, op; - long x; - c->n_cigar = 0; - if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; - z += str->l + 1; - if (str->s[0] != '*') { - for (s = str->s; *s; ++s) { - if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; - else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); - } - b->data = alloc_data(b, doff + c->n_cigar * 4); - for (i = 0, s = str->s; i != c->n_cigar; ++i) { - x = strtol(s, &t, 10); - op = toupper(*t); - if (op == 'M') op = BAM_CMATCH; - else if (op == 'I') op = BAM_CINS; - else if (op == 'D') op = BAM_CDEL; - else if (op == 'N') op = BAM_CREF_SKIP; - else if (op == 'S') op = BAM_CSOFT_CLIP; - else if (op == 'H') op = BAM_CHARD_CLIP; - else if (op == 'P') op = BAM_CPAD; - else if (op == '=') op = BAM_CEQUAL; - else if (op == 'X') op = BAM_CDIFF; - else parse_error(fp->n_lines, "invalid CIGAR operation"); - s = t + 1; - bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; - } - if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); - c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); - doff += c->n_cigar * 4; - } else { - if (!(c->flag&BAM_FUNMAP)) { - fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines); - c->flag |= BAM_FUNMAP; - } - c->bin = bam_reg2bin(c->pos, c->pos + 1); - } - } - { // mtid, mpos, isize - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; - c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; - c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; - c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; - if (ret < 0) return -4; - } - { // seq and qual - int i; - uint8_t *p = 0; - if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq - z += str->l + 1; - if (strcmp(str->s, "*")) { - c->l_qseq = strlen(str->s); - if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { - fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", - (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); - } - p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; - memset(p, 0, (c->l_qseq+1)/2); - for (i = 0; i < c->l_qseq; ++i) - p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); - } else c->l_qseq = 0; - if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual - z += str->l + 1; - if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) - parse_error(fp->n_lines, "sequence and quality are inconsistent"); - p += (c->l_qseq+1)/2; - if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; - else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; - doff += c->l_qseq + (c->l_qseq+1)/2; - } - doff0 = doff; - if (dret != '\n' && dret != '\r') { // aux - while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { - uint8_t *s, type, key[2]; - z += str->l + 1; - if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') - parse_error(fp->n_lines, "missing colon in auxiliary data"); - key[0] = str->s[0]; key[1] = str->s[1]; - type = str->s[3]; - s = alloc_data(b, doff + 3) + doff; - s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; - if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility - s = alloc_data(b, doff + 2) + doff; - *s++ = 'A'; *s = str->s[5]; - doff += 2; - } else if (type == 'I' || type == 'i') { - long long x; - s = alloc_data(b, doff + 5) + doff; - x = (long long)atoll(str->s + 5); - if (x < 0) { - if (x >= -127) { - *s++ = 'c'; *(int8_t*)s = (int8_t)x; - s += 1; doff += 2; - } else if (x >= -32767) { - *s++ = 's'; *(int16_t*)s = (int16_t)x; - s += 2; doff += 3; - } else { - *s++ = 'i'; *(int32_t*)s = (int32_t)x; - s += 4; doff += 5; - if (x < -2147483648ll) - fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", - (long long)fp->n_lines, x); - } - } else { - if (x <= 255) { - *s++ = 'C'; *s++ = (uint8_t)x; - doff += 2; - } else if (x <= 65535) { - *s++ = 'S'; *(uint16_t*)s = (uint16_t)x; - s += 2; doff += 3; - } else { - *s++ = 'I'; *(uint32_t*)s = (uint32_t)x; - s += 4; doff += 5; - if (x > 4294967295ll) - fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", - (long long)fp->n_lines, x); - } - } - } else if (type == 'f') { - s = alloc_data(b, doff + 5) + doff; - *s++ = 'f'; - *(float*)s = (float)atof(str->s + 5); - s += 4; doff += 5; - } else if (type == 'd') { - s = alloc_data(b, doff + 9) + doff; - *s++ = 'd'; - *(float*)s = (float)atof(str->s + 9); - s += 8; doff += 9; - } else if (type == 'Z' || type == 'H') { - int size = 1 + (str->l - 5) + 1; - if (type == 'H') { // check whether the hex string is valid - int i; - if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even"); - for (i = 0; i < str->l - 5; ++i) { - int c = toupper(str->s[5 + i]); - if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) - parse_error(fp->n_lines, "invalid hex character"); - } - } - s = alloc_data(b, doff + size) + doff; - *s++ = type; - memcpy(s, str->s + 5, str->l - 5); - s[str->l - 5] = 0; - doff += size; - } else if (type == 'B') { - int32_t n = 0, Bsize, k = 0, size; - char *p; - if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); - Bsize = bam_aux_type2size(str->s[5]); // the size of each element - for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array - if (*p == ',') ++n; - p = str->s + 7; // now p points to the first number in the array - size = 6 + Bsize * n; // total number of bytes allocated to this tag - s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory - *s++ = 'B'; *s++ = str->s[5]; - memcpy(s, &n, 4); s += 4; // write the number of elements - if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; - else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; - else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory - else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; - else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; - else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; - else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p; - else parse_error(fp->n_lines, "unrecognized array type"); - s += Bsize * n; doff += size; - } else parse_error(fp->n_lines, "unrecognized type"); - if (dret == '\n' || dret == '\r') break; - } - } - b->l_aux = doff - doff0; - b->data_len = doff; - return z; -} - -tamFile sam_open(const char *fn) -{ - tamFile fp; - gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb"); - if (gzfp == 0) return 0; - fp = (tamFile)calloc(1, sizeof(struct __tamFile_t)); - fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); - fp->fp = gzfp; - fp->ks = ks_init(fp->fp); - return fp; -} - -void sam_close(tamFile fp) -{ - if (fp) { - ks_destroy(fp->ks); - gzclose(fp->fp); - free(fp->str->s); free(fp->str); - free(fp); - } -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_index.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_index.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_index.c 2016-02-14 18:21:17.389079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_index.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,719 +0,0 @@ -#include -#include -#include "bam.h" -#include "khash.h" -#include "ksort.h" -#include "bam_endian.h" -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -/*! - @header - - Alignment indexing. Before indexing, BAM must be sorted based on the - leftmost coordinate of alignments. In indexing, BAM uses two indices: - a UCSC binning index and a simple linear index. The binning index is - efficient for alignments spanning long distance, while the auxiliary - linear index helps to reduce unnecessary seek calls especially for - short alignments. - - The UCSC binning scheme was suggested by Richard Durbin and Lincoln - Stein and is explained by Kent et al. (2002). In this scheme, each bin - represents a contiguous genomic region which can be fully contained in - another bin; each alignment is associated with a bin which represents - the smallest region containing the entire alignment. The binning - scheme is essentially another representation of R-tree. A distinct bin - uniquely corresponds to a distinct internal node in a R-tree. Bin A is - a child of Bin B if region A is contained in B. - - In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin - 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp, - 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to - find the alignments overlapped with a region [rbeg,rend), we need to - calculate the list of bins that may be overlapped the region and test - the alignments in the bins to confirm the overlaps. If the specified - region is short, typically only a few alignments in six bins need to - be retrieved. The overlapping alignments can be quickly fetched. - - */ - -#define BAM_MIN_CHUNK_GAP 32768 -// 1<<14 is the size of minimum bin. -#define BAM_LIDX_SHIFT 14 - -#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1 - -typedef struct { - uint64_t u, v; -} pair64_t; - -#define pair64_lt(a,b) ((a).u < (b).u) -KSORT_INIT(off, pair64_t, pair64_lt) - -typedef struct { - uint32_t m, n; - pair64_t *list; -} bam_binlist_t; - -typedef struct { - int32_t n, m; - uint64_t *offset; -} bam_lidx_t; - -KHASH_MAP_INIT_INT(i, bam_binlist_t) - -struct __bam_index_t { - int32_t n; - uint64_t n_no_coor; // unmapped reads without coordinate - khash_t(i) **index; - bam_lidx_t *index2; -}; - -// requirement: len <= LEN_MASK -static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) -{ - khint_t k; - bam_binlist_t *l; - int ret; - k = kh_put(i, h, bin, &ret); - l = &kh_value(h, k); - if (ret) { // not present - l->m = 1; l->n = 0; - l->list = (pair64_t*)calloc(l->m, 16); - } - if (l->n == l->m) { - l->m <<= 1; - l->list = (pair64_t*)realloc(l->list, l->m * 16); - } - l->list[l->n].u = beg; l->list[l->n++].v = end; -} - -static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) -{ - int i, beg, end; - beg = b->core.pos >> BAM_LIDX_SHIFT; - end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; - if (index2->m < end + 1) { - int old_m = index2->m; - index2->m = end + 1; - kroundup32(index2->m); - index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); - memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); - } - if (beg == end) { - if (index2->offset[beg] == 0) index2->offset[beg] = offset; - } else { - for (i = beg; i <= end; ++i) - if (index2->offset[i] == 0) index2->offset[i] = offset; - } - index2->n = end + 1; -} - -static void merge_chunks(bam_index_t *idx) -{ -#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) - khash_t(i) *index; - int i, l, m; - khint_t k; - for (i = 0; i < idx->n; ++i) { - index = idx->index[i]; - for (k = kh_begin(index); k != kh_end(index); ++k) { - bam_binlist_t *p; - if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue; - p = &kh_value(index, k); - m = 0; - for (l = 1; l < p->n; ++l) { -#ifdef BAM_TRUE_OFFSET - if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; -#else - if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; -#endif - else p->list[++m] = p->list[l]; - } // ~for(l) - p->n = m + 1; - } // ~for(k) - } // ~for(i) -#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) -} - -static void fill_missing(bam_index_t *idx) -{ - int i, j; - for (i = 0; i < idx->n; ++i) { - bam_lidx_t *idx2 = &idx->index2[i]; - for (j = 1; j < idx2->n; ++j) - if (idx2->offset[j] == 0) - idx2->offset[j] = idx2->offset[j-1]; - } -} - -bam_index_t *bam_index_core(bamFile fp) -{ - bam1_t *b; - bam_header_t *h; - int i, ret; - bam_index_t *idx; - uint32_t last_bin, save_bin; - int32_t last_coor, last_tid, save_tid; - bam1_core_t *c; - uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; - - idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - h = bam_header_read(fp); - c = &b->core; - - idx->n = h->n_targets; - bam_header_destroy(h); - idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); - for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); - idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); - - save_bin = save_tid = last_tid = last_bin = 0xffffffffu; - save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; - n_mapped = n_unmapped = n_no_coor = off_end = 0; - off_beg = off_end = bam_tell(fp); - while ((ret = bam_read1(fp, b)) >= 0) { - if (c->tid < 0) ++n_no_coor; - if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes - last_tid = c->tid; - last_bin = 0xffffffffu; - } else if ((uint32_t)last_tid > (uint32_t)c->tid) { - fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n", - bam1_qname(b), last_tid+1, c->tid+1); - return NULL; - } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { - fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", - bam1_qname(b), last_coor, c->pos, c->tid+1); - return NULL; - } - if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); - if (c->bin != last_bin) { // then possibly write the binning index - if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record - insert_offset(idx->index[save_tid], save_bin, save_off, last_off); - if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element - off_end = last_off; - insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); - insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); - n_mapped = n_unmapped = 0; - off_beg = off_end; - } - save_off = last_off; - save_bin = last_bin = c->bin; - save_tid = c->tid; - if (save_tid < 0) break; - } - if (bam_tell(fp) <= last_off) { - fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", - (unsigned long long)bam_tell(fp), (unsigned long long)last_off); - return NULL; - } - if (c->flag & BAM_FUNMAP) ++n_unmapped; - else ++n_mapped; - last_off = bam_tell(fp); - last_coor = b->core.pos; - } - if (save_tid >= 0) { - insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); - insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp)); - insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); - } - merge_chunks(idx); - fill_missing(idx); - if (ret >= 0) { - while ((ret = bam_read1(fp, b)) >= 0) { - ++n_no_coor; - if (c->tid >= 0 && n_no_coor) { - fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n"); - return NULL; - } - } - } - if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); - free(b->data); free(b); - idx->n_no_coor = n_no_coor; - return idx; -} - -void bam_index_destroy(bam_index_t *idx) -{ - khint_t k; - int i; - if (idx == 0) return; - for (i = 0; i < idx->n; ++i) { - khash_t(i) *index = idx->index[i]; - bam_lidx_t *index2 = idx->index2 + i; - for (k = kh_begin(index); k != kh_end(index); ++k) { - if (kh_exist(index, k)) - free(kh_value(index, k).list); - } - kh_destroy(i, index); - free(index2->offset); - } - free(idx->index); free(idx->index2); - free(idx); -} - -void bam_index_save(const bam_index_t *idx, FILE *fp) -{ - int32_t i, size; - khint_t k; - fwrite("BAI\1", 1, 4, fp); - if (bam_is_be) { - uint32_t x = idx->n; - fwrite(bam_swap_endian_4p(&x), 4, 1, fp); - } else fwrite(&idx->n, 4, 1, fp); - for (i = 0; i < idx->n; ++i) { - khash_t(i) *index = idx->index[i]; - bam_lidx_t *index2 = idx->index2 + i; - // write binning index - size = kh_size(index); - if (bam_is_be) { // big endian - uint32_t x = size; - fwrite(bam_swap_endian_4p(&x), 4, 1, fp); - } else fwrite(&size, 4, 1, fp); - for (k = kh_begin(index); k != kh_end(index); ++k) { - if (kh_exist(index, k)) { - bam_binlist_t *p = &kh_value(index, k); - if (bam_is_be) { // big endian - uint32_t x; - x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); - x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); - for (x = 0; (int)x < p->n; ++x) { - bam_swap_endian_8p(&p->list[x].u); - bam_swap_endian_8p(&p->list[x].v); - } - fwrite(p->list, 16, p->n, fp); - for (x = 0; (int)x < p->n; ++x) { - bam_swap_endian_8p(&p->list[x].u); - bam_swap_endian_8p(&p->list[x].v); - } - } else { - fwrite(&kh_key(index, k), 4, 1, fp); - fwrite(&p->n, 4, 1, fp); - fwrite(p->list, 16, p->n, fp); - } - } - } - // write linear index (index2) - if (bam_is_be) { - int x = index2->n; - fwrite(bam_swap_endian_4p(&x), 4, 1, fp); - } else fwrite(&index2->n, 4, 1, fp); - if (bam_is_be) { // big endian - int x; - for (x = 0; (int)x < index2->n; ++x) - bam_swap_endian_8p(&index2->offset[x]); - fwrite(index2->offset, 8, index2->n, fp); - for (x = 0; (int)x < index2->n; ++x) - bam_swap_endian_8p(&index2->offset[x]); - } else fwrite(index2->offset, 8, index2->n, fp); - } - { // write the number of reads coor-less records. - uint64_t x = idx->n_no_coor; - if (bam_is_be) bam_swap_endian_8p(&x); - fwrite(&x, 8, 1, fp); - } - fflush(fp); -} - -static bam_index_t *bam_index_load_core(FILE *fp) -{ - int i; - char magic[4]; - bam_index_t *idx; - if (fp == 0) { - fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); - return 0; - } - fread(magic, 1, 4, fp); - if (strncmp(magic, "BAI\1", 4)) { - fprintf(stderr, "[bam_index_load] wrong magic number.\n"); - fclose(fp); - return 0; - } - idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); - fread(&idx->n, 4, 1, fp); - if (bam_is_be) bam_swap_endian_4p(&idx->n); - idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); - idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); - for (i = 0; i < idx->n; ++i) { - khash_t(i) *index; - bam_lidx_t *index2 = idx->index2 + i; - uint32_t key, size; - khint_t k; - int j, ret; - bam_binlist_t *p; - index = idx->index[i] = kh_init(i); - // load binning index - fread(&size, 4, 1, fp); - if (bam_is_be) bam_swap_endian_4p(&size); - for (j = 0; j < (int)size; ++j) { - fread(&key, 4, 1, fp); - if (bam_is_be) bam_swap_endian_4p(&key); - k = kh_put(i, index, key, &ret); - p = &kh_value(index, k); - fread(&p->n, 4, 1, fp); - if (bam_is_be) bam_swap_endian_4p(&p->n); - p->m = p->n; - p->list = (pair64_t*)malloc(p->m * 16); - fread(p->list, 16, p->n, fp); - if (bam_is_be) { - int x; - for (x = 0; x < p->n; ++x) { - bam_swap_endian_8p(&p->list[x].u); - bam_swap_endian_8p(&p->list[x].v); - } - } - } - // load linear index - fread(&index2->n, 4, 1, fp); - if (bam_is_be) bam_swap_endian_4p(&index2->n); - index2->m = index2->n; - index2->offset = (uint64_t*)calloc(index2->m, 8); - fread(index2->offset, index2->n, 8, fp); - if (bam_is_be) - for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); - } - if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; - if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor); - return idx; -} - -bam_index_t *bam_index_load_local(const char *_fn) -{ - FILE *fp; - char *fnidx, *fn; - - if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { - const char *p; - int l = strlen(_fn); - for (p = _fn + l - 1; p >= _fn; --p) - if (*p == '/') break; - fn = strdup(p + 1); - } else fn = strdup(_fn); - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".bai"); - fp = fopen(fnidx, "rb"); - if (fp == 0) { // try "{base}.bai" - char *s = strstr(fn, "bam"); - if (s == fn + strlen(fn) - 3) { - strcpy(fnidx, fn); - fnidx[strlen(fn)-1] = 'i'; - fp = fopen(fnidx, "rb"); - } - } - free(fnidx); free(fn); - if (fp) { - bam_index_t *idx = bam_index_load_core(fp); - fclose(fp); - return idx; - } else return 0; -} - -#ifdef _USE_KNETFILE -static void download_from_remote(const char *url) -{ - const int buf_size = 1 * 1024 * 1024; - char *fn; - FILE *fp; - uint8_t *buf; - knetFile *fp_remote; - int l; - if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; - l = strlen(url); - for (fn = (char*)url + l - 1; fn >= url; --fn) - if (*fn == '/') break; - ++fn; // fn now points to the file name - fp_remote = knet_open(url, "r"); - if (fp_remote == 0) { - fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); - return; - } - if ((fp = fopen(fn, "wb")) == 0) { - fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); - knet_close(fp_remote); - return; - } - buf = (uint8_t*)calloc(buf_size, 1); - while ((l = knet_read(fp_remote, buf, buf_size)) != 0) - fwrite(buf, 1, l, fp); - free(buf); - fclose(fp); - knet_close(fp_remote); -} -#else -static void download_from_remote(const char *url) -{ - return; -} -#endif - -bam_index_t *bam_index_load(const char *fn) -{ - bam_index_t *idx; - idx = bam_index_load_local(fn); - if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { - char *fnidx = calloc(strlen(fn) + 5, 1); - strcat(strcpy(fnidx, fn), ".bai"); - fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); - download_from_remote(fnidx); - idx = bam_index_load_local(fn); - } - if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); - return idx; -} - -int bam_index_build2(const char *fn, const char *_fnidx) -{ - char *fnidx; - FILE *fpidx; - bamFile fp; - bam_index_t *idx; - if ((fp = bam_open(fn, "r")) == 0) { - fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); - return -1; - } - idx = bam_index_core(fp); - bam_close(fp); - if(idx == 0) { - fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n"); - return -1; - } - if (_fnidx == 0) { - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".bai"); - } else fnidx = strdup(_fnidx); - fpidx = fopen(fnidx, "wb"); - if (fpidx == 0) { - fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); - free(fnidx); - return -1; - } - bam_index_save(idx, fpidx); - bam_index_destroy(idx); - fclose(fpidx); - free(fnidx); - return 0; -} - -int bam_index_build(const char *fn) -{ - return bam_index_build2(fn, 0); -} - -int bam_index(int argc, char *argv[]) -{ - if (argc < 2) { - fprintf(stderr, "Usage: samtools index [out.index]\n"); - return 1; - } - if (argc >= 3) bam_index_build2(argv[1], argv[2]); - else bam_index_build(argv[1]); - return 0; -} - -int bam_idxstats(int argc, char *argv[]) -{ - bam_index_t *idx; - bam_header_t *header; - bamFile fp; - int i; - if (argc < 2) { - fprintf(stderr, "Usage: samtools idxstats \n"); - return 1; - } - fp = bam_open(argv[1], "r"); - if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; } - header = bam_header_read(fp); - bam_close(fp); - idx = bam_index_load(argv[1]); - if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; } - for (i = 0; i < idx->n; ++i) { - khint_t k; - khash_t(i) *h = idx->index[i]; - printf("%s\t%d", header->target_name[i], header->target_len[i]); - k = kh_get(i, h, BAM_MAX_BIN); - if (k != kh_end(h)) - printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v); - else printf("\t0\t0"); - putchar('\n'); - } - printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor); - bam_header_destroy(header); - bam_index_destroy(idx); - return 0; -} - -static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN]) -{ - int i = 0, k; - if (beg >= end) return 0; - if (end >= 1u<<29) end = 1u<<29; - --end; - list[i++] = 0; - for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; - for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; - for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; - for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; - for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; - return i; -} - -static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) -{ - uint32_t rbeg = b->core.pos; - uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; - return (rend > beg && rbeg < end); -} - -struct __bam_iter_t { - int from_first; // read from the first record; no random access - int tid, beg, end, n_off, i, finished; - uint64_t curr_off; - pair64_t *off; -}; - -// bam_fetch helper function retrieves -bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) -{ - uint16_t *bins; - int i, n_bins, n_off; - pair64_t *off; - khint_t k; - khash_t(i) *index; - uint64_t min_off; - bam_iter_t iter = 0; - - if (beg < 0) beg = 0; - if (end < beg) return 0; - // initialize iter - iter = calloc(1, sizeof(struct __bam_iter_t)); - iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; - // - bins = (uint16_t*)calloc(BAM_MAX_BIN, 2); - n_bins = reg2bins(beg, end, bins); - index = idx->index[tid]; - if (idx->index2[tid].n > 0) { - min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] - : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; - if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 - int n = beg>>BAM_LIDX_SHIFT; - if (n > idx->index2[tid].n) n = idx->index2[tid].n; - for (i = n - 1; i >= 0; --i) - if (idx->index2[tid].offset[i] != 0) break; - if (i >= 0) min_off = idx->index2[tid].offset[i]; - } - } else min_off = 0; // tabix 0.1.2 may produce such index files - for (i = n_off = 0; i < n_bins; ++i) { - if ((k = kh_get(i, index, bins[i])) != kh_end(index)) - n_off += kh_value(index, k).n; - } - if (n_off == 0) { - free(bins); return iter; - } - off = (pair64_t*)calloc(n_off, 16); - for (i = n_off = 0; i < n_bins; ++i) { - if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { - int j; - bam_binlist_t *p = &kh_value(index, k); - for (j = 0; j < p->n; ++j) - if (p->list[j].v > min_off) off[n_off++] = p->list[j]; - } - } - free(bins); - if (n_off == 0) { - free(off); return iter; - } - { - bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); - int l; - ks_introsort(off, n_off, off); - // resolve completely contained adjacent blocks - for (i = 1, l = 0; i < n_off; ++i) - if (off[l].v < off[i].v) - off[++l] = off[i]; - n_off = l + 1; - // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing - for (i = 1; i < n_off; ++i) - if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; - { // merge adjacent blocks -#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) - for (i = 1, l = 0; i < n_off; ++i) { -#ifdef BAM_TRUE_OFFSET - if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; -#else - if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; -#endif - else off[++l] = off[i]; - } - n_off = l + 1; -#endif - } - bam_destroy1(b); - } - iter->n_off = n_off; iter->off = off; - return iter; -} - -pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off) -{ // for pysam compatibility - bam_iter_t iter; - pair64_t *off; - iter = bam_iter_query(idx, tid, beg, end); - off = iter->off; *cnt_off = iter->n_off; - free(iter); - return off; -} - -void bam_iter_destroy(bam_iter_t iter) -{ - if (iter) { free(iter->off); free(iter); } -} - -int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) -{ - int ret; - if (iter && iter->finished) return -1; - if (iter == 0 || iter->from_first) { - ret = bam_read1(fp, b); - if (ret < 0 && iter) iter->finished = 1; - return ret; - } - if (iter->off == 0) return -1; - for (;;) { - if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk - if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks - if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug - if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek - bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); - iter->curr_off = bam_tell(fp); - } - ++iter->i; - } - if ((ret = bam_read1(fp, b)) >= 0) { - iter->curr_off = bam_tell(fp); - if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed - ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error - break; - } - else if (is_overlap(iter->beg, iter->end, b)) return ret; - } else break; // end of file or error - } - iter->finished = 1; - return ret; -} - -int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) -{ - int ret; - bam_iter_t iter; - bam1_t *b; - b = bam_init1(); - iter = bam_iter_query(idx, tid, beg, end); - while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); - bam_iter_destroy(iter); - bam_destroy1(b); - return (ret == -1)? 0 : ret; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_lpileup.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_lpileup.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_lpileup.c 2016-02-14 18:21:17.390079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_lpileup.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,198 +0,0 @@ -#include -#include -#include -#include "bam.h" -#include "ksort.h" - -#define TV_GAP 2 - -typedef struct __freenode_t { - uint32_t level:28, cnt:4; - struct __freenode_t *next; -} freenode_t, *freenode_p; - -#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) -KSORT_INIT(node, freenode_p, freenode_lt) - -/* Memory pool, similar to the one in bam_pileup.c */ -typedef struct { - int cnt, n, max; - freenode_t **buf; -} mempool_t; - -static mempool_t *mp_init() -{ - return (mempool_t*)calloc(1, sizeof(mempool_t)); -} -static void mp_destroy(mempool_t *mp) -{ - int k; - for (k = 0; k < mp->n; ++k) free(mp->buf[k]); - free(mp->buf); free(mp); -} -static inline freenode_t *mp_alloc(mempool_t *mp) -{ - ++mp->cnt; - if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); - else return mp->buf[--mp->n]; -} -static inline void mp_free(mempool_t *mp, freenode_t *p) -{ - --mp->cnt; p->next = 0; p->cnt = TV_GAP; - if (mp->n == mp->max) { - mp->max = mp->max? mp->max<<1 : 256; - mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); - } - mp->buf[mp->n++] = p; -} - -/* core part */ -struct __bam_lplbuf_t { - int max, n_cur, n_pre; - int max_level, *cur_level, *pre_level; - mempool_t *mp; - freenode_t **aux, *head, *tail; - int n_nodes, m_aux; - bam_pileup_f func; - void *user_data; - bam_plbuf_t *plbuf; -}; - -void bam_lplbuf_reset(bam_lplbuf_t *buf) -{ - freenode_t *p, *q; - bam_plbuf_reset(buf->plbuf); - for (p = buf->head; p->next;) { - q = p->next; - mp_free(buf->mp, p); - p = q; - } - buf->head = buf->tail; - buf->max_level = 0; - buf->n_cur = buf->n_pre = 0; - buf->n_nodes = 0; -} - -static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -{ - bam_lplbuf_t *tv = (bam_lplbuf_t*)data; - freenode_t *p; - int i, l, max_level; - // allocate memory if necessary - if (tv->max < n) { // enlarge - tv->max = n; - kroundup32(tv->max); - tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); - tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); - } - tv->n_cur = n; - // update cnt - for (p = tv->head; p->next; p = p->next) - if (p->cnt > 0) --p->cnt; - // calculate cur_level[] - max_level = 0; - for (i = l = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->is_head) { - if (tv->head->next && tv->head->cnt == 0) { // then take a free slot - freenode_t *p = tv->head->next; - tv->cur_level[i] = tv->head->level; - mp_free(tv->mp, tv->head); - tv->head = p; - --tv->n_nodes; - } else tv->cur_level[i] = ++tv->max_level; - } else { - tv->cur_level[i] = tv->pre_level[l++]; - if (p->is_tail) { // then return a free slot - tv->tail->level = tv->cur_level[i]; - tv->tail->next = mp_alloc(tv->mp); - tv->tail = tv->tail->next; - ++tv->n_nodes; - } - } - if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; - ((bam_pileup1_t*)p)->level = tv->cur_level[i]; - } - assert(l == tv->n_pre); - tv->func(tid, pos, n, pl, tv->user_data); - // sort the linked list - if (tv->n_nodes) { - freenode_t *q; - if (tv->n_nodes + 1 > tv->m_aux) { // enlarge - tv->m_aux = tv->n_nodes + 1; - kroundup32(tv->m_aux); - tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); - } - for (p = tv->head, i = l = 0; p->next;) { - if (p->level > max_level) { // then discard this entry - q = p->next; - mp_free(tv->mp, p); - p = q; - } else { - tv->aux[i++] = p; - p = p->next; - } - } - tv->aux[i] = tv->tail; // add a proper tail for the loop below - tv->n_nodes = i; - if (tv->n_nodes) { - ks_introsort(node, tv->n_nodes, tv->aux); - for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; - tv->head = tv->aux[0]; - } else tv->head = tv->tail; - } - // clean up - tv->max_level = max_level; - memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); - // squeeze out terminated levels - for (i = l = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!p->is_tail) - tv->pre_level[l++] = tv->pre_level[i]; - } - tv->n_pre = l; -/* - fprintf(stderr, "%d\t", pos+1); - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->is_head) fprintf(stderr, "^"); - if (p->is_tail) fprintf(stderr, "$"); - fprintf(stderr, "%d,", p->level); - } - fprintf(stderr, "\n"); -*/ - return 0; -} - -bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) -{ - bam_lplbuf_t *tv; - tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); - tv->mp = mp_init(); - tv->head = tv->tail = mp_alloc(tv->mp); - tv->func = func; - tv->user_data = data; - tv->plbuf = bam_plbuf_init(tview_func, tv); - return (bam_lplbuf_t*)tv; -} - -void bam_lplbuf_destroy(bam_lplbuf_t *tv) -{ - freenode_t *p, *q; - free(tv->cur_level); free(tv->pre_level); - bam_plbuf_destroy(tv->plbuf); - free(tv->aux); - for (p = tv->head; p->next;) { - q = p->next; - mp_free(tv->mp, p); p = q; - } - mp_free(tv->mp, p); - assert(tv->mp->cnt == 0); - mp_destroy(tv->mp); - free(tv); -} - -int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) -{ - return bam_plbuf_push(b, tv->plbuf); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_mate.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_mate.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_mate.c 2016-02-14 18:21:17.391079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_mate.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,70 +0,0 @@ -#include -#include -#include "bam.h" - -// currently, this function ONLY works if each read has one hit -void bam_mating_core(bamFile in, bamFile out) -{ - bam_header_t *header; - bam1_t *b[2]; - int curr, has_prev; - - header = bam_header_read(in); - bam_header_write(out, header); - - b[0] = bam_init1(); - b[1] = bam_init1(); - curr = 0; has_prev = 0; - while (bam_read1(in, b[curr]) >= 0) { - bam1_t *cur = b[curr], *pre = b[1-curr]; - if (has_prev) { - if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name - cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; - pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; - if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) - && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) - { - uint32_t cur5, pre5; - cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; - pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; - cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; - } else cur->core.isize = pre->core.isize = 0; - if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; - else cur->core.flag &= ~BAM_FMREVERSE; - if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; - else pre->core.flag &= ~BAM_FMREVERSE; - if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } - if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } - bam_write1(out, pre); - bam_write1(out, cur); - has_prev = 0; - } else { // unpaired or singleton - pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; - if (pre->core.flag & BAM_FPAIRED) { - pre->core.flag |= BAM_FMUNMAP; - pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; - } - bam_write1(out, pre); - } - } else has_prev = 1; - curr = 1 - curr; - } - if (has_prev) bam_write1(out, b[1-curr]); - bam_header_destroy(header); - bam_destroy1(b[0]); - bam_destroy1(b[1]); -} - -int bam_mating(int argc, char *argv[]) -{ - bamFile in, out; - if (argc < 3) { - fprintf(stderr, "samtools fixmate \n"); - return 1; - } - in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); - out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); - bam_mating_core(in, out); - bam_close(in); bam_close(out); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_md.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_md.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_md.c 2016-02-14 18:21:17.392079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_md.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,384 +0,0 @@ -#include -#include -#include -#include -#include -#include "faidx.h" -#include "sam.h" -#include "kstring.h" -#include "kaln.h" -#include "kprobaln.h" - -#define USE_EQUAL 1 -#define DROP_TAG 2 -#define BIN_QUAL 4 -#define UPDATE_NM 8 -#define UPDATE_MD 16 -#define HASH_QNM 32 - -char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; - -int bam_aux_drop_other(bam1_t *b, uint8_t *s); - -void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) -{ - uint8_t *seq = bam1_seq(b); - uint32_t *cigar = bam1_cigar(b); - bam1_core_t *c = &b->core; - int i, x, y, u = 0; - kstring_t *str; - int32_t old_nm_i = -1, nm = 0; - - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int z = y + j; - int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; - if (ref[x+j] == 0) break; // out of boundary - if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match - if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f; - ++u; - } else { - kputw(u, str); kputc(ref[x+j], str); - u = 0; ++nm; - } - } - if (j < l) break; - x += l; y += l; - } else if (op == BAM_CDEL) { - kputw(u, str); kputc('^', str); - for (j = 0; j < l; ++j) { - if (ref[x+j] == 0) break; - kputc(ref[x+j], str); - } - u = 0; - if (j < l) break; - x += l; nm += l; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - y += l; - if (op == BAM_CINS) nm += l; - } else if (op == BAM_CREF_SKIP) { - x += l; - } - } - kputw(u, str); - // apply max_nm - if (max_nm > 0 && nm >= max_nm) { - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int z = y + j; - int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; - if (ref[x+j] == 0) break; // out of boundary - if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match - seq[z/2] |= (z&1)? 0x0f : 0xf0; - bam1_qual(b)[z] = 0; - } - } - if (j < l) break; - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - } - } - // update NM - if (flag & UPDATE_NM) { - uint8_t *old_nm = bam_aux_get(b, "NM"); - if (c->flag & BAM_FUNMAP) return; - if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); - else if (nm != old_nm_i) { - fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); - } - } - // update MD - if (flag & UPDATE_MD) { - uint8_t *old_md = bam_aux_get(b, "MD"); - if (c->flag & BAM_FUNMAP) return; - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { - int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if (toupper(old_md[i+1]) != toupper(str->s[i])) - break; - if (i < str->l) is_diff = 1; - } else is_diff = 1; - if (is_diff) { - fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); - bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - } - } - } - // drop all tags but RG - if (flag&DROP_TAG) { - uint8_t *q = bam_aux_get(b, "RG"); - bam_aux_drop_other(b, q); - } - // reduce the resolution of base quality - if (flag&BIN_QUAL) { - uint8_t *qual = bam1_qual(b); - for (i = 0; i < b->core.l_qseq; ++i) - if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; - } - free(str->s); free(str); -} - -void bam_fillmd1(bam1_t *b, char *ref, int flag) -{ - bam_fillmd1_core(b, ref, flag, 0); -} - -int bam_cap_mapQ(bam1_t *b, char *ref, int thres) -{ - uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b); - uint32_t *cigar = bam1_cigar(b); - bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; - double t; - if (thres < 0) thres = 40; // set the default - mm = q = len = clip_l = clip_q = 0; - for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { - int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (j = 0; j < l; ++j) { - int z = y + j; - int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; - if (ref[x+j] == 0) break; // out of boundary - if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous - ++len; - if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch - ++mm; - q += qual[z] > 33? 33 : qual[z]; - } - } - } - if (j < l) break; - x += l; y += l; len += l; - } else if (op == BAM_CDEL) { - for (j = 0; j < l; ++j) - if (ref[x+j] == 0) break; - if (j < l) break; - x += l; - } else if (op == BAM_CSOFT_CLIP) { - for (j = 0; j < l; ++j) clip_q += qual[y+j]; - clip_l += l; - y += l; - } else if (op == BAM_CHARD_CLIP) { - clip_q += 13 * l; - clip_l += l; - } else if (op == BAM_CINS) y += l; - else if (op == BAM_CREF_SKIP) x += l; - } - for (i = 0, t = 1; i < mm; ++i) - t *= (double)len / (i+1); - t = q - 4.343 * log(t) + clip_q / 5.; - if (t > thres) return -1; - if (t < 0) t = 0; - t = sqrt((thres - t) / thres) * thres; -// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q); - return (int)(t + .499); -} - -int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) -{ - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1; - uint32_t *cigar = bam1_cigar(b); - bam1_core_t *c = &b->core; - kpa_par_t conf = kpa_par_def; - uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b); - if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing - // test if BQ or ZQ is present - if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; - if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; - if (bq && zq) { // remove the ZQ tag - bam_aux_del(b, zq-1); - zq = 0; - } - if (bq || zq) { - if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing - if (bq && apply_baq) { // then convert BQ to ZQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); - *(bq - 3) = 'Z'; - } else if (zq && !apply_baq) { // then convert ZQ to BQ - for (i = 0; i < c->l_qseq; ++i) - qual[i] += (int)zq[i] - 64; - *(zq - 3) = 'B'; - } - return 0; - } - // find the start and end of the alignment - x = c->pos, y = 0, yb = ye = xb = xe = -1; - for (k = 0; k < c->n_cigar; ++k) { - int op, l; - op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (yb < 0) yb = y; - if (xb < 0) xb = x; - ye = y + l; xe = x + l; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip - } - // set bandwidth and the start and the end - bw = 7; - if (abs((xe - xb) - (ye - yb)) > bw) - bw = abs((xe - xb) - (ye - yb)) + 3; - conf.bw = bw; - xb -= yb + bw/2; if (xb < 0) xb = 0; - xe += c->l_qseq - ye + bw/2; - if (xe - xb - c->l_qseq > bw) - xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; - { // glocal - uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; - int *state; - bq = calloc(c->l_qseq + 1, 1); - memcpy(bq, qual, c->l_qseq); - s = calloc(c->l_qseq, 1); - for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; - r = calloc(xe - xb, 1); - for (i = xb; i < xe; ++i) { - if (ref[i] == 0) { xe = i; break; } - r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; - } - state = calloc(c->l_qseq, sizeof(int)); - q = calloc(c->l_qseq, 1); - kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); - if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) { - if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; - else bq[i] = bq[i] < q[i]? bq[i] : q[i]; - } - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ - } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) - uint8_t *left, *rght; - left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (i = y; i < y + l; ++i) - bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; - for (left[y] = bq[y], i = y + 1; i < y + l; ++i) - left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; - for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) - rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; - for (i = y; i < y + l; ++i) - bq[i] = left[i] < rght[i]? left[i] : rght[i]; - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; - } - for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ - free(left); free(rght); - } - if (apply_baq) { - for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual - bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); - } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); - free(bq); free(s); free(r); free(q); free(state); - } - return 0; -} - -int bam_prob_realn(bam1_t *b, const char *ref) -{ - return bam_prob_realn_core(b, ref, 1); -} - -int bam_fillmd(int argc, char *argv[]) -{ - int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; - samfile_t *fp, *fpout = 0; - faidx_t *fai; - char *ref = 0, mode_w[8], mode_r[8]; - bam1_t *b; - - flt_flag = UPDATE_NM | UPDATE_MD; - is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; - mode_w[0] = mode_r[0] = 0; - strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { - switch (c) { - case 'r': is_realn = 1; break; - case 'e': flt_flag |= USE_EQUAL; break; - case 'd': flt_flag |= DROP_TAG; break; - case 'q': flt_flag |= BIN_QUAL; break; - case 'h': flt_flag |= HASH_QNM; break; - case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; - case 'b': is_bam_out = 1; break; - case 'u': is_uncompressed = is_bam_out = 1; break; - case 'S': is_sam_in = 1; break; - case 'n': max_nm = atoi(optarg); break; - case 'C': capQ = atoi(optarg); break; - case 'A': baq_flag |= 1; break; - case 'E': baq_flag |= 2; break; - default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; - } - } - if (!is_sam_in) strcat(mode_r, "b"); - if (is_bam_out) strcat(mode_w, "b"); - else strcat(mode_w, "h"); - if (is_uncompressed) strcat(mode_w, "u"); - if (optind + 1 >= argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools fillmd [-eubrS] \n\n"); - fprintf(stderr, "Options: -e change identical bases to '='\n"); - fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); - fprintf(stderr, " -b compressed BAM output\n"); - fprintf(stderr, " -S the input is SAM with header\n"); - fprintf(stderr, " -A modify the quality string\n"); - fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"); - fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n"); - return 1; - } - fp = samopen(argv[optind], mode_r, 0); - if (fp == 0) return 1; - if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) { - fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); - return 1; - } - fpout = samopen("-", mode_w, fp->header); - fai = fai_load(argv[optind+1]); - - b = bam_init1(); - while ((ret = samread(fp, b)) >= 0) { - if (b->core.tid >= 0) { - if (tid != b->core.tid) { - free(ref); - ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len); - tid = b->core.tid; - if (ref == 0) - fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", - fp->header->target_name[tid]); - } - if (is_realn) bam_prob_realn_core(b, ref, baq_flag); - if (capQ > 10) { - int q = bam_cap_mapQ(b, ref, capQ); - if (b->core.qual > q) b->core.qual = q; - } - if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); - } - samwrite(fpout, b); - } - bam_destroy1(b); - - free(ref); - fai_destroy(fai); - samclose(fp); samclose(fpout); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_pileup.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_pileup.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_pileup.c 2016-02-14 18:21:17.400079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_pileup.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,437 +0,0 @@ -#include -#include -#include -#include -#include "sam.h" - -typedef struct { - int k, x, y, end; -} cstate_t; - -static cstate_t g_cstate_null = { -1, 0, 0, 0 }; - -typedef struct __linkbuf_t { - bam1_t b; - uint32_t beg, end; - cstate_t s; - struct __linkbuf_t *next; -} lbnode_t; - -/* --- BEGIN: Memory pool */ - -typedef struct { - int cnt, n, max; - lbnode_t **buf; -} mempool_t; - -static mempool_t *mp_init() -{ - mempool_t *mp; - mp = (mempool_t*)calloc(1, sizeof(mempool_t)); - return mp; -} -static void mp_destroy(mempool_t *mp) -{ - int k; - for (k = 0; k < mp->n; ++k) { - free(mp->buf[k]->b.data); - free(mp->buf[k]); - } - free(mp->buf); - free(mp); -} -static inline lbnode_t *mp_alloc(mempool_t *mp) -{ - ++mp->cnt; - if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); - else return mp->buf[--mp->n]; -} -static inline void mp_free(mempool_t *mp, lbnode_t *p) -{ - --mp->cnt; p->next = 0; // clear lbnode_t::next here - if (mp->n == mp->max) { - mp->max = mp->max? mp->max<<1 : 256; - mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); - } - mp->buf[mp->n++] = p; -} - -/* --- END: Memory pool */ - -/* --- BEGIN: Auxiliary functions */ - -/* s->k: the index of the CIGAR operator that has just been processed. - s->x: the reference coordinate of the start of s->k - s->y: the query coordiante of the start of s->k - */ -static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) -{ -#define _cop(c) ((c)&BAM_CIGAR_MASK) -#define _cln(c) ((c)>>BAM_CIGAR_SHIFT) - - bam1_t *b = p->b; - bam1_core_t *c = &b->core; - uint32_t *cigar = bam1_cigar(b); - int k, is_head = 0; - // determine the current CIGAR operation -// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y); - if (s->k == -1) { // never processed - is_head = 1; - if (c->n_cigar == 1) { // just one operation, save a loop - if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; - } else { // find the first match or deletion - for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { - int op = _cop(cigar[k]); - int l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; - else if (op == BAM_CREF_SKIP) s->x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; - } - assert(k < c->n_cigar); - s->k = k; - } - } else { // the read has been processed before - int op, l = _cln(cigar[s->k]); - if (pos - s->x >= l) { // jump to the next operation - assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case - op = _cop(cigar[s->k+1]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop - if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; - s->x += l; - ++s->k; - } else { // find the next M/D/N/=/X - if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; - s->x += l; - for (k = s->k + 1; k < c->n_cigar; ++k) { - op = _cop(cigar[k]), l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; - } - s->k = k; - } - assert(s->k < c->n_cigar); // otherwise a bug - } // else, do nothing - } - { // collect pileup information - int op, l; - op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); - p->is_del = p->indel = p->is_refskip = 0; - if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation - int op2 = _cop(cigar[s->k+1]); - int l2 = _cln(cigar[s->k+1]); - if (op2 == BAM_CDEL) p->indel = -(int)l2; - else if (op2 == BAM_CINS) p->indel = l2; - else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding - int l3 = 0; - for (k = s->k + 2; k < c->n_cigar; ++k) { - op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); - if (op2 == BAM_CINS) l3 += l2; - else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; - } - if (l3 > 0) p->indel = l3; - } - } - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - p->qpos = s->y + (pos - s->x); - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! - p->is_refskip = (op == BAM_CREF_SKIP); - } // cannot be other operations; otherwise a bug - p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); - } - return 1; -} - -/* --- END: Auxiliary functions */ - -/******************* - * pileup iterator * - *******************/ - -struct __bam_plp_t { - mempool_t *mp; - lbnode_t *head, *tail, *dummy; - int32_t tid, pos, max_tid, max_pos; - int is_eof, flag_mask, max_plp, error, maxcnt; - bam_pileup1_t *plp; - // for the "auto" interface only - bam1_t *b; - bam_plp_auto_f func; - void *data; -}; - -bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) -{ - bam_plp_t iter; - iter = calloc(1, sizeof(struct __bam_plp_t)); - iter->mp = mp_init(); - iter->head = iter->tail = mp_alloc(iter->mp); - iter->dummy = mp_alloc(iter->mp); - iter->max_tid = iter->max_pos = -1; - iter->flag_mask = BAM_DEF_MASK; - iter->maxcnt = 8000; - if (func) { - iter->func = func; - iter->data = data; - iter->b = bam_init1(); - } - return iter; -} - -void bam_plp_destroy(bam_plp_t iter) -{ - mp_free(iter->mp, iter->dummy); - mp_free(iter->mp, iter->head); - if (iter->mp->cnt != 0) - fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt); - mp_destroy(iter->mp); - if (iter->b) bam_destroy1(iter->b); - free(iter->plp); - free(iter); -} - -const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) -{ - if (iter->error) { *_n_plp = -1; return 0; } - *_n_plp = 0; - if (iter->is_eof && iter->head->next == 0) return 0; - while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { - int n_plp = 0; - lbnode_t *p, *q; - // write iter->plp at iter->pos - iter->dummy->next = iter->head; - for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { - if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove - q->next = p->next; mp_free(iter->mp, p); p = q; - } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup - if (n_plp == iter->max_plp) { // then double the capacity - iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; - iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); - } - iter->plp[n_plp].b = &p->b; - if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... - } - } - iter->head = iter->dummy->next; // dummy->next may be changed - *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; - // update iter->tid and iter->pos - if (iter->head->next) { - if (iter->tid > iter->head->b.core.tid) { - fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__); - iter->error = 1; - *_n_plp = -1; - return 0; - } - } - if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence - iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference - } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid - iter->pos = iter->head->beg; // jump to the next position - } else ++iter->pos; // scan contiguously - // return - if (n_plp) return iter->plp; - if (iter->is_eof && iter->head->next == 0) break; - } - return 0; -} - -int bam_plp_push(bam_plp_t iter, const bam1_t *b) -{ - if (iter->error) return -1; - if (b) { - if (b->core.tid < 0) return 0; - if (b->core.flag & iter->flag_mask) return 0; - if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0; - bam_copy1(&iter->tail->b, b); - iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); - iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t - if (b->core.tid < iter->max_tid) { - fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); - iter->error = 1; - return -1; - } - if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { - fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); - iter->error = 1; - return -1; - } - iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; - if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { - iter->tail->next = mp_alloc(iter->mp); - iter->tail = iter->tail->next; - } - } else iter->is_eof = 1; - return 0; -} - -const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) -{ - const bam_pileup1_t *plp; - if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; - else { // no pileup line can be obtained; read alignments - *_n_plp = 0; - if (iter->is_eof) return 0; - while (iter->func(iter->data, iter->b) >= 0) { - if (bam_plp_push(iter, iter->b) < 0) { - *_n_plp = -1; - return 0; - } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; - // otherwise no pileup line can be returned; read the next alignment. - } - bam_plp_push(iter, 0); - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; - return 0; - } -} - -void bam_plp_reset(bam_plp_t iter) -{ - lbnode_t *p, *q; - iter->max_tid = iter->max_pos = -1; - iter->tid = iter->pos = 0; - iter->is_eof = 0; - for (p = iter->head; p->next;) { - q = p->next; - mp_free(iter->mp, p); - p = q; - } - iter->head = iter->tail; -} - -void bam_plp_set_mask(bam_plp_t iter, int mask) -{ - iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask); -} - -void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) -{ - iter->maxcnt = maxcnt; -} - -/***************** - * callback APIs * - *****************/ - -int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) -{ - bam_plbuf_t *buf; - int ret; - bam1_t *b; - b = bam_init1(); - buf = bam_plbuf_init(func, func_data); - bam_plbuf_set_mask(buf, mask); - while ((ret = bam_read1(fp, b)) >= 0) - bam_plbuf_push(b, buf); - bam_plbuf_push(0, buf); - bam_plbuf_destroy(buf); - bam_destroy1(b); - return 0; -} - -void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) -{ - bam_plp_set_mask(buf->iter, mask); -} - -void bam_plbuf_reset(bam_plbuf_t *buf) -{ - bam_plp_reset(buf->iter); -} - -bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) -{ - bam_plbuf_t *buf; - buf = calloc(1, sizeof(bam_plbuf_t)); - buf->iter = bam_plp_init(0, 0); - buf->func = func; - buf->data = data; - return buf; -} - -void bam_plbuf_destroy(bam_plbuf_t *buf) -{ - bam_plp_destroy(buf->iter); - free(buf); -} - -int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) -{ - int ret, n_plp, tid, pos; - const bam_pileup1_t *plp; - ret = bam_plp_push(buf->iter, b); - if (ret < 0) return ret; - while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) - buf->func(tid, pos, n_plp, plp, buf->data); - return 0; -} - -/*********** - * mpileup * - ***********/ - -struct __bam_mplp_t { - int n; - uint64_t min, *pos; - bam_plp_t *iter; - int *n_plp; - const bam_pileup1_t **plp; -}; - -bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) -{ - int i; - bam_mplp_t iter; - iter = calloc(1, sizeof(struct __bam_mplp_t)); - iter->pos = calloc(n, 8); - iter->n_plp = calloc(n, sizeof(int)); - iter->plp = calloc(n, sizeof(void*)); - iter->iter = calloc(n, sizeof(void*)); - iter->n = n; - iter->min = (uint64_t)-1; - for (i = 0; i < n; ++i) { - iter->iter[i] = bam_plp_init(func, data[i]); - iter->pos[i] = iter->min; - } - return iter; -} - -void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) -{ - int i; - for (i = 0; i < iter->n; ++i) - iter->iter[i]->maxcnt = maxcnt; -} - -void bam_mplp_destroy(bam_mplp_t iter) -{ - int i; - for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); - free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); - free(iter); -} - -int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) -{ - int i, ret = 0; - uint64_t new_min = (uint64_t)-1; - for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { - int tid, pos; - iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); - iter->pos[i] = (uint64_t)tid<<32 | pos; - } - if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; - } - iter->min = new_min; - if (new_min == (uint64_t)-1) return 0; - *_tid = new_min>>32; *_pos = (uint32_t)new_min; - for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" - n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; - ++ret; - } else n_plp[i] = 0, plp[i] = 0; - } - return ret; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_plcmd.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_plcmd.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_plcmd.c 2016-02-14 18:21:17.410079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_plcmd.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,546 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "sam.h" -#include "faidx.h" -#include "kstring.h" - -static inline int printw(int c, FILE *fp) -{ - char buf[16]; - int l, x; - if (c == 0) return fputc('0', fp); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - buf[l] = 0; - for (x = 0; x < l/2; ++x) { - int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y; - } - fputs(buf, fp); - return 0; -} - -static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) -{ - int j; - if (p->is_head) { - putchar('^'); - putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33); - } - if (!p->is_del) { - int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - if (ref) { - int rb = pos < ref_len? ref[pos] : 'N'; - if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.'; - else c = bam1_strand(p->b)? tolower(c) : toupper(c); - } else { - if (c == '=') c = bam1_strand(p->b)? ',' : '.'; - else c = bam1_strand(p->b)? tolower(c) : toupper(c); - } - putchar(c); - } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'); - if (p->indel > 0) { - putchar('+'); printw(p->indel, stdout); - for (j = 1; j <= p->indel; ++j) { - int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } else if (p->indel < 0) { - printw(p->indel, stdout); - for (j = 1; j <= -p->indel; ++j) { - int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } - if (p->is_tail) putchar('$'); -} - -#include -#include "bam2bcf.h" -#include "sample.h" - -#define MPLP_GLF 0x10 -#define MPLP_NO_COMP 0x20 -#define MPLP_NO_ORPHAN 0x40 -#define MPLP_REALN 0x80 -#define MPLP_FMT_DP 0x100 -#define MPLP_FMT_SP 0x200 -#define MPLP_NO_INDEL 0x400 -#define MPLP_EXT_BAQ 0x800 -#define MPLP_ILLUMINA13 0x1000 -#define MPLP_IGNORE_RG 0x2000 -#define MPLP_PRINT_POS 0x4000 -#define MPLP_PRINT_MAPQ 0x8000 - -void *bed_read(const char *fn); -void bed_destroy(void *_h); -int bed_overlap(const void *_h, const char *chr, int beg, int end); - -typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; - int openQ, extQ, tandemQ, min_support; // for indels - double min_frac; // for indels - char *reg, *pl_list; - faidx_t *fai; - void *bed, *rghash; -} mplp_conf_t; - -typedef struct { - bamFile fp; - bam_iter_t iter; - bam_header_t *h; - int ref_id; - char *ref; - const mplp_conf_t *conf; -} mplp_aux_t; - -typedef struct { - int n; - int *n_plp, *m_plp; - bam_pileup1_t **plp; -} mplp_pileup_t; - -static int mplp_func(void *data, bam1_t *b) -{ - extern int bam_realn(bam1_t *b, const char *ref); - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int); - extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); - mplp_aux_t *ma = (mplp_aux_t*)data; - int ret, skip = 0; - do { - int has_ref; - ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); - if (ret < 0) break; - if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads - skip = 1; - continue; - } - if (ma->conf->bed) { // test overlap - skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); - if (skip) continue; - } - if (ma->conf->rghash) { // exclude read groups - uint8_t *rg = bam_aux_get(b, "RG"); - skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); - if (skip) continue; - } - if (ma->conf->flag & MPLP_ILLUMINA13) { - int i; - uint8_t *qual = bam1_qual(b); - for (i = 0; i < b->core.l_qseq; ++i) - qual[i] = qual[i] > 31? qual[i] - 31 : 0; - } - has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; - skip = 0; - if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1); - if (has_ref && ma->conf->capQ_thres > 10) { - int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); - if (q < 0) skip = 1; - else if (b->core.qual > q) b->core.qual = q; - } - else if (b->core.qual < ma->conf->min_mq) skip = 1; - else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; - } while (skip); - return ret; -} - -static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, - int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg) -{ - int i, j; - memset(m->n_plp, 0, m->n * sizeof(int)); - for (i = 0; i < n; ++i) { - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - uint8_t *q; - int id = -1; - q = ignore_rg? 0 : bam_aux_get(p->b, "RG"); - if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); - if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - if (id < 0 || id >= m->n) { - assert(q); // otherwise a bug - fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); - exit(1); - } - if (m->n_plp[id] == m->m_plp[id]) { - m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; - m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); - } - m->plp[id][m->n_plp[id]++] = *p; - } - } -} - -static int mpileup(mplp_conf_t *conf, int n, char **fn) -{ - extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); - extern void bcf_call_del_rghash(void *rghash); - mplp_aux_t **data; - int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; - const bam_pileup1_t **plp; - bam_mplp_t iter; - bam_header_t *h = 0; - char *ref; - void *rghash = 0; - - bcf_callaux_t *bca = 0; - bcf_callret1_t *bcr = 0; - bcf_call_t bc; - bcf_t *bp = 0; - bcf_hdr_t *bh = 0; - - bam_sample_t *sm = 0; - kstring_t buf; - mplp_pileup_t gplp; - - memset(&gplp, 0, sizeof(mplp_pileup_t)); - memset(&buf, 0, sizeof(kstring_t)); - memset(&bc, 0, sizeof(bcf_call_t)); - data = calloc(n, sizeof(void*)); - plp = calloc(n, sizeof(void*)); - n_plp = calloc(n, sizeof(int*)); - sm = bam_smpl_init(); - - // read the header and initialize data - for (i = 0; i < n; ++i) { - bam_header_t *h_tmp; - data[i] = calloc(1, sizeof(mplp_aux_t)); - data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); - data[i]->conf = conf; - h_tmp = bam_header_read(data[i]->fp); - data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet - bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); - rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); - if (conf->reg) { - int beg, end; - bam_index_t *idx; - idx = bam_index_load(fn[i]); - if (idx == 0) { - fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); - exit(1); - } - if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { - fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); - exit(1); - } - if (i == 0) tid0 = tid, beg0 = beg, end0 = end; - data[i]->iter = bam_iter_query(idx, tid, beg, end); - bam_index_destroy(idx); - } - if (i == 0) h = h_tmp; - else { - // FIXME: to check consistency - bam_header_destroy(h_tmp); - } - } - gplp.n = sm->n; - gplp.n_plp = calloc(sm->n, sizeof(int)); - gplp.m_plp = calloc(sm->n, sizeof(int)); - gplp.plp = calloc(sm->n, sizeof(void*)); - - fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - // write the VCF header - if (conf->flag & MPLP_GLF) { - kstring_t s; - bh = calloc(1, sizeof(bcf_hdr_t)); - s.l = s.m = 0; s.s = 0; - bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); - for (i = 0; i < h->n_targets; ++i) { - kputs(h->target_name[i], &s); - kputc('\0', &s); - } - bh->l_nm = s.l; - bh->name = malloc(s.l); - memcpy(bh->name, s.s, s.l); - s.l = 0; - for (i = 0; i < sm->n; ++i) { - kputs(sm->smpl[i], &s); kputc('\0', &s); - } - bh->l_smpl = s.l; - bh->sname = malloc(s.l); - memcpy(bh->sname, s.s, s.l); - bh->txt = malloc(strlen(BAM_VERSION) + 64); - bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); - free(s.s); - bcf_hdr_sync(bh); - bcf_hdr_write(bp, bh); - bca = bcf_call_init(-1., conf->min_baseQ); - bcr = calloc(sm->n, sizeof(bcf_callret1_t)); - bca->rghash = rghash; - bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; - bca->min_frac = conf->min_frac; - bca->min_support = conf->min_support; - } - if (tid0 >= 0 && conf->fai) { // region is set - ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); - ref_tid = tid0; - for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; - } else ref_tid = -1, ref = 0; - iter = bam_mplp_init(n, mplp_func, (void**)data); - max_depth = conf->max_depth; - if (max_depth * sm->n > 1<<20) - fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); - if (max_depth * sm->n < 8000) { - max_depth = 8000 / sm->n; - fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); - } - max_indel_depth = conf->max_indel_depth * sm->n; - bam_mplp_set_maxcnt(iter, max_depth); - while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { - if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; - if (tid != ref_tid) { - free(ref); ref = 0; - if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); - for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; - ref_tid = tid; - } - if (conf->flag & MPLP_GLF) { - int total_depth, _ref0, ref16; - bcf1_t *b = calloc(1, sizeof(bcf1_t)); - for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); - _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; - ref16 = bam_nt16_table[_ref0]; - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bcf_call_combine(gplp.n, bcr, ref16, &bc); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), 0, 0); - bcf_write(bp, bh, b); - bcf_destroy(b); - // call indels - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { - for (i = 0; i < gplp.n; ++i) - bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { - b = calloc(1, sizeof(bcf1_t)); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), bca, ref); - bcf_write(bp, bh, b); - bcf_destroy(b); - } - } - } else { - printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j; - printf("\t%d\t", n_plp[i]); - if (n_plp[i] == 0) { - printf("*\t*"); // FIXME: printf() is very slow... - if (conf->flag & MPLP_PRINT_POS) printf("\t*"); - } else { - for (j = 0; j < n_plp[i]; ++j) - pileup_seq(plp[i] + j, pos, ref_len, ref); - putchar('\t'); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = bam1_qual(p->b)[p->qpos] + 33; - if (c > 126) c = 126; - putchar(c); - } - if (conf->flag & MPLP_PRINT_MAPQ) { - putchar('\t'); - for (j = 0; j < n_plp[i]; ++j) { - int c = plp[i][j].b->core.qual + 33; - if (c > 126) c = 126; - putchar(c); - } - } - if (conf->flag & MPLP_PRINT_POS) { - putchar('\t'); - for (j = 0; j < n_plp[i]; ++j) { - if (j > 0) putchar(','); - printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... - } - } - } - } - putchar('\n'); - } - } - - bcf_close(bp); - bam_smpl_destroy(sm); free(buf.s); - for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); - free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); - bcf_call_del_rghash(rghash); - bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); - bam_mplp_destroy(iter); - bam_header_destroy(h); - for (i = 0; i < n; ++i) { - bam_close(data[i]->fp); - if (data[i]->iter) bam_iter_destroy(data[i]->iter); - free(data[i]); - } - free(data); free(plp); free(ref); free(n_plp); - return 0; -} - -#define MAX_PATH_LEN 1024 -static int read_file_list(const char *file_list,int *n,char **argv[]) -{ - char buf[MAX_PATH_LEN]; - int len, nfiles; - char **files; - - FILE *fh = fopen(file_list,"r"); - if ( !fh ) - { - fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); - return 1; - } - - // Speed is not an issue here, determine the number of files by reading the file twice - nfiles = 0; - while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++; - - if ( fseek(fh, 0L, SEEK_SET) ) - { - fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); - return 1; - } - - files = calloc(nfiles,sizeof(char*)); - nfiles = 0; - while ( fgets(buf,MAX_PATH_LEN,fh) ) - { - len = strlen(buf); - while ( len>0 && isspace(buf[len-1]) ) len--; - if ( !len ) continue; - - files[nfiles] = malloc(sizeof(char)*(len+1)); - strncpy(files[nfiles],buf,len); - files[nfiles][len] = 0; - nfiles++; - } - fclose(fh); - if ( !nfiles ) - { - fprintf(stderr,"No files read from %s\n", file_list); - return 1; - } - *argv = files; - *n = nfiles; - return 0; -} -#undef MAX_PATH_LEN - -int bam_mpileup(int argc, char *argv[]) -{ - int c; - const char *file_list = NULL; - char **fn = NULL; - int nfiles = 0, use_orphan = 0; - mplp_conf_t mplp; - memset(&mplp, 0, sizeof(mplp_conf_t)); - #define MPLP_PRINT_POS 0x4000 - mplp.max_mq = 60; - mplp.min_baseQ = 13; - mplp.capQ_thres = 0; - mplp.max_depth = 250; mplp.max_indel_depth = 250; - mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; - mplp.min_frac = 0.002; mplp.min_support = 1; - mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { - switch (c) { - case 'f': - mplp.fai = fai_load(optarg); - if (mplp.fai == 0) return 1; - break; - case 'd': mplp.max_depth = atoi(optarg); break; - case 'r': mplp.reg = strdup(optarg); break; - case 'l': mplp.bed = bed_read(optarg); break; - case 'P': mplp.pl_list = strdup(optarg); break; - case 'g': mplp.flag |= MPLP_GLF; break; - case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; - case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; - case 'B': mplp.flag &= ~MPLP_REALN; break; - case 'D': mplp.flag |= MPLP_FMT_DP; break; - case 'S': mplp.flag |= MPLP_FMT_SP; break; - case 'I': mplp.flag |= MPLP_NO_INDEL; break; - case 'E': mplp.flag |= MPLP_EXT_BAQ; break; - case '6': mplp.flag |= MPLP_ILLUMINA13; break; - case 'R': mplp.flag |= MPLP_IGNORE_RG; break; - case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; - case 'O': mplp.flag |= MPLP_PRINT_POS; break; - case 'C': mplp.capQ_thres = atoi(optarg); break; - case 'M': mplp.max_mq = atoi(optarg); break; - case 'q': mplp.min_mq = atoi(optarg); break; - case 'Q': mplp.min_baseQ = atoi(optarg); break; - case 'b': file_list = optarg; break; - case 'o': mplp.openQ = atoi(optarg); break; - case 'e': mplp.extQ = atoi(optarg); break; - case 'h': mplp.tandemQ = atoi(optarg); break; - case 'A': use_orphan = 1; break; - case 'F': mplp.min_frac = atof(optarg); break; - case 'm': mplp.min_support = atoi(optarg); break; - case 'L': mplp.max_indel_depth = atoi(optarg); break; - case 'G': { - FILE *fp_rg; - char buf[1024]; - mplp.rghash = bcf_str2id_init(); - if ((fp_rg = fopen(optarg, "r")) == 0) - fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); - while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... - bcf_str2id_add(mplp.rghash, strdup(buf)); - fclose(fp_rg); - } - break; - } - } - if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); - fprintf(stderr, "Input options:\n\n"); - fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); - fprintf(stderr, " -A count anomalous read pairs\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); - fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); - fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); - fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); - fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); - fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); - fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); - fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); - fprintf(stderr, " -R ignore RG tags\n"); - fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); - fprintf(stderr, "\nOutput options:\n\n"); - fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); - fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); - fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); - fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); - fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); - fprintf(stderr, " -u generate uncompress BCF output\n"); - fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); - fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); - fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); - fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); - fprintf(stderr, " -I do not perform indel calling\n"); - fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); - fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); - fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); - fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); - return 1; - } - if (file_list) { - if ( read_file_list(file_list,&nfiles,&fn) ) return 1; - mpileup(&mplp,nfiles,fn); - for (c=0; c -#include -#include "bgzf.h" -#include "bam.h" - -#define BUF_SIZE 0x10000 - -int bam_reheader(BGZF *in, const bam_header_t *h, int fd) -{ - BGZF *fp; - bam_header_t *old; - int len; - uint8_t *buf; - if (in->open_mode != 'r') return -1; - buf = malloc(BUF_SIZE); - old = bam_header_read(in); - fp = bgzf_fdopen(fd, "w"); - bam_header_write(fp, h); - if (in->block_offset < in->block_length) { - bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); - bgzf_flush(fp); - } -#ifdef _USE_KNETFILE - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) - fwrite(buf, 1, len, fp->x.fpw); -#else - while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) - fwrite(buf, 1, len, fp->file); -#endif - free(buf); - fp->block_offset = in->block_offset = 0; - bgzf_close(fp); - return 0; -} - -int main_reheader(int argc, char *argv[]) -{ - bam_header_t *h; - BGZF *in; - if (argc != 3) { - fprintf(stderr, "Usage: samtools reheader \n"); - return 1; - } - { // read the header - tamFile fph = sam_open(argv[1]); - if (fph == 0) { - fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); - return 1; - } - h = sam_header_read(fph); - sam_close(fph); - } - in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); - if (in == 0) { - fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); - return 1; - } - bam_reheader(in, h, fileno(stdout)); - bgzf_close(in); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_rmdup.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_rmdup.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_rmdup.c 2016-02-14 18:21:17.458079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_rmdup.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,206 +0,0 @@ -#include -#include -#include -#include -#include -#include "sam.h" - -typedef bam1_t *bam1_p; - -#include "khash.h" -KHASH_SET_INIT_STR(name) -KHASH_MAP_INIT_INT64(pos, bam1_p) - -#define BUFFER_SIZE 0x40000 - -typedef struct { - uint64_t n_checked, n_removed; - khash_t(pos) *best_hash; -} lib_aux_t; -KHASH_MAP_INIT_STR(lib, lib_aux_t) - -typedef struct { - int n, max; - bam1_t **a; -} tmp_stack_t; - -static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) -{ - if (stack->n == stack->max) { - stack->max = stack->max? stack->max<<1 : 0x10000; - stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); - } - stack->a[stack->n++] = b; -} - -static inline void dump_best(tmp_stack_t *stack, samfile_t *out) -{ - int i; - for (i = 0; i != stack->n; ++i) { - samwrite(out, stack->a[i]); - bam_destroy1(stack->a[i]); - } - stack->n = 0; -} - -static void clear_del_set(khash_t(name) *del_set) -{ - khint_t k; - for (k = kh_begin(del_set); k < kh_end(del_set); ++k) - if (kh_exist(del_set, k)) - free((char*)kh_key(del_set, k)); - kh_clear(name, del_set); -} - -static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) -{ - khint_t k = kh_get(lib, aux, lib); - if (k == kh_end(aux)) { - int ret; - char *p = strdup(lib); - lib_aux_t *q; - k = kh_put(lib, aux, p, &ret); - q = &kh_val(aux, k); - q->n_checked = q->n_removed = 0; - q->best_hash = kh_init(pos); - return q; - } else return &kh_val(aux, k); -} - -static void clear_best(khash_t(lib) *aux, int max) -{ - khint_t k; - for (k = kh_begin(aux); k != kh_end(aux); ++k) { - if (kh_exist(aux, k)) { - lib_aux_t *q = &kh_val(aux, k); - if (kh_size(q->best_hash) >= max) - kh_clear(pos, q->best_hash); - } - } -} - -static inline int sum_qual(const bam1_t *b) -{ - int i, q; - uint8_t *qual = bam1_qual(b); - for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; - return q; -} - -void bam_rmdup_core(samfile_t *in, samfile_t *out) -{ - bam1_t *b; - int last_tid = -1, last_pos = -1; - tmp_stack_t stack; - khint_t k; - khash_t(lib) *aux; - khash_t(name) *del_set; - - aux = kh_init(lib); - del_set = kh_init(name); - b = bam_init1(); - memset(&stack, 0, sizeof(tmp_stack_t)); - - kh_resize(name, del_set, 4 * BUFFER_SIZE); - while (samread(in, b) >= 0) { - bam1_core_t *c = &b->core; - if (c->tid != last_tid || last_pos != c->pos) { - dump_best(&stack, out); // write the result - clear_best(aux, BUFFER_SIZE); - if (c->tid != last_tid) { - clear_best(aux, 0); - if (kh_size(del_set)) { // check - fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); - clear_del_set(del_set); - } - if ((int)c->tid == -1) { // append unmapped reads - samwrite(out, b); - while (samread(in, b) >= 0) samwrite(out, b); - break; - } - last_tid = c->tid; - fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); - } - } - if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { - samwrite(out, b); - } else if (c->isize > 0) { // paired, head - uint64_t key = (uint64_t)c->pos<<32 | c->isize; - const char *lib; - lib_aux_t *q; - int ret; - lib = bam_get_library(in->header, b); - q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); - ++q->n_checked; - k = kh_put(pos, q->best_hash, key, &ret); - if (ret == 0) { // found in best_hash - bam1_t *p = kh_val(q->best_hash, k); - ++q->n_removed; - if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle - kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed - bam_copy1(p, b); // replaced as b - } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed - if (ret == 0) - fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); - } else { // not found in best_hash - kh_val(q->best_hash, k) = bam_dup1(b); - stack_insert(&stack, kh_val(q->best_hash, k)); - } - } else { // paired, tail - k = kh_get(name, del_set, bam1_qname(b)); - if (k != kh_end(del_set)) { - free((char*)kh_key(del_set, k)); - kh_del(name, del_set, k); - } else samwrite(out, b); - } - last_pos = c->pos; - } - - for (k = kh_begin(aux); k != kh_end(aux); ++k) { - if (kh_exist(aux, k)) { - lib_aux_t *q = &kh_val(aux, k); - dump_best(&stack, out); - fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, - (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); - kh_destroy(pos, q->best_hash); - free((char*)kh_key(aux, k)); - } - } - kh_destroy(lib, aux); - - clear_del_set(del_set); - kh_destroy(name, del_set); - free(stack.a); - bam_destroy1(b); -} - -void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); - -int bam_rmdup(int argc, char *argv[]) -{ - int c, is_se = 0, force_se = 0; - samfile_t *in, *out; - while ((c = getopt(argc, argv, "sS")) >= 0) { - switch (c) { - case 's': is_se = 1; break; - case 'S': force_se = is_se = 1; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools rmdup [-sS] \n\n"); - fprintf(stderr, "Option: -s rmdup for SE reads\n"); - fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); - return 1; - } - in = samopen(argv[optind], "rb", 0); - out = samopen(argv[optind+1], "wb", in->header); - if (in == 0 || out == 0) { - fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); - return 1; - } - if (is_se) bam_rmdupse_core(in, out, force_se); - else bam_rmdup_core(in, out); - samclose(in); samclose(out); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_rmdupse.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_rmdupse.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_rmdupse.c 2016-02-14 18:21:17.467079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_rmdupse.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,159 +0,0 @@ -#include -#include "sam.h" -#include "khash.h" -#include "klist.h" - -#define QUEUE_CLEAR_SIZE 0x100000 -#define MAX_POS 0x7fffffff - -typedef struct { - int endpos; - uint32_t score:31, discarded:1; - bam1_t *b; -} elem_t, *elem_p; -#define __free_elem(p) bam_destroy1((p)->data.b) -KLIST_INIT(q, elem_t, __free_elem) -typedef klist_t(q) queue_t; - -KHASH_MAP_INIT_INT(best, elem_p) -typedef khash_t(best) besthash_t; - -typedef struct { - uint64_t n_checked, n_removed; - besthash_t *left, *rght; -} lib_aux_t; -KHASH_MAP_INIT_STR(lib, lib_aux_t) - -static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) -{ - khint_t k = kh_get(lib, aux, lib); - if (k == kh_end(aux)) { - int ret; - char *p = strdup(lib); - lib_aux_t *q; - k = kh_put(lib, aux, p, &ret); - q = &kh_val(aux, k); - q->left = kh_init(best); - q->rght = kh_init(best); - q->n_checked = q->n_removed = 0; - return q; - } else return &kh_val(aux, k); -} - -static inline int sum_qual(const bam1_t *b) -{ - int i, q; - uint8_t *qual = bam1_qual(b); - for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; - return q; -} - -static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) -{ - elem_t *p = kl_pushp(q, queue); - p->discarded = 0; - p->endpos = endpos; p->score = score; - if (p->b == 0) p->b = bam_init1(); - bam_copy1(p->b, b); - return p; -} - -static void clear_besthash(besthash_t *h, int32_t pos) -{ - khint_t k; - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) - kh_del(best, h, k); -} - -static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) -{ - if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { - khint_t k; - while (1) { - elem_t *q; - if (queue->head == queue->tail) break; - q = &kl_val(queue->head); - if (q->discarded) { - q->b->data_len = 0; - kl_shift(q, queue, 0); - continue; - } - if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; - samwrite(out, q->b); - q->b->data_len = 0; - kl_shift(q, queue, 0); - } - for (k = kh_begin(h); k != kh_end(h); ++k) { - if (kh_exist(h, k)) { - clear_besthash(kh_val(h, k).left, pos); - clear_besthash(kh_val(h, k).rght, pos); - } - } - } -} - -void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) -{ - bam1_t *b; - queue_t *queue; - khint_t k; - int last_tid = -2; - khash_t(lib) *aux; - - aux = kh_init(lib); - b = bam_init1(); - queue = kl_init(q); - while (samread(in, b) >= 0) { - bam1_core_t *c = &b->core; - int endpos = bam_calend(c, bam1_cigar(b)); - int score = sum_qual(b); - - if (last_tid != c->tid) { - if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); - last_tid = c->tid; - } else dump_alignment(out, queue, c->pos, aux); - if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { - push_queue(queue, b, endpos, score); - } else { - const char *lib; - lib_aux_t *q; - besthash_t *h; - uint32_t key; - int ret; - lib = bam_get_library(in->header, b); - q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); - ++q->n_checked; - h = (c->flag&BAM_FREVERSE)? q->rght : q->left; - key = (c->flag&BAM_FREVERSE)? endpos : c->pos; - k = kh_put(best, h, key, &ret); - if (ret == 0) { // in the hash table - elem_t *p = kh_val(h, k); - ++q->n_removed; - if (p->score < score) { - if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue - p->discarded = 1; - kh_val(h, k) = push_queue(queue, b, endpos, score); - } else { // replace - p->score = score; p->endpos = endpos; - bam_copy1(p->b, b); - } - } // otherwise, discard the alignment - } else kh_val(h, k) = push_queue(queue, b, endpos, score); - } - } - dump_alignment(out, queue, MAX_POS, aux); - - for (k = kh_begin(aux); k != kh_end(aux); ++k) { - if (kh_exist(aux, k)) { - lib_aux_t *q = &kh_val(aux, k); - fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, - (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); - kh_destroy(best, q->left); kh_destroy(best, q->rght); - free((char*)kh_key(aux, k)); - } - } - kh_destroy(lib, aux); - bam_destroy1(b); - kl_destroy(q, queue); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_sort.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_sort.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_sort.c 2016-02-14 18:21:17.468079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_sort.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,438 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "bam.h" -#include "ksort.h" - -static int g_is_by_qname = 0; - -static inline int strnum_cmp(const char *a, const char *b) -{ - char *pa, *pb; - pa = (char*)a; pb = (char*)b; - while (*pa && *pb) { - if (isdigit(*pa) && isdigit(*pb)) { - long ai, bi; - ai = strtol(pa, &pa, 10); - bi = strtol(pb, &pb, 10); - if (ai != bi) return aibi? 1 : 0; - } else { - if (*pa != *pb) break; - ++pa; ++pb; - } - } - if (*pa == *pb) - return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; - return *pa<*pb? -1 : *pa>*pb? 1 : 0; -} - -#define HEAP_EMPTY 0xffffffffffffffffull - -typedef struct { - int i; - uint64_t pos, idx; - bam1_t *b; -} heap1_t; - -#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) - -static inline int heap_lt(const heap1_t a, const heap1_t b) -{ - if (g_is_by_qname) { - int t; - if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; - t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); - return (t > 0 || (t == 0 && __pos_cmp(a, b))); - } else return __pos_cmp(a, b); -} - -KSORT_INIT(heap, heap1_t, heap_lt) - -static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) -{ - bam_header_t t; - t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets; - t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name; - t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len; -} - -static void swap_header_text(bam_header_t *h1, bam_header_t *h2) -{ - int tempi; - char *temps; - tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi; - temps = h1->text, h1->text = h2->text, h2->text = temps; -} - -#define MERGE_RG 1 -#define MERGE_UNCOMP 2 -#define MERGE_LEVEL1 4 -#define MERGE_FORCE 8 - -/*! - @abstract Merge multiple sorted BAM. - @param is_by_qname whether to sort by query name - @param out output BAM file name - @param headers name of SAM file from which to copy '@' header lines, - or NULL to copy them from the first file to be merged - @param n number of files to be merged - @param fn names of files to be merged - - @discussion Padding information may NOT correctly maintained. This - function is NOT thread safe. - */ -int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, - int flag, const char *reg) -{ - bamFile fpout, *fp; - heap1_t *heap; - bam_header_t *hout = 0; - bam_header_t *hheaders = NULL; - int i, j, *RG_len = 0; - uint64_t idx = 0; - char **RG = 0; - bam_iter_t *iter = 0; - - if (headers) { - tamFile fpheaders = sam_open(headers); - if (fpheaders == 0) { - const char *message = strerror(errno); - fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); - return -1; - } - hheaders = sam_header_read(fpheaders); - sam_close(fpheaders); - } - - g_is_by_qname = by_qname; - fp = (bamFile*)calloc(n, sizeof(bamFile)); - heap = (heap1_t*)calloc(n, sizeof(heap1_t)); - iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); - // prepare RG tag - if (flag & MERGE_RG) { - RG = (char**)calloc(n, sizeof(void*)); - RG_len = (int*)calloc(n, sizeof(int)); - for (i = 0; i != n; ++i) { - int l = strlen(fn[i]); - const char *s = fn[i]; - if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; - for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; - ++j; l -= j; - RG[i] = calloc(l + 1, 1); - RG_len[i] = l; - strncpy(RG[i], s + j, l); - } - } - // read the first - for (i = 0; i != n; ++i) { - bam_header_t *hin; - fp[i] = bam_open(fn[i], "r"); - if (fp[i] == 0) { - int j; - fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); - for (j = 0; j < i; ++j) bam_close(fp[j]); - free(fp); free(heap); - // FIXME: possible memory leak - return -1; - } - hin = bam_header_read(fp[i]); - if (i == 0) { // the first BAM - hout = hin; - } else { // validate multiple baf - int min_n_targets = hout->n_targets; - if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; - - for (j = 0; j < min_n_targets; ++j) - if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { - fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", - hout->target_name[j], hin->target_name[j], fn[i]); - return -1; - } - - // If this input file has additional target reference sequences, - // add them to the headers to be output - if (hin->n_targets > hout->n_targets) { - swap_header_targets(hout, hin); - // FIXME Possibly we should also create @SQ text headers - // for the newly added reference sequences - } - - bam_header_destroy(hin); - } - } - - if (hheaders) { - // If the text headers to be swapped in include any @SQ headers, - // check that they are consistent with the existing binary list - // of reference information. - if (hheaders->n_targets > 0) { - if (hout->n_targets != hheaders->n_targets) { - fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); - if (!reg) return -1; - } - for (j = 0; j < hout->n_targets; ++j) - if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { - fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); - if (!reg) return -1; - } - } - - swap_header_text(hout, hheaders); - bam_header_destroy(hheaders); - } - - if (reg) { - int tid, beg, end; - if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { - fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); - return -1; - } - for (i = 0; i < n; ++i) { - bam_index_t *idx; - idx = bam_index_load(fn[i]); - iter[i] = bam_iter_query(idx, tid, beg, end); - bam_index_destroy(idx); - } - } - - for (i = 0; i < n; ++i) { - heap1_t *h = heap + i; - h->i = i; - h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { - h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); - h->idx = idx++; - } - else h->pos = HEAP_EMPTY; - } - if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); - else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - if (fpout == 0) { - fprintf(stderr, "[%s] fail to create the output file.\n", __func__); - return -1; - } - bam_header_write(fpout, hout); - bam_header_destroy(hout); - - ks_heapmake(heap, n, heap); - while (heap->pos != HEAP_EMPTY) { - bam1_t *b = heap->b; - if (flag & MERGE_RG) { - uint8_t *rg = bam_aux_get(b, "RG"); - if (rg) bam_aux_del(b, rg); - bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); - } - bam_write1_core(fpout, &b->core, b->data_len, b->data); - if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { - heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); - heap->idx = idx++; - } else if (j == -1) { - heap->pos = HEAP_EMPTY; - free(heap->b->data); free(heap->b); - heap->b = 0; - } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); - ks_heapadjust(heap, 0, n, heap); - } - - if (flag & MERGE_RG) { - for (i = 0; i != n; ++i) free(RG[i]); - free(RG); free(RG_len); - } - for (i = 0; i != n; ++i) { - bam_iter_destroy(iter[i]); - bam_close(fp[i]); - } - bam_close(fpout); - free(fp); free(heap); free(iter); - return 0; -} - -int bam_merge(int argc, char *argv[]) -{ - int c, is_by_qname = 0, flag = 0, ret = 0; - char *fn_headers = NULL, *reg = 0; - - while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { - switch (c) { - case 'r': flag |= MERGE_RG; break; - case 'f': flag |= MERGE_FORCE; break; - case 'h': fn_headers = strdup(optarg); break; - case 'n': is_by_qname = 1; break; - case '1': flag |= MERGE_LEVEL1; break; - case 'u': flag |= MERGE_UNCOMP; break; - case 'R': reg = strdup(optarg); break; - } - } - if (optind + 2 >= argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] [...]\n\n"); - fprintf(stderr, "Options: -n sort by read names\n"); - fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); - fprintf(stderr, " -u uncompressed BAM output\n"); - fprintf(stderr, " -f overwrite the output BAM if exist\n"); - fprintf(stderr, " -1 compress level 1\n"); - fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); - fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); - fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); - fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); - fprintf(stderr, " the header dictionary in merging.\n\n"); - return 1; - } - if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { - FILE *fp = fopen(argv[optind], "rb"); - if (fp != NULL) { - fclose(fp); - fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); - return 1; - } - } - if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; - free(reg); - free(fn_headers); - return ret; -} - -typedef bam1_t *bam1_p; - -static inline int bam1_lt(const bam1_p a, const bam1_p b) -{ - if (g_is_by_qname) { - int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); - return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); - } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); -} -KSORT_INIT(sort, bam1_p, bam1_lt) - -static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) -{ - char *name, mode[3]; - int i; - bamFile fp; - ks_mergesort(sort, k, buf, 0); - name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) { - sprintf(name, "%s.%.4d.bam", prefix, n); - strcpy(mode, "w1"); - } else { - sprintf(name, "%s.bam", prefix); - strcpy(mode, "w"); - } - fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); - if (fp == 0) { - fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); - free(name); - // FIXME: possible memory leak - return; - } - free(name); - bam_header_write(fp, h); - for (i = 0; i < k; ++i) - bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); - bam_close(fp); -} - -/*! - @abstract Sort an unsorted BAM file based on the chromosome order - and the leftmost position of an alignment - - @param is_by_qname whether to sort by query name - @param fn name of the file to be sorted - @param prefix prefix of the output and the temporary files; upon - sucessess, prefix.bam will be written. - @param max_mem approxiate maximum memory (very inaccurate) - - @discussion It may create multiple temporary subalignment files - and then merge them by calling bam_merge_core(). This function is - NOT thread safe. - */ -void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) -{ - int n, ret, k, i; - size_t mem; - bam_header_t *header; - bamFile fp; - bam1_t *b, **buf; - - g_is_by_qname = is_by_qname; - n = k = 0; mem = 0; - fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); - if (fp == 0) { - fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); - return; - } - header = bam_header_read(fp); - buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); - // write sub files - for (;;) { - if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); - b = buf[k]; - if ((ret = bam_read1(fp, b)) < 0) break; - mem += ret; - ++k; - if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header, 0); - mem = 0; k = 0; - } - } - if (ret != -1) - fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); - if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); - else { // then merge - char **fns, *fnout; - fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header, 0); - fnout = (char*)calloc(strlen(prefix) + 20, 1); - if (is_stdout) sprintf(fnout, "-"); - else sprintf(fnout, "%s.bam", prefix); - fns = (char**)calloc(n, sizeof(char*)); - for (i = 0; i < n; ++i) { - fns[i] = (char*)calloc(strlen(prefix) + 20, 1); - sprintf(fns[i], "%s.%.4d.bam", prefix, i); - } - bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); - free(fnout); - for (i = 0; i < n; ++i) { - unlink(fns[i]); - free(fns[i]); - } - free(fns); - } - for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { - if (buf[k]) { - free(buf[k]->data); - free(buf[k]); - } - } - free(buf); - bam_header_destroy(header); - bam_close(fp); -} - -void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) -{ - bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0); -} - -int bam_sort(int argc, char *argv[]) -{ - size_t max_mem = 500000000; - int c, is_by_qname = 0, is_stdout = 0; - while ((c = getopt(argc, argv, "nom:")) >= 0) { - switch (c) { - case 'o': is_stdout = 1; break; - case 'n': is_by_qname = 1; break; - case 'm': max_mem = atol(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: samtools sort [-on] [-m ] \n"); - return 1; - } - bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_stat.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_stat.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_stat.c 2016-02-14 18:21:17.469079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_stat.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,77 +0,0 @@ -#include -#include -#include "bam.h" - -typedef struct { - long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; - long long n_sgltn[2], n_read1[2], n_read2[2]; - long long n_dup[2]; - long long n_diffchr[2], n_diffhigh[2]; -} bam_flagstat_t; - -#define flagstat_loop(s, c) do { \ - int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ - ++(s)->n_reads[w]; \ - if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all[w]; \ - if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ - if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map[w]; \ - if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr[w]; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ - } \ - } \ - } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ - } while (0) - -bam_flagstat_t *bam_flagstat_core(bamFile fp) -{ - bam_flagstat_t *s; - bam1_t *b; - bam1_core_t *c; - int ret; - s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); - b = bam_init1(); - c = &b->core; - while ((ret = bam_read1(fp, b)) >= 0) - flagstat_loop(s, c); - bam_destroy1(b); - if (ret != -1) - fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); - return s; -} -int bam_flagstat(int argc, char *argv[]) -{ - bamFile fp; - bam_header_t *header; - bam_flagstat_t *s; - if (argc == optind) { - fprintf(stderr, "Usage: samtools flagstat \n"); - return 1; - } - fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); - header = bam_header_read(fp); - s = bam_flagstat_core(fp); - printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); - printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); - printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); - printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); - printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); - printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); - printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); - printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); - printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); - printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); - printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); - free(s); - bam_header_destroy(header); - bam_close(fp); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bamtk.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bamtk.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bamtk.c 2016-02-14 18:21:17.471079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bamtk.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,109 +0,0 @@ -#include -#include -#include -#include -#include "bam.h" - -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -int bam_taf2baf(int argc, char *argv[]); -int bam_mpileup(int argc, char *argv[]); -int bam_merge(int argc, char *argv[]); -int bam_index(int argc, char *argv[]); -int bam_sort(int argc, char *argv[]); -int bam_tview_main(int argc, char *argv[]); -int bam_mating(int argc, char *argv[]); -int bam_rmdup(int argc, char *argv[]); -int bam_flagstat(int argc, char *argv[]); -int bam_fillmd(int argc, char *argv[]); -int bam_idxstats(int argc, char *argv[]); -int main_samview(int argc, char *argv[]); -int main_import(int argc, char *argv[]); -int main_reheader(int argc, char *argv[]); -int main_cut_target(int argc, char *argv[]); -int main_phase(int argc, char *argv[]); -int main_cat(int argc, char *argv[]); -int main_depth(int argc, char *argv[]); -int main_bam2fq(int argc, char *argv[]); - -int faidx_main(int argc, char *argv[]); - -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); - fprintf(stderr, "Version: %s\n\n", BAM_VERSION); - fprintf(stderr, "Usage: samtools [options]\n\n"); - fprintf(stderr, "Command: view SAM<->BAM conversion\n"); - fprintf(stderr, " sort sort alignment file\n"); - fprintf(stderr, " mpileup multi-way pileup\n"); - fprintf(stderr, " depth compute the depth\n"); - fprintf(stderr, " faidx index/extract FASTA\n"); -#if _CURSES_LIB != 0 - fprintf(stderr, " tview text alignment viewer\n"); -#endif - fprintf(stderr, " index index alignment\n"); - fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); - fprintf(stderr, " fixmate fix mate information\n"); - fprintf(stderr, " flagstat simple stats\n"); - fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); - fprintf(stderr, " merge merge sorted alignments\n"); - fprintf(stderr, " rmdup remove PCR duplicates\n"); - fprintf(stderr, " reheader replace BAM header\n"); - fprintf(stderr, " cat concatenate BAMs\n"); - fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); - fprintf(stderr, " phase phase heterozygotes\n"); - fprintf(stderr, "\n"); -#ifdef _WIN32 - fprintf(stderr, "\ -Note: The Windows version of SAMtools is mainly designed for read-only\n\ - operations, such as viewing the alignments and generating the pileup.\n\ - Binary files generated by the Windows version may be buggy.\n\n"); -#endif - return 1; -} - -int main(int argc, char *argv[]) -{ -#ifdef _WIN32 - setmode(fileno(stdout), O_BINARY); - setmode(fileno(stdin), O_BINARY); -#ifdef _USE_KNETFILE - knet_win32_init(); -#endif -#endif - if (argc < 2) return usage(); - if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); - else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); - else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); - else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); - else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); - else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1); - else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); - else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); - else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); - else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); - else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); - else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); - else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); - else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); - else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); - else if (strcmp(argv[1], "pileup") == 0) { - fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); - return 1; - } -#if _CURSES_LIB != 0 - else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); -#endif - else { - fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); - return 1; - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_tview.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_tview.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bam_tview.c 2016-02-14 18:21:17.470079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bam_tview.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,440 +0,0 @@ -#undef _HAVE_CURSES - -#if _CURSES_LIB == 0 -#elif _CURSES_LIB == 1 -#include -#ifndef NCURSES_VERSION -#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled" -#else -#define _HAVE_CURSES -#endif -#elif _CURSES_LIB == 2 -#include -#define _HAVE_CURSES -#else -#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled" -#endif - -#ifdef _HAVE_CURSES -#include -#include -#include -#include -#include "bam.h" -#include "faidx.h" -#include "bam2bcf.h" - -char bam_aux_getCEi(bam1_t *b, int i); -char bam_aux_getCSi(bam1_t *b, int i); -char bam_aux_getCQi(bam1_t *b, int i); - -#define TV_MIN_ALNROW 2 -#define TV_MAX_GOTO 40 -#define TV_LOW_MAPQ 10 - -#define TV_COLOR_MAPQ 0 -#define TV_COLOR_BASEQ 1 -#define TV_COLOR_NUCL 2 -#define TV_COLOR_COL 3 -#define TV_COLOR_COLQ 4 - -#define TV_BASE_NUCL 0 -#define TV_BASE_COLOR_SPACE 1 - -typedef struct { - int mrow, mcol; - WINDOW *wgoto, *whelp; - - bam_index_t *idx; - bam_lplbuf_t *lplbuf; - bam_header_t *header; - bamFile fp; - int curr_tid, left_pos; - faidx_t *fai; - bcf_callaux_t *bca; - - int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; - char *ref; -} tview_t; - -int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) -{ - extern unsigned char bam_nt16_table[256]; - tview_t *tv = (tview_t*)data; - int i, j, c, rb, attr, max_ins = 0; - uint32_t call = 0; - if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen - // print referece - rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; - for (i = tv->last_pos + 1; i < pos; ++i) { - if (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1); - c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; - mvaddch(1, tv->ccol++, c); - } - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); - { // call consensus - bcf_callret1_t bcr; - int qsum[4], a1, a2, tmp; - double p[3], prior = 30; - bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr); - for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; - for (i = 1; i < 4; ++i) // insertion sort - for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) - tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; - a1 = qsum[0]&3; a2 = qsum[1]&3; - p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; - if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; - if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; - if (p[0] < p[1] && p[0] < p[2]) call = (1<>16&0xf]; - i = (call&0xffff)/10+1; - if (i > 4) i = 4; - attr |= COLOR_PAIR(i); - if (c == toupper(rb)) c = '.'; - attron(attr); - mvaddch(2, tv->ccol, c); - attroff(attr); - if(tv->ins) { - // calculate maximum insert - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; - } - } - // core loop - for (j = 0; j <= max_ins; ++j) { - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - int row = TV_MIN_ALNROW + p->level - tv->row_shift; - if (j == 0) { - if (!p->is_del) { - if (tv->base_for == TV_BASE_COLOR_SPACE && - (c = bam_aux_getCSi(p->b, p->qpos))) { - c = bam_aux_getCSi(p->b, p->qpos); - // assume that if we found one color, we will be able to get the color error - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.'; - } else { - if (tv->show_name) { - char *name = bam1_qname(p->b); - c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos]; - } else { - c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; - } - } - } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'; - } else { // padding - if (j > p->indel) c = '*'; - else { // insertion - if (tv->base_for == TV_BASE_NUCL) { - if (tv->show_name) { - char *name = bam1_qname(p->b); - c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j]; - } else { - c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; - if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; - } - } else { - c = bam_aux_getCSi(p->b, p->qpos + j); - if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? ',' : '.'; - } - } - } - if (row > TV_MIN_ALNROW && row < tv->mrow) { - int x; - attr = 0; - if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) - || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE; - if (tv->color_for == TV_COLOR_BASEQ) { - x = bam1_qual(p->b)[p->qpos]/10 + 1; - if (x > 4) x = 4; - attr |= COLOR_PAIR(x); - } else if (tv->color_for == TV_COLOR_MAPQ) { - x = p->b->core.qual/10 + 1; - if (x > 4) x = 4; - attr |= COLOR_PAIR(x); - } else if (tv->color_for == TV_COLOR_NUCL) { - x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; - attr |= COLOR_PAIR(x); - } else if(tv->color_for == TV_COLOR_COL) { - x = 0; - switch(bam_aux_getCSi(p->b, p->qpos)) { - case '0': x = 0; break; - case '1': x = 1; break; - case '2': x = 2; break; - case '3': x = 3; break; - case '4': x = 4; break; - default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; - } - x+=5; - attr |= COLOR_PAIR(x); - } else if(tv->color_for == TV_COLOR_COLQ) { - x = bam_aux_getCQi(p->b, p->qpos); - if(0 == x) x = bam1_qual(p->b)[p->qpos]; - x = x/10 + 1; - if (x > 4) x = 4; - attr |= COLOR_PAIR(x); - } - attron(attr); - mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); - attroff(attr); - } - } - c = j? '*' : rb; - if (c == '*') { - attr = COLOR_PAIR(8); - attron(attr); - mvaddch(1, tv->ccol++, c); - attroff(attr); - } else mvaddch(1, tv->ccol++, c); - } - tv->last_pos = pos; - return 0; -} - -tview_t *tv_init(const char *fn, const char *fn_fa) -{ - tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); - tv->is_dot = 1; - tv->fp = bam_open(fn, "r"); - bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); - assert(tv->fp); - tv->header = bam_header_read(tv->fp); - tv->idx = bam_index_load(fn); - if (tv->idx == 0) exit(1); - tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); - if (fn_fa) tv->fai = fai_load(fn_fa); - tv->bca = bcf_call_init(0.83, 13); - tv->ins = 1; - - initscr(); - keypad(stdscr, TRUE); - clear(); - noecho(); - cbreak(); - tv->mrow = 24; tv->mcol = 80; - getmaxyx(stdscr, tv->mrow, tv->mcol); - tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); - tv->whelp = newwin(29, 40, 5, 5); - tv->color_for = TV_COLOR_MAPQ; - start_color(); - init_pair(1, COLOR_BLUE, COLOR_BLACK); - init_pair(2, COLOR_GREEN, COLOR_BLACK); - init_pair(3, COLOR_YELLOW, COLOR_BLACK); - init_pair(4, COLOR_WHITE, COLOR_BLACK); - init_pair(5, COLOR_GREEN, COLOR_BLACK); - init_pair(6, COLOR_CYAN, COLOR_BLACK); - init_pair(7, COLOR_YELLOW, COLOR_BLACK); - init_pair(8, COLOR_RED, COLOR_BLACK); - init_pair(9, COLOR_BLUE, COLOR_BLACK); - return tv; -} - -void tv_destroy(tview_t *tv) -{ - delwin(tv->wgoto); delwin(tv->whelp); - endwin(); - - bam_lplbuf_destroy(tv->lplbuf); - bcf_call_destroy(tv->bca); - bam_index_destroy(tv->idx); - if (tv->fai) fai_destroy(tv->fai); - free(tv->ref); - bam_header_destroy(tv->header); - bam_close(tv->fp); - free(tv); -} - -int tv_fetch_func(const bam1_t *b, void *data) -{ - tview_t *tv = (tview_t*)data; - if (tv->no_skip) { - uint32_t *cigar = bam1_cigar(b); // this is cheating... - int i; - for (i = 0; i core.n_cigar; ++i) { - if ((cigar[i]&0xf) == BAM_CREF_SKIP) - cigar[i] = cigar[i]>>4<<4 | BAM_CDEL; - } - } - bam_lplbuf_push(b, tv->lplbuf); - return 0; -} - -int tv_draw_aln(tview_t *tv, int tid, int pos) -{ - // reset - clear(); - tv->curr_tid = tid; tv->left_pos = pos; - tv->last_pos = tv->left_pos - 1; - tv->ccol = 0; - // print ref and consensus - if (tv->fai) { - char *str; - if (tv->ref) free(tv->ref); - str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); - sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); - tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); - free(str); - } - // draw aln - bam_lplbuf_reset(tv->lplbuf); - bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func); - bam_lplbuf_push(0, tv->lplbuf); - - while (tv->ccol < tv->mcol) { - int pos = tv->last_pos + 1; - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); - mvaddch(1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'); - ++tv->last_pos; - } - return 0; -} - -static void tv_win_goto(tview_t *tv, int *tid, int *pos) -{ - char str[256], *p; - int i, l = 0; - wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(tv->wgoto, 1, 2, "Goto: "); - for (;;) { - int c = wgetch(tv->wgoto); - wrefresh(tv->wgoto); - if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { - --l; - } else if (c == KEY_ENTER || c == '\012' || c == '\015') { - int _tid = -1, _beg, _end; - if (str[0] == '=') { - _beg = strtol(str+1, &p, 10) - 1; - if (_beg > 0) { - *pos = _beg; - return; - } - } else { - bam_parse_region(tv->header, str, &_tid, &_beg, &_end); - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; - } - } - } else if (isgraph(c)) { - if (l < TV_MAX_GOTO) str[l++] = c; - } else if (c == '\027') l = 0; - else if (c == '\033') return; - str[l] = '\0'; - for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); - mvwprintw(tv->wgoto, 1, 8, "%s", str); - } -} - -static void tv_win_help(tview_t *tv) { - int r = 1; - WINDOW *win = tv->whelp; - wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(win, r++, 2, " -=- Help -=- "); - r++; - mvwprintw(win, r++, 2, "? This window"); - mvwprintw(win, r++, 2, "Arrows Small scroll movement"); - mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); - mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); - mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); - mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); - mvwprintw(win, r++, 2, "space Scroll one screen"); - mvwprintw(win, r++, 2, "backspace Scroll back one screen"); - mvwprintw(win, r++, 2, "g Go to specific location"); - mvwprintw(win, r++, 2, "m Color for mapping qual"); - mvwprintw(win, r++, 2, "n Color for nucleotide"); - mvwprintw(win, r++, 2, "b Color for base quality"); - mvwprintw(win, r++, 2, "c Color for cs color"); - mvwprintw(win, r++, 2, "z Color for cs qual"); - mvwprintw(win, r++, 2, ". Toggle on/off dot view"); - mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); - mvwprintw(win, r++, 2, "r Toggle on/off rd name"); - mvwprintw(win, r++, 2, "N Turn on nt view"); - mvwprintw(win, r++, 2, "C Turn on cs view"); - mvwprintw(win, r++, 2, "i Toggle on/off ins"); - mvwprintw(win, r++, 2, "q Exit"); - r++; - mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); - mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); - mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); - wrefresh(win); - wgetch(win); -} - -void tv_loop(tview_t *tv) -{ - int tid, pos; - tid = tv->curr_tid; pos = tv->left_pos; - while (1) { - int c = getch(); - switch (c) { - case '?': tv_win_help(tv); break; - case '\033': - case 'q': goto end_loop; - case '/': - case 'g': tv_win_goto(tv, &tid, &pos); break; - case 'm': tv->color_for = TV_COLOR_MAPQ; break; - case 'b': tv->color_for = TV_COLOR_BASEQ; break; - case 'n': tv->color_for = TV_COLOR_NUCL; break; - case 'c': tv->color_for = TV_COLOR_COL; break; - case 'z': tv->color_for = TV_COLOR_COLQ; break; - case 's': tv->no_skip = !tv->no_skip; break; - case 'r': tv->show_name = !tv->show_name; break; - case KEY_LEFT: - case 'h': --pos; break; - case KEY_RIGHT: - case 'l': ++pos; break; - case KEY_SLEFT: - case 'H': pos -= 20; break; - case KEY_SRIGHT: - case 'L': pos += 20; break; - case '.': tv->is_dot = !tv->is_dot; break; - case 'N': tv->base_for = TV_BASE_NUCL; break; - case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; - case 'i': tv->ins = !tv->ins; break; - case '\010': pos -= 1000; break; - case '\014': pos += 1000; break; - case ' ': pos += tv->mcol; break; - case KEY_UP: - case 'j': --tv->row_shift; break; - case KEY_DOWN: - case 'k': ++tv->row_shift; break; - case KEY_BACKSPACE: - case '\177': pos -= tv->mcol; break; - case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; - default: continue; - } - if (pos < 0) pos = 0; - if (tv->row_shift < 0) tv->row_shift = 0; - tv_draw_aln(tv, tid, pos); - } -end_loop: - return; -} - -int bam_tview_main(int argc, char *argv[]) -{ - tview_t *tv; - if (argc == 1) { - fprintf(stderr, "Usage: bamtk tview [ref.fasta]\n"); - return 1; - } - tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]); - tv_draw_aln(tv, 0, 0); - tv_loop(tv); - tv_destroy(tv); - return 0; -} -#else // #ifdef _HAVE_CURSES -#include -#warning "No curses library is available; tview is disabled." -int bam_tview_main(int argc, char *argv[]) -{ - fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n"); - return 1; -} -#endif // #ifdef _HAVE_CURSES diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf2qcall.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf2qcall.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf2qcall.c 2016-02-14 18:21:17.477079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf2qcall.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -#include -#include -#include -#include -#include "bcf.h" - -static int8_t nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -static int read_I16(bcf1_t *b, int anno[16]) -{ - char *p; - int i; - if ((p = strstr(b->info, "I16=")) == 0) return -1; - p += 4; - for (i = 0; i < 16; ++i) { - anno[i] = strtol(p, &p, 10); - if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; - ++p; - } - return 0; -} - -int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b) -{ - int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest; - char *s; - if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: can be improved - d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3]; - if (dp == 0) return -1; // depth is zero - mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499); - i0 = i; - a[0] = nt4_table[(int)b->ref[0]]; - if (a[0] > 3) return -1; // ref is not A/C/G/T - a[1] = a[2] = a[3] = -2; // -1 has a special meaning - if (b->alt[0] == 0) return -1; // no alternate allele - map[0] = map[1] = map[2] = map[3] = -2; - map[a[0]] = 0; - for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { - if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base - a[k+1] = nt4_table[(int)*s]; - if (a[k+1] >= 0) map[a[k+1]] = k+1; - else k1 = k+1; - if (s[1] == 0) break; - } - for (k = 0; k < 4; ++k) - if (map[k] < 0) map[k] = k1; - for (i = 0; i < h->n_smpl; ++i) { - int d; - uint8_t *p = b->gi[i0].data + i * b->gi[i0].len; - for (j = 0; j < b->gi[i0].len; ++j) - if (p[j]) break; - d = (int)((double)d_rest / (h->n_smpl - i) + .499); - if (d == 0) d = 1; - if (j == b->gi[i0].len) d = 0; - d_rest -= d; - for (k = j = 0; k < 4; ++k) { - for (l = k; l < 4; ++l) { - int t, x = map[k], y = map[l]; - if (x > y) t = x, x = y, y = t; // swap - g[j++] = p[y * (y+1) / 2 + x]; - } - } - printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref); - printf("\t%d\t%d\t0", d, mq); - for (j = 0; j < 10; ++j) - printf("\t%d", g[j]); - printf("\t%s\n", h->sns[i]); - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.c 2016-02-14 18:21:17.474079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,328 +0,0 @@ -#include -#include -#include -#include "kstring.h" -#include "bcf.h" - -bcf_t *bcf_open(const char *fn, const char *mode) -{ - bcf_t *b; - b = calloc(1, sizeof(bcf_t)); - if (strchr(mode, 'w')) { - b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), mode); - } else { - b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode); - } -#ifndef BCF_LITE - b->fp->owned_file = 1; -#endif - return b; -} - -int bcf_close(bcf_t *b) -{ - int ret; - if (b == 0) return 0; - ret = bgzf_close(b->fp); - free(b); - return ret; -} - -int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h) -{ - if (b == 0 || h == 0) return -1; - bgzf_write(b->fp, "BCF\4", 4); - bgzf_write(b->fp, &h->l_nm, 4); - bgzf_write(b->fp, h->name, h->l_nm); - bgzf_write(b->fp, &h->l_smpl, 4); - bgzf_write(b->fp, h->sname, h->l_smpl); - bgzf_write(b->fp, &h->l_txt, 4); - bgzf_write(b->fp, h->txt, h->l_txt); - bgzf_flush(b->fp); - return 16 + h->l_nm + h->l_smpl + h->l_txt; -} - -bcf_hdr_t *bcf_hdr_read(bcf_t *b) -{ - uint8_t magic[4]; - bcf_hdr_t *h; - if (b == 0) return 0; - h = calloc(1, sizeof(bcf_hdr_t)); - bgzf_read(b->fp, magic, 4); - bgzf_read(b->fp, &h->l_nm, 4); - h->name = malloc(h->l_nm); - bgzf_read(b->fp, h->name, h->l_nm); - bgzf_read(b->fp, &h->l_smpl, 4); - h->sname = malloc(h->l_smpl); - bgzf_read(b->fp, h->sname, h->l_smpl); - bgzf_read(b->fp, &h->l_txt, 4); - h->txt = malloc(h->l_txt); - bgzf_read(b->fp, h->txt, h->l_txt); - bcf_hdr_sync(h); - return h; -} - -void bcf_hdr_destroy(bcf_hdr_t *h) -{ - if (h == 0) return; - free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns); - free(h); -} - -static inline char **cnt_null(int l, char *str, int *_n) -{ - int n = 0; - char *p, **list; - *_n = 0; - if (l == 0 || str == 0) return 0; - for (p = str; p != str + l; ++p) - if (*p == 0) ++n; - *_n = n; - list = calloc(n, sizeof(void*)); - list[0] = str; - for (p = str, n = 1; p < str + l - 1; ++p) - if (*p == 0) list[n++] = p + 1; - return list; -} - -int bcf_hdr_sync(bcf_hdr_t *b) -{ - if (b == 0) return -1; - if (b->ns) free(b->ns); - if (b->sns) free(b->sns); - if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); - else b->ns = 0, b->n_ref = 0; - b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); - return 0; -} - -int bcf_sync(bcf1_t *b) -{ - char *p, *tmp[5]; - int i, n, n_smpl = b->n_smpl; - ks_tokaux_t aux; - // set ref, alt, flt, info, fmt - b->ref = b->alt = b->flt = b->info = b->fmt = 0; - for (p = b->str, n = 0; p < b->str + b->l_str; ++p) { - if (*p == 0 && p+1 != b->str + b->l_str) { - if (n == 5) { - ++n; - break; - } else tmp[n++] = p + 1; - } - } - if (n != 5) { - fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos); - return -1; - } - b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; - // set n_alleles - if (*b->alt == 0) b->n_alleles = 1; - else { - for (p = b->alt, n = 1; *p; ++p) - if (*p == ',') ++n; - b->n_alleles = n + 1; - } - // set n_gi and gi[i].fmt - for (p = b->fmt, n = 1; *p; ++p) - if (*p == ':') ++n; - if (n > b->m_gi) { - int old_m = b->m_gi; - b->m_gi = n; - kroundup32(b->m_gi); - b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t)); - memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); - } - b->n_gi = n; - for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux)) - b->gi[n++].fmt = bcf_str2int(p, aux.p - p); - // set gi[i].len - for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { - b->gi[i].len = 2; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { - b->gi[i].len = 1; - } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { - b->gi[i].len = 4; - } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { - b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4; - } - b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); - } - return 0; -} - -int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b) -{ - int i, l = 0; - if (b == 0) return -1; - bgzf_write(bp->fp, &b->tid, 4); - bgzf_write(bp->fp, &b->pos, 4); - bgzf_write(bp->fp, &b->qual, 4); - bgzf_write(bp->fp, &b->l_str, 4); - bgzf_write(bp->fp, b->str, b->l_str); - l = 12 + b->l_str; - for (i = 0; i < b->n_gi; ++i) { - bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); - l += b->gi[i].len * h->n_smpl; - } - return l; -} - -int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b) -{ - int i, l = 0; - if (b == 0) return -1; - if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1; - b->n_smpl = h->n_smpl; - bgzf_read(bp->fp, &b->pos, 4); - bgzf_read(bp->fp, &b->qual, 4); - bgzf_read(bp->fp, &b->l_str, 4); - if (b->l_str > b->m_str) { - b->m_str = b->l_str; - kroundup32(b->m_str); - b->str = realloc(b->str, b->m_str); - } - bgzf_read(bp->fp, b->str, b->l_str); - l = 12 + b->l_str; - if (bcf_sync(b) < 0) return -2; - for (i = 0; i < b->n_gi; ++i) { - bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); - l += b->gi[i].len * h->n_smpl; - } - return l; -} - -int bcf_destroy(bcf1_t *b) -{ - int i; - if (b == 0) return -1; - free(b->str); - for (i = 0; i < b->m_gi; ++i) - free(b->gi[i].data); - free(b->gi); - free(b); - return 0; -} - -static inline void fmt_str(const char *p, kstring_t *s) -{ - if (*p == 0) kputc('.', s); - else kputs(p, s); -} - -void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) -{ - int i, j, x; - s->l = 0; - if (h->n_ref) kputs(h->ns[b->tid], s); - else kputw(b->tid, s); - kputc('\t', s); - kputw(b->pos + 1, s); kputc('\t', s); - fmt_str(b->str, s); kputc('\t', s); - fmt_str(b->ref, s); kputc('\t', s); - fmt_str(b->alt, s); kputc('\t', s); - ksprintf(s, "%.3g", b->qual); kputc('\t', s); - fmt_str(b->flt, s); kputc('\t', s); - fmt_str(b->info, s); - if (b->fmt[0]) { - kputc('\t', s); - fmt_str(b->fmt, s); - } - x = b->n_alleles * (b->n_alleles + 1) / 2; - if (b->n_gi == 0) return; - for (j = 0; j < h->n_smpl; ++j) { - kputc('\t', s); - for (i = 0; i < b->n_gi; ++i) { - if (i) kputc(':', s); - if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - uint8_t *d = (uint8_t*)b->gi[i].data + j * x; - int k; - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', s); - kputw(d[k], s); - } - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { - kputw(((uint16_t*)b->gi[i].data)[j], s); - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { - kputw(((uint8_t*)b->gi[i].data)[j], s); - } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { - kputw(((int32_t*)b->gi[i].data)[j], s); - } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { - int y = ((uint8_t*)b->gi[i].data)[j]; - if (y>>7&1) { - kputsn("./.", 3, s); - } else { - kputc('0' + (y>>3&7), s); - kputc("/|"[y>>6&1], s); - kputc('0' + (y&7), s); - } - } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { - float *d = (float*)b->gi[i].data + j * x; - int k; - //printf("- %lx\n", d); - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', s); - ksprintf(s, "%.2f", d[k]); - } - } else kputc('.', s); // custom fields - } - } -} - -char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b) -{ - kstring_t s; - s.l = s.m = 0; s.s = 0; - bcf_fmt_core(h, b, &s); - return s.s; -} - -int bcf_append_info(bcf1_t *b, const char *info, int l) -{ - int shift = b->fmt - b->str; - int l_fmt = b->l_str - shift; - char *ori = b->str; - if (b->l_str + l > b->m_str) { // enlarge if necessary - b->m_str = b->l_str + l; - kroundup32(b->m_str); - b->str = realloc(b->str, b->m_str); - } - memmove(b->str + shift + l, b->str + shift, l_fmt); // move the FORMAT field - memcpy(b->str + shift - 1, info, l); // append to the INFO field - b->str[shift + l - 1] = '\0'; - b->fmt = b->str + shift + l; - b->l_str += l; - if (ori != b->str) bcf_sync(b); // synchronize when realloc changes the pointer - return 0; -} - -int bcf_cpy(bcf1_t *r, const bcf1_t *b) -{ - char *t1 = r->str; - bcf_ginfo_t *t2 = r->gi; - int i, t3 = r->m_str, t4 = r->m_gi; - *r = *b; - r->str = t1; r->gi = t2; r->m_str = t3; r->m_gi = t4; - if (r->m_str < b->m_str) { - r->m_str = b->m_str; - r->str = realloc(r->str, r->m_str); - } - memcpy(r->str, b->str, r->m_str); - bcf_sync(r); // calling bcf_sync() is simple but inefficient - for (i = 0; i < r->n_gi; ++i) - memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len); - return 0; -} - -int bcf_is_indel(const bcf1_t *b) -{ - char *p; - if (strlen(b->ref) > 1) return 1; - for (p = b->alt; *p; ++p) - if (*p != ',' && p[1] != ',' && p[1] != '\0') - return 1; - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.h 2016-02-14 18:21:17.475079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,190 +0,0 @@ -/* The MIT License - - Copyright (c) 2010 Broad Institute - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef BCF_H -#define BCF_H - -#define BCF_VERSION "0.1.17-dev (r973:277)" - -#include -#include - -#ifndef BCF_LITE -#include "bgzf.h" -typedef BGZF *bcfFile; -#else -typedef gzFile bcfFile; -#define bgzf_open(fn, mode) gzopen(fn, mode) -#define bgzf_fdopen(fd, mode) gzdopen(fd, mode) -#define bgzf_close(fp) gzclose(fp) -#define bgzf_read(fp, buf, len) gzread(fp, buf, len) -#define bgzf_write(fp, buf, len) -#define bgzf_flush(fp) -#endif - -/* - A member in the structs below is said to "primary" if its content - cannot be inferred from other members in any of structs below; a - member is said to be "derived" if its content can be derived from - other members. For example, bcf1_t::str is primary as this comes from - the input data, while bcf1_t::info is derived as it can always be - correctly set if we know bcf1_t::str. Derived members are for quick - access to the content and must be synchronized with the primary data. - */ - -typedef struct { - uint32_t fmt; // format of the block, set by bcf_str2int(). - int len; // length of data for each individual - void *data; // concatenated data - // derived info: fmt, len (<-bcf1_t::fmt) -} bcf_ginfo_t; - -typedef struct { - int32_t tid, pos; // refID and 0-based position - int32_t l_str, m_str; // length and the allocated size of ->str - float qual; // SNP quality - char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7) - char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation - int n_gi, m_gi; // number and the allocated size of geno fields - bcf_ginfo_t *gi; // array of geno fields - int n_alleles, n_smpl; // number of alleles and samples - // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) -} bcf1_t; - -typedef struct { - int32_t n_ref, n_smpl; // number of reference sequences and samples - int32_t l_nm; // length of concatenated sequence names; 0 padded - int32_t l_smpl; // length of concatenated sample names; 0 padded - int32_t l_txt; // length of header text (lines started with ##) - char *name, *sname, *txt; // concatenated sequence names, sample names and header text - char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively - // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname) -} bcf_hdr_t; - -typedef struct { - int is_vcf; // if the file in operation is a VCF - void *v; // auxillary data structure for VCF - bcfFile fp; // file handler for BCF -} bcf_t; - -struct __bcf_idx_t; -typedef struct __bcf_idx_t bcf_idx_t; - -#ifdef __cplusplus -extern "C" { -#endif - - // open a BCF file; for BCF file only - bcf_t *bcf_open(const char *fn, const char *mode); - // close file - int bcf_close(bcf_t *b); - // read one record from BCF; return -1 on end-of-file, and <-1 for errors - int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b); - // call this function if b->str is changed - int bcf_sync(bcf1_t *b); - // write a BCF record - int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b); - // read the BCF header; BCF only - bcf_hdr_t *bcf_hdr_read(bcf_t *b); - // write the BCF header - int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h); - // set bcf_hdr_t::ns and bcf_hdr_t::sns - int bcf_hdr_sync(bcf_hdr_t *b); - // destroy the header - void bcf_hdr_destroy(bcf_hdr_t *h); - // destroy a record - int bcf_destroy(bcf1_t *b); - // BCF->VCF conversion - char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); - // append more info - int bcf_append_info(bcf1_t *b, const char *info, int l); - // copy - int bcf_cpy(bcf1_t *r, const bcf1_t *b); - - // open a VCF or BCF file if "b" is set in "mode" - bcf_t *vcf_open(const char *fn, const char *mode); - // close a VCF/BCF file - int vcf_close(bcf_t *bp); - // read the VCF/BCF header - bcf_hdr_t *vcf_hdr_read(bcf_t *bp); - // read the sequence dictionary from a separate file; required for VCF->BCF conversion - int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); - // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors - int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); - // write the VCF header - int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h); - // write a VCF record - int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); - - // keep the first n alleles and discard the rest - int bcf_shrink_alt(bcf1_t *b, int n); - // convert GL to PL - int bcf_gl2pl(bcf1_t *b); - // if the site is an indel - int bcf_is_indel(const bcf1_t *b); - bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); - int bcf_subsam(int n_smpl, int *list, bcf1_t *b); - // move GT to the first FORMAT field - int bcf_fix_gt(bcf1_t *b); - // update PL generated by old samtools - int bcf_fix_pl(bcf1_t *b); - // convert PL to GLF-like 10-likelihood GL - int bcf_gl10(const bcf1_t *b, uint8_t *gl); - // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL - int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); - - // string hash table - void *bcf_build_refhash(bcf_hdr_t *h); - void bcf_str2id_destroy(void *_hash); - void bcf_str2id_thorough_destroy(void *_hash); - int bcf_str2id_add(void *_hash, const char *str); - int bcf_str2id(void *_hash, const char *str); - void *bcf_str2id_init(); - - // indexing related functions - int bcf_idx_build(const char *fn); - uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg); - int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); - bcf_idx_t *bcf_idx_load(const char *fn); - void bcf_idx_destroy(bcf_idx_t *idx); - -#ifdef __cplusplus -} -#endif - -static inline uint32_t bcf_str2int(const char *str, int l) -{ - int i; - uint32_t x = 0; - for (i = 0; i < l && i < 4; ++i) { - if (str[i] == 0) return x; - x = x<<8 | str[i]; - } - return x; -} - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.tex tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.tex --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcf.tex 2016-02-14 18:21:17.476079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcf.tex 1970-01-01 00:00:00.000000000 +0000 @@ -1,77 +0,0 @@ -\documentclass[10pt,pdftex]{article} -\usepackage{color} -\definecolor{gray}{rgb}{0.7,0.7,0.7} - -\setlength{\topmargin}{0.0cm} -\setlength{\textheight}{21.5cm} -\setlength{\oddsidemargin}{0cm} -\setlength{\textwidth}{16.5cm} -\setlength{\columnsep}{0.6cm} - -\begin{document} - -\begin{center} -\begin{tabular}{|l|l|l|l|l|} -\hline -\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline -\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline -\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline -\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline -\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline -\multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} -& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} -& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} -& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} -& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} -& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} -& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ -\hline -\end{tabular} -\end{center} - -\begin{center} -\begin{tabular}{clp{9cm}} -\hline -\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline -{\tt DP} & {\tt uint16\_t[n]} & Read depth \\ -{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ -{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ -{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, - the allele is not present (e.g. due to different ploidy between samples).} \\ -{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ -{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ -{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ -{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ -{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ -{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ -{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ -%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ -\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\ -\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\ -\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\ -\hline -\end{tabular} -\end{center} - -\begin{itemize} -\item A BCF file is in the {\tt BGZF} format. -\item All multi-byte numbers are little-endian. -\item In a string, a missing value `.' is an empty C string ``{\tt - \char92 0}'' (not ``{\tt .\char92 0}'') -\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the - order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt - REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt - CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original - BCF proposal). -\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields - are required to be explicitly defined in the headers. -\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. - It gives an alternative binary representation of the corresponding VCF field, in case - the default representation is unable to keep the genotype information, - for example, when the ploidy is not 2 or there are more than 8 alleles. -\end{itemize} - -\end{document} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcfutils.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcfutils.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/bcfutils.c 2016-02-14 18:21:17.477079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/bcfutils.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,390 +0,0 @@ -#include -#include -#include "bcf.h" -#include "kstring.h" -#include "khash.h" -KHASH_MAP_INIT_STR(str2id, int) - -#ifdef _WIN32 -#define srand48(x) srand(x) -#define drand48() ((double)rand() / RAND_MAX) -#endif - -// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... -void *bcf_build_refhash(bcf_hdr_t *h) -{ - khash_t(str2id) *hash; - int i, ret; - hash = kh_init(str2id); - for (i = 0; i < h->n_ref; ++i) { - khint_t k; - k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret - kh_val(hash, k) = i; - } - return hash; -} - -void *bcf_str2id_init() -{ - return kh_init(str2id); -} - -void bcf_str2id_destroy(void *_hash) -{ - khash_t(str2id) *hash = (khash_t(str2id)*)_hash; - if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. -} - -void bcf_str2id_thorough_destroy(void *_hash) -{ - khash_t(str2id) *hash = (khash_t(str2id)*)_hash; - khint_t k; - if (hash == 0) return; - for (k = 0; k < kh_end(hash); ++k) - if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); - kh_destroy(str2id, hash); -} - -int bcf_str2id(void *_hash, const char *str) -{ - khash_t(str2id) *hash = (khash_t(str2id)*)_hash; - khint_t k; - if (!hash) return -1; - k = kh_get(str2id, hash, str); - return k == kh_end(hash)? -1 : kh_val(hash, k); -} - -int bcf_str2id_add(void *_hash, const char *str) -{ - khint_t k; - int ret; - khash_t(str2id) *hash = (khash_t(str2id)*)_hash; - if (!hash) return -1; - k = kh_put(str2id, hash, str, &ret); - if (ret == 0) return kh_val(hash, k); - kh_val(hash, k) = kh_size(hash) - 1; - return kh_val(hash, k); -} - -int bcf_shrink_alt(bcf1_t *b, int n) -{ - char *p; - int i, j, k, n_smpl = b->n_smpl; - if (b->n_alleles <= n) return -1; - // update ALT - if (n > 1) { - for (p = b->alt, k = 1; *p; ++p) - if (*p == ',' && ++k == n) break; - *p = '\0'; - } else p = b->alt, *p = '\0'; - ++p; - memmove(p, b->flt, b->str + b->l_str - b->flt); - b->l_str -= b->flt - p; - // update PL - for (i = 0; i < b->n_gi; ++i) { - bcf_ginfo_t *g = b->gi + i; - if (g->fmt == bcf_str2int("PL", 2)) { - int l, x = b->n_alleles * (b->n_alleles + 1) / 2; - uint8_t *d = (uint8_t*)g->data; - g->len = n * (n + 1) / 2; - for (l = k = 0; l < n_smpl; ++l) { - uint8_t *dl = d + l * x; - for (j = 0; j < g->len; ++j) d[k++] = dl[j]; - } - } // FIXME: to add GL - } - b->n_alleles = n; - bcf_sync(b); - return 0; -} - -int bcf_gl2pl(bcf1_t *b) -{ - char *p; - int i, n_smpl = b->n_smpl; - bcf_ginfo_t *g; - float *d0; - uint8_t *d1; - if (strstr(b->fmt, "PL")) return -1; - if ((p = strstr(b->fmt, "GL")) == 0) return -1; - *p = 'P'; - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("GL", 2)) - break; - g = b->gi + i; - g->fmt = bcf_str2int("PL", 2); - g->len /= 4; // 4 == sizeof(float) - d0 = (float*)g->data; d1 = (uint8_t*)g->data; - for (i = 0; i < n_smpl * g->len; ++i) { - int x = (int)(-10. * d0[i] + .499); - if (x > 255) x = 255; - if (x < 0) x = 0; - d1[i] = x; - } - return 0; -} -/* FIXME: this function will fail given AB:GTX:GT. BCFtools never - * produces such FMT, but others may do. */ -int bcf_fix_gt(bcf1_t *b) -{ - char *s; - int i; - uint32_t tmp; - bcf_ginfo_t gt; - // check the presence of the GT FMT - if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first - if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact - tmp = bcf_str2int("GT", 2); - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == tmp) break; - if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... - gt = b->gi[i]; - // move GT to the first - for (; i > 0; --i) b->gi[i] = b->gi[i-1]; - b->gi[0] = gt; - memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); - b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; - return 0; -} - -int bcf_fix_pl(bcf1_t *b) -{ - int i; - uint32_t tmp; - uint8_t *PL, *swap; - bcf_ginfo_t *gi; - // pinpoint PL - tmp = bcf_str2int("PL", 2); - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == tmp) break; - if (i == b->n_gi) return 0; - // prepare - gi = b->gi + i; - PL = (uint8_t*)gi->data; - swap = alloca(gi->len); - // loop through individuals - for (i = 0; i < b->n_smpl; ++i) { - int k, l, x; - uint8_t *PLi = PL + i * gi->len; - memcpy(swap, PLi, gi->len); - for (k = x = 0; k < b->n_alleles; ++k) - for (l = k; l < b->n_alleles; ++l) - PLi[l*(l+1)/2 + k] = swap[x++]; - } - return 0; -} - -int bcf_smpl_covered(const bcf1_t *b) -{ - int i, j, n = 0; - uint32_t tmp; - bcf_ginfo_t *gi; - // pinpoint PL - tmp = bcf_str2int("PL", 2); - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == tmp) break; - if (i == b->n_gi) return 0; - // count how many samples having PL!=[0..0] - gi = b->gi + i; - for (i = 0; i < b->n_smpl; ++i) { - uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; - for (j = 0; j < gi->len; ++j) - if (PLi[j]) break; - if (j < gi->len) ++n; - } - return n; -} - -static void *locate_field(const bcf1_t *b, const char *fmt, int l) -{ - int i; - uint32_t tmp; - tmp = bcf_str2int(fmt, l); - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == tmp) break; - return i == b->n_gi? 0 : b->gi[i].data; -} - -int bcf_anno_max(bcf1_t *b) -{ - int k, max_gq, max_sp, n_het; - kstring_t str; - uint8_t *gt, *gq; - int32_t *sp; - max_gq = max_sp = n_het = 0; - gt = locate_field(b, "GT", 2); - if (gt == 0) return -1; - gq = locate_field(b, "GQ", 2); - sp = locate_field(b, "SP", 2); - if (sp) - for (k = 0; k < b->n_smpl; ++k) - if (gt[k]&0x3f) - max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; - if (gq) - for (k = 0; k < b->n_smpl; ++k) - if (gt[k]&0x3f) - max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; - for (k = 0; k < b->n_smpl; ++k) { - int a1, a2; - a1 = gt[k]&7; a2 = gt[k]>>3&7; - if ((!a1 && a2) || (!a2 && a1)) { // a het - if (gq == 0) ++n_het; - else if (gq[k] >= 20) ++n_het; - } - } - if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); - if (max_sp < 0) max_sp = 0; - memset(&str, 0, sizeof(kstring_t)); - if (*b->info) kputc(';', &str); - ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); - bcf_append_info(b, str.s, str.l); - free(str.s); - return 0; -} - -// FIXME: only data are shuffled; the header is NOT -int bcf_shuffle(bcf1_t *b, int seed) -{ - int i, j, *a; - if (seed > 0) srand48(seed); - a = malloc(b->n_smpl * sizeof(int)); - for (i = 0; i < b->n_smpl; ++i) a[i] = i; - for (i = b->n_smpl; i > 1; --i) { - int tmp; - j = (int)(drand48() * i); - tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; - } - for (j = 0; j < b->n_gi; ++j) { - bcf_ginfo_t *gi = b->gi + j; - uint8_t *swap, *data = (uint8_t*)gi->data; - swap = malloc(gi->len * b->n_smpl); - for (i = 0; i < b->n_smpl; ++i) - memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); - free(gi->data); - gi->data = swap; - } - free(a); - return 0; -} - -bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) -{ - int i, ret, j; - khint_t k; - bcf_hdr_t *h; - khash_t(str2id) *hash; - kstring_t s; - s.l = s.m = 0; s.s = 0; - hash = kh_init(str2id); - for (i = 0; i < h0->n_smpl; ++i) { - k = kh_put(str2id, hash, h0->sns[i], &ret); - kh_val(hash, k) = i; - } - for (i = j = 0; i < n; ++i) { - k = kh_get(str2id, hash, samples[i]); - if (k != kh_end(hash)) { - list[j++] = kh_val(hash, k); - kputs(samples[i], &s); kputc('\0', &s); - } - } - if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); - kh_destroy(str2id, hash); - h = calloc(1, sizeof(bcf_hdr_t)); - *h = *h0; - h->ns = 0; h->sns = 0; - h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); - h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); - h->l_smpl = s.l; h->sname = s.s; - bcf_hdr_sync(h); - return h; -} - -int bcf_subsam(int n_smpl, int *list, bcf1_t *b) -{ - int i, j; - for (j = 0; j < b->n_gi; ++j) { - bcf_ginfo_t *gi = b->gi + j; - uint8_t *swap; - swap = malloc(gi->len * b->n_smpl); - for (i = 0; i < n_smpl; ++i) - memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); - free(gi->data); - gi->data = swap; - } - b->n_smpl = n_smpl; - return 0; -} - -static int8_t nt4_table[128] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 -}; - -int bcf_gl10(const bcf1_t *b, uint8_t *gl) -{ - int a[4], k, l, map[4], k1, j, i; - const bcf_ginfo_t *PL; - char *s; - if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - PL = b->gi + i; - a[0] = nt4_table[(int)b->ref[0]]; - if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T - a[1] = a[2] = a[3] = -2; // -1 has a special meaning - if (b->alt[0] == 0) return -1; // no alternate allele - map[0] = map[1] = map[2] = map[3] = -2; - map[a[0]] = 0; - for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { - if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base - a[k+1] = nt4_table[(int)*s]; - if (a[k+1] >= 0) map[a[k+1]] = k+1; - else k1 = k + 1; - if (s[1] == 0) break; // the end of the ALT string - } - for (k = 0; k < 4; ++k) - if (map[k] < 0) map[k] = k1; - for (i = 0; i < b->n_smpl; ++i) { - const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual - uint8_t *g = gl + 10 * i; - for (k = j = 0; k < 4; ++k) { - for (l = k; l < 4; ++l) { - int t, x = map[k], y = map[l]; - if (x > y) t = x, x = y, y = t; // make sure x is the smaller - g[j++] = p[y * (y+1) / 2 + x]; - } - } - } - return 0; -} - -int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) -{ - int k, l, j, i; - const bcf_ginfo_t *PL; - if (b->alt[0] == 0) return -1; // no alternate allele - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - PL = b->gi + i; - for (i = 0; i < b->n_smpl; ++i) { - const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual - uint8_t *g = gl + 10 * i; - for (k = j = 0; k < 4; ++k) { - for (l = k; l < 4; ++l) { - int t, x = k, y = l; - if (x > y) t = x, x = y, y = t; // make sure x is the smaller - x = y * (y+1) / 2 + x; - g[j++] = x < PL->len? p[x] : 255; - } - } - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/call1.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/call1.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/call1.c 2016-02-14 18:21:17.479079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/call1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,586 +0,0 @@ -#include -#include -#include -#include -#include -#include "bcf.h" -#include "prob1.h" -#include "kstring.h" -#include "time.h" - -#ifdef _WIN32 -#define srand48(x) srand(x) -#define lrand48() rand() -#endif - -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 16384) - -#define VC_NO_GENO 2 -#define VC_BCFOUT 4 -#define VC_CALL 8 -#define VC_VARONLY 16 -#define VC_VCFIN 32 -#define VC_UNCOMP 64 -#define VC_KEEPALT 256 -#define VC_ACGT_ONLY 512 -#define VC_QCALL 1024 -#define VC_CALL_GT 2048 -#define VC_ADJLD 4096 -#define VC_NO_INDEL 8192 -#define VC_ANNO_MAX 16384 -#define VC_FIX_PL 32768 -#define VC_EM 0x10000 -#define VC_PAIRCALL 0x20000 -#define VC_QCNT 0x40000 - -typedef struct { - int flag, prior_type, n1, n_sub, *sublist, n_perm; - uint32_t *trio_aux; - char *prior_file, **subsam, *fn_dict; - uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; - void *bed; -} viewconf_t; - -void *bed_read(const char *fn); -void bed_destroy(void *_h); -int bed_overlap(const void *_h, const char *chr, int beg, int end); - -typedef struct { - double p[4]; - int mq, depth, is_tested, d[4]; -} anno16_t; - -static double ttest(int n1, int n2, int a[4]) -{ - extern double kf_betai(double a, double b, double x); - double t, v, u1, u2; - if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0; - u1 = (double)a[0] / n1; u2 = (double)a[2] / n2; - if (u1 <= u2) return 1.; - t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2)); - v = n1 + n2 - 2; -// printf("%d,%d,%d,%d,%lf,%lf,%lf\n", a[0], a[1], a[2], a[3], t, u1, u2); - return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t)); -} - -static int test16_core(int anno[16], anno16_t *a) -{ - extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); - double left, right; - int i; - a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; - memcpy(a->d, anno, 4 * sizeof(int)); - a->depth = anno[0] + anno[1] + anno[2] + anno[3]; - a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0); - if (a->depth == 0) return -1; - a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499); - kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]); - for (i = 1; i < 4; ++i) - a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i); - return 0; -} - -static int test16(bcf1_t *b, anno16_t *a) -{ - char *p; - int i, anno[16]; - a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; - a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.; - a->mq = a->depth = a->is_tested = 0; - if ((p = strstr(b->info, "I16=")) == 0) return -1; - p += 4; - for (i = 0; i < 16; ++i) { - errno = 0; anno[i] = strtol(p, &p, 10); - if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; - ++p; - } - return test16_core(anno, a); -} - -static void rm_info(bcf1_t *b, const char *key) -{ - char *p, *q; - if ((p = strstr(b->info, key)) == 0) return; - for (q = p; *q && *q != ';'; ++q); - if (p > b->info && *(p-1) == ';') --p; - memmove(p, q, b->l_str - (q - b->str)); - b->l_str -= q - p; - bcf_sync(b); -} - -static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) -{ - kstring_t s; - int has_I16, is_var; - double fq, r; - anno16_t a; - - has_I16 = test16(b, &a) >= 0? 1 : 0; - rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! - - memset(&s, 0, sizeof(kstring_t)); - kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); - kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); - kputs(b->info, &s); - if (b->info[0]) kputc(';', &s); - { // print EM - if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]); - if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]); - if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); - if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); - if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); - } - if (cons_llr > 0) { - ksprintf(&s, ";CLR=%d", cons_llr); - if (cons_gt > 0) - ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c", cons_gt&0xff, cons_gt>>8&0xff, cons_gt>>16&0xff, - cons_gt>>32&0xff, cons_gt>>40&0xff, cons_gt>>48&0xff); - } - if (pr == 0) { // if pr is unset, return - kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); - free(b->str); - b->m_str = s.m; b->l_str = s.l; b->str = s.s; - bcf_sync(b); - return 1; - } - - is_var = (pr->p_ref < pref); - r = is_var? pr->p_ref : pr->p_var; - -// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted! - ksprintf(&s, ";AC1=%d", pr->ac); - if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); - fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); - if (fq < -999) fq = -999; - if (fq > 999) fq = 999; - ksprintf(&s, ";FQ=%.3g", fq); - if (pr->cmp[0] >= 0.) { // two sample groups - int i, q[3]; - for (i = 1; i < 3; ++i) { - double x = pr->cmp[i] + pr->cmp[0]/2.; - q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); - if (q[i] > 255) q[i] = 255; - } - if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); - // ksprintf(&s, ";LRT3=%.3g", pr->lrt); - ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2); - } - if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); - kputc('\0', &s); - kputs(b->fmt, &s); kputc('\0', &s); - free(b->str); - b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = r < 1e-100? 999 : -4.343 * log(r); - if (b->qual > 999) b->qual = 999; - bcf_sync(b); - if (!is_var) bcf_shrink_alt(b, 1); - else if (!(flag&VC_KEEPALT)) - bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1); - if (is_var && (flag&VC_CALL_GT)) { // call individual genotype - int i, x, old_n_gi = b->n_gi; - s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; - kputs(":GT:GQ", &s); kputc('\0', &s); - b->m_str = s.m; b->l_str = s.l; b->str = s.s; - bcf_sync(b); - for (i = 0; i < b->n_smpl; ++i) { - x = bcf_p1_call_gt(pa, pr->f_exp, i); - ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0; - ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; - } - } - return is_var; -} - -static char **read_samples(const char *fn, int *_n) -{ - gzFile fp; - kstream_t *ks; - kstring_t s; - int dret, n = 0, max = 0; - char **sam = 0; - *_n = 0; - s.l = s.m = 0; s.s = 0; - fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file - ks = ks_init(fp); - while (ks_getuntil(ks, 0, &s, &dret) >= 0) { - int l; - if (max == n) { - max = max? max<<1 : 4; - sam = realloc(sam, sizeof(void*)*max); - } - l = s.l; - sam[n] = malloc(s.l + 2); - strcpy(sam[n], s.s); - sam[n][l+1] = 2; // by default, diploid - if (dret != '\n') { - if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 - int x = (int)s.s[0] - '0'; - if (x == 1 || x == 2) sam[n][l+1] = x; - else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); - } - if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); - } - ++n; - } - ks_destroy(ks); - gzclose(fp); - free(s.s); - *_n = n; - return sam; -} - -static void write_header(bcf_hdr_t *h) -{ - kstring_t str; - str.l = h->l_txt? h->l_txt - 1 : 0; - str.m = str.l + 1; str.s = h->txt; - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); -// if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - if (!strstr(str.s, "##FORMAT=\n", &str); - h->l_txt = str.l + 1; h->txt = str.s; -} - -double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); - -int bcfview(int argc, char *argv[]) -{ - extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); - extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); - extern int bcf_fix_gt(bcf1_t *b); - extern int bcf_anno_max(bcf1_t *b); - extern int bcf_shuffle(bcf1_t *b, int seed); - extern uint32_t *bcf_trio_prep(int is_x, int is_son); - extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt); - extern int bcf_pair_call(const bcf1_t *b); - extern int bcf_min_diff(const bcf1_t *b); - - bcf_t *bp, *bout = 0; - bcf1_t *b, *blast; - int c, *seeds = 0; - uint64_t n_processed = 0, qcnt[256]; - viewconf_t vc; - bcf_p1aux_t *p1 = 0; - bcf_hdr_t *hin, *hout; - int tid, begin, end; - char moder[4], modew[4]; - - tid = begin = end = -1; - memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; - memset(qcnt, 0, 8 * 256); - while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Y")) >= 0) { - switch (c) { - case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.bed = bed_read(optarg); break; - case 'D': vc.fn_dict = strdup(optarg); break; - case 'F': vc.flag |= VC_FIX_PL; break; - case 'N': vc.flag |= VC_ACGT_ONLY; break; - case 'G': vc.flag |= VC_NO_GENO; break; - case 'A': vc.flag |= VC_KEEPALT; break; - case 'b': vc.flag |= VC_BCFOUT; break; - case 'S': vc.flag |= VC_VCFIN; break; - case 'c': vc.flag |= VC_CALL; break; - case 'e': vc.flag |= VC_EM; break; - case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; - case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; - case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; - case 'I': vc.flag |= VC_NO_INDEL; break; - case 'M': vc.flag |= VC_ANNO_MAX; break; - case 'Y': vc.flag |= VC_QCNT; break; - case 't': vc.theta = atof(optarg); break; - case 'p': vc.pref = atof(optarg); break; - case 'i': vc.indel_frac = atof(optarg); break; - case 'Q': vc.flag |= VC_QCALL; break; - case 'L': vc.flag |= VC_ADJLD; break; - case 'U': vc.n_perm = atoi(optarg); break; - case 'C': vc.min_lrt = atof(optarg); break; - case 'X': vc.min_perm_p = atof(optarg); break; - case 'd': vc.min_smpl_frac = atof(optarg); break; - case 's': vc.subsam = read_samples(optarg, &vc.n_sub); - vc.ploidy = calloc(vc.n_sub + 1, 1); - for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; - tid = -1; - break; - case 'T': - if (strcmp(optarg, "trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0); - else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0); - else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1); - else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL; - else { - fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__); - return 1; - } - break; - case 'P': - if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; - else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; - else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT; - else vc.prior_file = strdup(optarg); - break; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); - fprintf(stderr, "Input/output options:\n\n"); - fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); - fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); - fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); - fprintf(stderr, " -L calculate LD for adjacent sites\n"); - fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); - fprintf(stderr, " -Q output the QCALL likelihood format\n"); - fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); - fprintf(stderr, " -S input is VCF\n"); - fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); - fprintf(stderr, "\nConsensus/variant calling options:\n\n"); - fprintf(stderr, " -c SNP calling (force -e)\n"); - fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); - fprintf(stderr, " -e likelihood based analyses\n"); - fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); - fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); - fprintf(stderr, " -I skip indels\n"); - fprintf(stderr, " -p FLOAT variant if P(ref|D)BCF conversion please specify the sequence dictionary with -D\n", __func__); - return 1; - } - if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! - if (vc.n_perm > 0) { - seeds = malloc(vc.n_perm * sizeof(int)); - srand48(time(0)); - for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); - } - b = calloc(1, sizeof(bcf1_t)); - blast = calloc(1, sizeof(bcf1_t)); - strcpy(moder, "r"); - if (!(vc.flag & VC_VCFIN)) strcat(moder, "b"); - strcpy(modew, "w"); - if (vc.flag & VC_BCFOUT) strcat(modew, "b"); - if (vc.flag & VC_UNCOMP) strcat(modew, "u"); - bp = vcf_open(argv[optind], moder); - hin = hout = vcf_hdr_read(bp); - if (vc.fn_dict && (vc.flag & VC_VCFIN)) - vcf_dictread(bp, hin, vc.fn_dict); - bout = vcf_open("-", modew); - if (!(vc.flag & VC_QCALL)) { - if (vc.n_sub) { - vc.sublist = calloc(vc.n_sub, sizeof(int)); - hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist); - } - if (vc.flag & VC_CALL) write_header(hout); - vcf_hdr_write(bout, hout); - } - if (vc.flag & VC_CALL) { - p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); - if (vc.prior_file) { - if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { - fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); - return 1; - } - } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); - if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1 - bcf_p1_set_n1(p1, vc.n1); - bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); - } - if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac - } - if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { - void *str2id = bcf_build_refhash(hout); - if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { - bcf_idx_t *idx; - idx = bcf_idx_load(argv[optind]); - if (idx) { - uint64_t off; - off = bcf_idx_query(idx, tid, begin); - if (off == 0) { - fprintf(stderr, "[%s] no records in the query region.\n", __func__); - return 1; // FIXME: a lot of memory leaks... - } - bgzf_seek(bp->fp, off, SEEK_SET); - bcf_idx_destroy(idx); - } - } - } - while (vcf_read(bp, hin, b) > 0) { - int is_indel, cons_llr = -1; - int64_t cons_gt = -1; - double em[10]; - if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; - if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { - extern int bcf_smpl_covered(const bcf1_t *b); - int n = bcf_smpl_covered(b); - if ((double)n / b->n_smpl < vc.min_smpl_frac) continue; - } - if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); - if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); - is_indel = bcf_is_indel(b); - if ((vc.flag & VC_NO_INDEL) && is_indel) continue; - if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { - int x; - if (b->ref[0] == 0 || b->ref[1] != 0) continue; - x = toupper(b->ref[0]); - if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; - } - if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; - if (tid >= 0) { - int l = strlen(b->ref); - l = b->pos + (l > 0? l : 1); - if (b->tid != tid || b->pos >= end) break; - if (!(l > begin && end > b->pos)) continue; - } - ++n_processed; - if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference - int x = bcf_min_diff(b); - if (x > 255) x = 255; - if (x >= 0) ++qcnt[x]; - } - if (vc.flag & VC_QCALL) { // output QCALL format; STOP here - bcf_2qcall(hout, b); - continue; - } - if (vc.trio_aux) // do trio calling - bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt); - else if (vc.flag & VC_PAIRCALL) - cons_llr = bcf_pair_call(b); - if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b); - if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em); - else { - int i; - for (i = 0; i < 9; ++i) em[i] = -1.; - } - if (vc.flag & VC_CALL) { // call variants - bcf_p1rst_t pr; - int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); - if (n_processed % 100000 == 0) { - fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); - bcf_p1_dump_afs(p1); - } - if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; - if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test - bcf_p1rst_t r; - int i, n = 0; - for (i = 0; i < vc.n_perm; ++i) { -#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts - double x[10]; - bcf_shuffle(b, seeds[i]); - bcf_em1(b, vc.n1, 1<<7, x); - if (x[7] < em[7]) ++n; -#else - bcf_shuffle(b, seeds[i]); - bcf_p1_cal(b, 1, p1, &r); - if (pr.p_chi2 >= r.p_chi2) ++n; -#endif - } - pr.perm_rank = n; - } - if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em, cons_llr, cons_gt); - } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt); - if (vc.flag & VC_ADJLD) { // compute LD - double f[4], r2; - if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) { - kstring_t s; - s.m = s.l = 0; s.s = 0; - if (*b->info) kputc(';', &s); - ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); - bcf_append_info(b, s.s, s.l); - free(s.s); - } - bcf_cpy(blast, b); - } - if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b); - if (vc.flag & VC_NO_GENO) { // do not output GENO fields - b->n_gi = 0; - b->fmt[0] = '\0'; - b->l_str = b->fmt - b->str + 1; - } else bcf_fix_gt(b); - vcf_write(bout, hout, b); - } - if (vc.prior_file) free(vc.prior_file); - if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); - if (hin != hout) bcf_hdr_destroy(hout); - bcf_hdr_destroy(hin); - bcf_destroy(b); bcf_destroy(blast); - vcf_close(bp); vcf_close(bout); - if (vc.fn_dict) free(vc.fn_dict); - if (vc.ploidy) free(vc.ploidy); - if (vc.trio_aux) free(vc.trio_aux); - if (vc.n_sub) { - int i; - for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); - free(vc.subsam); free(vc.sublist); - } - if (vc.bed) bed_destroy(vc.bed); - if (vc.flag & VC_QCNT) - for (c = 0; c < 256; ++c) - fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]); - if (seeds) free(seeds); - if (p1) bcf_p1_destroy(p1); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/em.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/em.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/em.c 2016-02-14 18:21:17.480079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/em.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,310 +0,0 @@ -#include -#include -#include -#include "bcf.h" -#include "kmin.h" - -static double g_q2p[256]; - -#define ITER_MAX 50 -#define ITER_TRY 10 -#define EPS 1e-5 - -extern double kf_gammaq(double, double); - -/* - Generic routines - */ -// get the 3 genotype likelihoods -static double *get_pdg3(const bcf1_t *b) -{ - double *pdg; - const uint8_t *PL = 0; - int i, PL_len = 0; - // initialize g_q2p if necessary - if (g_q2p[0] == 0.) - for (i = 0; i < 256; ++i) - g_q2p[i] = pow(10., -i / 10.); - // set PL and PL_len - for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - PL = (const uint8_t*)b->gi[i].data; - PL_len = b->gi[i].len; - break; - } - } - if (i == b->n_gi) return 0; // no PL - // fill pdg - pdg = malloc(3 * b->n_smpl * sizeof(double)); - for (i = 0; i < b->n_smpl; ++i) { - const uint8_t *pi = PL + i * PL_len; - double *p = pdg + i * 3; - p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; - } - return pdg; -} - -// estimate site allele frequency in a very naive and inaccurate way -static double est_freq(int n, const double *pdg) -{ - int i, gcnt[3], tmp1; - // get a rough estimate of the genotype frequency - gcnt[0] = gcnt[1] = gcnt[2] = 0; - for (i = 0; i < n; ++i) { - const double *p = pdg + i * 3; - if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { - int which = p[0] > p[1]? 0 : 1; - which = p[which] > p[2]? which : 2; - ++gcnt[which]; - } - } - tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; - return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; -} - -/* - Single-locus EM - */ - -typedef struct { - int beg, end; - const double *pdg; -} minaux1_t; - -static double prob1(double f, void *data) -{ - minaux1_t *a = (minaux1_t*)data; - double p = 1., l = 0., f3[3]; - int i; -// printf("brent %lg\n", f); - if (f < 0 || f > 1) return 1e300; - f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; - for (i = a->beg; i < a->end; ++i) { - const double *pdg = a->pdg + i * 3; - p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; - if (p < 1e-200) l -= log(p), p = 1.; - } - return l - log(p); -} - -// one EM iteration for allele frequency estimate -static double freq_iter(double *f, const double *_pdg, int beg, int end) -{ - double f0 = *f, f3[3], err; - int i; -// printf("em %lg\n", *f); - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - for (i = beg, f0 = 0.; i < end; ++i) { - const double *pdg = _pdg + i * 3; - f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) - / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); - } - f0 /= (end - beg) * 2; - err = fabs(f0 - *f); - *f = f0; - return err; -} - -/* The following function combines EM and Brent's method. When the signal from - * the data is strong, EM is faster but sometimes, EM may converge very slowly. - * When this happens, we switch to Brent's method. The idea is learned from - * Rasmus Nielsen. - */ -static double freqml(double f0, int beg, int end, const double *pdg) -{ - int i; - double f; - for (i = 0, f = f0; i < ITER_TRY; ++i) - if (freq_iter(&f, pdg, beg, end) < EPS) break; - if (i == ITER_TRY) { // haven't converged yet; try Brent's method - minaux1_t a; - a.beg = beg; a.end = end; a.pdg = pdg; - kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); - } - return f; -} - -// one EM iteration for genotype frequency estimate -static double g3_iter(double g[3], const double *_pdg, int beg, int end) -{ - double err, gg[3]; - int i; - gg[0] = gg[1] = gg[2] = 0.; -// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); - for (i = beg; i < end; ++i) { - double sum, tmp[3]; - const double *pdg = _pdg + i * 3; - tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; - sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); - gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; - } - err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); - err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); - g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; - return err; -} - -// perform likelihood ratio test -static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) -{ - double r; - int i; - for (i = 0, r = 1.; i < n1; ++i) { - const double *p = pdg + i * 3; - r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) - / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); - } - for (; i < n; ++i) { - const double *p = pdg + i * 3; - r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) - / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); - } - return r; -} - -// x[0]: ref frequency -// x[1..3]: alt-alt, alt-ref, ref-ref frequenc -// x[4]: HWE P-value -// x[5..6]: group1 freq, group2 freq -// x[7]: 1-degree P-value -// x[8]: 2-degree P-value -int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) -{ - double *pdg; - int i, n, n2; - if (b->n_alleles < 2) return -1; // one allele only - // initialization - if (n1 < 0 || n1 > b->n_smpl) n1 = 0; - if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required - if (flag & 0xf<<1) flag |= 0xf<<1; - n = b->n_smpl; n2 = n - n1; - pdg = get_pdg3(b); - if (pdg == 0) return -1; - for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative - { - if ((x[0] = est_freq(n, pdg)) < 0.) { - free(pdg); - return -1; // no data - } - x[0] = freqml(x[0], 0, n, pdg); - } - if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE - double *g = x + 1, f3[3], r; - f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); - f3[1] = g[1] = 2 * x[0] * (1 - x[0]); - f3[2] = g[2] = x[0] * x[0]; - for (i = 0; i < ITER_MAX; ++i) - if (g3_iter(g, pdg, 0, n) < EPS) break; - // Hardy-Weinberg equilibrium (HWE) - for (i = 0, r = 1.; i < n; ++i) { - double *p = pdg + i * 3; - r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); - } - x[4] = kf_gammaq(.5, log(r)); - } - if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency - x[5] = freqml(x[0], 0, n1, pdg); - x[6] = freqml(x[0], n1, n, pdg); - } - if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value - double f[3], f3[3][3], tmp; - f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; - for (i = 0; i < 3; ++i) - f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; - tmp = log(lk_ratio_test(n, n1, pdg, f3)); - if (tmp < 0) tmp = 0; - x[7] = kf_gammaq(.5, tmp); - } - if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value - double g[3][3], tmp; - for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); - for (i = 0; i < ITER_MAX; ++i) - if (g3_iter(g[1], pdg, 0, n1) < EPS) break; - for (i = 0; i < ITER_MAX; ++i) - if (g3_iter(g[2], pdg, n1, n) < EPS) break; - tmp = log(lk_ratio_test(n, n1, pdg, g)); - if (tmp < 0) tmp = 0; - x[8] = kf_gammaq(1., tmp); - } - // free - free(pdg); - return 0; -} - -/* - Two-locus EM (LD) - */ - -#define _G1(h, k) ((h>>1&1) + (k>>1&1)) -#define _G2(h, k) ((h&1) + (k&1)) - -// 0: the previous site; 1: the current site -static int pair_freq_iter(int n, double *pdg[2], double f[4]) -{ - double ff[4]; - int i, k, h; -// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); - memset(ff, 0, 4 * sizeof(double)); - for (i = 0; i < n; ++i) { - double *p[2], sum, tmp; - p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; - for (k = 0, sum = 0.; k < 4; ++k) - for (h = 0; h < 4; ++h) - sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; - for (k = 0; k < 4; ++k) { - tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) - + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) - + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) - + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); - ff[k] += f[k] * tmp / sum; - } - } - for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); - return 0; -} - -double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) -{ - const bcf1_t *b[2]; - int i, j, n_smpl; - double *pdg[2], flast[4], r, f0[2]; - // initialize others - if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples - n_smpl = b0->n_smpl; - b[0] = b0; b[1] = b1; - f[0] = f[1] = f[2] = f[3] = -1.; - if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only - pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); - if (pdg[0] == 0 || pdg[1] == 0) { - free(pdg[0]); free(pdg[1]); - return -1; - } - // set the initial value - f0[0] = est_freq(n_smpl, pdg[0]); - f0[1] = est_freq(n_smpl, pdg[1]); - f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; - f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); - // iteration - for (j = 0; j < ITER_MAX; ++j) { - double eps = 0; - memcpy(flast, f, 4 * sizeof(double)); - pair_freq_iter(n_smpl, pdg, f); - for (i = 0; i < 4; ++i) { - double x = fabs(f[i] - flast[i]); - if (x > eps) eps = x; - } - if (eps < EPS) break; - } - // free - free(pdg[0]); free(pdg[1]); - { // calculate r^2 - double p[2], q[2], D; - p[0] = f[0] + f[1]; q[0] = 1 - p[0]; - p[1] = f[0] + f[2]; q[1] = 1 - p[1]; - D = f[0] * f[3] - f[1] * f[2]; - r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); -// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); - if (isnan(r)) r = -1.; - } - return r; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/fet.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/fet.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/fet.c 2016-02-14 18:21:17.488079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/fet.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,112 +0,0 @@ -#include -#include - -/* This program is implemented with ideas from this web page: - * - * http://www.langsrud.com/fisher.htm - */ - -// log\binom{n}{k} -static double lbinom(int n, int k) -{ - if (k == 0 || n == k) return 0; - return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); -} - -// n11 n12 | n1_ -// n21 n22 | n2_ -//-----------+---- -// n_1 n_2 | n - -// hypergeometric distribution -static double hypergeo(int n11, int n1_, int n_1, int n) -{ - return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); -} - -typedef struct { - int n11, n1_, n_1, n; - double p; -} hgacc_t; - -// incremental version of hypergenometric distribution -static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) -{ - if (n1_ || n_1 || n) { - aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; - } else { // then only n11 changed; the rest fixed - if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { - if (n11 == aux->n11 + 1) { // incremental - aux->p *= (double)(aux->n1_ - aux->n11) / n11 - * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); - aux->n11 = n11; - return aux->p; - } - if (n11 == aux->n11 - 1) { // incremental - aux->p *= (double)aux->n11 / (aux->n1_ - n11) - * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); - aux->n11 = n11; - return aux->p; - } - } - aux->n11 = n11; - } - aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); - return aux->p; -} - -double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two) -{ - int i, j, max, min; - double p, q, left, right; - hgacc_t aux; - int n1_, n_1, n; - - n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n - max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail - min = n1_ + n_1 - n; - if (min < 0) min = 0; // min n11, for left tail - *two = *_left = *_right = 1.; - if (min == max) return 1.; // no need to do test - q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table - // left tail - p = hypergeo_acc(min, 0, 0, 0, &aux); - for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow - left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); - --i; - if (p < 1.00000001 * q) left += p; - else --i; - // right tail - p = hypergeo_acc(max, 0, 0, 0, &aux); - for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow - right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); - ++j; - if (p < 1.00000001 * q) right += p; - else ++j; - // two-tail - *two = left + right; - if (*two > 1.) *two = 1.; - // adjust left and right - if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; - else left = 1.0 - right + q; - *_left = left; *_right = right; - return q; -} - -#ifdef FET_MAIN -#include - -int main(int argc, char *argv[]) -{ - char id[1024]; - int n11, n12, n21, n22; - double left, right, twotail, prob; - - while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) { - prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); - printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22, - prob, left, right, twotail); - } - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/index.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/index.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/index.c 2016-02-14 18:21:17.498079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/index.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,335 +0,0 @@ -#include -#include -#include -#include "bam_endian.h" -#include "kstring.h" -#include "bcf.h" -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -#define TAD_LIDX_SHIFT 13 - -typedef struct { - int32_t n, m; - uint64_t *offset; -} bcf_lidx_t; - -struct __bcf_idx_t { - int32_t n; - bcf_lidx_t *index2; -}; - -/************ - * indexing * - ************/ - -static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) -{ - int i, beg, end; - beg = _beg >> TAD_LIDX_SHIFT; - end = (_end - 1) >> TAD_LIDX_SHIFT; - if (index2->m < end + 1) { - int old_m = index2->m; - index2->m = end + 1; - kroundup32(index2->m); - index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); - memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); - } - if (beg == end) { - if (index2->offset[beg] == 0) index2->offset[beg] = offset; - } else { - for (i = beg; i <= end; ++i) - if (index2->offset[i] == 0) index2->offset[i] = offset; - } - if (index2->n < end + 1) index2->n = end + 1; -} - -bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h) -{ - bcf_idx_t *idx; - int32_t last_coor, last_tid; - uint64_t last_off; - kstring_t *str; - BGZF *fp = bp->fp; - bcf1_t *b; - int ret; - - b = calloc(1, sizeof(bcf1_t)); - str = calloc(1, sizeof(kstring_t)); - idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); - idx->n = h->n_ref; - idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t)); - - last_tid = 0xffffffffu; - last_off = bgzf_tell(fp); last_coor = 0xffffffffu; - while ((ret = bcf_read(bp, h, b)) > 0) { - int end, tmp; - if (last_tid != b->tid) { // change of chromosomes - last_tid = b->tid; - } else if (last_coor > b->pos) { - fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); - free(str->s); free(str); free(idx); bcf_destroy(b); - return 0; - } - tmp = strlen(b->ref); - end = b->pos + (tmp > 0? tmp : 1); - insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); - last_off = bgzf_tell(fp); - last_coor = b->pos; - } - free(str->s); free(str); bcf_destroy(b); - return idx; -} - -void bcf_idx_destroy(bcf_idx_t *idx) -{ - int i; - if (idx == 0) return; - for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); - free(idx->index2); - free(idx); -} - -/****************** - * index file I/O * - ******************/ - -void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) -{ - int32_t i, ti_is_be; - ti_is_be = bam_is_big_endian(); - bgzf_write(fp, "BCI\4", 4); - if (ti_is_be) { - uint32_t x = idx->n; - bgzf_write(fp, bam_swap_endian_4p(&x), 4); - } else bgzf_write(fp, &idx->n, 4); - for (i = 0; i < idx->n; ++i) { - bcf_lidx_t *index2 = idx->index2 + i; - // write linear index (index2) - if (ti_is_be) { - int x = index2->n; - bgzf_write(fp, bam_swap_endian_4p(&x), 4); - } else bgzf_write(fp, &index2->n, 4); - if (ti_is_be) { // big endian - int x; - for (x = 0; (int)x < index2->n; ++x) - bam_swap_endian_8p(&index2->offset[x]); - bgzf_write(fp, index2->offset, 8 * index2->n); - for (x = 0; (int)x < index2->n; ++x) - bam_swap_endian_8p(&index2->offset[x]); - } else bgzf_write(fp, index2->offset, 8 * index2->n); - } -} - -static bcf_idx_t *bcf_idx_load_core(BGZF *fp) -{ - int i, ti_is_be; - char magic[4]; - bcf_idx_t *idx; - ti_is_be = bam_is_big_endian(); - if (fp == 0) { - fprintf(stderr, "[%s] fail to load index.\n", __func__); - return 0; - } - bgzf_read(fp, magic, 4); - if (strncmp(magic, "BCI\4", 4)) { - fprintf(stderr, "[%s] wrong magic number.\n", __func__); - return 0; - } - idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); - bgzf_read(fp, &idx->n, 4); - if (ti_is_be) bam_swap_endian_4p(&idx->n); - idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); - for (i = 0; i < idx->n; ++i) { - bcf_lidx_t *index2 = idx->index2 + i; - int j; - bgzf_read(fp, &index2->n, 4); - if (ti_is_be) bam_swap_endian_4p(&index2->n); - index2->m = index2->n; - index2->offset = (uint64_t*)calloc(index2->m, 8); - bgzf_read(fp, index2->offset, index2->n * 8); - if (ti_is_be) - for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); - } - return idx; -} - -bcf_idx_t *bcf_idx_load_local(const char *fnidx) -{ - BGZF *fp; - fp = bgzf_open(fnidx, "r"); - if (fp) { - bcf_idx_t *idx = bcf_idx_load_core(fp); - bgzf_close(fp); - return idx; - } else return 0; -} - -#ifdef _USE_KNETFILE -static void download_from_remote(const char *url) -{ - const int buf_size = 1 * 1024 * 1024; - char *fn; - FILE *fp; - uint8_t *buf; - knetFile *fp_remote; - int l; - if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; - l = strlen(url); - for (fn = (char*)url + l - 1; fn >= url; --fn) - if (*fn == '/') break; - ++fn; // fn now points to the file name - fp_remote = knet_open(url, "r"); - if (fp_remote == 0) { - fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); - return; - } - if ((fp = fopen(fn, "w")) == 0) { - fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); - knet_close(fp_remote); - return; - } - buf = (uint8_t*)calloc(buf_size, 1); - while ((l = knet_read(fp_remote, buf, buf_size)) != 0) - fwrite(buf, 1, l, fp); - free(buf); - fclose(fp); - knet_close(fp_remote); -} -#else -static void download_from_remote(const char *url) -{ - return; -} -#endif - -static char *get_local_version(const char *fn) -{ - struct stat sbuf; - char *fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcat(strcpy(fnidx, fn), ".bci"); - if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { - char *p, *url; - int l = strlen(fnidx); - for (p = fnidx + l - 1; p >= fnidx; --p) - if (*p == '/') break; - url = fnidx; fnidx = strdup(p + 1); - if (stat(fnidx, &sbuf) == 0) { - free(url); - return fnidx; - } - fprintf(stderr, "[%s] downloading the index file...\n", __func__); - download_from_remote(url); - free(url); - } - if (stat(fnidx, &sbuf) == 0) return fnidx; - free(fnidx); return 0; -} - -bcf_idx_t *bcf_idx_load(const char *fn) -{ - bcf_idx_t *idx; - char *fname = get_local_version(fn); - if (fname == 0) return 0; - idx = bcf_idx_load_local(fname); - free(fname); - return idx; -} - -int bcf_idx_build2(const char *fn, const char *_fnidx) -{ - char *fnidx; - BGZF *fpidx; - bcf_t *bp; - bcf_idx_t *idx; - bcf_hdr_t *h; - if ((bp = bcf_open(fn, "r")) == 0) { - fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); - return -1; - } - h = bcf_hdr_read(bp); - idx = bcf_idx_core(bp, h); - bcf_close(bp); - if (_fnidx == 0) { - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".bci"); - } else fnidx = strdup(_fnidx); - fpidx = bgzf_open(fnidx, "w"); - if (fpidx == 0) { - fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); - free(fnidx); - return -1; - } - bcf_idx_save(idx, fpidx); - bcf_idx_destroy(idx); - bgzf_close(fpidx); - free(fnidx); - return 0; -} - -int bcf_idx_build(const char *fn) -{ - return bcf_idx_build2(fn, 0); -} - -/******************************************** - * parse a region in the format chr:beg-end * - ********************************************/ - -int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) -{ - char *s, *p; - int i, l, k; - l = strlen(str); - p = s = (char*)malloc(l+1); - /* squeeze out "," */ - for (i = k = 0; i != l; ++i) - if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; - s[k] = 0; - for (i = 0; i != k; ++i) if (s[i] == ':') break; - s[i] = 0; - if ((*tid = bcf_str2id(str2id, s)) < 0) { - free(s); - return -1; - } - if (i == k) { /* dump the whole sequence */ - *begin = 0; *end = 1<<29; free(s); - return 0; - } - for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; - *begin = atoi(p); - if (i < k) { - p = s + i + 1; - *end = atoi(p); - } else *end = 1<<29; - if (*begin > 0) --*begin; - free(s); - if (*begin > *end) return -1; - return 0; -} - -/******************************* - * retrieve a specified region * - *******************************/ - -uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg) -{ - uint64_t min_off, *offset; - int i; - if (beg < 0) beg = 0; - offset = idx->index2[tid].offset; - for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i); - min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i]; - return min_off; -} - -int bcf_main_index(int argc, char *argv[]) -{ - if (argc == 1) { - fprintf(stderr, "Usage: bcftools index \n"); - return 1; - } - bcf_idx_build(argv[1]); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kfunc.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kfunc.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kfunc.c 2016-02-14 18:21:17.499079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kfunc.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,162 +0,0 @@ -#include - - -/* Log gamma function - * \log{\Gamma(z)} - * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 - */ -double kf_lgamma(double z) -{ - double x = 0; - x += 0.1659470187408462e-06 / (z+7); - x += 0.9934937113930748e-05 / (z+6); - x -= 0.1385710331296526 / (z+5); - x += 12.50734324009056 / (z+4); - x -= 176.6150291498386 / (z+3); - x += 771.3234287757674 / (z+2); - x -= 1259.139216722289 / (z+1); - x += 676.5203681218835 / z; - x += 0.9999999999995183; - return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); -} - -/* complementary error function - * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt - * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 - */ -double kf_erfc(double x) -{ - const double p0 = 220.2068679123761; - const double p1 = 221.2135961699311; - const double p2 = 112.0792914978709; - const double p3 = 33.912866078383; - const double p4 = 6.37396220353165; - const double p5 = .7003830644436881; - const double p6 = .03526249659989109; - const double q0 = 440.4137358247522; - const double q1 = 793.8265125199484; - const double q2 = 637.3336333788311; - const double q3 = 296.5642487796737; - const double q4 = 86.78073220294608; - const double q5 = 16.06417757920695; - const double q6 = 1.755667163182642; - const double q7 = .08838834764831844; - double expntl, z, p; - z = fabs(x) * M_SQRT2; - if (z > 37.) return x > 0.? 0. : 2.; - expntl = exp(z * z * - .5); - if (z < 10. / M_SQRT2) // for small z - p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0) - / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0); - else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); - return x > 0.? 2. * p : 2. * (1. - p); -} - -/* The following computes regularized incomplete gamma functions. - * Formulas are taken from Wiki, with additional input from Numerical - * Recipes in C (for modified Lentz's algorithm) and AS245 - * (http://lib.stat.cmu.edu/apstat/245). - * - * A good online calculator is available at: - * - * http://www.danielsoper.com/statcalc/calc23.aspx - * - * It calculates upper incomplete gamma function, which equals - * kf_gammaq(s,z)*tgamma(s). - */ - -#define KF_GAMMA_EPS 1e-14 -#define KF_TINY 1e-290 - -// regularized lower incomplete gamma function, by series expansion -static double _kf_gammap(double s, double z) -{ - double sum, x; - int k; - for (k = 1, sum = x = 1.; k < 100; ++k) { - sum += (x *= z / (s + k)); - if (x / sum < KF_GAMMA_EPS) break; - } - return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); -} -// regularized upper incomplete gamma function, by continued fraction -static double _kf_gammaq(double s, double z) -{ - int j; - double C, D, f; - f = 1. + z - s; C = f; D = 0.; - // Modified Lentz's algorithm for computing continued fraction - // See Numerical Recipes in C, 2nd edition, section 5.2 - for (j = 1; j < 100; ++j) { - double a = j * (s - j), b = (j<<1) + 1 + z - s, d; - D = b + a * D; - if (D < KF_TINY) D = KF_TINY; - C = b + a / C; - if (C < KF_TINY) C = KF_TINY; - D = 1. / D; - d = C * D; - f *= d; - if (fabs(d - 1.) < KF_GAMMA_EPS) break; - } - return exp(s * log(z) - z - kf_lgamma(s) - log(f)); -} - -double kf_gammap(double s, double z) -{ - return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z); -} - -double kf_gammaq(double s, double z) -{ - return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z); -} - -/* Regularized incomplete beta function. The method is taken from - * Numerical Recipe in C, 2nd edition, section 6.4. The following web - * page calculates the incomplete beta function, which equals - * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): - * - * http://www.danielsoper.com/statcalc/calc36.aspx - */ -static double kf_betai_aux(double a, double b, double x) -{ - double C, D, f; - int j; - if (x == 0.) return 0.; - if (x == 1.) return 1.; - f = 1.; C = f; D = 0.; - // Modified Lentz's algorithm for computing continued fraction - for (j = 1; j < 200; ++j) { - double aa, d; - int m = j>>1; - aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1)) - : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); - D = 1. + aa * D; - if (D < KF_TINY) D = KF_TINY; - C = 1. + aa / C; - if (C < KF_TINY) C = KF_TINY; - D = 1. / D; - d = C * D; - f *= d; - if (fabs(d - 1.) < KF_GAMMA_EPS) break; - } - return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; -} -double kf_betai(double a, double b, double x) -{ - return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x); -} - -#ifdef KF_MAIN -#include -int main(int argc, char *argv[]) -{ - double x = 5.5, y = 3; - double a, b; - printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x)); - printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y)); - a = 2; b = 2; x = 0.5; - printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b))); - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kmin.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kmin.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kmin.c 2016-02-14 18:21:17.500079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kmin.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,209 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Hooke-Jeeves algorithm for nonlinear minimization - - Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and - the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the - papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM - 6(6):313-314). The original algorithm was designed by Hooke and - Jeeves (ACM 8:212-229). This program is further revised according to - Johnson's implementation at Netlib (opt/hooke.c). - - Hooke-Jeeves algorithm is very simple and it works quite well on a - few examples. However, it might fail to converge due to its heuristic - nature. A possible improvement, as is suggested by Johnson, may be to - choose a small r at the beginning to quickly approach to the minimum - and a large r at later step to hit the minimum. - */ - -#include -#include -#include -#include "kmin.h" - -static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls) -{ - int k, j = *n_calls; - double ftmp; - for (k = 0; k != n; ++k) { - x1[k] += dx[k]; - ftmp = func(n, x1, data); ++j; - if (ftmp < fx1) fx1 = ftmp; - else { /* search the opposite direction */ - dx[k] = 0.0 - dx[k]; - x1[k] += dx[k] + dx[k]; - ftmp = func(n, x1, data); ++j; - if (ftmp < fx1) fx1 = ftmp; - else x1[k] -= dx[k]; /* back to the original x[k] */ - } - } - *n_calls = j; - return fx1; /* here: fx1=f(n,x1) */ -} - -double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls) -{ - double fx, fx1, *x1, *dx, radius; - int k, n_calls = 0; - x1 = (double*)calloc(n, sizeof(double)); - dx = (double*)calloc(n, sizeof(double)); - for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */ - dx[k] = fabs(x[k]) * r; - if (dx[k] == 0) dx[k] = r; - } - radius = r; - fx1 = fx = func(n, x, data); ++n_calls; - for (;;) { - memcpy(x1, x, n * sizeof(double)); /* x1 = x */ - fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls); - while (fx1 < fx) { - for (k = 0; k != n; ++k) { - double t = x[k]; - dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]); - x[k] = x1[k]; - x1[k] = x1[k] + x1[k] - t; - } - fx = fx1; - if (n_calls >= max_calls) break; - fx1 = func(n, x1, data); ++n_calls; - fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls); - if (fx1 >= fx) break; - for (k = 0; k != n; ++k) - if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break; - if (k == n) break; - } - if (radius >= eps) { - if (n_calls >= max_calls) break; - radius *= r; - for (k = 0; k != n; ++k) dx[k] *= r; - } else break; /* converge */ - } - free(x1); free(dx); - return fx1; -} - -// I copied this function somewhere several years ago with some of my modifications, but I forgot the source. -double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin) -{ - double bound, u, r, q, fu, tmp, fa, fb, fc, c; - const double gold1 = 1.6180339887; - const double gold2 = 0.3819660113; - const double tiny = 1e-20; - const int max_iter = 100; - - double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw; - int iter; - - fa = func(a, data); fb = func(b, data); - if (fb > fa) { // swap, such that f(a) > f(b) - tmp = a; a = b; b = tmp; - tmp = fa; fa = fb; fb = tmp; - } - c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation - while (fb > fc) { - bound = b + 100.0 * (c - b); // the farthest point where we want to go - r = (b - a) * (fb - fc); - q = (b - c) * (fb - fa); - if (fabs(q - r) < tiny) { // avoid 0 denominator - tmp = q > r? tiny : 0.0 - tiny; - } else tmp = q - r; - u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point - if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c - fu = func(u, data); - if (fu < fc) { // (b,u,c) bracket the minimum - a = b; b = u; fa = fb; fb = fu; - break; - } else if (fu > fb) { // (a,b,u) bracket the minimum - c = u; fc = fu; - break; - } - u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation - } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound - fu = func(u, data); - if (fu < fc) { // fb > fc > fu - b = c; c = u; u = c + gold1 * (c - b); - fb = fc; fc = fu; fu = func(u, data); - } else { // (b,c,u) bracket the minimum - a = b; b = c; c = u; - fa = fb; fb = fc; fc = fu; - break; - } - } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound - u = bound; fu = func(u, data); - } else { // u goes the other way around, use golden section extrapolation - u = c + gold1 * (c - b); fu = func(u, data); - } - a = b; b = c; c = u; - fa = fb; fb = fc; fc = fu; - } - if (a > c) u = a, a = c, c = u; // swap - - // now, afb and fb tol1) { - // related to parabolic interpolation - r = (b - w) * (fb - fv); - q = (b - v) * (fb - fw); - p = (b - v) * q - (b - w) * r; - q = 2.0 * (q - r); - if (q > 0.0) p = 0.0 - p; - else q = 0.0 - q; - eold = e; e = d; - if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) { - d = gold2 * (e = (b >= mid ? a - b : c - b)); - } else { - d = p / q; u = b + d; // actual parabolic interpolation happens here - if (u - a < tol2 || c - u < tol2) - d = (mid > b)? tol1 : 0.0 - tol1; - } - } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation - u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1); - fu = func(u, data); - if (fu <= fb) { // u is the minimum point so far - if (u >= b) a = b; - else c = b; - v = w; w = b; b = u; fv = fw; fw = fb; fb = fu; - } else { // adjust (a,c) and (u,v,w) - if (u < b) a = u; - else c = u; - if (fu <= fw || w == b) { - v = w; w = u; - fv = fw; fw = fu; - } else if (fu <= fv || v == b || v == w) { - v = u; fv = fu; - } - } - } - *xmin = b; - return fb; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kmin.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kmin.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/kmin.h 2016-02-14 18:21:17.501079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/kmin.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ -/* - Copyright (c) 2008, 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef KMIN_H -#define KMIN_H - -#define KMIN_RADIUS 0.5 -#define KMIN_EPS 1e-7 -#define KMIN_MAXCALL 50000 - -typedef double (*kmin_f)(int, double*, void*); -typedef double (*kmin1_f)(double, void*); - -#ifdef __cplusplus -extern "C" { -#endif - - double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); - double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/main.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/main.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/main.c 2016-02-14 18:21:17.502079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/main.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,190 +0,0 @@ -#include -#include -#include -#include -#include "bcf.h" - -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 0x10000) - -int bcfview(int argc, char *argv[]); -int bcf_main_index(int argc, char *argv[]); - -#define BUF_SIZE 0x10000 - -int bcf_cat(int n, char * const *fn) -{ - int i; - bcf_t *out; - uint8_t *buf; - buf = malloc(BUF_SIZE); - out = bcf_open("-", "w"); - for (i = 0; i < n; ++i) { - bcf_t *in; - bcf_hdr_t *h; - off_t end; - struct stat s; - in = bcf_open(fn[i], "r"); - h = bcf_hdr_read(in); - if (i == 0) bcf_hdr_write(out, h); - bcf_hdr_destroy(h); -#ifdef _USE_KNETFILE - fstat(knet_fileno(in->fp->x.fpr), &s); - end = s.st_size - 28; - while (knet_tell(in->fp->x.fpr) < end) { - int size = knet_tell(in->fp->x.fpr) + BUF_SIZE < end? BUF_SIZE : end - knet_tell(in->fp->x.fpr); - knet_read(in->fp->x.fpr, buf, size); - fwrite(buf, 1, size, out->fp->x.fpw); - } -#else - abort(); // FIXME: not implemented -#endif - bcf_close(in); - } - bcf_close(out); - free(buf); - return 0; -} - -extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); - -int bcf_main_ldpair(int argc, char *argv[]) -{ - bcf_t *fp; - bcf_hdr_t *h; - bcf1_t *b0, *b1; - bcf_idx_t *idx; - kstring_t str; - void *str2id; - gzFile fplist; - kstream_t *ks; - int dret, lineno = 0; - if (argc < 3) { - fprintf(stderr, "Usage: bcftools ldpair \n"); - return 1; - } - fplist = gzopen(argv[2], "rb"); - ks = ks_init(fplist); - memset(&str, 0, sizeof(kstring_t)); - fp = bcf_open(argv[1], "rb"); - h = bcf_hdr_read(fp); - str2id = bcf_build_refhash(h); - idx = bcf_idx_load(argv[1]); - if (idx == 0) { - fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__); - return 1; - } - b0 = calloc(1, sizeof(bcf1_t)); - b1 = calloc(1, sizeof(bcf1_t)); - while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { - char *p, *q; - int k; - int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; - ++lineno; - for (p = q = str.s, k = 0; *p; ++p) { - if (*p == ' ' || *p == '\t') { - *p = '\0'; - if (k == 0) tid0 = bcf_str2id(str2id, q); - else if (k == 1) pos0 = atoi(q) - 1; - else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; - else if (k == 3) pos1 = atoi(q) - 1; - q = p + 1; - ++k; - } - } - if (k == 3) pos1 = atoi(q) - 1; - if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { - uint64_t off; - double r, f[4]; - off = bcf_idx_query(idx, tid0, pos0); - bgzf_seek(fp->fp, off, SEEK_SET); - while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); - off = bcf_idx_query(idx, tid1, pos1); - bgzf_seek(fp->fp, off, SEEK_SET); - while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); - r = bcf_pair_freq(b0, b1, f); - r *= r; - printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, - r, f[0], f[1], f[2], f[3]); - } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); - } - bcf_destroy(b0); bcf_destroy(b1); - bcf_idx_destroy(idx); - bcf_str2id_destroy(str2id); - bcf_hdr_destroy(h); - bcf_close(fp); - free(str.s); - ks_destroy(ks); - gzclose(fplist); - return 0; -} - -int bcf_main_ld(int argc, char *argv[]) -{ - bcf_t *fp; - bcf_hdr_t *h; - bcf1_t **b, *b0; - int i, j, m, n; - double f[4]; - if (argc == 1) { - fprintf(stderr, "Usage: bcftools ld \n"); - return 1; - } - fp = bcf_open(argv[1], "rb"); - h = bcf_hdr_read(fp); - // read the entire BCF - m = n = 0; b = 0; - b0 = calloc(1, sizeof(bcf1_t)); - while (bcf_read(fp, h, b0) >= 0) { - if (m == n) { - m = m? m<<1 : 16; - b = realloc(b, sizeof(void*) * m); - } - b[n] = calloc(1, sizeof(bcf1_t)); - bcf_cpy(b[n++], b0); - } - bcf_destroy(b0); - // compute pair-wise r^2 - printf("%d\n", n); // the number of loci - for (i = 0; i < n; ++i) { - printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); - for (j = 0; j < i; ++j) { - double r = bcf_pair_freq(b[i], b[j], f); - printf("\t%.3f", r*r); - } - printf("\t1.000\n"); - } - // free - for (i = 0; i < n; ++i) bcf_destroy(b[i]); - free(b); - bcf_hdr_destroy(h); - bcf_close(fp); - return 0; -} - -int main(int argc, char *argv[]) -{ - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); - fprintf(stderr, "Version: %s\n\n", BCF_VERSION); - fprintf(stderr, "Usage: bcftools \n\n"); - fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); - fprintf(stderr, " index index BCF\n"); - fprintf(stderr, " cat concatenate BCFs\n"); - fprintf(stderr, " ld compute all-pair r^2\n"); - fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); - fprintf(stderr, "\n"); - return 1; - } - if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); - else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); - else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... - else { - fprintf(stderr, "[main] Unrecognized command.\n"); - return 1; - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/Makefile tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/Makefile --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/Makefile 2016-02-14 18:21:17.472079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -CC= gcc -CFLAGS= -g -Wall -O2 #-m64 #-arch ppc -DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o -OMISC= .. -AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o -PROG= bcftools -INCLUDES= -SUBDIRS= . - -.SUFFIXES:.c .o - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@ - -all-recur lib-recur clean-recur cleanlocal-recur install-recur: - @target=`echo $@ | sed s/-recur//`; \ - wdir=`pwd`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - cd $$subdir; \ - $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ - INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ - cd $$wdir; \ - done; - -all:$(PROG) - -lib:libbcf.a - -libbcf.a:$(LOBJS) - $(AR) -csru $@ $(LOBJS) - -bcftools:lib $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz - -bcf.o:bcf.h -vcf.o:bcf.h -index.o:bcf.h -bcfutils.o:bcf.h -prob1.o:prob1.h bcf.h -call1.o:prob1.h bcf.h -bcf2qcall.o:bcf.h -main.o:bcf.h - -bcf.pdf:bcf.tex - pdflatex bcf - -cleanlocal: - rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so* - -clean:cleanlocal-recur diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/mut.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/mut.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/mut.c 2016-02-14 18:21:17.502079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/mut.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,127 +0,0 @@ -#include -#include -#include "bcf.h" - -#define MAX_GENO 359 - -int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; -char *seq_nt16rev = "XACMGRSVTWYHKDBN"; - -uint32_t *bcf_trio_prep(int is_x, int is_son) -{ - int i, j, k, n, map[10]; - uint32_t *ret; - ret = calloc(MAX_GENO, 4); - for (i = 0, k = 0; i < 4; ++i) - for (j = i; j < 4; ++j) - map[k++] = 1<n_smpl != 3) return -1; // not a trio - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - gl10 = alloca(10 * b->n_smpl); - if (bcf_gl10(b, gl10) < 0) { - if (bcf_gl10_indel(b, gl10) < 0) return -1; - } - PL = b->gi + i; - for (i = 0, k = 0; i < 4; ++i) - for (j = i; j < 4; ++j) - map[k++] = seq_nt16rev[1<data)[j * PL->len] != 0) break; - if (j < 3) { // we need to go through the complex procedure - uint8_t *g[3]; - int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0; - g[0] = gl10; - g[1] = gl10 + 10; - g[2] = gl10 + 20; - for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint - int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff]; - if (sum < minc) minc = sum, minc_j = j; - } - gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16; - for (j = 0; j < 3; ++j) { // compute LK without constraint - int min = 1<<30, min_k = -1; - for (k = 0; k < 10; ++k) - if (g[j][k] < min) min = g[j][k], min_k = k; - gtf |= map[min_k]<<(j*8); - minf += min; - } - *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf; - } else *llr = 0, *gt = -1; - return 0; -} - -int bcf_pair_call(const bcf1_t *b) -{ - int i, j, k; - const bcf_ginfo_t *PL; - if (b->n_smpl != 2) return -1; // not a pair - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - PL = b->gi + i; - for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members - if (((uint8_t*)PL->data)[j * PL->len] != 0) break; - if (j < 2) { // we need to go through the complex procedure - uint8_t *g[2]; - int minc = 1<<30, minf = 0; - g[0] = PL->data; - g[1] = (uint8_t*)PL->data + PL->len; - for (j = 0; j < PL->len; ++j) // compute LK with constraint - minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j]; - for (j = 0; j < 2; ++j) { // compute LK without constraint - int min = 1<<30; - for (k = 0; k < PL->len; ++k) - min = min < g[j][k]? min : g[j][k]; - minf += min; - } - return minc - minf; - } else return 0; -} - -int bcf_min_diff(const bcf1_t *b) -{ - int i, min = 1<<30; - const bcf_ginfo_t *PL; - for (i = 0; i < b->n_gi; ++i) - if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; - if (i == b->n_gi) return -1; // no PL - PL = b->gi + i; - for (i = 0; i < b->n_smpl; ++i) { - int m1, m2, j; - const uint8_t *p = (uint8_t*)PL->data; - m1 = m2 = 1<<30; - for (j = 0; j < PL->len; ++j) { - if ((int)p[j] < m1) m2 = m1, m1 = p[j]; - else if ((int)p[j] < m2) m2 = p[j]; - } - min = min < m2 - m1? min : m2 - m1; - } - return min; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/prob1.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/prob1.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/prob1.c 2016-02-14 18:21:17.503079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/prob1.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,554 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "prob1.h" - -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 16384) - -#define MC_MAX_EM_ITER 16 -#define MC_EM_EPS 1e-5 -#define MC_DEF_INDEL 0.15 - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -struct __bcf_p1aux_t { - int n, M, n1, is_indel; - uint8_t *ploidy; // haploid or diploid ONLY - double *q2p, *pdg; // pdg -> P(D|g) - double *phi, *phi_indel; - double *z, *zswap; // aux for afs - double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set - double **hg; // hypergeometric distribution - double *lf; // log factorial - double t, t1, t2; - double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution - const uint8_t *PL; // point to PL - int PL_len; -}; - -void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) -{ - int i; - for (i = 0; i < ma->M; ++i) - ma->phi_indel[i] = ma->phi[i] * x; - ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x; -} - -static void init_prior(int type, double theta, int M, double *phi) -{ - int i; - if (type == MC_PTYPE_COND2) { - for (i = 0; i <= M; ++i) - phi[i] = 2. * (i + 1) / (M + 1) / (M + 2); - } else if (type == MC_PTYPE_FLAT) { - for (i = 0; i <= M; ++i) - phi[i] = 1. / (M + 1); - } else { - double sum; - for (i = 0, sum = 0.; i < M; ++i) - sum += (phi[i] = theta / (M - i)); - phi[M] = 1. - sum; - } -} - -void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) -{ - init_prior(type, theta, ma->M, ma->phi); - bcf_p1_indel_prior(ma, MC_DEF_INDEL); -} - -void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta) -{ - if (ma->n1 <= 0 || ma->n1 >= ma->M) return; - init_prior(type, theta, 2*ma->n1, ma->phi1); - init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2); -} - -int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) -{ - gzFile fp; - kstring_t s; - kstream_t *ks; - long double sum; - int dret, k; - memset(&s, 0, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - memset(ma->phi, 0, sizeof(double) * (ma->M + 1)); - while (ks_getuntil(ks, '\n', &s, &dret) >= 0) { - if (strstr(s.s, "[afs] ") == s.s) { - char *p = s.s + 6; - for (k = 0; k <= ma->M; ++k) { - int x; - double y; - x = strtol(p, &p, 10); - if (x != k && (errno == EINVAL || errno == ERANGE)) return -1; - ++p; - y = strtod(p, &p); - if (y == 0. && (errno == EINVAL || errno == ERANGE)) return -1; - ma->phi[ma->M - k] += y; - } - } - } - ks_destroy(ks); - gzclose(fp); - free(s.s); - for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k]; - fprintf(stderr, "[prior]"); - for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum; - for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]); - fputc('\n', stderr); - for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1)); - fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum); - for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M; - fprintf(stderr, "theta=%lf\n", (double)sum); - bcf_p1_indel_prior(ma, MC_DEF_INDEL); - return 0; -} - -bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) -{ - bcf_p1aux_t *ma; - int i; - ma = calloc(1, sizeof(bcf_p1aux_t)); - ma->n1 = -1; - ma->n = n; ma->M = 2 * n; - if (ploidy) { - ma->ploidy = malloc(n); - memcpy(ma->ploidy, ploidy, n); - for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; - if (ma->M == 2 * n) { - free(ma->ploidy); - ma->ploidy = 0; - } - } - ma->q2p = calloc(256, sizeof(double)); - ma->pdg = calloc(3 * ma->n, sizeof(double)); - ma->phi = calloc(ma->M + 1, sizeof(double)); - ma->phi_indel = calloc(ma->M + 1, sizeof(double)); - ma->phi1 = calloc(ma->M + 1, sizeof(double)); - ma->phi2 = calloc(ma->M + 1, sizeof(double)); - ma->z = calloc(ma->M + 1, sizeof(double)); - ma->zswap = calloc(ma->M + 1, sizeof(double)); - ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large - ma->z2 = calloc(ma->M + 1, sizeof(double)); - ma->afs = calloc(ma->M + 1, sizeof(double)); - ma->afs1 = calloc(ma->M + 1, sizeof(double)); - ma->lf = calloc(ma->M + 1, sizeof(double)); - for (i = 0; i < 256; ++i) - ma->q2p[i] = pow(10., -i / 10.); - for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1); - bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior - return ma; -} - -int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) -{ - if (n1 == 0 || n1 >= b->n) return -1; - if (b->M != b->n * 2) { - fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); - return -1; - } - b->n1 = n1; - return 0; -} - -void bcf_p1_destroy(bcf_p1aux_t *ma) -{ - if (ma) { - int k; - free(ma->lf); - if (ma->hg && ma->n1 > 0) { - for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); - free(ma->hg); - } - free(ma->ploidy); free(ma->q2p); free(ma->pdg); - free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); - free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); - free(ma->afs); free(ma->afs1); - free(ma); - } -} - -static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) -{ - int i, j; - long *p, tmp; - p = alloca(b->n_alleles * sizeof(long)); - memset(p, 0, sizeof(long) * b->n_alleles); - for (j = 0; j < ma->n; ++j) { - const uint8_t *pi = ma->PL + j * ma->PL_len; - double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i = 0; i < b->n_alleles; ++i) - p[i] += (int)pi[(i+1)*(i+2)/2-1]; - } - for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; - for (i = 1; i < b->n_alleles; ++i) // insertion sort - for (j = i; j > 0 && p[j] < p[j-1]; --j) - tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; - for (i = b->n_alleles - 1; i >= 0; --i) - if ((p[i]&0xf) == 0) break; - return i; -} - -int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) -{ - double sum, g[3]; - double max, f3[3], *pdg = ma->pdg + k * 3; - int q, i, max_i, ploidy; - ploidy = ma->ploidy? ma->ploidy[k] : 2; - if (ploidy == 2) { - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - } else { - f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; - } - for (i = 0, sum = 0.; i < 3; ++i) - sum += (g[i] = pdg[i] * f3[i]); - for (i = 0, max = -1., max_i = 0; i < 3; ++i) { - g[i] /= sum; - if (g[i] > max) max = g[i], max_i = i; - } - max = 1. - max; - if (max < 1e-308) max = 1e-308; - q = (int)(-4.343 * log(max) + .499); - if (q > 99) q = 99; - return q<<2|max_i; -} - -#define TINY 1e-20 - -static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) -{ - double *z[2], *tmp, *pdg; - int _j, last_min, last_max; - assert(beg == 0 || ma->M == ma->n*2); - z[0] = ma->z; - z[1] = ma->zswap; - pdg = ma->pdg; - memset(z[0], 0, sizeof(double) * (ma->M + 1)); - memset(z[1], 0, sizeof(double) * (ma->M + 1)); - z[0][0] = 1.; - last_min = last_max = 0; - ma->t = 0.; - if (ma->M == ma->n * 2) { - int M = 0; - for (_j = beg; _j < ma->n; ++_j) { - int k, j = _j - beg, _min = last_min, _max = last_max, M0; - double p[3], sum; - M0 = M; M += 2; - pdg = ma->pdg + _j * 3; - p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; - for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; - for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; - _max += 2; - if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; - if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; - for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / (M * (M - 1.))); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (_min >= 2) z[1][_min-2] = 0.; - if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; - if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset - ma->t1 = ma->t; - memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); - } - tmp = z[0]; z[0] = z[1]; z[1] = tmp; - last_min = _min; last_max = _max; - } - //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? - //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; - } else { // this block is very similar to the block above; these two might be merged in future - int j, M = 0; - for (j = 0; j < ma->n; ++j) { - int k, M0, _min = last_min, _max = last_max; - double p[3], sum; - pdg = ma->pdg + j * 3; - for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; - for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; - M0 = M; - M += ma->ploidy[j]; - if (ma->ploidy[j] == 1) { - p[0] = pdg[0]; p[1] = pdg[2]; - _max++; - if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; - for (k = _min < 1? 1 : _min; k <= _max; ++k) - z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / M); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (j < ma->n - 1) z[1][_max+1] = 0.; - } else if (ma->ploidy[j] == 2) { - p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; - _max += 2; - if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; - if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; - for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / (M * (M - 1.))); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (_min >= 2) z[1][_min-2] = 0.; - if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; - } - tmp = z[0]; z[0] = z[1]; z[1] = tmp; - last_min = _min; last_max = _max; - } - } - if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); -} - -static void mc_cal_y(bcf_p1aux_t *ma) -{ - if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples - int k; - long double x; - memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1)); - memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1)); - ma->t1 = ma->t2 = 0.; - mc_cal_y_core(ma, ma->n1); - ma->t2 = ma->t; - memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1)); - mc_cal_y_core(ma, 0); - // rescale z - x = expl(ma->t - (ma->t1 + ma->t2)); - for (k = 0; k <= ma->M; ++k) ma->z[k] *= x; - } else mc_cal_y_core(ma, 0); -} - -#define CONTRAST_TINY 1e-30 - -extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test - -static inline double chi2_test(int a, int b, int c, int d) -{ - double x, z; - x = (double)(a+b) * (c+d) * (b+d) * (a+c); - if (x == 0.) return 1; - z = a * d - b * c; - return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x); -} - -// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] -static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3]) -{ - double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; - int n1 = p1->n1, n2 = p1->n - p1->n1; - if (p < CONTRAST_TINY) return -1; - if (.5*k1/n1 < .5*k2/n2) x[1] += p; - else if (.5*k1/n1 > .5*k2/n2) x[2] += p; - else x[0] += p; - return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2); -} - -static double contrast2(bcf_p1aux_t *p1, double ret[3]) -{ - int k, k1, k2, k10, k20, n1, n2; - double sum; - // get n1 and n2 - n1 = p1->n1; n2 = p1->n - p1->n1; - if (n1 <= 0 || n2 <= 0) return 0.; - if (p1->hg == 0) { // initialize the hypergeometric distribution - /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way - to avoid precomputing this matrix, but it is slower and quite intricate. The following - computation in this block can be accelerated with a similar strategy, but perhaps this - is not a serious concern for now. */ - double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); - p1->hg = calloc(2*n1+1, sizeof(void*)); - for (k1 = 0; k1 <= 2*n1; ++k1) { - p1->hg[k1] = calloc(2*n2+1, sizeof(double)); - for (k2 = 0; k2 <= 2*n2; ++k2) - p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); - } - } - { // compute - long double suml = 0; - for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; - sum = suml; - } - { // get the max k1 and k2 - double max; - int max_k; - for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { - double x = p1->phi1[k] * p1->z1[k]; - if (x > max) max = x, max_k = k; - } - k10 = max_k; - for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { - double x = p1->phi2[k] * p1->z2[k]; - if (x > max) max = x, max_k = k; - } - k20 = max_k; - } - { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. - double x[3], y; - long double z = 0., L[2]; - x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0; - for (k1 = k10; k1 >= 0; --k1) { - for (k2 = k20; k2 >= 0; --k2) { - if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; - else z += y; - } - for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { - if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; - else z += y; - } - } - ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; - x[0] = x[1] = x[2] = 0; - for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { - for (k2 = k20; k2 >= 0; --k2) { - if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; - else z += y; - } - for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { - if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; - else z += y; - } - } - ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; - if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened - ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0; - for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) - for (k2 = 0; k2 <= 2*n2; ++k2) - if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y; - if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why... - z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; - } - return (double)z; - } -} - -static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) -{ - int k; - long double sum = 0., sum2; - double *phi = ma->is_indel? ma->phi_indel : ma->phi; - memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); - mc_cal_y(ma); - // compute AFS - for (k = 0, sum = 0.; k <= ma->M; ++k) - sum += (long double)phi[k] * ma->z[k]; - for (k = 0; k <= ma->M; ++k) { - ma->afs1[k] = phi[k] * ma->z[k] / sum; - if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; - } - // compute folded variant probability - for (k = 0, sum = 0.; k <= ma->M; ++k) - sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; - for (k = 1, sum2 = 0.; k < ma->M; ++k) - sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; - *p_var_folded = sum2 / sum; - *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; - // the expected frequency - for (k = 0, sum = 0.; k <= ma->M; ++k) { - ma->afs[k] += ma->afs1[k]; - sum += k * ma->afs1[k]; - } - return sum / ma->M; -} - -int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) -{ - int i, k; - long double sum = 0.; - ma->is_indel = bcf_is_indel(b); - rst->perm_rank = -1; - // set PL and PL_len - for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - ma->PL = (uint8_t*)b->gi[i].data; - ma->PL_len = b->gi[i].len; - break; - } - } - if (i == b->n_gi) return -1; // no PL - if (b->n_alleles < 2) return -1; // FIXME: find a better solution - // - rst->rank0 = cal_pdg(b, ma); - rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); - rst->p_ref = ma->afs1[ma->M]; - for (k = 0, sum = 0.; k < ma->M; ++k) - sum += ma->afs1[k]; - rst->p_var = (double)sum; - { // compute the allele count - double max = -1; - rst->ac = -1; - for (k = 0; k <= ma->M; ++k) - if (max < ma->z[k]) max = ma->z[k], rst->ac = k; - rst->ac = ma->M - rst->ac; - } - // calculate f_flat and f_em - for (k = 0, sum = 0.; k <= ma->M; ++k) - sum += (long double)ma->z[k]; - rst->f_flat = 0.; - for (k = 0; k <= ma->M; ++k) { - double p = ma->z[k] / sum; - rst->f_flat += k * p; - } - rst->f_flat /= ma->M; - { // estimate equal-tail credible interval (95% level) - int l, h; - double p; - for (i = 0, p = 0.; i <= ma->M; ++i) - if (p + ma->afs1[i] > 0.025) break; - else p += ma->afs1[i]; - l = i; - for (i = ma->M, p = 0.; i >= 0; --i) - if (p + ma->afs1[i] > 0.025) break; - else p += ma->afs1[i]; - h = i; - rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; - } - if (ma->n1 > 0) { // compute LRT - double max0, max1, max2; - for (k = 0, max0 = -1; k <= ma->M; ++k) - if (max0 < ma->z[k]) max0 = ma->z[k]; - for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k) - if (max1 < ma->z1[k]) max1 = ma->z1[k]; - for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k) - if (max2 < ma->z2[k]) max2 = ma->z2[k]; - rst->lrt = log(max1 * max2 / max0); - rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt); - } else rst->lrt = -1.0; - rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; - if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant - rst->p_chi2 = contrast2(ma, rst->cmp); - return 0; -} - -void bcf_p1_dump_afs(bcf_p1aux_t *ma) -{ - int k; - fprintf(stderr, "[afs]"); - for (k = 0; k <= ma->M; ++k) - fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); - fprintf(stderr, "\n"); - memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/prob1.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/prob1.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/prob1.h 2016-02-14 18:21:17.504079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/prob1.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -#ifndef BCF_PROB1_H -#define BCF_PROB1_H - -#include "bcf.h" - -struct __bcf_p1aux_t; -typedef struct __bcf_p1aux_t bcf_p1aux_t; - -typedef struct { - int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() - int ac; // ML alternative allele count - double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; - double cil, cih; - double cmp[3], p_chi2, lrt; // used by contrast2() -} bcf_p1rst_t; - -#define MC_PTYPE_FULL 1 -#define MC_PTYPE_COND2 2 -#define MC_PTYPE_FLAT 3 - -#ifdef __cplusplus -extern "C" { -#endif - - bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); - void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); - void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); - void bcf_p1_destroy(bcf_p1aux_t *ma); - int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); - int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); - void bcf_p1_dump_afs(bcf_p1aux_t *ma); - int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); - int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); - void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called - - int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/README tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/README --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/README 2016-02-14 18:21:17.473079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -The view command of bcftools calls variants, tests Hardy-Weinberg -equilibrium (HWE), tests allele balances and estimates allele frequency. - -This command calls a site as a potential variant if P(ref|D,F) is below -0.9 (controlled by the -p option), where D is data and F is the prior -allele frequency spectrum (AFS). - -The view command performs two types of allele balance tests, both based -on Fisher's exact test for 2x2 contingency tables with the row variable -being reference allele or not. In the first table, the column variable -is strand. Two-tail P-value is taken. We test if variant bases tend to -come from one strand. In the second table, the column variable is -whether a base appears in the first or the last 11bp of the read. -One-tail P-value is taken. We test if variant bases tend to occur -towards the end of reads, which is usually an indication of -misalignment. - -Site allele frequency is estimated in two ways. In the first way, the -frequency is esimated as \argmax_f P(D|f) under the assumption of -HWE. Prior AFS is not used. In the second way, the frequency is -estimated as the posterior expectation of allele counts \sum_k -kP(k|D,F), dividied by the total number of haplotypes. HWE is not -assumed, but the estimate depends on the prior AFS. The two estimates -largely agree when the signal is strong, but may differ greatly on weak -sites as in this case, the prior plays an important role. - -To test HWE, we calculate the posterior distribution of genotypes -(ref-hom, het and alt-hom). Chi-square test is performed. It is worth -noting that the model used here is prior dependent and assumes HWE, -which is different from both models for allele frequency estimate. The -new model actually yields a third estimate of site allele frequency. - -The estimate allele frequency spectrum is printed to stderr per 64k -sites. The estimate is in fact only the first round of a EM -procedure. The second model (not the model for HWE testing) is used to -estimate the AFS. \ No newline at end of file diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/vcf.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/vcf.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/vcf.c 2016-02-14 18:21:17.505079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/vcf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,244 +0,0 @@ -#include -#include -#include -#include -#include "bcf.h" -#include "kstring.h" -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 4096) - -typedef struct { - gzFile fp; - FILE *fpout; - kstream_t *ks; - void *refhash; - kstring_t line; - int max_ref; -} vcf_t; - -bcf_hdr_t *vcf_hdr_read(bcf_t *bp) -{ - kstring_t meta, smpl; - int dret; - vcf_t *v; - bcf_hdr_t *h; - if (!bp->is_vcf) return bcf_hdr_read(bp); - h = calloc(1, sizeof(bcf_hdr_t)); - v = (vcf_t*)bp->v; - v->line.l = 0; - memset(&meta, 0, sizeof(kstring_t)); - memset(&smpl, 0, sizeof(kstring_t)); - while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { - if (v->line.l < 2) continue; - if (v->line.s[0] != '#') return 0; // no sample line - if (v->line.s[0] == '#' && v->line.s[1] == '#') { - kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); - } else if (v->line.s[0] == '#') { - int k; - ks_tokaux_t aux; - char *p; - for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { - if (k >= 9) { - kputsn(p, aux.p - p, &smpl); - kputc('\0', &smpl); - } - } - break; - } - } - kputc('\0', &meta); - h->name = 0; - h->sname = smpl.s; h->l_smpl = smpl.l; - h->txt = meta.s; h->l_txt = meta.l; - bcf_hdr_sync(h); - return h; -} - -bcf_t *vcf_open(const char *fn, const char *mode) -{ - bcf_t *bp; - vcf_t *v; - if (strchr(mode, 'b')) return bcf_open(fn, mode); - bp = calloc(1, sizeof(bcf_t)); - v = calloc(1, sizeof(vcf_t)); - bp->is_vcf = 1; - bp->v = v; - v->refhash = bcf_str2id_init(); - if (strchr(mode, 'r')) { - v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - v->ks = ks_init(v->fp); - } else if (strchr(mode, 'w')) - v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout; - return bp; -} - -int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) -{ - vcf_t *v; - gzFile fp; - kstream_t *ks; - kstring_t s, rn; - int dret; - if (bp == 0) return -1; - if (!bp->is_vcf) return 0; - s.l = s.m = 0; s.s = 0; - rn.m = rn.l = h->l_nm; rn.s = h->name; - v = (vcf_t*)bp->v; - fp = gzopen(fn, "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, &s, &dret) >= 0) { - bcf_str2id_add(v->refhash, strdup(s.s)); - kputs(s.s, &rn); kputc('\0', &rn); - if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); - } - ks_destroy(ks); - gzclose(fp); - h->l_nm = rn.l; h->name = rn.s; - bcf_hdr_sync(h); - free(s.s); - return 0; -} - -int vcf_close(bcf_t *bp) -{ - vcf_t *v; - if (bp == 0) return -1; - if (!bp->is_vcf) return bcf_close(bp); - v = (vcf_t*)bp->v; - if (v->fp) { - ks_destroy(v->ks); - gzclose(v->fp); - } - if (v->fpout) fclose(v->fpout); - free(v->line.s); - bcf_str2id_thorough_destroy(v->refhash); - free(v); - free(bp); - return 0; -} - -int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) -{ - vcf_t *v = (vcf_t*)bp->v; - int i, has_ver = 0; - if (!bp->is_vcf) return bcf_hdr_write(bp, h); - if (h->l_txt > 0) { - if (strstr(h->txt, "##fileformat=")) has_ver = 1; - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); - fwrite(h->txt, 1, h->l_txt - 1, v->fpout); - } - if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); - fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); - for (i = 0; i < h->n_smpl; ++i) - fprintf(v->fpout, "\t%s", h->sns[i]); - fputc('\n', v->fpout); - return 0; -} - -int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) -{ - vcf_t *v = (vcf_t*)bp->v; - extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s); - if (!bp->is_vcf) return bcf_write(bp, h, b); - bcf_fmt_core(h, b, &v->line); - fwrite(v->line.s, 1, v->line.l, v->fpout); - fputc('\n', v->fpout); - return v->line.l + 1; -} - -int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) -{ - int dret, k, i, sync = 0; - vcf_t *v = (vcf_t*)bp->v; - char *p, *q; - kstring_t str, rn; - ks_tokaux_t aux, a2; - if (!bp->is_vcf) return bcf_read(bp, h, b); - v->line.l = 0; - str.l = 0; str.m = b->m_str; str.s = b->str; - rn.l = rn.m = h->l_nm; rn.s = h->name; - if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1; - b->n_smpl = h->n_smpl; - for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { - *(char*)aux.p = 0; - if (k == 0) { // ref - int tid = bcf_str2id(v->refhash, p); - if (tid < 0) { - tid = bcf_str2id_add(v->refhash, strdup(p)); - kputs(p, &rn); kputc('\0', &rn); - sync = 1; - } - b->tid = tid; - } else if (k == 1) { // pos - b->pos = atoi(p) - 1; - } else if (k == 5) { // qual - b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0; - } else if (k <= 8) { // variable length strings - kputs(p, &str); kputc('\0', &str); - b->l_str = str.l; b->m_str = str.m; b->str = str.s; - if (k == 8) bcf_sync(b); - } else { // k > 9 - if (strncmp(p, "./.", 3) == 0) { - for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == bcf_str2int("GT", 2)) { - ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { - ((uint8_t*)b->gi[i].data)[k-9] = 0; - } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { - ((int32_t*)b->gi[i].data)[k-9] = 0; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { - ((uint16_t*)b->gi[i].data)[k-9] = 0; - } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - int y = b->n_alleles * (b->n_alleles + 1) / 2; - memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y); - } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { - int y = b->n_alleles * (b->n_alleles + 1) / 2; - memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); - } - } - goto endblock; - } - for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { - if (b->gi[i].fmt == bcf_str2int("GT", 2)) { - ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { - double _x = strtod(q, &q); - int x = (int)(_x + .499); - if (x > 255) x = 255; - ((uint8_t*)b->gi[i].data)[k-9] = x; - } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { - int x = strtol(q, &q, 10); - if (x > 0xffff) x = 0xffff; - ((uint32_t*)b->gi[i].data)[k-9] = x; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { - int x = strtol(q, &q, 10); - if (x > 0xffff) x = 0xffff; - ((uint16_t*)b->gi[i].data)[k-9] = x; - } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - int x, y, j; - uint8_t *data = (uint8_t*)b->gi[i].data; - y = b->n_alleles * (b->n_alleles + 1) / 2; - for (j = 0; j < y; ++j) { - x = strtol(q, &q, 10); - if (x > 255) x = 255; - data[(k-9) * y + j] = x; - ++q; - } - } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { - int j, y; - float x, *data = (float*)b->gi[i].data; - y = b->n_alleles * (b->n_alleles + 1) / 2; - for (j = 0; j < y; ++j) { - x = strtod(q, &q); - data[(k-9) * y + j] = x > 0? -x/10. : x; - ++q; - } - } - } - endblock: i = i; - } - } - h->l_nm = rn.l; h->name = rn.s; - if (sync) bcf_hdr_sync(h); - return v->line.l + 1; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/vcfutils.pl tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/vcfutils.pl --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bcftools/vcfutils.pl 2016-02-14 18:21:17.514079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bcftools/vcfutils.pl 1970-01-01 00:00:00.000000000 +0000 @@ -1,567 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 - -use strict; -use warnings; -use Getopt::Std; - -&main; -exit; - -sub main { - &usage if (@ARGV < 1); - my $command = shift(@ARGV); - my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, - hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, - gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); - die("Unknown command \"$command\".\n") if (!defined($func{$command})); - &{$func{$command}}; -} - -sub splitchr { - my %opts = (l=>5000000); - getopts('l:', \%opts); - my $l = $opts{l}; - die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] \n/) if (@ARGV == 0 && -t STDIN); - while (<>) { - my @t = split; - my $last = 0; - for (my $i = 0; $i < $t[1];) { - my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l; - print "$t[0]:".($i+1)."-$e\n"; - $i = $e; - } - } -} - -sub subsam { - die(qq/Usage: vcfutils.pl subsam [samples]\n/) if (@ARGV == 0); - my ($fh, %h); - my $fn = shift(@ARGV); - my @col; - open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; - $h{$_} = 1 for (@ARGV); - while (<$fh>) { - if (/^##/) { - print; - } elsif (/^#/) { - my @t = split; - my @s = @t[0..8]; # all fixed fields + FORMAT - for (9 .. $#t) { - if ($h{$t[$_]}) { - push(@s, $t[$_]); - push(@col, $_); - } - } - pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field - print join("\t", @s), "\n"; - } else { - my @t = split; - if (@col == 0) { - print join("\t", @t[0..7]), "\n"; - } else { - print join("\t", @t[0..8], map {$t[$_]} @col), "\n"; - } - } - } - close($fh); -} - -sub listsam { - die(qq/Usage: vcfutils.pl listsam \n/) if (@ARGV == 0 && -t STDIN); - while (<>) { - if (/^#/ && !/^##/) { - my @t = split; - print join("\n", @t[9..$#t]), "\n"; - exit; - } - } -} - -sub fillac { - die(qq/Usage: vcfutils.pl fillac \n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN); - while (<>) { - if (/^#/) { - print; - } else { - my @t = split; - my @c = (0, 0); - my $n = 0; - my $s = -1; - @_ = split(":", $t[8]); - for (0 .. $#_) { - if ($_[$_] eq 'GT') { $s = $_; last; } - } - if ($s < 0) { - print join("\t", @t), "\n"; - next; - } - for (9 .. $#t) { - if ($t[$_] =~ /^0,0,0/) { - } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) { - ++$c[$2]; ++$c[$3]; - $n += 2; - } - } - my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n"; - my $info = $t[7]; - $info =~ s/(;?)AC=(\d+)//; - $info =~ s/(;?)AN=(\d+)//; - if ($info eq '.') { - $info = $AC; - } else { - $info .= ";$AC"; - } - $t[7] = $info; - print join("\t", @t), "\n"; - } - } -} - -sub ldstats { - my %opts = (t=>0.9); - getopts('t:', \%opts); - die("Usage: vcfutils.pl ldstats [-t $opts{t}] \n") if (@ARGV == 0 && -t STDIN); - my $cutoff = $opts{t}; - my ($last, $lastchr) = (0x7fffffff, ''); - my ($x, $y, $n) = (0, 0, 0); - while (<>) { - if (/^([^#\s]+)\s(\d+)/) { - my ($chr, $pos) = ($1, $2); - if (/NEIR=([\d\.]+)/) { - ++$n; - ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff); - } - $last = $pos; $lastchr = $chr; - } - } - print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n"; - print "Fraction: ", $y/$n, "\n"; - print "Length: $x\n"; -} - -sub qstats { - my %opts = (r=>'', s=>0.02, v=>undef); - getopts('r:s:v', \%opts); - die("Usage: vcfutils.pl qstats [-r ref.vcf] \n -Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN); - my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); - my %h = (); - my $is_vcf = defined($opts{v})? 1 : 0; - if ($opts{r}) { # read the reference positions - my $fh; - open($fh, $opts{r}) || die; - while (<$fh>) { - next if (/^#/); - if ($is_vcf) { - my @t = split; - $h{$t[0],$t[1]} = $t[4]; - } else { - $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); - } - } - close($fh); - } - my $hsize = scalar(keys %h); - my @a; - while (<>) { - next if (/^#/); - my @t = split; - next if (length($t[3]) != 1 || uc($t[3]) eq 'N'); - $t[3] = uc($t[3]); $t[4] = uc($t[4]); - my @s = split(',', $t[4]); - $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0); - next if (length($s[0]) != 1); - my $hit; - if ($is_vcf) { - $hit = 0; - my $aa = $h{$t[0],$t[1]}; - if (defined($aa)) { - my @aaa = split(",", $aa); - for (@aaa) { - $hit = 1 if ($_ eq $s[0]); - } - } - } else { - $hit = defined($h{$t[0],$t[1]})? 1 : 0; - } - push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]); - } - push(@a, [-1, 0, 0, 0]); # end marker - die("[qstats] No SNP data!\n") if (@a == 0); - @a = sort {$b->[0]<=>$a->[0]} @a; - my $next = $opts{s}; - my $last = $a[0]; - my @c = (0, 0, 0, 0); - my @lc; - $lc[1] = $lc[2] = 0; - for my $p (@a) { - if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) { - my @x; - $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100); - $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0); - $x[2] = sprintf("%.4f", $c[3] / $c[1]); - my $a = $c[1] - $lc[1]; - my $b = $c[2] - $lc[2]; - $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100); - print join("\t", $last, @c, @x), "\n"; - $next = $c[0]/@a + $opts{s}; - $lc[1] = $c[1]; $lc[2] = $c[2]; - } - ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3]; - $last = $p->[0]; - } -} - -sub varFilter { - my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); - getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); - die(qq/ -Usage: vcfutils.pl varFilter [options] - -Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] - -d INT minimum read depth [$opts{d}] - -D INT maximum read depth [$opts{D}] - -a INT minimum number of alternate bases [$opts{a}] - -w INT SNP within INT bp around a gap to be filtered [$opts{w}] - -W INT window size for filtering adjacent gaps [$opts{W}] - -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] - -2 FLOAT min P-value for baseQ bias [$opts{2}] - -3 FLOAT min P-value for mapQ bias [$opts{3}] - -4 FLOAT min P-value for end distance bias [$opts{4}] - -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] - -p print filtered variants - -Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. -\n/) if (@ARGV == 0 && -t STDIN); - - # calculate the window size - my ($ol, $ow) = ($opts{W}, $opts{w}); - my $max_dist = $ol > $ow? $ol : $ow; - # the core loop - my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...) - while (<>) { - my @t = split; - if (/^#/) { - print; next; - } - next if ($t[4] eq '.'); # skip non-var sites - next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') - # check if the site is a SNP - my $type = 1; # SNP - if (length($t[3]) > 1) { - $type = 2; # MNP - my @s = split(',', $t[4]); - for (@s) { - $type = 3 if (length != length($t[3])); - } - } else { - my @s = split(',', $t[4]); - for (@s) { - $type = 3 if (length > 1); - } - } - # clear the out-of-range elements - while (@staging) { - # Still on the same chromosome and the first element's window still affects this position? - last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); - varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much - } - my $flt = 0; - # parse annotations - my ($dp, $mq, $dp_alt) = (-1, -1, -1); - if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) { - $dp = $1 + $2 + $3 + $4; - $dp_alt = $3 + $4; - } - if ($t[7] =~ /DP=(\d+)/i) { - $dp = $1; - } - $mq = $1 if ($t[7] =~ /MQ=(\d+)/i); - # the depth and mapQ filter - if ($dp >= 0) { - if ($dp < $opts{d}) { - $flt = 2; - } elsif ($dp > $opts{D}) { - $flt = 3; - } - } - $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); - $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); - $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ - && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); - $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); - # HWE filter - if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { - my $p = 2*$1 + $2; - my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; - $flt = 9 if ($f < 0); - } - - my $score = $t[5] * 100 + $dp_alt; - my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs - if ($flt == 0) { - if ($type == 3) { # an indel - # filtering SNPs and MNPs - for my $x (@staging) { - next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); - $x->[1] = 5; - } - # check the staging list for indel filtering - for my $x (@staging) { - next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); - if ($x->[0]>>2 < $score) { - $x->[1] = 6; - } else { - $flt = 6; last; - } - } - } else { # SNP or MNP - for my $x (@staging) { - next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); - if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) - && length($x->[7]) - length($x->[6]) == 1) { - $x->[1] = 5; - } else { $flt = 5; } - last; - } - # check MNP - for my $x (@staging) { - next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); - if ($x->[0]>>2 < $score) { - $x->[1] = 8; - } else { - $flt = 8; last; - } - } - } - } - push(@staging, [$score<<2|$type, $flt, $rlen, @t]); - } - # output the last few elements in the staging list - while (@staging) { - varFilter_aux(shift @staging, $opts{p}); - } -} - -sub varFilter_aux { - my ($first, $is_print) = @_; - if ($first->[1] == 0) { - print join("\t", @$first[3 .. @$first-1]), "\n"; - } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; - } -} - -sub gapstats { - my (@c0, @c1); - $c0[$_] = $c1[$_] = 0 for (0 .. 10000); - while (<>) { - next if (/^#/); - my @t = split; - next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel - my @s = split(',', $t[4]); - for my $x (@s) { - my $l = length($x) - length($t[3]) + 5000; - if ($x =~ /^-/) { - $l = -(length($x) - 1) + 5000; - } elsif ($x =~ /^\+/) { - $l = length($x) - 1 + 5000; - } - $c0[$l] += 1 / @s; - } - } - for (my $i = 0; $i < 10000; ++$i) { - next if ($c0[$i] == 0); - $c1[0] += $c0[$i]; - $c1[1] += $c0[$i] if (($i-5000)%3 == 0); - printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]); - } - printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]); -} - -sub ucscsnp2vcf { - die("Usage: vcfutils.pl \n") if (@ARGV == 0 && -t STDIN); - print "##fileformat=VCFv4.0\n"; - print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n"; - while (<>) { - my @t = split("\t"); - my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1; - my $pos = $t[2] + 1; - my @alt; - push(@alt, $t[7]); - if ($t[6] eq '-') { - $t[9] = reverse($t[9]); - $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; - } - my @a = split("/", $t[9]); - for (@a) { - push(@alt, $_) if ($_ ne $alt[0]); - } - if ($indel) { - --$pos; - for (0 .. $#alt) { - $alt[$_] =~ tr/-//d; - $alt[$_] = "N$alt[$_]"; - } - } - my $ref = shift(@alt); - my $af = $t[13] > 0? ";AF=$t[13]" : ''; - my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]"; - my $info = "molType=$t[10];class=$t[11]$valid$af"; - print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n"; - } -} - -sub hapmap2vcf { - die("Usage: vcfutils.pl \n") if (@ARGV == 0); - my $fn = shift(@ARGV); - # parse UCSC SNP - warn("Parsing UCSC SNPs...\n"); - my ($fh, %map); - open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; - while (<$fh>) { - my @t = split; - next if ($t[3] - $t[2] != 1); # not SNP - @{$map{$t[4]}} = @t[1,3,7]; - } - close($fh); - # write VCF - warn("Writing VCF...\n"); - print "##fileformat=VCFv4.0\n"; - while (<>) { - my @t = split; - if ($t[0] eq 'rs#') { # the first line - print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n"; - } else { - next unless ($map{$t[0]}); - next if (length($t[1]) != 3); # skip non-SNPs - my $a = \@{$map{$t[0]}}; - my $ref = $a->[2]; - my @u = split('/', $t[1]); - if ($u[1] eq $ref) { - $u[1] = $u[0]; $u[0] = $ref; - } elsif ($u[0] ne $ref) { next; } - my $alt = $u[1]; - my %w; - $w{$u[0]} = 0; $w{$u[1]} = 1; - my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT'); - my $is_tri = 0; - for (@t[11..$#t]) { - if ($_ eq 'NN') { - push(@s, './.'); - } else { - my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)}); - if (!defined($a[0]) || !defined($a[1])) { - $is_tri = 1; - last; - } - push(@s, "$a[0]/$a[1]"); - } - } - next if ($is_tri); - print join("\t", @s), "\n"; - } - } -} - -sub vcf2fq { - my %opts = (d=>3, D=>100000, Q=>10, l=>5); - getopts('d:D:Q:l:', \%opts); - die(qq/ -Usage: vcfutils.pl vcf2fq [options] - -Options: -d INT minimum depth [$opts{d}] - -D INT maximum depth [$opts{D}] - -Q INT min RMS mapQ [$opts{Q}] - -l INT INDEL filtering window [$opts{l}] -\n/) if (@ARGV == 0 && -t STDIN); - - my ($last_chr, $seq, $qual, $last_pos, @gaps); - my $_Q = $opts{Q}; - my $_d = $opts{d}; - my $_D = $opts{D}; - - my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', - GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); - - $last_chr = ''; - while (<>) { - next if (/^#/); - my @t = split; - if ($last_chr ne $t[0]) { - &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); - ($last_chr, $last_pos) = ($t[0], 0); - $seq = $qual = ''; - @gaps = (); - } - die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); - if ($t[1] - $last_pos > 1) { - $seq .= 'n' x ($t[1] - $last_pos - 1); - $qual .= '!' x ($t[1] - $last_pos - 1); - } - if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference - my ($ref, $alt) = ($t[3], $1); - my ($b, $q); - $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); - if ($q < 0) { - $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; - $b = ($_ < .5 || $alt eq '.')? $ref : $alt; - $q = -$q; - } else { - $b = $het{"$ref$alt"}; - $b ||= 'N'; - } - $b = lc($b); - $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); - $q = int($q + 33 + .499); - $q = chr($q <= 126? $q : 126); - $seq .= $b; - $qual .= $q; - } elsif ($t[4] ne '.') { # an INDEL - push(@gaps, [$t[1], length($t[3])]); - } - $last_pos = $t[1]; - } - &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); -} - -sub v2q_post_process { - my ($chr, $seq, $qual, $gaps, $l) = @_; - for my $g (@$gaps) { - my $beg = $g->[0] > $l? $g->[0] - $l : 0; - my $end = $g->[0] + $g->[1] + $l; - $end = length($$seq) if ($end > length($$seq)); - substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); - } - print "\@$chr\n"; &v2q_print_str($seq); - print "+\n"; &v2q_print_str($qual); -} - -sub v2q_print_str { - my ($s) = @_; - my $l = length($$s); - for (my $i = 0; $i < $l; $i += 60) { - print substr($$s, $i, 60), "\n"; - } -} - -sub usage { - die(qq/ -Usage: vcfutils.pl []\n -Command: subsam get a subset of samples - listsam list the samples - fillac fill the allele count field - qstats SNP stats stratified by QUAL - - hapmap2vcf convert the hapmap format to VCF - ucscsnp2vcf convert UCSC SNP SQL dump to VCF - - varFilter filtering short variants (*) - vcf2fq VCF->fastq (**) - -Notes: Commands with description endting with (*) may need bcftools - specific annotations. -\n/); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bedidx.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bedidx.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bedidx.c 2016-02-14 18:21:17.522079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bedidx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include - -#ifdef _WIN32 -#define drand48() ((double)rand() / RAND_MAX) -#endif - -#include "ksort.h" -KSORT_INIT_GENERIC(uint64_t) - -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 8192) - -typedef struct { - int n, m; - uint64_t *a; - int *idx; -} bed_reglist_t; - -#include "khash.h" -KHASH_MAP_INIT_STR(reg, bed_reglist_t) - -#define LIDX_SHIFT 13 - -typedef kh_reg_t reghash_t; - -int *bed_index_core(int n, uint64_t *a, int *n_idx) -{ - int i, j, m, *idx; - m = *n_idx = 0; idx = 0; - for (i = 0; i < n; ++i) { - int beg, end; - beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; - if (m < end + 1) { - int oldm = m; - m = end + 1; - kroundup32(m); - idx = realloc(idx, m * sizeof(int)); - for (j = oldm; j < m; ++j) idx[j] = -1; - } - if (beg == end) { - if (idx[beg] < 0) idx[beg] = i; - } else { - for (j = beg; j <= end; ++j) - if (idx[j] < 0) idx[j] = i; - } - *n_idx = end + 1; - } - return idx; -} - -void bed_index(void *_h) -{ - reghash_t *h = (reghash_t*)_h; - khint_t k; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - bed_reglist_t *p = &kh_val(h, k); - if (p->idx) free(p->idx); - ks_introsort(uint64_t, p->n, p->a); - p->idx = bed_index_core(p->n, p->a, &p->m); - } - } -} - -int bed_overlap_core(const bed_reglist_t *p, int beg, int end) -{ - int i, min_off; - if (p->n == 0) return 0; - min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; - if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here - int n = beg>>LIDX_SHIFT; - if (n > p->n) n = p->n; - for (i = n - 1; i >= 0; --i) - if (p->idx[i] >= 0) break; - min_off = i >= 0? p->idx[i] : 0; - } - for (i = min_off; i < p->n; ++i) { - if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed - if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) - return 1; // find the overlap; return - } - return 0; -} - -int bed_overlap(const void *_h, const char *chr, int beg, int end) -{ - const reghash_t *h = (const reghash_t*)_h; - khint_t k; - if (!h) return 0; - k = kh_get(reg, h, chr); - if (k == kh_end(h)) return 0; - return bed_overlap_core(&kh_val(h, k), beg, end); -} - -void *bed_read(const char *fn) -{ - reghash_t *h = kh_init(reg); - gzFile fp; - kstream_t *ks; - int dret; - kstring_t *str; - // read the list - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - if (fp == 0) return 0; - str = calloc(1, sizeof(kstring_t)); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name - int beg = -1, end = -1; - bed_reglist_t *p; - khint_t k = kh_get(reg, h, str->s); - if (k == kh_end(h)) { // absent from the hash table - int ret; - char *s = strdup(str->s); - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); - } - p = &kh_val(h, k); - if (dret != '\n') { // if the lines has other characters - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - beg = atoi(str->s); // begin - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - end = atoi(str->s); // end - if (end < beg) end = -1; - } - } - } - } - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line - if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column - if (beg >= 0 && end > beg) { - if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | end; - } - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - bed_index(h); - return h; -} - -void bed_destroy(void *_h) -{ - reghash_t *h = (reghash_t*)_h; - khint_t k; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free(kh_val(h, k).idx); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzf.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzf.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzf.c 2016-02-14 18:21:17.524079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,714 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -/* - 2009-06-29 by lh3: cache recent uncompressed blocks. - 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. - 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ - -#include -#include -#include -#include -#include -#include -#include -#include "bgzf.h" - -#include "khash.h" -typedef struct { - int size; - uint8_t *block; - int64_t end_offset; -} cache_t; -KHASH_MAP_INIT_INT64(cache, cache_t) - -#if defined(_WIN32) || defined(_MSC_VER) -#define ftello(fp) ftell(fp) -#define fseeko(fp, offset, whence) fseek(fp, offset, whence) -#else -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); -#endif - -typedef int8_t bgzf_byte_t; - -static const int DEFAULT_BLOCK_SIZE = 64 * 1024; -static const int MAX_BLOCK_SIZE = 64 * 1024; - -static const int BLOCK_HEADER_LENGTH = 18; -static const int BLOCK_FOOTER_LENGTH = 8; - -static const int GZIP_ID1 = 31; -static const int GZIP_ID2 = 139; -static const int CM_DEFLATE = 8; -static const int FLG_FEXTRA = 4; -static const int OS_UNKNOWN = 255; -static const int BGZF_ID1 = 66; // 'B' -static const int BGZF_ID2 = 67; // 'C' -static const int BGZF_LEN = 2; -static const int BGZF_XLEN = 6; // BGZF_LEN+4 - -static const int GZIP_WINDOW_BITS = -15; // no zlib header -static const int Z_DEFAULT_MEM_LEVEL = 8; - - -inline -void -packInt16(uint8_t* buffer, uint16_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; -} - -inline -int -unpackInt16(const uint8_t* buffer) -{ - return (buffer[0] | (buffer[1] << 8)); -} - -inline -void -packInt32(uint8_t* buffer, uint32_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; - buffer[2] = value >> 16; - buffer[3] = value >> 24; -} - -static inline -int -bgzf_min(int x, int y) -{ - return (x < y) ? x : y; -} - -static -void -report_error(BGZF* fp, const char* message) { - fp->error = message; -} - -int bgzf_check_bgzf(const char *fn) -{ - BGZF *fp; - uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; - int n; - - if ((fp = bgzf_open(fn, "r")) == 0) - { - fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); - return -1; - } - -#ifdef _USE_KNETFILE - n = knet_read(fp->x.fpr, buf, 10); -#else - n = fread(buf, 1, 10, fp->file); -#endif - bgzf_close(fp); - - if ( n!=10 ) - return -1; - - if ( !memcmp(magic, buf, 10) ) return 1; - return 0; -} - -static BGZF *bgzf_read_init() -{ - BGZF *fp; - fp = calloc(1, sizeof(BGZF)); - fp->uncompressed_block_size = MAX_BLOCK_SIZE; - fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->cache_size = 0; - fp->cache = kh_init(cache); - return fp; -} - -static -BGZF* -open_read(int fd) -{ -#ifdef _USE_KNETFILE - knetFile *file = knet_dopen(fd, "r"); -#else - FILE* file = fdopen(fd, "r"); -#endif - BGZF* fp; - if (file == 0) return 0; - fp = bgzf_read_init(); - fp->file_descriptor = fd; - fp->open_mode = 'r'; -#ifdef _USE_KNETFILE - fp->x.fpr = file; -#else - fp->file = file; -#endif - return fp; -} - -static -BGZF* -open_write(int fd, int compress_level) // compress_level==-1 for the default level -{ - FILE* file = fdopen(fd, "w"); - BGZF* fp; - if (file == 0) return 0; - fp = malloc(sizeof(BGZF)); - fp->file_descriptor = fd; - fp->open_mode = 'w'; - fp->owned_file = 0; - fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 - if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; -#ifdef _USE_KNETFILE - fp->x.fpw = file; -#else - fp->file = file; -#endif - fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; - fp->uncompressed_block = NULL; - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->block_address = 0; - fp->block_offset = 0; - fp->block_length = 0; - fp->error = NULL; - return fp; -} - -BGZF* -bgzf_open(const char* __restrict path, const char* __restrict mode) -{ - BGZF* fp = NULL; - if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */ -#ifdef _USE_KNETFILE - knetFile *file = knet_open(path, mode); - if (file == 0) return 0; - fp = bgzf_read_init(); - fp->file_descriptor = -1; - fp->open_mode = 'r'; - fp->x.fpr = file; -#else - int fd, oflag = O_RDONLY; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag); - if (fd == -1) return 0; - fp = open_read(fd); -#endif - } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag, 0666); - if (fd == -1) return 0; - { // set compress_level - int i; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - } - fp = open_write(fd, compress_level); - } - if (fp != NULL) fp->owned_file = 1; - return fp; -} - -BGZF* -bgzf_fdopen(int fd, const char * __restrict mode) -{ - if (fd == -1) return 0; - if (mode[0] == 'r' || mode[0] == 'R') { - return open_read(fd); - } else if (mode[0] == 'w' || mode[0] == 'W') { - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - return open_write(fd, compress_level); - } else { - return NULL; - } -} - -static -int -deflate_block(BGZF* fp, int block_length) -{ - // Deflate the block in fp->uncompressed_block into fp->compressed_block. - // Also adds an extra field that stores the compressed block length. - - bgzf_byte_t* buffer = fp->compressed_block; - int buffer_size = fp->compressed_block_size; - - // Init gzip header - buffer[0] = GZIP_ID1; - buffer[1] = GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[4] = 0; // mtime - buffer[5] = 0; - buffer[6] = 0; - buffer[7] = 0; - buffer[8] = 0; - buffer[9] = OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[11] = 0; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - buffer[15] = 0; - buffer[16] = 0; // placeholder for block length - buffer[17] = 0; - - // loop to retry for blocks that do not compress enough - int input_length = block_length; - int compressed_length = 0; - while (1) { - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->uncompressed_block; - zs.avail_in = input_length; - zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, - GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (status != Z_OK) { - report_error(fp, "deflate init failed"); - return -1; - } - status = deflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - deflateEnd(&zs); - if (status == Z_OK) { - // Not enough space in buffer. - // Can happen in the rare case the input doesn't compress enough. - // Reduce the amount of input until it fits. - input_length -= 1024; - if (input_length <= 0) { - // should never happen - report_error(fp, "input reduction failed"); - return -1; - } - continue; - } - report_error(fp, "deflate failed"); - return -1; - } - status = deflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "deflate end failed"); - return -1; - } - compressed_length = zs.total_out; - compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if (compressed_length > MAX_BLOCK_SIZE) { - // should never happen - report_error(fp, "deflate overflow"); - return -1; - } - break; - } - - packInt16((uint8_t*)&buffer[16], compressed_length-1); - uint32_t crc = crc32(0L, NULL, 0L); - crc = crc32(crc, fp->uncompressed_block, input_length); - packInt32((uint8_t*)&buffer[compressed_length-8], crc); - packInt32((uint8_t*)&buffer[compressed_length-4], input_length); - - int remaining = block_length - input_length; - if (remaining > 0) { - if (remaining > input_length) { - // should never happen (check so we can use memcpy) - report_error(fp, "remainder too large"); - return -1; - } - memcpy(fp->uncompressed_block, - fp->uncompressed_block + input_length, - remaining); - } - fp->block_offset = remaining; - return compressed_length; -} - -static -int -inflate_block(BGZF* fp, int block_length) -{ - // Inflate the block in fp->compressed_block into fp->uncompressed_block - - z_stream zs; - int status; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->compressed_block + 18; - zs.avail_in = block_length - 16; - zs.next_out = fp->uncompressed_block; - zs.avail_out = fp->uncompressed_block_size; - - status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { - report_error(fp, "inflate init failed"); - return -1; - } - status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - inflateEnd(&zs); - report_error(fp, "inflate failed"); - return -1; - } - status = inflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "inflate failed"); - return -1; - } - return zs.total_out; -} - -static -int -check_header(const bgzf_byte_t* header) -{ - return (header[0] == GZIP_ID1 && - header[1] == (bgzf_byte_t) GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); -} - -static void free_cache(BGZF *fp) -{ - khint_t k; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (fp->open_mode != 'r') return; - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) free(kh_val(h, k).block); - kh_destroy(cache, h); -} - -static int load_block_from_cache(BGZF *fp, int64_t block_address) -{ - khint_t k; - cache_t *p; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - k = kh_get(cache, h, block_address); - if (k == kh_end(h)) return 0; - p = &kh_val(h, k); - if (fp->block_length != 0) fp->block_offset = 0; - fp->block_address = block_address; - fp->block_length = p->size; - memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); -#ifdef _USE_KNETFILE - knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); -#else - fseeko(fp->file, p->end_offset, SEEK_SET); -#endif - return p->size; -} - -static void cache_block(BGZF *fp, int size) -{ - int ret; - khint_t k; - cache_t *p; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (MAX_BLOCK_SIZE >= fp->cache_size) return; - if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { - /* A better way would be to remove the oldest block in the - * cache, but here we remove a random one for simplicity. This - * should not have a big impact on performance. */ - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) break; - if (k < kh_end(h)) { - free(kh_val(h, k).block); - kh_del(cache, h, k); - } - } - k = kh_put(cache, h, fp->block_address, &ret); - if (ret == 0) return; // if this happens, a bug! - p = &kh_val(h, k); - p->size = fp->block_length; - p->end_offset = fp->block_address + size; - p->block = malloc(MAX_BLOCK_SIZE); - memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); -} - -int -bgzf_read_block(BGZF* fp) -{ - bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int count, size = 0, block_length, remaining; -#ifdef _USE_KNETFILE - int64_t block_address = knet_tell(fp->x.fpr); - if (load_block_from_cache(fp, block_address)) return 0; - count = knet_read(fp->x.fpr, header, sizeof(header)); -#else - int64_t block_address = ftello(fp->file); - if (load_block_from_cache(fp, block_address)) return 0; - count = fread(header, 1, sizeof(header), fp->file); -#endif - if (count == 0) { - fp->block_length = 0; - return 0; - } - size = count; - if (count != sizeof(header)) { - report_error(fp, "read failed"); - return -1; - } - if (!check_header(header)) { - report_error(fp, "invalid block header"); - return -1; - } - block_length = unpackInt16((uint8_t*)&header[16]) + 1; - bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; - memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - remaining = block_length - BLOCK_HEADER_LENGTH; -#ifdef _USE_KNETFILE - count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); -#else - count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); -#endif - if (count != remaining) { - report_error(fp, "read failed"); - return -1; - } - size += count; - count = inflate_block(fp, block_length); - if (count < 0) return -1; - if (fp->block_length != 0) { - // Do not reset offset if this read follows a seek. - fp->block_offset = 0; - } - fp->block_address = block_address; - fp->block_length = count; - cache_block(fp, size); - return 0; -} - -int -bgzf_read(BGZF* fp, void* data, int length) -{ - if (length <= 0) { - return 0; - } - if (fp->open_mode != 'r') { - report_error(fp, "file not open for reading"); - return -1; - } - - int bytes_read = 0; - bgzf_byte_t* output = data; - while (bytes_read < length) { - int copy_length, available = fp->block_length - fp->block_offset; - bgzf_byte_t *buffer; - if (available <= 0) { - if (bgzf_read_block(fp) != 0) { - return -1; - } - available = fp->block_length - fp->block_offset; - if (available <= 0) { - break; - } - } - copy_length = bgzf_min(length-bytes_read, available); - buffer = fp->uncompressed_block; - memcpy(output, buffer + fp->block_offset, copy_length); - fp->block_offset += copy_length; - output += copy_length; - bytes_read += copy_length; - } - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return bytes_read; -} - -int bgzf_flush(BGZF* fp) -{ - while (fp->block_offset > 0) { - int count, block_length; - block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) return -1; -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif - if (count != block_length) { - report_error(fp, "write failed"); - return -1; - } - fp->block_address += block_length; - } - return 0; -} - -int bgzf_flush_try(BGZF *fp, int size) -{ - if (fp->block_offset + size > fp->uncompressed_block_size) - return bgzf_flush(fp); - return -1; -} - -int bgzf_write(BGZF* fp, const void* data, int length) -{ - const bgzf_byte_t *input = data; - int block_length, bytes_written; - if (fp->open_mode != 'w') { - report_error(fp, "file not open for writing"); - return -1; - } - - if (fp->uncompressed_block == NULL) - fp->uncompressed_block = malloc(fp->uncompressed_block_size); - - input = data; - block_length = fp->uncompressed_block_size; - bytes_written = 0; - while (bytes_written < length) { - int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); - bgzf_byte_t* buffer = fp->uncompressed_block; - memcpy(buffer + fp->block_offset, input, copy_length); - fp->block_offset += copy_length; - input += copy_length; - bytes_written += copy_length; - if (fp->block_offset == block_length) { - if (bgzf_flush(fp) != 0) { - break; - } - } - } - return bytes_written; -} - -int bgzf_close(BGZF* fp) -{ - if (fp->open_mode == 'w') { - if (bgzf_flush(fp) != 0) return -1; - { // add an empty block - int count, block_length = deflate_block(fp, 0); -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif - } -#ifdef _USE_KNETFILE - if (fflush(fp->x.fpw) != 0) { -#else - if (fflush(fp->file) != 0) { -#endif - report_error(fp, "flush failed"); - return -1; - } - } - if (fp->owned_file) { -#ifdef _USE_KNETFILE - int ret; - if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); - else ret = knet_close(fp->x.fpr); - if (ret != 0) return -1; -#else - if (fclose(fp->file) != 0) return -1; -#endif - } - free(fp->uncompressed_block); - free(fp->compressed_block); - free_cache(fp); - free(fp); - return 0; -} - -void bgzf_set_cache_size(BGZF *fp, int cache_size) -{ - if (fp) fp->cache_size = cache_size; -} - -int bgzf_check_EOF(BGZF *fp) -{ - static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; - uint8_t buf[28]; - off_t offset; -#ifdef _USE_KNETFILE - offset = knet_tell(fp->x.fpr); - if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; - knet_read(fp->x.fpr, buf, 28); - knet_seek(fp->x.fpr, offset, SEEK_SET); -#else - offset = ftello(fp->file); - if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; - fread(buf, 1, 28, fp->file); - fseeko(fp->file, offset, SEEK_SET); -#endif - return (memcmp(magic, buf, 28) == 0)? 1 : 0; -} - -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) -{ - int block_offset; - int64_t block_address; - - if (fp->open_mode != 'r') { - report_error(fp, "file not open for read"); - return -1; - } - if (where != SEEK_SET) { - report_error(fp, "unimplemented seek option"); - return -1; - } - block_offset = pos & 0xFFFF; - block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; -#ifdef _USE_KNETFILE - if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { -#else - if (fseeko(fp->file, block_address, SEEK_SET) != 0) { -#endif - report_error(fp, "seek failed"); - return -1; - } - fp->block_length = 0; // indicates current block is not loaded - fp->block_address = block_address; - fp->block_offset = block_offset; - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzf.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzf.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzf.h 2016-02-14 18:21:17.532079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzf.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#ifndef __BGZF_H -#define __BGZF_H - -#include -#include -#include -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -//typedef int8_t bool; - -typedef struct { - int file_descriptor; - char open_mode; // 'r' or 'w' - int16_t owned_file, compress_level; -#ifdef _USE_KNETFILE - union { - knetFile *fpr; - FILE *fpw; - } x; -#else - FILE* file; -#endif - int uncompressed_block_size; - int compressed_block_size; - void* uncompressed_block; - void* compressed_block; - int64_t block_address; - int block_length; - int block_offset; - int cache_size; - const char* error; - void *cache; // a pointer to a hash table -} BGZF; - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Open an existing file descriptor for reading or writing. - * Mode must be either "r" or "w". - * A subsequent bgzf_close will not close the file descriptor. - * Returns null on error. - */ -BGZF* bgzf_fdopen(int fd, const char* __restrict mode); - -/* - * Open the specified file for reading or writing. - * Mode must be either "r" or "w". - * Returns null on error. - */ -BGZF* bgzf_open(const char* path, const char* __restrict mode); - -/* - * Close the BGZ file and free all associated resources. - * Does not close the underlying file descriptor if created with bgzf_fdopen. - * Returns zero on success, -1 on error. - */ -int bgzf_close(BGZF* fp); - -/* - * Read up to length bytes from the file storing into data. - * Returns the number of bytes actually read. - * Returns zero on end of file. - * Returns -1 on error. - */ -int bgzf_read(BGZF* fp, void* data, int length); - -/* - * Write length bytes from data to the file. - * Returns the number of bytes written. - * Returns -1 on error. - */ -int bgzf_write(BGZF* fp, const void* data, int length); - -/* - * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent - * call to bgzf_seek can be used to position the file at the same point. - * Return value is non-negative on success. - * Returns -1 on error. - */ -#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) - -/* - * Set the file to read from the location specified by pos, which must - * be a value previously returned by bgzf_tell for this file (but not - * necessarily one returned by this file handle). - * The where argument must be SEEK_SET. - * Seeking on a file opened for write is not supported. - * Returns zero on success, -1 on error. - */ -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); - -/* - * Set the cache size. Zero to disable. By default, caching is - * disabled. The recommended cache size for frequent random access is - * about 8M bytes. - */ -void bgzf_set_cache_size(BGZF *fp, int cache_size); - -int bgzf_check_EOF(BGZF *fp); -int bgzf_read_block(BGZF* fp); -int bgzf_flush(BGZF* fp); -int bgzf_flush_try(BGZF *fp, int size); -int bgzf_check_bgzf(const char *fn); - -#ifdef __cplusplus -} -#endif - -static inline int bgzf_getc(BGZF *fp) -{ - int c; - if (fp->block_offset >= fp->block_length) { - if (bgzf_read_block(fp) != 0) return -2; /* error */ - if (fp->block_length == 0) return -1; /* end-of-file */ - } - c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return c; -} - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzip.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzip.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/bgzip.c 2016-02-14 18:21:17.541079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/bgzip.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,206 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "bgzf.h" - -static const int WINDOW_SIZE = 64 * 1024; - -static int bgzip_main_usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); - fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); - fprintf(stderr, " -d decompress\n"); - fprintf(stderr, " -f overwrite files without asking\n"); - fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); - fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); - fprintf(stderr, " -h give this help\n"); - fprintf(stderr, "\n"); - return 1; -} - -static int write_open(const char *fn, int is_forced) -{ - int fd = -1; - char c; - if (!is_forced) { - if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { - fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); - scanf("%c", &c); - if (c != 'Y' && c != 'y') { - fprintf(stderr, "[bgzip] not overwritten\n"); - exit(1); - } - } - } - if (fd < 0) { - if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { - fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); - exit(1); - } - } - return fd; -} - -static void fail(BGZF* fp) -{ - fprintf(stderr, "Error: %s\n", fp->error); - exit(1); -} - -int main(int argc, char **argv) -{ - int c, compress, pstdout, is_forced; - BGZF *fp; - void *buffer; - long start, end, size; - - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; - while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ - switch(c){ - case 'h': return bgzip_main_usage(); - case 'd': compress = 0; break; - case 'c': pstdout = 1; break; - case 'b': start = atol(optarg); break; - case 's': size = atol(optarg); break; - case 'f': is_forced = 1; break; - } - } - if (size >= 0) end = start + size; - if (end >= 0 && end < start) { - fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); - return 1; - } - if (compress == 1) { - struct stat sbuf; - int f_src = fileno(stdin); - int f_dst = fileno(stdout); - - if ( argc>optind ) - { - if ( stat(argv[optind],&sbuf)<0 ) - { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - - if (pstdout) - f_dst = fileno(stdout); - else - { - char *name = malloc(strlen(argv[optind]) + 5); - strcpy(name, argv[optind]); - strcat(name, ".gz"); - f_dst = write_open(name, is_forced); - if (f_dst < 0) return 1; - free(name); - } - } - else if (!pstdout && isatty(fileno((FILE *)stdout)) ) - return bgzip_main_usage(); - - fp = bgzf_fdopen(f_dst, "w"); - buffer = malloc(WINDOW_SIZE); - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) fail(fp); - // f_dst will be closed here - if (bgzf_close(fp) < 0) fail(fp); - if (argc > optind && !pstdout) unlink(argv[optind]); - free(buffer); - close(f_src); - return 0; - } else { - struct stat sbuf; - int f_dst; - - if ( argc>optind ) - { - if ( stat(argv[optind],&sbuf)<0 ) - { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - char *name; - int len = strlen(argv[optind]); - if ( strcmp(argv[optind]+len-3,".gz") ) - { - fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); - return 1; - } - fp = bgzf_open(argv[optind], "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); - return 1; - } - - if (pstdout) { - f_dst = fileno(stdout); - } - else { - name = strdup(argv[optind]); - name[strlen(name) - 3] = '\0'; - f_dst = write_open(name, is_forced); - free(name); - } - } - else if (!pstdout && isatty(fileno((FILE *)stdin)) ) - return bgzip_main_usage(); - else - { - f_dst = fileno(stdout); - fp = bgzf_fdopen(fileno(stdin), "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); - return 1; - } - } - buffer = malloc(WINDOW_SIZE); - if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); - while (1) { - if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); - else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); - if (c == 0) break; - if (c < 0) fail(fp); - start += c; - write(f_dst, buffer, c); - if (end >= 0 && start >= end) break; - } - free(buffer); - if (bgzf_close(fp) < 0) fail(fp); - if (!pstdout) unlink(argv[optind]); - return 0; - } -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/ChangeLog tophat-2.1.1+dfsg1/src/samtools-0.1.18/ChangeLog --- tophat-2.1.1+dfsg/src/samtools-0.1.18/ChangeLog 2016-02-14 18:21:17.374079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/ChangeLog 1970-01-01 00:00:00.000000000 +0000 @@ -1,5948 +0,0 @@ ------------------------------------------------------------------------- -r925 | lh3lh3 | 2011-02-28 15:45:17 -0500 (Mon, 28 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/phase.c - -minor changes to a heuristic rule - ------------------------------------------------------------------------- -r924 | lh3lh3 | 2011-02-28 15:24:04 -0500 (Mon, 28 Feb 2011) | 4 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/phase.c - - * 0.1.12-r924:126 - * fixed a bug in phase (due to recent changes) - * fixed a bug in vcf2fq - ------------------------------------------------------------------------- -r923 | lh3lh3 | 2011-02-28 12:57:39 -0500 (Mon, 28 Feb 2011) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/phase.c - - * put version number in bam.h - * write version to BCF - * in phase, change the default -q to 37 - * output a little more information during phasing - ------------------------------------------------------------------------- -r922 | lh3lh3 | 2011-02-25 16:40:09 -0500 (Fri, 25 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.tex - M /trunk/samtools/bcftools/bcf2qcall.c - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/ld.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/cut_target.c - - * change the order of PL/GL according to the latest VCF spec - * change the type of SP to int32_t - ------------------------------------------------------------------------- -r921 | lh3lh3 | 2011-02-25 14:40:56 -0500 (Fri, 25 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -update the BCF spec - ------------------------------------------------------------------------- -r920 | lh3lh3 | 2011-02-25 00:59:27 -0500 (Fri, 25 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/cut_target.c - M /trunk/samtools/errmod.h - M /trunk/samtools/faidx.c - M /trunk/samtools/khash.h - M /trunk/samtools/kstring.c - M /trunk/samtools/kstring.h - A /trunk/samtools/phase.c - M /trunk/samtools/samtools.1 - -added the phase command - ------------------------------------------------------------------------- -r918 | lh3lh3 | 2011-02-24 10:05:54 -0500 (Thu, 24 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -added "const" to bcf_p1_cal() - ------------------------------------------------------------------------- -r917 | lh3lh3 | 2011-02-24 09:36:30 -0500 (Thu, 24 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - -more meaningful BAM truncation message - ------------------------------------------------------------------------- -r916 | lh3lh3 | 2011-02-24 09:35:06 -0500 (Thu, 24 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * automatically fix errors in GL - * output unrecognized FORMAT as "." - ------------------------------------------------------------------------- -r913 | lh3lh3 | 2011-02-10 22:59:47 -0500 (Thu, 10 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - -finished VCF->BCF conversion - ------------------------------------------------------------------------- -r910 | petulda | 2011-02-03 03:13:48 -0500 (Thu, 03 Feb 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -Prevent division by zero ------------------------------------------------------------------------- -r909 | lh3lh3 | 2011-02-02 11:29:20 -0500 (Wed, 02 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -fixed a typo in the VCF header - ------------------------------------------------------------------------- -r908 | lh3lh3 | 2011-02-02 11:28:24 -0500 (Wed, 02 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam_index.c - - * fixed an out-of-boundary bug - * improved sorting order checking in index - ------------------------------------------------------------------------- -r907 | lh3lh3 | 2011-01-29 22:59:20 -0500 (Sat, 29 Jan 2011) | 4 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/bam_tview.c - M /trunk/samtools/knetfile.c - - * avoid a segfault when network connect fails - * update INSTALL - * fixed a bug in tview on big-endian by Nathan Weeks - ------------------------------------------------------------------------- -r903 | lh3lh3 | 2011-01-27 14:50:02 -0500 (Thu, 27 Jan 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - - * fixed a rare memory issue in bam_md.c - * fixed a bug in indel calling related to unmapped and refskip reads - ------------------------------------------------------------------------- -r902 | lh3lh3 | 2011-01-23 21:46:18 -0500 (Sun, 23 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/fet.c - -fixed two minor bugs in Fisher's exact test - ------------------------------------------------------------------------- -r899 | petulda | 2011-01-19 09:28:02 -0500 (Wed, 19 Jan 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -Skip sites with unknown ref ------------------------------------------------------------------------- -r898 | lh3lh3 | 2011-01-15 12:56:05 -0500 (Sat, 15 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - -move bam_nt16_nt4_table[] from bam_maqcns.c to bam_md.c - ------------------------------------------------------------------------- -r896 | lh3lh3 | 2011-01-06 10:52:15 -0500 (Thu, 06 Jan 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - - * samtools-0.1.12-10 (r896) - * allow to exclude read groups in mpileup - ------------------------------------------------------------------------- -r895 | lh3lh3 | 2011-01-04 11:31:29 -0500 (Tue, 04 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -sorry. It is SP not ST - ------------------------------------------------------------------------- -r894 | lh3lh3 | 2011-01-04 11:29:06 -0500 (Tue, 04 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -added ST - ------------------------------------------------------------------------- -r893 | petulda | 2011-01-04 06:55:56 -0500 (Tue, 04 Jan 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/call1.c - -Fixed a typo in read_samples ------------------------------------------------------------------------- -r892 | jmarshall | 2010-12-28 08:06:49 -0500 (Tue, 28 Dec 2010) | 9 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/examples/Makefile - -System libraries go *after* user libraries in link commands, because -the user libraries may themselves have dependencies that are satisfied -by the system libraries. It's not rocket science! - -This makes a difference with some linkers; or with -static or --as-needed. - -The examples/Makefile fix is from Charles Plessy. -See also http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=606004 - ------------------------------------------------------------------------- -r891 | lh3lh3 | 2010-12-21 12:16:33 -0500 (Tue, 21 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - - * samtools-0.1.12-9 (r891) - * allow to call SNPs from a subset of samples - ------------------------------------------------------------------------- -r889 | lh3lh3 | 2010-12-15 11:28:16 -0500 (Wed, 15 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-12 (r889) - * set mapQ as 20 if it equals 255 - ------------------------------------------------------------------------- -r888 | lh3lh3 | 2010-12-14 22:41:09 -0500 (Tue, 14 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -When -B is applied to mpileup, still use paired reads only unless -A is flagged. - ------------------------------------------------------------------------- -r887 | lh3lh3 | 2010-12-14 22:37:05 -0500 (Tue, 14 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-6 (r887) - * added a hidden option -E to mpileup/calmd. -E triggers an alternative way to apply BAQ. - ------------------------------------------------------------------------- -r886 | lh3lh3 | 2010-12-14 12:51:03 -0500 (Tue, 14 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - -(Arguably) improved the indel caller a tiny bit for lowCov data. - ------------------------------------------------------------------------- -r885 | petulda | 2010-12-14 04:55:46 -0500 (Tue, 14 Dec 2010) | 1 line -Changed paths: - M /trunk/samtools/bcftools/call1.c - -Fixed the VCF header to pass validation ------------------------------------------------------------------------- -r884 | lh3lh3 | 2010-12-12 23:02:19 -0500 (Sun, 12 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.12-4 (r884) - * fixed a long-existing flaw in the INDEL calling model - ------------------------------------------------------------------------- -r883 | lh3lh3 | 2010-12-11 20:05:42 -0500 (Sat, 11 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - -compute max SP and max GQ from sample genotypes - ------------------------------------------------------------------------- -r880 | lh3lh3 | 2010-12-10 10:50:54 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - D /trunk/samtools/bcftools/bcf-fix.pl - -drop bcf-fix.pl as it is redundant by the latest changes - ------------------------------------------------------------------------- -r879 | lh3lh3 | 2010-12-10 10:50:29 -0500 (Fri, 10 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - - * fixed a minor issue in printing VCFs - * write bcftools specific INFO and FORMAT in the header - ------------------------------------------------------------------------- -r878 | lh3lh3 | 2010-12-10 10:09:14 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -Make sure that the GT genotype field is the first - ------------------------------------------------------------------------- -r877 | lh3lh3 | 2010-12-08 17:27:05 -0500 (Wed, 08 Dec 2010) | 7 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-2 (r877) - - * allow to fine control the selection of indel candidates. The current - setting is okay for lowCov and highCov with ~100 samples, but it - skips too many indels for highCov with >250 samples. - - ------------------------------------------------------------------------- -r874 | lh3lh3 | 2010-12-07 22:40:35 -0500 (Tue, 07 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -a spelling error.. - ------------------------------------------------------------------------- -r873 | lh3lh3 | 2010-12-07 22:39:57 -0500 (Tue, 07 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-1 (r873) - * added a switch to allow anomalous read pairs in calling - ------------------------------------------------------------------------- -r872 | lh3lh3 | 2010-12-07 14:43:54 -0500 (Tue, 07 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -fixed a bug in vcf2fq - ------------------------------------------------------------------------- -r869 | lh3lh3 | 2010-12-05 01:18:06 -0500 (Sun, 05 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -added a warning for the Windows version - ------------------------------------------------------------------------- -r868 | lh3lh3 | 2010-12-05 01:05:51 -0500 (Sun, 05 Dec 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -In ksprintf(), change "%lf" and "%lg" to "%f" and "%g", respectively. -According to the manual page, this change is valid. However, MinGW seems -to interpret "%lf" as "%Lf". - ------------------------------------------------------------------------- -r867 | lh3lh3 | 2010-12-05 00:35:43 -0500 (Sun, 05 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bam_aux.c - -bring back the windows support - ------------------------------------------------------------------------- -r866 | lh3lh3 | 2010-12-04 23:33:51 -0500 (Sat, 04 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_reheader.c - M /trunk/samtools/bcftools/vcfutils.pl - -Fixed a compiling error when knetfile is not used. - ------------------------------------------------------------------------- -r865 | lh3lh3 | 2010-12-04 00:13:22 -0500 (Sat, 04 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -vcf->fastq - ------------------------------------------------------------------------- -r864 | lh3lh3 | 2010-12-03 17:12:30 -0500 (Fri, 03 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - - * remove "-f". Instead always compute consensus quality - * increase the upper limit of quality - ------------------------------------------------------------------------- -r863 | lh3lh3 | 2010-12-03 15:28:15 -0500 (Fri, 03 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -more informative error message - ------------------------------------------------------------------------- -r862 | lh3lh3 | 2010-12-02 16:16:08 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.12a - ------------------------------------------------------------------------- -r861 | lh3lh3 | 2010-12-02 15:55:06 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -a possible fix to DP4=0,0,0,0; have not tested, but should have no side-effect - ------------------------------------------------------------------------- -r859 | lh3lh3 | 2010-12-02 11:39:57 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.12 - ------------------------------------------------------------------------- -r858 | lh3lh3 | 2010-12-02 11:24:41 -0500 (Thu, 02 Dec 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - - * samtools-0.1.11-1 (r858) - * fixed a bug in mpileup which causes segfaults - * bcftools: do not segfault when BCF contains errors - ------------------------------------------------------------------------- -r857 | lh3lh3 | 2010-11-30 23:52:50 -0500 (Tue, 30 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a memory leak in bam_fetch() - ------------------------------------------------------------------------- -r856 | lh3lh3 | 2010-11-26 00:07:31 -0500 (Fri, 26 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bcftools/vcfutils.pl - - * fixed a memory violation - * added splitchr to vcfutils.pl - ------------------------------------------------------------------------- -r854 | lh3lh3 | 2010-11-23 09:05:08 -0500 (Tue, 23 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/ld.c - -fixed a typo/bug in r^2 computation - ------------------------------------------------------------------------- -r852 | lh3lh3 | 2010-11-21 22:20:20 -0500 (Sun, 21 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -forget to change the version information - ------------------------------------------------------------------------- -r851 | lh3lh3 | 2010-11-21 22:16:52 -0500 (Sun, 21 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bcftools/bcftools.1 - M /trunk/samtools/samtools.1 - -Release samtools-0.1.11 - ------------------------------------------------------------------------- -r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - - * samtools-0.1.10-9 (r844) - * added the "folded" or reference-free mode for variant calling - ------------------------------------------------------------------------- -r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_sort.c - -In merging, if -R is specified, do not abort if the sequence dictionary is different. - ------------------------------------------------------------------------- -r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -When merging BAM headers, compare the list of target reference sequences -strictly (and fail/abort if there is a mismatch), but allow one list to be a -prefix of the other. (i.e., check that the lists are identical up until the -shorter runs out, and add the excess targets from the longer to the output.) - ------------------------------------------------------------------------- -r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10 (r841) - * fixed a bug in pileup when the first CIGAR operation is D - * fixed a bug in view with range query - ------------------------------------------------------------------------- -r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-4 (r840) - - * drop the MNP caller. It is slow while does not diliver too much - benefit. Possibly I will work on it in future given more time. - - * there is a segfault in pileup - - * someone has reported segfault from view/index/sort - - ------------------------------------------------------------------------- -r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines -Changed paths: - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-6 (r839) - - * call MNPs without realignment because it seems to me that it is not - worthwhile to significantly slow down SNP calling. - - * the result looks quite different from the previous version. I have - work to do... - - ------------------------------------------------------------------------- -r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -Apply a patch by Rob Davis, which improves fault detection. - ------------------------------------------------------------------------- -r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-r836 - * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow) - ------------------------------------------------------------------------- -r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - - * modify the filtering rule: also filter SNPs around filtered indels - * added MNP filter - ------------------------------------------------------------------------- -r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-4 (r834) - * fixed a silly bug in printing MNP - * restrict to at most 1 alternative allele - ------------------------------------------------------------------------- -r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - -fixed a bug in printing MNPs - ------------------------------------------------------------------------- -r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - -minor change to how seqQ is applied - ------------------------------------------------------------------------- -r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10 (r831) - * initial MNP caller - ------------------------------------------------------------------------- -r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.10 (r829) - ------------------------------------------------------------------------- -r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -update version information: samtools-0.1.9-20 (r828) - ------------------------------------------------------------------------- -r827 | lh3lh3 | 2010-11-16 15:32:50 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -bcftools: allow to skip indels - ------------------------------------------------------------------------- -r826 | lh3lh3 | 2010-11-16 14:11:58 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -remove ZQ if both BQ and ZQ are present - ------------------------------------------------------------------------- -r825 | lh3lh3 | 2010-11-16 13:51:33 -0500 (Tue, 16 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.9-18 (r825) - * change the behaviour of calmd such that by default it does not change the base quality - ------------------------------------------------------------------------- -r824 | lh3lh3 | 2010-11-15 23:31:53 -0500 (Mon, 15 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.9-17 (r824) - * added command line options to change the default parameters in indel calling - * update the manual - ------------------------------------------------------------------------- -r823 | lh3lh3 | 2010-11-15 12:20:13 -0500 (Mon, 15 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-r823 - * the BQ tag is now 64 shifted, not 33 shifted - ------------------------------------------------------------------------- -r822 | lh3lh3 | 2010-11-15 00:30:18 -0500 (Mon, 15 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.9-16 (r822) - * keep the raw depth because in indel calling, DP4 may be way off the true depth - ------------------------------------------------------------------------- -r821 | lh3lh3 | 2010-11-13 01:18:31 -0500 (Sat, 13 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-15 (r821) - * calmd: write BQ - * skip realignment if BQ is present - ------------------------------------------------------------------------- -r820 | lh3lh3 | 2010-11-13 01:08:26 -0500 (Sat, 13 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-14 (r820) - * penalize reads with excessive differences in indel calling - ------------------------------------------------------------------------- -r819 | lh3lh3 | 2010-11-12 21:36:27 -0500 (Fri, 12 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-13 (r819) - * fixed a bug in pileup given refskip - ------------------------------------------------------------------------- -r818 | lh3lh3 | 2010-11-12 13:04:53 -0500 (Fri, 12 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-r818 - * for indel calling, do two rounds of probabilistic realignments - ------------------------------------------------------------------------- -r817 | lh3lh3 | 2010-11-11 20:04:07 -0500 (Thu, 11 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.19-11 (r817) - * only initiate indel calling when 0.2% of reads contain a gap - ------------------------------------------------------------------------- -r816 | lh3lh3 | 2010-11-11 01:22:59 -0500 (Thu, 11 Nov 2010) | 7 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-10 (r816) - - * I know why the forward method fails. it is because of zero base - qualities. when that is fixed, the forward method seems to give - better results than Viterbi, as it should be. I am tired... - - ------------------------------------------------------------------------- -r815 | lh3lh3 | 2010-11-11 00:57:15 -0500 (Thu, 11 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam2bcf_indel.c - -effectively revert to the viterbi version. The forward realignment gives too many false positives. - ------------------------------------------------------------------------- -r814 | lh3lh3 | 2010-11-11 00:18:02 -0500 (Thu, 11 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-9 (r810) - * use forward, instead of viterbi, for realignment - * realignment is now quality aware - ------------------------------------------------------------------------- -r813 | lh3lh3 | 2010-11-10 22:45:24 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/kprobaln.c - M /trunk/samtools/kprobaln.h - - * prepare to replace kaln with kprobaln in realignment - ------------------------------------------------------------------------- -r812 | lh3lh3 | 2010-11-10 17:28:50 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -fixed a typo - ------------------------------------------------------------------------- -r811 | lh3lh3 | 2010-11-10 16:54:46 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - -use zlib for direct reading when BCF_LITE is in use - ------------------------------------------------------------------------- -r810 | lh3lh3 | 2010-11-10 16:32:13 -0500 (Wed, 10 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - - * do not use reads containing too many mismatches for indel calling - * fixed a trivial bug in case of multi-allelic indels - ------------------------------------------------------------------------- -r809 | lh3lh3 | 2010-11-10 13:23:02 -0500 (Wed, 10 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-8 (r809) - * fixed a bug in the indel caller - ------------------------------------------------------------------------- -r808 | lh3lh3 | 2010-11-10 12:24:10 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -minor change to makefile - ------------------------------------------------------------------------- -r807 | lh3lh3 | 2010-11-10 12:10:21 -0500 (Wed, 10 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.9-8 (r807) - * collect indel candidates only from specified platforms (@RG-PL) - * merge varFilter and filter4vcf in vcfutils.pl - ------------------------------------------------------------------------- -r806 | lh3lh3 | 2010-11-09 22:05:46 -0500 (Tue, 09 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -bcftools: compute equal-tail (Bayesian) credible interval - ------------------------------------------------------------------------- -r805 | lh3lh3 | 2010-11-09 16:28:39 -0500 (Tue, 09 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -added a double-hit filter to avoid overestimated indel likelihood - ------------------------------------------------------------------------- -r804 | lh3lh3 | 2010-11-09 14:12:06 -0500 (Tue, 09 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-7 (r804) - * fixed a bug in the gap caller - ------------------------------------------------------------------------- -r803 | lh3lh3 | 2010-11-09 10:45:33 -0500 (Tue, 09 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/prob1.c - - * samtools-0.1.9-6 (r803) - * mpileup: apply homopolymer correction when calculating GL, instead of before - * bcftools: apply a different prior to indels - ------------------------------------------------------------------------- -r802 | lh3lh3 | 2010-11-08 23:53:15 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-5 (r802) - * relax tandem penalty. this will be made a command-line option in future. - ------------------------------------------------------------------------- -r801 | lh3lh3 | 2010-11-08 23:35:52 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-4 (r801) - * fixed a minor issue in printing indel VCF - ------------------------------------------------------------------------- -r800 | lh3lh3 | 2010-11-08 15:28:14 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bcftools/vcfutils.pl - -fixed another silly bug in mpileup's indel caller - ------------------------------------------------------------------------- -r799 | lh3lh3 | 2010-11-08 14:28:27 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed a silly bug in the indel caller - ------------------------------------------------------------------------- -r798 | lh3lh3 | 2010-11-08 14:07:33 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/sam_view.c - M /trunk/samtools/samtools.1 - -Incorporate patches by Marcel Martin for read counting. - ------------------------------------------------------------------------- -r797 | lh3lh3 | 2010-11-08 13:39:52 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-2 (r797) - * mpileup: indel calling seems to be working - ------------------------------------------------------------------------- -r796 | lh3lh3 | 2010-11-08 10:54:46 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - -indel calling is apparently working, but more information needs to be collected - ------------------------------------------------------------------------- -r795 | lh3lh3 | 2010-11-08 00:39:18 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - -fixed a few bugs in the indel caller. Probably there are more. - ------------------------------------------------------------------------- -r794 | lh3lh3 | 2010-11-07 22:23:16 -0500 (Sun, 07 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - A /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -prepare for the indel caller. It is not ready yet. - ------------------------------------------------------------------------- -r793 | lh3lh3 | 2010-11-05 11:28:23 -0400 (Fri, 05 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - -Revert to r790. The recent changes are not good... - ------------------------------------------------------------------------- -r792 | lh3lh3 | 2010-11-05 00:19:14 -0400 (Fri, 05 Nov 2010) | 6 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - - * this revision is UNSTABLE - - * indel caller seems working, but it is very insensitive and has - several things I do not quite understand. - - ------------------------------------------------------------------------- -r791 | lh3lh3 | 2010-11-04 22:58:43 -0400 (Thu, 04 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - -for backup. no effective changes - ------------------------------------------------------------------------- -r790 | lh3lh3 | 2010-11-03 15:51:24 -0400 (Wed, 03 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/kprobaln.c - -fixed a minor problem in the example coming with kprobaln.c - ------------------------------------------------------------------------- -r789 | lh3lh3 | 2010-11-02 15:41:27 -0400 (Tue, 02 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_md.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - A /trunk/samtools/kprobaln.c - A /trunk/samtools/kprobaln.h - -Separate kaln and kprobaln as I am preparing further changes. At -present, the results should be identical to the previous. - - ------------------------------------------------------------------------- -r788 | petulda | 2010-11-02 12:19:04 -0400 (Tue, 02 Nov 2010) | 1 line -Changed paths: - M /trunk/samtools/bam_plcmd.c - -Added -b option: read file names from a file ------------------------------------------------------------------------- -r787 | lh3lh3 | 2010-10-29 23:17:22 -0400 (Fri, 29 Oct 2010) | 7 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-2 (r787) - - * Allow to set a maximum per-sample depth to reduce memory. However, - BAQ computation is still applied to every read. The speed is not - improved. - - ------------------------------------------------------------------------- -r786 | lh3lh3 | 2010-10-29 12:10:40 -0400 (Fri, 29 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * samtools-0.1.9-1 (r786) - * samtools: optionally perform exact test for each sample - ------------------------------------------------------------------------- -r785 | lh3lh3 | 2010-10-29 09:42:25 -0400 (Fri, 29 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - -Optionally output "DP", the individual read depth - ------------------------------------------------------------------------- -r784 | lh3lh3 | 2010-10-27 23:10:27 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -acknowledge Petr and John who have greatly contributed to the project. - ------------------------------------------------------------------------- -r783 | lh3lh3 | 2010-10-27 22:47:47 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.9 (r783) - ------------------------------------------------------------------------- -r782 | lh3lh3 | 2010-10-27 19:58:54 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -fixed a silly bug in pileup - ------------------------------------------------------------------------- -r781 | lh3lh3 | 2010-10-27 14:39:48 -0400 (Wed, 27 Oct 2010) | 5 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.8-22 (r781) - * made BAQ the default behavior of mpileup - * updated manual - * in merge, force to exit given inconsistent header when "-R" is not in use. - ------------------------------------------------------------------------- -r780 | lh3lh3 | 2010-10-27 11:01:11 -0400 (Wed, 27 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-21 (r780) - * minor speedup to pileup - ------------------------------------------------------------------------- -r779 | lh3lh3 | 2010-10-27 09:58:56 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/examples/toy.sam - -improve pileup a little bit - ------------------------------------------------------------------------- -r778 | lh3lh3 | 2010-10-27 00:14:43 -0400 (Wed, 27 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-20 (r778) - * speed up pileup, although I do not know how much is the improvement - ------------------------------------------------------------------------- -r777 | lh3lh3 | 2010-10-26 17:26:04 -0400 (Tue, 26 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.8-19 (r777) - * integrate mpileup features to pileup: min_baseQ, capQ, prob_realn, paired-only and biased prior - ------------------------------------------------------------------------- -r776 | lh3lh3 | 2010-10-26 15:27:46 -0400 (Tue, 26 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -remove local realignment (probabilistic realignment is still there) - ------------------------------------------------------------------------- -r774 | jmarshall | 2010-10-21 06:52:38 -0400 (Thu, 21 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/sam_view.c - -Add the relevant filename or region to error messages, and cause a failure -exit status where appropriate. Based on a patch provided by Marcel Martin. - ------------------------------------------------------------------------- -r773 | lh3lh3 | 2010-10-19 19:44:31 -0400 (Tue, 19 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/examples/toy.sam - M /trunk/samtools/kaln.c - - * Minor code changes. No real effect. - * change quality to 30 in toy.sam - ------------------------------------------------------------------------- -r772 | lh3lh3 | 2010-10-18 23:40:13 -0400 (Mon, 18 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/toy.fa - M /trunk/samtools/examples/toy.sam - -added another toy example - ------------------------------------------------------------------------- -r771 | lh3lh3 | 2010-10-13 23:32:12 -0400 (Wed, 13 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/ld.c - M /trunk/samtools/bcftools/vcfutils.pl - -improve the LD statistics - ------------------------------------------------------------------------- -r770 | lh3lh3 | 2010-10-12 23:49:26 -0400 (Tue, 12 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * a minor fix to the -L option - * add ldstats to vcfutils.pl - ------------------------------------------------------------------------- -r769 | lh3lh3 | 2010-10-12 15:51:57 -0400 (Tue, 12 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -a minor change - ------------------------------------------------------------------------- -r768 | lh3lh3 | 2010-10-12 15:49:06 -0400 (Tue, 12 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/ld.c - -forget to add the key file - ------------------------------------------------------------------------- -r767 | lh3lh3 | 2010-10-12 15:48:46 -0400 (Tue, 12 Oct 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * vcfutils.pl: fixed a typo in help message - * added APIs: bcf_append_info() and bcf_cpy() - * calculate adjacent LD - ------------------------------------------------------------------------- -r766 | lh3lh3 | 2010-10-11 11:06:40 -0400 (Mon, 11 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -added filter for samtools/bcftools genetated VCFs - ------------------------------------------------------------------------- -r765 | lh3lh3 | 2010-10-05 14:05:18 -0400 (Tue, 05 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/kaln.c - - * removed a comment line in kaln.c - * vcfutils.pl fillac works when GT is not the first field - ------------------------------------------------------------------------- -r764 | petulda | 2010-10-05 08:59:36 -0400 (Tue, 05 Oct 2010) | 1 line -Changed paths: - A /trunk/samtools/bcftools/bcf-fix.pl - -Convert VCF output of "bcftools view -bgcv" to a valid VCF file ------------------------------------------------------------------------- -r763 | lh3lh3 | 2010-10-02 22:51:03 -0400 (Sat, 02 Oct 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/bcftools/bcftools.1 - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.8-18 (r763) - * added bcftools manual page - * minor fix to mpileup and view command lines - ------------------------------------------------------------------------- -r762 | lh3lh3 | 2010-10-02 21:46:25 -0400 (Sat, 02 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * vcfutils.pl qstats: calculate marginal ts/tv - * allow to call genotypes at variant sites - ------------------------------------------------------------------------- -r761 | lh3lh3 | 2010-10-01 00:29:55 -0400 (Fri, 01 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/misc/HmmGlocal.java - -I am changing the gap open probability back to 0.001. It seems that -being conservative here is a good thing... - ------------------------------------------------------------------------- -r760 | lh3lh3 | 2010-10-01 00:11:27 -0400 (Fri, 01 Oct 2010) | 5 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/kaln.c - A /trunk/samtools/misc/HmmGlocal.java - - * samtools-0.1.8-17 (r760) - * the default gap open penalty is too small (a typo) - * added comments on hmm_realn - * Java implementation - ------------------------------------------------------------------------- -r759 | lh3lh3 | 2010-09-30 10:12:54 -0400 (Thu, 30 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -mark samtools-0.1.8-16 (r759) - ------------------------------------------------------------------------- -r758 | lh3lh3 | 2010-09-30 10:12:02 -0400 (Thu, 30 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -round to the nearest integer - ------------------------------------------------------------------------- -r757 | lh3lh3 | 2010-09-28 17:16:43 -0400 (Tue, 28 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/kaln.c - -I was trying to accelerate ka_prob_glocal() as this will be the -bottleneck. After an hour, the only gain is to change division to -multiplication. OK. I will stop. - ------------------------------------------------------------------------- -r756 | lh3lh3 | 2010-09-28 16:57:49 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -this is interesting. multiplication is much faster than division, at least on my Mac - ------------------------------------------------------------------------- -r755 | lh3lh3 | 2010-09-28 16:19:13 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -minor changes - ------------------------------------------------------------------------- -r754 | lh3lh3 | 2010-09-28 15:44:16 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - -prob_realn() seems working! - ------------------------------------------------------------------------- -r753 | lh3lh3 | 2010-09-28 12:48:23 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -minor - ------------------------------------------------------------------------- -r752 | lh3lh3 | 2010-09-28 12:47:41 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -Convert phredQ to probabilities - ------------------------------------------------------------------------- -r751 | lh3lh3 | 2010-09-28 12:32:08 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -Implement the glocal HMM; discard the extention HMM - ------------------------------------------------------------------------- -r750 | lh3lh3 | 2010-09-28 00:06:11 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -improve numerical stability - ------------------------------------------------------------------------- -r749 | lh3lh3 | 2010-09-27 23:27:54 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -more comments - ------------------------------------------------------------------------- -r748 | lh3lh3 | 2010-09-27 23:17:16 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -fixed a bug in banded DP - ------------------------------------------------------------------------- -r747 | lh3lh3 | 2010-09-27 23:05:12 -0400 (Mon, 27 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/kaln.c - - * fixed that weird issue. - * the banded version is NOT working - ------------------------------------------------------------------------- -r746 | lh3lh3 | 2010-09-27 22:57:05 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -More comments. This version seems working, but something is a little weird... - ------------------------------------------------------------------------- -r745 | lh3lh3 | 2010-09-27 17:21:40 -0400 (Mon, 27 Sep 2010) | 6 lines -Changed paths: - M /trunk/samtools/kaln.c - -A little code cleanup. Now the forward and backback algorithms give -nearly identical P(x), which means both are close to the correct -forms. However, I have only tested on toy examples. Minor errors in -the implementation may not be obvious. - - ------------------------------------------------------------------------- -r744 | lh3lh3 | 2010-09-27 16:55:15 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -... - ------------------------------------------------------------------------- -r743 | jmarshall | 2010-09-27 08:19:06 -0400 (Mon, 27 Sep 2010) | 6 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -Abort if merge -h's INH.SAM cannot be opened, just as we abort -if any of the IN#.BAM input files cannot be opened. - -Also propagate any error indication returned by bam_merge_core() -to samtools merge's exit status. - ------------------------------------------------------------------------- -r741 | jmarshall | 2010-09-24 11:08:24 -0400 (Fri, 24 Sep 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_index.c - -Use bam_validate1() to detect garbage records in the event of a corrupt -BAI index file that causes a bam_seek() to an invalid position. At most -one record (namely, the bam_iter_read terminator) is tested per bam_fetch() -call, so the cost is insignificant in the normal case. - ------------------------------------------------------------------------- -r740 | jmarshall | 2010-09-24 11:00:19 -0400 (Fri, 24 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - -Add bam_validate1(). - ------------------------------------------------------------------------- -r739 | lh3lh3 | 2010-09-22 12:07:50 -0400 (Wed, 22 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-15 (r379) - * allow to change capQ parameter in calmd - ------------------------------------------------------------------------- -r738 | jmarshall | 2010-09-22 11:15:33 -0400 (Wed, 22 Sep 2010) | 13 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/sam_view.c - -When bam_read1() returns an error (return value <= -2), propagate that error -to bam_iter_read()'s own return value. Similarly, also propagate it up to -bam_fetch()'s return value. Previously bam_fetch() always returned 0, and -callers ignored its return value anyway. With this change, 0 continues to -indicate success, while <= -2 (which can be written as < 0, as -1 is never -returned) indicates corrupted input. - -bam_iter_read() ought also to propagate errors returned by bam_seek(). - -main_samview() can now print an error message and fail when bam_fetch() -detects that a .bai index file is corrupted or otherwise does not correspond -to the .bam file it is being used with. - ------------------------------------------------------------------------- -r737 | jmarshall | 2010-09-22 10:47:42 -0400 (Wed, 22 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - -0 is a successful return value from bam_read1(). (In practice, it never -returns 0 anyway; but all the other callers treat 0 as successful.) - ------------------------------------------------------------------------- -r736 | lh3lh3 | 2010-09-20 17:43:08 -0400 (Mon, 20 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_sort.c - - * merge files region-by-region. work on small examples but more tests are needed. - ------------------------------------------------------------------------- -r735 | lh3lh3 | 2010-09-20 16:56:24 -0400 (Mon, 20 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -improve qstats by checking the alleles as well - ------------------------------------------------------------------------- -r734 | lh3lh3 | 2010-09-17 18:12:13 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -convert UCSC SNP SQL dump to VCF - ------------------------------------------------------------------------- -r733 | lh3lh3 | 2010-09-17 13:02:11 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -hapmap2vcf convertor - ------------------------------------------------------------------------- -r732 | lh3lh3 | 2010-09-17 10:11:37 -0400 (Fri, 17 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/vcf.c - - * added comments - * VCF->BCF is not possible without knowing the sequence dictionary before hand... - ------------------------------------------------------------------------- -r731 | lh3lh3 | 2010-09-17 09:15:53 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - - * put n_smpl to "bcf1_t" to simplify API a little - ------------------------------------------------------------------------- -r730 | lh3lh3 | 2010-09-16 21:36:01 -0400 (Thu, 16 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/index.c - -fixed a bug in indexing - ------------------------------------------------------------------------- -r729 | lh3lh3 | 2010-09-16 16:54:48 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - - * fixed a bug in capQ - * valgrind identifies a use of uninitialised value, but I have not fixed it. - ------------------------------------------------------------------------- -r728 | lh3lh3 | 2010-09-16 15:03:59 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bgzip.c - M /trunk/samtools/razip.c - - * fixed a bug in razip: -c will delete the input file - * copy tabix/bgzip to here - ------------------------------------------------------------------------- -r727 | lh3lh3 | 2010-09-16 13:45:49 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-14 (r727) - * allow to change the capQ parameter at the command line - ------------------------------------------------------------------------- -r726 | lh3lh3 | 2010-09-16 13:38:43 -0400 (Thu, 16 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/misc/samtools.pl - - * added varFilter to vcfutils.pl - * reimplement realn(). now it performs a local alignment - * added cap_mapQ() to cap mapping quality when there are many substitutions - ------------------------------------------------------------------------- -r724 | lh3lh3 | 2010-09-15 00:18:31 -0400 (Wed, 15 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/bcf2qcall.c - M /trunk/samtools/bcftools/call1.c - - * convert BCF to QCALL input - ------------------------------------------------------------------------- -r723 | lh3lh3 | 2010-09-14 22:41:50 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -dynamic band width in realignment - ------------------------------------------------------------------------- -r722 | lh3lh3 | 2010-09-14 22:05:32 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - -fixed a bug in realignment - ------------------------------------------------------------------------- -r721 | lh3lh3 | 2010-09-14 20:54:09 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - -fixed a minor issue - ------------------------------------------------------------------------- -r720 | lh3lh3 | 2010-09-14 19:25:10 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - -fixed a bug in realignment - ------------------------------------------------------------------------- -r719 | lh3lh3 | 2010-09-14 19:18:24 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -minor changes. It is BUGGY now! - ------------------------------------------------------------------------- -r718 | lh3lh3 | 2010-09-14 16:32:33 -0400 (Tue, 14 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - - * aggressive gapped aligner is implemented in calmd. - * distinguish gap_open and gap_end_open in banded alignment - * make tview accepts alignment with heading and tailing D - ------------------------------------------------------------------------- -r717 | jmarshall | 2010-09-14 09:04:28 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools - -Add svn:ignore properties for generated files that don't appear in "make all". - ------------------------------------------------------------------------- -r716 | jmarshall | 2010-09-13 08:37:53 -0400 (Mon, 13 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools - M /trunk/samtools/bcftools - M /trunk/samtools/misc - -Add svn:ignore properties listing the generated files. -(Except for *.o, which we'll assume is in global-ignores.) - ------------------------------------------------------------------------- -r715 | lh3lh3 | 2010-09-08 12:53:55 -0400 (Wed, 08 Sep 2010) | 5 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/sample.c - M /trunk/samtools/sample.h - - * samtools-0.1.8-13 (r715) - * fixed a bug in identifying SM across files - * bcftools: estimate heterozygosity - * bcftools: allow to skip sites without reference bases - ------------------------------------------------------------------------- -r713 | lh3lh3 | 2010-09-03 17:19:12 -0400 (Fri, 03 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -quite a lot changes to the contrast caller, but I still feel something is missing... - ------------------------------------------------------------------------- -r711 | lh3lh3 | 2010-09-03 00:30:48 -0400 (Fri, 03 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * changed 3.434 to 4.343 (typo!) - * fixed a bug in the contrast caller - * calculate heterozygosity - ------------------------------------------------------------------------- -r710 | lh3lh3 | 2010-09-01 23:24:47 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -SNP calling from the GL field - ------------------------------------------------------------------------- -r709 | lh3lh3 | 2010-09-01 18:52:30 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - -fixed another problem - ------------------------------------------------------------------------- -r708 | lh3lh3 | 2010-09-01 18:31:17 -0400 (Wed, 01 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * fixed bugs in parsing VCF - * parser now works with GT/GQ/DP/PL/GL - ------------------------------------------------------------------------- -r707 | lh3lh3 | 2010-09-01 15:28:29 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/prob1.c - -Do not compile _BCF_QUAD by default - ------------------------------------------------------------------------- -r706 | lh3lh3 | 2010-09-01 15:21:41 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -Write the correct ALT and PL in the SNP calling mode. - ------------------------------------------------------------------------- -r705 | lh3lh3 | 2010-09-01 12:50:33 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -more commands for my own uses - ------------------------------------------------------------------------- -r704 | lh3lh3 | 2010-09-01 09:26:10 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/vcfutils.pl - -Utilities for processing VCF - ------------------------------------------------------------------------- -r703 | lh3lh3 | 2010-08-31 16:44:57 -0400 (Tue, 31 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -preliminary contrast variant caller - ------------------------------------------------------------------------- -r702 | lh3lh3 | 2010-08-31 12:28:39 -0400 (Tue, 31 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -z' and z'' can be calculated - ------------------------------------------------------------------------- -r701 | lh3lh3 | 2010-08-31 10:20:57 -0400 (Tue, 31 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/call1.c (from /trunk/samtools/bcftools/vcfout.c:699) - M /trunk/samtools/bcftools/prob1.c - D /trunk/samtools/bcftools/vcfout.c - - * rename vcfout.c as call1.c - * prepare to add two-sample comparison - ------------------------------------------------------------------------- -r699 | lh3lh3 | 2010-08-24 15:28:16 -0400 (Tue, 24 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -fixed a bug in calculating the t statistics - ------------------------------------------------------------------------- -r698 | lh3lh3 | 2010-08-24 14:05:50 -0400 (Tue, 24 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/kfunc.c - M /trunk/samtools/bcftools/vcfout.c - - * samtools-0.1.8-13 (r698) - * perform one-tailed t-test for baseQ, mapQ and endDist - ------------------------------------------------------------------------- -r697 | lh3lh3 | 2010-08-24 12:30:13 -0400 (Tue, 24 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/kfunc.c - -added regularized incomplete beta function - ------------------------------------------------------------------------- -r695 | lh3lh3 | 2010-08-23 17:36:17 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - -change the default correlation coefficient - ------------------------------------------------------------------------- -r694 | lh3lh3 | 2010-08-23 14:46:52 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcfout.c - -print QUAL as floating numbers - ------------------------------------------------------------------------- -r693 | lh3lh3 | 2010-08-23 14:06:07 -0400 (Mon, 23 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - A /trunk/samtools/sample.c - A /trunk/samtools/sample.h - - * samtools-0.1.8-12 (r692) - * group data by samples in "mpileup -g" - ------------------------------------------------------------------------- -r692 | lh3lh3 | 2010-08-23 10:58:53 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - D /trunk/samtools/bam_mcns.c - D /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -remove VCF output in mpileup - ------------------------------------------------------------------------- -r691 | lh3lh3 | 2010-08-23 10:48:20 -0400 (Mon, 23 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - - * use the revised MAQ error model for mpileup - * prepare to remove the independent model from mpileup - ------------------------------------------------------------------------- -r690 | lh3lh3 | 2010-08-20 15:46:40 -0400 (Fri, 20 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - A /trunk/samtools/errmod.c - A /trunk/samtools/errmod.h - M /trunk/samtools/ksort.h - -added revised MAQ error model - ------------------------------------------------------------------------- -r689 | lh3lh3 | 2010-08-18 09:55:20 -0400 (Wed, 18 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - M /trunk/samtools/bcftools/vcfout.c - -allow to read the prior from the error output. EM iteration is working. - ------------------------------------------------------------------------- -r688 | lh3lh3 | 2010-08-17 12:12:20 -0400 (Tue, 17 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/main.c - M /trunk/samtools/bcftools/vcf.c - - * write a little more VCF header - * concatenate BCFs - ------------------------------------------------------------------------- -r687 | lh3lh3 | 2010-08-16 20:53:16 -0400 (Mon, 16 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcf.tex - -use float for QUAL - ------------------------------------------------------------------------- -r686 | lh3lh3 | 2010-08-14 00:11:13 -0400 (Sat, 14 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/prob1.c - -faster for large sample size (in principle) - ------------------------------------------------------------------------- -r685 | lh3lh3 | 2010-08-13 23:28:31 -0400 (Fri, 13 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - - * a numerically stable method to calculate z_{jk} - * currently slower than the old method but will be important for large sample size - * in principle, we can speed up for large n, but have not tried - ------------------------------------------------------------------------- -r684 | lh3lh3 | 2010-08-11 21:58:31 -0400 (Wed, 11 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -fixed an issue in parsing integer - ------------------------------------------------------------------------- -r683 | lh3lh3 | 2010-08-09 13:05:07 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -do not print refname if file is converted from VCF - ------------------------------------------------------------------------- -r682 | lh3lh3 | 2010-08-09 12:59:47 -0400 (Mon, 09 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - - * parse PL - * fixed a bug in parsing VCF - ------------------------------------------------------------------------- -r681 | lh3lh3 | 2010-08-09 12:49:23 -0400 (Mon, 09 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/main.c - M /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/bcftools/vcfout.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/kstring.c - - * fixed a bug in kstrtok@kstring.c - * preliminary VCF parser (not parse everything for now) - * improved view interface - ------------------------------------------------------------------------- -r680 | lh3lh3 | 2010-08-09 10:43:13 -0400 (Mon, 09 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/vcfout.c - M /trunk/samtools/kstring.c - M /trunk/samtools/kstring.h - - * improved kstring (added kstrtok) - * removed the limit on the format string length in bcftools - * use kstrtok to parse format which fixed a bug in the old code - ------------------------------------------------------------------------- -r679 | lh3lh3 | 2010-08-09 01:12:05 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/README - M /trunk/samtools/bcftools/vcfout.c - -help messages - ------------------------------------------------------------------------- -r678 | lh3lh3 | 2010-08-09 00:01:52 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -perform single-tail test for ED4 - ------------------------------------------------------------------------- -r677 | lh3lh3 | 2010-08-08 23:48:35 -0400 (Sun, 08 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/kfunc.c - M /trunk/samtools/bcftools/vcfout.c - - * test depth, end distance and HWE - ------------------------------------------------------------------------- -r676 | lh3lh3 | 2010-08-08 02:04:15 -0400 (Sun, 08 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/kfunc.c - -reimplement incomplete gamma functions. no copy-paste - ------------------------------------------------------------------------- -r675 | lh3lh3 | 2010-08-06 22:42:54 -0400 (Fri, 06 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bcftools/fet.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - M /trunk/samtools/bcftools/vcfout.c - - * bcftools: add HWE (no testing for now) - * record end dist in a 2x2 table, not avg, std any more - ------------------------------------------------------------------------- -r674 | lh3lh3 | 2010-08-06 17:30:16 -0400 (Fri, 06 Aug 2010) | 3 lines -Changed paths: - A /trunk/samtools/bcftools/kfunc.c - - * Special functions: log(gamma()), erfc(), P(a,x) (incomplete gamma) - * Not using Numerical Recipe due to licensing issues - ------------------------------------------------------------------------- -r673 | lh3lh3 | 2010-08-05 23:46:53 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/fet.c - -Fisher's exact test - ------------------------------------------------------------------------- -r672 | lh3lh3 | 2010-08-05 21:48:33 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.8-11 (r672) - * collect more stats for allele balance test in bcftools (not yet) - ------------------------------------------------------------------------- -r671 | lh3lh3 | 2010-08-05 16:17:58 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/main.c - - * the code base is stablized again. - * I will delay the vcf parser, which is quite complicated but with little value for now - ------------------------------------------------------------------------- -r670 | lh3lh3 | 2010-08-05 16:03:23 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/Makefile - -minor - ------------------------------------------------------------------------- -r669 | lh3lh3 | 2010-08-05 16:03:08 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - -unfinished vcf parser - ------------------------------------------------------------------------- -r668 | lh3lh3 | 2010-08-05 15:46:40 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/index.c - M /trunk/samtools/bcftools/main.c - A /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/bcftools/vcfout.c - - * added prelimiary VCF parser (not finished) - * change struct a bit - ------------------------------------------------------------------------- -r667 | lh3lh3 | 2010-08-03 22:35:27 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - - * allow to set min base q - * fixed a bug in mpileup -u - ------------------------------------------------------------------------- -r666 | lh3lh3 | 2010-08-03 22:08:44 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/bcf.tex - -spec - ------------------------------------------------------------------------- -r665 | lh3lh3 | 2010-08-03 21:18:57 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/Makefile - -added more examples - ------------------------------------------------------------------------- -r664 | lh3lh3 | 2010-08-03 21:13:00 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bcftools/Makefile - -fixed compilation error - ------------------------------------------------------------------------- -r662 | lh3lh3 | 2010-08-03 21:04:00 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - D /trunk/samtools/bcf.c - D /trunk/samtools/bcf.h - A /trunk/samtools/bcftools - A /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/bcf.c - A /trunk/samtools/bcftools/bcf.h - A /trunk/samtools/bcftools/bcfutils.c - A /trunk/samtools/bcftools/index.c - A /trunk/samtools/bcftools/main.c - A /trunk/samtools/bcftools/prob1.c - A /trunk/samtools/bcftools/prob1.h - A /trunk/samtools/bcftools/vcfout.c - -move bcftools to samtools - ------------------------------------------------------------------------- -r660 | lh3lh3 | 2010-08-03 15:58:32 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed another minor bug - ------------------------------------------------------------------------- -r658 | lh3lh3 | 2010-08-03 15:06:45 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcf.c - - * samtools-0.1.8-10 (r658) - * fixed a bug in bam2bcf when the reference is N - ------------------------------------------------------------------------- -r657 | lh3lh3 | 2010-08-03 14:50:23 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - - * fixed a bug - * treat ambiguous ref base as the fifth base - ------------------------------------------------------------------------- -r654 | lh3lh3 | 2010-08-02 17:38:27 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/bcftools/bcf.c - M /trunk/samtools/bcf.c - -missing a column in VCF output... - ------------------------------------------------------------------------- -r653 | lh3lh3 | 2010-08-02 17:31:33 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcf.c - -fixed a memory leak - ------------------------------------------------------------------------- -r651 | lh3lh3 | 2010-08-02 17:27:31 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcf.c - -fixed a bug in bcf reader - ------------------------------------------------------------------------- -r650 | lh3lh3 | 2010-08-02 17:00:41 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed a bug - ------------------------------------------------------------------------- -r649 | lh3lh3 | 2010-08-02 16:49:35 -0400 (Mon, 02 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-9 (r649) - * lossless representation of PL in BCF output - ------------------------------------------------------------------------- -r648 | lh3lh3 | 2010-08-02 16:07:25 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam2bcf.c - A /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - A /trunk/samtools/bcf.c - A /trunk/samtools/bcf.h - -Generate binary VCF - ------------------------------------------------------------------------- -r644 | lh3lh3 | 2010-07-28 11:59:19 -0400 (Wed, 28 Jul 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-8 (r644) - * mpileup becomes a little stable again - * the method is slightly different, but is more theoretically correct - * snp calling is O(n^2) instead of O(n^3) - ------------------------------------------------------------------------- -r643 | lh3lh3 | 2010-07-28 11:54:15 -0400 (Wed, 28 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - - * fixed a STUPID bug, which cost me a lot of time. - * I am going to clean up mcns a little bit - ------------------------------------------------------------------------- -r642 | lh3lh3 | 2010-07-27 23:23:07 -0400 (Tue, 27 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -supposedly this is THE correct implementation, but more testing is needed - ------------------------------------------------------------------------- -r641 | lh3lh3 | 2010-07-27 22:43:39 -0400 (Tue, 27 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - -NOT ready yet. Going to make further changes... - ------------------------------------------------------------------------- -r639 | lh3lh3 | 2010-07-25 22:18:38 -0400 (Sun, 25 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-7 (r639) - * fixed the reference allele assignment - ------------------------------------------------------------------------- -r638 | lh3lh3 | 2010-07-25 12:01:26 -0400 (Sun, 25 Jul 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-6 (r638) - * skip isnan/isinf in case of float underflow - * added the flat prior - * fixed an issue where there are no reads supporting the reference - ------------------------------------------------------------------------- -r637 | lh3lh3 | 2010-07-24 14:16:27 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -minor changes - ------------------------------------------------------------------------- -r636 | lh3lh3 | 2010-07-24 14:07:27 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -minor tweaks - ------------------------------------------------------------------------- -r635 | lh3lh3 | 2010-07-24 01:49:49 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -posterior expectation FINALLY working. I am so tired... - ------------------------------------------------------------------------- -r633 | lh3lh3 | 2010-07-23 13:50:48 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -another minor fix to mpileup - ------------------------------------------------------------------------- -r632 | lh3lh3 | 2010-07-23 13:43:31 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -added the format column - ------------------------------------------------------------------------- -r631 | lh3lh3 | 2010-07-23 13:25:44 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -added an alternative prior - ------------------------------------------------------------------------- -r628 | lh3lh3 | 2010-07-23 11:48:51 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -calculate posterior allele frequency - ------------------------------------------------------------------------- -r627 | lh3lh3 | 2010-07-22 21:39:13 -0400 (Thu, 22 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-3 (r627) - * multi-sample snp calling appears to work. More tests needed. - ------------------------------------------------------------------------- -r626 | lh3lh3 | 2010-07-22 16:37:56 -0400 (Thu, 22 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - - * preliminary multisample SNP caller. - * something looks not so right, but it largely works - ------------------------------------------------------------------------- -r617 | lh3lh3 | 2010-07-14 16:26:27 -0400 (Wed, 14 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-2 (r617) - * allele frequency calculation apparently works... - ------------------------------------------------------------------------- -r616 | lh3lh3 | 2010-07-14 13:33:51 -0400 (Wed, 14 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_mcns.c - A /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - - * added mutli-sample framework. It is not working, yet. - * improved the mpileup interface - ------------------------------------------------------------------------- -r615 | lh3lh3 | 2010-07-13 14:50:12 -0400 (Tue, 13 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/Makefile - - * samtools-0.1.8-1 (r615) - * allow to get mpileup at required sites - ------------------------------------------------------------------------- -r613 | lh3lh3 | 2010-07-11 22:40:56 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.8 - ------------------------------------------------------------------------- -r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -fixed a compiling issue for Windows - ------------------------------------------------------------------------- -r611 | lh3lh3 | 2010-07-11 20:59:15 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -fixed a bug in sorting when output to stdout (by Peter Chines) - ------------------------------------------------------------------------- -r610 | lh3lh3 | 2010-07-09 17:05:10 -0400 (Fri, 09 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - -change the command line option of pileup - ------------------------------------------------------------------------- -r609 | lh3lh3 | 2010-07-09 00:39:34 -0400 (Fri, 09 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - A /trunk/samtools/examples/toy.fa - A /trunk/samtools/examples/toy.sam - -make pileup work with CIGAR with I/D at the beginning or in the end - ------------------------------------------------------------------------- -r608 | lh3lh3 | 2010-07-08 22:36:12 -0400 (Thu, 08 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - - * make tview more friendly - * a temporary remedy for an issue in indel calling - ------------------------------------------------------------------------- -r607 | lh3lh3 | 2010-07-08 14:43:52 -0400 (Thu, 08 Jul 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-r607 - * improved the genotype accuracy for indels - * use the SOAPsnp model for SNP calling by default. - ------------------------------------------------------------------------- -r606 | lh3lh3 | 2010-07-08 01:05:19 -0400 (Thu, 08 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/misc/Makefile - -removed a debugging example - ------------------------------------------------------------------------- -r605 | lh3lh3 | 2010-07-08 01:04:09 -0400 (Thu, 08 Jul 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-.1.7-18 (r605) - * fixed an issue when a deletion and mismatch occur at the same time - and the base quality is higher than 40 (if -I40). - ------------------------------------------------------------------------- -r604 | lh3lh3 | 2010-07-02 19:32:24 -0400 (Fri, 02 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/misc/Makefile - -fixed a minor bug in idxstats - ------------------------------------------------------------------------- -r601 | lh3lh3 | 2010-06-16 09:03:59 -0400 (Wed, 16 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a minor bug in indexing - ------------------------------------------------------------------------- -r600 | lh3lh3 | 2010-06-15 10:17:53 -0400 (Tue, 15 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.c - -change printf() to puts in exporting - ------------------------------------------------------------------------- -r599 | lh3lh3 | 2010-06-13 21:41:11 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -minor fix. No actual effect. - ------------------------------------------------------------------------- -r598 | lh3lh3 | 2010-06-13 21:32:45 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -added Makefile targets to compile shared/dynamic library - ------------------------------------------------------------------------- -r596 | lh3lh3 | 2010-06-13 19:48:07 -0400 (Sun, 13 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-17 (r596) - * also keep the number of coor-less reads in the index file - ------------------------------------------------------------------------- -r595 | lh3lh3 | 2010-06-13 18:54:26 -0400 (Sun, 13 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-16 (r595) - * write additional information to bam index - ------------------------------------------------------------------------- -r594 | lh3lh3 | 2010-06-13 17:29:52 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a bug for unmapped sequences in indexing - ------------------------------------------------------------------------- -r593 | lh3lh3 | 2010-06-12 18:11:32 -0400 (Sat, 12 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/samtools.1 - -rename iterf as iter - ------------------------------------------------------------------------- -r592 | lh3lh3 | 2010-06-12 18:02:38 -0400 (Sat, 12 Jun 2010) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-15 (r592) - * fixed a few minor memory leaks in the new pileup code - * improved the functionality of mpileup - ------------------------------------------------------------------------- -r591 | lh3lh3 | 2010-06-12 14:09:22 -0400 (Sat, 12 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-14 (r591) - * elementary multi-way pileup. More testing and more functionality to be done. - ------------------------------------------------------------------------- -r590 | lh3lh3 | 2010-06-12 01:00:24 -0400 (Sat, 12 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-13 (r590) - * added mpileup APIs. No compiling errors, but not tested at all. It is late. - ------------------------------------------------------------------------- -r589 | lh3lh3 | 2010-06-11 22:37:09 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-12 (r589) - * added iterator-like APIs for pileup - ------------------------------------------------------------------------- -r588 | lh3lh3 | 2010-06-11 17:41:13 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-11 (r588) - * ported a few improvements from tabix back to samtools - ------------------------------------------------------------------------- -r587 | lh3lh3 | 2010-06-11 17:33:16 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-10 (r587) - * added iterator interface for bam_fetch (ported back from tabix) - ------------------------------------------------------------------------- -r586 | lh3lh3 | 2010-06-11 13:23:53 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_reheader.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - - * samtools-0.1.7-9 (r586) - * added "reheader" to replace the BAM header - ------------------------------------------------------------------------- -r585 | lh3lh3 | 2010-06-11 12:22:06 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kstring.h - - * samtools-0.1.7-8 (r585) - * speed up "view" - ------------------------------------------------------------------------- -r584 | lh3lh3 | 2010-06-11 12:00:41 -0400 (Fri, 11 Jun 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/kstring.h - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.7-7 (r584) - * ported tabix BGZF to samtools - * flush BGZF after writing the BAM header and between alignment boundaries - ------------------------------------------------------------------------- -r583 | petulda | 2010-06-11 11:58:20 -0400 (Fri, 11 Jun 2010) | 1 line -Changed paths: - A /trunk/samtools/misc/varfilter.py - -Initial release on behalf of Aylwyn Scally ------------------------------------------------------------------------- -r561 | petulda | 2010-05-07 08:41:56 -0400 (Fri, 07 May 2010) | 1 line -Changed paths: - M /trunk/samtools/samtools.1 - -Added a note about the indels coordinates ------------------------------------------------------------------------- -r551 | petulda | 2010-04-23 09:42:13 -0400 (Fri, 23 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added the possibility to print or not to print the reference allele ------------------------------------------------------------------------- -r546 | petulda | 2010-04-15 04:33:55 -0400 (Thu, 15 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -More descriptive message for space separated tags ------------------------------------------------------------------------- -r545 | petulda | 2010-04-14 11:44:50 -0400 (Wed, 14 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Speedup with -i, no need to query the reference all the time ------------------------------------------------------------------------- -r541 | petulda | 2010-03-15 10:03:51 -0400 (Mon, 15 Mar 2010) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Fixed the order of sequences in the header ------------------------------------------------------------------------- -r540 | petulda | 2010-03-04 06:28:35 -0500 (Thu, 04 Mar 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added possibility to select indels only and fixed a bug in reporting homozygous indels. ------------------------------------------------------------------------- -r539 | jmarshall | 2010-02-27 06:48:17 -0500 (Sat, 27 Feb 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - -Improve the invalid 'BAM\1' magic number error message, and also print it -when no bytes can be read from the alleged BAM file, e.g., in the common -user error case when a SAM file has accidentally been supplied. - ------------------------------------------------------------------------- -r538 | petulda | 2010-02-26 10:51:40 -0500 (Fri, 26 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/AUTHORS - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam_header.c - -Improved efficiency of header parsing ------------------------------------------------------------------------- -r537 | lh3lh3 | 2010-02-23 21:08:48 -0500 (Tue, 23 Feb 2010) | 3 lines -Changed paths: - M /trunk/samtools/misc/export2sam.pl - -Updated export2sam.pl by Chris Saunders from Illumina. - - ------------------------------------------------------------------------- -r536 | petulda | 2010-02-17 08:32:53 -0500 (Wed, 17 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/samtools.pl - -Fixed filtering of SNPs near indels. Added min indel and SNP quality filter. ------------------------------------------------------------------------- -r535 | petulda | 2010-02-12 04:52:37 -0500 (Fri, 12 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Print an error for pileups in simple format ------------------------------------------------------------------------- -r534 | lh3lh3 | 2010-02-11 14:01:41 -0500 (Thu, 11 Feb 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -added a hidden option in pileup to output the base position (for Erin) - ------------------------------------------------------------------------- -r533 | petulda | 2010-02-09 10:12:14 -0500 (Tue, 09 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added possibility to specify a custom column title for the data column ------------------------------------------------------------------------- -r532 | petulda | 2010-02-09 09:46:09 -0500 (Tue, 09 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/bam_plcmd.c - -Added the -d option to limit maximum depth for indels. ------------------------------------------------------------------------- -r531 | petulda | 2010-02-03 07:57:27 -0500 (Wed, 03 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added VCF header ------------------------------------------------------------------------- -r530 | lh3lh3 | 2010-02-01 09:13:19 -0500 (Mon, 01 Feb 2010) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/misc/wgsim.c - - * samtools-0.1.7-6 - * fixed a bug in faidx - ------------------------------------------------------------------------- -r529 | jmarshall | 2010-01-11 18:51:49 -0500 (Mon, 11 Jan 2010) | 2 lines -Changed paths: - M /trunk/samtools/faidx.c - -Put the right filename in the error message. - ------------------------------------------------------------------------- -r528 | lh3lh3 | 2009-12-14 11:26:47 -0500 (Mon, 14 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-5 (r528) - * further add new consensus generation strategy - ------------------------------------------------------------------------- -r527 | petulda | 2009-12-11 12:31:05 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - -Fixed a bug in knet_seek ------------------------------------------------------------------------- -r526 | petulda | 2009-12-11 07:51:18 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Small fix in VCF format: dot for the empty INFO field ------------------------------------------------------------------------- -r525 | petulda | 2009-12-11 04:36:18 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Allow tabs in the CO header field ------------------------------------------------------------------------- -r524 | jmarshall | 2009-12-10 10:03:58 -0500 (Thu, 10 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.mingw - -Depend on libbam.a rather than the phony target, so that samtools is not -unnecessarily rebuilt every time. Also clean bgzip. - ------------------------------------------------------------------------- -r523 | jmarshall | 2009-12-10 09:45:32 -0500 (Thu, 10 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.mingw - -Fix a bug in compiling bgzip: this also needs knetfile.o when _USE_KNETFILE -is defined. Also introduce $(KNETFILE_O) which can be set to empty to -facilitate non-knet builds. - ------------------------------------------------------------------------- -r522 | lh3lh3 | 2009-12-01 13:02:36 -0500 (Tue, 01 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.7-4 (r522) - * fixed a bug in "view -r" - * added a new option "view -R" to read required read groups from a file - ------------------------------------------------------------------------- -r521 | lh3lh3 | 2009-12-01 10:00:12 -0500 (Tue, 01 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-3 (r521) - * calmd: optionally mask matching bases as N - ------------------------------------------------------------------------- -r520 | lh3lh3 | 2009-12-01 09:37:17 -0500 (Tue, 01 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.7-2 (r520) - * fixed a few issues with compilation in Windows (on behalf of John) - * choose a random base as the consensus (for population genetics studies) - ------------------------------------------------------------------------- -r519 | jmarshall | 2009-11-30 10:53:02 -0500 (Mon, 30 Nov 2009) | 6 lines -Changed paths: - M /trunk/samtools/Makefile - -Put libraries at the end, so they can resolve references from libbam.a -as well, even with old-fashioned linkers. - -Also use libbam.a explicitly rather than "-L. -lbam" to ensure that we get -the freshly built library, not some other libbam.a lying around the system. - ------------------------------------------------------------------------- -r518 | jmarshall | 2009-11-30 08:44:56 -0500 (Mon, 30 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - -Also clean *.exe (for Cygwin users using this makefile). - ------------------------------------------------------------------------- -r517 | jmarshall | 2009-11-30 07:09:04 -0500 (Mon, 30 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -Index files should be opened in binary mode, not text mode. - ------------------------------------------------------------------------- -r516 | lh3lh3 | 2009-11-27 15:18:59 -0500 (Fri, 27 Nov 2009) | 2 lines -Changed paths: - A /trunk/samtools/examples/bam2bed.c - -another example program - ------------------------------------------------------------------------- -r515 | lh3lh3 | 2009-11-27 10:44:56 -0500 (Fri, 27 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/wgsim_eval.pl - M /trunk/samtools/sam.c - - * samtools-0.1.7-1 (r515) - * report an error when .fai contains duplicated names, instead of segfault - ------------------------------------------------------------------------- -r514 | jmarshall | 2009-11-24 09:45:35 -0500 (Tue, 24 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - -Format 'c'-encoded auxiliary fields correctly, as *signed* integers. - ------------------------------------------------------------------------- -r513 | lh3lh3 | 2009-11-16 10:13:07 -0500 (Mon, 16 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - -Update Makefile.mingw for the same reason - ------------------------------------------------------------------------- -r512 | lh3lh3 | 2009-11-16 10:00:08 -0500 (Mon, 16 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Fixed a bug in compiling razip - ------------------------------------------------------------------------- -r510 | lh3lh3 | 2009-11-10 10:55:41 -0500 (Tue, 10 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.7 (r510) - ------------------------------------------------------------------------- -r509 | lh3lh3 | 2009-11-06 09:17:09 -0500 (Fri, 06 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-22 (r509) - * forget to fix a similar problem in glfgen - ------------------------------------------------------------------------- -r508 | lh3lh3 | 2009-11-06 09:06:40 -0500 (Fri, 06 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.6-21 (r508) - * fixed a potential bug in the indel caller towards the end of a chromosome - ------------------------------------------------------------------------- -r494 | lh3lh3 | 2009-10-26 11:38:00 -0400 (Mon, 26 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.6-19 (r494) - * allow to convert Illumina quality (64 based) to the BAM quality - ------------------------------------------------------------------------- -r493 | lh3lh3 | 2009-10-26 10:24:39 -0400 (Mon, 26 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_header.c - - * samtools-0.1.6-18 (r493) - * fixed the bugs due to improperly incorporating Petr's header parser - * a little code clean up in sam_header.c - ------------------------------------------------------------------------- -r492 | petulda | 2009-10-24 09:43:25 -0400 (Sat, 24 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Added sam_header_line_free call for sam_header_parse2 ------------------------------------------------------------------------- -r491 | lh3lh3 | 2009-10-24 00:50:16 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/sam_view.c - - * BUGGY VERSION - * fixed a minor bug - ------------------------------------------------------------------------- -r490 | lh3lh3 | 2009-10-24 00:45:12 -0400 (Sat, 24 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam.c - - * BUGGY VERSION - * improved the interface a bit - * bug unfixed - ------------------------------------------------------------------------- -r489 | lh3lh3 | 2009-10-24 00:41:50 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - - * BUGGY VERSION. Please NOT use it. - * Fixed a minor bug, but the major bug is still there. - ------------------------------------------------------------------------- -r488 | lh3lh3 | 2009-10-24 00:17:10 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/kaln.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - M /trunk/samtools/sam_view.c - - * This revision is SERIOUSLY BUGGY. Please NOT use it. - * Start to incorporate header parsing from Petr Danecek - ------------------------------------------------------------------------- -r487 | petulda | 2009-10-23 11:44:32 -0400 (Fri, 23 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - -Now possible to merge multiple HeaderDict dictionaries ------------------------------------------------------------------------- -r486 | petulda | 2009-10-22 11:46:58 -0400 (Thu, 22 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - - ------------------------------------------------------------------------- -r485 | petulda | 2009-10-22 11:41:56 -0400 (Thu, 22 Oct 2009) | 1 line -Changed paths: - A /trunk/samtools/sam_header.c - A /trunk/samtools/sam_header.h - - ------------------------------------------------------------------------- -r484 | lh3lh3 | 2009-10-19 14:31:32 -0400 (Mon, 19 Oct 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.6-17 (r484) - * fixed a memory leak in rmdupse - * fixed a bug in parsing @RG header lines - * test rmdup in examples/ - ------------------------------------------------------------------------- -r483 | lh3lh3 | 2009-10-19 13:22:48 -0400 (Mon, 19 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-16 (r483) - * unify the interface of rmdup and rmdupse - * a new bug found in rg2lib(). Have not been fixed yet. - ------------------------------------------------------------------------- -r482 | lh3lh3 | 2009-10-19 13:03:34 -0400 (Mon, 19 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/klist.h - - * samtools-0.1.6-15 (r482) - * rewrite rmdupse - * rmdupse is now library aware - ------------------------------------------------------------------------- -r481 | lh3lh3 | 2009-10-18 00:07:21 -0400 (Sun, 18 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-14 (r480) - * rmdup is now RG aware - ------------------------------------------------------------------------- -r480 | lh3lh3 | 2009-10-17 22:05:20 -0400 (Sat, 17 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -added a small unitity to parse SRA XML files - ------------------------------------------------------------------------- -r479 | lh3lh3 | 2009-10-17 20:57:26 -0400 (Sat, 17 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-13 (r479) - * merge: optionally use file names as RG tags - ------------------------------------------------------------------------- -r478 | lh3lh3 | 2009-10-14 14:18:12 -0400 (Wed, 14 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kaln.c - - * samtools-0.1.6-12 (r478) - * fixed a bug in the indel caller - ------------------------------------------------------------------------- -r477 | lh3lh3 | 2009-10-10 06:12:26 -0400 (Sat, 10 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-11 (r477) - * fixed a bug due to recent change in bam_index.c (thank Nicole Washington for the patch) - ------------------------------------------------------------------------- -r476 | petulda | 2009-10-09 11:45:36 -0400 (Fri, 09 Oct 2009) | 1 line -Changed paths: - A /trunk/samtools/misc/sam2vcf.pl - -Added the sam2vcf.pl script. ------------------------------------------------------------------------- -r475 | lh3lh3 | 2009-10-08 10:19:16 -0400 (Thu, 08 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/kaln.c - A /trunk/samtools/kaln.h - -Unfinished modification. Please do not use this revision... - ------------------------------------------------------------------------- -r474 | petulda | 2009-10-08 06:39:54 -0400 (Thu, 08 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - -Removed the offending knet_seek message. ------------------------------------------------------------------------- -r473 | petulda | 2009-10-06 09:26:35 -0400 (Tue, 06 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/razf.c - -Bug fix - faidx on RAZF compressed files now working. ------------------------------------------------------------------------- -r472 | lh3lh3 | 2009-10-02 08:42:57 -0400 (Fri, 02 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Clarify the meaning of a region like "chr2:1,000,000". - ------------------------------------------------------------------------- -r471 | lh3lh3 | 2009-10-02 05:42:19 -0400 (Fri, 02 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/novo2sam.pl - -Fixed minor bugs in novo2sam.pl (on behalf of Ken Chen and Colin Hercus) - ------------------------------------------------------------------------- -r470 | lh3lh3 | 2009-09-29 15:01:27 -0400 (Tue, 29 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.6-9 (r470) - * make knetfile.c compatible with MinGW (thank Martin Morgan for the patch) - ------------------------------------------------------------------------- -r469 | lh3lh3 | 2009-09-29 08:07:44 -0400 (Tue, 29 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-9 (r469) - * refactor bam_fetch() for Python binding. On behalf of Leo Goodstadt. - ------------------------------------------------------------------------- -r468 | lh3lh3 | 2009-09-28 05:18:29 -0400 (Mon, 28 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.6-7 (r468) - * make merge stable - ------------------------------------------------------------------------- -r467 | petulda | 2009-09-28 04:51:29 -0400 (Mon, 28 Sep 2009) | 1 line -Changed paths: - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzip.c - M /trunk/samtools/razf.c - M /trunk/samtools/razip.c - -Changed the mode for newly created files to 0666. This allows less strict permissions with umask properly set (e.g. 0002 vs. 0022). ------------------------------------------------------------------------- -r466 | lh3lh3 | 2009-09-24 06:29:19 -0400 (Thu, 24 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-6 (r466) - * do not crash calmd when some sequences are absent from the reference. - ------------------------------------------------------------------------- -r464 | jmarshall | 2009-09-23 06:14:32 -0400 (Wed, 23 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/knetfile.c - -Suppress bgzf_check_EOF() messages when reading from a pipe, as there is -no way to seek on a pipe and the messages always appear. - ------------------------------------------------------------------------- -r463 | petulda | 2009-09-16 07:05:41 -0400 (Wed, 16 Sep 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/razf.c - -A bug fix, "samtools view" is now working again. ------------------------------------------------------------------------- -r462 | lh3lh3 | 2009-09-16 04:51:07 -0400 (Wed, 16 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - M /trunk/samtools/razf.c - M /trunk/samtools/razf.h - - * samtools-0.1.6-5 (r462) - * Added knetfile support in razf and faidx (on behalf of Petr Danecek) - ------------------------------------------------------------------------- -r460 | lh3lh3 | 2009-09-09 07:06:22 -0400 (Wed, 09 Sep 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -fixed a formatting issue - ------------------------------------------------------------------------- -r459 | lh3lh3 | 2009-09-08 18:14:08 -0400 (Tue, 08 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-4 (r459) - * make sort output the result to stdout when -o is in use - ------------------------------------------------------------------------- -r458 | lh3lh3 | 2009-09-07 05:10:28 -0400 (Mon, 07 Sep 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/faidx.h - M /trunk/samtools/samtools.1 - - * samtools-0.1.6-2 (r458) - * added more interface to faidx (by Nils) - * updated documentation - ------------------------------------------------------------------------- -r457 | lh3lh3 | 2009-09-05 16:12:04 -0400 (Sat, 05 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-2 (r457) - * get rid of three assert() in bam_sort.c - ------------------------------------------------------------------------- -r456 | jmarshall | 2009-09-04 12:46:25 -0400 (Fri, 04 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/razf.c - -Return NULL from _razf_open() (and hence razf_open()/razf_open2()) -when opening the file fails. - ------------------------------------------------------------------------- -r453 | lh3lh3 | 2009-09-02 08:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - D /trunk/samtools/source.dot - -Release samtools-0.1.6 - ------------------------------------------------------------------------- -r451 | lh3lh3 | 2009-09-02 05:44:48 -0400 (Wed, 02 Sep 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.5-34 (r451) - * applied the patch by John - * improved the help message a little bit - ------------------------------------------------------------------------- -r450 | lh3lh3 | 2009-09-02 04:55:55 -0400 (Wed, 02 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_color.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-33 (r450) - * fixed a bug in bam_color.c (on behalf of Nils Homer) - ------------------------------------------------------------------------- -r449 | lh3lh3 | 2009-08-29 15:36:41 -0400 (Sat, 29 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.5-32 (r449) - * fillmd: fixed a bug in modifying MD/NM tags - * in import, give a warning if the read is aligned but there is no CIGAR. - ------------------------------------------------------------------------- -r448 | lh3lh3 | 2009-08-19 04:44:28 -0400 (Wed, 19 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.5-31 (r448) - * fixed an issue when the last CIGAR is I or D - ------------------------------------------------------------------------- -r447 | lh3lh3 | 2009-08-17 04:34:57 -0400 (Mon, 17 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-30 (r447) - * fixed a bug in bam_aux_get(): 'A' is not checked - ------------------------------------------------------------------------- -r446 | lh3lh3 | 2009-08-17 04:33:17 -0400 (Mon, 17 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * - ------------------------------------------------------------------------- -r444 | lh3lh3 | 2009-08-11 05:02:36 -0400 (Tue, 11 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-28 (r444) - * bug in "merge -n" - ------------------------------------------------------------------------- -r443 | lh3lh3 | 2009-08-11 04:29:11 -0400 (Tue, 11 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-27 (r443) - * SEQ and QUAL can be "*" - * parse CIGAR "=" and "X" as "M" - ------------------------------------------------------------------------- -r442 | lh3lh3 | 2009-08-07 16:56:38 -0400 (Fri, 07 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/md5.c - M /trunk/samtools/misc/md5.h - M /trunk/samtools/misc/md5fa.c - - * samtools-0.1.5-26 (r442) - * replace RSA Inc md5.* with ones under permissive lincense - * fixed a bug in detecting unsorted bam in pileup - ------------------------------------------------------------------------- -r441 | bhandsaker | 2009-08-05 09:41:28 -0400 (Wed, 05 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/bgzip.c - -Change copyright notices now that MIT has approved open source distribution. - ------------------------------------------------------------------------- -r440 | lh3lh3 | 2009-08-05 05:44:24 -0400 (Wed, 05 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-25 (r436) - * in flagstats, do not report singletons if both ends are unmapped - ------------------------------------------------------------------------- -r439 | lh3lh3 | 2009-08-04 17:16:51 -0400 (Tue, 04 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -fixed a SERIOUS bug in setting 0x20 flag - ------------------------------------------------------------------------- -r438 | lh3lh3 | 2009-08-04 16:50:43 -0400 (Tue, 04 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -fixed two minor bugs (suggested by Tim M Storm) - ------------------------------------------------------------------------- -r437 | lh3lh3 | 2009-08-04 04:13:24 -0400 (Tue, 04 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-24 (r435) - * fixed a typo - ------------------------------------------------------------------------- -r434 | lh3lh3 | 2009-08-03 05:40:42 -0400 (Mon, 03 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-23 (r434) - * in tview, press 'r' to show read names rather than sequences - ------------------------------------------------------------------------- -r433 | lh3lh3 | 2009-08-02 14:13:35 -0400 (Sun, 02 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/knetfile.c - - * tried to fixed the buggy FTP random access in Windows. FAILED. - * anyway, MinGW seems to have problem with "%lld". - ------------------------------------------------------------------------- -r432 | lh3lh3 | 2009-08-01 19:32:07 -0400 (Sat, 01 Aug 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/razf.c - A /trunk/samtools/win32/libcurses.a - A /trunk/samtools/win32/xcurses.h - - * samtools-0.1.5-22 (r432) - * faidx: fixed compitability issue with _WIN32 - * razf: fixed potential compitability issue with _WIN32 - * PDCurses support in Windows - ------------------------------------------------------------------------- -r431 | lh3lh3 | 2009-08-01 18:34:54 -0400 (Sat, 01 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/win32/libz.a - -replace the GnuWin32 version of libz.a with my own build with MinGW. - ------------------------------------------------------------------------- -r430 | lh3lh3 | 2009-08-01 18:21:07 -0400 (Sat, 01 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -add comments - ------------------------------------------------------------------------- -r429 | lh3lh3 | 2009-08-01 17:41:19 -0400 (Sat, 01 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-21 (r428) - * knetfile.c is now compatible with mingw-winsock - ------------------------------------------------------------------------- -r428 | lh3lh3 | 2009-07-31 19:39:07 -0400 (Fri, 31 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - -simplify MinGW Makefile - ------------------------------------------------------------------------- -r427 | lh3lh3 | 2009-07-31 19:30:54 -0400 (Fri, 31 Jul 2009) | 5 lines -Changed paths: - A /trunk/samtools/Makefile.mingw - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/win32 - A /trunk/samtools/win32/libz.a - A /trunk/samtools/win32/zconf.h - A /trunk/samtools/win32/zlib.h - - * samtools-0.1.5-20 (r427) - * MinGW support. At least SAM<->BAM conversion is working. Other - functionality are not tested at the moment. - * zlib headers and Windows version of libz.a are included in win32/ - ------------------------------------------------------------------------- -r426 | lh3lh3 | 2009-07-31 18:32:09 -0400 (Fri, 31 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-19 (r426) - * fixed a bug caused by recent modifications. Sorry. - ------------------------------------------------------------------------- -r425 | lh3lh3 | 2009-07-31 18:23:51 -0400 (Fri, 31 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - -compatible with Windows binary files - ------------------------------------------------------------------------- -r424 | lh3lh3 | 2009-07-31 05:19:59 -0400 (Fri, 31 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.5-18 (r423) - * output additional information in pileup indel lines, for the purepose - of debugging at the moment - * in tview, optionally allow to treat reference skip as deletion - ------------------------------------------------------------------------- -r423 | lh3lh3 | 2009-07-30 17:00:36 -0400 (Thu, 30 Jul 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/psl2sam.pl - -convert BLAT psl to SAM. - ------------------------------------------------------------------------- -r422 | lh3lh3 | 2009-07-30 06:24:39 -0400 (Thu, 30 Jul 2009) | 6 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-17 (r422) - * fixed a but in knetfile.c when seek type is not SEEK_SET - * write an empty BGZF block to every BGZF file - * check BGZF EOF marker in bam_header_read() - * update ChangeLog - ------------------------------------------------------------------------- -r421 | lh3lh3 | 2009-07-30 05:03:39 -0400 (Thu, 30 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-16 (r421) - * in view and pileup, load header from FASTA index if the input is SAM. - ------------------------------------------------------------------------- -r420 | lh3lh3 | 2009-07-29 04:18:55 -0400 (Wed, 29 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -do not set "read 1" if reads are not mapped in the PE mode of maq - ------------------------------------------------------------------------- -r419 | lh3lh3 | 2009-07-28 04:52:33 -0400 (Tue, 28 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.5-15 (r419) - * in sam_open(), return NULL when the file cannot be opened. - * make wgsim_eval.pl more robust to imperfect SAM - * add "unique" command to samtools.pl - ------------------------------------------------------------------------- -r418 | lh3lh3 | 2009-07-24 09:04:19 -0400 (Fri, 24 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/wgsim_eval.pl - -skip @header lines in SAM - ------------------------------------------------------------------------- -r417 | lh3lh3 | 2009-07-24 07:42:38 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-14 (r417) - * more help in "samtools view" due to the recent changes. - ------------------------------------------------------------------------- -r416 | lh3lh3 | 2009-07-24 07:34:30 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-17 (r416) - * support import/export SAM with string tags - ------------------------------------------------------------------------- -r415 | lh3lh3 | 2009-07-24 06:39:26 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-12 (r415) - * FLAG now can be in HEX - ------------------------------------------------------------------------- -r414 | lh3lh3 | 2009-07-22 17:03:49 -0400 (Wed, 22 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/kstring.h - -fixed a compiling error (thank Ken for fixing it) - ------------------------------------------------------------------------- -r412 | lh3lh3 | 2009-07-21 17:19:40 -0400 (Tue, 21 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/kstring.c - M /trunk/samtools/kstring.h - -Implemented Boyer-Moore search in the kstring library. - ------------------------------------------------------------------------- -r409 | lh3lh3 | 2009-07-17 12:10:20 -0400 (Fri, 17 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -do not include knetfile.h when _USE_KNETFILE is not defined - ------------------------------------------------------------------------- -r408 | lh3lh3 | 2009-07-17 10:29:21 -0400 (Fri, 17 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - - * samtools-0.1.5-11 (r408) - * force to overwirte existing MD if it is different from the one calculated - from fillmd. - * bgzf.c: improved the compatibility with Windows headers - ------------------------------------------------------------------------- -r407 | lh3lh3 | 2009-07-17 09:46:56 -0400 (Fri, 17 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.h - - * samtools-0.1.5-10 (r407) - * implemented bam_aux_del() to remove a tag - * fillmd: generate the NM tag - * fillmd: cmd interface improvement - ------------------------------------------------------------------------- -r406 | lh3lh3 | 2009-07-16 18:30:40 -0400 (Thu, 16 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Sorry. The old Makefile is for PDCurses... - ------------------------------------------------------------------------- -r405 | lh3lh3 | 2009-07-16 18:30:11 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-9 (r405) - * improved the compatibility with PDCurses a little bit - ------------------------------------------------------------------------- -r404 | lh3lh3 | 2009-07-16 18:23:52 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-8 (r404) - * compatible with PDCurses - ------------------------------------------------------------------------- -r403 | lh3lh3 | 2009-07-16 17:39:39 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.5-7 (r403) - * fixed a bug in kseq.h for binary files (text files are fine) - ------------------------------------------------------------------------- -r402 | lh3lh3 | 2009-07-16 06:49:53 -0400 (Thu, 16 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - - * samtools-0.1.5-6 (r402) - * fixed compiling error when "-D_USE_NETFILE" is not applied - * improve portability to MinGW - ------------------------------------------------------------------------- -r398 | lh3lh3 | 2009-07-13 05:21:36 -0400 (Mon, 13 Jul 2009) | 3 lines -Changed paths: - A /trunk/bam-lite/bam.h (from /trunk/samtools/bam.h:395) - A /trunk/bam-lite/bam_lite.c (from /trunk/samtools/bam_lite.c:395) - D /trunk/samtools/bam_lite.c - - * move bam_lite.c to bam-lite - * copy bam.h to bam-lite - ------------------------------------------------------------------------- -r395 | lh3lh3 | 2009-07-13 05:12:57 -0400 (Mon, 13 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_lite.c - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-5 (r395) - * added bam_pileup_file() and removed bam_lpileup_file() - ------------------------------------------------------------------------- -r394 | lh3lh3 | 2009-07-12 19:35:10 -0400 (Sun, 12 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-4 (r394) - * http_proxy support in knetfile library (check http_proxy ENV) - ------------------------------------------------------------------------- -r393 | lh3lh3 | 2009-07-12 18:57:07 -0400 (Sun, 12 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-3 (r393) - * knetfile now supports HTTP (no proxy at the moment) - * fixed a potential issue in knetfile on opening ordinary file, although I have - not seen the sideeffect so far. - ------------------------------------------------------------------------- -r392 | lh3lh3 | 2009-07-12 13:50:55 -0400 (Sun, 12 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Remove the warning in tview - ------------------------------------------------------------------------- -r391 | lh3lh3 | 2009-07-12 13:42:43 -0400 (Sun, 12 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-2 (r391) - * do not show a blank screen when no reads mapped - ------------------------------------------------------------------------- -r390 | lh3lh3 | 2009-07-09 09:01:42 -0400 (Thu, 09 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.h - A /trunk/samtools/bam_lite.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-1 (r390) - * removed useless _IOLIB in bam.h. This should cause no change at all. - * added bam_lite.c for light-weight BAM reading - ------------------------------------------------------------------------- -r385 | lh3lh3 | 2009-07-07 11:53:29 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - -Release samtools-0.1.5c (fixed a bug in piping) - ------------------------------------------------------------------------- -r383 | lh3lh3 | 2009-07-07 06:39:55 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - -Release samtools-0.1.5b (BUG! so embarrassing!) - ------------------------------------------------------------------------- -r381 | lh3lh3 | 2009-07-07 06:20:06 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - -Release samtools-0.1.5a (for compatibility with Bio::DB::Sam) - ------------------------------------------------------------------------- -r373 | lh3lh3 | 2009-07-07 05:26:57 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.5 - ------------------------------------------------------------------------- -r372 | lh3lh3 | 2009-07-07 04:49:27 -0400 (Tue, 07 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-23 (r372) - * keep header text if "view -t" is used (by Gerton) - ------------------------------------------------------------------------- -r371 | lh3lh3 | 2009-07-06 20:13:32 -0400 (Mon, 06 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -update documentation - ------------------------------------------------------------------------- -r370 | bhandsaker | 2009-07-02 17:24:34 -0400 (Thu, 02 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Introduced LIBPATH variable so this could be overridden to allow samtools to build correct at the Broad. - ------------------------------------------------------------------------- -r369 | lh3lh3 | 2009-07-02 08:36:53 -0400 (Thu, 02 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-22 (r369) - * in pileup, optionally print E2 and U2 - * remove the debugging code in bam_aux_get() (Drat!) - ------------------------------------------------------------------------- -r368 | lh3lh3 | 2009-07-02 06:32:26 -0400 (Thu, 02 Jul 2009) | 6 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/faidx.h - M /trunk/samtools/glf.c - - * samtools-0.1.4-21 (r368) - * propagate errors rather than exit or complain assertion failure. Assertion - should be only used for checking internal bugs, but not for external input - inconsistency. I was just a bit lazy. - * small memory leak may be present on failure, though - ------------------------------------------------------------------------- -r367 | lh3lh3 | 2009-06-30 11:18:42 -0400 (Tue, 30 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -reduce the chance of blocking in FTP connection - ------------------------------------------------------------------------- -r366 | lh3lh3 | 2009-06-30 10:35:21 -0400 (Tue, 30 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -minor changes to knetfile: invalid fd equals -1 rather than 0 - ------------------------------------------------------------------------- -r365 | lh3lh3 | 2009-06-30 09:04:30 -0400 (Tue, 30 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-20 (r365) - * download the BAM index file if it is not found in the current working directory. - ------------------------------------------------------------------------- -r364 | lh3lh3 | 2009-06-30 07:39:07 -0400 (Tue, 30 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - - * samtools-0.1.4-19 (r364) - * knetfile: report error when the file is not present on FTP - ------------------------------------------------------------------------- -r363 | lh3lh3 | 2009-06-29 18:23:32 -0400 (Mon, 29 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-18 (r363) - * knetfile: do not trigger network communication in FTP seek (lazy seek) - * bgzf: cache recent blocks (disabled by default) - ------------------------------------------------------------------------- -r362 | lh3lh3 | 2009-06-25 16:04:34 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - -write changelog - ------------------------------------------------------------------------- -r361 | lh3lh3 | 2009-06-25 16:03:10 -0400 (Thu, 25 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-17 (r361) - * if a file is given on FTP, search locally for the BAM index - ------------------------------------------------------------------------- -r360 | lh3lh3 | 2009-06-25 15:44:52 -0400 (Thu, 25 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-16 (r360) - * report more information in index when the input is not sorted - * change the behaviour of knet_seek() such that it returns 0 on success - * support knetfile library in BGZF - ------------------------------------------------------------------------- -r359 | lh3lh3 | 2009-06-25 12:10:55 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - -fixed bugs in knetfile.* - ------------------------------------------------------------------------- -r358 | lh3lh3 | 2009-06-25 08:53:19 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - A /trunk/samtools/knetfile.h - -this is the header file - ------------------------------------------------------------------------- -r357 | lh3lh3 | 2009-06-25 08:52:03 -0400 (Thu, 25 Jun 2009) | 3 lines -Changed paths: - A /trunk/samtools/knetfile.c - - * open a file at FTP - * preliminary version - ------------------------------------------------------------------------- -r354 | lh3lh3 | 2009-06-24 09:02:25 -0400 (Wed, 24 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-15 (r354) - * fixed a memory leak in bam_view1(), although samtools is not using this routine. - ------------------------------------------------------------------------- -r351 | lh3lh3 | 2009-06-17 19:16:26 -0400 (Wed, 17 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - - * samtools-0.1.4-13 (r351) - * make faidx more tolerant to empty lines right before or after > lines - * hope this does not introduce new bugs... - ------------------------------------------------------------------------- -r350 | lh3lh3 | 2009-06-16 09:37:01 -0400 (Tue, 16 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-13 (r350) - * fixed a small memory leak in pileup, caused by recent modifications - ------------------------------------------------------------------------- -r347 | lh3lh3 | 2009-06-13 16:20:49 -0400 (Sat, 13 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-12 (r347) - * added `-S' to pileup, similar to `view -S' - ------------------------------------------------------------------------- -r346 | lh3lh3 | 2009-06-13 12:52:31 -0400 (Sat, 13 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.4-11 (r346) - * allow to select a read group at view command-line - ------------------------------------------------------------------------- -r344 | lh3lh3 | 2009-06-13 09:06:24 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/calDepth.c - -added more comments - ------------------------------------------------------------------------- -r343 | lh3lh3 | 2009-06-13 09:01:22 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/calDepth.c - -nothing really - ------------------------------------------------------------------------- -r342 | lh3lh3 | 2009-06-13 08:58:48 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/Makefile - A /trunk/samtools/examples/calDepth.c - -added an example of calculating read depth - ------------------------------------------------------------------------- -r341 | lh3lh3 | 2009-06-13 08:00:08 -0400 (Sat, 13 Jun 2009) | 6 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - A /trunk/samtools/bam_color.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.4-10 (r341) - * only include key APIs in libbam.a - * move color-specific routines to bam_color.c - * update documentations - * remove the support of -q in pileup - ------------------------------------------------------------------------- -r340 | lh3lh3 | 2009-06-13 06:17:14 -0400 (Sat, 13 Jun 2009) | 6 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/Makefile - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/razf.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-9 (r340) - * added a warning to razf.c if zlib<1.2.2.1 - * fixed a compilation warning - * fixed a segfault caused by @RG parsing - * detect NCURSES in bam_tview.c - ------------------------------------------------------------------------- -r339 | lh3lh3 | 2009-06-13 05:35:19 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/INSTALL - -update INSTALL - ------------------------------------------------------------------------- -r338 | lh3lh3 | 2009-06-12 19:15:24 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kstring.h - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-8 (r338) - * parse the @RG header lines and allow to choose library at the "samtools view" - command line - ------------------------------------------------------------------------- -r337 | lh3lh3 | 2009-06-12 16:25:50 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-7 (r337) - * bgzf.c: support mode string "wu": uncompressed output - * "samtools view" support "-u" command-line option - ------------------------------------------------------------------------- -r336 | lh3lh3 | 2009-06-12 12:20:12 -0400 (Fri, 12 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - M /trunk/samtools/razf.c - M /trunk/samtools/razf.h - M /trunk/samtools/razip.c - - * no changes to samtools itself - * remove zlib source codes - * make RAZF reading compatible with old version of zlib - * on old version of zlib, writing is not available - ------------------------------------------------------------------------- -r335 | lh3lh3 | 2009-06-12 11:47:33 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - D /trunk/samtools/zlib - -remove zlib for simplification... - ------------------------------------------------------------------------- -r334 | lh3lh3 | 2009-06-12 10:43:36 -0400 (Fri, 12 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-6 (r334) - * do not export bam_aux_get_core() for Bio::DB::Sam because it has already - been implemented in that. - * this version works with the latest Bio::DB::Sam (20090612) - ------------------------------------------------------------------------- -r333 | lh3lh3 | 2009-06-12 10:33:42 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - -update ChangeLog - ------------------------------------------------------------------------- -r332 | lh3lh3 | 2009-06-12 10:21:21 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/AUTHORS - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - -fixed minor things in Makefile - ------------------------------------------------------------------------- -r331 | lh3lh3 | 2009-06-12 10:07:05 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-5 (r3310 - * no change to samtools itself. Version number is increased to reflect the - changes in the Makefile building system. - ------------------------------------------------------------------------- -r330 | lh3lh3 | 2009-06-12 10:03:38 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/AUTHORS - D /trunk/samtools/README - -update information... - ------------------------------------------------------------------------- -r329 | lh3lh3 | 2009-06-12 09:52:21 -0400 (Fri, 12 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/novo2sam.pl - - * updated novoalign converter by Colin Hercus et al. - * this version works with indels - ------------------------------------------------------------------------- -r328 | lh3lh3 | 2009-06-12 09:50:53 -0400 (Fri, 12 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - M /trunk/samtools/zlib/Makefile - - * update Makefile - * update INSTALL instruction - ------------------------------------------------------------------------- -r327 | lh3lh3 | 2009-06-12 09:18:29 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325) - D /trunk/samtools/Makefile.am - D /trunk/samtools/Makefile.generic - D /trunk/samtools/Makefile.lite - D /trunk/samtools/autogen.sh - D /trunk/samtools/cleanup.sh - D /trunk/samtools/configure.ac - A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305) - D /trunk/samtools/misc/Makefile.am - D /trunk/samtools/misc/Makefile.generic - M /trunk/samtools/razf.c - A /trunk/samtools/zlib - A /trunk/samtools/zlib/Makefile - A /trunk/samtools/zlib/adler32.c - A /trunk/samtools/zlib/compress.c - A /trunk/samtools/zlib/crc32.c - A /trunk/samtools/zlib/crc32.h - A /trunk/samtools/zlib/deflate.c - A /trunk/samtools/zlib/deflate.h - A /trunk/samtools/zlib/gzio.c - A /trunk/samtools/zlib/infback.c - A /trunk/samtools/zlib/inffast.c - A /trunk/samtools/zlib/inffast.h - A /trunk/samtools/zlib/inffixed.h - A /trunk/samtools/zlib/inflate.c - A /trunk/samtools/zlib/inflate.h - A /trunk/samtools/zlib/inftrees.c - A /trunk/samtools/zlib/inftrees.h - A /trunk/samtools/zlib/trees.c - A /trunk/samtools/zlib/trees.h - A /trunk/samtools/zlib/uncompr.c - A /trunk/samtools/zlib/zconf.h - A /trunk/samtools/zlib/zlib.h - A /trunk/samtools/zlib/zutil.c - A /trunk/samtools/zlib/zutil.h - D /trunk/samtools/zutil.h - - * added zlib-1.2.3 as razip requires that - * prepare to changed back to the Makefile building system - * unfinished! (will be soon) - ------------------------------------------------------------------------- -r326 | lh3lh3 | 2009-06-12 09:12:03 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -Unfinished - ------------------------------------------------------------------------- -r325 | lh3lh3 | 2009-06-10 11:27:59 -0400 (Wed, 10 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-4 (r325) - * further avoid wrong consensus calls in repetitive regions. - ------------------------------------------------------------------------- -r324 | lh3lh3 | 2009-06-10 10:56:17 -0400 (Wed, 10 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.4-3 (r324) - * make maqcns generate the correct call in repetitive regions. - * allow filtering on mapQ at the pileup command line - ------------------------------------------------------------------------- -r323 | lh3lh3 | 2009-06-10 05:04:21 -0400 (Wed, 10 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.2 (r322) - * indels and SNPs use different mapping quality threshold - ------------------------------------------------------------------------- -r322 | lh3lh3 | 2009-06-10 05:03:22 -0400 (Wed, 10 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/export2sam.pl - -fixed a typo - ------------------------------------------------------------------------- -r321 | lh3lh3 | 2009-06-09 04:21:48 -0400 (Tue, 09 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -just typo. no real change - ------------------------------------------------------------------------- -r320 | lh3lh3 | 2009-06-08 09:32:51 -0400 (Mon, 08 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -a little bit code cleanup - ------------------------------------------------------------------------- -r319 | lh3lh3 | 2009-06-08 09:22:33 -0400 (Mon, 08 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.1 - * change default parameters - * optionally print filtered variants - ------------------------------------------------------------------------- -r318 | lh3lh3 | 2009-06-08 09:14:26 -0400 (Mon, 08 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.0 - * combine snpFilter and indelFilter - ------------------------------------------------------------------------- -r317 | lh3lh3 | 2009-06-08 06:31:42 -0400 (Mon, 08 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.3 - * change a default parameter - ------------------------------------------------------------------------- -r316 | lh3lh3 | 2009-06-08 06:11:06 -0400 (Mon, 08 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-2 (r316) - * pileup: cap mapping quality at 60 (by default) - * pileup: always calculate RMS mapq - * pileup: allow to output variant sites only - ------------------------------------------------------------------------- -r312 | lh3lh3 | 2009-06-04 08:01:10 -0400 (Thu, 04 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.2 - * added pileup2fq - ------------------------------------------------------------------------- -r311 | lh3lh3 | 2009-06-03 04:40:40 -0400 (Wed, 03 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * in snpFilter, suppress non-SNP sites - ------------------------------------------------------------------------- -r310 | lh3lh3 | 2009-06-01 09:35:13 -0400 (Mon, 01 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.1 - * fixed a typo - ------------------------------------------------------------------------- -r309 | lh3lh3 | 2009-06-01 09:04:39 -0400 (Mon, 01 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.0 - * snpFilter - ------------------------------------------------------------------------- -r306 | lh3lh3 | 2009-05-28 06:49:35 -0400 (Thu, 28 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bgzf.c - - * minor changes to bgzf: return NULL if fd == -1 - * suggested by {kdj,jm18}@sanger.ac.uk - ------------------------------------------------------------------------- -r305 | lh3lh3 | 2009-05-28 06:16:08 -0400 (Thu, 28 May 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/interpolate_sam.pl - -Script for paired-end pileup, contributed by Stephen Montgomery. - ------------------------------------------------------------------------- -r304 | lh3lh3 | 2009-05-28 06:08:49 -0400 (Thu, 28 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-1 (r304) - * fixed a minor bug in printing headers - ------------------------------------------------------------------------- -r297 | lh3lh3 | 2009-05-21 11:06:16 -0400 (Thu, 21 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/maq2sam.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.4 - ------------------------------------------------------------------------- -r296 | lh3lh3 | 2009-05-21 07:53:14 -0400 (Thu, 21 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-24 (r296) - * another similar bug in the indel caller - ------------------------------------------------------------------------- -r295 | lh3lh3 | 2009-05-21 07:50:28 -0400 (Thu, 21 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-23 (r295) - * fixed a critical bug in the indel caller - ------------------------------------------------------------------------- -r294 | lh3lh3 | 2009-05-20 08:00:20 -0400 (Wed, 20 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_stat.c - -added a missing header file - ------------------------------------------------------------------------- -r293 | lh3lh3 | 2009-05-19 18:44:25 -0400 (Tue, 19 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-22 (r293) - * open tview in the dot-view mode by default - ------------------------------------------------------------------------- -r292 | lh3lh3 | 2009-05-18 16:01:23 -0400 (Mon, 18 May 2009) | 6 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Added a note to the manual. Currently SAMtools used unaligned words in -several places. Although this does not cause bus errors to me, it may -affect portability. Please see the "Bus error" wiki page for more -information. Also thank James Bonfields for pointing this out. - - ------------------------------------------------------------------------- -r286 | lh3lh3 | 2009-05-14 10:23:13 -0400 (Thu, 14 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-21 (286) - * declare bam_aux_get_core() in bam.h - ------------------------------------------------------------------------- -r276 | lh3lh3 | 2009-05-13 05:07:55 -0400 (Wed, 13 May 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-20 (r276) - * remove bam1_t::hash again. We need to modify the Perl API anyway to - make it work with the latest SVN. - * As is suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index - ------------------------------------------------------------------------- -r275 | lh3lh3 | 2009-05-12 16:14:10 -0400 (Tue, 12 May 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.h - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-19 (r275) - * a minor change to the bam1_t struct: added back "void *hash" for the - backward compatibility with Bio::DB::Sam - ------------------------------------------------------------------------- -r273 | lh3lh3 | 2009-05-12 09:28:39 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-18 (r273) - * rmdupse: do not remove unmapped reads - ------------------------------------------------------------------------- -r272 | lh3lh3 | 2009-05-12 09:20:00 -0400 (Tue, 12 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_rmdupse.c - -change a parameter. It does nothing - ------------------------------------------------------------------------- -r271 | lh3lh3 | 2009-05-12 09:17:58 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.am - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - A /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/configure.ac - - * samtools-0.1.3-17 (r271) - * added 'rmdupse' command - ------------------------------------------------------------------------- -r267 | lh3lh3 | 2009-05-05 17:31:41 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-16 (r267) - * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs - ------------------------------------------------------------------------- -r266 | lh3lh3 | 2009-05-05 17:23:27 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-15 (r266) - * report an error if a non-* reference is present while @SQ is absent - ------------------------------------------------------------------------- -r265 | lh3lh3 | 2009-05-05 17:09:00 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-14 (r262) - * make samopen() recognize @SQ header lines - ------------------------------------------------------------------------- -r261 | lh3lh3 | 2009-05-05 10:10:30 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-13 (r260) - * report error for file I/O error - ------------------------------------------------------------------------- -r260 | lh3lh3 | 2009-05-05 10:01:16 -0400 (Tue, 05 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.am - -update Makefile.am - ------------------------------------------------------------------------- -r259 | lh3lh3 | 2009-05-05 09:52:25 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.3-12 (r259) - * use the new I/O interface in pileup - ------------------------------------------------------------------------- -r258 | lh3lh3 | 2009-05-05 09:33:22 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/sam.c - A /trunk/samtools/sam.h - A /trunk/samtools/sam_view.c - - * samtools-0.1.3-11 (r258) - * unify the interface to BAM and SAM I/O - ------------------------------------------------------------------------- -r257 | lh3lh3 | 2009-05-05 04:53:35 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-10 (r257) - * allow hex with "pileup -m" - ------------------------------------------------------------------------- -r256 | lh3lh3 | 2009-05-04 14:16:50 -0400 (Mon, 04 May 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-9 (r256) - * fixed a bug in bam_lpileup.c - * I do not know if this also fixes the bug causing assertion failure in the tview - ------------------------------------------------------------------------- -r251 | lh3lh3 | 2009-04-28 08:53:23 -0400 (Tue, 28 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-8 (r251) - * fixed a bug when there are reads without coordinates - ------------------------------------------------------------------------- -r250 | lh3lh3 | 2009-04-28 08:43:33 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - A /trunk/samtools/AUTHORS - A /trunk/samtools/README - M /trunk/samtools/cleanup.sh - -added missing files - ------------------------------------------------------------------------- -r249 | lh3lh3 | 2009-04-28 08:37:16 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - M /trunk/samtools/configure.ac - M /trunk/samtools/misc/Makefile.generic - -improve large file support in compilation - ------------------------------------------------------------------------- -r248 | lh3lh3 | 2009-04-28 08:33:24 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/INSTALL - -update INSTALL - ------------------------------------------------------------------------- -r247 | lh3lh3 | 2009-04-28 08:28:50 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.am - M /trunk/samtools/autogen.sh - M /trunk/samtools/cleanup.sh - M /trunk/samtools/configure.ac - A /trunk/samtools/misc/Makefile.am - -fixed various issues about the GNU building scripts - ------------------------------------------------------------------------- -r246 | lh3lh3 | 2009-04-28 08:10:23 -0400 (Tue, 28 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - D /trunk/samtools/Makefile - A /trunk/samtools/Makefile.am - A /trunk/samtools/Makefile.generic - A /trunk/samtools/autogen.sh - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/cleanup.sh - A /trunk/samtools/configure.ac - D /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245) - - * samtools-0.1.3-7 (r246) - * incorporated revisions from Nils Homer - * enhanced support of displaying color-space reads - ------------------------------------------------------------------------- -r244 | lh3lh3 | 2009-04-25 06:49:40 -0400 (Sat, 25 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-6 (r244) - * fixed segfault for unmapped reads - ------------------------------------------------------------------------- -r243 | lh3lh3 | 2009-04-24 16:27:26 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-5 (r243) - * fixed a long existing bug which may cause memory leak - * check MD - * consensus calling now works with "=", but indel calling not - ------------------------------------------------------------------------- -r242 | lh3lh3 | 2009-04-24 15:44:46 -0400 (Fri, 24 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-4 (r242) - * fixed a memory leak - ------------------------------------------------------------------------- -r240 | lh3lh3 | 2009-04-24 11:40:18 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - A /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-3 (r240) - * generate MD tag - * generate "=" bases - * the plain pileup now support "=" bases, but consensus calling and glfgen may fail - ------------------------------------------------------------------------- -r239 | lh3lh3 | 2009-04-24 07:08:20 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-2 (r239) - * fixed bugs in bam_aux.c (these functions nevered used by samtools) - * removed bam_aux_init()/bam_aux_destroy() - * added tagview for testing bam_aux - ------------------------------------------------------------------------- -r235 | lh3lh3 | 2009-04-21 18:17:39 -0400 (Tue, 21 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-1 - * fixed a bug in pileup: the first read in a chromosome may not be printed - ------------------------------------------------------------------------- -r232 | lh3lh3 | 2009-04-16 10:25:43 -0400 (Thu, 16 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.lite - -a missing file in Makefile.lite - ------------------------------------------------------------------------- -r227 | lh3lh3 | 2009-04-15 17:02:53 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.3 - ------------------------------------------------------------------------- -r223 | lh3lh3 | 2009-04-15 09:31:32 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-28 - * make samtools more robust to weird input such as empty file - ------------------------------------------------------------------------- -r222 | lh3lh3 | 2009-04-15 09:05:33 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/samtools.1 - -prepare for release 0.1.3 - ------------------------------------------------------------------------- -r221 | lh3lh3 | 2009-04-15 08:32:14 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/blast2sam.pl - -convert NCBI-BLASTN to SAM - ------------------------------------------------------------------------- -r220 | lh3lh3 | 2009-04-15 08:18:19 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-27 - * fixed a small memory leak in tview - ------------------------------------------------------------------------- -r219 | lh3lh3 | 2009-04-15 08:00:08 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-26 - * fixed a bug in rmdup when there are unmapped reads - ------------------------------------------------------------------------- -r218 | lh3lh3 | 2009-04-14 17:28:58 -0400 (Tue, 14 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - -proposed NEWS for the new release (have not yet) - ------------------------------------------------------------------------- -r216 | lh3lh3 | 2009-04-14 17:10:46 -0400 (Tue, 14 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.1.1 - * improve indelFilter to avoid filtering true indels. The new filter relies - on the new pileup indel line implemented in samtools-0.1.2-25 - ------------------------------------------------------------------------- -r215 | lh3lh3 | 2009-04-14 17:04:19 -0400 (Tue, 14 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.2-25 - * change the pileup indel line to shows the number of alignments actually - containing indels - ------------------------------------------------------------------------- -r211 | lh3lh3 | 2009-04-13 07:07:13 -0400 (Mon, 13 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - -update ChangeLog from "svn log" - ------------------------------------------------------------------------- -r210 | lh3lh3 | 2009-04-12 15:57:05 -0400 (Sun, 12 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.2-24 - * in merge, gives a warning rather than error if the target sequence length is different - * allow empty header - ------------------------------------------------------------------------- -r209 | lh3lh3 | 2009-04-12 15:32:44 -0400 (Sun, 12 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-23 - * recognize '*' at the QUAL field - ------------------------------------------------------------------------- -r208 | lh3lh3 | 2009-04-12 15:08:02 -0400 (Sun, 12 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.2-22 - * the field separater is TAB only, now - ------------------------------------------------------------------------- -r207 | lh3lh3 | 2009-04-08 10:18:03 -0400 (Wed, 08 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/ex1.sam.gz - - * fixed the problem in the example alignment due to the bug in fixmate - ------------------------------------------------------------------------- -r206 | lh3lh3 | 2009-04-08 10:15:05 -0400 (Wed, 08 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_mate.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/soap2sam.pl - - * samtools-0.1.2-21 - * fixed a nasty bug in `fixmate' - ------------------------------------------------------------------------- -r205 | lh3lh3 | 2009-04-08 05:57:08 -0400 (Wed, 08 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/bowtie2sam.pl - M /trunk/samtools/misc/soap2sam.pl - M /trunk/samtools/misc/wgsim_eval.pl - -make the script robust to the bugs in SOAP-2.1.7 - ------------------------------------------------------------------------- -r200 | lh3lh3 | 2009-04-02 10:14:56 -0400 (Thu, 02 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-20 - * check if file is truncated in flagstat - ------------------------------------------------------------------------- -r199 | lh3lh3 | 2009-04-02 10:09:10 -0400 (Thu, 02 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-19 - * print the header if requested - ------------------------------------------------------------------------- -r193 | lh3lh3 | 2009-03-27 11:09:50 -0400 (Fri, 27 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-18 - * fixed a minor bug reported by Nils Homer - ------------------------------------------------------------------------- -r185 | lh3lh3 | 2009-03-24 07:50:32 -0400 (Tue, 24 Mar 2009) | 2 lines -Changed paths: - A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184) - D /trunk/samtools/Makefile.std - A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184) - D /trunk/samtools/misc/Makefile.std - -rename Makefile.std as Makefile. GNU building systerm is not ready and may take some time... - ------------------------------------------------------------------------- -r184 | lh3lh3 | 2009-03-24 06:36:38 -0400 (Tue, 24 Mar 2009) | 4 lines -Changed paths: - D /trunk/samtools/Makefile - A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183) - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - D /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182) - M /trunk/samtools/samtools.1 - - * samtools-0.1.2-17 - * incorporating Nils' changes - * rename Makefile to Makefile.std and prepare to add the GNU building systerms (also by Nils) - ------------------------------------------------------------------------- -r183 | lh3lh3 | 2009-03-24 06:30:23 -0400 (Tue, 24 Mar 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - A /trunk/samtools/kstring.c - A /trunk/samtools/kstring.h - - * samtools-0.1.2-16 - * made pileup take a list of proposed indels. An insertion is N at the moment. - * added my kstring library for a bit complex parsing of the position list. - ------------------------------------------------------------------------- -r169 | lh3lh3 | 2009-03-12 09:40:14 -0400 (Thu, 12 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/soap2sam.pl - - * soap2sam.pl-0.1.2 - * more robust to truncated soap output - ------------------------------------------------------------------------- -r168 | lh3lh3 | 2009-03-11 06:49:00 -0400 (Wed, 11 Mar 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.lite - -added bam_stat.o to Makefile.lite - ------------------------------------------------------------------------- -r167 | lh3lh3 | 2009-03-10 18:11:31 -0400 (Tue, 10 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-15 - * generate RMS of mapQ instead of max mapQ - ------------------------------------------------------------------------- -r166 | lh3lh3 | 2009-03-10 18:06:45 -0400 (Tue, 10 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - M /trunk/samtools/glf.h - M /trunk/samtools/misc/Makefile - - * samtools-0.1.2-14 - * implemented GLFv3 - ------------------------------------------------------------------------- -r159 | lh3lh3 | 2009-03-03 06:26:08 -0500 (Tue, 03 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-13 - * fixed a minor bug in displaying pileup - ------------------------------------------------------------------------- -r158 | lh3lh3 | 2009-03-03 06:24:16 -0500 (Tue, 03 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-12 - * optionally print SAM header - ------------------------------------------------------------------------- -r153 | lh3lh3 | 2009-03-02 05:45:28 -0500 (Mon, 02 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - - * samtools-0.1.2-11 - * use "GLF\3" as the magic for GLFv3 files - ------------------------------------------------------------------------- -r152 | lh3lh3 | 2009-03-02 05:39:09 -0500 (Mon, 02 Mar 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - M /trunk/samtools/glf.h - - * samtools-0.1.2-10 - * fixed a bug in import: core.bin is undefined for unmapped reads - * this bug can be alleviated (not completely solved) in bam_index.c - * update to GLFv3: pos is changed to offset for better compression - ------------------------------------------------------------------------- -r151 | lh3lh3 | 2009-03-01 10:18:43 -0500 (Sun, 01 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - - * wgsim-0.2.3 - * fixed a bug in simulating indels - ------------------------------------------------------------------------- -r145 | lh3lh3 | 2009-02-26 14:43:57 -0500 (Thu, 26 Feb 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - - * wgsim-0.2.2 - * allow to print mismatch information as fastq comment. MAQ does - not like long read names. - ------------------------------------------------------------------------- -r141 | lh3lh3 | 2009-02-26 09:53:03 -0500 (Thu, 26 Feb 2009) | 6 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/misc/wgsim.c - M /trunk/samtools/misc/wgsim_eval.pl - - * wgsim-0.2.1 - * fixed a bug about color read coordinates - * fixed a bug in read names - * wgsim_eval.pl-0.1.3 - * make the script work with color reads - ------------------------------------------------------------------------- -r140 | lh3lh3 | 2009-02-26 09:02:57 -0500 (Thu, 26 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/Makefile - M /trunk/samtools/misc/wgsim.c - - * wgsim: added a note - ------------------------------------------------------------------------- -r139 | lh3lh3 | 2009-02-26 06:39:08 -0500 (Thu, 26 Feb 2009) | 7 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - M /trunk/samtools/misc/wgsim_eval.pl - - * wgsim-0.2.0 - * considerable code clean up - * print number of substitutions/indels/errors on each read - * potentially support SOLiD simulation, though not tested at the moment - * wgsim_eval.pl-0.1.2 - * change in accordant with wgsim - ------------------------------------------------------------------------- -r129 | lh3lh3 | 2009-02-18 17:23:27 -0500 (Wed, 18 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-9 - * fixed a bug in bam_fetch, caused by completely contained adjacent chunks - ------------------------------------------------------------------------- -r128 | bhandsaker | 2009-02-18 14:06:57 -0500 (Wed, 18 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -Fix annoying segv when invalid region specified. - ------------------------------------------------------------------------- -r127 | lh3lh3 | 2009-02-17 05:49:55 -0500 (Tue, 17 Feb 2009) | 2 lines -Changed paths: - D /trunk/samtools/misc/indel_filter.pl - A /trunk/samtools/misc/samtools.pl - - * move indel_filter.pl to samtools.pl - ------------------------------------------------------------------------- -r126 | lh3lh3 | 2009-02-14 16:22:30 -0500 (Sat, 14 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_mate.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-7 - * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP - ------------------------------------------------------------------------- -r125 | lh3lh3 | 2009-02-13 04:54:45 -0500 (Fri, 13 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-7 - * fixed a minor bug in flagstat - ------------------------------------------------------------------------- -r124 | lh3lh3 | 2009-02-12 06:15:32 -0500 (Thu, 12 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/indel_filter.pl - - * samtools-0.1.2-6 - * improve indel caller by setting maximum window size - ------------------------------------------------------------------------- -r123 | lh3lh3 | 2009-02-12 05:30:29 -0500 (Thu, 12 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * output max mapping quality in indel line - ------------------------------------------------------------------------- -r122 | lh3lh3 | 2009-02-11 05:59:10 -0500 (Wed, 11 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -fixed a bug in generating tag AM - ------------------------------------------------------------------------- -r121 | lh3lh3 | 2009-02-03 05:43:11 -0500 (Tue, 03 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - -fixed a potential memory problem in indexing - ------------------------------------------------------------------------- -r120 | bhandsaker | 2009-02-02 10:52:52 -0500 (Mon, 02 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Pass LIBS to recursive targets to facilitate building at Broad. - ------------------------------------------------------------------------- -r119 | lh3lh3 | 2009-02-02 05:12:15 -0500 (Mon, 02 Feb 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-3 - * fixed a bug in generating GLFv2 for indels - * improve flagstat report a little bit - ------------------------------------------------------------------------- -r118 | lh3lh3 | 2009-01-29 07:33:23 -0500 (Thu, 29 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-1 - * added flagstat command - ------------------------------------------------------------------------- -r116 | lh3lh3 | 2009-01-28 08:31:12 -0500 (Wed, 28 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release SAMtools-0.1.2 - ------------------------------------------------------------------------- -r115 | lh3lh3 | 2009-01-28 07:54:08 -0500 (Wed, 28 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/indel_filter.pl - -Script for filtering indel results - ------------------------------------------------------------------------- -r114 | lh3lh3 | 2009-01-25 06:45:37 -0500 (Sun, 25 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/zoom2sam.pl - -convert ZOOM to SAM - ------------------------------------------------------------------------- -r113 | lh3lh3 | 2009-01-24 09:25:07 -0500 (Sat, 24 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/novo2sam.pl - -add a script to convert novo alignment to SAM - ------------------------------------------------------------------------- -r112 | lh3lh3 | 2009-01-23 15:57:39 -0500 (Fri, 23 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/ChangeLog.old - M /trunk/samtools/samtools.1 - -update documentation and ChangeLog - ------------------------------------------------------------------------- -r111 | lh3lh3 | 2009-01-23 14:22:59 -0500 (Fri, 23 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.1-19 - * fixed a bug in "merge" command line - ------------------------------------------------------------------------- -r110 | lh3lh3 | 2009-01-22 10:36:48 -0500 (Thu, 22 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108) - M /trunk/samtools/misc/export2sam.pl - A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108) - A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108) - A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108) - - * merge from branches/dev/ - * all future development will happen here - ------------------------------------------------------------------------- -r109 | lh3lh3 | 2009-01-22 10:14:27 -0500 (Thu, 22 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/COPYING - M /trunk/samtools/ChangeLog - A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108) - M /trunk/samtools/Makefile - A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108) - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108) - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108) - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/examples/00README.txt - A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108) - D /trunk/samtools/examples/ex1.fa.fai - M /trunk/samtools/examples/ex1.sam.gz - M /trunk/samtools/faidx.c - A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108) - M /trunk/samtools/glf.h - M /trunk/samtools/misc/Makefile - M /trunk/samtools/misc/maq2sam.c - M /trunk/samtools/razf.c - M /trunk/samtools/source.dot - - * Merge from branches/dev/ - * all future development will happen here at trunk/ - ------------------------------------------------------------------------- -r79 | bhandsaker | 2009-01-07 16:42:15 -0500 (Wed, 07 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_tview.c - -Fix problem with compiling without curses. - ------------------------------------------------------------------------- -r63 | lh3lh3 | 2008-12-22 10:58:02 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /trunk/samtools (from /branches/dev/samtools:62) - -Create trunk copy - ------------------------------------------------------------------------- -r62 | lh3lh3 | 2008-12-22 10:55:13 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/NEWS - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - -Release samtools-0.1.1 - ------------------------------------------------------------------------- -r61 | lh3lh3 | 2008-12-22 10:46:08 -0500 (Mon, 22 Dec 2008) | 10 lines -Changed paths: - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-66 - * fixed a bug in razf.c: reset z_eof when razf_seek() is called - * fixed a memory leak in parsing a region - * changed pileup a little bit when -s is in use: output ^ and $ - * when a bam is not indexed, output more meaningful error message - * fixed a bug in indexing for small alignment - * fixed a bug in the viewer when we come to the end of a reference file - * updated documentation - * prepare to release 0.1.1 - ------------------------------------------------------------------------- -r60 | lh3lh3 | 2008-12-22 10:10:16 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/examples - A /branches/dev/samtools/examples/00README.txt - A /branches/dev/samtools/examples/ex1.fa - A /branches/dev/samtools/examples/ex1.fa.fai - A /branches/dev/samtools/examples/ex1.sam.gz - -example - ------------------------------------------------------------------------- -r59 | lh3lh3 | 2008-12-22 04:38:15 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - -update ChangeLog - ------------------------------------------------------------------------- -r58 | lh3lh3 | 2008-12-20 18:06:00 -0500 (Sat, 20 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/misc/export2sam.pl - - * added comments - * fixed several bugs - ------------------------------------------------------------------------- -r57 | lh3lh3 | 2008-12-20 10:44:20 -0500 (Sat, 20 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/misc/export2sam.pl - -convert Export format to SAM; not thoroughly tested - ------------------------------------------------------------------------- -r56 | lh3lh3 | 2008-12-19 17:13:28 -0500 (Fri, 19 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/source.dot - - * samtools-0.1.0-65 - * pileup: generate maq-like simple output - * pileup: allow to output pileup at required sites - * source.dot: source file relationship graph - * tview: fixed a minor bug - ------------------------------------------------------------------------- -r55 | lh3lh3 | 2008-12-19 15:10:26 -0500 (Fri, 19 Dec 2008) | 2 lines -Changed paths: - D /branches/dev/samtools/misc/all2sam.pl - -remove all2sam.pl - ------------------------------------------------------------------------- -r54 | lh3lh3 | 2008-12-16 17:34:25 -0500 (Tue, 16 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/COPYING - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/faidx.h - M /branches/dev/samtools/khash.h - M /branches/dev/samtools/kseq.h - M /branches/dev/samtools/ksort.h - M /branches/dev/samtools/samtools.1 - -Added copyright information and a bit more documentation. No code change. - ------------------------------------------------------------------------- -r53 | lh3lh3 | 2008-12-16 08:40:18 -0500 (Tue, 16 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-64 - * improved efficiency of the indel caller for spliced alignments - ------------------------------------------------------------------------- -r52 | lh3lh3 | 2008-12-16 05:28:20 -0500 (Tue, 16 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-63 - * a bit code cleanup: reduce the dependency between source files - ------------------------------------------------------------------------- -r51 | lh3lh3 | 2008-12-15 09:29:32 -0500 (Mon, 15 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-62 - * fixed a memory leak - ------------------------------------------------------------------------- -r50 | lh3lh3 | 2008-12-15 09:00:13 -0500 (Mon, 15 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/samtools.1 - -update documentation, ChangeLog and a comment - ------------------------------------------------------------------------- -r49 | lh3lh3 | 2008-12-15 08:36:43 -0500 (Mon, 15 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_maqcns.h - M /branches/dev/samtools/bam_pileup.c - A /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-61 - * moved pileup command to a separate source file - * added indel caller - * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) - * updated documentation - ------------------------------------------------------------------------- -r48 | lh3lh3 | 2008-12-12 08:55:36 -0500 (Fri, 12 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-60 - * fixed another bug in maqcns when there is a nearby deletion - ------------------------------------------------------------------------- -r47 | lh3lh3 | 2008-12-12 08:42:16 -0500 (Fri, 12 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-59 - * pileup: outputing consensus is now optional - * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, - I am not quite sure why the previous version may have problem. - ------------------------------------------------------------------------- -r46 | lh3lh3 | 2008-12-12 06:44:56 -0500 (Fri, 12 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-58 - * add maq consensus to pileup. However, I will move this part to a new - command as strictly speaking, consensus callin is not part of pileup, - and imposing it would make it harder to generate for other language - bindings. - ------------------------------------------------------------------------- -r45 | bhandsaker | 2008-12-11 15:43:56 -0500 (Thu, 11 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bgzf.c - -Fix bug in tell() after reads that consume to the exact end of a block. - ------------------------------------------------------------------------- -r44 | lh3lh3 | 2008-12-11 04:36:53 -0500 (Thu, 11 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/samtools.1 - -update manual - ------------------------------------------------------------------------- -r43 | lh3lh3 | 2008-12-11 04:25:36 -0500 (Thu, 11 Dec 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-57 - * fixed a bug in parser when there is auxiliary fields - * made the parser a bit more robust - ------------------------------------------------------------------------- -r42 | lh3lh3 | 2008-12-10 09:57:29 -0500 (Wed, 10 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - - * samtools-0.1.0-56 - * fixed a bug in bgzf (only reading is affected) - * fixed a typo in bam_index.c - * in bam_index.c, check potential bugs in the underlying I/O library - ------------------------------------------------------------------------- -r41 | lh3lh3 | 2008-12-10 07:53:08 -0500 (Wed, 10 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/samtools.1 - -update manual - ------------------------------------------------------------------------- -r40 | lh3lh3 | 2008-12-10 06:52:10 -0500 (Wed, 10 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-55 - * tried to make pileup work with clipping (previously not), though NOT tested - * removed -v from pileup - * made pileup take the reference sequence - ------------------------------------------------------------------------- -r39 | lh3lh3 | 2008-12-09 06:59:28 -0500 (Tue, 09 Dec 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-54 - * in parser, recognize "=", rather than ",", as a match - * in parser, correctl parse "=" at the MRNM field. - ------------------------------------------------------------------------- -r38 | lh3lh3 | 2008-12-09 06:39:07 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/maq2sam.c - -fixed a bug in handling maq flag 64 and 192 - ------------------------------------------------------------------------- -r37 | lh3lh3 | 2008-12-09 04:53:46 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -also calculate unordered md5sum check - ------------------------------------------------------------------------- -r36 | lh3lh3 | 2008-12-09 04:46:21 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -fixed a minor bug when there are space in the sequence - ------------------------------------------------------------------------- -r35 | lh3lh3 | 2008-12-09 04:40:45 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -fixed a potential memory leak - ------------------------------------------------------------------------- -r34 | lh3lh3 | 2008-12-08 09:52:17 -0500 (Mon, 08 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - - * fixed a bug in import: bin is wrongly calculated - ------------------------------------------------------------------------- -r33 | lh3lh3 | 2008-12-08 09:08:01 -0500 (Mon, 08 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/all2sam.pl - -nothing, really - ------------------------------------------------------------------------- -r32 | lh3lh3 | 2008-12-08 07:56:02 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/kseq.h - M /branches/dev/samtools/misc/Makefile - A /branches/dev/samtools/misc/md5.c - A /branches/dev/samtools/misc/md5.h - A /branches/dev/samtools/misc/md5fa.c - - * fixed two warnings in kseq.h - * added md5sum utilities - ------------------------------------------------------------------------- -r31 | lh3lh3 | 2008-12-08 06:35:29 -0500 (Mon, 08 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/kseq.h - D /branches/dev/samtools/kstream.h - - * samtools-0.1.0-52 - * replace kstream with kseq. kseq is a superset of kstream. I need the - extra functions in kseq.h. - * also compile stand-alone faidx - ------------------------------------------------------------------------- -r30 | lh3lh3 | 2008-12-08 06:17:04 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_sort.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-51 - * sorting by read names is available - ------------------------------------------------------------------------- -r29 | lh3lh3 | 2008-12-08 05:29:02 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bam_sort.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/misc/maq2sam.c - - * samtools-0.1.0-50 - * format change to meet the latest specification - ------------------------------------------------------------------------- -r28 | lh3lh3 | 2008-12-04 11:09:21 -0500 (Thu, 04 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/misc/maq2sam.c - - * minor change in maqcns: special care when n==0 - * change maq2sam to meet the latest specification - ------------------------------------------------------------------------- -r27 | lh3lh3 | 2008-12-04 10:55:44 -0500 (Thu, 04 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/razf.h - -considerable code clean up in razf - ------------------------------------------------------------------------- -r26 | lh3lh3 | 2008-12-04 10:08:18 -0500 (Thu, 04 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/faidx.c - -make RAZF optional in faidx.c - ------------------------------------------------------------------------- -r25 | lh3lh3 | 2008-12-01 10:27:22 -0500 (Mon, 01 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-49 - * added routines for retrieving aux data, NOT TESTED YET! - ------------------------------------------------------------------------- -r24 | lh3lh3 | 2008-12-01 09:29:43 -0500 (Mon, 01 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-48 - * bgzf: fixed a potential integer overflow on 32-it machines - * maqcns: set the minimum combined quality as 0 - * supporting hex strings - ------------------------------------------------------------------------- -r23 | lh3lh3 | 2008-11-27 12:14:37 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-47 - * fixed the bug in maqcns - ------------------------------------------------------------------------- -r22 | lh3lh3 | 2008-11-27 12:08:11 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - A /branches/dev/samtools/bam_maqcns.c - A /branches/dev/samtools/bam_maqcns.h - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/glf.h - - * samtools-0.1.0-46 - * add MAQ consensus caller, currently BUGGY! - ------------------------------------------------------------------------- -r21 | lh3lh3 | 2008-11-27 08:51:28 -0500 (Thu, 27 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-45 - * tview: display padded alignment (but not P operation) - * better coordinates and reference sequence - ------------------------------------------------------------------------- -r19 | lh3lh3 | 2008-11-27 04:26:05 -0500 (Thu, 27 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/ChangeLog - -new ChangeLog - ------------------------------------------------------------------------- -r18 | lh3lh3 | 2008-11-27 04:24:45 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - D /branches/dev/samtools/ChangeLog - A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) - -Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from -the log of my personal SVN repository. - ------------------------------------------------------------------------- -r17 | lh3lh3 | 2008-11-27 04:22:55 -0500 (Thu, 27 Nov 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - - * samtools-0.1.0-44 - * declare fseeko and ftello as some Linux may not do this by default and - missing these declarations will make bgzf buggy - * get rid of some harmless warings - * use BGZF by default, now - ------------------------------------------------------------------------- -r16 | lh3lh3 | 2008-11-26 16:19:11 -0500 (Wed, 26 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/razf.c - - * samtools-0.1.0-43 - * fixed a bug in razf_read() - * give more warnings when the file is truncated (or due to bugs in I/O library) - ------------------------------------------------------------------------- -r15 | lh3lh3 | 2008-11-26 15:41:39 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bgzf.c - -fixed a bug in bgzf.c at the end of the file - ------------------------------------------------------------------------- -r14 | lh3lh3 | 2008-11-26 12:05:18 -0500 (Wed, 26 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-42 - * a lot happened to RAZF, although samtools itself is untouched. Better - also update the version number anyway to avoid confusion - ------------------------------------------------------------------------- -r13 | lh3lh3 | 2008-11-26 12:03:48 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - -a change from Jue, but I think it should not matter - ------------------------------------------------------------------------- -r12 | lh3lh3 | 2008-11-26 11:48:14 -0500 (Wed, 26 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/razf.c - -fixed a potential bug in razf. However, it seems still buggy, just -rarely happens, very rarely. - ------------------------------------------------------------------------- -r11 | lh3lh3 | 2008-11-26 09:02:56 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - -fixed a bug in razf, with the help of Jue - ------------------------------------------------------------------------- -r10 | lh3lh3 | 2008-11-26 06:55:32 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - -remove a comment - ------------------------------------------------------------------------- -r9 | lh3lh3 | 2008-11-26 06:37:05 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/razf.h - - * Jue has updated razf to realize Bob's scheme - ------------------------------------------------------------------------- -r7 | lh3lh3 | 2008-11-25 15:37:37 -0500 (Tue, 25 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/samtools.1 - -the manual page - ------------------------------------------------------------------------- -r6 | lh3lh3 | 2008-11-25 15:37:16 -0500 (Tue, 25 Nov 2008) | 3 lines -Changed paths: - A /branches/dev/samtools/ChangeLog - A /branches/dev/samtools/Makefile - A /branches/dev/samtools/bam.c - A /branches/dev/samtools/bam.h - A /branches/dev/samtools/bam_aux.c - A /branches/dev/samtools/bam_endian.h - A /branches/dev/samtools/bam_import.c - A /branches/dev/samtools/bam_index.c - A /branches/dev/samtools/bam_lpileup.c - A /branches/dev/samtools/bam_pileup.c - A /branches/dev/samtools/bam_sort.c - A /branches/dev/samtools/bam_tview.c - A /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/bgzf.c - A /branches/dev/samtools/bgzf.h - A /branches/dev/samtools/bgzip.c - A /branches/dev/samtools/faidx.c - A /branches/dev/samtools/faidx.h - A /branches/dev/samtools/khash.h - A /branches/dev/samtools/ksort.h - A /branches/dev/samtools/kstream.h - A /branches/dev/samtools/misc - A /branches/dev/samtools/misc/Makefile - A /branches/dev/samtools/misc/all2sam.pl - A /branches/dev/samtools/misc/maq2sam.c - A /branches/dev/samtools/razf.c - A /branches/dev/samtools/razf.h - A /branches/dev/samtools/razip.c - A /branches/dev/samtools/zutil.h - -The initial version of samtools, replicated from my local SVN repository. -The current version is: 0.1.0-42. All future development will happen here. - ------------------------------------------------------------------------- -r5 | lh3lh3 | 2008-11-25 15:30:49 -0500 (Tue, 25 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools - -samtools (C version) - ------------------------------------------------------------------------- diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/COPYING tophat-2.1.1+dfsg1/src/samtools-0.1.18/COPYING --- tophat-2.1.1+dfsg/src/samtools-0.1.18/COPYING 2016-02-14 18:21:17.372079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/COPYING 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) 2008-2009 Genome Research Ltd. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/cut_target.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/cut_target.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/cut_target.c 2016-02-14 18:21:17.550079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/cut_target.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,193 +0,0 @@ -#include -#include -#include -#include "bam.h" -#include "errmod.h" -#include "faidx.h" - -#define ERR_DEP 0.83f - -typedef struct { - int e[2][3], p[2][2]; -} score_param_t; - -/* Note that although the two matrics have 10 parameters in total, only 4 - * (probably 3) are free. Changing the scoring matrices in a sort of symmetric - * way will not change the result. */ -static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; - -typedef struct { - int min_baseQ, tid, max_bases; - uint16_t *bases; - bamFile fp; - bam_header_t *h; - char *ref; - faidx_t *fai; - errmod_t *em; -} ct_t; - -static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) -{ - int i, j, ret, tmp, k, sum[4], qual; - float q[16]; - if (n > g->max_bases) { // enlarge g->bases - g->max_bases = n; - kroundup32(g->max_bases); - g->bases = realloc(g->bases, g->max_bases * 2); - } - for (i = k = 0; i < n; ++i) { - const bam_pileup1_t *p = plp + i; - uint8_t *seq; - int q, baseQ, b; - if (p->is_refskip || p->is_del) continue; - baseQ = bam1_qual(p->b)[p->qpos]; - if (baseQ < g->min_baseQ) continue; - seq = bam1_seq(p->b); - b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; - if (b > 3) continue; - q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; - if (q < 4) q = 4; - if (q > 63) q = 63; - g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; - } - if (k == 0) return 0; - errmod_cal(g->em, k, 4, g->bases, q); - for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; - for (i = 1; i < 4; ++i) // insertion sort - for (j = i; j > 0 && sum[j] < sum[j-1]; --j) - tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; - qual = (sum[1]>>2) - (sum[0]>>2); - k = k < 256? k : 255; - ret = (qual < 63? qual : 63) << 2 | (sum[0]&3); - return ret<<8|k; -} - -static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) -{ - int i, f[2][2], *prev, *curr, *swap_tmp, s; - uint8_t *b; // backtrack array - b = calloc(l, 1); - f[0][0] = f[0][1] = 0; - prev = f[0]; curr = f[1]; - // fill the backtrack matrix - for (i = 0; i < l; ++i) { - int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2; - int tmp0, tmp1; - // compute f[0] - tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) - tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) - if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; - else curr[0] = tmp1, b[i] = 1; - // compute f[1] - tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) - tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) - if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; - else curr[1] = tmp1, b[i] |= 1<<1; - // swap - swap_tmp = prev; prev = curr; curr = swap_tmp; - } - // backtrack - s = prev[0] > prev[1]? 0 : 1; - for (i = l - 1; i > 0; --i) { - b[i] |= s<<2; - s = b[i]>>s&1; - } - // print - for (i = 0, s = -1; i <= l; ++i) { - if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { - if (s >= 0) { - int j; - printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); - for (j = s; j < i; ++j) { - int c = cns[j]>>8; - if (c == 0) putchar('N'); - else putchar("ACGT"[c&3]); - } - putchar('\t'); - for (j = s; j < i; ++j) - putchar(33 + (cns[j]>>8>>2)); - putchar('\n'); - } - //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); - s = -1; - } else if ((b[i]>>2&3) && s < 0) s = i; - } - free(b); -} - -static int read_aln(void *data, bam1_t *b) -{ - extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); - ct_t *g = (ct_t*)data; - int ret, len; - ret = bam_read1(g->fp, b); - if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { - if (b->core.tid != g->tid) { // then load the sequence - free(g->ref); - g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); - g->tid = b->core.tid; - } - bam_prob_realn_core(b, g->ref, 1<<1|1); - } - return ret; -} - -int main_cut_target(int argc, char *argv[]) -{ - int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; - const bam_pileup1_t *p; - bam_plp_t plp; - uint16_t *cns; - ct_t g; - - memset(&g, 0, sizeof(ct_t)); - g.min_baseQ = 13; g.tid = -1; - while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { - switch (c) { - case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff - case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY - case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE - case '1': g_param.e[1][1] = atoi(optarg); break; - case '2': g_param.e[1][2] = atoi(optarg); break; - case 'f': g.fai = fai_load(optarg); - if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); - break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] \n"); - return 1; - } - l = max_l = 0; cns = 0; - g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); - g.h = bam_header_read(g.fp); - g.em = errmod_init(1 - ERR_DEP); - plp = bam_plp_init(read_aln, &g); - while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { - if (tid < 0) break; - if (tid != lasttid) { // change of chromosome - if (cns) process_cns(g.h, lasttid, l, cns); - if (max_l < g.h->target_len[tid]) { - max_l = g.h->target_len[tid]; - kroundup32(max_l); - cns = realloc(cns, max_l * 2); - } - l = g.h->target_len[tid]; - memset(cns, 0, max_l * 2); - lasttid = tid; - } - cns[pos] = gencns(&g, n, p); - lastpos = pos; - } - process_cns(g.h, lasttid, l, cns); - free(cns); - bam_header_destroy(g.h); - bam_plp_destroy(plp); - bam_close(g.fp); - if (g.fai) { - fai_destroy(g.fai); free(g.ref); - } - errmod_destroy(g.em); - free(g.bases); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/errmod.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/errmod.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/errmod.c 2016-02-14 18:21:17.559079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/errmod.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,130 +0,0 @@ -#include -#include "errmod.h" -#include "ksort.h" -KSORT_INIT_GENERIC(uint16_t) - -typedef struct __errmod_coef_t { - double *fk, *beta, *lhet; -} errmod_coef_t; - -typedef struct { - double fsum[16], bsum[16]; - uint32_t c[16]; -} call_aux_t; - -static errmod_coef_t *cal_coef(double depcorr, double eta) -{ - int k, n, q; - long double sum, sum1; - double *lC; - errmod_coef_t *ec; - - ec = calloc(1, sizeof(errmod_coef_t)); - // initialize ->fk - ec->fk = (double*)calloc(256, sizeof(double)); - ec->fk[0] = 1.0; - for (n = 1; n != 256; ++n) - ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta; - // initialize ->coef - ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); - lC = (double*)calloc(256 * 256, sizeof(double)); - for (n = 1; n != 256; ++n) { - double lgn = lgamma(n+1); - for (k = 1; k <= n; ++k) - lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1); - } - for (q = 1; q != 64; ++q) { - double e = pow(10.0, -q/10.0); - double le = log(e); - double le1 = log(1.0 - e); - for (n = 1; n <= 255; ++n) { - double *beta = ec->beta + (q<<16|n<<8); - sum1 = sum = 0.0; - for (k = n; k >= 0; --k, sum1 = sum) { - sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); - beta[k] = -10. / M_LN10 * logl(sum1 / sum); - } - } - } - // initialize ->lhet - ec->lhet = (double*)calloc(256 * 256, sizeof(double)); - for (n = 0; n < 256; ++n) - for (k = 0; k < 256; ++k) - ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; - free(lC); - return ec; -} - -errmod_t *errmod_init(float depcorr) -{ - errmod_t *em; - em = (errmod_t*)calloc(1, sizeof(errmod_t)); - em->depcorr = depcorr; - em->coef = cal_coef(depcorr, 0.03); - return em; -} - -void errmod_destroy(errmod_t *em) -{ - if (em == 0) return; - free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta); - free(em->coef); free(em); -} -// qual:6, strand:1, base:4 -int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) -{ - call_aux_t aux; - int i, j, k, w[32]; - - if (m > m) return -1; - memset(q, 0, m * m * sizeof(float)); - if (n == 0) return 0; - // calculate aux.esum and aux.fsum - if (n > 255) { // then sample 255 bases - ks_shuffle(uint16_t, n, bases); - n = 255; - } - ks_introsort(uint16_t, n, bases); - memset(w, 0, 32 * sizeof(int)); - memset(&aux, 0, sizeof(call_aux_t)); - for (j = n - 1; j >= 0; --j) { // calculate esum and fsum - uint16_t b = bases[j]; - int q = b>>5 < 4? 4 : b>>5; - if (q > 63) q = 63; - k = b&0x1f; - aux.fsum[k&0xf] += em->coef->fk[w[k]]; - aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; - ++aux.c[k&0xf]; - ++w[k]; - } - // generate likelihood - for (j = 0; j != m; ++j) { - float tmp1, tmp3; - int tmp2, bar_e; - // homozygous - for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) { - if (k == j) continue; - tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; - } - if (tmp2) { - bar_e = (int)(tmp1 / tmp3 + 0.499); - if (bar_e > 63) bar_e = 63; - q[j*m+j] = tmp1; - } - // heterozygous - for (k = j + 1; k < m; ++k) { - int cjk = aux.c[j] + aux.c[k]; - for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { - if (i == j || i == k) continue; - tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; - } - if (tmp2) { - bar_e = (int)(tmp1 / tmp3 + 0.499); - if (bar_e > 63) bar_e = 63; - q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; - } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k - } - for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/errmod.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/errmod.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/errmod.h 2016-02-14 18:21:17.568079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/errmod.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -#ifndef ERRMOD_H -#define ERRMOD_H - -#include - -struct __errmod_coef_t; - -typedef struct { - double depcorr; - struct __errmod_coef_t *coef; -} errmod_t; - -errmod_t *errmod_init(float depcorr); -void errmod_destroy(errmod_t *em); - -/* - n: number of bases - m: maximum base - bases[i]: qual:6, strand:1, base:4 - q[i*m+j]: phred-scaled likelihood of (i,j) - */ -int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/faidx.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/faidx.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/faidx.c 2016-02-14 18:21:17.636079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/faidx.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,432 +0,0 @@ -#include -#include -#include -#include -#include -#include "faidx.h" -#include "khash.h" - -typedef struct { - int32_t line_len, line_blen; - int64_t len; - uint64_t offset; -} faidx1_t; -KHASH_MAP_INIT_STR(s, faidx1_t) - -#ifndef _NO_RAZF -#include "razf.h" -#else -#ifdef _WIN32 -#define ftello(fp) ftell(fp) -#define fseeko(fp, offset, whence) fseek(fp, offset, whence) -#else -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); -#endif -#define RAZF FILE -#define razf_read(fp, buf, size) fread(buf, 1, size, fp) -#define razf_open(fn, mode) fopen(fn, mode) -#define razf_close(fp) fclose(fp) -#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) -#define razf_tell(fp) ftello(fp) -#endif -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -struct __faidx_t { - RAZF *rz; - int n, m; - char **name; - khash_t(s) *hash; -}; - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) -{ - khint_t k; - int ret; - faidx1_t t; - if (idx->n == idx->m) { - idx->m = idx->m? idx->m<<1 : 16; - idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); - } - idx->name[idx->n] = strdup(name); - k = kh_put(s, idx->hash, idx->name[idx->n], &ret); - t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; - kh_value(idx->hash, k) = t; - ++idx->n; -} - -faidx_t *fai_build_core(RAZF *rz) -{ - char c, *name; - int l_name, m_name, ret; - int line_len, line_blen, state; - int l1, l2; - faidx_t *idx; - uint64_t offset; - int64_t len; - - idx = (faidx_t*)calloc(1, sizeof(faidx_t)); - idx->hash = kh_init(s); - name = 0; l_name = m_name = 0; - len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; - while (razf_read(rz, &c, 1)) { - if (c == '\n') { // an empty line - if (state == 1) { - offset = razf_tell(rz); - continue; - } else if ((state == 0 && len < 0) || state == 2) continue; - } - if (c == '>') { // fasta header - if (len >= 0) - fai_insert_index(idx, name, len, line_len, line_blen, offset); - l_name = 0; - while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { - if (m_name < l_name + 2) { - m_name = l_name + 2; - kroundup32(m_name); - name = (char*)realloc(name, m_name); - } - name[l_name++] = c; - } - name[l_name] = '\0'; - if (ret == 0) { - fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); - free(name); fai_destroy(idx); - return 0; - } - if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); - state = 1; len = 0; - offset = razf_tell(rz); - } else { - if (state == 3) { - fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); - free(name); fai_destroy(idx); - return 0; - } - if (state == 2) state = 3; - l1 = l2 = 0; - do { - ++l1; - if (isgraph(c)) ++l2; - } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); - if (state == 3 && l2) { - fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); - free(name); fai_destroy(idx); - return 0; - } - ++l1; len += l2; - if (state == 1) line_len = l1, line_blen = l2, state = 0; - else if (state == 0) { - if (l1 != line_len || l2 != line_blen) state = 2; - } - } - } - fai_insert_index(idx, name, len, line_len, line_blen, offset); - free(name); - return idx; -} - -void fai_save(const faidx_t *fai, FILE *fp) -{ - khint_t k; - int i; - for (i = 0; i < fai->n; ++i) { - faidx1_t x; - k = kh_get(s, fai->hash, fai->name[i]); - x = kh_value(fai->hash, k); -#ifdef _WIN32 - fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); -#else - fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); -#endif - } -} - -faidx_t *fai_read(FILE *fp) -{ - faidx_t *fai; - char *buf, *p; - int len, line_len, line_blen; -#ifdef _WIN32 - long offset; -#else - long long offset; -#endif - fai = (faidx_t*)calloc(1, sizeof(faidx_t)); - fai->hash = kh_init(s); - buf = (char*)calloc(0x10000, 1); - while (!feof(fp) && fgets(buf, 0x10000, fp)) { - for (p = buf; *p && isgraph(*p); ++p); - *p = 0; ++p; -#ifdef _WIN32 - sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); -#else - sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); -#endif - fai_insert_index(fai, buf, len, line_len, line_blen, offset); - } - free(buf); - return fai; -} - -void fai_destroy(faidx_t *fai) -{ - int i; - for (i = 0; i < fai->n; ++i) free(fai->name[i]); - free(fai->name); - kh_destroy(s, fai->hash); - if (fai->rz) razf_close(fai->rz); - free(fai); -} - -int fai_build(const char *fn) -{ - char *str; - RAZF *rz; - FILE *fp; - faidx_t *fai; - str = (char*)calloc(strlen(fn) + 5, 1); - sprintf(str, "%s.fai", fn); - rz = razf_open(fn, "r"); - if (rz == 0) { - fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); - free(str); - return -1; - } - fai = fai_build_core(rz); - razf_close(rz); - fp = fopen(str, "wb"); - if (fp == 0) { - fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); - fai_destroy(fai); free(str); - return -1; - } - fai_save(fai, fp); - fclose(fp); - free(str); - fai_destroy(fai); - return 0; -} - -#ifdef _USE_KNETFILE -FILE *download_and_open(const char *fn) -{ - const int buf_size = 1 * 1024 * 1024; - uint8_t *buf; - FILE *fp; - knetFile *fp_remote; - const char *url = fn; - const char *p; - int l = strlen(fn); - for (p = fn + l - 1; p >= fn; --p) - if (*p == '/') break; - fn = p + 1; - - // First try to open a local copy - fp = fopen(fn, "r"); - if (fp) - return fp; - - // If failed, download from remote and open - fp_remote = knet_open(url, "rb"); - if (fp_remote == 0) { - fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); - return NULL; - } - if ((fp = fopen(fn, "wb")) == 0) { - fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); - knet_close(fp_remote); - return NULL; - } - buf = (uint8_t*)calloc(buf_size, 1); - while ((l = knet_read(fp_remote, buf, buf_size)) != 0) - fwrite(buf, 1, l, fp); - free(buf); - fclose(fp); - knet_close(fp_remote); - - return fopen(fn, "r"); -} -#endif - -faidx_t *fai_load(const char *fn) -{ - char *str; - FILE *fp; - faidx_t *fai; - str = (char*)calloc(strlen(fn) + 5, 1); - sprintf(str, "%s.fai", fn); - -#ifdef _USE_KNETFILE - if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) - { - fp = download_and_open(str); - if ( !fp ) - { - fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); - free(str); - return 0; - } - } - else -#endif - fp = fopen(str, "rb"); - if (fp == 0) { - fprintf(stderr, "[fai_load] build FASTA index.\n"); - fai_build(fn); - fp = fopen(str, "rb"); - if (fp == 0) { - fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); - free(str); - return 0; - } - } - - fai = fai_read(fp); - fclose(fp); - - fai->rz = razf_open(fn, "rb"); - free(str); - if (fai->rz == 0) { - fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); - return 0; - } - return fai; -} - -char *fai_fetch(const faidx_t *fai, const char *str, int *len) -{ - char *s, c; - int i, l, k, name_end; - khiter_t iter; - faidx1_t val; - khash_t(s) *h; - int beg, end; - - beg = end = -1; - h = fai->hash; - name_end = l = strlen(str); - s = (char*)malloc(l+1); - // remove space - for (i = k = 0; i < l; ++i) - if (!isspace(str[i])) s[k++] = str[i]; - s[k] = 0; l = k; - // determine the sequence name - for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end - if (i >= 0) name_end = i; - if (name_end < l) { // check if this is really the end - int n_hyphen = 0; - for (i = name_end + 1; i < l; ++i) { - if (s[i] == '-') ++n_hyphen; - else if (!isdigit(s[i]) && s[i] != ',') break; - } - if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name - s[name_end] = 0; - iter = kh_get(s, h, s); - if (iter == kh_end(h)) { // cannot find the sequence name - iter = kh_get(s, h, str); // try str as the name - if (iter == kh_end(h)) { - *len = 0; - free(s); return 0; - } else s[name_end] = ':', name_end = l; - } - } else iter = kh_get(s, h, str); - val = kh_value(h, iter); - // parse the interval - if (name_end < l) { - for (i = k = name_end + 1; i < l; ++i) - if (s[i] != ',') s[k++] = s[i]; - s[k] = 0; - beg = atoi(s + name_end + 1); - for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; - end = i < k? atoi(s + i + 1) : val.len; - if (beg > 0) --beg; - } else beg = 0, end = val.len; - if (beg >= val.len) beg = val.len; - if (end >= val.len) end = val.len; - if (beg > end) beg = end; - free(s); - - // now retrieve the sequence - l = 0; - s = (char*)malloc(end - beg + 2); - razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); - while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) - if (isgraph(c)) s[l++] = c; - s[l] = '\0'; - *len = l; - return s; -} - -int faidx_main(int argc, char *argv[]) -{ - if (argc == 1) { - fprintf(stderr, "Usage: faidx [ [...]]\n"); - return 1; - } else { - if (argc == 2) fai_build(argv[1]); - else { - int i, j, k, l; - char *s; - faidx_t *fai; - fai = fai_load(argv[1]); - if (fai == 0) return 1; - for (i = 2; i != argc; ++i) { - printf(">%s\n", argv[i]); - s = fai_fetch(fai, argv[i], &l); - for (j = 0; j < l; j += 60) { - for (k = 0; k < 60 && k < l - j; ++k) - putchar(s[j + k]); - putchar('\n'); - } - free(s); - } - fai_destroy(fai); - } - } - return 0; -} - -int faidx_fetch_nseq(const faidx_t *fai) -{ - return fai->n; -} - -char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) -{ - int l; - char c; - khiter_t iter; - faidx1_t val; - char *seq=NULL; - - // Adjust position - iter = kh_get(s, fai->hash, c_name); - if(iter == kh_end(fai->hash)) return 0; - val = kh_value(fai->hash, iter); - if(p_end_i < p_beg_i) p_beg_i = p_end_i; - if(p_beg_i < 0) p_beg_i = 0; - else if(val.len <= p_beg_i) p_beg_i = val.len - 1; - if(p_end_i < 0) p_end_i = 0; - else if(val.len <= p_end_i) p_end_i = val.len - 1; - - // Now retrieve the sequence - l = 0; - seq = (char*)malloc(p_end_i - p_beg_i + 2); - razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); - while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) - if (isgraph(c)) seq[l++] = c; - seq[l] = '\0'; - *len = l; - return seq; -} - -#ifdef FAIDX_MAIN -int main(int argc, char *argv[]) { return faidx_main(argc, argv); } -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/faidx.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/faidx.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/faidx.h 2016-02-14 18:21:17.644079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/faidx.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,103 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef FAIDX_H -#define FAIDX_H - -/*! - @header - - Index FASTA files and extract subsequence. - - @copyright The Wellcome Trust Sanger Institute. - */ - -struct __faidx_t; -typedef struct __faidx_t faidx_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /*! - @abstract Build index for a FASTA or razip compressed FASTA file. - @param fn FASTA file name - @return 0 on success; or -1 on failure - @discussion File "fn.fai" will be generated. - */ - int fai_build(const char *fn); - - /*! - @abstract Distroy a faidx_t struct. - @param fai Pointer to the struct to be destroyed - */ - void fai_destroy(faidx_t *fai); - - /*! - @abstract Load index from "fn.fai". - @param fn File name of the FASTA file - */ - faidx_t *fai_load(const char *fn); - - /*! - @abstract Fetch the sequence in a region. - @param fai Pointer to the faidx_t struct - @param reg Region in the format "chr2:20,000-30,000" - @param len Length of the region - @return Pointer to the sequence; null on failure - - @discussion The returned sequence is allocated by malloc family - and should be destroyed by end users by calling free() on it. - */ - char *fai_fetch(const faidx_t *fai, const char *reg, int *len); - - /*! - @abstract Fetch the number of sequences. - @param fai Pointer to the faidx_t struct - @return The number of sequences - */ - int faidx_fetch_nseq(const faidx_t *fai); - - /*! - @abstract Fetch the sequence in a region. - @param fai Pointer to the faidx_t struct - @param c_name Region name - @param p_beg_i Beginning position number (zero-based) - @param p_end_i End position number (zero-based) - @param len Length of the region - @return Pointer to the sequence; null on failure - - @discussion The returned sequence is allocated by malloc family - and should be destroyed by end users by calling free() on it. - */ - char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/INSTALL tophat-2.1.1+dfsg1/src/samtools-0.1.18/INSTALL --- tophat-2.1.1+dfsg/src/samtools-0.1.18/INSTALL 2016-02-14 18:21:17.375079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/INSTALL 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -System Requirements -=================== - -SAMtools depends on the zlib library . Version 1.2.3+ is -preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA -file. SAMtools' faidx is able to index a razip-compressed FASTA file to save -diskspace. Older zlib also works with SAMtools, but razip cannot be compiled. - -The text-based viewer (tview) requires the GNU ncurses library -, which comes with Mac OS X and most of -the modern Linux/Unix distributions. If you do not have this library installed, -you can still compile the rest of SAMtools by manually changing: -`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and -comment out the line starting with `LIBCURSES='. - - -Compilation -=========== - -Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile -razip with `make razip'. - - -Installation -============ - -Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to -a location you want (e.g. a directory in your $PATH). You may also copy -`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such -that the `man' command may find the manual. diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kaln.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/kaln.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kaln.c 2016-02-14 18:21:17.654079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kaln.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,486 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include "kaln.h" - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 - -typedef struct { - int i, j; - unsigned char ctype; -} path_t; - -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -int aln_sm_qual[] = { - 0, -23, -23, -23, 0, - -23, 0, -23, -23, 0, - -23, -23, 0, -23, 0, - -23, -23, -23, 0, 0, - 0, 0, 0, 0, 0 -}; - -ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; -ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; - -ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; - -static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)calloc(*n_cigar, 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end_ext; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end_ext >= 0) { \ - if ((p)->M - gap_end_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_end_open - gap_end_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end_ext; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct { - uint8_t Mt:3, It:2, Dt:3; -} dpcell_t; - -typedef struct { - int M, I, D; -} dpscore_t; - -/*************************** - * banded global alignment * - ***************************/ -uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) -{ - int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - int b1, b2, tmp_end; - int *mat, end, max = 0; - uint8_t type, ctype; - uint32_t *cigar = 0; - - int gap_open, gap_ext, gap_end_open, gap_end_ext, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end_open = ap->gap_end_open; - gap_end_ext = ap->gap_end_ext; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (n_cigar) *n_cigar = 0; - if (len1 == 0 || len2 == 0) return 0; - - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - *_score = last[len1].M; - if (n_cigar) { /* backtrace */ - path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - cigar = ka_path2cigar32(path, p - path - 1, n_cigar); - free(path); - } - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return cigar; -} - -typedef struct { - int M, I, D; -} score_aux_t; - -#define MINUS_INF -0x40000000 - -// matrix: len2 rows and len1 columns -int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) -{ - -#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ - int t1, t2; \ - score_aux_t *_q; \ - _q = _q0; \ - _p->M = _q->M >= _q->I? _q->M : _q->I; \ - _p->M = _p->M >= _q->D? _p->M : _q->D; \ - _p->M += (_sc); \ - ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ - _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ - } - - int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; - const uint8_t *seq1, *seq2; - score_aux_t *curr, *last, *swap; - bw = abs(len1 - len2) + ap->band_width; - i = len1 > len2? len1 : len2; - if (bw > i + 1) bw = i + 1; - seq1 = _seq1 - 1; seq2 = _seq2 - 1; - curr = calloc(len1 + 2, sizeof(score_aux_t)); - last = calloc(len1 + 2, sizeof(score_aux_t)); - { // the zero-th row - int x, end = len1; - score_aux_t *p; - j = 0; - x = j + bw; end = len1 < x? len1 : x; // band end - p = curr; - p->M = 0; p->I = p->D = MINUS_INF; - for (i = 1, p = &curr[1]; i <= end; ++i, ++p) - p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); - p->M = p->I = p->D = MINUS_INF; - swap = curr; curr = last; last = swap; - } - for (j = 1; j < len2; ++j) { - int x, beg = 0, end = len1, *scrow, col_end; - score_aux_t *p; - x = j - bw; beg = 0 > x? 0 : x; // band start - x = j + bw; end = len1 < x? len1 : x; // band end - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - if (end == len1) col_end = 1, --end; - else col_end = 0; - for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); - if (col_end) { - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); - ++p; - } - p->M = p->I = p->D = MINUS_INF; -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - swap = curr; curr = last; last = swap; - } - { // the last row - int x, beg = 0, *scrow; - score_aux_t *p; - j = len2; - x = j - bw; beg = 0 > x? 0 : x; // band start - if (beg == 0) { // from zero-th column - p = curr; - p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); - ++beg; // then beg = 1 - } - scrow = scmat + seq2[j] * scmat_size; - for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); - __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); -// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); - } - ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; - ret = ret >= curr[len1].D? ret : curr[len1].D; - free(curr); free(last); - return ret; -} - -#ifdef _MAIN -int main(int argc, char *argv[]) -{ -// int len1 = 35, len2 = 35; -// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; -// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; - int len1 = 4, len2 = 4; - uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; - uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; - int sc; -// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); - sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); - printf("%d\n", sc); - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kaln.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/kaln.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kaln.h 2016-02-14 18:21:17.662079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kaln.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,67 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef LH3_KALN_H_ -#define LH3_KALN_H_ - -#include - -#define MINOR_INF -1073741823 - -typedef struct { - int gap_open; - int gap_ext; - int gap_end_open; - int gap_end_ext; - - int *matrix; - int row; - int band_width; -} ka_param_t; - -typedef struct { - int iio, iie, ido, ide; - int eio, eie, edo, ede; - int *matrix; - int row; - int band_width; -} ka_param2_t; - -#ifdef __cplusplus -extern "C" { -#endif - - uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, - int *_score, int *n_cigar); - int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); -#ifdef __cplusplus -} -#endif - -extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ -extern ka_param_t ka_param_qual; // only use this for global alignment!!! -extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/khash.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/khash.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/khash.h 2016-02-14 18:21:17.663079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/khash.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,528 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "khash.h" -KHASH_MAP_INIT_INT(32, char) -int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; -} -*/ - -/* - 2011-02-14 (0.2.5): - - * Allow to declare global functions. - - 2009-09-26 (0.2.4): - - * Improve portability - - 2008-09-19 (0.2.3): - - * Corrected the example - * Improved interfaces - - 2008-09-11 (0.2.2): - - * Improved speed a little in kh_put() - - 2008-09-10 (0.2.1): - - * Added kh_clear() - * Fixed a compiling error - - 2008-09-02 (0.2.0): - - * Changed to token concatenation which increases flexibility. - - 2008-08-31 (0.1.2): - - * Fixed a bug in kh_get(), which has not been tested previously. - - 2008-08-31 (0.1.1): - - * Added destructor -*/ - - -#ifndef __AC_KHASH_H -#define __AC_KHASH_H - -/*! - @header - - Generic hash table library. - - @copyright Heng Li - */ - -#define AC_VERSION_KHASH_H "0.2.5" - -#include -#include -#include - -/* compipler specific configuration */ - -#if UINT_MAX == 0xffffffffu -typedef unsigned int khint32_t; -#elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; -#endif - -#if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; -#else -typedef unsigned long long khint64_t; -#endif - -#ifdef _MSC_VER -#define inline __inline -#endif - -typedef khint32_t khint_t; -typedef khint_t khiter_t; - -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) - -static const double __ac_HASH_UPPER = 0.77; - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - extern kh_##name##_t *kh_init_##name(); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - SCOPE kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - } \ - } \ - if (j) { \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { \ - khint_t inc, k, i; \ - k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % (new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - free(h->flags); \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - } \ - { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ - else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -/* --- BEGIN OF HASH FUNCTIONS --- */ - -/*! @function - @abstract Integer hash function - @param key The integer [khint32_t] - @return The hash value [khint_t] - */ -#define kh_int_hash_func(key) (khint32_t)(key) -/*! @function - @abstract Integer comparison function - */ -#define kh_int_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract 64-bit integer hash function - @param key The integer [khint64_t] - @return The hash value [khint_t] - */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) -/*! @function - @abstract 64-bit integer comparison function - */ -#define kh_int64_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract const char* hash function - @param s Pointer to a null terminated string - @return The hash value - */ -static inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; - return h; -} -/*! @function - @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] - @return The hash value [khint_t] - */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) -/*! @function - @abstract Const char* comparison function - */ -#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) - -/* --- END OF HASH FUNCTIONS --- */ - -/* Other necessary macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -#define khash_t(name) kh_##name##_t - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -#define kh_init(name) kh_init_##name() - -/*! @function - @abstract Destroy a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_destroy(name, h) kh_destroy_##name(h) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_clear(name, h) kh_clear_##name(h) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -#define kh_resize(name, h, s) kh_resize_##name(h, s) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -#define kh_put(name, h, k, r) kh_put_##name(h, k, r) - -/*! @function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] - */ -#define kh_get(name, h, k) kh_get_##name(h, k) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -#define kh_key(h, x) ((h)->keys[x]) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -#define kh_val(h, x) ((h)->vals[x]) - -/*! @function - @abstract Alias of kh_val() - */ -#define kh_value(h, x) ((h)->vals[x]) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -#define kh_begin(h) (khint_t)(0) - -/*! @function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -#define kh_end(h) ((h)->n_buckets) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -#define kh_size(h) ((h)->size) - -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -#define kh_n_buckets(h) ((h)->n_buckets) - -/* More conenient interfaces */ - -/*! @function - @abstract Instantiate a hash set containing integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -typedef const char *kh_cstr_t; -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#endif /* __AC_KHASH_H */ diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/klist.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/klist.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/klist.h 2016-02-14 18:21:17.672079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/klist.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ -#ifndef _LH3_KLIST_H -#define _LH3_KLIST_H - -#include - -#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ - typedef struct { \ - size_t cnt, n, max; \ - kmptype_t **buf; \ - } kmp_##name##_t; \ - static inline kmp_##name##_t *kmp_init_##name() { \ - return calloc(1, sizeof(kmp_##name##_t)); \ - } \ - static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ - size_t k; \ - for (k = 0; k < mp->n; ++k) { \ - kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ - } \ - free(mp->buf); free(mp); \ - } \ - static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ - ++mp->cnt; \ - if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \ - return mp->buf[--mp->n]; \ - } \ - static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ - --mp->cnt; \ - if (mp->n == mp->max) { \ - mp->max = mp->max? mp->max<<1 : 16; \ - mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \ - } \ - mp->buf[mp->n++] = p; \ - } - -#define kmempool_t(name) kmp_##name##_t -#define kmp_init(name) kmp_init_##name() -#define kmp_destroy(name, mp) kmp_destroy_##name(mp) -#define kmp_alloc(name, mp) kmp_alloc_##name(mp) -#define kmp_free(name, mp, p) kmp_free_##name(mp, p) - -#define KLIST_INIT(name, kltype_t, kmpfree_t) \ - struct __kl1_##name { \ - kltype_t data; \ - struct __kl1_##name *next; \ - }; \ - typedef struct __kl1_##name kl1_##name; \ - KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ - typedef struct { \ - kl1_##name *head, *tail; \ - kmp_##name##_t *mp; \ - size_t size; \ - } kl_##name##_t; \ - static inline kl_##name##_t *kl_init_##name() { \ - kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ - kl->mp = kmp_init(name); \ - kl->head = kl->tail = kmp_alloc(name, kl->mp); \ - kl->head->next = 0; \ - return kl; \ - } \ - static inline void kl_destroy_##name(kl_##name##_t *kl) { \ - kl1_##name *p; \ - for (p = kl->head; p != kl->tail; p = p->next) \ - kmp_free(name, kl->mp, p); \ - kmp_free(name, kl->mp, p); \ - kmp_destroy(name, kl->mp); \ - free(kl); \ - } \ - static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ - kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ - q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ - ++kl->size; \ - return &q->data; \ - } \ - static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ - kl1_##name *p; \ - if (kl->head->next == 0) return -1; \ - --kl->size; \ - p = kl->head; kl->head = kl->head->next; \ - if (d) *d = p->data; \ - kmp_free(name, kl->mp, p); \ - return 0; \ - } - -#define kliter_t(name) kl1_##name -#define klist_t(name) kl_##name##_t -#define kl_val(iter) ((iter)->data) -#define kl_next(iter) ((iter)->next) -#define kl_begin(kl) ((kl)->head) -#define kl_end(kl) ((kl)->tail) - -#define kl_init(name) kl_init_##name() -#define kl_destroy(name, kl) kl_destroy_##name(kl) -#define kl_pushp(name, kl) kl_pushp_##name(kl) -#define kl_shift(name, kl, d) kl_shift_##name(kl, d) - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/knetfile.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/knetfile.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/knetfile.c 2016-02-14 18:21:17.681079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/knetfile.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,632 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 by Genome Research Ltd (GRL). - 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Probably I will not do socket programming in the next few years and - therefore I decide to heavily annotate this file, for Linux and - Windows as well. -ac */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef _WIN32 -#include -#include -#include -#endif - -#include "knetfile.h" - -/* In winsock.h, the type of a socket is SOCKET, which is: "typedef - * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed - * integer -1. In knetfile.c, I use "int" for socket type - * throughout. This should be improved to avoid confusion. - * - * In Linux/Mac, recv() and read() do almost the same thing. You can see - * in the header file that netread() is simply an alias of read(). In - * Windows, however, they are different and using recv() is mandatory. - */ - -/* This function tests if the file handler is ready for reading (or - * writing if is_read==0). */ -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); -#ifndef _WIN32 - if (ret == -1) perror("select"); -#else - if (ret == 0) - fprintf(stderr, "select time-out\n"); - else if (ret == SOCKET_ERROR) - fprintf(stderr, "select: %d\n", WSAGetLastError()); -#endif - return ret; -} - -#ifndef _WIN32 -/* This function does not work with Windows due to the lack of - * getaddrinfo() in winsock. It is addapted from an example in "Beej's - * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) - - int on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - /* In Unix/Mac, getaddrinfo() is the most convenient way to get - * server information. */ - if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - /* The following two setsockopt() are used by ftplib - * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they - * necessary. */ - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -} -#else -/* MinGW's printf has problem with "%lld" */ -char *int64tostr(char *buf, int64_t x) -{ - int cnt; - int i = 0; - do { - buf[i++] = '0' + x % 10; - x /= 10; - } while (x); - buf[i] = 0; - for (cnt = i, i = 0; i < cnt/2; ++i) { - int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; - } - return buf; -} - -int64_t strtoint64(const char *buf) -{ - int64_t x; - for (x = 0; *buf != '\0'; ++buf) - x = x * 10 + ((int64_t) *buf - 48); - return x; -} -/* In windows, the first thing is to establish the TCP connection. */ -int knet_win32_init() -{ - WSADATA wsaData; - return WSAStartup(MAKEWORD(2, 2), &wsaData); -} -void knet_win32_destroy() -{ - WSACleanup(); -} -/* A slightly modfied version of the following function also works on - * Mac (and presummably Linux). However, this function is not stable on - * my Mac. It sometimes works fine but sometimes does not. Therefore for - * non-Windows OS, I do not use this one. */ -static SOCKET socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) \ - do { \ - fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ - return -1; \ - } while (0) - - int on = 1; - SOCKET fd; - struct linger lng = { 0, 0 }; - struct sockaddr_in server; - struct hostent *hp = 0; - // open socket - if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - // get host info - if (isalpha(host[0])) hp = gethostbyname(host); - else { - struct in_addr addr; - addr.s_addr = inet_addr(host); - hp = gethostbyaddr((char*)&addr, 4, AF_INET); - } - if (hp == 0) __err_connect("gethost"); - // connect - server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); - server.sin_family= AF_INET; - server.sin_port = htons(atoi(port)); - if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); - // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) - return fd; -} -#endif - -static off_t my_netread(int fd, void *buf, off_t len) -{ - off_t rest = len, curr, l = 0; - /* recv() and read() may not read the required length of data with - * one call. They have to be called repeatedly. */ - while (rest) { - if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading - curr = netread(fd, buf + l, rest); - /* According to the glibc manual, section 13.2, a zero returned - * value indicates end-of-file (EOF), which should mean that - * read() will not return zero if EOF has not been met but data - * are not immediately available. */ - if (curr == 0) break; - l += curr; rest -= curr; - } - return l; -} - -/************************* - * FTP specific routines * - *************************/ - -static int kftp_get_response(knetFile *ftp) -{ -#ifndef _WIN32 - unsigned char c; -#else - char c; -#endif - int n = 0; - char *p; - if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; - while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - //fputc(c, stderr); - if (n >= ftp->max_response) { - ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; - ftp->response = realloc(ftp->response, ftp->max_response); - } - ftp->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) - && ftp->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - ftp->response[n-2] = 0; - return strtol(ftp->response, &p, 0); -} - -static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) -{ - if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); - return is_get? kftp_get_response(ftp) : 0; -} - -static int kftp_pasv_prep(knetFile *ftp) -{ - char *p; - int v[6]; - kftp_send_cmd(ftp, "PASV\r\n", 1); - for (p = ftp->response; *p && *p != '('; ++p); - if (*p != '(') return -1; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); - ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; - return 0; -} - - -static int kftp_pasv_connect(knetFile *ftp) -{ - char host[80], port[10]; - if (ftp->pasv_port == 0) { - fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); - return -1; - } - sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); - sprintf(port, "%d", ftp->pasv_port); - ftp->fd = socket_connect(host, port); - if (ftp->fd == -1) return -1; - return 0; -} - -int kftp_connect(knetFile *ftp) -{ - ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); - if (ftp->ctrl_fd == -1) return -1; - kftp_get_response(ftp); - kftp_send_cmd(ftp, "USER anonymous\r\n", 1); - kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); - kftp_send_cmd(ftp, "TYPE I\r\n", 1); - return 0; -} - -int kftp_reconnect(knetFile *ftp) -{ - if (ftp->ctrl_fd != -1) { - netclose(ftp->ctrl_fd); - ftp->ctrl_fd = -1; - } - netclose(ftp->fd); - ftp->fd = -1; - return kftp_connect(ftp); -} - -// initialize ->type, ->host, ->retr and ->size -knetFile *kftp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p; - int l; - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - fp = calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_FTP; - fp->fd = -1; - /* the Linux/Mac version of socket_connect() also recognizes a port - * like "ftp", but the Windows version does not. */ - fp->port = strdup("21"); - fp->host = calloc(l + 1, 1); - if (strchr(mode, 'c')) fp->no_reconnect = 1; - strncpy(fp->host, fn + 6, l); - fp->retr = calloc(strlen(p) + 8, 1); - sprintf(fp->retr, "RETR %s\r\n", p); - fp->size_cmd = calloc(strlen(p) + 8, 1); - sprintf(fp->size_cmd, "SIZE %s\r\n", p); - fp->seek_offset = 0; - return fp; -} -// place ->fd at offset off -int kftp_connect_file(knetFile *fp) -{ - int ret; - long long file_size; - if (fp->fd != -1) { - netclose(fp->fd); - if (fp->no_reconnect) kftp_get_response(fp); - } - kftp_pasv_prep(fp); - kftp_send_cmd(fp, fp->size_cmd, 1); -#ifndef _WIN32 - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) - { - fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); - return -1; - } -#else - const char *p = fp->response; - while (*p != ' ') ++p; - while (*p < '0' || *p > '9') ++p; - file_size = strtoint64(p); -#endif - fp->file_size = file_size; - if (fp->offset>=0) { - char tmp[32]; -#ifndef _WIN32 - sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); -#else - strcpy(tmp, "REST "); - int64tostr(tmp + 5, fp->offset); - strcat(tmp, "\r\n"); -#endif - kftp_send_cmd(fp, tmp, 1); - } - kftp_send_cmd(fp, fp->retr, 0); - kftp_pasv_connect(fp); - ret = kftp_get_response(fp); - if (ret != 150) { - fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - fp->is_ready = 1; - return 0; -} - - -/************************** - * HTTP specific routines * - **************************/ - -knetFile *khttp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p, *proxy, *q; - int l; - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - fp = calloc(1, sizeof(knetFile)); - fp->http_host = calloc(l + 1, 1); - strncpy(fp->http_host, fn + 7, l); - fp->http_host[l] = 0; - for (q = fp->http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set ->host, ->port and ->path - if (proxy == 0) { - fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. - fp->port = strdup(*q? q : "80"); - fp->path = strdup(*p? p : "/"); - } else { - fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = fp->host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - fp->port = strdup(*q? q : "80"); - fp->path = strdup(fn); - } - fp->type = KNF_TYPE_HTTP; - fp->ctrl_fd = fp->fd = -1; - fp->seek_offset = 0; - return fp; -} - -int khttp_connect_file(knetFile *fp) -{ - int ret, l = 0; - char *buf, *p; - if (fp->fd != -1) netclose(fp->fd); - fp->fd = socket_connect(fp->host, fp->port); - buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. - l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); - l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); - l += sprintf(buf + l, "\r\n"); - netwrite(fp->fd, buf, l); - l = 0; - while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - buf[l] = 0; - if (l < 14) { // prematured header - netclose(fp->fd); - fp->fd = -1; - return -1; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file - off_t rest = fp->offset; - while (rest) { - off_t l = rest < 0x10000? rest : 0x10000; - rest -= my_netread(fp->fd, buf, l); - } - } else if (ret != 206 && ret != 200) { - free(buf); - fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - free(buf); - fp->is_ready = 1; - return 0; -} - -/******************** - * Generic routines * - ********************/ - -knetFile *knet_open(const char *fn, const char *mode) -{ - knetFile *fp = 0; - if (mode[0] != 'r') { - fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); - return 0; - } - if (strstr(fn, "ftp://") == fn) { - fp = kftp_parse_url(fn, mode); - if (fp == 0) return 0; - if (kftp_connect(fp) == -1) { - knet_close(fp); - return 0; - } - kftp_connect_file(fp); - } else if (strstr(fn, "http://") == fn) { - fp = khttp_parse_url(fn, mode); - if (fp == 0) return 0; - khttp_connect_file(fp); - } else { // local file -#ifdef _WIN32 - /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may - * be undefined on some systems, although it is defined on my - * Mac and the Linux I have tested on. */ - int fd = open(fn, O_RDONLY | O_BINARY); -#else - int fd = open(fn, O_RDONLY); -#endif - if (fd == -1) { - perror("open"); - return 0; - } - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - fp->ctrl_fd = -1; - } - if (fp && fp->fd == -1) { - knet_close(fp); - return 0; - } - return fp; -} - -knetFile *knet_dopen(int fd, const char *mode) -{ - knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - return fp; -} - -off_t knet_read(knetFile *fp, void *buf, off_t len) -{ - off_t l = 0; - if (fp->fd == -1) return 0; - if (fp->type == KNF_TYPE_FTP) { - if (fp->is_ready == 0) { - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp); - } - } else if (fp->type == KNF_TYPE_HTTP) { - if (fp->is_ready == 0) - khttp_connect_file(fp); - } - if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX - off_t rest = len, curr; - while (rest) { - do { - curr = read(fp->fd, buf + l, rest); - } while (curr < 0 && EINTR == errno); - if (curr < 0) return -1; - if (curr == 0) break; - l += curr; rest -= curr; - } - } else l = my_netread(fp->fd, buf, len); - fp->offset += l; - return l; -} - -off_t knet_seek(knetFile *fp, int64_t off, int whence) -{ - if (whence == SEEK_SET && off == fp->offset) return 0; - if (fp->type == KNF_TYPE_LOCAL) { - /* Be aware that lseek() returns the offset after seeking, - * while fseek() returns zero on success. */ - off_t offset = lseek(fp->fd, off, whence); - if (offset == -1) { - // Be silent, it is OK for knet_seek to fail when the file is streamed - // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); - return -1; - } - fp->offset = offset; - return 0; - } - else if (fp->type == KNF_TYPE_FTP) - { - if (whence==SEEK_CUR) - fp->offset += off; - else if (whence==SEEK_SET) - fp->offset = off; - else if ( whence==SEEK_END) - fp->offset = fp->file_size+off; - fp->is_ready = 0; - return 0; - } - else if (fp->type == KNF_TYPE_HTTP) - { - if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? - fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); - errno = ESPIPE; - return -1; - } - if (whence==SEEK_CUR) - fp->offset += off; - else if (whence==SEEK_SET) - fp->offset = off; - fp->is_ready = 0; - return 0; - } - errno = EINVAL; - fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); - return -1; -} - -int knet_close(knetFile *fp) -{ - if (fp == 0) return 0; - if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific - if (fp->fd != -1) { - /* On Linux/Mac, netclose() is an alias of close(), but on - * Windows, it is an alias of closesocket(). */ - if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); - else netclose(fp->fd); - } - free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific - free(fp->path); free(fp->http_host); // HTTP specific - free(fp); - return 0; -} - -#ifdef KNETFILE_MAIN -int main(void) -{ - char *buf; - knetFile *fp; - int type = 4, l; -#ifdef _WIN32 - knet_win32_init(); -#endif - buf = calloc(0x100000, 1); - if (type == 0) { - fp = knet_open("knetfile.c", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 1) { // NCBI FTP, large file - fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); - knet_seek(fp, 2500000000ll, SEEK_SET); - l = knet_read(fp, buf, 255); - } else if (type == 2) { - fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 3) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 4) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); - knet_read(fp, buf, 10000); - knet_seek(fp, 20000, SEEK_SET); - knet_seek(fp, 10000, SEEK_SET); - l = knet_read(fp, buf+10000, 10000000) + 10000; - } - if (type != 4 && type != 1) { - knet_read(fp, buf, 255); - buf[255] = 0; - printf("%s\n", buf); - } else write(fileno(stdout), buf, l); - knet_close(fp); - free(buf); - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/knetfile.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/knetfile.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/knetfile.h 2016-02-14 18:21:17.682079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/knetfile.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,75 +0,0 @@ -#ifndef KNETFILE_H -#define KNETFILE_H - -#include -#include - -#ifndef _WIN32 -#define netread(fd, ptr, len) read(fd, ptr, len) -#define netwrite(fd, ptr, len) write(fd, ptr, len) -#define netclose(fd) close(fd) -#else -#include -#define netread(fd, ptr, len) recv(fd, ptr, len, 0) -#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) -#define netclose(fd) closesocket(fd) -#endif - -// FIXME: currently I/O is unbuffered - -#define KNF_TYPE_LOCAL 1 -#define KNF_TYPE_FTP 2 -#define KNF_TYPE_HTTP 3 - -typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; -} knetFile; - -#define knet_tell(fp) ((fp)->offset) -#define knet_fileno(fp) ((fp)->fd) - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _WIN32 - int knet_win32_init(); - void knet_win32_destroy(); -#endif - - knetFile *knet_open(const char *fn, const char *mode); - - /* - This only works with local files. - */ - knetFile *knet_dopen(int fd, const char *mode); - - /* - If ->is_ready==0, this routine updates ->fd; otherwise, it simply - reads from ->fd. - */ - off_t knet_read(knetFile *fp, void *buf, off_t len); - - /* - This routine only sets ->offset and ->is_ready=0. It does not - communicate with the FTP server. - */ - off_t knet_seek(knetFile *fp, int64_t off, int whence); - int knet_close(knetFile *fp); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kprobaln.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/kprobaln.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kprobaln.c 2016-02-14 18:21:17.683079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kprobaln.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,278 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008-2010, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include -#include "kprobaln.h" - -/***************************************** - * Probabilistic banded glocal alignment * - *****************************************/ - -#define EI .25 -#define EM .33333333333 - -static float g_qual2prob[256]; - -#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; } - -kpa_par_t kpa_par_def = { 0.001, 0.1, 10 }; -kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 }; - -/* - The topology of the profile HMM: - - /\ /\ /\ /\ - I[1] I[k-1] I[k] I[L] - ^ \ \ ^ \ ^ \ \ ^ - | \ \ | \ | \ \ | - M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1] - \ \/ \/ \/ / - \ /\ /\ /\ / - -> D[k-1] -> D[k] -> - - M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1]. - - On input, _ref is the reference sequence and _query is the query - sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an - ambiguous residue. iqual is the base quality. c sets the gap open - probability, gap extension probability and band width. - - On output, state and q are arrays of length l_query. The higher 30 - bits give the reference position the query base is matched to and the - lower two bits can be 0 (an alignment match) or 1 (an - insertion). q[i] gives the phred scaled posterior probability of - state[i] being wrong. - */ -int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, - const kpa_par_t *c, int *state, uint8_t *q) -{ - double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb; - float *qual, *_qual; - const uint8_t *ref, *query; - int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; - - /*** initialization ***/ - is_backward = state && q? 1 : 0; - ref = _ref - 1; query = _query - 1; // change to 1-based coordinate - bw = l_ref > l_query? l_ref : l_query; - if (bw > c->bw) bw = c->bw; - if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query); - bw2 = bw * 2 + 1; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - f = calloc(l_query+1, sizeof(void*)); - if (is_backward) b = calloc(l_query+1, sizeof(void*)); - for (i = 0; i <= l_query; ++i) { - f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs - if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); - } - s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow - // initialize qual - _qual = calloc(l_query, sizeof(float)); - if (g_qual2prob[0] == 0) - for (i = 0; i < 256; ++i) - g_qual2prob[i] = pow(10, -i/10.); - for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; - qual = _qual - 1; - // initialize transition probability - sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 - /*** forward ***/ - // f[0] - set_u(k, bw, 0, 0); - f[0][k] = s[0] = 1.; - { // f[1] - double *fi = f[1], sum; - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end; - for (k = beg, sum = 0.; k <= end; ++k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - fi[u+0] = e * bM; fi[u+1] = EI * bI; - sum += fi[u] + fi[u+1]; - } - // rescale - s[1] = sum; - set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2; - for (k = _beg; k <= _end; ++k) fi[k] /= sum; - } - // f[2..l_query] - for (i = 2; i <= l_query; ++i) { - double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i]; - int beg = 1, end = l_ref, x, _beg, _end; - uint8_t qyi = query[i]; - x = i - bw; beg = beg > x? beg : x; // band start - x = i + bw; end = end < x? end : x; // band end - for (k = beg, sum = 0.; k <= end; ++k) { - int u, v11, v01, v10; - double e; - e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); - fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]); - fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); - fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; - sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG - } - // rescale - s[i] = sum; - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum; - } - { // f[l_query+1] - double sum; - for (k = 1, sum = 0.; k <= l_ref; ++k) { - int u; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI; - } - s[l_query+1] = sum; // the last scaling factor - } - { // compute likelihood - double p = 1., Pr1 = 0.; - for (i = 0; i <= l_query + 1; ++i) { - p *= s[i]; - if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; - } - Pr1 += -4.343 * log(p * l_ref * l_query); - Pr = (int)(Pr1 + .499); - if (!is_backward) { // skip backward and MAP - for (i = 0; i <= l_query; ++i) free(f[i]); - free(f); free(s); free(_qual); - return Pr; - } - } - /*** backward ***/ - // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from) - for (k = 1; k <= l_ref; ++k) { - int u; - double *bi = b[l_query]; - set_u(u, bw, l_query, k); - if (u < 3 || u >= bw2*3+3) continue; - bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; - } - // b[l_query-1..1] - for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; - double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1]; - uint8_t qyi1 = query[i+1]; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = end; k >= beg; --k) { - int u, v11, v01, v10; - double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); - e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11]; - bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. - bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; - bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; -// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG - } - // rescale - set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; - } - { // b[0] - int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; - double sum = 0.; - for (k = end; k >= beg; --k) { - int u; - double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; - set_u(u, bw, 1, k); - if (u < 3 || u >= bw2*3+3) continue; - sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI; - } - set_u(k, bw, 0, 0); - pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0 - } - is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; - /*** MAP ***/ - for (i = 1; i <= l_query; ++i) { - double sum = 0., *fi = f[i], *bi = b[i], max = 0.; - int beg = 1, end = l_ref, x, max_k = -1; - x = i - bw; beg = beg > x? beg : x; - x = i + bw; end = end < x? end : x; - for (k = beg; k <= end; ++k) { - int u; - double z; - set_u(u, bw, i, k); - z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; - } - max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 - if (state) state[i-1] = max_k; - if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; -#ifdef _MAIN - fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, - "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG -#endif - } - /*** free ***/ - for (i = 0; i <= l_query; ++i) { - free(f[i]); free(b[i]); - } - free(f); free(b); free(s); free(_qual); - return Pr; -} - -#ifdef _MAIN -#include -int main(int argc, char *argv[]) -{ - uint8_t conv[256], *iqual, *ref, *query; - int c, l_ref, l_query, i, q = 30, b = 10, P; - while ((c = getopt(argc, argv, "b:q:")) >= 0) { - switch (c) { - case 'b': b = atoi(optarg); break; - case 'q': q = atoi(optarg); break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: %s [-q %d] [-b %d] \n", argv[0], q, b); // example: acttc attc - return 1; - } - memset(conv, 4, 256); - conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1; - conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3; - ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1]; - l_ref = strlen((char*)ref); l_query = strlen((char*)query); - for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]]; - for (i = 0; i < l_query; ++i) query[i] = conv[query[i]]; - iqual = malloc(l_query); - memset(iqual, q, l_query); - kpa_par_def.bw = b; - P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); - fprintf(stderr, "%d\n", P); - free(iqual); - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kprobaln.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/kprobaln.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kprobaln.h 2016-02-14 18:21:17.684079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kprobaln.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009 by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef LH3_KPROBALN_H_ -#define LH3_KPROBALN_H_ - -#include - -typedef struct { - float d, e; - int bw; -} kpa_par_t; - -#ifdef __cplusplus -extern "C" { -#endif - - int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, - const kpa_par_t *c, int *state, uint8_t *q); - -#ifdef __cplusplus -} -#endif - -extern kpa_par_t kpa_par_def, kpa_par_alt; - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kseq.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/kseq.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kseq.h 2016-02-14 18:21:17.685079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kseq.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,224 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Last Modified: 18AUG2011 */ - -#ifndef AC_KSEQ_H -#define AC_KSEQ_H - -#include -#include -#include - -#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r -#define KS_SEP_TAB 1 // isspace() && !' ' -#define KS_SEP_MAX 1 - -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - unsigned char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; - -#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) -#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) - -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ - } - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - if (dret) *dret = 0; \ - str->l = append? str->l : 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! */ \ - if (str->m - str->l < i - ks->begin + 1) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } \ - str->s[str->l] = '\0'; \ - return str->l; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } - -/* Return value: - >=0 length of the sequence (normal) - -1 end-of-file - -2 truncated quality string - */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ - if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ - seq->seq.m = 256; \ - seq->seq.s = (char*)malloc(seq->seq.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/ksort.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/ksort.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/ksort.h 2016-02-14 18:21:17.686079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/ksort.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,281 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -/* - 2008-11-16 (0.1.4): - - * Fixed a bug in introsort() that happens in rare cases. - - 2008-11-05 (0.1.3): - - * Fixed a bug in introsort() for complex comparisons. - - * Fixed a bug in mergesort(). The previous version is not stable. - - 2008-09-15 (0.1.2): - - * Accelerated introsort. On my Mac (not on another Linux machine), - my implementation is as fast as std::sort on random input. - - * Added combsort and in introsort, switch to combsort if the - recursion is too deep. - - 2008-09-13 (0.1.1): - - * Added k-small algorithm - - 2008-09-05 (0.1.0): - - * Initial version - -*/ - -#ifndef AC_KSORT_H -#define AC_KSORT_H - -#include -#include - -typedef struct { - void *left, *right; - int depth; -} ks_isort_stack_t; - -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } - -#define KSORT_INIT(name, type_t, __sort_lt) \ - void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ - { \ - type_t *a2[2], *a, *b; \ - int curr, shift; \ - \ - a2[0] = array; \ - a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ - for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ - ks_heapadjust_##name(i, lsize, l); \ - } \ - void ks_heapsort_##name(size_t lsize, type_t l[]) \ - { \ - size_t i; \ - for (i = lsize - 1; i > 0; --i) { \ - type_t tmp; \ - tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ - } \ - } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ - { \ - type_t *i, *j, swap_tmp; \ - for (i = s + 1; i < t; ++i) \ - for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ - swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ - } \ - } \ - void ks_combsort_##name(size_t n, type_t a[]) \ - { \ - const double shrink_factor = 1.2473309501039786540366528676643; \ - int do_swap; \ - size_t gap = n; \ - type_t tmp, *i, *j; \ - do { \ - if (gap > 2) { \ - gap = (size_t)(gap / shrink_factor); \ - if (gap == 9 || gap == 10) gap = 11; \ - } \ - do_swap = 0; \ - for (i = a; i < a + n - gap; ++i) { \ - j = i + gap; \ - if (__sort_lt(*j, *i)) { \ - tmp = *i; *i = *j; *j = tmp; \ - do_swap = 1; \ - } \ - } \ - } while (do_swap || gap > 2); \ - if (gap != 1) __ks_insertsort_##name(a, a + n); \ - } \ - void ks_introsort_##name(size_t n, type_t a[]) \ - { \ - int d; \ - ks_isort_stack_t *top, *stack; \ - type_t rp, swap_tmp; \ - type_t *s, *t, *i, *j, *k; \ - \ - if (n < 1) return; \ - else if (n == 2) { \ - if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ - return; \ - } \ - for (d = 2; 1ul<>1) + 1; \ - if (__sort_lt(*k, *i)) { \ - if (__sort_lt(*k, *j)) k = j; \ - } else k = __sort_lt(*j, *i)? i : j; \ - rp = *k; \ - if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ - for (;;) { \ - do ++i; while (__sort_lt(*i, rp)); \ - do --j; while (i <= j && __sort_lt(rp, *j)); \ - if (j <= i) break; \ - swap_tmp = *i; *i = *j; *j = swap_tmp; \ - } \ - swap_tmp = *i; *i = *t; *t = swap_tmp; \ - if (i-s > t-i) { \ - if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ - s = t-i > 16? i+1 : t; \ - } else { \ - if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ - t = i-s > 16? i-1 : s; \ - } \ - } else { \ - if (top == stack) { \ - free(stack); \ - __ks_insertsort_##name(a, a+n); \ - return; \ - } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ - } \ - } \ - } \ - /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ - /* 0 <= kk < n */ \ - type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ - { \ - type_t *low, *high, *k, *ll, *hh, *mid; \ - low = arr; high = arr + n - 1; k = arr + kk; \ - for (;;) { \ - if (high <= low) return *k; \ - if (high == low + 1) { \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - return *k; \ - } \ - mid = low + (high - low) / 2; \ - if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ - KSORT_SWAP(type_t, *mid, *(low+1)); \ - ll = low + 1; hh = high; \ - for (;;) { \ - do ++ll; while (__sort_lt(*ll, *low)); \ - do --hh; while (__sort_lt(*low, *hh)); \ - if (hh < ll) break; \ - KSORT_SWAP(type_t, *ll, *hh); \ - } \ - KSORT_SWAP(type_t, *low, *hh); \ - if (hh <= k) low = ll; \ - if (hh >= k) high = hh - 1; \ - } \ - } \ - void ks_shuffle_##name(size_t n, type_t a[]) \ - { \ - int i, j; \ - for (i = n; i > 1; --i) { \ - type_t tmp; \ - j = (int)(drand48() * i); \ - tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ - } \ - } - -#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) -#define ks_introsort(name, n, a) ks_introsort_##name(n, a) -#define ks_combsort(name, n, a) ks_combsort_##name(n, a) -#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) -#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) -#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) -#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) -#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) - -#define ks_lt_generic(a, b) ((a) < (b)) -#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) - -typedef const char *ksstr_t; - -#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) -#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kstring.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/kstring.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kstring.c 2016-02-14 18:21:17.687079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kstring.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,212 +0,0 @@ -#include -#include -#include -#include -#include -#include "kstring.h" - -int ksprintf(kstring_t *s, const char *fmt, ...) -{ - va_list ap; - int l; - va_start(ap, fmt); - l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. - va_end(ap); - if (l + 1 > s->m - s->l) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - va_start(ap, fmt); - l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); - } - va_end(ap); - s->l += l; - return l; -} - -char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) -{ - const char *p, *start; - if (sep) { // set up the table - if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished - aux->finished = 0; - if (sep[1]) { - aux->sep = -1; - aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; - for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); - } else aux->sep = sep[0]; - } - if (aux->finished) return 0; - else if (str) aux->p = str - 1, aux->finished = 0; - if (aux->sep < 0) { - for (p = start = aux->p + 1; *p; ++p) - if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; - } else { - for (p = start = aux->p + 1; *p; ++p) - if (*p == aux->sep) break; - } - aux->p = p; // end of token - if (*p == 0) aux->finished = 1; // no more tokens - return (char*)start; -} - -// s MUST BE a null terminated string; l = strlen(s) -int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) -{ - int i, n, max, last_char, last_start, *offsets, l; - n = 0; max = *_max; offsets = *_offsets; - l = strlen(s); - -#define __ksplit_aux do { \ - if (_offsets) { \ - s[i] = 0; \ - if (n == max) { \ - max = max? max<<1 : 2; \ - offsets = (int*)realloc(offsets, sizeof(int) * max); \ - } \ - offsets[n++] = last_start; \ - } else ++n; \ - } while (0) - - for (i = 0, last_char = last_start = 0; i <= l; ++i) { - if (delimiter == 0) { - if (isspace(s[i]) || s[i] == 0) { - if (isgraph(last_char)) __ksplit_aux; // the end of a field - } else { - if (isspace(last_char) || last_char == 0) last_start = i; - } - } else { - if (s[i] == delimiter || s[i] == 0) { - if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field - } else { - if (last_char == delimiter || last_char == 0) last_start = i; - } - } - last_char = s[i]; - } - *_max = max; *_offsets = offsets; - return n; -} - -/********************** - * Boyer-Moore search * - **********************/ - -typedef unsigned char ubyte_t; - -// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html -static int *ksBM_prep(const ubyte_t *pat, int m) -{ - int i, *suff, *prep, *bmGs, *bmBc; - prep = calloc(m + 256, sizeof(int)); - bmGs = prep; bmBc = prep + m; - { // preBmBc() - for (i = 0; i < 256; ++i) bmBc[i] = m; - for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; - } - suff = calloc(m, sizeof(int)); - { // suffixes() - int f = 0, g; - suff[m - 1] = m; - g = m - 1; - for (i = m - 2; i >= 0; --i) { - if (i > g && suff[i + m - 1 - f] < i - g) - suff[i] = suff[i + m - 1 - f]; - else { - if (i < g) g = i; - f = i; - while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; - suff[i] = f - g; - } - } - } - { // preBmGs() - int j = 0; - for (i = 0; i < m; ++i) bmGs[i] = m; - for (i = m - 1; i >= 0; --i) - if (suff[i] == i + 1) - for (; j < m - 1 - i; ++j) - if (bmGs[j] == m) - bmGs[j] = m - 1 - i; - for (i = 0; i <= m - 2; ++i) - bmGs[m - 1 - suff[i]] = m - 1 - i; - } - free(suff); - return prep; -} - -void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) -{ - int i, j, *prep = 0, *bmGs, *bmBc; - const ubyte_t *str, *pat; - str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; - prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; - if (_prep && *_prep == 0) *_prep = prep; - bmGs = prep; bmBc = prep + m; - j = 0; - while (j <= n - m) { - for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); - if (i >= 0) { - int max = bmBc[str[i+j]] - m + 1 + i; - if (max < bmGs[i]) max = bmGs[i]; - j += max; - } else return (void*)(str + j); - } - if (_prep == 0) free(prep); - return 0; -} - -char *kstrstr(const char *str, const char *pat, int **_prep) -{ - return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); -} - -char *kstrnstr(const char *str, const char *pat, int n, int **_prep) -{ - return (char*)kmemmem(str, n, pat, strlen(pat), _prep); -} - -/*********************** - * The main() function * - ***********************/ - -#ifdef KSTRING_MAIN -#include -int main() -{ - kstring_t *s; - int *fields, n, i; - ks_tokaux_t aux; - char *p; - s = (kstring_t*)calloc(1, sizeof(kstring_t)); - // test ksprintf() - ksprintf(s, " abcdefg: %d ", 100); - printf("'%s'\n", s->s); - // test ksplit() - fields = ksplit(s, 0, &n); - for (i = 0; i < n; ++i) - printf("field[%d] = '%s'\n", i, s->s + fields[i]); - // test kstrtok() - s->l = 0; - for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { - kputsn(p, aux.p - p, s); - kputc('\n', s); - } - printf("%s", s->s); - // free - free(s->s); free(s); free(fields); - - { - static char *str = "abcdefgcdgcagtcakcdcd"; - static char *pat = "cd"; - char *ret, *s = str; - int *prep = 0; - while ((ret = kstrstr(s, pat, &prep)) != 0) { - printf("match: %s\n", ret); - s = ret + prep[0]; - } - free(prep); - } - return 0; -} -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/kstring.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/kstring.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/kstring.h 2016-02-14 18:21:17.688079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/kstring.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,117 +0,0 @@ -#ifndef KSTRING_H -#define KSTRING_H - -#include -#include -#include - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -typedef struct { - uint64_t tab[4]; - int sep, finished; - const char *p; // end of the current token -} ks_tokaux_t; - -#ifdef __cplusplus -extern "C" { -#endif - - int ksprintf(kstring_t *s, const char *fmt, ...); - int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); - char *kstrstr(const char *str, const char *pat, int **_prep); - char *kstrnstr(const char *str, const char *pat, int n, int **_prep); - void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); - - /* kstrtok() is similar to strtok_r() except that str is not - * modified and both str and sep can be NULL. For efficiency, it is - * actually recommended to set both to NULL in the subsequent calls - * if sep is not changed. */ - char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - -#ifdef __cplusplus -} -#endif - -static inline int kputsn(const char *p, int l, kstring_t *s) -{ - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - memcpy(s->s + s->l, p, l); - s->l += l; - s->s[s->l] = 0; - return l; -} - -static inline int kputs(const char *p, kstring_t *s) -{ - return kputsn(p, strlen(p), s); -} - -static inline int kputc(int c, kstring_t *s) -{ - if (s->l + 1 >= s->m) { - s->m = s->l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - s->s[s->l++] = c; - s->s[s->l] = 0; - return c; -} - -static inline int kputw(int c, kstring_t *s) -{ - char buf[16]; - int l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputuw(unsigned c, kstring_t *s) -{ - char buf[16]; - int l, i; - unsigned x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; -} - -static inline int *ksplit(kstring_t *s, int delimiter, int *n) -{ - int max = 0, *offsets = 0; - *n = ksplit_core(s->s, delimiter, &max, &offsets); - return offsets; -} - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/Makefile tophat-2.1.1+dfsg1/src/samtools-0.1.18/Makefile --- tophat-2.1.1+dfsg/src/samtools-0.1.18/Makefile 2016-02-14 18:21:17.376079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/Makefile 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -CC= gcc -CFLAGS= -g -Wall -O2 #-m64 #-arch ppc -DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0 -KNETFILE_O= knetfile.o -LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o -AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ - bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam2depth.o -PROG= samtools_0.1.18 -INCLUDES= -I. -SUBDIRS= . bcftools -LIBPATH= -LIBCURSES= -lcurses - -.SUFFIXES:.c .o - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -all-recur lib-recur clean-recur cleanlocal-recur install-recur: - @target=`echo $@ | sed s/-recur//`; \ - wdir=`pwd`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - cd $$subdir; \ - $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ - INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ - cd $$wdir; \ - done; - -all:$(PROG) - -.PHONY:all lib clean cleanlocal -.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur - -lib:libbam.a - -libbam.a:$(LOBJS) - $(AR) -csru $@ $(LOBJS) - -samtools_0.1.18:lib-recur $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf -lm -lz #$(LIBCURSES) - -razip:razip.o razf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz - -bgzip:bgzip.o bgzf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz - -razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h -sam.o:sam.h bam.h -bam_import.o:bam.h kseq.h khash.h razf.h -bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h -bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h -bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h -bam_sort.o:bam.h ksort.h razf.h -bam_md.o:bam.h faidx.h -sam_header.o:sam_header.h khash.h -bcf.o:bcftools/bcf.h -bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h -bam2bcf_indel.o:bam2bcf.h -errmod.o:errmod.h -phase.o:bam.h khash.h ksort.h -bamtk.o:bam.h - -faidx.o:faidx.h razf.h khash.h -faidx_main.o:faidx.h razf.h - - -libbam.1.dylib-local:$(LOBJS) - libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz - -libbam.so.1-local:$(LOBJS) - $(CC) -shared -Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz - -dylib: - @$(MAKE) cleanlocal; \ - case `uname` in \ - Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \ - Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \ - *) echo 'Unknown OS';; \ - esac - - -cleanlocal: - rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib - -clean:cleanlocal-recur diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/Makefile.mingw tophat-2.1.1+dfsg1/src/samtools-0.1.18/Makefile.mingw --- tophat-2.1.1+dfsg/src/samtools-0.1.18/Makefile.mingw 2016-02-14 18:21:17.377079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/Makefile.mingw 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -CC= gcc.exe -AR= ar.exe -CFLAGS= -g -Wall -O2 -DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 -KNETFILE_O= knetfile.o -LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o -AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ - bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam_cat.o bam2depth.o -BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ - bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ - bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o -PROG= samtools.exe bcftools.exe -INCLUDES= -I. -Iwin32 -SUBDIRS= . -LIBPATH= - -.SUFFIXES:.c .o - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -all:$(PROG) - -.PHONY:all lib clean cleanlocal -.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur - -lib:libbam.a - -libbam.a:$(LOBJS) - $(AR) -cru $@ $(LOBJS) - -samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 - -bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o - $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 - -razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h -sam.o:sam.h bam.h -bam_import.o:bam.h kseq.h khash.h razf.h -bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h -bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h -bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h -bam_sort.o:bam.h ksort.h razf.h -bam_md.o:bam.h faidx.h -sam_header.o:sam_header.h khash.h -bcf.o:bcftools/bcf.h -bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h -bam2bcf_indel.o:bam2bcf.h -errmod.o:errmod.h - -faidx.o:faidx.h razf.h khash.h -faidx_main.o:faidx.h razf.h - -clean: - rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/NEWS tophat-2.1.1+dfsg1/src/samtools-0.1.18/NEWS --- tophat-2.1.1+dfsg/src/samtools-0.1.18/NEWS 2016-02-14 18:21:17.378079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/NEWS 1970-01-01 00:00:00.000000000 +0000 @@ -1,806 +0,0 @@ -Beta Release 0.1.18 (2 September, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes in samtools: - - * Support the new =/X CIGAR operators (by Peter Cock). - - * Allow to subsample BAM while keeping the pairing intact (view -s). - - * Implemented variant distance bias as a new filter (by Petr Danecek). - - * Bugfix: huge memory usage during indexing - - * Bugfix: use of uninitialized variable in mpileup (rare) - - * Bugfix: wrong BAQ probability (rare) - -Notable changes in bcftools: - - * Support indel in the contrast caller. - - * Bugfix: LRT2=nan in rare cases - -(0.1.18: 2 September 2011, r982:295) - - - -Beta Release 0.1.17 (6 July, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -With the maturity of `mpileup' and the lack of update in the `pileup' command, -the `pileup' command is now formally dropped. Most of the pileup functionality, -such as outputting mapping quality and read positions, have been added -`mpileup'. - -Since this release, `bcftools view' is able to perform contrast SNP calling -(option -T) for discovering de novo and/or somatic mutations between a pair of -samples or in a family trio. Potential mutations are scored by a log likelihood -ratio, which is very simple in math, but should be comparable to more -sophisticated methods. Note that getting the score is only the very first step. -A lot more need to be done to reduce systematical errors due to mapping and -reference errors and structural variations. - -Other notable changes in samtools: - - * Improved sorting order checking during indexing. - - * Improved region parsing. Colons in reference sequence names are parsed - properly. - - * Fixed an issue where mpileup does not apply BAQ for the first few reads when - a region is specified. - - * Fixed an issue where `faidx' does not work with FASTA files with long lines. - - * Bugfix: wrong SP genotype information in the BCF output. - -Other notable changes in bcftools: - - * Output the ML esitmate of the allele count. - - * Added the HWE plus F<0 filter to varFilter. For multiple samples, it - effectively filters false heterozygous calls around centromeres. - - * For association mapping, perform both 1-degree and 2-degree test. The - 2-degree test is conservative but more robust to HWE violation. - -(0.1.17: 6 July 2011, r973:277) - - - -Beta Release 0.1.16 (21 April, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes in samtools: - - * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. - - * When the output file of `samtools merge' exists, do not overwrite it unless - a new command-line option `-f' is applied. - - * Bugfix: BED support is not working when the input BED is not sorted. - - * Bugfix: some reads without coordinates but given on the reverse strand are - lost in merging. - -Notable changes in bcftools: - - * Code cleanup: separated max-likelihood inference and Bayesian inference. - - * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. - - * Provided another association test P-value by likelihood-ratio test. - - * Use Brent's method to estimate the site allele frequency when EM converges - slowly. The resulting ML estimate of allele frequnecy is more accurate. - - * Added the `ldpair' command, which computes r^2 between SNP pairs given in - an input file. - -Also, the `pileup' command, which has been deprecated by `mpileup' since -version 0.1.10, will be dropped in the next release. The old `pileup' command -is substandard and causing a lot of confusion. - -(0.1.16: 21 April 2011, r963:234) - - - -Beta Release 0.1.15 (10 April, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Noteable changes: - - * Allow to perform variant calling or to extract information in multiple - regions specified by a BED file (`samtools mpileup -l', `samtools view -L' - and `bcftools view -l'). - - * Added the `depth' command to samtools to compute the per-base depth with a - simpler interface. File `bam2depth.c', which implements this command, is the - recommended example on how to use the mpileup APIs. - - * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg - test using this estimate. - - * For `samtools view', when `-R' is specified, drop read groups in the header - that are not contained in the specified file. - - * For `samtools flagstat', separate QC-pass and QC-fail reads. - - * Improved the command line help of `samtools mpileup' and `bcftools view'. - - * Use a global variable to control the verbose level of samtools stderr - output. Nonetheless, it has not been full utilized. - - * Fixed an issue in association test which may report false associations, - possibly due to floating point underflow. - -(0.1.15: 10 April 2011, r949:203) - - - -Beta release 0.1.14 (21 March, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This release implements a method for testing associations for case-control -data. The method does not call genotypes but instead sums over all genotype -configurations to compute a chi^2 based test statistics. It can be potentially -applied to comparing a pair of samples (e.g. a tumor-normal pair), but this -has not been evaluated on real data. - -Another new feature is to make X chromosome variant calls when female and male -samples are both present. The user needs to provide a file indicating the -ploidy of each sample (see also manual bcftools/bcftools.1). - -Other notable changes: - - * Added `bcftools view -F' to parse BCF files generated by samtools r921 or - older which encodes PL in a different way. - - * Changed the behavior of `bcftools view -s'. Now when a list of samples is - provided, the samples in the output will be reordered to match the ordering - in the sample list. This change is mainly designed for association test. - - * Sped up `bcftools view -v' for target sequencing given thousands of samples. - Also added a new option `view -d' to skip loci where only a few samples are - covered by reads. - - * Dropped HWE test. This feature has never been implemented properly. An EM - should be much better. To be implemented in future. - - * Added the `cat' command to samtools. This command concatenate BAMs with - identical sequence dictionaries in an efficient way. Modified from bam_cat.c - written by Chris Saunders. - - * Added `samtools view -1' to write BAMs at a low compression level but twice - faster to create. The `sort' command generates temporary files at a low - compression level as well. - - * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality - strings (strictly speaking, such a file is not BAM). - - * Added `samtools mpileup -L' to skip INDEL calling in regions with - excessively high coverage. Such regions dramatically slow down mpileup. - - * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. - -(0.1.14: 21 March 2011, r933:170) - - - -Beta release 0.1.13 (1 March, 2011) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The most important though largely invisible modification is the change of the -order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF -spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF -generated by samtools older than r921 inclusive. VCF/BCF generated by the new -samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools -version number. - -Single Individual Haplotyping (SIH) is added as an experimental feature. It -originally aims to produce haploid consensus from fosmid pool sequencing, but -also works with short-read data. For short reads, phased blocks are usually too -short to be useful in many applications, but they can help to rule out part of -SNPs close to INDELs or between copies of CNVs. - - -Other notable changes in samtools: - - * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL - calling. This reduces the power but improves specificity. - - * Improved sorting order checking in indexing. Now indexing is the preferred way - to check if a BAM is sorted. - - * Added a switch `-E' to mpileup and calmd. This option uses an alternative way - to apply BAQ, which increases sensistivity, especially to MNPs, at the cost of - a little loss in specificity. - - * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling. - - * Added `mpileup -m' to allow fine control of the collection of INDEL candidates. - - * Added `mpileup -S' to compute per-sample strand bias P-value. - - * Added `mpileup -G' to exclude read groups in variant calling. - - * Fixed segfault in indel calling related to unmapped and refskip reads. - - * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL - genotypes for longer short INDELs, typically over 10bp. - - * Fixed a bug in tview on big-endian machines. - - * Fixed a very rare memory issue in bam_md.c - - * Fixed an out-of-boundary bug in mpileup when the read base is `N'. - - * Fixed a compiling error when the knetfile library is not used. Fixed a - library compiling error due to the lack of bam_nt16_nt4_table[] table. - Suppress a compiling warning related to the latest zlib. - - -Other notable changes in bcftools: - - * Updated the BCF spec. - - * Added the `FQ' VCF INFO field, which gives the phred-scaled probability - of all samples being the same (identical to the reference or all homozygous - variants). Option `view -f' has been dropped. - - * Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence - similar to "samtools.pl pileup2fq". - - * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF - spec. Drop bcf-fix.pl. - - * Output bcftools specific INFO and FORMAT in the VCF header. - - * Added `view -s' to call variants from a subset of samples. - - * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, - custom fields are still unparsed and will be stored as a missing value. - - * Fixed a minor bug in Fisher's exact test; the results are rarely changed. - - -(0.1.13: 1 March 2011, r926:134) - - - -Beta release 0.1.12a (2 December, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is another bug fix release: - - * Fixed a memory violation in mpileup, which causes segfault. Release - 0.1.9 and above are affected. - - * Fixed a memory violation in the indel caller, which does not causes - segfault, but may potentially affect deletion calls in an unexpected - way. Release 0.1.10 and above are affected. - - * Fixed a bug in computing r-square in bcftools. Few are using this - functionality and it only has minor effect. - - * Fixed a memory leak in bam_fetch(). - - * Fixed a bug in writing meta information to the BAM index for the last - sequence. This bug is invisible to most users, but it is a bug anyway. - - * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations. - -(0.1.12: 2 December 2010, r862) - - - -Beta release 0.1.11 (21 November, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This is mainly a bug fix release: - - * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads - are retrieved from a small region containing no reads. - - * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion - failure when the first CIGAR operation is a deletion. - - * Improved fault tolerence in remote access. - -One minor feature has been implemented in bcftools: - - * Added a reference-free variant calling mode. In this mode, a site is - regarded as a variat iff the sample(s) contains two or more alleles; - the meaning of the QUAL field in the VCF output is changed - accordingly. Effectively, the reference allele is irrelevant to the - result in the new mode, although the reference sequence has to be - used in realignment when SAMtools computes genotype likelihoods. - -In addition, since 0.1.10, the `pileup' command has been deprecated by -`mpileup' which is more powerful and more accurate. The `pileup' command -will not be removed in the next few releases, but new features will not -be added. - -(0.1.11: 21 November 2010, r851) - - - -Beta Release 0.1.10 (16 November, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This release is featured as the first major improvement to the indel -caller. The method is similar to the old one implemented in the pileup -command, but the details are handled more carefully both in theory and -in practice. As a result, the new indel caller usually gives more -accurate indel calls, though at the cost of sensitivity. The caller is -implemented in the mpileup command and is invoked by default. It works -with multiple samples. - -Other notable changes: - - * With the -r option, the calmd command writes the difference between - the original base quality and the BAQ capped base quality at the BQ - tag but does not modify the base quality. Please use -Ar to overwrite - the original base quality (the 0.1.9 behavior). - - * Allow to set a maximum per-sample read depth to reduce memory. In - 0.1.9, most of memory is wasted for the ultra high read depth in some - regions (e.g. the chr1 centromere). - - * Optionally write per-sample read depth and per-sample strand bias - P-value. - - * Compute equal-tail (Bayesian) credible interval of site allele - frequency at the CI95 VCF annotation. - - * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel - filtering. - -(0.1.10: 16 November 2010, r829) - - - -Beta Release 0.1.9 (27 October, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This release is featured as the first major improvement to the samtools' -SNP caller. It comes with a revised MAQ error model, the support of -multi-sample SNP calling and the computation of base alignment quality -(BAQ). - -The revised MAQ error model is based on the original model. It solves an -issue of miscalling SNPs in repetitive regions. Althought such SNPs can -usually be filtered at a later step, they mess up unfiltered calls. This -is a theoretical flaw in the original model. The revised MAQ model -deprecates the orginal MAQ model and the simplified SOAPsnp model. - -Multi-sample SNP calling is separated in two steps. The first is done by -samtools mpileup and the second by a new program, bcftools, which is -included in the samtools source code tree. Multi-sample SNP calling also -works for single sample and has the advantage of enabling more powerful -filtration. It is likely to deprecate pileup in future once a proper -indel calling method is implemented. - -BAQ is the Phred-scaled probability of a read base being wrongly -aligned. Capping base quality by BAQ has been shown to be very effective -in suppressing false SNPs caused by misalignments around indels or in -low-complexity regions with acceptable compromise on computation -time. This strategy is highly recommended and can be used with other SNP -callers as well. - -In addition to the three major improvements, other notable changes are: - - * Changes to the pileup format. A reference skip (the N CIGAR operator) - is shown as '<' or '>' depending on the strand. Tview is also changed - accordingly. - - * Accelerated pileup. The plain pileup is about 50% faster. - - * Regional merge. The merge command now accepts a new option to merge - files in a specified region. - - * Fixed a bug in bgzip and razip which causes source files to be - deleted even if option -c is applied. - - * In APIs, propogate errors to downstream callers and make samtools - return non-zero values once errors occur. - -(0.1.9: 27 October 2010, r783) - - - -Beta Release 0.1.8 (11 July, 2010) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable functional changes: - - * Added the `reheader' command which replaces a BAM header with a new - header. This command is much faster than replacing header by - BAM->SAM->BAM conversions. - - * Added the `mpileup' command which computes the pileup of multiple - alignments. - - * The `index' command now stores the number of mapped and unmapped - reads in the index file. This information can be retrieved quickly by - the new `idxstats' command. - - * By default, pileup used the SOAPsnp model for SNP calling. This - avoids the floating overflow in the MAQ model which leads to spurious - calls in repetitive regions, although these calls will be immediately - filtered by varFilter. - - * The `tview' command now correctly handles CIGARs like 7I10M and - 10M1P1I10M which cause assertion failure in earlier versions. - - * Tview accepts a region like `=10,000' where `=' stands for the - current sequence name. This saves typing for long sequence names. - - * Added the `-d' option to `pileup' which avoids slow indel calling - in ultradeep regions by subsampling reads locally. - - * Added the `-R' option to `view' which retrieves alignments in read - groups listed in the specified file. - -Performance improvements: - - * The BAM->SAM conversion is up to twice faster, depending on the - characteristic of the input. - - * Parsing SAM headers with a lot of reference sequences is now much - faster. - - * The number of lseek() calls per query is reduced when the query - region contains no read alignments. - -Bug fixes: - - * Fixed an issue in the indel caller that leads to miscall of indels. - Note that this solution may not work well when the sequencing indel - error rate is higher than the rate of SNPs. - - * Fixed another issue in the indel caller which may lead to incorrect - genotype. - - * Fixed a bug in `sort' when option `-o' is applied. - - * Fixed a bug in `view -r'. - -APIs and other changes: - - * Added iterator interfaces to random access and pileup. The callback - interfaces directly call the iterator interfaces. - - * The BGZF blocks holding the BAM header are indepedent of alignment - BGZF blocks. Alignment records shorter than 64kB is guaranteed to be - fully contained in one BGZF block. This change is fully compatible - with the old version of samtools/picard. - -Changes in other utilities: - - * Updated export2sam.pl by Chris Saunders. - - * Improved the sam2vcf.pl script. - - * Added a Python version of varfilter.py by Aylwyn Scally. - -(0.1.8: 11 July 2010, r613) - - - -Beta Release 0.1.7 (10 November, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes: - - * Improved the indel caller in complex scenariors, in particular for - long reads. The indel caller is now able to make reasonable indel - calls from Craig Venter capillary reads. - - * Rewrote single-end duplicate removal with improved - performance. Paired-end reads are not touched. - - * Duplicate removal is now library aware. Samtools remove potential - PCR/optical dupliates inside a library rather than across libraries. - - * SAM header is now fully parsed, although this functionality is not - used in merging and so on. - - * In samtools merge, optionally take the input file name as RG-ID and - attach the RG tag to each alignment. - - * Added FTP support in the RAZF library. RAZF-compressed reference - sequence can be retrieved remotely. - - * Improved network support for Win32. - - * Samtools sort and merge are now stable. - -Changes in other utilities: - - * Implemented sam2vcf.pl that converts the pileup format to the VCF - format. - - * This release of samtools is known to work with the latest - Bio-Samtools Perl module. - -(0.1.7: 10 November 2009, r510) - - - -Beta Release 0.1.6 (2 September, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes: - - * In tview, do not show a blank screen when no reads mapped to the - corresponding region. - - * Implemented native HTTP support in the BGZF library. Samtools is now - able to directly open a BAM file on HTTP. HTTP proxy is also - supported via the "http_proxy" environmental variable. - - * Samtools is now compitable with the MinGW (win32) compiler and the - PDCurses library. - - * The calmd (or fillmd) command now calculates the NM tag and replaces - MD tags if they are wrong. - - * The view command now recognizes and optionally prints FLAG in HEXs or - strings to make a SAM file more friendly to human eyes. This is a - samtools-C extension, not implemented in Picard for the time - being. Please type `samtools view -?' for more information. - - * BAM files now have an end-of-file (EOF) marker to facilitate - truncation detection. A warning will be given if an on-disk BAM file - does not have this marker. The warning will be seen on BAM files - generated by an older version of samtools. It does NO harm. - - * New key bindings in tview: `r' to show read names and `s' to show - reference skip (N operation) as deletions. - - * Fixed a bug in `samtools merge -n'. - - * Samtools merge now optionally copies the header of a user specified - SAM file to the resultant BAM output. - - * Samtools pileup/tview works with a CIGAR with the first or the last - operation is an indel. - - * Fixed a bug in bam_aux_get(). - - -Changes in other utilies: - - * Fixed wrong FLAG in maq2sam. - - -(0.1.6: 2 September 2009, r453) - - - -Beta Release 0.1.5 (7 July, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes: - - * Support opening a BAM alignment on FTP. Users can now use "tview" to - view alignments at the NCBI ftp site. Please read manual for more - information. - - * In library, propagate errors rather than exit or complain assertion - failure. - - * Simplified the building system and fixed compiling errors caused by - zlib<1.2.2.1. - - * Fixed an issue about lost header information when a SAM is imported - with "view -t". - - * Implemented "samtool.pl varFilter" which filters both SNPs and short - indels. This command replaces "indelFilter". - - * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from - pileup output. - - * In pileup, cap mapping quality at 60. This helps filtering when - different aligners are in use. - - * In pileup, allow to output variant sites only. - - * Made pileup generate correct calls in repetitive region. At the same - time, I am considering to implement a simplified model in SOAPsnp, - although this has not happened yet. - - * In view, added '-u' option to output BAM without compression. This - option is preferred when the output is piped to other commands. - - * In view, added '-l' and '-r' to get the alignments for one library or - read group. The "@RG" header lines are now partially parsed. - - * Do not include command line utilities to libbam.a. - - * Fixed memory leaks in pileup and bam_view1(). - - * Made faidx more tolerant to empty lines right before or after FASTA > - lines. - - -Changes in other utilities: - - * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign. - - -This release involves several modifications to the key code base which -may potentially introduce new bugs even though we have tried to minimize -this by testing on several examples. Please let us know if you catch -bugs. - -(0.1.5: 7 July 2009, r373) - - - -Beta Release 0.1.4 (21 May, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes: - - * Added the 'rmdupse' command: removing duplicates for SE reads. - - * Fixed a critical bug in the indel caller: clipped alignments are not - processed correctly. - - * Fixed a bug in the tview: gapped alignment may be incorrectly - displayed. - - * Unified the interface to BAM and SAM I/O. This is done by - implementing a wrapper on top of the old APIs and therefore old APIs - are still valid. The new I/O APIs also recognize the @SQ header - lines. - - * Generate the MD tag. - - * Generate "=" bases. However, the indel caller will not work when "=" - bases are present. - - * Enhanced support of color-read display (by Nils Homer). - - * Implemented the GNU building system. However, currently the building - system does not generate libbam.a. We will improve this later. For - the time being, `make -f Makefile.generic' is preferred. - - * Fixed a minor bug in pileup: the first read in a chromosome may be - skipped. - - * Fixed bugs in bam_aux.c. These bugs do not affect other components as - they were not used previously. - - * Output the 'SM' tag from maq2sam. - -(0.1.4: 21 May 2009, r297) - - - -Beta Release 0.1.3 (15 April, 2009) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes in SAMtools: - - * SAMtools is more consistent with the specification: a) '*' in the - QUAL field is allowed; b) the field separator is TAB only and SPACE - is treated as a character in a field; c) empty header is allowed. - - * Implemented GLFv3 support in pileup. - - * Fixed a severe bug in fixmate: strand information is wrongly - overwritten. - - * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are - not correctly retrieved sometimes. - - * Fixed a bug in rmdup: segfault if unmapped reads are present. - - * Move indel_filter.pl to samtools.pl and improved the filtering by - checking the actual number of alignments containing indels. The indel - pileup line is also changed a little to make this filtration easier. - - * Fixed a minor bug in indexing: the bin number of an unmapped read is - wrongly calculated. - - * Added `flagstat' command to show statistics on the FLAG field. - - * Improved indel caller by setting the maximum window size in local - realignment. - -Changes in other utilities: - - * Fixed a bug in maq2sam: a tag name is obsolete. - - * Improvement to wgsim: a) added support for SOLiD read simulation; b) - show the number of substitutions/indels/errors in read name; c) - considerable code clean up. - - * Various converters: improved functionality in general. - - * Updated the example SAM due to the previous bug in fixmate. - -(0.1.3: 15 April 2009, r227) - - - -Beta Release 0.1.2 (28 January, 2008) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Notable changes in SAMtools: - - * Implemented a Bayesian indel caller. The new caller generate scores - and genotype and is potentially more accurate than Maq's indel - caller. The pileup format is also changed accordingly. - - * Implemented rmdup command: remove potential PCR duplicates. Note that - this command ONLY works for FR orientation and requires ISIZE is - correctly set. - - * Added fixmate command: fill in mate coordinates, ISIZE and mate - related flags from a name-sorted alignment. - - * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved. - - * Allow to select reads shown in the pileup output with a mask. - - * Generate GLFv2 from pileup. - - * Added two more flags for flagging PCR/optical duplicates and for QC - failure. - - * Fixed a bug in sort command: name sorting for large alignment did not - work. - - * Allow to completely disable RAZF (using Makefile.lite) as some people - have problem to compile it. - - * Fixed a bug in import command when there are reads without - coordinates. - - * Fixed a bug in tview: clipping broke the alignment viewer. - - * Fixed a compiling error when _NO_CURSES is applied. - - * Fixed a bug in merge command. - -Changes in other utilities: - - * Added wgsim, a paired-end reads simulator. Wgsim was adapted from - maq's reads simulator. Colin Hercus further improved it to allow - longer indels. - - * Added wgsim_eval.pl, a script that evaluates the accuracy of - alignment on reads generated by wgsim. - - * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not - work properly when multiple hits are output. - - * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will - be retained when multiple hits are present. - - * Fixed a bug in export2sam.pl for QC reads. - - * Support RG tag at MAQ->SAM converter. - - * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and - indel are not properly handled, though. - - * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the - default Illumina output. - -(0.1.2: 28 January 2008; r116) - - - -Beta Release 0.1.1 (22 December, 2008) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The is the first public release of samtools. For more information, -please check the manual page `samtools.1' and the samtools website -http://samtools.sourceforge.net diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/phase.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/phase.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/phase.c 2016-02-14 18:21:17.767079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/phase.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,687 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "bam.h" -#include "errmod.h" - -#include "kseq.h" -KSTREAM_INIT(gzFile, gzread, 16384) - -#define MAX_VARS 256 -#define FLIP_PENALTY 2 -#define FLIP_THRES 4 -#define MASK_THRES 3 - -#define FLAG_FIX_CHIMERA 0x1 -#define FLAG_LIST_EXCL 0x4 -#define FLAG_DROP_AMBI 0x8 - -typedef struct { - // configurations, initialized in the main function - int flag, k, min_baseQ, min_varLOD, max_depth; - // other global variables - int vpos_shift; - bamFile fp; - char *pre; - bamFile out[3]; - // alignment queue - int n, m; - bam1_t **b; -} phaseg_t; - -typedef struct { - int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! - int vpos, beg, end; - uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; - uint32_t in:16, out:16; // in-phase and out-phase -} frag_t, *frag_p; - -#define rseq_lt(a,b) ((a)->vpos < (b)->vpos) - -#include "khash.h" -KHASH_SET_INIT_INT64(set64) -KHASH_MAP_INIT_INT64(64, frag_t) - -typedef khash_t(64) nseq_t; - -#include "ksort.h" -KSORT_INIT(rseq, frag_p, rseq_lt) - -static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; - -static inline uint64_t X31_hash_string(const char *s) -{ - uint64_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; - return h; -} - -static void count1(int l, const uint8_t *seq, int *cnt) -{ - int i, j, n_ambi; - uint32_t z, x; - if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous - for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases - if (seq[i] == 0) ++n_ambi; - if (l - n_ambi <= 1) return; // only one SNP - for (x = 0; x < 1u<>j&1; - ++j; - } - z = z<<1 | c; - } - ++cnt[z]; - } -} - -static int **count_all(int l, int vpos, nseq_t *hash) -{ - khint_t k; - int i, j, **cnt; - uint8_t *seq; - seq = calloc(l, 1); - cnt = calloc(vpos, sizeof(void*)); - for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<vpos >= vpos || f->single) continue; // out of region; or singleton - if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right - f->single = 1; - continue; - } - for (j = 1; j < f->vlen; ++j) { - for (i = 0; i < l; ++i) - seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)]; - count1(l, seq, cnt[f->vpos + j]); - } - } - } - free(seq); - return cnt; -} - -// phasing -static int8_t *dynaprog(int l, int vpos, int **w) -{ - int *f[2], *curr, *prev, max, i; - int8_t **b, *h = 0; - uint32_t x, z = 1u<<(l-1), mask = (1u<>1; y1 = xc>>1; - c0 = prev[y0] + wi[x] + wi[xc]; - c1 = prev[y1] + wi[x] + wi[xc]; - if (c0 > c1) bi[x] = 0, curr[x] = c0; - else bi[x] = 1, curr[x] = c1; - } - tmp = prev; prev = curr; curr = tmp; // swap - } - { // backtrack - uint32_t max_x = 0; - int which = 0; - h = calloc(vpos, 1); - for (x = 0, max = 0, max_x = 0; x < z; ++x) - if (prev[x] > max) max = prev[x], max_x = x; - for (i = vpos - 1, x = max_x; i >= 0; --i) { - h[i] = which? (~x&1) : (x&1); - which = b[i][x]? !which : which; - x = b[i][x]? (~x&mask)>>1 : x>>1; - } - } - // free - for (i = 0; i < vpos; ++i) free(b[i]); - free(f[0]); free(f[1]); free(b); - return h; -} - -// phase each fragment -static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) -{ - khint_t k; - uint64_t *pcnt; - uint32_t *left, *rght, max; - left = rght = 0; max = 0; - pcnt = calloc(vpos, 8); - for (k = 0; k < kh_end(hash); ++k) { - if (kh_exist(hash, k)) { - int i, c[2]; - frag_t *f = &kh_val(hash, k); - if (f->vpos >= vpos) continue; - // get the phase - c[0] = c[1] = 0; - for (i = 0; i < f->vlen; ++i) { - if (f->seq[i] == 0) continue; - ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; - } - f->phase = c[0] > c[1]? 0 : 1; - f->in = c[f->phase]; f->out = c[1 - f->phase]; - f->phased = f->in == f->out? 0 : 1; - f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0; - // fix chimera - f->flip = 0; - if (flip && c[0] >= 3 && c[1] >= 3) { - int sum[2], m, mi, md; - if (f->vlen > max) { // enlarge the array - max = f->vlen; - kroundup32(max); - left = realloc(left, max * 4); - rght = realloc(rght, max * 4); - } - for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts - if (f->seq[i]) { - int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; - ++sum[c == path[f->vpos + i]? 0 : 1]; - } - left[i] = sum[1]<<16 | sum[0]; - } - for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts - if (f->seq[i]) { - int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; - ++sum[c == path[f->vpos + i]? 0 : 1]; - } - rght[i] = sum[1]<<16 | sum[0]; - } - // find the best flip point - for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) { - int a[2]; - a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY; - a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY; - if (a[0] > a[1]) { - if (a[0] > m) m = a[0], md = 0, mi = i; - } else { - if (a[1] > m) m = a[1], md = 1, mi = i; - } - } - if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip - f->flip = 1; - if (md == 0) { // flip the tail - for (i = mi + 1; i < f->vlen; ++i) - if (f->seq[i] == 1) f->seq[i] = 2; - else if (f->seq[i] == 2) f->seq[i] = 1; - } else { // flip the head - for (i = 0; i <= mi; ++i) - if (f->seq[i] == 1) f->seq[i] = 2; - else if (f->seq[i] == 2) f->seq[i] = 1; - } - } - } - // update pcnt[] - if (!f->single) { - for (i = 0; i < f->vlen; ++i) { - int c; - if (f->seq[i] == 0) continue; - c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; - if (c == path[f->vpos + i]) { - if (f->phase == 0) ++pcnt[f->vpos + i]; - else pcnt[f->vpos + i] += 1ull<<32; - } else { - if (f->phase == 0) pcnt[f->vpos + i] += 1<<16; - else pcnt[f->vpos + i] += 1ull<<48; - } - } - } - } - } - free(left); free(rght); - return pcnt; -} - -static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n) -{ - int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; - uint64_t *list = 0; - for (i = 0; i < vpos; ++i) { - uint64_t x = pcnt[i]; - int c[4], pre = score, s; - c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; - s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); - if (c[3] > c[2]) s += c[3] - c[2]; - if (c[1] > c[0]) s += c[1] - c[0]; - score += s; - if (score < 0) score = 0; - if (pre == 0 && score > 0) beg = i; // change from zero to non-zero - if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { - if (n == m) { - m = m? m<<1 : 4; - list = realloc(list, m * 8); - } - list[n++] = (uint64_t)beg<<32 | max_i; - i = max_i; // reset i to max_i - score = 0; - } else if (score > max) max = score, max_i = i; - if (score == 0) max = 0; - } - *_n = n; - return list; -} - -// trim heading and tailing ambiguous bases; mark deleted and remove sequence -static int clean_seqs(int vpos, nseq_t *hash) -{ - khint_t k; - int ret = 0; - for (k = 0; k < kh_end(hash); ++k) { - if (kh_exist(hash, k)) { - frag_t *f = &kh_val(hash, k); - int beg, end, i; - if (f->vpos >= vpos) { - ret = 1; - continue; - } - for (i = 0; i < f->vlen; ++i) - if (f->seq[i] != 0) break; - beg = i; - for (i = f->vlen - 1; i >= 0; --i) - if (f->seq[i] != 0) break; - end = i + 1; - if (end - beg <= 0) kh_del(64, hash, k); - else { - if (beg != 0) memmove(f->seq, f->seq + beg, end - beg); - f->vpos += beg; f->vlen = end - beg; - f->single = f->vlen == 1? 1 : 0; - } - } - } - return ret; -} - -static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) -{ - int i, is_flip, drop_ambi; - drop_ambi = g->flag & FLAG_DROP_AMBI; - is_flip = (drand48() < 0.5); - for (i = 0; i < g->n; ++i) { - int end, which; - uint64_t key; - khint_t k; - bam1_t *b = g->b[i]; - key = X31_hash_string(bam1_qname(b)); - end = bam_calend(&b->core, bam1_cigar(b)); - if (end > min_pos) break; - k = kh_get(64, hash, key); - if (k == kh_end(hash)) which = 3; - else { - frag_t *f = &kh_val(hash, k); - if (f->ambig) which = drop_ambi? 2 : 3; - else if (f->phased && f->flip) which = 2; - else if (f->phased == 0) which = 3; - else { // phased and not flipped - char c = 'Y'; - which = f->phase; - bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c); - } - if (which < 2 && is_flip) which = 1 - which; // increase the randomness - } - if (which == 3) which = (drand48() < 0.5); - bam_write1(g->out[which], b); - bam_destroy1(b); - g->b[i] = 0; - } - memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); - g->n -= i; -} - -static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) -{ - int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos; - khint_t k; - frag_t **seqs; - int8_t *path, *sitemask; - uint64_t *pcnt, *regmask; - - if (vpos == 0) return 0; - i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos - min_pos = i? cns[vpos]>>32 : 0x7fffffff; - if (vpos == 1) { - printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); - printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, - "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); - for (k = 0; k < kh_end(hash); ++k) { - if (kh_exist(hash, k)) { - frag_t *f = &kh_val(hash, k); - if (f->vpos) continue; - f->flip = 0; - if (f->seq[0] == 0) f->phased = 0; - else f->phased = 1, f->phase = f->seq[0] - 1; - } - } - dump_aln(g, min_pos, hash); - ++g->vpos_shift; - return 1; - } - { // phase - int **cnt; - uint64_t *mask; - printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); - sitemask = calloc(vpos, 1); - cnt = count_all(g->k, vpos, hash); - path = dynaprog(g->k, vpos, cnt); - for (i = 0; i < vpos; ++i) free(cnt[i]); - free(cnt); - pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking - mask = genmask(vpos, pcnt, &n_masked); - regmask = calloc(n_masked, 8); - for (i = 0; i < n_masked; ++i) { - regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; - for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j) - sitemask[j] = 1; - } - free(mask); - if (g->flag & FLAG_FIX_CHIMERA) { - free(pcnt); - pcnt = fragphase(vpos, path, hash, 1); - } - } - for (i = 0; i < n_masked; ++i) - printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); - for (i = 0; i < vpos; ++i) { - uint64_t x = pcnt[i]; - int8_t c[2]; - c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); - c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); - printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], - i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); - } - free(path); free(pcnt); free(regmask); free(sitemask); - seqs = calloc(n_seqs, sizeof(void*)); - for (k = 0, i = 0; k < kh_end(hash); ++k) - if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single) - seqs[i++] = &kh_val(hash, k); - n_seqs = i; - ks_introsort_rseq(n_seqs, seqs); - for (i = 0; i < n_seqs; ++i) { - frag_t *f = seqs[i]; - printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); - for (j = 0; j < f->vlen; ++j) { - uint32_t c = cns[f->vpos + j]; - if (f->seq[j] == 0) putchar('N'); - else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]); - } - printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); - } - free(seqs); - printf("//\n"); - fflush(stdout); - g->vpos_shift += vpos; - dump_aln(g, min_pos, hash); - return vpos; -} - -static void update_vpos(int vpos, nseq_t *hash) -{ - khint_t k; - for (k = 0; k < kh_end(hash); ++k) { - if (kh_exist(hash, k)) { - frag_t *f = &kh_val(hash, k); - if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it - else f->vpos -= vpos; - } - } -} - -static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement -{ - return hash; -} - -static int readaln(void *data, bam1_t *b) -{ - phaseg_t *g = (phaseg_t*)data; - int ret; - ret = bam_read1(g->fp, b); - if (ret < 0) return ret; - if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { - if (g->n == g->m) { - g->m = g->m? g->m<<1 : 16; - g->b = realloc(g->b, g->m * sizeof(void*)); - } - g->b[g->n++] = bam_dup1(b); - } - return ret; -} - -static khash_t(set64) *loadpos(const char *fn, bam_header_t *h) -{ - gzFile fp; - kstream_t *ks; - int ret, dret; - kstring_t *str; - khash_t(set64) *hash; - - hash = kh_init(set64); - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bam_get_tid(h, str->s); - if (tid >= 0 && dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) >= 0) { - uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); - kh_put(set64, hash, x, &ret); - } else break; - } - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (dret < 0) break; - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return hash; -} - -static int gl2cns(float q[16]) -{ - int i, j, min_ij; - float min, min2; - min = min2 = 1e30; min_ij = -1; - for (i = 0; i < 4; ++i) { - for (j = i; j < 4; ++j) { - if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j]; - else if (q[i<<2|j] < min2) min2 = q[i<<2|j]; - } - } - return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; -} - -int main_phase(int argc, char *argv[]) -{ - extern void bam_init_header_hash(bam_header_t *header); - int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0; - const bam_pileup1_t *plp; - bam_plp_t iter; - bam_header_t *h; - nseq_t *seqs; - uint64_t *cns = 0; - phaseg_t g; - char *fn_list = 0; - khash_t(set64) *set = 0; - errmod_t *em; - uint16_t *bases; - - memset(&g, 0, sizeof(phaseg_t)); - g.flag = FLAG_FIX_CHIMERA; - g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; - while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) { - switch (c) { - case 'D': g.max_depth = atoi(optarg); break; - case 'q': g.min_varLOD = atoi(optarg); break; - case 'Q': g.min_baseQ = atoi(optarg); break; - case 'k': g.k = atoi(optarg); break; - case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; - case 'e': g.flag |= FLAG_LIST_EXCL; break; - case 'A': g.flag |= FLAG_DROP_AMBI; break; - case 'b': g.pre = strdup(optarg); break; - case 'l': fn_list = strdup(optarg); break; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools phase [options] \n\n"); - fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); - fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); - fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); - fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); - fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); -// fprintf(stderr, " -l FILE list of sites to phase [null]\n"); - fprintf(stderr, " -F do not attempt to fix chimeras\n"); - fprintf(stderr, " -A drop reads with ambiguous phase\n"); -// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); - fprintf(stderr, "\n"); - return 1; - } - g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); - h = bam_header_read(g.fp); - if (fn_list) { // read the list of sites to phase - bam_init_header_hash(h); - set = loadpos(fn_list, h); - free(fn_list); - } else g.flag &= ~FLAG_LIST_EXCL; - if (g.pre) { // open BAMs to write - char *s = malloc(strlen(g.pre) + 20); - strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w"); - strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w"); - strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w"); - for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h); - free(s); - } - - iter = bam_plp_init(readaln, &g); - g.vpos_shift = 0; - seqs = kh_init(64); - em = errmod_init(1. - 0.83); - bases = calloc(g.max_depth, 2); - printf("CC\n"); - printf("CC\tDescriptions:\nCC\n"); - printf("CC\t CC comments\n"); - printf("CC\t PS start of a phase set\n"); - printf("CC\t FL filtered region\n"); - printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); - printf("CC\t EV supporting reads; SAM format\n"); - printf("CC\t // end of a phase set\nCC\n"); - printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); - printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); - printf("CC\t FL chr filterStart filterEnd\n"); - printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); - printf("CC\nCC\n"); - fflush(stdout); - while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { - int i, k, c, tmp, dophase = 1, in_set = 0; - float q[16]; - if (tid < 0) break; - if (tid != lasttid) { // change of chromosome - g.vpos_shift = 0; - if (lasttid >= 0) { - seqs = shrink_hash(seqs); - phase(&g, h->target_name[lasttid], vpos, cns, seqs); - update_vpos(0x7fffffff, seqs); - } - lasttid = tid; - vpos = 0; - } - if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1; - if (n > g.max_depth) continue; // do not proceed if the depth is too high - // fill the bases array and check if there is a variant - for (i = k = 0; i < n; ++i) { - const bam_pileup1_t *p = plp + i; - uint8_t *seq; - int q, baseQ, b; - if (p->is_del || p->is_refskip) continue; - baseQ = bam1_qual(p->b)[p->qpos]; - if (baseQ < g.min_baseQ) continue; - seq = bam1_seq(p->b); - b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; - if (b > 3) continue; - q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; - if (q < 4) q = 4; - if (q > 63) q = 63; - bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; - } - if (k == 0) continue; - errmod_cal(em, k, 4, bases, q); // compute genotype likelihood - c = gl2cns(q); // get the consensus - // tell if to proceed - if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list - if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant - // add the variant - if (vpos == max_vpos) { - max_vpos = max_vpos? max_vpos<<1 : 128; - cns = realloc(cns, max_vpos * 8); - } - cns[vpos] = (uint64_t)pos<<32 | c; - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = plp + i; - uint64_t key; - khint_t k; - uint8_t *seq = bam1_seq(p->b); - frag_t *f; - if (p->is_del || p->is_refskip) continue; - if (p->b->core.qual == 0) continue; - // get the base code - c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)]; - if (c == (cns[vpos]&3)) c = 1; - else if (c == (cns[vpos]>>16&3)) c = 2; - else c = 0; - // write to seqs - key = X31_hash_string(bam1_qname(p->b)); - k = kh_put(64, seqs, key, &tmp); - f = &kh_val(seqs, k); - if (tmp == 0) { // present in the hash table - if (vpos - f->vpos + 1 < MAX_VARS) { - f->vlen = vpos - f->vpos + 1; - f->seq[f->vlen-1] = c; - f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); - } - dophase = 0; - } else { // absent - memset(f->seq, 0, MAX_VARS); - f->beg = p->b->core.pos; - f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); - f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0; - } - } - if (dophase) { - seqs = shrink_hash(seqs); - phase(&g, h->target_name[tid], vpos, cns, seqs); - update_vpos(vpos, seqs); - cns[0] = cns[vpos]; - vpos = 0; - } - ++vpos; - } - if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); - bam_header_destroy(h); - bam_plp_destroy(iter); - bam_close(g.fp); - kh_destroy(64, seqs); - kh_destroy(set64, set); - free(cns); - errmod_destroy(em); - free(bases); - if (g.pre) { - for (c = 0; c <= 2; ++c) bam_close(g.out[c]); - free(g.pre); free(g.b); - } - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/razf.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/razf.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/razf.c 2016-02-14 18:21:17.768079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/razf.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,853 +0,0 @@ -/* - * RAZF : Random Access compressed(Z) File - * Version: 1.0 - * Release Date: 2008-10-27 - * - * Copyright 2008, Jue Ruan , Heng Li - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _NO_RAZF - -#include -#include -#include -#include -#include -#include "razf.h" - - -#if ZLIB_VERNUM < 0x1221 -struct _gz_header_s { - int text; - uLong time; - int xflags; - int os; - Bytef *extra; - uInt extra_len; - uInt extra_max; - Bytef *name; - uInt name_max; - Bytef *comment; - uInt comm_max; - int hcrc; - int done; -}; -#warning "zlib < 1.2.2.1; RAZF writing is disabled." -#endif - -#define DEF_MEM_LEVEL 8 - -static inline uint32_t byte_swap_4(uint32_t v){ - v = ((v & 0x0000FFFFU) << 16) | (v >> 16); - return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); -} - -static inline uint64_t byte_swap_8(uint64_t v){ - v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); - v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); - return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); -} - -static inline int is_big_endian(){ - int x = 0x01; - char *c = (char*)&x; - return (c[0] != 0x01); -} - -#ifndef _RZ_READONLY -static void add_zindex(RAZF *rz, int64_t in, int64_t out){ - if(rz->index->size == rz->index->cap){ - rz->index->cap = rz->index->cap * 1.5 + 2; - rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap); - rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1)); - } - if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; - rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; - rz->index->size ++; -} - -static void save_zindex(RAZF *rz, int fd){ - int32_t i, v32; - int is_be; - is_be = is_big_endian(); - if(is_be) write(fd, &rz->index->size, sizeof(int)); - else { - v32 = byte_swap_4((uint32_t)rz->index->size); - write(fd, &v32, sizeof(uint32_t)); - } - v32 = rz->index->size / RZ_BIN_SIZE + 1; - if(!is_be){ - for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); - for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); - } - write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); - write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size); -} -#endif - -#ifdef _USE_KNETFILE -static void load_zindex(RAZF *rz, knetFile *fp){ -#else -static void load_zindex(RAZF *rz, int fd){ -#endif - int32_t i, v32; - int is_be; - if(!rz->load_index) return; - if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex)); - is_be = is_big_endian(); -#ifdef _USE_KNETFILE - knet_read(fp, &rz->index->size, sizeof(int)); -#else - read(fd, &rz->index->size, sizeof(int)); -#endif - if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); - rz->index->cap = rz->index->size; - v32 = rz->index->size / RZ_BIN_SIZE + 1; - rz->index->bin_offsets = malloc(sizeof(int64_t) * v32); -#ifdef _USE_KNETFILE - knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32); -#else - read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); -#endif - rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size); -#ifdef _USE_KNETFILE - knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size); -#else - read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); -#endif - if(!is_be){ - for(i=0;iindex->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); - for(i=0;iindex->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); - } -} - -#ifdef _RZ_READONLY -static RAZF* razf_open_w(int fd) -{ - fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n"); - return 0; -} -#else -static RAZF* razf_open_w(int fd){ - RAZF *rz; -#ifdef _WIN32 - setmode(fd, O_BINARY); -#endif - rz = calloc(1, sizeof(RAZF)); - rz->mode = 'w'; -#ifdef _USE_KNETFILE - rz->x.fpw = fd; -#else - rz->filedes = fd; -#endif - rz->stream = calloc(sizeof(z_stream), 1); - rz->inbuf = malloc(RZ_BUFFER_SIZE); - rz->outbuf = malloc(RZ_BUFFER_SIZE); - rz->index = calloc(sizeof(ZBlockIndex), 1); - deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - rz->header = calloc(sizeof(gz_header), 1); - rz->header->os = 0x03; //Unix - rz->header->text = 0; - rz->header->time = 0; - rz->header->extra = malloc(7); - strncpy((char*)rz->header->extra, "RAZF", 4); - rz->header->extra[4] = 1; // obsolete field - // block size = RZ_BLOCK_SIZE, Big-Endian - rz->header->extra[5] = RZ_BLOCK_SIZE >> 8; - rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF; - rz->header->extra_len = 7; - rz->header->name = rz->header->comment = 0; - rz->header->hcrc = 0; - deflateSetHeader(rz->stream, rz->header); - rz->block_pos = rz->block_off = 0; - return rz; -} - -static void _razf_write(RAZF* rz, const void *data, int size){ - int tout; - rz->stream->avail_in = size; - rz->stream->next_in = (void*)data; - while(1){ - tout = rz->stream->avail_out; - deflate(rz->stream, Z_NO_FLUSH); - rz->out += tout - rz->stream->avail_out; - if(rz->stream->avail_out) break; -#ifdef _USE_KNETFILE - write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#else - write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#endif - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - if(rz->stream->avail_in == 0) break; - }; - rz->in += size - rz->stream->avail_in; - rz->block_off += size - rz->stream->avail_in; -} - -static void razf_flush(RAZF *rz){ - uint32_t tout; - if(rz->buf_len){ - _razf_write(rz, rz->inbuf, rz->buf_len); - rz->buf_off = rz->buf_len = 0; - } - if(rz->stream->avail_out){ -#ifdef _USE_KNETFILE - write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#else - write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#endif - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - } - while(1){ - tout = rz->stream->avail_out; - deflate(rz->stream, Z_FULL_FLUSH); - rz->out += tout - rz->stream->avail_out; - if(rz->stream->avail_out == 0){ -#ifdef _USE_KNETFILE - write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#else - write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#endif - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - } else break; - } - rz->block_pos = rz->out; - rz->block_off = 0; -} - -static void razf_end_flush(RAZF *rz){ - uint32_t tout; - if(rz->buf_len){ - _razf_write(rz, rz->inbuf, rz->buf_len); - rz->buf_off = rz->buf_len = 0; - } - while(1){ - tout = rz->stream->avail_out; - deflate(rz->stream, Z_FINISH); - rz->out += tout - rz->stream->avail_out; - if(rz->stream->avail_out < RZ_BUFFER_SIZE){ -#ifdef _USE_KNETFILE - write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#else - write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); -#endif - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - } else break; - } -} - -static void _razf_buffered_write(RAZF *rz, const void *data, int size){ - int i, n; - while(1){ - if(rz->buf_len == RZ_BUFFER_SIZE){ - _razf_write(rz, rz->inbuf, rz->buf_len); - rz->buf_len = 0; - } - if(size + rz->buf_len < RZ_BUFFER_SIZE){ - for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; - rz->buf_len += size; - return; - } else { - n = RZ_BUFFER_SIZE - rz->buf_len; - for(i=0;iinbuf + rz->buf_len)[i] = ((char*)data)[i]; - size -= n; - data += n; - rz->buf_len += n; - } - } -} - -int razf_write(RAZF* rz, const void *data, int size){ - int ori_size, n; - int64_t next_block; - ori_size = size; - next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; - while(rz->in + rz->buf_len + size >= next_block){ - n = next_block - rz->in - rz->buf_len; - _razf_buffered_write(rz, data, n); - data += n; - size -= n; - razf_flush(rz); - add_zindex(rz, rz->in, rz->out); - next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; - } - _razf_buffered_write(rz, data, size); - return ori_size; -} -#endif - -/* gzip flag byte */ -#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */ -#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */ -#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ -#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ -#define COMMENT 0x10 /* bit 4 set: file comment present */ -#define RESERVED 0xE0 /* bits 5..7: reserved */ - -static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ - int method, flags, n, len; - if(size < 2) return 0; - if(data[0] != 0x1f || data[1] != 0x8b) return 0; - if(size < 4) return 0; - method = data[2]; - flags = data[3]; - if(method != Z_DEFLATED || (flags & RESERVED)) return 0; - n = 4 + 6; // Skip 6 bytes - *extra_off = n + 2; - *extra_len = 0; - if(flags & EXTRA_FIELD){ - if(size < n + 2) return 0; - len = ((int)data[n + 1] << 8) | data[n]; - n += 2; - *extra_off = n; - while(len){ - if(n >= size) return 0; - n ++; - len --; - } - *extra_len = n - (*extra_off); - } - if(flags & ORIG_NAME) while(n < size && data[n++]); - if(flags & COMMENT) while(n < size && data[n++]); - if(flags & HEAD_CRC){ - if(n + 2 > size) return 0; - n += 2; - } - return n; -} - -#ifdef _USE_KNETFILE -static RAZF* razf_open_r(knetFile *fp, int _load_index){ -#else -static RAZF* razf_open_r(int fd, int _load_index){ -#endif - RAZF *rz; - int ext_off, ext_len; - int n, is_be, ret; - int64_t end; - unsigned char c[] = "RAZF"; - rz = calloc(1, sizeof(RAZF)); - rz->mode = 'r'; -#ifdef _USE_KNETFILE - rz->x.fpr = fp; -#else -#ifdef _WIN32 - setmode(fd, O_BINARY); -#endif - rz->filedes = fd; -#endif - rz->stream = calloc(sizeof(z_stream), 1); - rz->inbuf = malloc(RZ_BUFFER_SIZE); - rz->outbuf = malloc(RZ_BUFFER_SIZE); - rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL; -#ifdef _USE_KNETFILE - n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); -#else - n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); -#endif - ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len); - if(ret == 0){ - PLAIN_FILE: - rz->in = n; - rz->file_type = FILE_TYPE_PLAIN; - memcpy(rz->outbuf, rz->inbuf, n); - rz->buf_len = n; - free(rz->stream); - rz->stream = NULL; - return rz; - } - rz->header_size = ret; - ret = inflateInit2(rz->stream, -WINDOW_BITS); - if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;} - rz->stream->avail_in = n - rz->header_size; - rz->stream->next_in = rz->inbuf + rz->header_size; - rz->stream->avail_out = RZ_BUFFER_SIZE; - rz->stream->next_out = rz->outbuf; - rz->file_type = FILE_TYPE_GZ; - rz->in = rz->header_size; - rz->block_pos = rz->header_size; - rz->next_block_pos = rz->header_size; - rz->block_off = 0; - if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; - if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){ - fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__); - return rz; - } - rz->load_index = _load_index; - rz->file_type = FILE_TYPE_RZ; -#ifdef _USE_KNETFILE - if(knet_seek(fp, -16, SEEK_END) == -1){ -#else - if(lseek(fd, -16, SEEK_END) == -1){ -#endif - UNSEEKABLE: - rz->seekable = 0; - rz->index = NULL; - rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL; - } else { - is_be = is_big_endian(); - rz->seekable = 1; -#ifdef _USE_KNETFILE - knet_read(fp, &end, sizeof(int64_t)); -#else - read(fd, &end, sizeof(int64_t)); -#endif - if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); - else rz->src_end = end; - -#ifdef _USE_KNETFILE - knet_read(fp, &end, sizeof(int64_t)); -#else - read(fd, &end, sizeof(int64_t)); -#endif - if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end); - else rz->end = end; - if(n > rz->end){ - rz->stream->avail_in -= n - rz->end; - n = rz->end; - } - if(rz->end > rz->src_end){ -#ifdef _USE_KNETFILE - knet_seek(fp, rz->in, SEEK_SET); -#else - lseek(fd, rz->in, SEEK_SET); -#endif - goto UNSEEKABLE; - } -#ifdef _USE_KNETFILE - knet_seek(fp, rz->end, SEEK_SET); - if(knet_tell(fp) != rz->end){ - knet_seek(fp, rz->in, SEEK_SET); -#else - if(lseek(fd, rz->end, SEEK_SET) != rz->end){ - lseek(fd, rz->in, SEEK_SET); -#endif - goto UNSEEKABLE; - } -#ifdef _USE_KNETFILE - load_zindex(rz, fp); - knet_seek(fp, n, SEEK_SET); -#else - load_zindex(rz, fd); - lseek(fd, n, SEEK_SET); -#endif - } - return rz; -} - -#ifdef _USE_KNETFILE -RAZF* razf_dopen(int fd, const char *mode){ - if (strstr(mode, "r")) fprintf(stderr,"[razf_dopen] implement me\n"); - else if(strstr(mode, "w")) return razf_open_w(fd); - return NULL; -} - -RAZF* razf_dopen2(int fd, const char *mode) -{ - fprintf(stderr,"[razf_dopen2] implement me\n"); - return NULL; -} -#else -RAZF* razf_dopen(int fd, const char *mode){ - if(strstr(mode, "r")) return razf_open_r(fd, 1); - else if(strstr(mode, "w")) return razf_open_w(fd); - else return NULL; -} - -RAZF* razf_dopen2(int fd, const char *mode) -{ - if(strstr(mode, "r")) return razf_open_r(fd, 0); - else if(strstr(mode, "w")) return razf_open_w(fd); - else return NULL; -} -#endif - -static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ - int fd; - RAZF *rz; - if(strstr(mode, "r")){ -#ifdef _USE_KNETFILE - knetFile *fd = knet_open(filename, "r"); - if (fd == 0) { - fprintf(stderr, "[_razf_open] fail to open %s\n", filename); - return NULL; - } -#else -#ifdef _WIN32 - fd = open(filename, O_RDONLY | O_BINARY); -#else - fd = open(filename, O_RDONLY); -#endif -#endif - if(fd < 0) return NULL; - rz = razf_open_r(fd, _load_index); - } else if(strstr(mode, "w")){ -#ifdef _WIN32 - fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666); -#else - fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666); -#endif - if(fd < 0) return NULL; - rz = razf_open_w(fd); - } else return NULL; - return rz; -} - -RAZF* razf_open(const char *filename, const char *mode){ - return _razf_open(filename, mode, 1); -} - -RAZF* razf_open2(const char *filename, const char *mode){ - return _razf_open(filename, mode, 0); -} - -int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ - int64_t n; - if(rz->mode != 'r' && rz->mode != 'R') return 0; - switch(rz->file_type){ - case FILE_TYPE_PLAIN: - if(rz->end == 0x7fffffffffffffffLL){ -#ifdef _USE_KNETFILE - if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0; - n = knet_tell(rz->x.fpr); - knet_seek(rz->x.fpr, 0, SEEK_END); - rz->end = knet_tell(rz->x.fpr); - knet_seek(rz->x.fpr, n, SEEK_SET); -#else - if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0; - rz->end = lseek(rz->filedes, 0, SEEK_END); - lseek(rz->filedes, n, SEEK_SET); -#endif - } - *u_size = *c_size = rz->end; - return 1; - case FILE_TYPE_GZ: - return 0; - case FILE_TYPE_RZ: - if(rz->src_end == rz->end) return 0; - *u_size = rz->src_end; - *c_size = rz->end; - return 1; - default: - return 0; - } -} - -static int _razf_read(RAZF* rz, void *data, int size){ - int ret, tin; - if(rz->z_eof || rz->z_err) return 0; - if (rz->file_type == FILE_TYPE_PLAIN) { -#ifdef _USE_KNETFILE - ret = knet_read(rz->x.fpr, data, size); -#else - ret = read(rz->filedes, data, size); -#endif - if (ret == 0) rz->z_eof = 1; - return ret; - } - rz->stream->avail_out = size; - rz->stream->next_out = data; - while(rz->stream->avail_out){ - if(rz->stream->avail_in == 0){ - if(rz->in >= rz->end){ rz->z_eof = 1; break; } - if(rz->end - rz->in < RZ_BUFFER_SIZE){ -#ifdef _USE_KNETFILE - rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in); -#else - rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in); -#endif - } else { -#ifdef _USE_KNETFILE - rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); -#else - rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); -#endif - } - if(rz->stream->avail_in == 0){ - rz->z_eof = 1; - break; - } - rz->stream->next_in = rz->inbuf; - } - tin = rz->stream->avail_in; - ret = inflate(rz->stream, Z_BLOCK); - rz->in += tin - rz->stream->avail_in; - if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){ - fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? rz->stream->msg : "", __FILE__, __LINE__); - rz->z_err = 1; - break; - } - if(ret == Z_STREAM_END){ - rz->z_eof = 1; - break; - } - if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ - rz->buf_flush = 1; - rz->next_block_pos = rz->in; - break; - } - } - return size - rz->stream->avail_out; -} - -int razf_read(RAZF *rz, void *data, int size){ - int ori_size, i; - ori_size = size; - while(size > 0){ - if(rz->buf_len){ - if(size < rz->buf_len){ - for(i=0;ioutbuf + rz->buf_off)[i]; - rz->buf_off += size; - rz->buf_len -= size; - data += size; - rz->block_off += size; - size = 0; - break; - } else { - for(i=0;ibuf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; - data += rz->buf_len; - size -= rz->buf_len; - rz->block_off += rz->buf_len; - rz->buf_off = 0; - rz->buf_len = 0; - if(rz->buf_flush){ - rz->block_pos = rz->next_block_pos; - rz->block_off = 0; - rz->buf_flush = 0; - } - } - } else if(rz->buf_flush){ - rz->block_pos = rz->next_block_pos; - rz->block_off = 0; - rz->buf_flush = 0; - } - if(rz->buf_flush) continue; - rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); - if(rz->z_eof && rz->buf_len == 0) break; - } - rz->out += ori_size - size; - return ori_size - size; -} - -int razf_skip(RAZF* rz, int size){ - int ori_size; - ori_size = size; - while(size > 0){ - if(rz->buf_len){ - if(size < rz->buf_len){ - rz->buf_off += size; - rz->buf_len -= size; - rz->block_off += size; - size = 0; - break; - } else { - size -= rz->buf_len; - rz->buf_off = 0; - rz->buf_len = 0; - rz->block_off += rz->buf_len; - if(rz->buf_flush){ - rz->block_pos = rz->next_block_pos; - rz->block_off = 0; - rz->buf_flush = 0; - } - } - } else if(rz->buf_flush){ - rz->block_pos = rz->next_block_pos; - rz->block_off = 0; - rz->buf_flush = 0; - } - if(rz->buf_flush) continue; - rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); - if(rz->z_eof || rz->z_err) break; - } - rz->out += ori_size - size; - return ori_size - size; -} - -static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){ -#ifdef _USE_KNETFILE - knet_seek(rz->x.fpr, in, SEEK_SET); -#else - lseek(rz->filedes, in, SEEK_SET); -#endif - rz->in = in; - rz->out = out; - rz->block_pos = in; - rz->next_block_pos = in; - rz->block_off = 0; - rz->buf_flush = 0; - rz->z_eof = rz->z_err = 0; - inflateReset(rz->stream); - rz->stream->avail_in = 0; - rz->buf_off = rz->buf_len = 0; -} - -int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ - int64_t pos; - rz->z_eof = 0; - if(rz->file_type == FILE_TYPE_PLAIN){ - rz->buf_off = rz->buf_len = 0; - pos = block_start + block_offset; -#ifdef _USE_KNETFILE - knet_seek(rz->x.fpr, pos, SEEK_SET); - pos = knet_tell(rz->x.fpr); -#else - pos = lseek(rz->filedes, pos, SEEK_SET); -#endif - rz->out = rz->in = pos; - return pos; - } - if(block_start == rz->block_pos && block_offset >= rz->block_off) { - block_offset -= rz->block_off; - goto SKIP; // Needn't reset inflate - } - if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start - _razf_reset_read(rz, block_start, 0); - SKIP: - if(block_offset) razf_skip(rz, block_offset); - return rz->block_off; -} - -int64_t razf_seek(RAZF* rz, int64_t pos, int where){ - int64_t idx; - int64_t seek_pos, new_out; - rz->z_eof = 0; - if (where == SEEK_CUR) pos += rz->out; - else if (where == SEEK_END) pos += rz->src_end; - if(rz->file_type == FILE_TYPE_PLAIN){ -#ifdef _USE_KNETFILE - knet_seek(rz->x.fpr, pos, SEEK_SET); - seek_pos = knet_tell(rz->x.fpr); -#else - seek_pos = lseek(rz->filedes, pos, SEEK_SET); -#endif - rz->buf_off = rz->buf_len = 0; - rz->out = rz->in = seek_pos; - return seek_pos; - } else if(rz->file_type == FILE_TYPE_GZ){ - if(pos >= rz->out) goto SKIP; - return rz->out; - } - if(pos == rz->out) return pos; - if(pos > rz->src_end) return rz->out; - if(!rz->seekable || !rz->load_index){ - if(pos >= rz->out) goto SKIP; - } - idx = pos / RZ_BLOCK_SIZE - 1; - seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); - new_out = (idx + 1) * RZ_BLOCK_SIZE; - if(pos > rz->out && new_out <= rz->out) goto SKIP; - _razf_reset_read(rz, seek_pos, new_out); - SKIP: - razf_skip(rz, (int)(pos - rz->out)); - return rz->out; -} - -uint64_t razf_tell2(RAZF *rz) -{ - /* - if (rz->load_index) { - int64_t idx, seek_pos; - idx = rz->out / RZ_BLOCK_SIZE - 1; - seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); - if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off) - fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n", - (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off); - } - */ - return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); -} - -int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where) -{ - if (where != SEEK_SET) return -1; - return razf_jump(rz, voffset>>16, voffset&0xffff); -} - -void razf_close(RAZF *rz){ - if(rz->mode == 'w'){ -#ifndef _RZ_READONLY - razf_end_flush(rz); - deflateEnd(rz->stream); -#ifdef _USE_KNETFILE - save_zindex(rz, rz->x.fpw); - if(is_big_endian()){ - write(rz->x.fpw, &rz->in, sizeof(int64_t)); - write(rz->x.fpw, &rz->out, sizeof(int64_t)); - } else { - uint64_t v64 = byte_swap_8((uint64_t)rz->in); - write(rz->x.fpw, &v64, sizeof(int64_t)); - v64 = byte_swap_8((uint64_t)rz->out); - write(rz->x.fpw, &v64, sizeof(int64_t)); - } -#else - save_zindex(rz, rz->filedes); - if(is_big_endian()){ - write(rz->filedes, &rz->in, sizeof(int64_t)); - write(rz->filedes, &rz->out, sizeof(int64_t)); - } else { - uint64_t v64 = byte_swap_8((uint64_t)rz->in); - write(rz->filedes, &v64, sizeof(int64_t)); - v64 = byte_swap_8((uint64_t)rz->out); - write(rz->filedes, &v64, sizeof(int64_t)); - } -#endif -#endif - } else if(rz->mode == 'r'){ - if(rz->stream) inflateEnd(rz->stream); - } - if(rz->inbuf) free(rz->inbuf); - if(rz->outbuf) free(rz->outbuf); - if(rz->header){ - free(rz->header->extra); - free(rz->header->name); - free(rz->header->comment); - free(rz->header); - } - if(rz->index){ - free(rz->index->bin_offsets); - free(rz->index->cell_offsets); - free(rz->index); - } - free(rz->stream); -#ifdef _USE_KNETFILE - if (rz->mode == 'r') - knet_close(rz->x.fpr); - if (rz->mode == 'w') - close(rz->x.fpw); -#else - close(rz->filedes); -#endif - free(rz); -} - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/razf.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/razf.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/razf.h 2016-02-14 18:21:17.769079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/razf.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,134 +0,0 @@ - /*- - * RAZF : Random Access compressed(Z) File - * Version: 1.0 - * Release Date: 2008-10-27 - * - * Copyright 2008, Jue Ruan , Heng Li - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - - -#ifndef __RAZF_RJ_H -#define __RAZF_RJ_H - -#include -#include -#include "zlib.h" - -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif - -#if ZLIB_VERNUM < 0x1221 -#define _RZ_READONLY -struct _gz_header_s; -typedef struct _gz_header_s _gz_header; -#define gz_header _gz_header -#endif - -#define WINDOW_BITS 15 - -#ifndef RZ_BLOCK_SIZE -#define RZ_BLOCK_SIZE (1<mode from HEAD to TYPE after call inflateReset */ - int buf_off, buf_len; - int z_err, z_eof; - int seekable; - /* Indice where the source is seekable */ - int load_index; - /* set has_index to 0 in mode 'w', then index will be discarded */ -} RAZF; - -#ifdef __cplusplus -extern "C" { -#endif - - RAZF* razf_dopen(int data_fd, const char *mode); - RAZF *razf_open(const char *fn, const char *mode); - int razf_write(RAZF* rz, const void *data, int size); - int razf_read(RAZF* rz, void *data, int size); - int64_t razf_seek(RAZF* rz, int64_t pos, int where); - void razf_close(RAZF* rz); - -#define razf_tell(rz) ((rz)->out) - - RAZF* razf_open2(const char *filename, const char *mode); - RAZF* razf_dopen2(int fd, const char *mode); - uint64_t razf_tell2(RAZF *rz); - int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/razip.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/razip.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/razip.c 2016-02-14 18:21:17.770079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/razip.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,141 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "razf.h" - -#define WINDOW_SIZE 4096 - -static int razf_main_usage() -{ - printf("\n"); - printf("Usage: razip [options] [file] ...\n\n"); - printf("Options: -c write on standard output, keep original files unchanged\n"); - printf(" -d decompress\n"); - printf(" -l list compressed file contents\n"); - printf(" -b INT decompress at INT position in the uncompressed file\n"); - printf(" -s INT decompress INT bytes in the uncompressed file\n"); - printf(" -h give this help\n"); - printf("\n"); - return 0; -} - -static int write_open(const char *fn, int is_forced) -{ - int fd = -1; - char c; - if (!is_forced) { - if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { - printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn); - scanf("%c", &c); - if (c != 'Y' && c != 'y') { - printf("razip: not overwritten\n"); - exit(1); - } - } - } - if (fd < 0) { - if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { - fprintf(stderr, "razip: %s: Fail to write\n", fn); - exit(1); - } - } - return fd; -} - -int main(int argc, char **argv) -{ - int c, compress, pstdout, is_forced; - RAZF *rz; - void *buffer; - long start, end, size; - - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; - while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ - switch(c){ - case 'h': return razf_main_usage(); - case 'd': compress = 0; break; - case 'c': pstdout = 1; break; - case 'l': compress = 2; break; - case 'b': start = atol(optarg); break; - case 's': size = atol(optarg); break; - case 'f': is_forced = 1; break; - } - } - if (size >= 0) end = start + size; - if(end >= 0 && end < start){ - fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); - return 1; - } - if(compress == 1){ - int f_src, f_dst = -1; - if(argc > optind){ - if((f_src = open(argv[optind], O_RDONLY)) < 0){ - fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); - return 1; - } - if(pstdout){ - f_dst = fileno(stdout); - } else { - char *name = malloc(sizeof(strlen(argv[optind]) + 5)); - strcpy(name, argv[optind]); - strcat(name, ".rz"); - f_dst = write_open(name, is_forced); - if (f_dst < 0) return 1; - free(name); - } - } else if(pstdout){ - f_src = fileno(stdin); - f_dst = fileno(stdout); - } else return razf_main_usage(); - rz = razf_dopen(f_dst, "w"); - buffer = malloc(WINDOW_SIZE); - while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); - razf_close(rz); // f_dst will be closed here - if (argc > optind && !pstdout) unlink(argv[optind]); - free(buffer); - close(f_src); - return 0; - } else { - if(argc <= optind) return razf_main_usage(); - if(compress == 2){ - rz = razf_open(argv[optind], "r"); - if(rz->file_type == FILE_TYPE_RZ) { - printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name"); - printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, - argv[optind]); - } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); - } else { - int f_dst; - if (argc > optind && !pstdout) { - char *name; - if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) { - printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); - return 1; - } - name = strdup(argv[optind]); - name[strlen(name) - 3] = '\0'; - f_dst = write_open(name, is_forced); - free(name); - } else f_dst = fileno(stdout); - rz = razf_open(argv[optind], "r"); - buffer = malloc(WINDOW_SIZE); - razf_seek(rz, start, SEEK_SET); - while(1){ - if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); - else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); - if(c <= 0) break; - start += c; - write(f_dst, buffer, c); - if(end >= 0 && start >= end) break; - } - free(buffer); - if (!pstdout) unlink(argv[optind]); - } - razf_close(rz); - return 0; - } -} - diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sam.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sam.c 2016-02-14 18:21:17.771079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,179 +0,0 @@ -#include -#include -#include "faidx.h" -#include "sam.h" - -#define TYPE_BAM 1 -#define TYPE_READ 2 - -bam_header_t *bam_header_dup(const bam_header_t *h0) -{ - bam_header_t *h; - int i; - h = bam_header_init(); - *h = *h0; - h->hash = h->dict = h->rg2lib = 0; - h->text = (char*)calloc(h->l_text + 1, 1); - memcpy(h->text, h0->text, h->l_text); - h->target_len = (uint32_t*)calloc(h->n_targets, 4); - h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); - for (i = 0; i < h->n_targets; ++i) { - h->target_len[i] = h0->target_len[i]; - h->target_name[i] = strdup(h0->target_name[i]); - } - return h; -} -static void append_header_text(bam_header_t *header, char* text, int len) -{ - int x = header->l_text + 1; - int y = header->l_text + len + 1; // 1 byte null - if (text == 0) return; - kroundup32(x); - kroundup32(y); - if (x < y) header->text = (char*)realloc(header->text, y); - strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. - header->l_text += len; - header->text[header->l_text] = 0; -} - -samfile_t *samopen(const char *fn, const char *mode, const void *aux) -{ - samfile_t *fp; - fp = (samfile_t*)calloc(1, sizeof(samfile_t)); - if (strchr(mode, 'r')) { // read - fp->type |= TYPE_READ; - if (strchr(mode, 'b')) { // binary - fp->type |= TYPE_BAM; - fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); - if (fp->x.bam == 0) goto open_err_ret; - fp->header = bam_header_read(fp->x.bam); - } else { // text - fp->x.tamr = sam_open(fn); - if (fp->x.tamr == 0) goto open_err_ret; - fp->header = sam_header_read(fp->x.tamr); - if (fp->header->n_targets == 0) { // no @SQ fields - if (aux) { // check if aux is present - bam_header_t *textheader = fp->header; - fp->header = sam_header_read2((const char*)aux); - if (fp->header == 0) goto open_err_ret; - append_header_text(fp->header, textheader->text, textheader->l_text); - bam_header_destroy(textheader); - } - if (fp->header->n_targets == 0 && bam_verbose >= 1) - fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); - } - } else if (strchr(mode, 'w')) { // write - fp->header = bam_header_dup((const bam_header_t*)aux); - if (strchr(mode, 'b')) { // binary - char bmode[3]; - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; - fp->type |= TYPE_BAM; - fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); - if (fp->x.bam == 0) goto open_err_ret; - bam_header_write(fp->x.bam, fp->header); - } else { // text - // open file - fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; - if (fp->x.tamr == 0) goto open_err_ret; - if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; - else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; - else fp->type |= BAM_OFDEC<<2; - // write header - if (strchr(mode, 'h')) { - int i; - bam_header_t *alt; - // parse the header text - alt = bam_header_init(); - alt->l_text = fp->header->l_text; alt->text = fp->header->text; - sam_header_parse(alt); - alt->l_text = 0; alt->text = 0; - // check if there are @SQ lines in the header - fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL - if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} - if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) - fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); - } else { // then dump ->target_{name,len} - for (i = 0; i < fp->header->n_targets; ++i) - fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); - } - bam_header_destroy(alt); - } - } - } - return fp; - -open_err_ret: - free(fp); - return 0; -} - -void samclose(samfile_t *fp) -{ - if (fp == 0) return; - if (fp->header) bam_header_destroy(fp->header); - if (fp->type & TYPE_BAM) bam_close(fp->x.bam); - else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); - else fclose(fp->x.tamw); - free(fp); -} - -int samread(samfile_t *fp, bam1_t *b) -{ - if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading - if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); - else return sam_read1(fp->x.tamr, fp->header, b); -} - -int samwrite(samfile_t *fp, const bam1_t *b) -{ - if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing - if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); - else { - char *s = bam_format1_core(fp->header, b, fp->type>>2&3); - int l = strlen(s); - fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); - free(s); - return l + 1; - } -} - -int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) -{ - bam_plbuf_t *buf; - int ret; - bam1_t *b; - b = bam_init1(); - buf = bam_plbuf_init(func, func_data); - bam_plbuf_set_mask(buf, mask); - while ((ret = samread(fp, b)) >= 0) - bam_plbuf_push(b, buf); - bam_plbuf_push(0, buf); - bam_plbuf_destroy(buf); - bam_destroy1(b); - return 0; -} - -char *samfaipath(const char *fn_ref) -{ - char *fn_list = 0; - if (fn_ref == 0) return 0; - fn_list = calloc(strlen(fn_ref) + 5, 1); - strcat(strcpy(fn_list, fn_ref), ".fai"); - if (access(fn_list, R_OK) == -1) { // fn_list is unreadable - if (access(fn_ref, R_OK) == -1) { - fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); - } else { - if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); - if (fai_build(fn_ref) == -1) { - fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); - free(fn_list); fn_list = 0; - } - } - } - return fn_list; -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sam.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sam.h 2016-02-14 18:21:17.772079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,98 +0,0 @@ -#ifndef BAM_SAM_H -#define BAM_SAM_H - -#include "bam.h" - -/*! - @header - - This file provides higher level of I/O routines and unifies the APIs - for SAM and BAM formats. These APIs are more convenient and - recommended. - - @copyright Genome Research Ltd. - */ - -/*! @typedef - @abstract SAM/BAM file handler - @field type type of the handler; bit 1 for BAM, 2 for reading and bit 3-4 for flag format - @field bam BAM file handler; valid if (type&1) == 1 - @field tamr SAM file handler for reading; valid if type == 2 - @field tamw SAM file handler for writing; valid if type == 0 - @field header header struct - */ -typedef struct { - int type; - union { - tamFile tamr; - bamFile bam; - FILE *tamw; - } x; - bam_header_t *header; -} samfile_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /*! - @abstract Open a SAM/BAM file - - @param fn SAM/BAM file name; "-" is recognized as stdin (for - reading) or stdout (for writing). - - @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, - 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, - 'h' for outputing header in SAM, 'x' for HEX flag and 'X' for - string flag. If 'b' present, it must immediately follow 'r' or - 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", - "rb", "wb" and "wbu" exclusively. - - @param aux auxiliary data; if mode[0]=='w', aux points to - bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM - are absent, aux points the file name of the list of the reference; - aux is not used otherwise. If @SQ header lines are present in SAM, - aux is not used, either. - - @return SAM/BAM file handler - */ - samfile_t *samopen(const char *fn, const char *mode, const void *aux); - - /*! - @abstract Close a SAM/BAM handler - @param fp file handler to be closed - */ - void samclose(samfile_t *fp); - - /*! - @abstract Read one alignment - @param fp file handler - @param b alignment - @return bytes read - */ - int samread(samfile_t *fp, bam1_t *b); - - /*! - @abstract Write one alignment - @param fp file handler - @param b alignment - @return bytes written - */ - int samwrite(samfile_t *fp, const bam1_t *b); - - /*! - @abstract Get the pileup for a whole alignment file - @param fp file handler - @param mask mask transferred to bam_plbuf_set_mask() - @param func user defined function called in the pileup process - #param data user provided data for func() - */ - int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); - - char *samfaipath(const char *fn_ref); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_header.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_header.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_header.c 2016-02-14 18:21:17.773079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_header.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,736 +0,0 @@ -#include "sam_header.h" -#include -#include -#include -#include -#include - -#include "khash.h" -KHASH_MAP_INIT_STR(str, const char *) - -struct _HeaderList -{ - struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. - struct _HeaderList *next; - void *data; -}; -typedef struct _HeaderList list_t; -typedef list_t HeaderDict; - -typedef struct -{ - char key[2]; - char *value; -} -HeaderTag; - -typedef struct -{ - char type[2]; - list_t *tags; -} -HeaderLine; - -const char *o_hd_tags[] = {"SO","GO",NULL}; -const char *r_hd_tags[] = {"VN",NULL}; - -const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; -const char *r_sq_tags[] = {"SN","LN",NULL}; -const char *u_sq_tags[] = {"SN",NULL}; - -const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; -const char *r_rg_tags[] = {"ID",NULL}; -const char *u_rg_tags[] = {"ID",NULL}; - -const char *o_pg_tags[] = {"VN","CL",NULL}; -const char *r_pg_tags[] = {"ID",NULL}; - -const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; -const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; -const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; -const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; - - -static void debug(const char *format, ...) -{ - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); -} - -#if 0 -// Replaced by list_append_to_end -static list_t *list_prepend(list_t *root, void *data) -{ - list_t *l = malloc(sizeof(list_t)); - l->next = root; - l->data = data; - return l; -} -#endif - -// Relies on the root->last being correct. Do not use with the other list_* -// routines unless they are fixed to modify root->last as well. -static list_t *list_append_to_end(list_t *root, void *data) -{ - list_t *l = malloc(sizeof(list_t)); - l->last = l; - l->next = NULL; - l->data = data; - - if ( !root ) - return l; - - root->last->next = l; - root->last = l; - return root; -} - -static list_t *list_append(list_t *root, void *data) -{ - list_t *l = root; - while (l && l->next) - l = l->next; - if ( l ) - { - l->next = malloc(sizeof(list_t)); - l = l->next; - } - else - { - l = malloc(sizeof(list_t)); - root = l; - } - l->data = data; - l->next = NULL; - return root; -} - -static void list_free(list_t *root) -{ - list_t *l = root; - while (root) - { - l = root; - root = root->next; - free(l); - } -} - - - -// Look for a tag "XY" in a predefined const char *[] array. -static int tag_exists(const char *tag, const char **tags) -{ - int itag=0; - if ( !tags ) return -1; - while ( tags[itag] ) - { - if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; - itag++; - } - return -1; -} - - - -// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text -// or NULL if everything has been read. The lineptr should be freed by the caller. The -// newline character is stripped. -static const char *nextline(char **lineptr, size_t *n, const char *text) -{ - int len; - const char *to = text; - - if ( !*to ) return NULL; - - while ( *to && *to!='\n' && *to!='\r' ) to++; - len = to - text + 1; - - if ( *to ) - { - // Advance the pointer for the next call - if ( *to=='\n' ) to++; - else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; - } - if ( !len ) - return to; - - if ( !*lineptr ) - { - *lineptr = malloc(len); - *n = len; - } - else if ( *nkey[0] = name[0]; - tag->key[1] = name[1]; - tag->value = malloc(len+1); - memcpy(tag->value,value_from,len+1); - tag->value[len] = 0; - return tag; -} - -static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) -{ - list_t *tags = hline->tags; - while (tags) - { - HeaderTag *tag = tags->data; - if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; - tags = tags->next; - } - return NULL; -} - - -// Return codes: -// 0 .. different types or unique tags differ or conflicting tags, cannot be merged -// 1 .. all tags identical -> no need to merge, drop one -// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated -// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line -static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) -{ - HeaderTag *t1, *t2; - - if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) - return 0; - - int itype = tag_exists(hline1->type,types); - if ( itype==-1 ) { - debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); - return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code - } - - if ( unique_tags[itype] ) - { - t1 = header_line_has_tag(hline1,unique_tags[itype][0]); - t2 = header_line_has_tag(hline2,unique_tags[itype][0]); - if ( !t1 || !t2 ) // this should never happen, the unique tags are required - return 2; - - if ( strcmp(t1->value,t2->value) ) - return 0; // the unique tags differ, cannot be merged - } - if ( !required_tags[itype] && !optional_tags[itype] ) - { - t1 = hline1->tags->data; - t2 = hline2->tags->data; - if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments - return 0; - } - - int missing=0, itag=0; - while ( required_tags[itype] && required_tags[itype][itag] ) - { - t1 = header_line_has_tag(hline1,required_tags[itype][itag]); - t2 = header_line_has_tag(hline2,required_tags[itype][itag]); - if ( !t1 && !t2 ) - return 2; // this should never happen - else if ( !t1 || !t2 ) - missing = 1; // there is some tag missing in one of the hlines - else if ( strcmp(t1->value,t2->value) ) - { - if ( unique_tags[itype] ) - return 2; // the lines have a matching unique tag but have a conflicting tag - - return 0; // the lines contain conflicting tags, cannot be merged - } - itag++; - } - itag = 0; - while ( optional_tags[itype] && optional_tags[itype][itag] ) - { - t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); - t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); - if ( !t1 && !t2 ) - { - itag++; - continue; - } - if ( !t1 || !t2 ) - missing = 1; // there is some tag missing in one of the hlines - else if ( strcmp(t1->value,t2->value) ) - { - if ( unique_tags[itype] ) - return 2; // the lines have a matching unique tag but have a conflicting tag - - return 0; // the lines contain conflicting tags, cannot be merged - } - itag++; - } - if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged - return 1; -} - - -static HeaderLine *sam_header_line_clone(const HeaderLine *hline) -{ - list_t *tags; - HeaderLine *out = malloc(sizeof(HeaderLine)); - out->type[0] = hline->type[0]; - out->type[1] = hline->type[1]; - out->tags = NULL; - - tags = hline->tags; - while (tags) - { - HeaderTag *old = tags->data; - - HeaderTag *new = malloc(sizeof(HeaderTag)); - new->key[0] = old->key[0]; - new->key[1] = old->key[1]; - new->value = strdup(old->value); - out->tags = list_append(out->tags, new); - - tags = tags->next; - } - return out; -} - -static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) -{ - list_t *tmpl_tags; - - if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) - return 0; - - tmpl_tags = tmpl_hline->tags; - while (tmpl_tags) - { - HeaderTag *tmpl_tag = tmpl_tags->data; - HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); - if ( !out_tag ) - { - HeaderTag *tag = malloc(sizeof(HeaderTag)); - tag->key[0] = tmpl_tag->key[0]; - tag->key[1] = tmpl_tag->key[1]; - tag->value = strdup(tmpl_tag->value); - out_hline->tags = list_append(out_hline->tags,tag); - } - tmpl_tags = tmpl_tags->next; - } - return 1; -} - - -static HeaderLine *sam_header_line_parse(const char *headerLine) -{ - HeaderLine *hline; - HeaderTag *tag; - const char *from, *to; - from = headerLine; - - if ( *from != '@' ) { - debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); - return 0; - } - to = ++from; - - while (*to && *to!='\t') to++; - if ( to-from != 2 ) { - debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); - return 0; - } - - hline = malloc(sizeof(HeaderLine)); - hline->type[0] = from[0]; - hline->type[1] = from[1]; - hline->tags = NULL; - - int itype = tag_exists(hline->type, types); - - from = to; - while (*to && *to=='\t') to++; - if ( to-from != 1 ) { - debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); - return 0; - } - from = to; - while (*from) - { - while (*to && *to!='\t') to++; - - if ( !required_tags[itype] && !optional_tags[itype] ) - { - // CO is a special case, it can contain anything, including tabs - if ( *to ) { to++; continue; } - tag = new_tag(" ",from,to-1); - } - else - tag = new_tag(from,from+3,to-1); - - if ( header_line_has_tag(hline,tag->key) ) - debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); - hline->tags = list_append(hline->tags, tag); - - from = to; - while (*to && *to=='\t') to++; - if ( *to && to-from != 1 ) { - debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); - return 0; - } - - from = to; - } - return hline; -} - - -// Must be of an existing type, all tags must be recognised and all required tags must be present -static int sam_header_line_validate(HeaderLine *hline) -{ - list_t *tags; - HeaderTag *tag; - int itype, itag; - - // Is the type correct? - itype = tag_exists(hline->type, types); - if ( itype==-1 ) - { - debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); - return 0; - } - - // Has all required tags? - itag = 0; - while ( required_tags[itype] && required_tags[itype][itag] ) - { - if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) - { - debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], - hline->type[0],hline->type[1]); - return 0; - } - itag++; - } - - // Are all tags recognised? - tags = hline->tags; - while ( tags ) - { - tag = tags->data; - if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) - { - debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); - return 0; - } - tags = tags->next; - } - - return 1; -} - - -static void print_header_line(FILE *fp, HeaderLine *hline) -{ - list_t *tags = hline->tags; - HeaderTag *tag; - - fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); - while (tags) - { - tag = tags->data; - - fprintf(fp, "\t"); - if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) - fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); - fprintf(fp, "%s", tag->value); - - tags = tags->next; - } - fprintf(fp,"\n"); -} - - -static void sam_header_line_free(HeaderLine *hline) -{ - list_t *tags = hline->tags; - while (tags) - { - HeaderTag *tag = tags->data; - free(tag->value); - free(tag); - tags = tags->next; - } - list_free(hline->tags); - free(hline); -} - -void sam_header_free(void *_header) -{ - HeaderDict *header = (HeaderDict*)_header; - list_t *hlines = header; - while (hlines) - { - sam_header_line_free(hlines->data); - hlines = hlines->next; - } - list_free(header); -} - -HeaderDict *sam_header_clone(const HeaderDict *dict) -{ - HeaderDict *out = NULL; - while (dict) - { - HeaderLine *hline = dict->data; - out = list_append(out, sam_header_line_clone(hline)); - dict = dict->next; - } - return out; -} - -// Returns a newly allocated string -char *sam_header_write(const void *_header) -{ - const HeaderDict *header = (const HeaderDict*)_header; - char *out = NULL; - int len=0, nout=0; - const list_t *hlines; - - // Calculate the length of the string to allocate - hlines = header; - while (hlines) - { - len += 4; // @XY and \n - - HeaderLine *hline = hlines->data; - list_t *tags = hline->tags; - while (tags) - { - HeaderTag *tag = tags->data; - len += strlen(tag->value) + 1; // \t - if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) - len += strlen(tag->value) + 3; // XY: - tags = tags->next; - } - hlines = hlines->next; - } - - nout = 0; - out = malloc(len+1); - hlines = header; - while (hlines) - { - HeaderLine *hline = hlines->data; - - nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); - - list_t *tags = hline->tags; - while (tags) - { - HeaderTag *tag = tags->data; - nout += sprintf(out+nout,"\t"); - if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) - nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); - nout += sprintf(out+nout,"%s", tag->value); - tags = tags->next; - } - hlines = hlines->next; - nout += sprintf(out+nout,"\n"); - } - out[len] = 0; - return out; -} - -void *sam_header_parse2(const char *headerText) -{ - list_t *hlines = NULL; - HeaderLine *hline; - const char *text; - char *buf=NULL; - size_t nbuf = 0; - int tovalidate = 0; - - if ( !headerText ) - return 0; - - text = headerText; - while ( (text=nextline(&buf, &nbuf, text)) ) - { - hline = sam_header_line_parse(buf); - if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) - // With too many (~250,000) reference sequences the header parsing was too slow with list_append. - hlines = list_append_to_end(hlines, hline); - else - { - if (hline) sam_header_line_free(hline); - sam_header_free(hlines); - if ( buf ) free(buf); - return NULL; - } - } - if ( buf ) free(buf); - - return hlines; -} - -void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) -{ - const HeaderDict *dict = (const HeaderDict*)_dict; - const list_t *l = dict; - khash_t(str) *tbl = kh_init(str); - khiter_t k; - int ret; - - if (_dict == 0) return tbl; // return an empty (not null) hash table - while (l) - { - HeaderLine *hline = l->data; - if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) - { - l = l->next; - continue; - } - - HeaderTag *key, *value; - key = header_line_has_tag(hline,key_tag); - value = header_line_has_tag(hline,value_tag); - if ( !key || !value ) - { - l = l->next; - continue; - } - - k = kh_get(str, tbl, key->value); - if ( k != kh_end(tbl) ) - debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); - k = kh_put(str, tbl, key->value, &ret); - kh_value(tbl, k) = value->value; - - l = l->next; - } - return tbl; -} - -char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) -{ - const HeaderDict *dict = (const HeaderDict*)_dict; - const list_t *l = dict; - int max, n; - char **ret; - - ret = 0; *_n = max = n = 0; - while (l) - { - HeaderLine *hline = l->data; - if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) - { - l = l->next; - continue; - } - - HeaderTag *key; - key = header_line_has_tag(hline,key_tag); - if ( !key ) - { - l = l->next; - continue; - } - - if (n == max) { - max = max? max<<1 : 4; - ret = realloc(ret, max * sizeof(void*)); - } - ret[n++] = key->value; - - l = l->next; - } - *_n = n; - return ret; -} - -const char *sam_tbl_get(void *h, const char *key) -{ - khash_t(str) *tbl = (khash_t(str)*)h; - khint_t k; - k = kh_get(str, tbl, key); - return k == kh_end(tbl)? 0 : kh_val(tbl, k); -} - -int sam_tbl_size(void *h) -{ - khash_t(str) *tbl = (khash_t(str)*)h; - return h? kh_size(tbl) : 0; -} - -void sam_tbl_destroy(void *h) -{ - khash_t(str) *tbl = (khash_t(str)*)h; - kh_destroy(str, tbl); -} - -void *sam_header_merge(int n, const void **_dicts) -{ - const HeaderDict **dicts = (const HeaderDict**)_dicts; - HeaderDict *out_dict; - int idict, status; - - if ( n<2 ) return NULL; - - out_dict = sam_header_clone(dicts[0]); - - for (idict=1; idictdata, out_hlines->data); - if ( status==0 ) - { - out_hlines = out_hlines->next; - continue; - } - - if ( status==2 ) - { - print_header_line(stderr,tmpl_hlines->data); - print_header_line(stderr,out_hlines->data); - debug("Conflicting lines, cannot merge the headers.\n"); - return 0; - } - if ( status==3 ) - sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); - - inserted = 1; - break; - } - if ( !inserted ) - out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); - - tmpl_hlines = tmpl_hlines->next; - } - } - - return out_dict; -} - - diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_header.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_header.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_header.h 2016-02-14 18:21:17.773079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_header.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -#ifndef __SAM_HEADER_H__ -#define __SAM_HEADER_H__ - -#ifdef __cplusplus -extern "C" { -#endif - - void *sam_header_parse2(const char *headerText); - void *sam_header_merge(int n, const void **dicts); - void sam_header_free(void *header); - char *sam_header_write(const void *headerDict); // returns a newly allocated string - - char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); - - void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); - const char *sam_tbl_get(void *h, const char *key); - int sam_tbl_size(void *h); - void sam_tbl_destroy(void *h); - -#ifdef __cplusplus -} -#endif - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sample.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/sample.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sample.c 2016-02-14 18:21:17.775079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sample.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,107 +0,0 @@ -#include -#include -#include "sample.h" -#include "khash.h" -KHASH_MAP_INIT_STR(sm, int) - -bam_sample_t *bam_smpl_init(void) -{ - bam_sample_t *s; - s = calloc(1, sizeof(bam_sample_t)); - s->rg2smid = kh_init(sm); - s->sm2id = kh_init(sm); - return s; -} - -void bam_smpl_destroy(bam_sample_t *sm) -{ - int i; - khint_t k; - khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; - if (sm == 0) return; - for (i = 0; i < sm->n; ++i) free(sm->smpl[i]); - free(sm->smpl); - for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k) - if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k)); - kh_destroy(sm, sm->rg2smid); - kh_destroy(sm, sm->sm2id); - free(sm); -} - -static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val) -{ - khint_t k_rg, k_sm; - int ret; - khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; - k_rg = kh_get(sm, rg2smid, key); - if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID - k_rg = kh_put(sm, rg2smid, strdup(key), &ret); - k_sm = kh_get(sm, sm2id, val); - if (k_sm == kh_end(sm2id)) { // absent - if (sm->n == sm->m) { - sm->m = sm->m? sm->m<<1 : 1; - sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m); - } - sm->smpl[sm->n] = strdup(val); - k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret); - kh_val(sm2id, k_sm) = sm->n++; - } - kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm); -} - -int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) -{ - const char *p = txt, *q, *r; - kstring_t buf, first_sm; - int n = 0; - khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; - if (txt == 0) { - add_pair(sm, sm2id, fn, fn); - return 0; - } - memset(&buf, 0, sizeof(kstring_t)); - memset(&first_sm, 0, sizeof(kstring_t)); - while ((q = strstr(p, "@RG")) != 0) { - p = q + 3; - r = q = 0; - if ((q = strstr(p, "\tID:")) != 0) q += 4; - if ((r = strstr(p, "\tSM:")) != 0) r += 4; - if (r && q) { - char *u, *v; - int oq, or; - for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); - for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); - oq = *u; or = *v; *u = *v = '\0'; - buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); - add_pair(sm, sm2id, buf.s, r); - if ( !first_sm.s ) - kputs(r,&first_sm); - *u = oq; *v = or; - } else break; - p = q > r? q : r; - ++n; - } - if (n == 0) add_pair(sm, sm2id, fn, fn); - // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but - // use the tag instead. - else if ( n==1 && first_sm.s ) - add_pair(sm,sm2id,fn,first_sm.s); - if ( first_sm.s ) - free(first_sm.s); - -// add_pair(sm, sm2id, fn, fn); - free(buf.s); - return 0; -} - -int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str) -{ - khint_t k; - khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; - if (rg) { - str->l = 0; - kputs(fn, str); kputc('/', str); kputs(rg, str); - k = kh_get(sm, rg2smid, str->s); - } else k = kh_get(sm, rg2smid, fn); - return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k); -} diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sample.h tophat-2.1.1+dfsg1/src/samtools-0.1.18/sample.h --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sample.h 2016-02-14 18:21:17.776079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sample.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -#ifndef BAM_SAMPLE_H -#define BAM_SAMPLE_H - -#include "kstring.h" - -typedef struct { - int n, m; - char **smpl; - void *rg2smid, *sm2id; -} bam_sample_t; - -bam_sample_t *bam_smpl_init(void); -int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt); -int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str); -void bam_smpl_destroy(bam_sample_t *sm); - -#endif diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/samtools.1 tophat-2.1.1+dfsg1/src/samtools-0.1.18/samtools.1 --- tophat-2.1.1+dfsg/src/samtools-0.1.18/samtools.1 2016-02-14 18:21:17.777079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/samtools.1 1970-01-01 00:00:00.000000000 +0000 @@ -1,994 +0,0 @@ -.TH samtools 1 "05 July 2011" "samtools-0.1.17" "Bioinformatics tools" -.SH NAME -.PP -samtools - Utilities for the Sequence Alignment/Map (SAM) format - -bcftools - Utilities for the Binary Call Format (BCF) and VCF -.SH SYNOPSIS -.PP -samtools view -bt ref_list.txt -o aln.bam aln.sam.gz -.PP -samtools sort aln.bam aln.sorted -.PP -samtools index aln.sorted.bam -.PP -samtools idxstats aln.sorted.bam -.PP -samtools view aln.sorted.bam chr2:20,100,000-20,200,000 -.PP -samtools merge out.bam in1.bam in2.bam in3.bam -.PP -samtools faidx ref.fasta -.PP -samtools pileup -vcf ref.fasta aln.sorted.bam -.PP -samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam -.PP -samtools tview aln.sorted.bam ref.fasta -.PP -bcftools index in.bcf -.PP -bcftools view in.bcf chr2:100-200 > out.vcf -.PP -bcftools view -vc in.bcf > out.vcf 2> out.afs - -.SH DESCRIPTION -.PP -Samtools is a set of utilities that manipulate alignments in the BAM -format. It imports from and exports to the SAM (Sequence Alignment/Map) -format, does sorting, merging and indexing, and allows to retrieve reads -in any regions swiftly. - -Samtools is designed to work on a stream. It regards an input file `-' -as the standard input (stdin) and an output file `-' as the standard -output (stdout). Several commands can thus be combined with Unix -pipes. Samtools always output warning and error messages to the standard -error output (stderr). - -Samtools is also able to open a BAM (not SAM) file on a remote FTP or -HTTP server if the BAM file name starts with `ftp://' or `http://'. -Samtools checks the current working directory for the index file and -will download the index upon absence. Samtools does not retrieve the -entire alignment file unless it is asked to do so. - -.SH SAMTOOLS COMMANDS AND OPTIONS - -.TP 10 -.B view -samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F -skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] | [region1 [...]] - -Extract/print all or sub alignments in SAM or BAM format. If no region -is specified, all the alignments will be printed; otherwise only -alignments overlapping the specified regions will be output. An -alignment may be given multiple times if it is overlapping several -regions. A region can be presented, for example, in the following -format: `chr2' (the whole chr2), `chr2:1000000' (region starting from -1,000,000bp) or `chr2:1,000,000-2,000,000' (region between 1,000,000 and -2,000,000bp including the end points). The coordinate is 1-based. - -.B OPTIONS: -.RS -.TP 8 -.B -b -Output in the BAM format. -.TP -.BI -f \ INT -Only output alignments with all bits in INT present in the FLAG -field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] -.TP -.BI -F \ INT -Skip alignments with bits present in INT [0] -.TP -.B -h -Include the header in the output. -.TP -.B -H -Output the header only. -.TP -.BI -l \ STR -Only output reads in library STR [null] -.TP -.BI -o \ FILE -Output file [stdout] -.TP -.BI -q \ INT -Skip alignments with MAPQ smaller than INT [0] -.TP -.BI -r \ STR -Only output reads in read group STR [null] -.TP -.BI -R \ FILE -Output reads in read groups listed in -.I FILE -[null] -.TP -.B -S -Input is in SAM. If @SQ header lines are absent, the -.B `-t' -option is required. -.TP -.B -c -Instead of printing the alignments, only count them and print the -total number. All filter options, such as -.B `-f', -.B `-F' -and -.B `-q' -, are taken into account. -.TP -.BI -t \ FILE -This file is TAB-delimited. Each line must contain the reference name -and the length of the reference, one line for each distinct reference; -additional fields are ignored. This file also defines the order of the -reference sequences in sorting. If you run `samtools faidx ', -the resultant index file -.I .fai -can be used as this -.I -file. -.TP -.B -u -Output uncompressed BAM. This option saves time spent on -compression/decomprssion and is thus preferred when the output is piped -to another samtools command. -.RE - -.TP -.B tview -samtools tview [ref.fasta] - -Text alignment viewer (based on the ncurses library). In the viewer, -press `?' for help and press `g' to check the alignment start from a -region in the format like `chr10:10,000,000' or `=10,000,000' when -viewing the same reference sequence. - -.TP -.B mpileup -.B samtools mpileup -.RB [ \-EBug ] -.RB [ \-C -.IR capQcoef ] -.RB [ \-r -.IR reg ] -.RB [ \-f -.IR in.fa ] -.RB [ \-l -.IR list ] -.RB [ \-M -.IR capMapQ ] -.RB [ \-Q -.IR minBaseQ ] -.RB [ \-q -.IR minMapQ ] -.I in.bam -.RI [ in2.bam -.RI [ ... ]] - -Generate BCF or pileup for one or multiple BAM files. Alignment records -are grouped by sample identifiers in @RG header lines. If sample -identifiers are absent, each input file is regarded as one sample. - -In the pileup format (without -.BR -u or -g ), -each -line represents a genomic position, consisting of chromosome name, -coordinate, reference base, read bases, read qualities and alignment -mapping qualities. Information on match, mismatch, indel, strand, -mapping quality and start and end of a read are all encoded at the read -base column. At this column, a dot stands for a match to the reference -base on the forward strand, a comma for a match on the reverse strand, -a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward -strand and `acgtn' for a mismatch on the reverse strand. A pattern -`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this -reference position and the next reference position. The length of the -insertion is given by the integer in the pattern, followed by the -inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' -represents a deletion from the reference. The deleted bases will be -presented as `*' in the following lines. Also at the read base column, a -symbol `^' marks the start of a read. The ASCII of the character -following `^' minus 33 gives the mapping quality. A symbol `$' marks the -end of a read segment. - -.B Input Options: -.RS -.TP 10 -.B -6 -Assume the quality is in the Illumina 1.3+ encoding. -.B -A -Do not skip anomalous read pairs in variant calling. -.TP -.B -B -Disable probabilistic realignment for the computation of base alignment -quality (BAQ). BAQ is the Phred-scaled probability of a read base being -misaligned. Applying this option greatly helps to reduce false SNPs -caused by misalignments. -.TP -.BI -b \ FILE -List of input BAM files, one file per line [null] -.TP -.BI -C \ INT -Coefficient for downgrading mapping quality for reads containing -excessive mismatches. Given a read with a phred-scaled probability q of -being generated from the mapped position, the new mapping quality is -about sqrt((INT-q)/INT)*INT. A zero value disables this -functionality; if enabled, the recommended value for BWA is 50. [0] -.TP -.BI -d \ INT -At a position, read maximally -.I INT -reads per input BAM. [250] -.TP -.B -E -Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt -specificity a little bit. -.TP -.BI -f \ FILE -The -.BR faidx -indexed -reference file in the FASTA format. The file can be optionally compressed by -.BR razip . -[null] -.TP -.BI -l \ FILE -BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null] -.TP -.BI -q \ INT -Minimum mapping quality for an alignment to be used [0] -.TP -.BI -Q \ INT -Minimum base quality for a base to be considered [13] -.TP -.BI -r \ STR -Only generate pileup in region -.I STR -[all sites] -.TP -.B Output Options: - -.TP -.B -D -Output per-sample read depth -.TP -.B -g -Compute genotype likelihoods and output them in the binary call format (BCF). -.TP -.B -S -Output per-sample Phred-scaled strand bias P-value -.TP -.B -u -Similar to -.B -g -except that the output is uncompressed BCF, which is preferred for piping. - -.TP -.B Options for Genotype Likelihood Computation (for -g or -u): - -.TP -.BI -e \ INT -Phred-scaled gap extension sequencing error probability. Reducing -.I INT -leads to longer indels. [20] -.TP -.BI -h \ INT -Coefficient for modeling homopolymer errors. Given an -.IR l -long -homopolymer -run, the sequencing error of an indel of size -.I s -is modeled as -.IR INT * s / l . -[100] -.TP -.B -I -Do not perform INDEL calling -.TP -.BI -L \ INT -Skip INDEL calling if the average per-sample depth is above -.IR INT . -[250] -.TP -.BI -o \ INT -Phred-scaled gap open sequencing error probability. Reducing -.I INT -leads to more indel calls. [40] -.TP -.BI -P \ STR -Comma dilimited list of platforms (determined by -.BR @RG-PL ) -from which indel candidates are obtained. It is recommended to collect -indel candidates from sequencing technologies that have low indel error -rate such as ILLUMINA. [all] -.RE - -.TP -.B reheader -samtools reheader - -Replace the header in -.I in.bam -with the header in -.I in.header.sam. -This command is much faster than replacing the header with a -BAM->SAM->BAM conversion. - -.TP -.B cat -samtools cat [-h header.sam] [-o out.bam] [ ... ] - -Concatenate BAMs. The sequence dictionary of each input BAM must be identical, -although this command does not check this. This command uses a similar trick -to -.B reheader -which enables fast BAM concatenation. - -.TP -.B sort -samtools sort [-no] [-m maxMem] - -Sort alignments by leftmost coordinates. File -.I .bam -will be created. This command may also create temporary files -.I .%d.bam -when the whole alignment cannot be fitted into memory (controlled by -option -m). - -.B OPTIONS: -.RS -.TP 8 -.B -o -Output the final alignment to the standard output. -.TP -.B -n -Sort by read names rather than by chromosomal coordinates -.TP -.BI -m \ INT -Approximately the maximum required memory. [500000000] -.RE - -.TP -.B merge -samtools merge [-nur1f] [-h inh.sam] [-R reg] [...] - -Merge multiple sorted alignments. -The header reference lists of all the input BAM files, and the @SQ headers of -.IR inh.sam , -if any, must all refer to the same set of reference sequences. -The header reference list and (unless overridden by -.BR -h ) -`@' headers of -.I in1.bam -will be copied to -.IR out.bam , -and the headers of other files will be ignored. - -.B OPTIONS: -.RS -.TP 8 -.B -1 -Use zlib compression level 1 to comrpess the output -.TP -.B -f -Force to overwrite the output file if present. -.TP 8 -.BI -h \ FILE -Use the lines of -.I FILE -as `@' headers to be copied to -.IR out.bam , -replacing any header lines that would otherwise be copied from -.IR in1.bam . -.RI ( FILE -is actually in SAM format, though any alignment records it may contain -are ignored.) -.TP -.B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates -.TP -.BI -R \ STR -Merge files in the specified region indicated by -.I STR -[null] -.TP -.B -r -Attach an RG tag to each alignment. The tag value is inferred from file names. -.TP -.B -u -Uncompressed BAM output -.RE - -.TP -.B index -samtools index - -Index sorted alignment for fast random access. Index file -.I .bai -will be created. - -.TP -.B idxstats -samtools idxstats - -Retrieve and print stats in the index file. The output is TAB delimited -with each line consisting of reference sequence name, sequence length, # -mapped reads and # unmapped reads. - -.TP -.B faidx -samtools faidx [region1 [...]] - -Index reference sequence in the FASTA format or extract subsequence from -indexed reference sequence. If no region is specified, -.B faidx -will index the file and create -.I .fai -on the disk. If regions are speficified, the subsequences will be -retrieved and printed to stdout in the FASTA format. The input file can -be compressed in the -.B RAZF -format. - -.TP -.B fixmate -samtools fixmate - -Fill in mate coordinates, ISIZE and mate related flags from a -name-sorted alignment. - -.TP -.B rmdup -samtools rmdup [-sS] - -Remove potential PCR duplicates: if multiple read pairs have identical -external coordinates, only retain the pair with highest mapping quality. -In the paired-end mode, this command -.B ONLY -works with FR orientation and requires ISIZE is correctly set. It does -not work for unpaired reads (e.g. two ends mapped to different -chromosomes or orphan reads). - -.B OPTIONS: -.RS -.TP 8 -.B -s -Remove duplicate for single-end reads. By default, the command works for -paired-end reads only. -.TP 8 -.B -S -Treat paired-end reads and single-end reads. -.RE - -.TP -.B calmd -samtools calmd [-EeubSr] [-C capQcoef] - -Generate the MD tag. If the MD tag is already present, this command will -give a warning if the MD tag generated is different from the existing -tag. Output SAM by default. - -.B OPTIONS: -.RS -.TP 8 -.B -A -When used jointly with -.B -r -this option overwrites the original base quality. -.TP 8 -.B -e -Convert a the read base to = if it is identical to the aligned reference -base. Indel caller does not support the = bases at the moment. -.TP -.B -u -Output uncompressed BAM -.TP -.B -b -Output compressed BAM -.TP -.B -S -The input is SAM with header lines -.TP -.BI -C \ INT -Coefficient to cap mapping quality of poorly mapped reads. See the -.B pileup -command for details. [0] -.TP -.B -r -Compute the BQ tag (without -A) or cap base quality by BAQ (with -A). -.TP -.B -E -Extended BAQ calculation. This option trades specificity for sensitivity, though the -effect is minor. -.RE - -.TP -.B targetcut -samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] - -This command identifies target regions by examining the continuity of read depth, computes -haploid consensus sequences of targets and outputs a SAM with each sequence corresponding -to a target. When option -.B -f -is in use, BAQ will be applied. This command is -.B only -designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)]. -.RE - -.TP -.B phase -samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] - -Call and phase heterozygous SNPs. -.B OPTIONS: -.RS -.TP 8 -.B -A -Drop reads with ambiguous phase. -.TP 8 -.BI -b \ STR -Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file -.BR STR .0.bam -and phase-1 reads in -.BR STR .1.bam. -Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads -with switch errors will be saved in -.BR STR .chimeric.bam. -[null] -.TP -.B -F -Do not attempt to fix chimeric reads. -.TP -.BI -k \ INT -Maximum length for local phasing. [13] -.TP -.BI -q \ INT -Minimum Phred-scaled LOD to call a heterozygote. [40] -.TP -.BI -Q \ INT -Minimum base quality to be used in het calling. [13] -.RE - -.SH BCFTOOLS COMMANDS AND OPTIONS - -.TP 10 -.B view -.B bcftools view -.RB [ \-AbFGNQSucgv ] -.RB [ \-D -.IR seqDict ] -.RB [ \-l -.IR listLoci ] -.RB [ \-s -.IR listSample ] -.RB [ \-i -.IR gapSNPratio ] -.RB [ \-t -.IR mutRate ] -.RB [ \-p -.IR varThres ] -.RB [ \-P -.IR prior ] -.RB [ \-1 -.IR nGroup1 ] -.RB [ \-d -.IR minFrac ] -.RB [ \-U -.IR nPerm ] -.RB [ \-X -.IR permThres ] -.RB [ \-T -.IR trioType ] -.I in.bcf -.RI [ region ] - -Convert between BCF and VCF, call variant candidates and estimate allele -frequencies. - -.RS -.TP -.B Input/Output Options: -.TP 10 -.B -A -Retain all possible alternate alleles at variant sites. By default, the view -command discards unlikely alleles. -.TP 10 -.B -b -Output in the BCF format. The default is VCF. -.TP -.BI -D \ FILE -Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] -.TP -.B -F -Indicate PL is generated by r921 or before (ordering is different). -.TP -.B -G -Suppress all individual genotype information. -.TP -.BI -l \ FILE -List of sites at which information are outputted [all sites] -.TP -.B -N -Skip sites where the REF field is not A/C/G/T -.TP -.B -Q -Output the QCALL likelihood format -.TP -.BI -s \ FILE -List of samples to use. The first column in the input gives the sample names -and the second gives the ploidy, which can only be 1 or 2. When the 2nd column -is absent, the sample ploidy is assumed to be 2. In the output, the ordering of -samples will be identical to the one in -.IR FILE . -[null] -.TP -.B -S -The input is VCF instead of BCF. -.TP -.B -u -Uncompressed BCF output (force -b). -.TP -.B Consensus/Variant Calling Options: -.TP 10 -.B -c -Call variants using Bayesian inference. This option automatically invokes option -.BR -e . -.TP -.BI -d \ FLOAT -When -.B -v -is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] -.TP -.B -e -Perform max-likelihood inference only, including estimating the site allele frequency, -testing Hardy-Weinberg equlibrium and testing associations with LRT. -.TP -.B -g -Call per-sample genotypes at variant sites (force -c) -.TP -.BI -i \ FLOAT -Ratio of INDEL-to-SNP mutation rate [0.15] -.TP -.BI -p \ FLOAT -A site is considered to be a variant if P(ref|D) rg.txt - samtools merge -rh rg.txt merged.bam ga.bam 454.bam - -The value in a -.B RG -tag is determined by the file name the read is coming from. In this -example, in the -.IR merged.bam , -reads from -.I ga.bam -will be attached -.IR RG:Z:ga , -while reads from -.I 454.bam -will be attached -.IR RG:Z:454 . - -.IP o 2 -Call SNPs and short INDELs for one diploid individual: - - samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf - bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf - -The -.B -D -option of varFilter controls the maximum read depth, which should be -adjusted to about twice the average read depth. One may consider to add -.B -C50 -to -.B mpileup -if mapping quality is overestimated for reads containing excessive -mismatches. Applying this option usually helps -.B BWA-short -but may not other mappers. - -.IP o 2 -Generate the consensus sequence for one diploid individual: - - samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq - -.IP o 2 -Call somatic mutations from a pair of samples: - - samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT pair - > var.bcf - -In the output INFO field, -.I CLR -gives the Phred-log ratio between the likelihood by treating the -two samples independently, and the likelihood by requiring the genotype to be identical. -This -.I CLR -is effectively a score measuring the confidence of somatic calls. The higher the better. - -.IP o 2 -Call de novo and somatic mutations from a family trio: - - samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT pair -s samples.txt - > var.bcf - -File -.I samples.txt -should consist of three lines specifying the member and order of samples (in the order of child-father-mother). -Similarly, -.I CLR -gives the Phred-log likelihood ratio with and without the trio constraint. -.I UGT -shows the most likely genotype configuration without the trio constraint, and -.I CGT -gives the most likely genotype configuration satisfying the trio constraint. - -.IP o 2 -Phase one individual: - - samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out - -The -.B calmd -command is used to reduce false heterozygotes around INDELs. - -.IP o 2 -Call SNPs and short indels for multiple diploid individuals: - - samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf - bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf - -Individuals are identified from the -.B SM -tags in the -.B @RG -header lines. Individuals can be pooled in one alignment file; one -individual can also be separated into multiple files. The -.B -P -option specifies that indel candidates should be collected only from -read groups with the -.B @RG-PL -tag set to -.IR ILLUMINA . -Collecting indel candidates from reads sequenced by an indel-prone -technology may affect the performance of indel calling. - -.IP o 2 -Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals: - - samtools mpileup -Igf ref.fa *.bam > all.bcf - bcftools view -bl sites.list all.bcf > sites.bcf - bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs - bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs - bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs - ...... - -where -.I sites.list -contains the list of sites with each line consisting of the reference -sequence name and position. The following -.B bcftools -commands estimate AFS by EM. - -.IP o 2 -Dump BAQ applied alignment for other SNP callers: - - samtools calmd -bAr aln.bam > aln.baq.bam - -It adds and corrects the -.B NM -and -.B MD -tags at the same time. The -.B calmd -command also comes with the -.B -C -option, the same as the one in -.B pileup -and -.BR mpileup . -Apply if it helps. - -.SH LIMITATIONS -.PP -.IP o 2 -Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. -.IP o 2 -Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan -reads or ends mapped to different chromosomes). If this is a concern, -please use Picard's MarkDuplicate which correctly handles these cases, -although a little slower. - -.SH AUTHOR -.PP -Heng Li from the Sanger Institute wrote the C version of samtools. Bob -Handsaker from the Broad Institute implemented the BGZF library and Jue -Ruan from Beijing Genomics Institute wrote the RAZF library. John -Marshall and Petr Danecek contribute to the source code and various -people from the 1000 Genomes Project have contributed to the SAM format -specification. - -.SH SEE ALSO -.PP -Samtools website: diff -Nru tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_view.c tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_view.c --- tophat-2.1.1+dfsg/src/samtools-0.1.18/sam_view.c 2016-02-14 18:21:17.774079000 +0000 +++ tophat-2.1.1+dfsg1/src/samtools-0.1.18/sam_view.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,406 +0,0 @@ -#include -#include -#include -#include -#include -#include "sam_header.h" -#include "sam.h" -#include "faidx.h" -#include "kstring.h" -#include "khash.h" -KHASH_SET_INIT_STR(rg) - -// When counting records instead of printing them, -// data passed to the bam_fetch callback is encapsulated in this struct. -typedef struct { - bam_header_t *header; - int *count; -} count_func_data_t; - -typedef khash_t(rg) *rghash_t; - -// FIXME: we'd better use no global variables... -static rghash_t g_rghash = 0; -static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; -static float g_subsam = -1; -static char *g_library, *g_rg; -static void *g_bed; - -void *bed_read(const char *fn); -void bed_destroy(void *_h); -int bed_overlap(const void *_h, const char *chr, int beg, int end); - -static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) -{ - if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) - return 1; - if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) - return 1; - if (g_subsam > 0.) { - int x = (int)(g_subsam + .499); - uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; - if (k%1024 / 1024.0 >= g_subsam - x) return 1; - } - if (g_rg || g_rghash) { - uint8_t *s = bam_aux_get(b, "RG"); - if (s) { - if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; - if (g_rghash) { - khint_t k = kh_get(rg, g_rghash, (char*)(s + 1)); - return (k != kh_end(g_rghash))? 0 : 1; - } - } - } - if (g_library) { - const char *p = bam_get_library((bam_header_t*)h, b); - return (p && strcmp(p, g_library) == 0)? 0 : 1; - } - return 0; -} - -static char *drop_rg(char *hdtxt, rghash_t h, int *len) -{ - char *p = hdtxt, *q, *r, *s; - kstring_t str; - memset(&str, 0, sizeof(kstring_t)); - while (1) { - int toprint = 0; - q = strchr(p, '\n'); - if (q == 0) q = p + strlen(p); - if (q - p < 3) break; // the line is too short; then stop - if (strncmp(p, "@RG\t", 4) == 0) { - int c; - khint_t k; - if ((r = strstr(p, "\tID:")) != 0) { - r += 4; - for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); - c = *s; *s = '\0'; - k = kh_get(rg, h, r); - *s = c; - if (k != kh_end(h)) toprint = 1; - } - } else toprint = 1; - if (toprint) { - kputsn(p, q - p, &str); kputc('\n', &str); - } - p = q + 1; - } - *len = str.l; - return str.s; -} - -// callback function for bam_fetch() that prints nonskipped records -static int view_func(const bam1_t *b, void *data) -{ - if (!__g_skip_aln(((samfile_t*)data)->header, b)) - samwrite((samfile_t*)data, b); - return 0; -} - -// callback function for bam_fetch() that counts nonskipped records -static int count_func(const bam1_t *b, void *data) -{ - if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) { - (*((count_func_data_t*)data)->count)++; - } - return 0; -} - -static int usage(int is_long_help); - -int main_samview(int argc, char *argv[]) -{ - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; - int of_type = BAM_OFDEC, is_long_help = 0; - int count = 0; - samfile_t *in = 0, *out = 0; - char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; - - /* parse command-line options */ - strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { - switch (c) { - case 's': g_subsam = atof(optarg); break; - case 'c': is_count = 1; break; - case 'S': is_bamin = 0; break; - case 'b': is_bamout = 1; break; - case 't': fn_list = strdup(optarg); is_bamin = 0; break; - case 'h': is_header = 1; break; - case 'H': is_header_only = 1; break; - case 'o': fn_out = strdup(optarg); break; - case 'f': g_flag_on = strtol(optarg, 0, 0); break; - case 'F': g_flag_off = strtol(optarg, 0, 0); break; - case 'q': g_min_mapQ = atoi(optarg); break; - case 'u': compress_level = 0; break; - case '1': compress_level = 1; break; - case 'l': g_library = strdup(optarg); break; - case 'L': g_bed = bed_read(optarg); break; - case 'r': g_rg = strdup(optarg); break; - case 'R': fn_rg = strdup(optarg); break; - case 'x': of_type = BAM_OFHEX; break; - case 'X': of_type = BAM_OFSTR; break; - case '?': is_long_help = 1; break; - case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; - default: return usage(is_long_help); - } - } - if (compress_level >= 0) is_bamout = 1; - if (is_header_only) is_header = 1; - if (is_bamout) strcat(out_mode, "b"); - else { - if (of_type == BAM_OFHEX) strcat(out_mode, "x"); - else if (of_type == BAM_OFSTR) strcat(out_mode, "X"); - } - if (is_bamin) strcat(in_mode, "b"); - if (is_header) strcat(out_mode, "h"); - if (compress_level >= 0) { - char tmp[2]; - tmp[0] = compress_level + '0'; tmp[1] = '\0'; - strcat(out_mode, tmp); - } - if (argc == optind) return usage(is_long_help); // potential memory leak... - - // read the list of read groups - if (fn_rg) { - FILE *fp_rg; - char buf[1024]; - int ret; - g_rghash = kh_init(rg); - fp_rg = fopen(fn_rg, "r"); - while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me... - kh_put(rg, g_rghash, strdup(buf), &ret); // we'd better check duplicates... - fclose(fp_rg); - } - - // generate the fn_list if necessary - if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref); - // open file handlers - if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { - fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]); - ret = 1; - goto view_end; - } - if (in->header == 0) { - fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]); - ret = 1; - goto view_end; - } - if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... - char *tmp; - int l; - tmp = drop_rg(in->header->text, g_rghash, &l); - free(in->header->text); - in->header->text = tmp; - in->header->l_text = l; - } - if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { - fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); - ret = 1; - goto view_end; - } - if (is_header_only) goto view_end; // no need to print alignments - - if (argc == optind + 1) { // convert/print the entire file - bam1_t *b = bam_init1(); - int r; - while ((r = samread(in, b)) >= 0) { // read one alignment from `in' - if (!__g_skip_aln(in->header, b)) { - if (!is_count) samwrite(out, b); // write the alignment to `out' - count++; - } - } - if (r < -1) { - fprintf(stderr, "[main_samview] truncated file.\n"); - ret = 1; - } - bam_destroy1(b); - } else { // retrieve alignments in specified regions - int i; - bam_index_t *idx = 0; - if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index - if (idx == 0) { // index is unavailable - fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); - ret = 1; - goto view_end; - } - for (i = optind + 1; i < argc; ++i) { - int tid, beg, end, result; - bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' - if (tid < 0) { // reference name is not found - fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); - continue; - } - // fetch alignments - if (is_count) { - count_func_data_t count_data = { in->header, &count }; - result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func); - } else - result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); - if (result < 0) { - fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); - ret = 1; - break; - } - } - bam_index_destroy(idx); // destroy the BAM index - } - -view_end: - if (is_count && ret == 0) { - printf("%d\n", count); - } - // close files, free and return - free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); - if (g_bed) bed_destroy(g_bed); - if (g_rghash) { - khint_t k; - for (k = 0; k < kh_end(g_rghash); ++k) - if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k)); - kh_destroy(rg, g_rghash); - } - samclose(in); - if (!is_count) - samclose(out); - return ret; -} - -static int usage(int is_long_help) -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools view [options] | [region1 [...]]\n\n"); - fprintf(stderr, "Options: -b output BAM\n"); - fprintf(stderr, " -h print header for the SAM output\n"); - fprintf(stderr, " -H print header only (no alignments)\n"); - fprintf(stderr, " -S input is SAM\n"); - fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); - fprintf(stderr, " -1 fast compression (force -b)\n"); - fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); - fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); - fprintf(stderr, " -c print only the count of matching records\n"); - fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); - fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); - fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); - fprintf(stderr, " -o FILE output file name [stdout]\n"); - fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n"); - fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); - fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); - fprintf(stderr, " -q INT minimum mapping quality [0]\n"); - fprintf(stderr, " -l STR only output reads in library STR [null]\n"); - fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); - fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n"); - fprintf(stderr, " -? longer help\n"); - fprintf(stderr, "\n"); - if (is_long_help) - fprintf(stderr, "Notes:\n\ -\n\ - 1. By default, this command assumes the file on the command line is in\n\ - the BAM format and it prints the alignments in SAM. If `-t' is\n\ - applied, the input file is assumed to be in the SAM format. The\n\ - file supplied with `-t' is SPACE/TAB delimited with the first two\n\ - fields of each line consisting of the reference name and the\n\ - corresponding sequence length. The `.fai' file generated by `faidx'\n\ - can be used here. This file may be empty if reads are unaligned.\n\ -\n\ - 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\ -\n\ - 3. BAM->SAM conversion: `samtools view in.bam'.\n\ -\n\ - 4. A region should be presented in one of the following formats:\n\ - `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\ - specified, the input alignment file must be an indexed BAM file.\n\ -\n\ - 5. Option `-u' is preferred over `-b' when the output is piped to\n\ - another samtools command.\n\ -\n\ - 6. In a string FLAG, each character represents one bit with\n\ - p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\ - U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\ - 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\ - f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\ - `-X' are samtools-C specific. Picard and older samtools do not\n\ - support HEX or string flags.\n\ -\n"); - return 1; -} - -int main_import(int argc, char *argv[]) -{ - int argc2, ret; - char **argv2; - if (argc != 4) { - fprintf(stderr, "Usage: bamtk import \n"); - return 1; - } - argc2 = 6; - argv2 = calloc(6, sizeof(char*)); - argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; - ret = main_samview(argc2, argv2); - free(argv2); - return ret; -} - -int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; - -int main_bam2fq(int argc, char *argv[]) -{ - bamFile fp; - bam_header_t *h; - bam1_t *b; - int8_t *buf; - int max_buf; - if (argc == 1) { - fprintf(stderr, "Usage: samtools bam2fq \n"); - return 1; - } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); - if (fp == 0) return 1; - h = bam_header_read(fp); - b = bam_init1(); - buf = 0; - max_buf = 0; - while (bam_read1(fp, b) >= 0) { - int i, qlen = b->core.l_qseq; - uint8_t *seq; - putchar('@'); fputs(bam1_qname(b), stdout); - if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); - else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); - else putchar('\n'); - if (max_buf < qlen + 1) { - max_buf = qlen + 1; - kroundup32(max_buf); - buf = realloc(buf, max_buf); - } - buf[qlen] = 0; - seq = bam1_seq(b); - for (i = 0; i < qlen; ++i) - buf[i] = bam1_seqi(seq, i); - if (b->core.flag & 16) { // reverse complement - for (i = 0; i < qlen>>1; ++i) { - int8_t t = seq_comp_table[buf[qlen - 1 - i]]; - buf[qlen - 1 - i] = seq_comp_table[buf[i]]; - buf[i] = t; - } - if (qlen&1) buf[i] = seq_comp_table[buf[i]]; - } - for (i = 0; i < qlen; ++i) - buf[i] = bam_nt16_rev_table[buf[i]]; - puts((char*)buf); - puts("+"); - seq = bam1_qual(b); - for (i = 0; i < qlen; ++i) - buf[i] = 33 + seq[i]; - if (b->core.flag & 16) { // reverse - for (i = 0; i < qlen>>1; ++i) { - int8_t t = buf[qlen - 1 - i]; - buf[qlen - 1 - i] = buf[i]; - buf[i] = t; - } - } - puts((char*)buf); - } - free(buf); - bam_destroy1(b); - bam_header_destroy(h); - bam_close(fp); - return 0; -} diff -Nru tophat-2.1.1+dfsg/src/sortedcontainers/__init__.py tophat-2.1.1+dfsg1/src/sortedcontainers/__init__.py --- tophat-2.1.1+dfsg/src/sortedcontainers/__init__.py 2016-02-14 18:21:17.798079000 +0000 +++ tophat-2.1.1+dfsg1/src/sortedcontainers/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -sortedcontainers Sorted Container Types Library -=============================================== - -SortedContainers is an Apache2 licensed containers library, written in -pure-Python, and fast as C-extensions. - -Python's standard library is great until you need a sorted container type. Many -will attest that you can get really far without one, but the moment you -**really need** a sorted list, dict, or set, you're faced with a dozen -different implementations, most using C-extensions without great documentation -and benchmarking. - -Things shouldn't be this way. Not in Python. - -:: - - >>> from sortedcontainers import SortedList, SortedDict, SortedSet - >>> sl = SortedList(xrange(10000000)) - >>> 1234567 in sl - True - >>> sl[7654321] - 7654321 - >>> sl.add(1234567) - >>> sl.count(1234567) - 2 - >>> sl *= 3 - >>> len(sl) - 30000003 - -SortedContainers takes all of the work out of Python sorted types - making your -deployment and use of Python easy. There's no need to install a C compiler or -pre-build and distribute custom extensions. Performance is a feature and -testing has 100% coverage with unit tests and hours of stress. - -:copyright: (c) 2014 by Grant Jenks. -:license: Apache 2.0, see LICENSE for more details. - -""" - -__title__ = 'sortedcontainers' -__version__ = '0.9.4' -__build__ = 0x000904 -__author__ = 'Grant Jenks' -__license__ = 'Apache 2.0' -__copyright__ = 'Copyright 2014 Grant Jenks' - -from .sortedlist import SortedList -from .sortedset import SortedSet -from .sorteddict import SortedDict -from .sortedlistwithkey import SortedListWithKey - -__all__ = ['SortedList', 'SortedSet', 'SortedDict', 'SortedListWithKey'] diff -Nru tophat-2.1.1+dfsg/src/sortedcontainers/sorteddict.py tophat-2.1.1+dfsg1/src/sortedcontainers/sorteddict.py --- tophat-2.1.1+dfsg/src/sortedcontainers/sorteddict.py 2016-02-14 18:21:17.799079000 +0000 +++ tophat-2.1.1+dfsg1/src/sortedcontainers/sorteddict.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,737 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Sorted dict implementation. - -from .sortedset import SortedSet -from .sortedlist import SortedList, recursive_repr -from .sortedlistwithkey import SortedListWithKey -from collections import Set, Sequence -from collections import KeysView as AbstractKeysView -from collections import ValuesView as AbstractValuesView -from collections import ItemsView as AbstractItemsView - -from functools import wraps -from sys import hexversion - -_NotGiven = object() - -def not26(func): - """Function decorator for methods not implemented in Python 2.6.""" - - @wraps(func) - def errfunc(*args, **kwargs): - raise NotImplementedError - - if hexversion < 0x02070000: - return errfunc - else: - return func - -class _IlocWrapper: - def __init__(self, _dict): - self._dict = _dict - def __len__(self): - return len(self._dict) - def __getitem__(self, index): - """ - Very efficiently return the key at index *index* in iteration. Supports - negative indices and slice notation. Raises IndexError on invalid - *index*. - """ - return self._dict._list[index] - def __delitem__(self, index): - """ - Remove the ``sdict[sdict.iloc[index]]`` from *sdict*. Supports negative - indices and slice notation. Raises IndexError on invalid *index*. - """ - _temp = self._dict - _list = _temp._list - _delitem = _temp._delitem - - if isinstance(index, slice): - keys = _list[index] - del _list[index] - for key in keys: - _delitem(key) - else: - key = _list[index] - del _list[index] - _delitem(key) - -class SortedDict(dict): - """ - A SortedDict provides the same methods as a dict. Additionally, a - SortedDict efficiently maintains its keys in sorted order. Consequently, the - keys method will return the keys in sorted order, the popitem method will - remove the item with the highest key, etc. - """ - def __init__(self, *args, **kwargs): - """ - A SortedDict provides the same methods as a dict. Additionally, a - SortedDict efficiently maintains its keys in sorted order. Consequently, - the keys method will return the keys in sorted order, the popitem method - will remove the item with the highest key, etc. - - An optional *key* argument defines a callable that, like the `key` - argument to Python's `sorted` function, extracts a comparison key from - each dict key. If no function is specified, the default compares the - dict keys directly. The `key` argument must be provided as a positional - argument and must come before all other arguments. - - An optional *load* argument defines the load factor of the internal list - used to maintain sort order. If present, this argument must come before - an iterable. The default load factor of '1000' works well for lists from - tens to tens of millions of elements. Good practice is to use a value - that is the cube root of the list size. With billions of elements, the - best load factor depends on your usage. It's best to leave the load - factor at the default until you start benchmarking. - - An optional *iterable* argument provides an initial series of items to - populate the SortedDict. Each item in the series must itself contain - two items. The first is used as a key in the new dictionary, and the - second as the key's value. If a given key is seen more than once, the - last value associated with it is retained in the new dictionary. - - If keyword arguments are given, the keywords themselves with their - associated values are added as items to the dictionary. If a key is - specified both in the positional argument and as a keyword argument, the - value associated with the keyword is retained in the dictionary. For - example, these all return a dictionary equal to ``{"one": 2, "two": - 3}``: - - * ``SortedDict(one=2, two=3)`` - * ``SortedDict({'one': 2, 'two': 3})`` - * ``SortedDict(zip(('one', 'two'), (2, 3)))`` - * ``SortedDict([['two', 3], ['one', 2]])`` - - The first example only works for keys that are valid Python - identifiers; the others work with any valid keys. - """ - if len(args) > 0 and (args[0] is None or callable(args[0])): - self._key = args[0] - args = args[1:] - else: - self._key = None - - if len(args) > 0 and type(args[0]) == int: - self._load = args[0] - args = args[1:] - else: - self._load = 1000 - - if self._key is None: - self._list = SortedList(load=self._load) - else: - self._list = SortedListWithKey(key=self._key, load=self._load) - - # Cache function pointers to dict methods. - - _dict = super(SortedDict, self) - self._dict = _dict - self._clear = _dict.clear - self._delitem = _dict.__delitem__ - self._iter = _dict.__iter__ - self._pop = _dict.pop - self._setdefault = _dict.setdefault - self._setitem = _dict.__setitem__ - self._update = _dict.update - - # Cache function pointers to SortedList methods. - - self._list_add = self._list.add - self._list_bisect_left = self._list.bisect_left - self._list_bisect_right = self._list.bisect_right - self._list_clear = self._list.clear - self._list_index = self._list.index - self._list_pop = self._list.pop - self._list_remove = self._list.remove - self._list_update = self._list.update - - self.iloc = _IlocWrapper(self) - - self.update(*args, **kwargs) - - def clear(self): - """Remove all elements from the dictionary.""" - self._clear() - self._list_clear() - - def __delitem__(self, key): - """ - Remove ``d[key]`` from *d*. Raises a KeyError if *key* is not in the - dictionary. - """ - self._delitem(key) - self._list_remove(key) - - def __iter__(self): - """Create an iterator over the sorted keys of the dictionary.""" - return iter(self._list) - - def __reversed__(self): - """ - Create a reversed iterator over the sorted keys of the dictionary. - """ - return reversed(self._list) - - def __setitem__(self, key, value): - """Set `d[key]` to *value*.""" - if key not in self: - self._list_add(key) - self._setitem(key, value) - - def copy(self): - """Return a shallow copy of the sorted dictionary.""" - return self.__class__(self._key, self._load, self.iteritems()) - - __copy__ = copy - - @classmethod - def fromkeys(cls, seq, value=None): - """ - Create a new dictionary with keys from *seq* and values set to *value*. - """ - return cls((key, value) for key in seq) - - if hexversion < 0x03000000: - def items(self): - """ - Return a list of the dictionary's items (``(key, value)`` pairs). - """ - return list(self.iteritems()) - else: - def items(self): - """ - Return a new ItemsView of the dictionary's items. In addition to - the methods provided by the built-in `view` the ItemsView is - indexable (e.g. ``d.items()[5]``). - """ - return ItemsView(self) - - def iteritems(self): - """Return an iterable over the items (``(key, value)`` pairs).""" - return iter((key, self[key]) for key in self._list) - - if hexversion < 0x03000000: - def keys(self): - """Return a SortedSet of the dictionary's keys.""" - return SortedSet(self._list, key=self._key, load=self._load) - else: - def keys(self): - """ - Return a new KeysView of the dictionary's keys. In addition to the - methods provided by the built-in `view` the KeysView is indexable - (e.g. ``d.keys()[5]``). - """ - return KeysView(self) - - def iterkeys(self): - """Return an iterable over the keys of the dictionary.""" - return iter(self._list) - - if hexversion < 0x03000000: - def values(self): - """Return a list of the dictionary's values.""" - return list(self.itervalues()) - else: - def values(self): - """ - Return a new :class:`ValuesView` of the dictionary's values. - In addition to the methods provided by the built-in `view` the - ValuesView is indexable (e.g., ``d.values()[5]``). - """ - return ValuesView(self) - - def itervalues(self): - """Return an iterable over the values of the dictionary.""" - return iter(self[key] for key in self._list) - - def pop(self, key, default=_NotGiven): - """ - If *key* is in the dictionary, remove it and return its value, - else return *default*. If *default* is not given and *key* is not in - the dictionary, a KeyError is raised. - """ - if key in self: - self._list_remove(key) - return self._pop(key) - else: - if default is _NotGiven: - raise KeyError(key) - else: - return default - - def popitem(self): - """ - Remove and return the ``(key, value)`` pair with the greatest *key* - from the dictionary. - - If the dictionary is empty, calling `popitem` raises a - KeyError`. - """ - if not len(self): - raise KeyError('popitem(): dictionary is empty') - - key = self._list_pop() - value = self._pop(key) - - return (key, value) - - def setdefault(self, key, default=None): - """ - If *key* is in the dictionary, return its value. If not, insert *key* - with a value of *default* and return *default*. *default* defaults to - ``None``. - """ - if key in self: - return self[key] - else: - self._setitem(key, default) - self._list_add(key) - return default - - def update(self, *args, **kwargs): - """ - Update the dictionary with the key/value pairs from *other*, overwriting - existing keys. - - *update* accepts either another dictionary object or an iterable of - key/value pairs (as a tuple or other iterable of length two). If - keyword arguments are specified, the dictionary is then updated with - those key/value pairs: ``d.update(red=1, blue=2)``. - """ - if not len(self): - self._update(*args, **kwargs) - self._list_update(self._iter()) - return - - if (len(kwargs) == 0 and len(args) == 1 and isinstance(args[0], dict)): - pairs = args[0] - else: - pairs = dict(*args, **kwargs) - - if (10 * len(pairs)) > len(self): - self._update(pairs) - self._list_clear() - self._list_update(self._iter()) - else: - for key in pairs: - self[key] = pairs[key] - - def index(self, key, start=None, stop=None): - """ - Return the smallest *k* such that `d.iloc[k] == key` and `i <= k < j`. - Raises `ValueError` if *key* is not present. *stop* defaults to the end - of the set. *start* defaults to the beginning. Negative indexes are - supported, as for slice indices. - """ - return self._list_index(key, start, stop) - - def bisect_left(self, key): - """ - Similar to the ``bisect`` module in the standard library, this returns - an appropriate index to insert *key* in SortedDict. If *key* is - already present in SortedDict, the insertion point will be before (to - the left of) any existing entries. - """ - return self._list_bisect_left(key) - - def bisect(self, key): - """Same as bisect_right.""" - return self._list_bisect_right(key) - - def bisect_right(self, key): - """ - Same as `bisect_left`, but if *key* is already present in SortedDict, - the insertion point will be after (to the right of) any existing - entries. - """ - return self._list_bisect_right(key) - - @not26 - def viewkeys(self): - """ - In Python 2.7 and later, return a new `KeysView` of the dictionary's - keys. - - In Python 2.6, raise a NotImplementedError. - """ - return KeysView(self) - - @not26 - def viewvalues(self): - """ - In Python 2.7 and later, return a new `ValuesView` of the dictionary's - values. - - In Python 2.6, raise a NotImplementedError. - """ - return ValuesView(self) - - @not26 - def viewitems(self): - """ - In Python 2.7 and later, return a new `ItemsView` of the dictionary's - items. - - In Python 2.6, raise a NotImplementedError. - """ - return ItemsView(self) - - def __reduce__(self): - return (self.__class__, (self._key, self._load, list(self.iteritems()))) - - @recursive_repr - def __repr__(self): - temp = '{0}({1}, {2}, {{{3}}})' - items = ', '.join('{0}: {1}'.format(repr(key), repr(self[key])) - for key in self._list) - return temp.format( - self.__class__.__name__, - repr(self._key), - repr(self._load), - items - ) - - def _check(self): - self._list._check() - assert len(self) == len(self._list) - assert all(val in self for val in self._list) - -class KeysView(AbstractKeysView, Set, Sequence): - """ - A KeysView object is a dynamic view of the dictionary's keys, which - means that when the dictionary's keys change, the view reflects - those changes. - - The KeysView class implements the Set and Sequence Abstract Base Classes. - """ - if hexversion < 0x03000000: - def __init__(self, sorted_dict): - """ - Initialize a KeysView from a SortedDict container as *sorted_dict*. - """ - self._list = sorted_dict._list - self._view = sorted_dict._dict.viewkeys() - else: - def __init__(self, sorted_dict): - """ - Initialize a KeysView from a SortedDict container as *sorted_dict*. - """ - self._list = sorted_dict._list - self._view = sorted_dict._dict.keys() - def __len__(self): - """Return the number of entries in the dictionary.""" - return len(self._view) - def __contains__(self, key): - """ - Return True if and only if *key* is one of the underlying dictionary's - keys. - """ - return key in self._view - def __iter__(self): - """ - Return an iterable over the keys in the dictionary. Keys are iterated - over in their sorted order. - - Iterating views while adding or deleting entries in the dictionary may - raise a RuntimeError or fail to iterate over all entries. - """ - return iter(self._list) - def __getitem__(self, index): - """Return the key at position *index*.""" - return self._list[index] - def __reversed__(self): - """ - Return a reversed iterable over the keys in the dictionary. Keys are - iterated over in their reverse sort order. - - Iterating views while adding or deleting entries in the dictionary may - raise a RuntimeError or fail to iterate over all entries. - """ - return reversed(self._list) - def index(self, value, start=None, stop=None): - """ - Return the smallest *k* such that `keysview[k] == value` and `start <= k - < end`. Raises `KeyError` if *value* is not present. *stop* defaults - to the end of the set. *start* defaults to the beginning. Negative - indexes are supported, as for slice indices. - """ - return self._list.index(value, start, stop) - def count(self, value): - """Return the number of occurrences of *value* in the set.""" - return 1 if value in self._view else 0 - def __eq__(self, that): - """Test set-like equality with *that*.""" - return self._view == that - def __ne__(self, that): - """Test set-like inequality with *that*.""" - return self._view != that - def __lt__(self, that): - """Test whether self is a proper subset of *that*.""" - return self._view < that - def __gt__(self, that): - """Test whether self is a proper superset of *that*.""" - return self._view > that - def __le__(self, that): - """Test whether self is contained within *that*.""" - return self._view <= that - def __ge__(self, that): - """Test whether *that* is contained within self.""" - return self._view >= that - def __and__(self, that): - """Return a SortedSet of the intersection of self and *that*.""" - return SortedSet(self._view & that) - def __or__(self, that): - """Return a SortedSet of the union of self and *that*.""" - return SortedSet(self._view | that) - def __sub__(self, that): - """Return a SortedSet of the difference of self and *that*.""" - return SortedSet(self._view - that) - def __xor__(self, that): - """Return a SortedSet of the symmetric difference of self and *that*.""" - return SortedSet(self._view ^ that) - if hexversion < 0x03000000: - def isdisjoint(self, that): - """Return True if and only if *that* is disjoint with self.""" - return not any(key in self._list for key in that) - else: - def isdisjoint(self, that): - """Return True if and only if *that* is disjoint with self.""" - return self._view.isdisjoint(that) - @recursive_repr - def __repr__(self): - return 'SortedDict_keys({0})'.format(repr(list(self))) - -class ValuesView(AbstractValuesView, Sequence): - """ - A ValuesView object is a dynamic view of the dictionary's values, which - means that when the dictionary's values change, the view reflects those - changes. - - The ValuesView class implements the Sequence Abstract Base Class. - """ - if hexversion < 0x03000000: - def __init__(self, sorted_dict): - """ - Initialize a ValuesView from a SortedDict container as - *sorted_dict*. - """ - self._dict = sorted_dict - self._list = sorted_dict._list - self._view = sorted_dict._dict.viewvalues() - else: - def __init__(self, sorted_dict): - """ - Initialize a ValuesView from a SortedDict container as - *sorted_dict*. - """ - self._dict = sorted_dict - self._list = sorted_dict._list - self._view = sorted_dict._dict.values() - def __len__(self): - """Return the number of entries in the dictionary.""" - return len(self._dict) - def __contains__(self, value): - """ - Return True if and only if *value* is on the underlying dictionary's - values. - """ - return value in self._view - def __iter__(self): - """ - Return an iterator over the values in the dictionary. Values are - iterated over in sorted order of the keys. - - Iterating views while adding or deleting entries in the dictionary may - raise a `RuntimeError` or fail to iterate over all entries. - """ - _dict = self._dict - return iter(_dict[key] for key in self._list) - def __getitem__(self, index): - """ - Efficiently return value at *index* in iteration. - - Supports slice notation and negative indexes. - """ - _dict, _list = self._dict, self._list - if isinstance(index, slice): - return [_dict[key] for key in _list[index]] - else: - return _dict[_list[index]] - def __reversed__(self): - """ - Return a reverse iterator over the values in the dictionary. Values are - iterated over in reverse sort order of the keys. - - Iterating views while adding or deleting entries in the dictionary may - raise a `RuntimeError` or fail to iterate over all entries. - """ - _dict = self._dict - return iter(_dict[key] for key in reversed(self._list)) - def index(self, value): - """ - Return index of *value* in self. - - Raises ValueError if *value* is not found. - """ - for idx, val in enumerate(self): - if value == val: - return idx - else: - raise ValueError('{0} is not in dict'.format(repr(value))) - if hexversion < 0x03000000: - def count(self, value): - """Return the number of occurrences of *value* in self.""" - return sum(1 for val in self._dict.itervalues() if val == value) - else: - def count(self, value): - """Return the number of occurrences of *value* in self.""" - return sum(1 for val in _dict.values() if val == value) - def __lt__(self, that): - raise TypeError - def __gt__(self, that): - raise TypeError - def __le__(self, that): - raise TypeError - def __ge__(self, that): - raise TypeError - def __and__(self, that): - raise TypeError - def __or__(self, that): - raise TypeError - def __sub__(self, that): - raise TypeError - def __xor__(self, that): - raise TypeError - @recursive_repr - def __repr__(self): - return 'SortedDict_values({0})'.format(repr(list(self))) - -class ItemsView(AbstractItemsView, Set, Sequence): - """ - An ItemsView object is a dynamic view of the dictionary's ``(key, - value)`` pairs, which means that when the dictionary changes, the - view reflects those changes. - - The ItemsView class implements the Set and Sequence Abstract Base Classes. - However, the set-like operations (``&``, ``|``, ``-``, ``^``) will only - operate correctly if all of the dictionary's values are hashable. - """ - if hexversion < 0x03000000: - def __init__(self, sorted_dict): - """ - Initialize an ItemsView from a SortedDict container as - *sorted_dict*. - """ - self._dict = sorted_dict - self._list = sorted_dict._list - self._view = sorted_dict._dict.viewitems() - else: - def __init__(self, sorted_dict): - """ - Initialize an ItemsView from a SortedDict container as - *sorted_dict*. - """ - self._dict = sorted_dict - self._list = sorted_dict._list - self._view = sorted_dict._dict.items() - def __len__(self): - """Return the number of entries in the dictionary.""" - return len(self._view) - def __contains__(self, key): - """ - Return True if and only if *key* is one of the underlying dictionary's - items. - """ - return key in self._view - def __iter__(self): - """ - Return an iterable over the items in the dictionary. Items are iterated - over in their sorted order. - - Iterating views while adding or deleting entries in the dictionary may - raise a RuntimeError or fail to iterate over all entries. - """ - _dict = self._dict - return iter((key, _dict[key]) for key in self._list) - def __getitem__(self, index): - """Return the item as position *index*.""" - _dict, _list = self._dict, self._list - if isinstance(index, slice): - return [(key, _dict[key]) for key in _list[index]] - else: - key = _list[index] - return (key, _dict[key]) - def __reversed__(self): - """ - Return a reversed iterable over the items in the dictionary. Items are - iterated over in their reverse sort order. - - Iterating views while adding or deleting entries in the dictionary may - raise a RuntimeError or fail to iterate over all entries. - """ - _dict = self._dict - return iter((key, _dict[key]) for key in reversed(self._list)) - def index(self, key, start=None, stop=None): - """ - Return the smallest *k* such that `itemssview[k] == key` and `start <= k - < end`. Raises `KeyError` if *key* is not present. *stop* defaults - to the end of the set. *start* defaults to the beginning. Negative - indexes are supported, as for slice indices. - """ - temp, value = key - pos = self._list.index(temp, start, stop) - if value == self._dict[temp]: - return pos - else: - raise ValueError('{0} is not in dict'.format(repr(key))) - def count(self, item): - """Return the number of occurrences of *item* in the set.""" - key, value = item - return 1 if key in self._dict and self._dict[key] == value else 0 - def __eq__(self, that): - """Test set-like equality with *that*.""" - return self._view == that - def __ne__(self, that): - """Test set-like inequality with *that*.""" - return self._view != that - def __lt__(self, that): - """Test whether self is a proper subset of *that*.""" - return self._view < that - def __gt__(self, that): - """Test whether self is a proper superset of *that*.""" - return self._view > that - def __le__(self, that): - """Test whether self is contained within *that*.""" - return self._view <= that - def __ge__(self, that): - """Test whether *that* is contained within self.""" - return self._view >= that - def __and__(self, that): - """Return a SortedSet of the intersection of self and *that*.""" - return SortedSet(self._view & that) - def __or__(self, that): - """Return a SortedSet of the union of self and *that*.""" - return SortedSet(self._view | that) - def __sub__(self, that): - """Return a SortedSet of the difference of self and *that*.""" - return SortedSet(self._view - that) - def __xor__(self, that): - """Return a SortedSet of the symmetric difference of self and *that*.""" - return SortedSet(self._view ^ that) - if hexversion < 0x03000000: - def isdisjoint(self, that): - """Return True if and only if *that* is disjoint with self.""" - _dict = self._dict - for key, value in that: - if key in _dict and _dict[key] == value: - return False - return True - else: - def isdisjoint(self, that): - """Return True if and only if *that* is disjoint with self.""" - return self._view.isdisjoint(that) - @recursive_repr - def __repr__(self): - return 'SortedDict_items({0})'.format(repr(list(self))) diff -Nru tophat-2.1.1+dfsg/src/sortedcontainers/sortedlist.py tophat-2.1.1+dfsg1/src/sortedcontainers/sortedlist.py --- tophat-2.1.1+dfsg/src/sortedcontainers/sortedlist.py 2016-02-14 18:21:17.800079000 +0000 +++ tophat-2.1.1+dfsg1/src/sortedcontainers/sortedlist.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1233 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Sorted list implementation. - -from __future__ import print_function -from sys import hexversion - -from bisect import bisect_left, bisect_right, insort -from itertools import chain, repeat, starmap -from collections import MutableSequence -from operator import iadd, add -from functools import wraps -from math import log - -if hexversion < 0x03000000: - from itertools import izip as zip - from itertools import imap as map - try: - from thread import get_ident - except ImportError: - from dummy_thread import get_ident -else: - from functools import reduce - try: - from _thread import get_ident - except ImportError: - from _dummy_thread import get_ident - -def recursive_repr(func): - """Decorator to prevent infinite repr recursion.""" - repr_running = set() - - @wraps(func) - def wrapper(self): - key = id(self), get_ident() - - if key in repr_running: - return '...' - - repr_running.add(key) - - try: - return func(self) - finally: - repr_running.discard(key) - - return wrapper - -class SortedList(MutableSequence): - """ - SortedList provides most of the same methods as a list but keeps the items - in sorted order. - """ - - def __init__(self, iterable=None, load=1000): - """ - SortedList provides most of the same methods as a list but keeps the - items in sorted order. - - An optional *iterable* provides an initial series of items to populate - the SortedList. - - An optional *load* specifies the load-factor of the list. The default - load factor of '1000' works well for lists from tens to tens of millions - of elements. Good practice is to use a value that is the cube root of - the list size. With billions of elements, the best load factor depends - on your usage. It's best to leave the load factor at the default until - you start benchmarking. - """ - self._len, self._maxes, self._lists, self._index = 0, [], [], [] - self._load, self._twice, self._half = load, load * 2, load >> 1 - self._offset = 0 - - if iterable is not None: - self.update(iterable) - - def clear(self): - """Remove all the elements from the list.""" - self._len = 0 - del self._maxes[:] - del self._lists[:] - del self._index[:] - - def add(self, val): - """Add the element *val* to the list.""" - _maxes, _lists = self._maxes, self._lists - - if _maxes: - pos = bisect_right(_maxes, val) - - if pos == len(_maxes): - pos -= 1 - _maxes[pos] = val - _lists[pos].append(val) - else: - insort(_lists[pos], val) - - self._expand(pos) - else: - _maxes.append(val) - _lists.append([val]) - - self._len += 1 - - def _expand(self, pos): - """Splits sublists that are more than double the load level. - - Updates the index when the sublist length is less than double the load - level. This requires incrementing the nodes in a traversal from the leaf - node to the root. For an example traversal see self._loc. - """ - _lists, _index = self._lists, self._index - - if len(_lists[pos]) > self._twice: - _maxes, _load = self._maxes, self._load - half = _lists[pos][_load:] - del _lists[pos][_load:] - _maxes[pos] = _lists[pos][-1] - _maxes.insert(pos + 1, half[-1]) - _lists.insert(pos + 1, half) - del _index[:] - else: - if len(_index) > 0: - child = self._offset + pos - while child > 0: - _index[child] += 1 - child = (child - 1) >> 1 - _index[0] += 1 - - def update(self, iterable): - """Update the list by adding all elements from *iterable*.""" - _maxes, _lists = self._maxes, self._lists - values = sorted(iterable) - - if _maxes: - if len(values) * 4 >= self._len: - values.extend(chain.from_iterable(_lists)) - values.sort() - self.clear() - else: - _add = self.add - for val in values: - _add(val) - return - - _load, _index = self._load, self._index - _lists.extend(values[pos:(pos + _load)] - for pos in range(0, len(values), _load)) - _maxes.extend(sublist[-1] for sublist in _lists) - self._len = len(values) - del _index[:] - - def __contains__(self, val): - """Return True if and only if *val* is an element in the list.""" - _maxes = self._maxes - - if not _maxes: - return False - - pos = bisect_left(_maxes, val) - - if pos == len(_maxes): - return False - - _lists = self._lists - idx = bisect_left(_lists[pos], val) - return _lists[pos][idx] == val - - def discard(self, val): - """ - Remove the first occurrence of *val*. - - If *val* is not a member, does nothing. - """ - _maxes = self._maxes - - if not _maxes: - return - - pos = bisect_left(_maxes, val) - - if pos == len(_maxes): - return - - _lists = self._lists - idx = bisect_left(_lists[pos], val) - if _lists[pos][idx] == val: - self._delete(pos, idx) - - def remove(self, val): - """ - Remove first occurrence of *val*. - - Raises ValueError if *val* is not present. - """ - _maxes = self._maxes - - if not _maxes: - raise ValueError('{0} not in list'.format(repr(val))) - - pos = bisect_left(_maxes, val) - - if pos == len(_maxes): - raise ValueError('{0} not in list'.format(repr(val))) - - _lists = self._lists - idx = bisect_left(_lists[pos], val) - if _lists[pos][idx] == val: - self._delete(pos, idx) - else: - raise ValueError('{0} not in list'.format(repr(val))) - - def _delete(self, pos, idx): - """Delete the item at the given (pos, idx). - - Combines lists that are less than half the load level. - - Updates the index when the sublist length is more than half the load - level. This requires decrementing the nodes in a traversal from the leaf - node to the root. For an example traversal see self._loc. - """ - _maxes, _lists, _index = self._maxes, self._lists, self._index - - lists_pos = _lists[pos] - - del lists_pos[idx] - self._len -= 1 - - len_lists_pos = len(lists_pos) - - if len_lists_pos > self._half: - - _maxes[pos] = lists_pos[-1] - - if len(_index) > 0: - child = self._offset + pos - while child > 0: - _index[child] -= 1 - child = (child - 1) >> 1 - _index[0] -= 1 - - elif len(_lists) > 1: - - if not pos: - pos += 1 - - prev = pos - 1 - _lists[prev].extend(_lists[pos]) - _maxes[prev] = _lists[prev][-1] - - del _maxes[pos] - del _lists[pos] - del _index[:] - - self._expand(prev) - - elif len_lists_pos: - - _maxes[pos] = lists_pos[-1] - - else: - - del _maxes[pos] - del _lists[pos] - del _index[:] - - def _loc(self, pos, idx): - """Convert an index pair (alpha, beta) into a single index that corresponds to - the position of the value in the sorted list. - - Most queries require the index be built. Details of the index are - described in self._build_index. - - Indexing requires traversing the tree from a leaf node to the root. The - parent of each node is easily computable at (pos - 1) // 2. - - Left-child nodes are always at odd indices and right-child nodes are - always at even indices. - - When traversing up from a right-child node, increment the total by the - left-child node. - - The final index is the sum from traversal and the index in the sublist. - - For example, using the index from self._build_index: - - _index = 14 5 9 3 2 4 5 - _offset = 3 - - Tree: - - 14 - 5 9 - 3 2 4 5 - - Converting index pair (2, 3) into a single index involves iterating like - so: - - 1. Starting at the leaf node: offset + alpha = 3 + 2 = 5. We identify - the node as a left-child node. At such nodes, we simply traverse to - the parent. - - 2. At node 9, position 2, we recognize the node as a right-child node - and accumulate the left-child in our total. Total is now 5 and we - traverse to the parent at position 0. - - 3. Iteration ends at the root. - - Computing the index is the sum of the total and beta: 5 + 3 = 8. - """ - if not pos: - return idx - - _index = self._index - - if not len(_index): - self._build_index() - - total = 0 - - # Increment pos to point in the index to len(self._lists[pos]). - - pos += self._offset - - # Iterate until reaching the root of the index tree at pos = 0. - - while pos: - - # Right-child nodes are at odd indices. At such indices - # account the total below the left child node. - - if not (pos & 1): - total += _index[pos - 1] - - # Advance pos to the parent node. - - pos = (pos - 1) >> 1 - - return total + idx - - def _pos(self, idx): - """Convert an index into a pair (alpha, beta) that can be used to access - the corresponding _lists[alpha][beta] position. - - Most queries require the index be built. Details of the index are - described in self._build_index. - - Indexing requires traversing the tree to a leaf node. Each node has - two children which are easily computable. Given an index, pos, the - left-child is at pos * 2 + 1 and the right-child is at pos * 2 + 2. - - When the index is less than the left-child, traversal moves to the - left sub-tree. Otherwise, the index is decremented by the left-child - and traversal moves to the right sub-tree. - - At a child node, the indexing pair is computed from the relative - position of the child node as compared with the offset and the remaining - index. - - For example, using the index from self._build_index: - - _index = 14 5 9 3 2 4 5 - _offset = 3 - - Tree: - - 14 - 5 9 - 3 2 4 5 - - Indexing position 8 involves iterating like so: - - 1. Starting at the root, position 0, 8 is compared with the left-child - node (5) which it is greater than. When greater the index is - decremented and the position is updated to the right child node. - - 2. At node 9 with index 3, we again compare the index to the left-child - node with value 4. Because the index is the less than the left-child - node, we simply traverse to the left. - - 3. At node 4 with index 3, we recognize that we are at a leaf node and - stop iterating. - - 4. To compute the sublist index, we subtract the offset from the index - of the leaf node: 5 - 3 = 2. To compute the index in the sublist, we - simply use the index remaining from iteration. In this case, 3. - - The final index pair from our example is (2, 3) which corresponds to - index 8 in the sorted list. - """ - _len, _lists = self._len, self._lists - - if idx < 0: - last_len = len(_lists[-1]) - if (-idx) <= last_len: - return len(_lists) - 1, last_len + idx - idx += _len - if idx < 0: - raise IndexError('list index out of range') - elif idx >= _len: - raise IndexError('list index out of range') - - if idx < len(_lists[0]): - return 0, idx - - _index = self._index - - if not len(_index): - self._build_index() - - pos = 0 - len_index = len(_index) - child = (pos << 1) + 1 - - while child < len_index: - index_child = _index[child] - - if idx < index_child: - pos = child - else: - idx -= index_child - pos = child + 1 - - child = (pos << 1) + 1 - - return (pos - self._offset, idx) - - def _build_index(self): - """Build an index for indexing the sorted list. - - Indexes are represented as binary trees in a dense array notation - similar to a binary heap. - - For example, given a _lists representation storing integers: - - [0]: 1 2 3 - [1]: 4 5 - [2]: 6 7 8 9 - [3]: 10 11 12 13 14 - - The first transformation maps the sub-lists by their length. The - first row of the index is the length of the sub-lists. - - [0]: 3 2 4 5 - - Each row after that is the sum of consecutive pairs of the previous row: - - [1]: 5 9 - [2]: 14 - - Finally, the index is built by concatenating these lists together: - - _index = 14 5 9 3 2 4 5 - - An offset storing the start of the first row is also stored: - - _offset = 3 - - When built, the index can be used for efficient indexing into the list. - See the comment and notes on self._pos for details. - """ - row0 = list(map(len, self._lists)) - - if len(row0) == 1: - self._index[:] = row0 - self._offset = 0 - return - - head = iter(row0) - tail = iter(head) - row1 = list(starmap(add, zip(head, tail))) - - if len(row0) & 1: - row1.append(row0[-1]) - - if len(row1) == 1: - self._index[:] = row1 + row0 - self._offset = 1 - return - - size = 2 ** (int(log(len(row1) - 1, 2)) + 1) - row1.extend(repeat(0, size - len(row1))) - tree = [row0, row1] - - while len(tree[-1]) > 1: - head = iter(tree[-1]) - tail = iter(head) - row = list(starmap(add, zip(head, tail))) - tree.append(row) - - reduce(iadd, reversed(tree), self._index) - self._offset = size * 2 - 1 - - def _slice(self, slc): - start, stop, step = slc.start, slc.stop, slc.step - - if step == 0: - raise ValueError('slice step cannot be zero') - - # Set defaults for missing values. - - if step is None: - step = 1 - - if step > 0: - if start is None: - start = 0 - - if stop is None: - stop = len(self) - elif stop < 0: - stop += len(self) - else: - if start is None: - start = len(self) - 1 - - if stop is None: - stop = -1 - elif stop < 0: - stop += len(self) - - if start < 0: - start += len(self) - - # Fix indices that are too big or too small. - # Slice notation is surprisingly permissive - # where normal indexing would raise IndexError. - - if step > 0: - if start < 0: - start = 0 - elif start > len(self): - start = len(self) - - if stop < 0: - stop = 0 - elif stop > len(self): - stop = len(self) - else: - if start < 0: - start = -1 - elif start >= len(self): - start = len(self) - 1 - - if stop < 0: - stop = -1 - elif stop > len(self): - stop = len(self) - - return start, stop, step - - def __delitem__(self, idx): - """Remove the element at *idx*. Supports slicing.""" - if isinstance(idx, slice): - start, stop, step = self._slice(idx) - - if ((step == 1) and (start < stop) - and ((stop - start) * 8 >= self._len)): - - values = self[:start] - if stop < self._len: - values += self[stop:] - self.clear() - self.update(values) - return - - indices = range(start, stop, step) - - # Delete items from greatest index to least so - # that the indices remain valid throughout iteration. - - if step > 0: - indices = reversed(indices) - - _pos, _delete = self._pos, self._delete - - for index in indices: - pos, idx = _pos(index) - _delete(pos, idx) - else: - pos, idx = self._pos(idx) - self._delete(pos, idx) - - def __getitem__(self, idx): - """Return the element at *idx*. Supports slicing.""" - _lists = self._lists - - if isinstance(idx, slice): - start, stop, step = self._slice(idx) - - if step == 1 and start < stop: - if start == 0 and stop == self._len: - return self.as_list() - - start_pos, start_idx = self._pos(start) - - if stop == self._len: - stop_pos = len(_lists) - 1 - stop_idx = len(_lists[stop_pos]) - else: - stop_pos, stop_idx = self._pos(stop) - - if start_pos == stop_pos: - return _lists[start_pos][start_idx:stop_idx] - - prefix = _lists[start_pos][start_idx:] - middle = _lists[(start_pos + 1):stop_pos] - result = reduce(iadd, middle, prefix) - result += _lists[stop_pos][:stop_idx] - - return result - - if step == -1 and start > stop: - result = self[(stop + 1):(start + 1)] - result.reverse() - return result - - # Return a list because a negative step could - # reverse the order of the items and this could - # be the desired behavior. - - indices = range(start, stop, step) - return list(self[index] for index in indices) - else: - pos, idx = self._pos(idx) - return _lists[pos][idx] - - def _check_order(self, idx, val): - _lists, _len = self._lists, self._len - - pos, loc = self._pos(idx) - - if idx < 0: - idx += _len - - # Check that the inserted value is not less than the - # previous value. - - if idx > 0: - idx_prev = loc - 1 - pos_prev = pos - - if idx_prev < 0: - pos_prev -= 1 - idx_prev = len(_lists[pos_prev]) - 1 - - if _lists[pos_prev][idx_prev] > val: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - # Check that the inserted value is not greater than - # the previous value. - - if idx < (_len - 1): - idx_next = loc + 1 - pos_next = pos - - if idx_next == len(_lists[pos_next]): - pos_next += 1 - idx_next = 0 - - if _lists[pos_next][idx_next] < val: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - def __setitem__(self, index, value): - """ - Replace the item at position *index* with *value*. - - Supports slice notation. Raises a :exc:`ValueError` if the sort order - would be violated. When used with a slice and iterable, the - :exc:`ValueError` is raised before the list is mutated if the sort order - would be violated by the operation. - """ - _maxes, _lists, _pos = self._maxes, self._lists, self._pos - _check_order = self._check_order - - if isinstance(index, slice): - start, stop, step = self._slice(index) - indices = range(start, stop, step) - - if step != 1: - if not hasattr(value, '__len__'): - value = list(value) - - indices = list(indices) - - if len(value) != len(indices): - raise ValueError( - 'attempt to assign sequence of size {0}' - ' to extended slice of size {1}' - .format(len(value), len(indices))) - - # Keep a log of values that are set so that we can - # roll back changes if ordering is violated. - - log = [] - _append = log.append - - for idx, val in zip(indices, value): - pos, loc = _pos(idx) - _append((idx, _lists[pos][loc], val)) - _lists[pos][loc] = val - if len(_lists[pos]) == (loc + 1): - _maxes[pos] = val - - try: - # Validate ordering of new values. - - for idx, oldval, newval in log: - _check_order(idx, newval) - - except ValueError: - - # Roll back changes from log. - - for idx, oldval, newval in log: - pos, loc = _pos(idx) - _lists[pos][loc] = oldval - if len(_lists[pos]) == (loc + 1): - _maxes[pos] = oldval - - raise - else: - # Test ordering using indexing. If the value given - # doesn't support getitem, convert it to a list. - - if not hasattr(value, '__getitem__'): - value = list(value) - - # Check that the given values are ordered properly. - - ordered = all(value[pos - 1] <= value[pos] - for pos in range(1, len(value))) - - if not ordered: - raise ValueError('given sequence not in sort order') - - # Check ordering in context of sorted list. - - if not start or not len(value): - # Nothing to check on the lhs. - pass - else: - if self[start - 1] > value[0]: - msg = '{0} not in sort order at index {1}'.format(repr(value[0]), start) - raise ValueError(msg) - - if stop == len(self) or not len(value): - # Nothing to check on the rhs. - pass - else: - # "stop" is exclusive so we don't need - # to add one for the index. - if self[stop] < value[-1]: - msg = '{0} not in sort order at index {1}'.format(repr(value[-1]), stop) - raise ValueError(msg) - - # Delete the existing values. - - del self[index] - - # Insert the new values. - - _insert = self.insert - for idx, val in enumerate(value): - _insert(start + idx, val) - else: - pos, loc = _pos(index) - _check_order(index, value) - _lists[pos][loc] = value - if len(_lists[pos]) == (loc + 1): - _maxes[pos] = value - - def __iter__(self): - """Create an iterator over the list.""" - return chain.from_iterable(self._lists) - - def __reversed__(self): - """Create an iterator to traverse the list in reverse.""" - return chain.from_iterable(map(reversed, reversed(self._lists))) - - def __len__(self): - """Return the number of elements in the list.""" - return self._len - - def bisect_left(self, val): - """ - Similar to the *bisect* module in the standard library, this returns an - appropriate index to insert *val*. If *val* is already present, the - insertion point will be before (to the left of) any existing entries. - """ - _maxes = self._maxes - - if not _maxes: - return 0 - - pos = bisect_left(_maxes, val) - - if pos == len(_maxes): - return self._len - - idx = bisect_left(self._lists[pos], val) - - return self._loc(pos, idx) - - def bisect_right(self, val): - """ - Same as *bisect_left*, but if *val* is already present, the insertion - point will be after (to the right of) any existing entries. - """ - _maxes = self._maxes - - if not _maxes: - return 0 - - pos = bisect_right(_maxes, val) - - if pos == len(_maxes): - return self._len - - idx = bisect_right(self._lists[pos], val) - - return self._loc(pos, idx) - - bisect = bisect_right - - def count(self, val): - """Return the number of occurrences of *val* in the list.""" - _maxes = self._maxes - - if not _maxes: - return 0 - - pos_left = bisect_left(_maxes, val) - - if pos_left == len(_maxes): - return 0 - - _lists = self._lists - idx_left = bisect_left(_lists[pos_left], val) - pos_right = bisect_right(_maxes, val) - - if pos_right == len(_maxes): - return self._len - self._loc(pos_left, idx_left) - - idx_right = bisect_right(_lists[pos_right], val) - - if pos_left == pos_right: - return idx_right - idx_left - - right = self._loc(pos_right, idx_right) - left = self._loc(pos_left, idx_left) - - return right - left - - def copy(self): - """Return a shallow copy of the sorted list.""" - return self.__class__(self, load=self._load) - - __copy__ = copy - - def append(self, val): - """ - Append the element *val* to the list. Raises a ValueError if the *val* - would violate the sort order. - """ - _maxes, _lists = self._maxes, self._lists - - if not _maxes: - _maxes.append(val) - _lists.append([val]) - self._len = 1 - return - - pos = len(_lists) - 1 - - if val < _lists[pos][-1]: - msg = '{0} not in sort order at index {1}'.format(repr(val), self._len) - raise ValueError(msg) - - _maxes[pos] = val - _lists[pos].append(val) - self._len += 1 - self._expand(pos) - - def extend(self, values): - """ - Extend the list by appending all elements from the *values*. Raises a - ValueError if the sort order would be violated. - """ - _maxes, _lists, _load = self._maxes, self._lists, self._load - - if not isinstance(values, list): - values = list(values) - - if any(values[pos - 1] > values[pos] - for pos in range(1, len(values))): - raise ValueError('given sequence not in sort order') - - offset = 0 - - if _maxes: - if values[0] < _lists[-1][-1]: - msg = '{0} not in sort order at index {1}'.format(repr(values[0]), self._len) - raise ValueError(msg) - - if len(_lists[-1]) < self._half: - _lists[-1].extend(values[:_load]) - _maxes[-1] = _lists[-1][-1] - offset = _load - - len_lists = len(_lists) - - for idx in range(offset, len(values), _load): - _lists.append(values[idx:(idx + _load)]) - _maxes.append(_lists[-1][-1]) - - _index = self._index - - if len_lists == len(_lists): - len_index = len(_index) - if len_index > 0: - len_values = len(values) - child = len_index - 1 - while child: - _index[child] += len_values - child = (child - 1) >> 1 - _index[0] += len_values - else: - del _index[:] - - self._len += len(values) - - def insert(self, idx, val): - """ - Insert the element *val* into the list at *idx*. Raises a ValueError if - the *val* at *idx* would violate the sort order. - """ - _maxes, _lists, _len = self._maxes, self._lists, self._len - - if idx < 0: - idx += _len - if idx < 0: - idx = 0 - if idx > _len: - idx = _len - - if not _maxes: - # The idx must be zero by the inequalities above. - _maxes.append(val) - _lists.append([val]) - self._len = 1 - return - - if not idx: - if val > _lists[0][0]: - msg = '{0} not in sort order at index {1}'.format(repr(val), 0) - raise ValueError(msg) - else: - _lists[0].insert(0, val) - self._expand(0) - self._len += 1 - return - - if idx == _len: - pos = len(_lists) - 1 - if _lists[pos][-1] > val: - msg = '{0} not in sort order at index {1}'.format(repr(val), _len) - raise ValueError(msg) - else: - _lists[pos].append(val) - _maxes[pos] = _lists[pos][-1] - self._expand(pos) - self._len += 1 - return - - pos, idx = self._pos(idx) - idx_before = idx - 1 - if idx_before < 0: - pos_before = pos - 1 - idx_before = len(_lists[pos_before]) - 1 - else: - pos_before = pos - - before = _lists[pos_before][idx_before] - if before <= val <= _lists[pos][idx]: - _lists[pos].insert(idx, val) - self._expand(pos) - self._len += 1 - else: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - def pop(self, idx=-1): - """ - Remove and return item at *idx* (default last). Raises IndexError if - list is empty or index is out of range. Negative indices are supported, - as for slice indices. - """ - if (idx < 0 and -idx > self._len) or (idx >= self._len): - raise IndexError('pop index out of range') - - pos, idx = self._pos(idx) - val = self._lists[pos][idx] - self._delete(pos, idx) - - return val - - def index(self, val, start=None, stop=None): - """ - Return the smallest *k* such that L[k] == val and i <= k < j`. Raises - ValueError if *val* is not present. *stop* defaults to the end of the - list. *start* defaults to the beginning. Negative indices are supported, - as for slice indices. - """ - _len, _maxes = self._len, self._maxes - - if not _maxes: - raise ValueError('{0} is not in list'.format(repr(val))) - - if start is None: - start = 0 - if start < 0: - start += _len - if start < 0: - start = 0 - - if stop is None: - stop = _len - if stop < 0: - stop += _len - if stop > _len: - stop = _len - - if stop <= start: - raise ValueError('{0} is not in list'.format(repr(val))) - - stop -= 1 - pos_left = bisect_left(_maxes, val) - - if pos_left == len(_maxes): - raise ValueError('{0} is not in list'.format(repr(val))) - - _lists = self._lists - idx_left = bisect_left(_lists[pos_left], val) - - if _lists[pos_left][idx_left] != val: - raise ValueError('{0} is not in list'.format(repr(val))) - - left = self._loc(pos_left, idx_left) - - if start <= left: - if left <= stop: - return left - else: - right = self.bisect_right(val) - 1 - - if start <= right: - return start - - raise ValueError('{0} is not in list'.format(repr(val))) - - def as_list(self): - """Very efficiently convert the SortedList to a list.""" - return reduce(iadd, self._lists, []) - - def __add__(self, that): - """ - Return a new sorted list containing all the elements in *self* and - *that*. Elements in *that* do not need to be properly ordered with - respect to *self*. - """ - values = self.as_list() - values.extend(that) - return self.__class__(values, load=self._load) - - def __iadd__(self, that): - """ - Update *self* to include all values in *that*. Elements in *that* do not - need to be properly ordered with respect to *self*. - """ - self.update(that) - return self - - def __mul__(self, that): - """ - Return a new sorted list containing *that* shallow copies of each item - in SortedList. - """ - values = self.as_list() * that - return self.__class__(values, load=self._load) - - def __imul__(self, that): - """ - Increase the length of the list by appending *that* shallow copies of - each item. - """ - values = self.as_list() * that - self.clear() - self.update(values) - return self - - def __eq__(self, that): - """Compare two Sequences for equality.""" - return ((self._len == len(that)) - and all(lhs == rhs for lhs, rhs in zip(self, that))) - - def __ne__(self, that): - """Compare two Sequences for inequality.""" - return ((self._len != len(that)) - or any(lhs != rhs for lhs, rhs in zip(self, that))) - - def __lt__(self, that): - """Compare two Sequences for less than.""" - return ((self._len <= len(that)) - and all(lhs < rhs for lhs, rhs in zip(self, that))) - - def __le__(self, that): - """Compare two Sequences for less than equal.""" - return ((self._len <= len(that)) - and all(lhs <= rhs for lhs, rhs in zip(self, that))) - - def __gt__(self, that): - """Compare two Sequences for greater than.""" - return ((self._len >= len(that)) - and all(lhs > rhs for lhs, rhs in zip(self, that))) - - def __ge__(self, that): - """Compare two Sequences for greater than equal.""" - return ((self._len >= len(that)) - and all(lhs >= rhs for lhs, rhs in zip(self, that))) - - @recursive_repr - def __repr__(self): - """Return string representation of SortedList.""" - temp = '{0}({1}, load={2})' - return temp.format( - self.__class__.__name__, - repr(list(self)), - repr(self._load) - ) - - def _check(self): - try: - # Check load parameters. - - assert self._load >= 4 - assert self._half == (self._load >> 1) - assert self._twice == (self._load * 2) - - # Check empty sorted list case. - - if self._maxes == []: - assert self._lists == [] - return - - assert len(self._maxes) > 0 and len(self._lists) > 0 - - # Check all sublists are sorted. - - assert all(sublist[pos - 1] <= sublist[pos] - for sublist in self._lists - for pos in range(1, len(sublist))) - - # Check beginning/end of sublists are sorted. - - for pos in range(1, len(self._lists)): - assert self._lists[pos - 1][-1] <= self._lists[pos][0] - - # Check length of _maxes and _lists match. - - assert len(self._maxes) == len(self._lists) - - # Check _maxes is a map of _lists. - - assert all(self._maxes[pos] == self._lists[pos][-1] - for pos in range(len(self._maxes))) - - # Check load level is less than _twice. - - assert all(len(sublist) <= self._twice for sublist in self._lists) - - # Check load level is greater than _half for all - # but the last sublist. - - assert all(len(self._lists[pos]) >= self._half - for pos in range(0, len(self._lists) - 1)) - - # Check length. - - assert self._len == sum(len(sublist) for sublist in self._lists) - - # Check index. - - if len(self._index): - assert len(self._index) == self._offset + len(self._lists) - assert self._len == self._index[0] - - def test_offset_pos(pos): - from_index = self._index[self._offset + pos] - return from_index == len(self._lists[pos]) - - assert all(test_offset_pos(pos) - for pos in range(len(self._lists))) - - for pos in range(self._offset): - child = (pos << 1) + 1 - if self._index[pos] == 0: - assert child >= len(self._index) - elif child + 1 == len(self._index): - assert self._index[pos] == self._index[child] - else: - child_sum = self._index[child] + self._index[child + 1] - assert self._index[pos] == child_sum - - except: - import sys - import traceback - - traceback.print_exc(file=sys.stdout) - - print('len', self._len) - print('load', self._load, self._half, self._twice) - print('offset', self._offset) - print('len_index', len(self._index)) - print('index', self._index) - print('len_maxes', len(self._maxes)) - print('maxes', self._maxes) - print('len_lists', len(self._lists)) - print('lists', self._lists) - - raise diff -Nru tophat-2.1.1+dfsg/src/sortedcontainers/sortedlistwithkey.py tophat-2.1.1+dfsg1/src/sortedcontainers/sortedlistwithkey.py --- tophat-2.1.1+dfsg/src/sortedcontainers/sortedlistwithkey.py 2016-02-14 18:21:17.801079000 +0000 +++ tophat-2.1.1+dfsg1/src/sortedcontainers/sortedlistwithkey.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1331 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Sorted list implementation. - -from __future__ import print_function -from sys import hexversion - -from .sortedlist import recursive_repr -from bisect import bisect_left, bisect_right, insort -from itertools import chain, repeat, starmap -from collections import MutableSequence -from operator import iadd, add -from functools import wraps -from math import log - -if hexversion < 0x03000000: - from itertools import izip as zip - from itertools import imap as map -else: - from functools import reduce - -def identity(value): - return value - -class SortedListWithKey(MutableSequence): - """ - SortedList provides most of the same methods as a list but keeps the items - in sorted order. - """ - - def __init__(self, iterable=None, key=identity, load=1000): - """ - SortedList provides most of the same methods as a list but keeps the - items in sorted order. - - An optional *iterable* provides an initial series of items to populate - the SortedList. - - An optional *load* specifies the load-factor of the list. The default - load factor of '1000' works well for lists from tens to tens of millions - of elements. Good practice is to use a value that is the cube root of - the list size. With billions of elements, the best load factor depends - on your usage. It's best to leave the load factor at the default until - you start benchmarking. - """ - self._len, self._maxes, self._lists, self._keys, self._index = 0, [], [], [], [] - self._key, self._load, self._twice, self._half = key, load, load * 2, load >> 1 - self._offset = 0 - - if iterable is not None: - self.update(iterable) - - def clear(self): - """Remove all the elements from the list.""" - self._len = 0 - del self._maxes[:] - del self._lists[:] - del self._keys[:] - del self._index[:] - - def add(self, val): - """Add the element *val* to the list.""" - _maxes, _lists, _keys = self._maxes, self._lists, self._keys - - key = self._key(val) - - if _maxes: - pos = bisect_right(_maxes, key) - - if pos == len(_maxes): - pos -= 1 - _maxes[pos] = key - _keys[pos].append(key) - _lists[pos].append(val) - else: - idx = bisect_right(_keys[pos], key) - _keys[pos].insert(idx, key) - _lists[pos].insert(idx, val) - - self._expand(pos) - else: - _maxes.append(key) - _keys.append([key]) - _lists.append([val]) - - self._len += 1 - - def _expand(self, pos): - """ - Splits sublists that are more than double the load level. - - Updates the index when the sublist length is less than double the load - level. This requires incrementing the nodes in a traversal from the leaf - node to the root. For an example traversal see self._loc. - """ - _lists, _keys, _index = self._lists, self._keys, self._index - - if len(_keys[pos]) > self._twice: - _maxes, _load = self._maxes, self._load - - half = _keys[pos][_load:] - half_list = _lists[pos][_load:] - del _keys[pos][_load:] - del _lists[pos][_load:] - _maxes[pos] = _keys[pos][-1] - - _maxes.insert(pos + 1, half[-1]) - _keys.insert(pos + 1, half) - _lists.insert(pos + 1, half_list) - - del _index[:] - else: - if len(_index) > 0: - child = self._offset + pos - while child > 0: - _index[child] += 1 - child = (child - 1) >> 1 - _index[0] += 1 - - def update(self, iterable): - """Update the list by adding all elements from *iterable*.""" - _maxes, _lists, _keys = self._maxes, self._lists, self._keys - values = sorted(iterable, key=self._key) - - if _maxes: - if len(values) * 4 >= self._len: - values.extend(chain.from_iterable(_lists)) - values.sort(key=self._key) - self.clear() - else: - _add = self.add - for val in values: - _add(val) - return - - _load, _index = self._load, self._index - _lists.extend(values[pos:(pos + _load)] - for pos in range(0, len(values), _load)) - _keys.extend(list(map(self._key, _list)) for _list in _lists) - _maxes.extend(sublist[-1] for sublist in _keys) - self._len = len(values) - del _index[:] - - def __contains__(self, val): - """Return True if and only if *val* is an element in the list.""" - _maxes = self._maxes - - if not _maxes: - return False - - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - return False - - _keys = self._keys - _lists = self._lists - - idx = bisect_left(_keys[pos], key) - - len_keys = len(_keys) - len_sublist = len(_keys[pos]) - - while True: - if _keys[pos][idx] != key: - return False - if _lists[pos][idx] == val: - return True - idx += 1 - if idx == len_sublist: - pos += 1 - if pos == len_keys: - return False - len_sublist = len(_keys[pos]) - idx = 0 - - def discard(self, val): - """ - Remove the first occurrence of *val*. - - If *val* is not a member, does nothing. - """ - _maxes = self._maxes - - if not _maxes: - return - - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - return - - _keys = self._keys - _lists = self._lists - idx = bisect_left(_keys[pos], key) - - len_keys = len(_keys) - len_sublist = len(_keys[pos]) - - while True: - if _keys[pos][idx] != key: - return - if _lists[pos][idx] == val: - self._delete(pos, idx) - return - idx += 1 - if idx == len_sublist: - pos += 1 - if pos == len_keys: - return - len_sublist = len(_keys[pos]) - idx = 0 - - def remove(self, val): - """ - Remove first occurrence of *val*. - - Raises ValueError if *val* is not present. - """ - _maxes = self._maxes - - if not _maxes: - raise ValueError('{0} not in list'.format(repr(val))) - - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - raise ValueError('{0} not in list'.format(repr(val))) - - _keys = self._keys - _lists = self._lists - idx = bisect_left(_keys[pos], key) - - len_keys = len(_keys) - len_sublist = len(_keys[pos]) - - while True: - if _keys[pos][idx] != key: - raise ValueError('{0} not in list'.format(repr(val))) - if _lists[pos][idx] == val: - self._delete(pos, idx) - return - idx += 1 - if idx == len_sublist: - pos += 1 - if pos == len_keys: - raise ValueError('{0} not in list'.format(repr(val))) - len_sublist = len(_keys[pos]) - idx = 0 - - def _delete(self, pos, idx): - """ - Delete the item at the given (pos, idx). - - Combines lists that are less than half the load level. - - Updates the index when the sublist length is more than half the load - level. This requires decrementing the nodes in a traversal from the leaf - node to the root. For an example traversal see self._loc. - """ - _maxes, _lists, _keys, _index = self._maxes, self._lists, self._keys, self._index - - keys_pos = _keys[pos] - lists_pos = _lists[pos] - - del keys_pos[idx] - del lists_pos[idx] - self._len -= 1 - - len_keys_pos = len(keys_pos) - - if len_keys_pos > self._half: - - _maxes[pos] = keys_pos[-1] - - if len(_index) > 0: - child = self._offset + pos - while child > 0: - _index[child] -= 1 - child = (child - 1) >> 1 - _index[0] -= 1 - - elif len(_keys) > 1: - - if not pos: - pos += 1 - - prev = pos - 1 - _keys[prev].extend(_keys[pos]) - _lists[prev].extend(_lists[pos]) - _maxes[prev] = _keys[prev][-1] - - del _keys[pos] - del _lists[pos] - del _maxes[pos] - del _index[:] - - self._expand(prev) - - elif len_keys_pos: - - _maxes[pos] = keys_pos[-1] - - else: - - del _keys[pos] - del _lists[pos] - del _maxes[pos] - del _index[:] - - def _loc(self, pos, idx): - """Convert an index pair (alpha, beta) into a single index that corresponds to - the position of the value in the sorted list. - - Most queries require the index be built. Details of the index are - described in self._build_index. - - Indexing requires traversing the tree from a leaf node to the root. The - parent of each node is easily computable at (pos - 1) // 2. - - Left-child nodes are always at odd indices and right-child nodes are - always at even indices. - - When traversing up from a right-child node, increment the total by the - left-child node. - - The final index is the sum from traversal and the index in the sublist. - - For example, using the index from self._build_index: - - _index = 14 5 9 3 2 4 5 - _offset = 3 - - Tree: - - 14 - 5 9 - 3 2 4 5 - - Converting index pair (2, 3) into a single index involves iterating like - so: - - 1. Starting at the leaf node: offset + alpha = 3 + 2 = 5. We identify - the node as a left-child node. At such nodes, we simply traverse to - the parent. - - 2. At node 9, position 2, we recognize the node as a right-child node - and accumulate the left-child in our total. Total is now 5 and we - traverse to the parent at position 0. - - 3. Iteration ends at the root. - - Computing the index is the sum of the total and beta: 5 + 3 = 8. - """ - if not pos: - return idx - - _index = self._index - - if not len(_index): - self._build_index() - - total = 0 - - # Increment pos to point in the index to len(self._lists[pos]). - - pos += self._offset - - # Iterate until reaching the root of the index tree at pos = 0. - - while pos: - - # Right-child nodes are at odd indices. At such indices - # account the total below the left child node. - - if not (pos & 1): - total += _index[pos - 1] - - # Advance pos to the parent node. - - pos = (pos - 1) >> 1 - - return total + idx - - def _pos(self, idx): - """Convert an index into a pair (alpha, beta) that can be used to access - the corresponding _lists[alpha][beta] position. - - Most queries require the index be built. Details of the index are - described in self._build_index. - - Indexing requires traversing the tree to a leaf node. Each node has - two children which are easily computable. Given an index, pos, the - left-child is at pos * 2 + 1 and the right-child is at pos * 2 + 2. - - When the index is less than the left-child, traversal moves to the - left sub-tree. Otherwise, the index is decremented by the left-child - and traversal moves to the right sub-tree. - - At a child node, the indexing pair is computed from the relative - position of the child node as compared with the offset and the remaining - index. - - For example, using the index from self._build_index: - - _index = 14 5 9 3 2 4 5 - _offset = 3 - - Tree: - - 14 - 5 9 - 3 2 4 5 - - Indexing position 8 involves iterating like so: - - 1. Starting at the root, position 0, 8 is compared with the left-child - node (5) which it is greater than. When greater the index is - decremented and the position is updated to the right child node. - - 2. At node 9 with index 3, we again compare the index to the left-child - node with value 4. Because the index is the less than the left-child - node, we simply traverse to the left. - - 3. At node 4 with index 3, we recognize that we are at a leaf node and - stop iterating. - - 4. To compute the sublist index, we subtract the offset from the index - of the leaf node: 5 - 3 = 2. To compute the index in the sublist, we - simply use the index remaining from iteration. In this case, 3. - - The final index pair from our example is (2, 3) which corresponds to - index 8 in the sorted list. - """ - _len, _lists = self._len, self._lists - - if idx < 0: - last_len = len(_lists[-1]) - if (-idx) <= last_len: - return len(_lists) - 1, last_len + idx - idx += _len - if idx < 0: - raise IndexError('list index out of range') - elif idx >= _len: - raise IndexError('list index out of range') - - if idx < len(_lists[0]): - return 0, idx - - _index = self._index - - if not len(_index): - self._build_index() - - pos = 0 - len_index = len(_index) - child = (pos << 1) + 1 - - while child < len_index: - index_child = _index[child] - - if idx < index_child: - pos = child - else: - idx -= index_child - pos = child + 1 - - child = (pos << 1) + 1 - - return (pos - self._offset, idx) - - def _build_index(self): - """Build an index for indexing the sorted list. - - Indexes are represented as binary trees in a dense array notation - similar to a binary heap. - - For example, given a _lists representation storing integers: - - [0]: 1 2 3 - [1]: 4 5 - [2]: 6 7 8 9 - [3]: 10 11 12 13 14 - - The first transformation maps the sub-lists by their length. The - first row of the index is the length of the sub-lists. - - [0]: 3 2 4 5 - - Each row after that is the sum of consecutive pairs of the previous row: - - [1]: 5 9 - [2]: 14 - - Finally, the index is built by concatenating these lists together: - - _index = 14 5 9 3 2 4 5 - - An offset storing the start of the first row is also stored: - - _offset = 3 - - When built, the index can be used for efficient indexing into the list. - See the comment and notes on self._pos for details. - """ - row0 = list(map(len, self._lists)) - - if len(row0) == 1: - self._index[:] = row0 - self._offset = 0 - return - - head = iter(row0) - tail = iter(head) - row1 = list(starmap(add, zip(head, tail))) - - if len(row0) & 1: - row1.append(row0[-1]) - - if len(row1) == 1: - self._index[:] = row1 + row0 - self._offset = 1 - return - - size = 2 ** (int(log(len(row1) - 1, 2)) + 1) - row1.extend(repeat(0, size - len(row1))) - tree = [row0, row1] - - while len(tree[-1]) > 1: - head = iter(tree[-1]) - tail = iter(head) - row = list(starmap(add, zip(head, tail))) - tree.append(row) - - reduce(iadd, reversed(tree), self._index) - self._offset = size * 2 - 1 - - def _slice(self, slc): - start, stop, step = slc.start, slc.stop, slc.step - - if step == 0: - raise ValueError('slice step cannot be zero') - - # Set defaults for missing values. - - if step is None: - step = 1 - - if step > 0: - if start is None: - start = 0 - - if stop is None: - stop = len(self) - elif stop < 0: - stop += len(self) - else: - if start is None: - start = len(self) - 1 - - if stop is None: - stop = -1 - elif stop < 0: - stop += len(self) - - if start < 0: - start += len(self) - - # Fix indices that are too big or too small. - # Slice notation is surprisingly permissive - # where normal indexing would raise IndexError. - - if step > 0: - if start < 0: - start = 0 - elif start > len(self): - start = len(self) - - if stop < 0: - stop = 0 - elif stop > len(self): - stop = len(self) - else: - if start < 0: - start = -1 - elif start >= len(self): - start = len(self) - 1 - - if stop < 0: - stop = -1 - elif stop > len(self): - stop = len(self) - - return start, stop, step - - def __delitem__(self, idx): - """Remove the element at *idx*. Supports slicing.""" - if isinstance(idx, slice): - start, stop, step = self._slice(idx) - - if ((step == 1) and (start < stop) - and ((stop - start) * 8 >= self._len)): - - values = self[:start] - if stop < self._len: - values += self[stop:] - self.clear() - self.update(values) - return - - indices = range(start, stop, step) - - # Delete items from greatest index to least so - # that the indices remain valid throughout iteration. - - if step > 0: - indices = reversed(indices) - - _pos, _delete = self._pos, self._delete - - for index in indices: - pos, idx = _pos(index) - _delete(pos, idx) - else: - pos, idx = self._pos(idx) - self._delete(pos, idx) - - def __getitem__(self, idx): - """Return the element at *idx*. Supports slicing.""" - _lists = self._lists - - if isinstance(idx, slice): - start, stop, step = self._slice(idx) - - if step == 1 and start < stop: - if start == 0 and stop == self._len: - return self.as_list() - - start_pos, start_idx = self._pos(start) - - if stop == self._len: - stop_pos = len(_lists) - 1 - stop_idx = len(_lists[stop_pos]) - else: - stop_pos, stop_idx = self._pos(stop) - - if start_pos == stop_pos: - return _lists[start_pos][start_idx:stop_idx] - - prefix = _lists[start_pos][start_idx:] - middle = _lists[(start_pos + 1):stop_pos] - result = reduce(iadd, middle, prefix) - result += _lists[stop_pos][:stop_idx] - - return result - - if step == -1 and start > stop: - result = self[(stop + 1):(start + 1)] - result.reverse() - return result - - # Return a list because a negative step could - # reverse the order of the items and this could - # be the desired behavior. - - indices = range(start, stop, step) - return list(self[index] for index in indices) - else: - pos, idx = self._pos(idx) - return _lists[pos][idx] - - def _check_order(self, idx, key, val): - _keys, _len = self._keys, self._len - - pos, loc = self._pos(idx) - - if idx < 0: - idx += _len - - # Check that the inserted value is not less than the - # previous value. - - if idx > 0: - idx_prev = loc - 1 - pos_prev = pos - - if idx_prev < 0: - pos_prev -= 1 - idx_prev = len(_keys[pos_prev]) - 1 - - if _keys[pos_prev][idx_prev] > key: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - # Check that the inserted value is not greater than - # the previous value. - - if idx < (_len - 1): - idx_next = loc + 1 - pos_next = pos - - if idx_next == len(_keys[pos_next]): - pos_next += 1 - idx_next = 0 - - if _keys[pos_next][idx_next] < key: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - def __setitem__(self, index, value): - """ - Replace the item at position *index* with *value*. - - Supports slice notation. Raises a :exc:`ValueError` if the sort order - would be violated. When used with a slice and iterable, the - :exc:`ValueError` is raised before the list is mutated if the sort order - would be violated by the operation. - """ - _maxes, _lists, _keys, _pos = self._maxes, self._lists, self._keys, self._pos - _check_order = self._check_order - - if isinstance(index, slice): - start, stop, step = self._slice(index) - indices = range(start, stop, step) - - if step != 1: - if not hasattr(value, '__len__'): - value = list(value) - - indices = list(indices) - - if len(value) != len(indices): - raise ValueError( - 'attempt to assign sequence of size {0}' - ' to extended slice of size {1}' - .format(len(value), len(indices))) - - # Keep a log of values that are set so that we can - # roll back changes if ordering is violated. - - log = [] - _append = log.append - - for idx, val in zip(indices, value): - pos, loc = _pos(idx) - key = self._key(val) - _append((idx, _keys[pos][loc], key, _lists[pos][loc], val)) - _keys[pos][loc] = key - _lists[pos][loc] = val - if len(_keys[pos]) == (loc + 1): - _maxes[pos] = key - - try: - # Validate ordering of new values. - - for idx, oldkey, newkey, oldval, newval in log: - _check_order(idx, newkey, newval) - - except ValueError: - - # Roll back changes from log. - - for idx, oldkey, newkey, oldval, newval in log: - pos, loc = _pos(idx) - _keys[pos][loc] = oldkey - _lists[pos][loc] = oldval - if len(_keys[pos]) == (loc + 1): - _maxes[pos] = oldkey - - raise - else: - # Test ordering using indexing. If the value given - # doesn't support getitem, convert it to a list. - - if not hasattr(value, '__getitem__'): - value = list(value) - - # Check that the given values are ordered properly. - - keys = list(map(self._key, value)) - ordered = all(keys[pos - 1] <= keys[pos] - for pos in range(1, len(keys))) - - if not ordered: - raise ValueError('given sequence not in sort order') - - # Check ordering in context of sorted list. - - if not start or not len(value): - # Nothing to check on the lhs. - pass - else: - pos, loc = _pos(start - 1) - if _keys[pos][loc] > keys[0]: - msg = '{0} not in sort order at index {1}'.format(repr(value[0]), start) - raise ValueError(msg) - - if stop == len(self) or not len(value): - # Nothing to check on the rhs. - pass - else: - # "stop" is exclusive so we don't need - # to add one for the index. - pos, loc = _pos(stop) - if _keys[pos][loc] < keys[-1]: - msg = '{0} not in sort order at index {1}'.format(repr(value[-1]), stop) - raise ValueError(msg) - - # Delete the existing values. - - del self[index] - - # Insert the new values. - - _insert = self.insert - for idx, val in enumerate(value): - _insert(start + idx, val) - else: - pos, loc = _pos(index) - key = self._key(value) - _check_order(index, key, value) - _keys[pos][loc] = key - _lists[pos][loc] = value - if len(_lists[pos]) == (loc + 1): - _maxes[pos] = key - - def __iter__(self): - """Create an iterator over the list.""" - return chain.from_iterable(self._lists) - - def __reversed__(self): - """Create an iterator to traverse the list in reverse.""" - return chain.from_iterable(map(reversed, reversed(self._lists))) - - def __len__(self): - """Return the number of elements in the list.""" - return self._len - - def bisect_left(self, val): - """ - Similar to the *bisect* module in the standard library, this returns an - appropriate index to insert *val*. If *val* is already present, the - insertion point will be before (to the left of) any existing entries. - """ - _maxes = self._maxes - - if not _maxes: - return 0 - - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - return self._len - - idx = bisect_left(self._keys[pos], key) - - return self._loc(pos, idx) - - def bisect_right(self, val): - """ - Same as *bisect_left*, but if *val* is already present, the insertion - point will be after (to the right of) any existing entries. - """ - _maxes = self._maxes - - if not _maxes: - return 0 - - key = self._key(val) - pos = bisect_right(_maxes, key) - - if pos == len(_maxes): - return self._len - - idx = bisect_right(self._keys[pos], key) - - return self._loc(pos, idx) - - bisect = bisect_right - - def count(self, val): - """Return the number of occurrences of *val* in the list.""" - _maxes = self._maxes - - if not _maxes: - return 0 - - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - return 0 - - _keys = self._keys - _lists = self._lists - - idx = bisect_left(_keys[pos], key) - - total = 0 - len_keys = len(_keys) - len_sublist = len(_keys[pos]) - - while True: - if _keys[pos][idx] != key: - return total - if _lists[pos][idx] == val: - total += 1 - idx += 1 - if idx == len_sublist: - pos += 1 - if pos == len_keys: - return total - len_sublist = len(_keys[pos]) - idx = 0 - - def copy(self): - """Return a shallow copy of the sorted list.""" - return self.__class__(self, key=self._key, load=self._load) - - __copy__ = copy - - def append(self, val): - """ - Append the element *val* to the list. Raises a ValueError if the *val* - would violate the sort order. - """ - _maxes, _lists, _keys = self._maxes, self._lists, self._keys - - key = self._key(val) - - if not _maxes: - _maxes.append(key) - _keys.append([key]) - _lists.append([val]) - self._len = 1 - return - - pos = len(_keys) - 1 - - if key < _keys[pos][-1]: - msg = '{0} not in sort order at index {1}'.format(repr(val), self._len) - raise ValueError(msg) - - _maxes[pos] = key - _keys[pos].append(key) - _lists[pos].append(val) - self._len += 1 - self._expand(pos) - - def extend(self, values): - """ - Extend the list by appending all elements from the *values*. Raises a - ValueError if the sort order would be violated. - """ - _maxes, _keys, _lists, _load = self._maxes, self._keys, self._lists, self._load - - if not isinstance(values, list): - values = list(values) - - keys = list(map(self._key, values)) - - if any(keys[pos - 1] > keys[pos] - for pos in range(1, len(keys))): - raise ValueError('given sequence not in sort order') - - offset = 0 - - if _maxes: - if keys[0] < _keys[-1][-1]: - msg = '{0} not in sort order at index {1}'.format(repr(values[0]), self._len) - raise ValueError(msg) - - if len(_keys[-1]) < self._half: - _lists[-1].extend(values[:_load]) - _keys[-1].extend(keys[:_load]) - _maxes[-1] = _keys[-1][-1] - offset = _load - - len_keys = len(_keys) - - for idx in range(offset, len(keys), _load): - _lists.append(values[idx:(idx + _load)]) - _keys.append(keys[idx:(idx + _load)]) - _maxes.append(_keys[-1][-1]) - - _index = self._index - - if len_keys == len(_keys): - len_index = len(_index) - if len_index > 0: - len_values = len(values) - child = len_index - 1 - while child: - _index[child] += len_values - child = (child - 1) >> 1 - _index[0] += len_values - else: - del _index[:] - - self._len += len(values) - - def insert(self, idx, val): - """ - Insert the element *val* into the list at *idx*. Raises a ValueError if - the *val* at *idx* would violate the sort order. - """ - _maxes, _lists, _keys, _len = self._maxes, self._lists, self._keys, self._len - - if idx < 0: - idx += _len - if idx < 0: - idx = 0 - if idx > _len: - idx = _len - - key = self._key(val) - - if not _maxes: - # The idx must be zero by the inequalities above. - _maxes.append(key) - _lists.append([val]) - _keys.append([key]) - self._len = 1 - return - - if not idx: - if key > _keys[0][0]: - msg = '{0} not in sort order at index {1}'.format(repr(val), 0) - raise ValueError(msg) - else: - _keys[0].insert(0, key) - _lists[0].insert(0, val) - self._expand(0) - self._len += 1 - return - - if idx == _len: - pos = len(_keys) - 1 - if _keys[pos][-1] > key: - msg = '{0} not in sort order at index {1}'.format(repr(val), _len) - raise ValueError(msg) - else: - _keys[pos].append(key) - _lists[pos].append(val) - _maxes[pos] = _keys[pos][-1] - self._expand(pos) - self._len += 1 - return - - pos, idx = self._pos(idx) - idx_before = idx - 1 - if idx_before < 0: - pos_before = pos - 1 - idx_before = len(_keys[pos_before]) - 1 - else: - pos_before = pos - - before = _keys[pos_before][idx_before] - if before <= key <= _keys[pos][idx]: - _lists[pos].insert(idx, val) - _keys[pos].insert(idx, key) - self._expand(pos) - self._len += 1 - else: - msg = '{0} not in sort order at index {1}'.format(repr(val), idx) - raise ValueError(msg) - - def pop(self, idx=-1): - """ - Remove and return item at *idx* (default last). Raises IndexError if - list is empty or index is out of range. Negative indices are supported, - as for slice indices. - """ - if (idx < 0 and -idx > self._len) or (idx >= self._len): - raise IndexError('pop index out of range') - - pos, idx = self._pos(idx) - val = self._lists[pos][idx] - self._delete(pos, idx) - - return val - - def index(self, val, start=None, stop=None): - """ - Return the smallest *k* such that L[k] == val and i <= k < j`. Raises - ValueError if *val* is not present. *stop* defaults to the end of the - list. *start* defaults to the beginning. Negative indices are supported, - as for slice indices. - """ - _len, _maxes = self._len, self._maxes - - if not _maxes: - raise ValueError('{0} is not in list'.format(repr(val))) - - if start is None: - start = 0 - if start < 0: - start += _len - if start < 0: - start = 0 - - if stop is None: - stop = _len - if stop < 0: - stop += _len - if stop > _len: - stop = _len - - if stop <= start: - raise ValueError('{0} is not in list'.format(repr(val))) - - stop -= 1 - key = self._key(val) - pos = bisect_left(_maxes, key) - - if pos == len(_maxes): - raise ValueError('{0} is not in list'.format(repr(val))) - - _keys = self._keys - _lists = self._lists - - idx = bisect_left(_keys[pos], key) - - len_keys = len(_keys) - len_sublist = len(_keys[pos]) - - while True: - if _keys[pos][idx] != key: - raise ValueError('{0} is not in list'.format(repr(val))) - if _lists[pos][idx] == val: - loc = self._loc(pos, idx) - if start <= loc <= stop: - return loc - elif loc > stop: - break - idx += 1 - if idx == len_sublist: - pos += 1 - if pos == len_keys: - raise ValueError('{0} is not in list'.format(repr(val))) - len_sublist = len(_keys[pos]) - idx = 0 - - raise ValueError('{0} is not in list'.format(repr(val))) - - def as_list(self): - """Very efficiently convert the SortedList to a list.""" - return reduce(iadd, self._lists, []) - - def __add__(self, that): - """ - Return a new sorted list containing all the elements in *self* and - *that*. Elements in *that* do not need to be properly ordered with - respect to *self*. - """ - values = self.as_list() - values.extend(that) - return self.__class__(values, key=self._key, load=self._load) - - def __iadd__(self, that): - """ - Update *self* to include all values in *that*. Elements in *that* do not - need to be properly ordered with respect to *self*. - """ - self.update(that) - return self - - def __mul__(self, that): - """ - Return a new sorted list containing *that* shallow copies of each item - in SortedList. - """ - values = self.as_list() * that - return self.__class__(values, key=self._key, load=self._load) - - def __imul__(self, that): - """ - Increase the length of the list by appending *that* shallow copies of - each item. - """ - values = self.as_list() * that - self.clear() - self.update(values) - return self - - def __eq__(self, that): - """Compare two Sequences for equality.""" - return ((self._len == len(that)) - and all(lhs == rhs for lhs, rhs in zip(self, that))) - - def __ne__(self, that): - """Compare two Sequences for inequality.""" - return ((self._len != len(that)) - or any(lhs != rhs for lhs, rhs in zip(self, that))) - - def __lt__(self, that): - """Compare two Sequences for less than.""" - return ((self._len <= len(that)) - and all(lhs < rhs for lhs, rhs in zip(self, that))) - - def __le__(self, that): - """Compare two Sequences for less than equal.""" - return ((self._len <= len(that)) - and all(lhs <= rhs for lhs, rhs in zip(self, that))) - - def __gt__(self, that): - """Compare two Sequences for greater than.""" - return ((self._len >= len(that)) - and all(lhs > rhs for lhs, rhs in zip(self, that))) - - def __ge__(self, that): - """Compare two Sequences for greater than equal.""" - return ((self._len >= len(that)) - and all(lhs >= rhs for lhs, rhs in zip(self, that))) - - @recursive_repr - def __repr__(self): - """Return string representation of SortedListWithKey.""" - temp = '{0}({1}, key={2}, load={3})' - return temp.format( - self.__class__.__name__, - repr(list(self)), - repr(self._key), - repr(self._load) - ) - - def _check(self): - try: - # Check load parameters. - - assert self._load >= 4 - assert self._half == (self._load >> 1) - assert self._twice == (self._load * 2) - - # Check empty sorted list case. - - if self._maxes == []: - assert self._keys == [] - assert self._lists == [] - return - - assert len(self._maxes) > 0 and len(self._keys) > 0 and len(self._lists) > 0 - - # Check all sublists are sorted. - - assert all(sublist[pos - 1] <= sublist[pos] - for sublist in self._keys - for pos in range(1, len(sublist))) - - # Check beginning/end of sublists are sorted. - - for pos in range(1, len(self._keys)): - assert self._keys[pos - 1][-1] <= self._keys[pos][0] - - # Check length of _maxes and _lists match. - - assert len(self._maxes) == len(self._lists) == len(self._keys) - - # Check _keys matches _key mapped to _lists. - - assert all(len(val_list) == len(key_list) - for val_list, key_list in zip(self._lists, self._keys)) - assert all(self._key(val) == key for val, key in - zip((_val for _val_list in self._lists for _val in _val_list), - (_key for _key_list in self._keys for _key in _key_list))) - - # Check _maxes is a map of _keys. - - assert all(self._maxes[pos] == self._keys[pos][-1] - for pos in range(len(self._maxes))) - - # Check load level is less than _twice. - - assert all(len(sublist) <= self._twice for sublist in self._lists) - - # Check load level is greater than _half for all - # but the last sublist. - - assert all(len(self._lists[pos]) >= self._half - for pos in range(0, len(self._lists) - 1)) - - # Check length. - - assert self._len == sum(len(sublist) for sublist in self._lists) - - # Check index. - - if len(self._index): - assert len(self._index) == self._offset + len(self._lists) - assert self._len == self._index[0] - - def test_offset_pos(pos): - from_index = self._index[self._offset + pos] - return from_index == len(self._lists[pos]) - - assert all(test_offset_pos(pos) - for pos in range(len(self._lists))) - - for pos in range(self._offset): - child = (pos << 1) + 1 - if self._index[pos] == 0: - assert child >= len(self._index) - elif child + 1 == len(self._index): - assert self._index[pos] == self._index[child] - else: - child_sum = self._index[child] + self._index[child + 1] - assert self._index[pos] == child_sum - - except: - import sys - import traceback - - traceback.print_exc(file=sys.stdout) - - print('len', self._len) - print('load', self._load, self._half, self._twice) - print('offset', self._offset) - print('len_index', len(self._index)) - print('index', self._index) - print('len_maxes', len(self._maxes)) - print('maxes', self._maxes) - print('len_keys', len(self._keys)) - print('keys', self._keys) - print('len_lists', len(self._lists)) - print('lists', self._lists) - - raise diff -Nru tophat-2.1.1+dfsg/src/sortedcontainers/sortedset.py tophat-2.1.1+dfsg1/src/sortedcontainers/sortedset.py --- tophat-2.1.1+dfsg/src/sortedcontainers/sortedset.py 2016-02-14 18:21:17.801079000 +0000 +++ tophat-2.1.1+dfsg1/src/sortedcontainers/sortedset.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,294 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Sorted set implementation. - -from .sortedlist import SortedList, recursive_repr -from .sortedlistwithkey import SortedListWithKey -from collections import Set, MutableSet, Sequence -from itertools import chain -import operator as op - -class SortedSet(MutableSet, Sequence): - """ - A `SortedSet` provides the same methods as a `set`. Additionally, a - `SortedSet` maintains its items in sorted order, allowing the `SortedSet` to - be indexed. - - Unlike a `set`, a `SortedSet` requires items be hashable and comparable. - """ - def __init__(self, iterable=None, key=None, load=1000, _set=None): - """ - A `SortedSet` provides the same methods as a `set`. Additionally, a - `SortedSet` maintains its items in sorted order, allowing the - `SortedSet` to be indexed. - - An optional *iterable* provides an initial series of items to populate - the `SortedSet`. - - An optional *key* argument defines a callable that, like the `key` - argument to Python's `sorted` function, extracts a comparison key from - each set item. If no function is specified, the default compares the - set items directly. - - An optional *load* specifies the load-factor of the set. The default - load factor of '1000' works well for sets from tens to tens of millions - of elements. Good practice is to use a value that is the cube root of - the set size. With billions of elements, the best load factor depends - on your usage. It's best to leave the load factor at the default until - you start benchmarking. - """ - self._key = key - self._load = load - - self._set = set() if _set is None else _set - - _set = self._set - self.isdisjoint = _set.isdisjoint - self.issubset = _set.issubset - self.issuperset = _set.issuperset - - if key is None: - self._list = SortedList(self._set, load=load) - else: - self._list = SortedListWithKey(self._set, key=key, load=load) - - _list = self._list - self.bisect_left = _list.bisect_left - self.bisect = _list.bisect - self.bisect_right = _list.bisect_right - self.index = _list.index - - if iterable is not None: - self.update(iterable) - - def __contains__(self, value): - """Return True if and only if *value* is an element in the set.""" - return (value in self._set) - - def __getitem__(self, index): - """ - Return the element at position *index*. - - Supports slice notation and negative indexes. - """ - return self._list[index] - - def __delitem__(self, index): - """ - Remove the element at position *index*. - - Supports slice notation and negative indexes. - """ - _list = self._list - if isinstance(index, slice): - values = _list[index] - self._set.difference_update(values) - else: - value = _list[index] - self._set.remove(value) - del _list[index] - - def _make_cmp(set_op, doc): - def comparer(self, that): - if isinstance(that, SortedSet): - return set_op(self._set, that._set) - elif isinstance(that, Set): - return set_op(self._set, that) - else: - raise TypeError('can only compare to a Set') - - comparer.__name__ = '__{0}__'.format(set_op.__name__) - comparer.__doc__ = 'Return True if and only if ' + doc - - return comparer - - __eq__ = _make_cmp(op.eq, 'self and *that* are equal sets.') - __ne__ = _make_cmp(op.ne, 'self and *that* are inequal sets.') - __lt__ = _make_cmp(op.lt, 'self is a proper subset of *that*.') - __gt__ = _make_cmp(op.gt, 'self is a proper superset of *that*.') - __le__ = _make_cmp(op.le, 'self is a subset of *that*.') - __ge__ = _make_cmp(op.ge, 'self is a superset of *that*.') - - def __len__(self): - """Return the number of elements in the set.""" - return len(self._set) - - def __iter__(self): - """ - Return an iterator over the SortedSet. Elements are iterated over - in their sorted order. - """ - return iter(self._list) - - def __reversed__(self): - """ - Return an iterator over the SortedSet. Elements are iterated over - in their reversed sorted order. - """ - return reversed(self._list) - - def add(self, value): - """Add the element *value* to the set.""" - if value not in self._set: - self._set.add(value) - self._list.add(value) - - def clear(self): - """Remove all elements from the set.""" - self._set.clear() - self._list.clear() - - def copy(self): - """Create a shallow copy of the sorted set.""" - return self.__class__(key=self._key, load=self._load, _set=set(self._set)) - - __copy__ = copy - - def count(self, value): - """Return the number of occurrences of *value* in the set.""" - return 1 if value in self._set else 0 - - def discard(self, value): - """ - Remove the first occurrence of *value*. If *value* is not a member, - does nothing. - """ - if value in self._set: - self._set.remove(value) - self._list.discard(value) - - def pop(self, index=-1): - """ - Remove and return item at *index* (default last). Raises IndexError if - set is empty or index is out of range. Negative indexes are supported, - as for slice indices. - """ - value = self._list.pop(index) - self._set.remove(value) - return value - - def remove(self, value): - """ - Remove first occurrence of *value*. Raises ValueError if - *value* is not present. - """ - self._set.remove(value) - self._list.remove(value) - - def difference(self, *iterables): - """ - Return a new set with elements in the set that are not in the - *iterables*. - """ - diff = self._set.difference(*iterables) - new_set = self.__class__(key=self._key, load=self._load, _set=diff) - return new_set - - __sub__ = difference - __rsub__ = __sub__ - - def difference_update(self, *iterables): - """ - Update the set, removing elements found in keeping only elements - found in any of the *iterables*. - """ - values = set(chain(*iterables)) - if (4 * len(values)) > len(self): - self._set.difference_update(values) - self._list.clear() - self._list.update(self._set) - else: - _discard = self.discard - for value in values: - _discard(value) - return self - - __isub__ = difference_update - - def intersection(self, *iterables): - """ - Return a new set with elements common to the set and all *iterables*. - """ - comb = self._set.intersection(*iterables) - new_set = self.__class__(key=self._key, load=self._load, _set=comb) - return new_set - - __and__ = intersection - __rand__ = __and__ - - def intersection_update(self, *iterables): - """ - Update the set, keeping only elements found in it and all *iterables*. - """ - self._set.intersection_update(*iterables) - self._list.clear() - self._list.update(self._set) - return self - - __iand__ = intersection_update - - def symmetric_difference(self, that): - """ - Return a new set with elements in either *self* or *that* but not both. - """ - diff = self._set.symmetric_difference(that) - new_set = self.__class__(key=self._key, load=self._load, _set=diff) - return new_set - - __xor__ = symmetric_difference - __rxor__ = __xor__ - - def symmetric_difference_update(self, that): - """ - Update the set, keeping only elements found in either *self* or *that*, - but not in both. - """ - self._set.symmetric_difference_update(that) - self._list.clear() - self._list.update(self._set) - return self - - __ixor__ = symmetric_difference_update - - def union(self, *iterables): - """ - Return a new SortedSet with elements from the set and all *iterables*. - """ - return self.__class__(chain(iter(self), *iterables), key=self._key, load=self._load) - - __or__ = union - __ror__ = __or__ - - def update(self, *iterables): - """Update the set, adding elements from all *iterables*.""" - values = set(chain(*iterables)) - if (4 * len(values)) > len(self): - self._set.update(values) - self._list.clear() - self._list.update(self._set) - else: - _add = self.add - for value in values: - _add(value) - return self - - __ior__ = union - - def __reduce__(self): - return (self.__class__, ((), self._key, self._load, self._set)) - - @recursive_repr - def __repr__(self): - temp = '{0}({1}, key={2}, load={3})' - return temp.format( - self.__class__.__name__, - repr(list(self)), - repr(self._key), - repr(self._load) - ) - - def _check(self): - self._list._check() - assert len(self._set) == len(self._list) - _set = self._set - assert all(val in _set for val in self._list)