diff -Nru stringtie-2.1.1+ds/debian/changelog stringtie-2.1.2+ds/debian/changelog --- stringtie-2.1.1+ds/debian/changelog 2020-02-09 06:51:25.000000000 +0000 +++ stringtie-2.1.2+ds/debian/changelog 2020-04-27 13:44:50.000000000 +0000 @@ -1,3 +1,12 @@ +stringtie (2.1.2+ds-1) unstable; urgency=medium + + * New upstream version + * Add salsa-ci file (routine-update) + * Rules-Requires-Root: no (routine-update) + * Removed "debian uupdate" from d/watch + + -- Steffen Moeller Mon, 27 Apr 2020 15:44:50 +0200 + stringtie (2.1.1+ds-2) unstable; urgency=medium * Team upload diff -Nru stringtie-2.1.1+ds/debian/control stringtie-2.1.2+ds/debian/control --- stringtie-2.1.1+ds/debian/control 2020-02-09 06:51:25.000000000 +0000 +++ stringtie-2.1.2+ds/debian/control 2020-04-27 13:44:50.000000000 +0000 @@ -11,6 +11,7 @@ Vcs-Browser: https://salsa.debian.org/med-team/stringtie Vcs-Git: https://salsa.debian.org/med-team/stringtie.git Homepage: http://ccb.jhu.edu/software/stringtie/ +Rules-Requires-Root: no Package: stringtie Architecture: any diff -Nru stringtie-2.1.1+ds/debian/salsa-ci.yml stringtie-2.1.2+ds/debian/salsa-ci.yml --- stringtie-2.1.1+ds/debian/salsa-ci.yml 1970-01-01 00:00:00.000000000 +0000 +++ stringtie-2.1.2+ds/debian/salsa-ci.yml 2020-04-27 13:44:47.000000000 +0000 @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff -Nru stringtie-2.1.1+ds/debian/watch stringtie-2.1.2+ds/debian/watch --- stringtie-2.1.1+ds/debian/watch 2020-02-09 06:51:25.000000000 +0000 +++ stringtie-2.1.2+ds/debian/watch 2020-04-27 13:44:20.000000000 +0000 @@ -1,3 +1,3 @@ version=4 opts="dversionmangle=auto,oversionmangle=s/$/+ds/,repack" \ -http://ccb.jhu.edu/software/stringtie/ dl/stringtie-([\d\.]+)\.tar\.gz debian uupdate +http://ccb.jhu.edu/software/stringtie/ dl/stringtie-([\d\.]+)\.tar\.gz diff -Nru stringtie-2.1.1+ds/gclib/GArgs.cpp stringtie-2.1.2+ds/gclib/GArgs.cpp --- stringtie-2.1.1+ds/gclib/GArgs.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GArgs.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -172,6 +172,9 @@ //value is the next argument if (p+1<_argc && _argv[p+1][0]!=0) { p++; + #if defined(__APPLE__) && defined(DEBUG) + dbg_dequote(_argv[p]); + #endif args[count].value=Gstrdup(_argv[p]); } else { diff -Nru stringtie-2.1.1+ds/gclib/GBam.cpp stringtie-2.1.2+ds/gclib/GBam.cpp --- stringtie-2.1.1+ds/gclib/GBam.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GBam.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -374,6 +374,16 @@ return 0; } + char GBamRecord::tag_char1(const char tag[2]) { //just the first char from Z type tags + uint8_t* s=bam_aux_get(this->b, tag); + if (s==NULL) return 0; + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'A' || type == 'Z') return *(char*)s; + else return 0; + } + int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag uint8_t *s=find_tag(tag); if (s) return ( bam_aux2i(s) ); @@ -393,10 +403,10 @@ } char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag - char c=tag_char("XS"); + char c=tag_char1("XS"); if (c==0) { //try minimap2's "ts" tag - char m=tag_char("ts"); + char m=tag_char1("ts"); if (m=='+' || m=='-') { if ((this->b->core.flag & BAM_FREVERSE) != 0) c=((m=='+') ? '-' : '+'); else c=m; diff -Nru stringtie-2.1.1+ds/gclib/GBam.h stringtie-2.1.2+ds/gclib/GBam.h --- stringtie-2.1.1+ds/gclib/GBam.h 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GBam.h 2020-04-21 20:54:10.000000000 +0000 @@ -194,6 +194,7 @@ int tag_int(const char tag[2]); //return numeric value of tag (for numeric types) float tag_float(const char tag[2]); //return float value of tag (for float types) char tag_char(const char tag[2]); //return char value of tag (for type 'A') + char tag_char1(const char tag[2]); //return char value of tag (for type 'A') char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag char* sequence(); //user should free after use diff -Nru stringtie-2.1.1+ds/gclib/GBase.cpp stringtie-2.1.2+ds/gclib/GBase.cpp --- stringtie-2.1.1+ds/gclib/GBase.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GBase.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -637,11 +637,56 @@ g=h&0xF0000000; if(g) h^=g>>24; h&=0x0fffffff; - } + } GASSERT(h<=0x0fffffff); return h; } + +inline uint32_t fmix32 ( uint32_t h ) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + +int murmur3(const char *key) { + int len=strlen(key); + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = 5381; //seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + for(int i = -nblocks; i; i++) { + uint32_t k1 = blocks[i]; + k1 *= c1; + k1 = (k1 << 15) | (k1 >> (32 - 15)); + k1 *= c2; + h1 ^= k1; + h1 = (k1 << 13) | (k1 >> (32 - 13)); + h1 = h1*5+0xe6546b64; + } + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + switch(len & 3) { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = (k1 << 15) | (k1 >> (32 - 15)); k1 *= c2; h1 ^= k1; + }; + + h1 ^= len; + return (fmix32(h1) & 0x7FFFFFFF); + } + + int djb_hash(const char* cp) { int h = 5381; diff -Nru stringtie-2.1.1+ds/gclib/GBase.h stringtie-2.1.2+ds/gclib/GBase.h --- stringtie-2.1.1+ds/gclib/GBase.h 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GBase.h 2020-04-21 20:54:10.000000000 +0000 @@ -1,6 +1,6 @@ #ifndef G_BASE_DEFINED #define G_BASE_DEFINED -#define GCLIB_VERSION "0.11.4" +#define GCLIB_VERSION "0.11.7" #ifdef HAVE_CONFIG_H #include "config.h" @@ -289,6 +289,7 @@ //alternate hash functions: int fnv1a_hash(const char* cp); int djb_hash(const char* cp); +int murmur3(const char *key); //---- generic base GSeg : genomic segment (interval) -- // coordinates are considered 1-based (so 0 is invalid) diff -Nru stringtie-2.1.1+ds/gclib/GBitVec.h stringtie-2.1.2+ds/gclib/GBitVec.h --- stringtie-2.1.1+ds/gclib/GBitVec.h 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GBitVec.h 2020-04-21 20:54:10.000000000 +0000 @@ -153,7 +153,7 @@ void bitSizeError() { - GError("Error at GBitVec: unsupported BitWord size (%d)!\n", + GError("Error at GBitVec: unsupported BitWord size (%d)!\n", sizeof(BitWord)); } /// count - Returns the number of bits which are set. @@ -277,6 +277,9 @@ } GBitVec &set(uint Idx) { +#ifndef NDEBUG + indexCheck(Idx, Size); +#endif fBits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE); return *this; } @@ -287,6 +290,9 @@ } GBitVec &reset(uint Idx) { +#ifndef NDEBUG + indexCheck(Idx, Size); +#endif fBits[Idx / BITWORD_SIZE] &= ~(1L << (Idx % BITWORD_SIZE)); return *this; } @@ -299,6 +305,9 @@ } GBitVec &flip(uint Idx) { +#ifndef NDEBUG + indexCheck(Idx, Size); +#endif fBits[Idx / BITWORD_SIZE] ^= 1L << (Idx % BITWORD_SIZE); return *this; } @@ -310,7 +319,7 @@ inline static void indexCheck(uint vIdx, uint vSize) { if (vIdx >= vSize) - GError("Error at GBitVec: index %d out of bounds (size %d)\n", + GError("Error at GBitVec: index %d out of bounds (size %d)\n", (int)vIdx, vSize); } @@ -439,7 +448,7 @@ // Then set any stray high bits of the last used word. uint ExtraBits = Size % BITWORD_SIZE; - + if (ExtraBits) { BitWord ExtraBitMask = ~0UL << ExtraBits; if (value) diff -Nru stringtie-2.1.1+ds/gclib/gff.cpp stringtie-2.1.2+ds/gclib/gff.cpp --- stringtie-2.1.1+ds/gclib/gff.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/gff.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -3201,7 +3201,6 @@ } //-- single-exon qry overlaping multi-exon ref //check full pre-mRNA case (all introns retained): code 'm' - if (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start) return 'm'; @@ -3272,27 +3271,32 @@ //check intron chain overlap (match, containment, intron retention etc.) int i=1; //index of exon to the right of current qry intron int j=1; //index of exon to the right of current ref intron - bool intron_conflict=false; //used for checking for retained introns + bool intron_conflict=false; //overlapping introns have at least a mismatching splice site //from here on we check all qry introns against ref introns bool junct_match=false; //true if at least a junction match is found - bool ichain_match=true; //if there is intron (sub-)chain match, to be updated by any mismatch + bool ichain_match=false; //if there is intron (sub-)chain match, to be updated by any mismatch bool intron_ovl=false; //if any intron overlap is found bool intron_retention=false; //if any ref intron is covered by a qry exon - int imfirst=0; //index of first intron match in query (valid>0) - int jmfirst=0; //index of first intron match in reference (valid>0) - int imlast=0; //index of first intron match in query - int jmlast=0; //index of first intron match in reference + //intron chain (partial) match exon-index boundaries: + int imfirst=0; //index of exon after first intron match in query (valid>0) + int jmfirst=0; //index of exon after first intron match in reference (valid>0) + int imlast=0; //index of exon after last intron match in query + int jmlast=0; //index of exon after last intron match in reference + //--keep track of the last overlapping introns in both qry and ref: + //int q_last_iovl=0; + //int r_last_iovl=0; + //check for intron matches while (i<=imax && j<=jmax) { - uint mstart=m.exons[i-1]->end; + uint mstart=m.exons[i-1]->end; //qry intron start-end uint mend=m.exons[i]->start; - uint rstart=r.exons[j-1]->end; + uint rstart=r.exons[j-1]->end; //ref intron start-end uint rend=r.exons[j]->start; if (rendoverlap(mstart+1, mend-1)) - intron_conflict=true; - if (!intron_retention && rstart>=m.exons[i-1]->start) - intron_retention=true; + intron_conflict=true; //next ref exon overlaps this qry intron + if (!intron_retention && rstart>=m.exons[i-1]->start && rend<=m.exons[i-1]->end) + intron_retention=true; //this ref intron is covered by previous qry exons[i-1] if (intron_ovl) ichain_match=false; j++; continue; @@ -3301,64 +3305,136 @@ //if qry intron overlaps the exon on the left, we have an intron conflict if (!intron_conflict && r.exons[j-1]->overlap(mstart+1, mend-1)) intron_conflict=true; - if (!intron_retention && rend<=m.exons[i]->end) + if (!intron_retention && rstart>=m.exons[i]->start && rend<=m.exons[i]->end) intron_retention=true; if (intron_ovl) ichain_match=false; i++; continue; } //no intron overlap, skipping qry intron intron_ovl=true; + //q_last_iovl=i; //keep track of the last overlapping introns in both qry and ref + //r_last_iovl=j; //overlapping introns, test junction matching bool smatch=(mstart==rstart); bool ematch=(mend==rend); if (smatch || ematch) junct_match=true; if (smatch && ematch) { //perfect match for this intron - if (ichain_match) { //chain matching still possible - if (jmfirst==0) jmfirst=j; - if (imfirst==0) imfirst=i; - imlast=i; - jmlast=j; + if (jmfirst==0) { + ichain_match=true; + jmfirst=j; + imfirst=i; + } + if (ichain_match) { + imlast=i; + jmlast=j; } i++; j++; continue; } - //intron overlapping but with at least a junction mismatch + //intron overlapping but not fully matching intron_conflict=true; ichain_match=false; if (mend>rend) j++; else i++; } //while checking intron overlaps - if (ichain_match) { //intron sub-chain match + /*** additional checking needed for intron retention when there is no ichain_match or overlap ? + if (!intron_retention && r_last_iovlend; //ref intron start-end + uint rend=r.exons[j]->start; + if (rendstart) { + i++; + continue; + } + if (rstart>m.exons[i]->end) + continue; + //overlap between ref intron and m.exons[i] + if (rstart>=m.exons[i]->start && rend<=m.exons[i]->end) { + intron_retention=true; + break; + } + } + } + ***/ + // --- when qry intron chain is contained within ref intron chain + // qry terminal exons may poke (overhang) into ref's other introns + int l_iovh=0; // overhang of q left boundary beyond the end of ref intron on the left + int r_iovh=0; // same type of overhang through the ref intron on the right + int qry_intron_poking=0; + // --- when ref intron chain is contained within qry intron chain, + // terminal exons of ref may poke (overhang) into qry other introns + int l_jovh=0; // overhang of q left boundary beyond the end of ref intron to the left + int r_jovh=0; // same type of overhang through the ref intron on the right + int ref_intron_poking=0; + if (ichain_match) { //intron (sub-)chain compatible so far (but there could still be conflicts) if (imfirst==1 && imlast==imax) { // qry full intron chain match if (jmfirst==1 && jmlast==jmax) {//identical intron chains if (strictMatch) return (r.exons[0]->start==m.exons[0]->start && r.exons.Last()->end && m.exons.Last()->end) ? '=' : '~'; else return '='; } - // -- qry intron chain is shorter than ref intron chain -- - int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left - int r_iovh=0; // same type of overhang through the ref intron on the right - if (jmfirst>1 && r.exons[jmfirst-1]->start>m.start) - l_iovh = r.exons[jmfirst-1]->start - m.start; - if (jmlast r.exons[jmlast]->end) - r_iovh = m.end - r.exons[jmlast]->end; - if (l_iovh<4 && r_iovh<4) return 'c'; - } else if ((jmfirst==1 && jmlast==jmax)) {//ref full intron chain match - //check if the reference i-chain is contained in qry i-chain - int l_jovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left - int r_jovh=0; // same type of overhang through the ref intron on the right - if (imfirst>1 && m.exons[imfirst-1]->start>r.start) - l_jovh = m.exons[imfirst-1]->start - r.start; - if (imlast m.exons[imlast]->end) - r_jovh = r.end - m.exons[imlast]->end; - if (l_jovh<4 && r_jovh<4) return 'k'; //reverse containment + // -- a partial intron chain match + if (jmfirst>1) { + //find if m.start falls within any ref intron before jmfirst + for (int j=jmfirst-1;j>0;--j) + if (m.startstart) { + if (m.start>r.exons[j-1]->end) { //m.start within this ref intron + l_iovh = r.exons[j]->start - m.start; + break; + } + else { intron_retention=true; ichain_match=false; } + } + } + if (jmlast r.exons[j]->end) { + if (m.end < r.exons[j+1]->start) { //m.end within this ref intron + r_iovh = m.end - r.exons[j]->end; + break; + } + else { intron_retention=true; ichain_match=false; } + } + } + if (ichain_match && l_iovh<4 && r_iovh<4) return 'c'; + qry_intron_poking=GMAX(l_iovh, r_iovh); + } else if ((jmfirst==1 && jmlast==jmax)) {//ref intron chain match + //check if the reference j-chain is contained in qry i-chain + //check for ref ends poking into qry introns + if (imfirst>1) { + for (int i=imfirst-1;i>0;--i) + if (m.exons[i]->start>r.start) { + if (r.start>m.exons[i-1]->end) { + l_jovh = m.exons[i]->start - r.start; + break; + } + else { ichain_match = false; } + } + } + if (imlast m.exons[i]->end) { + if (r.end < m.exons[i+1]->start) + { r_jovh = r.end - m.exons[i]->end; break; } + else { ichain_match = false; } + } + } + if (ichain_match && l_jovh<4 && r_jovh<4) return 'k'; //reverse containment + ref_intron_poking=GMAX(l_jovh, r_jovh); } } //'=', 'c' and 'k' were checked and assigned, check for 'm' and 'n' before falling back to 'j' - if (!intron_conflict && (m.start<=r.exons[0]->end && m.end>=r.exons[jmax]->start)) { - return 'm'; + if (intron_retention) { + //ref is boundary contained with qry intron chain ? that's not required for 'm' + //GMessage("r_jovh=%d, r_iovh=%d, l_jovh=%d, l_iovh=%d\n", r_jovh, r_iovh, l_jovh, l_iovh); + //GMessage("m.start=%d, r.exons[0]->end=%d, m.end=%d, r.exons[jmax]->start=%d\n", + // m.start, r.exons[0]->end, m.end, r.exons[jmax]->start); + //if (ref_intron_poking>0 && ) + //we just need to have no intron poking going on + if (!intron_conflict && ref_intron_poking<4 && qry_intron_poking<4) return 'm'; + else return 'n'; } - if (intron_retention) return 'n'; if (junct_match) return 'j'; //we could have 'o' or 'y' here //any real exon overlaps? diff -Nru stringtie-2.1.1+ds/gclib/GHash.hh stringtie-2.1.2+ds/gclib/GHash.hh --- stringtie-2.1.1+ds/gclib/GHash.hh 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GHash.hh 2020-04-21 20:54:10.000000000 +0000 @@ -1,139 +1,129 @@ /******************************************************************************** -* Hash table class template (char* based) * -*********************************************************************************/ + * Hash table class template (char* based) * + *********************************************************************************/ #ifndef GHash_HH #define GHash_HH #include "GBase.h" /** -* This class maintains a fast-access hash table of entities -* indexed by a character string (essentially, maps strings to pointers) -*/ + * This class maintains a fast-access hash table of entities + * indexed by a character string (essentially, maps strings to pointers) + */ //#define HASH_DBG_PRINT 1 #define GSTR_HASH(s) strhash(s) //#define GSTR_HASH(s) djb_hash(s) //#define GSTR_HASH(s) fnv1a_hash(s) +//#define GSTR_HASH(s) murmur3(s) template class GHash { - protected: +protected: struct GHashEntry { - char* key; // Key string - bool keyalloc; //shared key flag (to not free the key chars) - int hash; // Hash value of key - pointer data; // Data - bool mark; // Entry is marked - }; - GHashEntry* hash; // Hash - int fCapacity; // table size - int fCount; // number of valid entries - int fCurrentEntry; - char* lastkeyptr; //pointer to last key string added - //---------- Raw data retrieval (including empty entries - // Return key at position pos. - const char* Key(uint pos) const { return hash[pos].key; } - // return data OBJ* at given position - OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } - // Return mark flag of entry at position pos. - bool Mark(uint pos) const { return hash[pos].mark; } - // Return position of first filled slot, or >= fCapacity - int First() const; - // Return position of last filled slot or -1 - int Last() const; - // Return position of next filled slot in hash table - // or a value greater than or equal to fCapacity if no filled - // slot was found - int Next(int pos) const; - //Return position of previous filled slot in hash table - //or a -1 if no filled slot was found - int Prev(int pos) const; + char* key; // Key string + bool keyalloc; // shared key flag (to free/not the key) + int hash; // Hash value of key + pointer data; // Data + }; + GHashEntry* hash; // Hash + int fCapacity; // table size + int fCount; // number of valid entries + int fCurrentEntry; + char* lastkeyptr; //pointer to last key string added + //---------- Raw data retrieval (including empty entries) + // Return key at position pos. + const char* Key(uint pos) const { return hash[pos].key; } + // return data OBJ* at given position + OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } + // Return position of first filled slot, or >= fCapacity + int First() const; + // Return position of last filled slot or -1 + int Last() const; + // Return position of next filled slot in hash table + // or a value greater than or equal to fCapacity if no filled + // slot was found + int Next(int pos) const; + //Return position of previous filled slot in hash table + //or a -1 if no filled slot was found + int Prev(int pos) const; private: - GHash(const GHash&); - GHash &operator=(const GHash&); - GFreeProc* fFreeProc; //procedure to free item data + GHash(const GHash&); + GHash &operator=(const GHash&); + GFreeProc* fFreeProc; //procedure to free item data protected: public: - static void DefaultFreeProc(pointer item) { - delete (OBJ*)item; - } + static void DefaultFreeProc(pointer item) { + delete (OBJ*)item; + } public: - GHash(GFreeProc* freeProc); // constructs of an empty hash - GHash(bool doFree=true); // constructs of an empty hash (free the item objects) - void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } - void setFreeItem(bool doFree) { fFreeProc=(doFree)? &DefaultFreeProc : NULL; } - int Capacity() const { return fCapacity; } // table's size, including the empty slots. - void Resize(int m); // Resize the table to the given size. - int Count() const { return fCount; }// the total number of entries in the table. - - // Insert a new entry into the table given key and mark. - // If there is already an entry with that key, leave it unchanged - OBJ* Add(const char* ky, OBJ* ptr=NULL, bool mrk=false); - - //same with Add, but frees the old element if it's a replacement - OBJ* fAdd(const char* ky, OBJ* ptr=NULL); - - //same as Add, but the key pointer is stored directly, no string duplicate - //is made (shared-key-Add) - OBJ* shkAdd(const char* ky, OBJ* ptr, bool mrk=false); - - // Replace data at key, if the entry's mark is less than - // or equal to the given mark. If there was no existing entry, - // a new entry is inserted with the given mark. - OBJ* Replace(const char* ky, OBJ* ptr, bool mrk=false); - // Remove a given key and its data - OBJ* Remove(const char* ky); - // Find data OBJ* given key. - OBJ* Find(const char* ky, char** keyptr=NULL); - bool hasKey(const char* ky); - char* getLastKey() { return lastkeyptr; } - OBJ* operator[](const char* ky) { return Find(ky); } - void startIterate(); //iterator-like initialization - char* NextKey(); //returns next valid key in the table (NULL if no more) - OBJ* NextData(); //returns next valid hash[].data - OBJ* NextData(char*& nextkey); //returns next valid hash[].data - //or NULL if no more - //nextkey is SET to the corresponding key - GHashEntry* NextEntry() { //returns a pointer to a GHashEntry - int pos=fCurrentEntry; - while (pos GHash::GHash(GFreeProc* freeProc) { - GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); - fCurrentEntry=-1; - fFreeProc=freeProc; - lastkeyptr=NULL; - for (uint i=0; i GHash::GHash(bool doFree) { - GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); - fCurrentEntry=-1; - lastkeyptr=NULL; - fFreeProc = (doFree)?&DefaultFreeProc : NULL; - for (uint i=0; i void GHash::Resize(int m){ - int i,n,p,x,h; - GHashEntry *k; - GASSERT(fCount<=fCapacity); - if(m>2)>m) n>>=1; // Shrink until n/4 <= m - while((n>>1)>1)); - GASSERT(DEF_HASH_SIZE<=n); - if(n!=fCapacity){ - GASSERT(m<=n); - GMALLOC(k, sizeof(GHashEntry)*n); - for(i=0; i void GHash::Resize(int m) { + int i,n,p,x,h; + GHashEntry *k; + GASSERT(fCount<=fCapacity); + if(m>2)>m) n>>=1; // Shrink until n/4 <= m + while((n>>1)>1)); + GASSERT(DEF_HASH_SIZE<=n); + if(n!=fCapacity){ + GASSERT(m<=n); + GMALLOC(k, sizeof(GHashEntry)*n); + for(i=0; i=0){ + p=HASH1(h,n); + GASSERT(0<=p && p OBJ* GHash::Add(const char* ky, - OBJ* pdata, bool mrk){ - int p,i,x,h,n; - if(!ky) GError("GHash::insert: NULL key argument.\n"); - GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); - GASSERT(fCount OBJ* GHash::Add(const char* ky, OBJ* pdata) { + int p,i,x,h,n; + if(!ky) GError("GHash::insert: NULL key argument.\n"); + GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); + GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); - GASSERT(fCount OBJ* GHash::fAdd(const char* ky, - OBJ* pdata){ - int p,i,x,h,n; - if(!ky) GError("GHash::insert: NULL key argument.\n"); - GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); - GASSERT(fCount OBJ* GHash::shkAdd(const char* ky, - OBJ* pdata,bool mrk){ - int p,i,x,h,n; - if(!ky) GError("GHash::insert: NULL key argument.\n"); - GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); - GASSERT(fCount OBJ* GHash::fAdd(const char* ky, OBJ* pdata) { + int p,i,x,h,n; + if(!ky) GError("GHash::insert: NULL key argument.\n"); + GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); + GASSERT(fCount OBJ* GHash::shkAdd(const char* ky, OBJ* pdata) { + int p,i,x,h,n; + if(!ky) GError("GHash::insert: NULL key argument.\n"); + GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); + GASSERT(fCount OBJ* GHash::Replace(const char* ky, OBJ* pdata, bool mrk){ - int p,i,x,h,n; - if(!ky){ GError("GHash::replace: NULL key argument.\n"); } - GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); - GASSERT(fCount OBJ* GHash::Replace(const char* ky, OBJ* pdata){ + int p,i,x,h,n; + if(!ky){ GError("GHash::replace: NULL key argument.\n"); } + GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); + GASSERT(fCount OBJ* GHash::Remove(const char* ky){ - int p,x,h,n; - if(!ky){ GError("GHash::remove: NULL key argument.\n"); } - OBJ* removed=NULL; - if(0 bool GHash::hasKey(const char* ky) { - int p,x,h,n; - if(!ky){ GError("GHash::find: NULL key argument.\n"); } - if(0 OBJ* GHash::Find(const char* ky, char** keyptr){ - int p,x,h,n; - if(!ky){ GError("GHash::find: NULL key argument.\n"); } - if (fCount==0) return NULL; - h=GSTR_HASH(ky); - GASSERT(0<=h); - p=HASH1(h,fCapacity); - GASSERT(0<=p && p void GHash::startIterate() {// initialize a key iterator; call - fCurrentEntry=0; + fCurrentEntry=0; } template char* GHash::NextKey() { - int pos=fCurrentEntry; - while (pos OBJ* GHash::NextData() { - int pos=fCurrentEntry; - while (pos OBJ* GHash::NextData(char* &nextkey) { - int pos=fCurrentEntry; - while (pos int GHash::First() const { - int pos=0; - while(pos int GHash::Last() const { - int pos=fCapacity-1; - while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } - GASSERT(pos<0 || 0<=hash[pos].hash); - return pos; - } + int pos=fCapacity-1; + while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } + GASSERT(pos<0 || 0<=hash[pos].hash); + return pos; +} // Find next valid entry template int GHash::Next(int pos) const { - GASSERT(0<=pos && pos int GHash::Prev(int pos) const { - GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } - GASSERT(pos<0 || 0<=hash[pos].hash); - return pos; - } + GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } + GASSERT(pos<0 || 0<=hash[pos].hash); + return pos; +} // Remove all template void GHash::Clear(){ - int i; - for(i=0; i=0){ - if (hash[i].keyalloc) GFREE((hash[i].key)); - if (FREEDATA) - (*fFreeProc)(hash[i].data); - } - } - GFREE(hash); - GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); - //reinitialize it - for (i=0; i=0){ - uint len=strlen(hash[i].key); - store << len; - store << hash[i].mark; - store.save(hash[i].key,len); - } - } - } - - -// Load data -void GHash::Load(Stream& store){ - Object::load(store); - store >> fCapacity; - store >> fCount; - for(int i=0; i> hash[i].hash; - if(hash[i].hash>=0){ - uint len; - store >> len; - store >> hash[i].mark; - GMALLOC(hash[i].key,len+1); - store.load(hash[i].key,len); - hash[i].key[len]='\0'; - } - } - } -*/ + int i; + for(i=0; i=0){ + if (hash[i].keyalloc) GFREE((hash[i].key)); + if (FREEDATA) + (*fFreeProc)(hash[i].data); + } + } + GFREE(hash); + GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); + //reinitialize it + for (i=0; i GHash::~GHash(){ - for(int i=0; i=0){ - if (hash[i].keyalloc) GFREE((hash[i].key)); - if (FREEDATA) (*fFreeProc)(hash[i].data); - } - } - GFREE(hash); - } + for(int i=0; i=0){ + if (hash[i].keyalloc) GFREE((hash[i].key)); + if (FREEDATA) (*fFreeProc)(hash[i].data); + } + } + GFREE(hash); +} class GStrSet:public GHash { protected: diff -Nru stringtie-2.1.1+ds/gclib/GIntHash.hh stringtie-2.1.2+ds/gclib/GIntHash.hh --- stringtie-2.1.1+ds/gclib/GIntHash.hh 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GIntHash.hh 2020-04-21 20:54:10.000000000 +0000 @@ -2,15 +2,15 @@ #define _GHASHT_HH #include "GBase.h" //---------------------------------------------- -// Hash table templates based on Jeff Preshing's code +// Int Hash table templates // --------------------------------------------- // Maps 32-bit integers to user data // Uses open addressing with linear probing. // In the m_cells array, key = 0 is reserved to indicate an unused cell. // Actual value for key 0 (if any) is stored in m_zeroCell. // The hash table automatically doubles in size when it becomes 75% full. -// The hash table never shrinks in size, even after Clear(), -// unless you explicitly call Compact(). +// The hash table never shrinks in size +// unless you explicitly call Clear() or Compact(). //---------------------------------------------- inline uint32_t upper_power_of_two(uint32_t v) { v--; diff -Nru stringtie-2.1.1+ds/gclib/GList.hh stringtie-2.1.2+ds/gclib/GList.hh --- stringtie-2.1.1+ds/gclib/GList.hh 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/gclib/GList.hh 2020-04-21 20:54:10.000000000 +0000 @@ -481,7 +481,7 @@ return result; } -//by default, it deletes the item if it has an equal in the list! +//by default, it deletes item if it an equal is found in the list! //returns the existing equal (==) object if it's in the list already //or returns the item itself if it's unique (and adds it) template OBJ* GList::AddIfNew(OBJ* item, @@ -490,21 +490,20 @@ if (Found(item, r)) { if (deleteIfFound && (pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); - } + } if (fidx!=NULL) *fidx=r; return this->fList[r]; //found - } + } //not found: if (SORTED) { //Found() set result to the position where the item should be inserted: sortInsert(r, item); - } - else { + } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; - } + } if (fidx!=NULL) *fidx=r; return item; } diff -Nru stringtie-2.1.1+ds/README.md stringtie-2.1.2+ds/README.md --- stringtie-2.1.1+ds/README.md 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/README.md 2020-04-21 20:54:10.000000000 +0000 @@ -1,7 +1,9 @@ +For StringTie's manual and prepared source and binary packages, please refer to the official website: + ## Obtaining and installing StringTie Source and binary packages for this software, along with a small test data set -can be directly downloaded from the Releases page for this repository. +can be directly downloaded from the [Releases](https://github.com/gpertea/stringtie/releases) page for this repository. StringTie is compatible with a wide range of Linux and Apple OS systems (going as far back as RedHat Enterprise Linux 5.0 and OS X 10.7). The main program (StringTie) does not have any other library dependencies and in order to compile it from source it requires only a C++ compiler which supports the C++ 0x standard (GCC 4.5 or newer). diff -Nru stringtie-2.1.1+ds/rlink.cpp stringtie-2.1.2+ds/rlink.cpp --- stringtie-2.1.1+ds/rlink.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/rlink.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -46,10 +46,10 @@ CJunction* add_junction(int start, int end, GList& junction, char strand) { + int oidx=-1; CJunction *nj=NULL; - //CJunction* nj=junction.AddIfNew(new CJunction(start, end, strand), true, &oidx); CJunction jn(start, end, strand); if (junction.Found(&jn, oidx)) { nj=junction.Get(oidx); @@ -335,7 +335,12 @@ for (int i=0;ilen+=brec.exons[i].len(); if(i) { - CJunction* nj=add_junction(brec.exons[i-1].end, brec.exons[i].start, junction, strand); // unstranded junction might get added even though it might match a stranded one + if(!junction.Count()) { // always add null junction first + CJunction *nullj=new CJunction(0, 0, 0); + junction.Add(nullj); + } + //CJunction* nj=add_junction(brec.exons[i-1].end, brec.exons[i].start, junction, strand); // unstranded junction might get added even though it might match a stranded one + CJunction* nj=junction.AddIfNew(new CJunction(brec.exons[i-1].end, brec.exons[i].start, strand), true); if (alndata.juncs.Count()) nj->guide_match=alndata.juncs[i-1]->guide_match; if (nj) { @@ -359,7 +364,11 @@ */ } - + /*fprintf(stderr,"Add read %s with strand=%d and exons:",brec.name(),strand); + for (int i=0;icurrentend) { @@ -412,11 +421,8 @@ //if (brec.isProperlyPaired()) { //only consider mate pairing data if mates are properly paired int pairstart=brec.mate_start(); if (currentstart<=pairstart) { // if pairstart is in a previous bundle I don't care about it - GStr readname(brec.name()); GStr id(readname); // init id with readname - //id+=':';id+=hi; // the HI tag is not stored in all aligners, like HISAT - if(pairstart<=readstart) { // if I've seen the pair already <- I might not have seen it yet because the pair starts at the same place id+='-';id+=pairstart; id+=".=";id+=hi; // (!) this suffix actually speeds up the hash by improving distribution! @@ -755,11 +761,9 @@ if(group1->colorcolor) { eqcol[group2->color]=group1->color; - // print STDERR "in merge set eqcol[",$$group2[2],"]=",$$eqcol{$$group1[2]},"\n"; } else if(group1->color>group2->color) { eqcol[group1->color]=group2->color; - // print STDERR "in merge set eqcol[",$$group1[2],"]=",$$group2[2],"\n"; group1->color=group2->color; } @@ -1734,73 +1738,86 @@ while(nlestart) nle++; while((nls=lend.Count() || (nls(int)(graphnode->start+longintronanchor)) &&(endcov || lstart[nls].predno(int)(graphnode->start+longintronanchor)) &&(endcov || lstart[nls].predno0) { - tmpcov+=trthr; - - uint prevend=graphnode->end; - graphnode->end=lstart[nls].predno-1; - CGraphnode *prevnode=graphnode; - graphnode=create_graphnode(s,g,lstart[nls].predno,prevend,graphno,bundlenode,bundle2graph,no2gnode); - graphno++; - source->child.Add(graphnode->nodeid); // this node is the child of source - graphnode->parent.Add(source->nodeid); // this node has source as parent - //fprintf(stderr,"trim edge 0-%d with sourceabund=%f at position=%d\n",graphnode->nodeid,sourceabundance,graphnode->start); - prevnode->child.Add(graphnode->nodeid); // this node is the child of previous node + int winstart=startpos-CHI_THR; + if(winstart<0) winstart=0; + int winend=startpos+CHI_THR-1; + if(winend>=bpcov->Count()) winend=bpcov->Count()-1; + tmpcov=(get_cov(1,startpos,winend,bpcov)-get_cov(2-2*s,startpos,winend,bpcov)- + get_cov(1,winstart,startpos-1,bpcov)+get_cov(2-2*s,winstart,startpos-1,bpcov))/(DROP*CHI_THR); + } + if(tmpcov<=0 && lstart[nls].cov<0) tmpcov=ERROR_PERC; // to re-estimate later in process_transfrags + if(tmpcov>0) { + tmpcov+=trthr; + uint prevend=graphnode->end; + graphnode->end=lstart[nls].predno-1; + CGraphnode *prevnode=graphnode; + graphnode=create_graphnode(s,g,lstart[nls].predno,prevend,graphno,bundlenode,bundle2graph,no2gnode); + graphnode->hardstart=true; + graphno++; + source->child.Add(graphnode->nodeid); // this node is the child of source + graphnode->parent.Add(source->nodeid); // this node has source as parent + //fprintf(stderr,"trim edge 0-%d with sourceabund=%f at position=%d\n",graphnode->nodeid,sourceabundance,graphnode->start); + prevnode->child.Add(graphnode->nodeid); // this node is the child of previous node graphnode->parent.Add(prevnode->nodeid); // this node has as parent the previous node - //fprintf(stderr,"trim edge %d-%d\n",prevnode->nodeid,graphnode->nodeid); - float tmp=graphno-1; - futuretr.cAdd(0.0); - futuretr.Add(tmp); - //tmp=lstart[nls].cov+trthr; - futuretr.Add(tmpcov); - //fprintf(stderr,"s=%d trim edge 0-%d with sourceabund=%f at position=%d lstart->cov=%f %f (%f %f) %f (%f %f)\n",s,graphnode->nodeid,tmpcov, - // graphnode->start,lstart[nls].cov,get_cov(2*s,startpos,startpos+CHI_THR-1,bpcov),get_cov(1,startpos,startpos+CHI_THR-1,bpcov),get_cov(2-2*s,startpos,startpos+CHI_THR-1,bpcov), - // get_cov(2*s,startpos-CHI_THR,startpos-1,bpcov),get_cov(1,startpos-CHI_THR,startpos-1,bpcov),get_cov(2-2*s,startpos-CHI_THR,startpos-1,bpcov)); - tmp=prevnode->nodeid;futuretr.Add(tmp); - tmp=graphnode->nodeid;futuretr.Add(tmp); - tmp=trthr;futuretr.Add(tmp); - // COUNT 2 EDGES HERE - edgeno+=2; - //fprintf(stderr,"edgeno=%d\n",edgeno); - startcov=false; - } + //fprintf(stderr,"trim edge %d-%d\n",prevnode->nodeid,graphnode->nodeid); + float tmp=graphno-1; + futuretr.cAdd(0.0); + futuretr.Add(tmp); + //tmp=lstart[nls].cov+trthr; + futuretr.Add(tmpcov); + //fprintf(stderr,"s=%d trim edge 0-%d with sourceabund=%f at position=%d lstart->cov=%f %f (%f %f) %f (%f %f)\n",s,graphnode->nodeid,tmpcov, + // graphnode->start,lstart[nls].cov,get_cov(2*s,startpos,startpos+CHI_THR-1,bpcov),get_cov(1,startpos,startpos+CHI_THR-1,bpcov),get_cov(2-2*s,startpos,startpos+CHI_THR-1,bpcov), + // get_cov(2*s,startpos-CHI_THR,startpos-1,bpcov),get_cov(1,startpos-CHI_THR,startpos-1,bpcov),get_cov(2-2*s,startpos-CHI_THR,startpos-1,bpcov)); + tmp=prevnode->nodeid;futuretr.Add(tmp); + tmp=graphnode->nodeid;futuretr.Add(tmp); + tmp=trthr;futuretr.Add(tmp); + // COUNT 2 EDGES HERE + edgeno+=2; + //fprintf(stderr,"edgeno=%d\n",edgeno); + startcov=false; } nls++; } else if(nls>=lstart.Count() || (nle(int)(graphnode->start+longintronanchor)) &&(!endcov || lend[nle].predno0) { - tmpcov+=trthr; - float tmp=graphno-1; - uint prevend=graphnode->end; - graphnode->end=lend[nle].predno; - CGraphnode *prevnode=graphnode; - graphnode=create_graphnode(s,g,lend[nle].predno+1,prevend,graphno,bundlenode,bundle2graph,no2gnode); - graphno++; - prevnode->child.Add(graphnode->nodeid); // this node is the child of previous node - graphnode->parent.Add(prevnode->nodeid); // this node has as parent the previous node - //fprintf(stderr,"trim edge %d-%d with sinkabundance=%f at position=%d\n",prevnode->nodeid,graphnode->nodeid,sinkabundance,prevnode->end); - sink->parent.Add(prevnode->nodeid); // prevnode is the parent of sink - // remember to create transfrag as well -> I don't know the gno yet, so I can not create it here - futuretr.Add(tmp); - futuretr.cAdd(-1.0); - //tmp=lend[nle].cov+trthr; - futuretr.Add(tmpcov); - tmp=prevnode->nodeid;futuretr.Add(tmp); - tmp=graphnode->nodeid;futuretr.Add(tmp); - tmp=trthr;futuretr.Add(tmp); - // COUNT 2 EDGES HERE - edgeno+=2; - //fprintf(stderr,"edgeno=%d\n",edgeno); - startcov=true; - } + int winstart=endpos-CHI_THR+1; + if(winstart<0) winstart=0; + int winend=endpos+CHI_THR; + if(winend>=bpcov->Count()) winend=bpcov->Count()-1; + tmpcov=(get_cov(1,winstart,endpos,bpcov)-get_cov(2-2*s,winstart,endpos,bpcov)- + get_cov(1,endpos+1,winend,bpcov)+get_cov(2-2*s,endpos+1,winend,bpcov))/(DROP*CHI_THR); + } + if(tmpcov<=0 && lend[nle].cov<0) tmpcov=ERROR_PERC; // to re-estimate later in process_transfrags + if(tmpcov>0) { + tmpcov+=trthr; + float tmp=graphno-1; + uint prevend=graphnode->end; + graphnode->end=lend[nle].predno; + CGraphnode *prevnode=graphnode; + graphnode->hardend=true; + graphnode=create_graphnode(s,g,lend[nle].predno+1,prevend,graphno,bundlenode,bundle2graph,no2gnode); + graphno++; + prevnode->child.Add(graphnode->nodeid); // this node is the child of previous node + graphnode->parent.Add(prevnode->nodeid); // this node has as parent the previous node + //fprintf(stderr,"trim edge %d-%d with sinkabundance=%f at position=%d\n",prevnode->nodeid,graphnode->nodeid,sinkabundance,prevnode->end); + sink->parent.Add(prevnode->nodeid); // prevnode is the parent of sink + // remember to create transfrag as well -> I don't know the gno yet, so I can not create it here + futuretr.Add(tmp); + futuretr.cAdd(-1.0); + //tmp=lend[nle].cov+trthr; + futuretr.Add(tmpcov); + tmp=prevnode->nodeid;futuretr.Add(tmp); + tmp=graphnode->nodeid;futuretr.Add(tmp); + tmp=trthr;futuretr.Add(tmp); + // COUNT 2 EDGES HERE + edgeno+=2; + //fprintf(stderr,"edgeno=%d\n",edgeno); + startcov=true; } nle++; } @@ -2482,6 +2499,7 @@ int &edgeno,int &lastgpos,GArray& guideedge, int refend=0){ GVec* bpcov = bdata->bpcov; // I might want to use a different type of data for bpcov to save memory in the case of very long bundles + GPVec& feature = bdata->ptfs; // these are point features (confirmed starts/stops) CGraphnode* source=new CGraphnode(0,0,0); no2gnode[s][g].Add(source); @@ -2546,6 +2564,8 @@ //int seenjunc=0; + int f=0; // feature index + uint bundle_end=bnode[bundle->lastnodeid]->end; while(bundlenode!=NULL) { @@ -2562,134 +2582,190 @@ nje++; } - // see if I need to adjust the start to ignore little hanging pieces that make no sense - if(!end) { - while(njestrand+1!=2*s) nje++; // skip all junctions that are not on the same strand - if(njeend - currentstart < junctionsupport) { // there is a junction ending soon here - float covleft=get_cov(1,currentstart-refstart,ejunction[nje]->end-1-refstart,bpcov); - float covright=get_cov(1,ejunction[nje]->end-refstart,2*ejunction[nje]->end - currentstart-1-refstart,bpcov); - if(covleftend; - // I have to check ending junctions here again - while(njeend<=currentstart) { // read all junction ends at or before the current start -> assuming there are any (at this point, smaller junction ends should not be relevant to this bundle/currentstart - if(ejunction[nje]->end==currentstart && (ejunction[nje]->strand+1) == 2*s) { // junction ends at current start and is on the same strand and not deleted - end=1; + GVec lstart; + GVec lend; + int fs=-1; // first start feature index in lstart + int fe=-1; // first end feature index in lend + if(longreads) { + if(feature.Count()) { // I already know the features + while(fcoordcoord<=endbundle) { + if(feature[f]->strand==2*s-1) { + if(feature[f]->ftype==GPFT_TSS) { // this is a start + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + if(fs<0) fs=lstart.Count(); + lstart.Add(p); + } + else if(feature[f]->ftype==GPFT_CPAS) { // this is an end + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + if(fe<0) fe=lend.Count(); + lend.Add(p); } - nje++; } + f++; } } - } - - //fprintf(stderr,"create graph 1\n"); - CGraphnode *graphnode=create_graphnode(s,g,currentstart,endbundle,graphno,bundlenode,bundle2graph,no2gnode); // creates a $graphno graphnode with start at bundle start, and end at bundle end - graphno++; + else { // populate starts and ends for the bundlenode here if I have enough space to determine starts/ends in the bundlenode - - GVec lstart; - GVec lend; - if(longreads && endbundle-currentstart>CHI_WIN+CHI_THR) { // populate starts and ends for the bundlenode here if I have enough space to determine starts/ends in the bundlenode - GVec diffval; - float sumstartleft=0; - float sumendleft=0; - float sumstartright=0; - float sumendright=0; - float lastcov=get_cov(2*s,longintronanchor-1,longintronanchor-1,bpcov); - for(int i=currentstart+longintronanchor;i<(int)currentstart+CHI_THR+(int)longintronanchor;i++) { - float icov=get_cov(2*s,i-refstart,i-refstart,bpcov); - float diff=icov-lastcov; - diffval.Add(diff); - if(diff>0) sumstartleft+=diff; - else sumendleft-=diff; - lastcov=icov; - } - for(int i=currentstart+CHI_THR+longintronanchor;i<(int)currentstart+CHI_WIN+(int)longintronanchor;i++) { - float icov=get_cov(2*s,i-refstart,i-refstart,bpcov); - float diff=icov-lastcov; - diffval.Add(diff); - if(diff>0) sumstartright+=diff; - else sumendright-=diff; - lastcov=icov; - } - int covend=endbundle-refstart; - if(bpcov->Count()<(int)endbundle) covend=bpcov->Count(); - covend-=longintronanchor; - //fprintf(stderr,"covend=%d\n",covend); - for(int i=currentstart+CHI_WIN+longintronanchor-refstart;i1/ERROR_PERC) { // starts here - if(sumstartleftlstart[j].cov) { - lstart[j].predno=istart; - lstart[j].cov=diffval[m]; - } - } - } - if(addstart) { - CPred p(istart,diffval[m]); + while(fcoordstrand==2*s-1) { + if(feature[f]->ftype==GPFT_TSS) { // this is a start + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + if(fs<0) fs=lstart.Count(); lstart.Add(p); } + else if(feature[f]->ftype==GPFT_CPAS) { // this is an end + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + if(fe<0) fe=lend.Count(); + lend.Add(p); + } } + f++; } - float icov=get_cov(2*s,i,i,bpcov); - //fprintf(stderr," dfV[%d]=%.1f cov[%d]=%.1f",s,diffval[m],i+refstart,icov); - int p=(m+CHI_THR)%CHI_WIN; - float diff=icov-lastcov; - //fprintf(stderr," [%s](sL=%.1f sR=%.1f sL/sR=",s,sumstartleft,sumstartright); - //if(diffval[m]>1/ERROR_PERC) fprintf(stderr,"%.1f",sumstartleft/sumstartright); - //else fprintf(stderr,"*"); - if(diffval[p]>0) sumstartleft-=diffval[p]; - else sumendleft+=diffval[p]; - if(diffval[m]>0) { - sumstartleft+=diffval[m]; - sumstartright-=diffval[m]; + + GVec diffval; + float sumstartleft=0; + float sumendleft=0; + float sumstartright=0; + float sumendright=0; + float lastcov=get_cov(2*s,longintronanchor-1,longintronanchor-1,bpcov); // this only computes signed coverage (should I include neutral coverage here too?) + for(int i=currentstart+longintronanchor;i<(int)currentstart+CHI_THR+(int)longintronanchor;i++) { + float icov=get_cov(2*s,i-refstart,i-refstart,bpcov); + float diff=icov-lastcov; // compute how many reads start at position i + diffval.Add(diff); + if(diff>0) sumstartleft+=diff; + else sumendleft-=diff; + lastcov=icov; } - else { - sumendleft-=diffval[m]; - sumendright+=diffval[m]; + for(int i=currentstart+CHI_THR+longintronanchor;i<(int)currentstart+CHI_WIN+(int)longintronanchor;i++) { + float icov=get_cov(2*s,i-refstart,i-refstart,bpcov); + float diff=icov-lastcov; + diffval.Add(diff); + if(diff>0) sumstartright+=diff; + else sumendright-=diff; + lastcov=icov; } - diffval[p]=diff; - if(diff>0) sumstartright+=diff; - else sumendright-=diff; - lastcov=icov; - //fprintf(stderr," eL=%.1f eR=%.1f eR/eL=",sumendleft,sumendright); - if(diffval[m]<-1/ERROR_PERC) { // ends here - if(sumendleft*ERROR_PERC>sumendright) { - bool addend=true; - int istart=refstart+i-CHI_THR-1; - if(lend.Count()) { - int j=lend.Count()-1; - if(istart-lend[j].predno=lend[j].cov) { - lend[j].predno=istart; - lend[j].cov=-diffval[m]; + int covend=endbundle-refstart; + if(bpcov->Count()<(int)endbundle) covend=bpcov->Count(); + covend-=longintronanchor; + //fprintf(stderr,"covend=%d\n",covend); + for(int i=currentstart+CHI_WIN+longintronanchor-refstart;icoord==(uint)i+CHI_THR+refstart) { // see if there are any features before the start + if(feature[f]->strand==2*s-1) { + if(feature[f]->ftype==GPFT_TSS) { // this is a start + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + lstart.Add(p); + } + else if(feature[f]->ftype==GPFT_CPAS) { // this is an end + CPred p(feature[f]->coord,-1); // start here -> a -1 indicates this is a given point feature + lend.Add(p); + } + } + f++; + } + + int m=(i-CHI_THR-longintronanchor)%CHI_WIN; + //fprintf(stderr,"m=%d ",i-CHI_THR+refstart); + if(diffval[m]>1/ERROR_PERC) { // starts here + if(sumstartleft0 && diffval[m]>lstart[j].cov) { + lstart[j].predno=istart; + lstart[j].cov=diffval[m]; + } } } + if(addstart) { + CPred p(istart,diffval[m]); + lstart.Add(p); + } } - if(addend) { - CPred p(refstart+i-CHI_THR-1,-diffval[m]); - lend.Add(p); + } + float icov=get_cov(2*s,i,i,bpcov); + //fprintf(stderr," dfV[%d]=%.1f cov[%d]=%.1f",s,diffval[m],i+refstart,icov); + int p=(m+CHI_THR)%CHI_WIN; + float diff=icov-lastcov; + //fprintf(stderr," [%s](sL=%.1f sR=%.1f sL/sR=",s,sumstartleft,sumstartright); + //if(diffval[m]>1/ERROR_PERC) fprintf(stderr,"%.1f",sumstartleft/sumstartright); + //else fprintf(stderr,"*"); + if(diffval[p]>0) sumstartleft-=diffval[p]; + else sumendleft+=diffval[p]; + if(diffval[m]>0) { + sumstartleft+=diffval[m]; + sumstartright-=diffval[m]; + } + else { + sumendleft-=diffval[m]; + sumendright+=diffval[m]; + } + diffval[p]=diff; + if(diff>0) sumstartright+=diff; + else sumendright-=diff; + lastcov=icov; + //fprintf(stderr," eL=%.1f eR=%.1f eR/eL=",sumendleft,sumendright); + if(diffval[m]<-1/ERROR_PERC) { // ends here + if(sumendleft*ERROR_PERC>sumendright) { + bool addend=true; + int istart=refstart+i-CHI_THR-1; + if(lend.Count()) { + int j=lend.Count()-1; + if(abs(istart-lend[j].predno)0 && -diffval[m]>=lend[j].cov) { + lend[j].predno=istart; + lend[j].cov=-diffval[m]; + } + } + } + if(addend) { + CPred p(refstart+i-CHI_THR-1,-diffval[m]); + lend.Add(p); + } } + //fprintf(stderr,"%.1f)",sumendright/sumendleft); } - //fprintf(stderr,"%.1f)",sumendright/sumendleft); + //else fprintf(stderr,"*)"); + //fprintf(stderr,"\n"); } - //else fprintf(stderr,"*)"); - //fprintf(stderr,"\n"); - } - //for(int i=0;istrand+1!=2*s) nje++; // skip all junctions that are not on the same strand + if((njeend - currentstart < junctionsupport) && + (fs<0 || (uint)lstart[fs].predno>=ejunction[nje]->end) && // I do not want to miss any hard starts/ends + (fe<0 || (uint)lend[fe].predno>=ejunction[nje]->end)) { // there is a junction ending soon here + float covleft=get_cov(1,currentstart-refstart,ejunction[nje]->end-1-refstart,bpcov); + float covright=get_cov(1,ejunction[nje]->end-refstart,2*ejunction[nje]->end - currentstart-1-refstart,bpcov); + if(covleftend; + // I have to check ending junctions here again + while(njeend<=currentstart) { // read all junction ends at or before the current start -> assuming there are any (at this point, smaller junction ends should not be relevant to this bundle/currentstart + if(ejunction[nje]->end==currentstart && (ejunction[nje]->strand+1) == 2*s) { // junction ends at current start and is on the same strand and not deleted + end=1; + } + nje++; + } + } + } + } + //fprintf(stderr,"create graph 1\n"); + CGraphnode *graphnode=create_graphnode(s,g,currentstart,endbundle,graphno,bundlenode,bundle2graph,no2gnode); // creates a $graphno graphnode with start at bundle start, and end at bundle end + graphno++; if(end) { // I might have nodes finishing here; but I have a junction finishing here for sure GStr cs((int)currentstart); @@ -2704,7 +2780,7 @@ //fprintf(stderr,"1 Edge %d-%d, edgeno=%d\n",node->nodeid,graphnode->nodeid,edgeno); } } - else { // I haven't seen nodes before that finish here => link to source + else { // I haven't seen nodes before that finish here (maybe due to error correction?) => link to source source->child.Add(graphnode->nodeid); // this node is the child of source graphnode->parent.Add(source->nodeid); // this node has source as parent // COUNT EDGE HERE @@ -2723,7 +2799,7 @@ bool completed=false; - bool dropcov=false; // false(0) means start of bundle or junction end (drop in coverage); true(1) means junction start (raise in coverage) + bool dropcov=false; // false(0) means start of bundle or junction end (raise in coverage); true(1) means junction start (drop in coverage) int nls=0; // index in longstart int nle=0; // index in longend @@ -3395,6 +3471,7 @@ CTreePat *root=new CTreePat(0,gno-1); // if links from source to nodes are desired source==1 and all nodes are treated as +1 // now construct all child CTreePat's + //fprintf(stderr,"There are %d transfrags\n",transfrag.Count()); for(int t=0;tnodes[0]){ // don't include transfrags from source -> not needed CTreePat *tree=root; @@ -3481,8 +3558,10 @@ for(int n=0;nnextpat[gno-1-node[n-1]+node[n]-node[n-1]-1]; + } else tree=tree->nextpat[node[n]-node[n-1]-1]; } else tree=tree->nextpat[node[n]-1]; @@ -3528,17 +3607,20 @@ } */ - bool is_source=(rstart>=no2gnode[node[0]]->start); - bool is_sink=(rend<=no2gnode[node.Last()]->end); - if(longreads) { // see if I need to adjust nodes in pattern if(node.Count()>1) { // correct longreads that extend inside introns or past nodes that link back to source or sink + + bool is_source=(rstart>=no2gnode[node[0]]->start); // read start could be inside of the node (node could be extended due to other reads) + bool is_sink=(rend<=no2gnode[node.Last()]->end); // read end could be inside of the node (node could be extended due to other reads + if(no2gnode[node[0]]->hardstart) is_source=false; // do not do any trimming if this is an ending node + if(no2gnode[node.Last()]->hardend) is_sink=false; // do not do any trimming if this is an ending node + if(is_sink) { // check last node int i=node.Count()-1; while(i>0 && no2gnode[node[i-1]]->end+1==no2gnode[node[i]]->start) { uint dist=rend-no2gnode[node[i-1]]->end; if(distcov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i-1]]->len()) { // there is a significant drop + if(no2gnode[node[i-1]]->hardend || no2gnode[node[i-1]]->cov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i-1]]->len()) { // there is a significant drop int j=no2gnode[node[i-1]]->child.Count()-1; bool trim=false; while(j>=0) { @@ -3561,7 +3643,7 @@ } } else if(distcov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i-1]]->len()) { // there is a significant drop + if(no2gnode[node[i-1]]->hardend || no2gnode[node[i-1]]->cov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i-1]]->len()) { // there is a significant drop int j=no2gnode[node[i-1]]->child.Count()-1; bool trim=false; while(j>=0) { @@ -3591,9 +3673,9 @@ if(is_source) { // check first node int i=0; while(i+1end+1==no2gnode[node[i+1]]->start) { - uint dist=no2gnode[node[i+1]]->start-rend; + uint dist=no2gnode[node[i+1]]->start-rstart; if(distcov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i+1]]->len()) { // there is a significant drop + if(no2gnode[node[i+1]]->hardstart || no2gnode[node[i+1]]->cov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i+1]]->len()) { // there is a significant drop int j=0; bool trim=false; while(jparent.Count()) { @@ -3616,7 +3698,7 @@ } } else if(distcov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i+1]]->len()) { // there is a significant drop + if(no2gnode[node[i+1]]->hardstart || no2gnode[node[i+1]]->cov*DROP*no2gnode[node[i]]->len()>no2gnode[node[i]]->cov*no2gnode[node[i+1]]->len()) { // there is a significant drop int j=0; bool trim=false; while(jparent.Count()) { @@ -3642,13 +3724,12 @@ i++; } } - } } else if(!mergeMode && node.Count()==1) return(NULL); // do the one node transfrag make any difference? CHECK IF YOU NEED TO KEEP THIS ONE - CTransfrag *t=findtrf_in_treepat(gno,gpos,node,pattern,tr2no[s][g]); - if(!t) { // t is NULL + CTransfrag *t=findtrf_in_treepat(gno,gpos,node,pattern,tr2no[s][g]); + if(!t) { // t is NULL t=new CTransfrag(node,pattern,0); /* @@ -3678,16 +3759,16 @@ tree=child; } tree->tr=t; - } + } - if(is_sr) t->srabund+=abundance; - else //fprintf(stderr,"Update transfrag with abundance=%f in graph[%d][%d]\n",abundance,s,g); + if(is_sr) t->srabund+=abundance; + else //fprintf(stderr,"Update transfrag with abundance=%f in graph[%d][%d]\n",abundance,s,g); t->abundance+=abundance; - if(no2gnode[node[0]]->start<=rstart) if(!t->longstart || rstartlongstart) t->longstart=rstart; - if(no2gnode[node.Last()]->end>=rend) if(!t->longend|| rend>t->longend) t->longend=rend; + if(no2gnode[node[0]]->start<=rstart) if(!t->longstart || rstartlongstart) t->longstart=rstart; // value of longstart inside node; the other extensions could come from spliced nodes + if(no2gnode[node.Last()]->end>=rend) if(!t->longend|| rend>t->longend) t->longend=rend; // value of longend inside node; the other extensions could come from spliced nodes - return(t); + return(t); } void get_fragment_pattern(GList& readlist,int n, int np,float readcov,GVec *readgroup,GVec& merge, GVec *group2bundle, @@ -3695,7 +3776,7 @@ GPVec **transfrag,CTreePat ***tr2no,GPVec &group) { //fprintf(stderr,"get fragment for read[%d]:%d-%d-%d-%d-%f with pair[%d]\n",n,readlist[n]->start,readlist[n]->end,int(readlist[n]->strand),readlist[n]->nh,readlist[n]->read_count,np); - uint rstart=readlist[n]->start; + uint rstart=readlist[n]->start; // this only works for unpaired long reads -> I will have to take into account the pair if I want to do it for all reads uint rend=readlist[n]->end; if(np>-1 && readlist[np]->end>rend) rend=readlist[np]->end; @@ -4414,18 +4495,35 @@ } int compatible_long(int* t,int *len,GPVec& transfrag,GPVec& no2gnode,int gno,GIntHash &gpos) { -// return 0 if no compatibility, then 4*(rets)+rete where rets/ret3 = 1 if t[0] has extra intron, 2 if t[1] has extra intron, 3 if compatible starts/ends +// return 0 if no compatibility, then 4*(rets)+rete where rets/rete = 1 if t[0] has extra intron, 2 if t[1] has extra intron, 3 if compatible starts/ends // len[0] is how much t[0] extends past t[1] if positive, otherwise how much t[1] extends past t[0] at the start of transcripts // len[1] positive: if t[0] has extra intron at start how much is t[1] inside of the intron -> default is 0; negative: same for t[1] // len[2] is how much t[0] extends past t[1] if positive, otherwise how much t[1] extends past t[0] at the end of transcripts // len[3] positive: if t[0] has extra intron at end how much is t[1] inside of the intron -> default is 0; negative: same for t[1] + + if(transfrag[t[0]]->nodes[0]!=transfrag[t[1]]->nodes[0] && + no2gnode[transfrag[t[0]]->nodes[0]]->hardstart && no2gnode[transfrag[t[1]]->nodes[0]]->hardstart) + return 0; // both starts can't be hard + + if(transfrag[t[0]]->nodes.Last()!=transfrag[t[1]]->nodes.Last() && + no2gnode[transfrag[t[0]]->nodes.Last()]->hardend && no2gnode[transfrag[t[1]]->nodes.Last()]->hardend) + return 0; // both ends can't be hard + + uint tstart[2]={no2gnode[transfrag[t[0]]->nodes[0]]->start,no2gnode[transfrag[t[1]]->nodes[0]]->start}; + if(transfrag[t[0]]->longstart) tstart[0]=transfrag[t[0]]->longstart; + if(transfrag[t[1]]->longstart) tstart[1]=transfrag[t[1]]->longstart; + uint tend[2]={no2gnode[transfrag[t[0]]->nodes.Last()]->end,no2gnode[transfrag[t[1]]->nodes.Last()]->end}; + if(transfrag[t[0]]->longend) tend[0]=transfrag[t[0]]->longend; + if(transfrag[t[1]]->longend) tend[1]=transfrag[t[1]]->longend; + int f=0; // first starting transcript int s=1; // second starting transcript - if(transfrag[t[1]]->nodes[0]nodes[0]) { // transfrag t1 starts before trf t0 + if(tstart[1]nodes.Last()nodes[0]) // no overlapping transcripts return 0; @@ -4440,24 +4538,29 @@ if(no2gnode[transfrag[t[f]]->nodes[i[f]]]->end+1nodes[i[f]+1]]->start) { // found intron in f rets=1+f; //fprintf(stderr,"..found intron in t[f]=%d rets=%d\n",t[f],rets); + if(no2gnode[transfrag[t[s]]->nodes[0]]->hardstart) return 0; // there is no compatibility here } - len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->len(); // this only approximates the length + // len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->len(); // this only approximates the length // adj if I want real distances: - //if(i[f]) len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->len(); - //else len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->end-transfrag[t[f]]->longstart+1; - + if(i[f]) len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->len(); + else len[0]+=no2gnode[transfrag[t[f]]->nodes[i[f]]]->end-tstart[f]+1; i[f]++; } + if(transfrag[t[s]]->nodes.Last()nodes[i[f]]) return 0; // no overlap: t[s] is contained in t[f] - if(f) len[0]=-len[0]; // a negative value signals that t[1] extends past t[0] + + // adj for real distance + if(!i[f]) { // transfrag[t[f]]->nodes[0]==transfrag[t[s]]->nodes[0] + len[0]=tstart[s]-tstart[f]; + } if(transfrag[t[s]]->nodes[0]nodes[i[f]]) { // i[f]>0 because f has to start first if(!no2gnode[transfrag[t[s]]->nodes[0]]->childpat[transfrag[t[f]]->nodes[i[f]]]) return 0; // I can not reach node i[f] from t[s][0] // to approximate: - //len[1]=no2gnode[transfrag[t[f]]->nodes[i[f]]]->start-no2gnode[transfrag[t[s]]->nodes[0]]->start; - // adj if I want real distances: - len[1]=no2gnode[transfrag[t[f]]->nodes[i[f]]]->start-transfrag[t[s]]->longstart; + // len[1]=no2gnode[transfrag[t[f]]->nodes[i[f]]]->start-no2gnode[transfrag[t[s]]->nodes[0]]->start; + // adj if I want real distances: + len[1]=no2gnode[transfrag[t[f]]->nodes[i[f]]]->start-tstart[s]; while(transfrag[t[s]]->nodes[i[s]]nodes[i[f]]) { i[s]++; if(no2gnode[transfrag[t[s]]->nodes[i[s]-1]]->end+1nodes[i[s]]]->start) { // not continuous @@ -4465,12 +4568,17 @@ } } } + else if(i[f]){ // no intron -> adj lentgh for real distance + len[0]+=tstart[s]-no2gnode[transfrag[t[s]]->nodes[0]]->start; + } + + if(f) len[0]=-len[0]; // a negative value signals that t[1] extends past t[0] // check END int rete=3; f=0; s=1; - if(transfrag[t[0]]->nodes.Last()nodes.Last()) { // f should be the longer transcript at the end + if(tend[1]>tend[0]) { // f should be the longer transcript at the end f=1; s=0; } @@ -4487,23 +4595,25 @@ rete=1+f; //fprintf(stderr,"..found intron in t[f]=%d rete=%d\n",t[f],rete); if(rets<3 && rete!=rets) return 0; + if(no2gnode[transfrag[t[s]]->nodes.Last()]->hardend) return 0; // there is no compatibility here } // to approximate: - len[2]+=no2gnode[transfrag[t[f]]->nodes[j[f]]]->len(); + //len[2]+=no2gnode[transfrag[t[f]]->nodes[j[f]]]->len(); // adj: - //if(j[f]nodes.Count()-1) len[2]+=no2gnode[transfrag[t[f]]->nodes[j[f]]]->len(); - //else len[2]+=transfrag[t[f]]->longend-no2gnode[transfrag[t[f]]->nodes[j[f]]]->start+1; + if(j[f]nodes.Count()-1) len[2]+=no2gnode[transfrag[t[f]]->nodes[j[f]]]->len(); + else len[2]+=tend[f]-no2gnode[transfrag[t[f]]->nodes[j[f]]]->start+1; j[f]--; } if(transfrag[t[s]]->nodes[0]>transfrag[t[f]]->nodes[j[f]]) return 0; // no overlap (this shouldn't be the case since there is one node in common) - if(f) len[2]=-len[2]; // a negative value signals that t[1] extends past t[0] + + if(j[f]==transfrag[t[f]]->nodes.Count()-1) len[2]+=tend[f]-tend[s]; if(transfrag[t[s]]->nodes[j[s]]>transfrag[t[f]]->nodes[j[f]]) { if(!no2gnode[transfrag[t[f]]->nodes[j[f]]]->childpat[transfrag[t[s]]->nodes[j[s]]]) return 0; // I can not reach node j[s] from j[f] // len[3]=no2gnode[transfrag[t[s]]->nodes.Last()]->end-no2gnode[transfrag[t[f]]->nodes[j[f]]]->end; // to approximate dist // adj if I want real distances: - len[3]=transfrag[t[s]]->longend-no2gnode[transfrag[t[f]]->nodes[j[f]]]->end; + len[3]=tend[s]-no2gnode[transfrag[t[f]]->nodes[j[f]]]->end; while(transfrag[t[s]]->nodes[j[s]]>transfrag[t[f]]->nodes[j[f]]) { j[s]--; if(no2gnode[transfrag[t[s]]->nodes[j[s]]]->end+1nodes[j[s]+1]]->start) { // not continuous @@ -4511,9 +4621,14 @@ } } } + else if(j[f]nodes.Count()-1) { + len[2]+=no2gnode[transfrag[t[s]]->nodes[0]]->end-tend[s]; + } if(i[f]>j[f] || i[s]>j[s]) return 0; // there should be i<=j + if(f) len[2]=-len[2]; // a negative value signals that t[1] extends past t[0] + // now if and is point at the same node in transcripts, and so do jf and js while(i[f]<=j[f] && i[s]<=j[s]) { if(transfrag[t[f]]->nodes[i[f]]==transfrag[t[s]]->nodes[i[s]]) { // skip equal nodes @@ -4555,7 +4670,7 @@ } void process_transfrags(int s, int gno,int edgeno,GPVec& no2gnode,GPVec& transfrag,CTreePat *tr2no, - GIntHash &gpos,GVec& guidetrf,GList& pred) { + GIntHash &gpos,GVec& guidetrf,GList& pred,GVec& trflong) { /* { // DEBUG ONLY @@ -4576,9 +4691,9 @@ /* { // DEBUG ONLY //printTime(stderr); - fprintf(stderr,"There are %d transfrags after clean up:\n",transfrag.Count()); + fprintf(stderr,"\nThere are %d transfrags after clean up:\n",transfrag.Count()); for(int i=0;iabundance); for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",transfrag[i]->nodes[j]); fprintf(stderr,"\n"); } @@ -4586,37 +4701,72 @@ */ // add all guide patterns to the set of transfrags so that I can have a "backbone" for each guide - // I need this because there might be an uncompatible transfrag connecting the nodes in the guide + // I need this because there might be an incompatible transfrag connecting the nodes in the guide for(int i=0;iguide){ - //CTransfrag *t=findtrf_in_treepat(gno,gpos,guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,tr2no); // I need to adjust first/last node - //if(!t) { // t is NULL - CTransfrag *t=new CTransfrag(guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,trthr*ERROR_PERC); + CTransfrag *t=NULL; + bool add=true; + if(longreads) { + /*guidetrf[i].trf->pattern[0]=0; + guidetrf[i].trf->pattern[gno-1]=0; + int *pos=gpos[edge(0,guidetrf[i].trf->nodes[1],gno)]; + if(pos) guidetrf[i].trf->pattern[*pos]=0; + pos=gpos[edge(guidetrf[i].trf->nodes[guidetrf[i].trf->nodes.Count()-2],guidetrf[i].trf->nodes.Last(),gno)]; + if(pos) guidetrf[i].trf->pattern[*pos]=0; + guidetrf[i].trf->nodes.Pop(); + guidetrf[i].trf->nodes.Shift();*/ + + t=findtrf_in_treepat(gno,gpos,guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,tr2no); // I need to adjust first/last node + if(!t) { // t is NULL + t=new CTransfrag(guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,0); + } + else add=false; + } + else { + t=new CTransfrag(guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,trthr*ERROR_PERC); + } /* + float abund=0; + if(!longreads) { + abund=trthr*ERROR_PERC; + } + CTransfrag *t=new CTransfrag(guidetrf[i].trf->nodes,guidetrf[i].trf->pattern,abund); + */ + /* { // DEBUG ONLY fprintf(stderr,"Add guidetrf with nodes:"); for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",guidetrf[i].trf->nodes[j]); //fprintf(stderr," and pattern: "); //printBitVec(guidetrf[i].trf->pattern); fprintf(stderr,"\n"); + fprintf(stderr,"Found transfrag %p with nodes:",t); + for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",t->nodes[j]); + //fprintf(stderr," and pattern: "); + //printBitVec(guidetrf[i].trf->pattern); + fprintf(stderr,"\n"); + } */ // I might not need to do this for the normal max_flow, but for the push I am counting on having only one transcript linking back to source from the first node - t->pattern[0]=0; - t->pattern[gno-1]=0; - int *pos=gpos[edge(0,t->nodes[1],gno)]; - if(pos) t->pattern[*pos]=0; - pos=gpos[edge(t->nodes[t->nodes.Count()-2],t->nodes.Last(),gno)]; - if(pos) t->pattern[*pos]=0; - t->nodes.Pop(); - t->nodes.Shift(); + if(!longreads) { + t->pattern[0]=0; + t->pattern[gno-1]=0; + int *pos=gpos[edge(0,t->nodes[1],gno)]; + if(pos) t->pattern[*pos]=0; + pos=gpos[edge(t->nodes[t->nodes.Count()-2],t->nodes.Last(),gno)]; + if(pos) t->pattern[*pos]=0; + t->nodes.Pop(); + t->nodes.Shift(); + } t->guide=true; t->longstart=no2gnode[t->nodes[0]]->start; t->longend=no2gnode[t->nodes.Last()]->end; - transfrag.Add(t); - + if(longreads) t->usepath=guidetrf[i].g; // guide index + no2gnode[t->nodes[0]]->hardstart=1; // I can always trust a guide's start + no2gnode[t->nodes.Last()]->hardend=1; // I can always trust a guide's end + if(add) transfrag.Add(t); } GBitVec allpat(gno+edgeno); @@ -4625,19 +4775,32 @@ if(longreads) { // add source/sink links but only if they need to be added to explain the traversals in the graph transfrag.Sort(longtrCmp); // most abundant transfrag in the graph come first, then the ones with most nodes, then the ones more complete - //trsort=false; + trsort=false; int source=0; int sink=gno-1; GVec hassource(gno,-1); // remembers transcript number that links given node to source GVec hassink(gno,-1); // remembers transcript number that links given node to sink - GBitVec keepsource(gno); - GBitVec keepsink(gno); + GBitVec keepsource(gno); // if not set then I can remove link from node to source; keeps source link if it exists otherwise otherwise + GBitVec keepsink(gno); // if not set then I can remove link from node to sink; keeps sink link if it exists otherwise otherwise GVec keeptrf; // keeps all potential transfrags that will be kept from most abundant to least, unassembled float zero=0; GVec addsource(gno,zero); GVec addsink(gno,zero); - int edgedist=CHI_WIN+CHI_THR; + int edgedist=CHI_WIN; // I need to be consistent (if I change here then I need to change in update_abundance too) int ssdist=longintronanchor; + + /* + { // DEBUG ONLY + //printTime(stderr); + fprintf(stderr,"\nThere are %d transfrags after clean up:\n",transfrag.Count()); + for(int i=0;iabundance); + for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",transfrag[i]->nodes[j]); + fprintf(stderr,"\n"); + } + } + */ + for(int t1=0;t1abundance); for(int j=0;jnodes.Count();j++) { @@ -4659,50 +4822,73 @@ //fprintf(stderr,"Node %d in t=%d with cov=%f has sink\n",transfrag[t1]->nodes[0],t1,transfrag[t1]->abundance); } else { - if(transfrag[t1]->longstart) keepsource[transfrag[t1]->nodes[0]]=1; //fprintf(stderr,"keep source %d\n",transfrag[t1]->nodes[0]);} - if(transfrag[t1]->longend) keepsink[transfrag[t1]->nodes.Last()]=1;//fprintf(stderr,"keep sink %d\n",transfrag[t1]->nodes.Last());} + if(eonly && !transfrag[t1]->guide) continue; // do not remember transfrags that are not guides + if(!keepsource[transfrag[t1]->nodes[0]]) { + if(transfrag[t1]->longstart) keepsource[transfrag[t1]->nodes[0]]=1; //fprintf(stderr,"keep source %d\n",transfrag[t1]->nodes[0]);} + else if(no2gnode[transfrag[t1]->nodes[0]]->hardstart) keepsource[transfrag[t1]->nodes[0]]=1; + } + if(!keepsink[transfrag[t1]->nodes.Last()]) { + if(transfrag[t1]->longend) keepsink[transfrag[t1]->nodes.Last()]=1;//fprintf(stderr,"keep sink %d\n",transfrag[t1]->nodes.Last());} + else if(no2gnode[transfrag[t1]->nodes.Last()]) keepsink[transfrag[t1]->nodes.Last()]=1; + } bool included=false; // a transfrag that starts at source and ends at sink can never be included in a kept transfrag, so I am safe to do next for(int t2=0; t2 keep unless I can eliminate a previous one - if(!transfrag[t[1]]->guide && transfrag[t1]->longstart && transfrag[t1]->longend) { // t[1] might be included in t[0] so I might eliminate if it doesn't pass threshold + if(!transfrag[t[1]]->guide && transfrag[t1]->longstart && transfrag[t1]->longend && // t[1] might be included in t[0] so I might eliminate if it doesn't pass threshold + (!no2gnode[transfrag[t[1]]->nodes[0]]->hardstart || transfrag[t[0]]->nodes[0] == transfrag[t[1]]->nodes[0] ) && + (!no2gnode[transfrag[t[1]]->nodes.Last()]->hardend || transfrag[t[0]]->nodes.Last() == transfrag[t[1]]->nodes.Last())) { //if(transfrag[t[0]]->abundance>(1-ERROR_PERC/DROP)*transfrag[t[1]]->abundance) { // t[0] is within limits of t[1] - //if(transfrag[t[0]]->abundance>DROP*transfrag[t[1]]->abundance) { // t[0] is within limits of t[1] - if(len[1]abundance; - keeptrf[t2].group.Add(t1); - included=true; // I do not want to store transcript - //fprintf(stderr,"trf %d includes %d\n",t[0],t[1]); - } - //} + if(transfrag[t[0]]->abundance>DROP*transfrag[t[1]]->abundance) { // t[0] is within limits of t[1] + if(len[1]abundance; + keeptrf[t2].group.Add(t1); + included=true; // I do not want to store transcript + //fprintf(stderr,"trf %d includes %d\n",t[0],t[1]); + } + } } break; case 2: // t[1] includes t[0]: extends with introns past ends of t[0] (t[1] possibly includes t[0]); t[1] is more abundant than t0 //if(transfrag[t[1]]->guide || transfrag[t[1]]->abundance>(1-ERROR_PERC/DROP)*transfrag[t[0]]->abundance) { - if(!transfrag[t[0]]->guide) { + if(!transfrag[t[0]]->guide && + (!no2gnode[transfrag[t[0]]->nodes[0]]->hardstart || transfrag[t[0]]->nodes[0] == transfrag[t[1]]->nodes[0]) && + (!no2gnode[transfrag[t[0]]->nodes.Last()]->hardend || transfrag[t[0]]->nodes.Last() == transfrag[t[1]]->nodes.Last())) { if(len[1]abundance; keeptrf[t2].group.Add(t1); included=true; - //fprintf(stderr,"trf %d is intronic including %d\n",t[1],t[0]); + //fprintf(stderr,"trf %d is intronic including %d\n",t[1],t[0]); } } //} break; case 3: // t1 and t0 are compatible --> just look for the edges; t1 goes further apart - if(len[0]guide) keeptrf[t2].t=t1; // t[0] to replace t[1] + if(transfrag[t[0]]->nodes[0]!=transfrag[t[1]]->nodes[0] && transfrag[t[0]]->nodes.Last()!=transfrag[t[1]]->nodes.Last() && + ((no2gnode[transfrag[t[0]]->nodes[0]]->hardstart && !no2gnode[transfrag[t[1]]->nodes[0]]->hardstart && + !no2gnode[transfrag[t[0]]->nodes.Last()]->hardend && no2gnode[transfrag[t[1]]->nodes.Last()]->hardend) || + (!no2gnode[transfrag[t[0]]->nodes[0]]->hardstart && no2gnode[transfrag[t[1]]->nodes[0]]->hardstart && + no2gnode[transfrag[t[0]]->nodes.Last()]->hardend && !no2gnode[transfrag[t[1]]->nodes.Last()]->hardend))) { + // these two transcripts both have one good start and one different --> keep them both (different option would be to add them to another transfrag that is compatible and has both hardends but then it's more complicated + break; + } + + // I keep both if both are guides + //if((!transfrag[t[0]]->guide || !transfrag[t[1]]->guide) && abs(len[0])guide || !transfrag[t[1]]->guide) && len[0]guide || (!transfrag[t[1]]->guide && no2gnode[transfrag[t[0]]->nodes[0]]->hardstart && no2gnode[transfrag[t[0]]->nodes.Last()]->hardend)) + keeptrf[t2].t=t1; // t[0] to replace t[1] keeptrf[t2].cov+=transfrag[t1]->abundance; keeptrf[t2].group.Add(t1); included=true; - //fprintf(stderr,"trf %d %d equivalent start/ends\n",t[1],t[0]); + //fprintf(stderr,"trf %d %d equivalent start/ends\n",t[1],t[0]); } break; } @@ -4711,14 +4897,16 @@ } if(!included){ - if(transfrag[t1]->guide || (transfrag[t1]->longstart && transfrag[t1]->longend)) { // if this is not included and has correct start/end + if(transfrag[t1]->guide || ((transfrag[t1]->longstart || no2gnode[transfrag[t1]->nodes[0]]->hardstart)&& + (transfrag[t1]->longend || no2gnode[transfrag[t1]->nodes.Last()]->hardend))) { // if this is not included and has correct start/end CLongTrf kt(t1,transfrag[t1]->abundance); keeptrf.Add(kt); keeptrf.Last().group.Add(t1); + //fprintf(stderr,"keep transfrag %d\n",t1); } else { // incomplete transcript, possibly wrong transfrag[t1]->weak=1; - //fprintf(stderr,"Incomplete transcript\n"); + //fprintf(stderr,"Incomplete transcript %d\n",t1); } } } @@ -4728,22 +4916,25 @@ if(s) { sign='+';} //GBitVec guidesource(gno); //GBitVec guidesink(gno); - for(int i=0;i=0;i--) { // I add the kept transcripts to trflong from least significant to most in order to make deletion easier + //fprintf(stderr,"Build source/sink for transfrag %d\n",keeptrf[i].t); int n1=transfrag[keeptrf[i].t]->nodes[0]; int n2=transfrag[keeptrf[i].t]->nodes.Last(); //if(hassource[n1]<0 || hassink[n2]<0) fprintf(stderr,"Build source/sink for transfrag %d\n",keeptrf[i].t); - /*if(transfrag[keeptrf[i].t]->guide) { - addsource[n1]=keeptrf[i].cov; // give a boost to a guide - addsink[n2]=keeptrf[i].cov; // give a boost to a guide - guidesource[n1]=1; - guidesource[n2]=1; - } - else {*/ - addsource[n1]=transfrag[keeptrf[i].t]->abundance; //all_eq else makes no difference - addsink[n2]=transfrag[keeptrf[i].t]->abundance; //all_eq else makes no difference - //} + if(!rawreads && !transfrag[keeptrf[i].t]->guide && ((!no2gnode[n1]->hardstart && (hassource[n1]<0 || !keepsource[n1])) || + (!no2gnode[n2]->hardend && (hassink[n2]<0 || !keepsink[n2])))) { + //if(!rawreads && (no2gnode[n1]->hardstart || (hassource[n1]>=0 && keepsource[n1])) && (no2gnode[n2]->hardend ||(hassink[n2]>=0 && keepsink[n2]))) { + trflong.Add(keeptrf[i].t); + } + + /******* previous implementation here + + addsource[n1]=keeptrf[i].cov; + addsink[n2]=keeptrf[i].cov; + *******/ /* if(!addsource[n1] && hassource[n1]<0) { @@ -4764,7 +4955,7 @@ } }*/ - /* // all + // all for(int j=0;jnodes[0]) { addsource[n1]+=transfrag[keeptrf[i].group[j]]->abundance; @@ -4773,9 +4964,9 @@ if(n2==transfrag[keeptrf[i].group[j]]->nodes.Last()) { addsink[n2]+=transfrag[keeptrf[i].group[j]]->abundance; //fprintf(stderr,"Add sink t[%d]->cov=%f to node %d = %f\n",keeptrf[i].group[j],transfrag[keeptrf[i].group[j]]->abundance,n2,addsink[n2]); + } } - } - }*/ + if(rawreads) { @@ -4803,37 +4994,50 @@ pred.Add(p); } } + for(int i=keeptrf.Count()-1;i>=0;i--) { + int n1=transfrag[keeptrf[i].t]->nodes[0]; + int n2=transfrag[keeptrf[i].t]->nodes.Last(); + + if(transfrag[keeptrf[i].t]->guide || ((no2gnode[n1]->hardstart || (hassource[n1]>=0 && keepsource[n1])) && (no2gnode[n2]->hardend ||(hassink[n2]>=0 && keepsink[n2])))) + trflong.Add(keeptrf[i].t); + + } /* { // DEBUG ONLY fprintf(stderr,"%d keeptrf:\n",keeptrf.Count()); for(int i=0;iabundance); + fprintf(stderr,"(%d) %d abund=%f keepcov=%f",i,keeptrf[i].t,transfrag[keeptrf[i].t]->abundance,keeptrf[i].cov); for(int j=0;jnodes.Count();j++) { fprintf(stderr," %d",transfrag[keeptrf[i].t]->nodes[j]); } fprintf(stderr,"\n"); - int j=0; - while(jnodes.Count()) { - fprintf(stderr,"Ad5\tStringTie\texon\t%d",no2gnode[transfrag[keeptrf[i].t]->nodes[j]]->start); - while(j+1nodes.Count() && no2gnode[transfrag[keeptrf[i].t]->nodes[j]]->end+1==no2gnode[transfrag[keeptrf[i].t]->nodes[j+1]]->start) - j++; - fprintf(stderr,"\t%d\t1000\t-\t.\ttranscript_id \"STRG.%d.%.1f\n",no2gnode[transfrag[keeptrf[i].t]->nodes[j]]->end,keeptrf[i].t,transfrag[keeptrf[i].t]->abundance); - j++; - } + } + fprintf(stderr,"%d trflong:",trflong.Count()); + for(int i=0;ihardstart) { + /*float abund=trthr; + if(no2gnode[i]->hardstart) { // node i doesn't have source as parent but it should + abund=addsource[i]; + }*/ + // else I am not that confident this is a start no2gnode[i]->parent.Insert(0,zero); no2gnode[0]->child.Add(i); GVec nodes; nodes.Add(source); nodes.Add(i); + //CTransfrag *t=new CTransfrag(nodes,allpat,abund); CTransfrag *t=new CTransfrag(nodes,allpat,addsource[i]); t->pattern[source]=1; t->pattern[i]=1; @@ -4843,19 +5047,25 @@ } else { if(!keepsource[i]) { // it was a mistake to link this to source --> remove - //fprintf(stderr,"delete abundance of trf:"); - //for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",transfrag[hassource[i]]->nodes[j]);fprintf(stderr,"\n"); + /*fprintf(stderr,"delete abundance of trf:"); + for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",transfrag[hassource[i]]->nodes[j]);fprintf(stderr,"\n");*/ transfrag[hassource[i]]->abundance=0; } - /*else { // this one has source and it should be kept => update trf abundance to a more realistic value - //fprintf(stderr,"source %d:%d abd=%f addsource=%f\n",i,no2gnode[i]->start,transfrag[hassource[i]]->abundance,addsource[i]); - if(guidesource[i] && transfrag[hassource[i]]->abundanceabundance=addsource[i]; - }*/ + else { // this one has source and it should be kept => update trf abundance to a more realistic value + //fprintf(stderr,"source %d:%d abund=%f addsource=%f\n",i,no2gnode[i]->start,transfrag[hassource[i]]->abundance,addsource[i]); + //if(guidesource[i] && transfrag[hassource[i]]->abundanceabundance=addsource[i]; + } } if(hassink[i]<0) { - if(addsink[i]) { + if(addsink[i] && no2gnode[i]->hardend) { + /*float abund=trthr; + if(no2gnode[i]->hardend) { + abund=addsink[i]; + }*/ + // else not very confident about this end + no2gnode[i]->child.Add(sink); no2gnode[sink]->parent.Add(i); GVec nodes; @@ -4870,13 +5080,15 @@ } else { if(!keepsink[i]) { // it was a mistake to link this to sink --> remove + /*fprintf(stderr,"delete abundance of trf:"); + for(int j=0;jnodes.Count();j++) fprintf(stderr," %d",transfrag[hassink[i]]->nodes[j]);fprintf(stderr,"\n");*/ transfrag[hassink[i]]->abundance=0; } - /*else { - //fprintf(stderr,"sink %d:%d abd=%f addsource=%f\n",i,no2gnode[i]->end,transfrag[hassink[i]]->abundance,addsink[i]); - if(guidesink[i] && transfrag[hassink[i]]->abundanceabundance=addsink[i]; - }*/ + else { + //fprintf(stderr,"sink %d:%d abund=%f addsink=%f\n",i,no2gnode[i]->end,transfrag[hassink[i]]->abundance,addsink[i]); + //if(guidesink[i] && transfrag[hassink[i]]->abundanceabundance=addsink[i]; + } } } } @@ -4916,7 +5128,7 @@ /* { // DEBUG ONLY //printTime(stderr); - fprintf(stderr,"There are %d transfrags after sorting:\n",transfrag.Count()); + fprintf(stderr,"\nThere are %d transfrags after sorting:\n",transfrag.Count()); for(int i=0;inodes.Count();j++) fprintf(stderr," %d",transfrag[i]->nodes[j]); @@ -4980,6 +5192,7 @@ else transfrag[t1]->real=true; } + //else if(longreads) no2gnode[n1]->trf.Add(t1); /* else { // this transcript is included completely in node no2gnode[transfrag[t1]->nodes[0]]->frag+=transfrag[t1]->abundance; @@ -5933,7 +6146,7 @@ int tn=trnode.Count(); while(1) { while(jtrnode[j] + if(!prevp || (edgep && pathpattern[*edgep]) || p==gno-1) return false; // there is an edge between prevptrnode[j]; when prevp==0/gno-1 I might not have an edge there if(!no2gnode[trnode[j]]->childpat[p]) { //fprintf(stderr,"Node p=%d cannot be reached from node=%d\n",p,trnode[j]); return false; // I can not reach p from trnode[j] @@ -6165,6 +6378,8 @@ /* fprintf(stderr,"Children of node %d with maxpath=%d are:",i,maxpath); for(int c=0;cchild[c]); + fprintf(stderr," pathpat="); + printBitVec(pathpat); fprintf(stderr,"\n"); */ @@ -6181,11 +6396,16 @@ int tmax=-1; bool exclude=false; + int nextnode=0; // nextnode on path + bool reach=false; + if(maxpath<=i) reach=true; + //int maxchild=inode->child[0]; //float maxchildcov=-1; if(pathpat[i+1]) { maxc=i+1; tmax=-1; + reach=true; } else { for(int c=0;cchild[c]; tmax=-1; + reach=true; break; } + // check if I can reach nextnode from c + if(maxpath>i) { // actually maxpath is >= i+2 otherwise I wouldn't get here + if(!nextnode) { + int j=i+2; + while(j<=maxpath) { + if(pathpat[j]) { + nextnode=j; + break; + } + j++; + } + } + if(!no2gnode[inode->child[c]]->childpat[nextnode]) { + continue; + } + reach=true; + } + + /*if(no2gnode[maxpath]->hardend && pathpat[gno-1]) { // this is a trflong transcript + if(!pathpat[c]) continue; + }*/ + float childcov=0; int tchild=-1; int endpath=maxpath; @@ -6266,6 +6509,9 @@ } } } + + if(!reach) return false; + if(maxc==-1) { if(exclude && nodecov[i+1]) { CGraphnode *cnode=no2gnode[i+1]; @@ -6346,6 +6592,8 @@ /* fprintf(stderr,"Parents of node %d are:",i); for(int p=0;pparent[p]); + fprintf(stderr," pathpat="); + printBitVec(pathpat); fprintf(stderr,"\n"); */ @@ -6363,10 +6611,15 @@ int tmax=-1; bool exclude=false; + int nextnode=gno; // nextnode on path + bool reach=false; + if(minpath>=i) reach=true; + // check if adjacent parent is on the path if(pathpat[i-1]) { maxp=i-1; tmax=-1; + reach=true; } else { //int maxparent=inode->parent[0]; @@ -6377,9 +6630,31 @@ if(pos && pathpat[*pos]) { // this is next child on the path maxp=inode->parent[p]; tmax=-1; + reach=true; break; } + // check if I can reach nextnode from p + if(minpath=minpath) { + if(pathpat[j]) { + nextnode=j; + break; + } + j++; + } + } + if(!no2gnode[inode->parent[p]]->parentpat[nextnode]) { + continue; + } + reach=true; + } + + /*if(no2gnode[minpath]->hardstart && pathpat[gno-1]) { // this is a trflong transcript + if(!pathpat[p]) continue; + }*/ float parentcov=0; int tpar=-1; @@ -6450,6 +6725,9 @@ } } } + + if(!reach) return false; + if(maxp==-1) { if(exclude && nodecov[i-1]) { CGraphnode *pnode=no2gnode[i-1]; @@ -7317,6 +7595,7 @@ t->abundance-=val; if(t->abundanceabundance=0; for(int j=start;jnodes.Count()-1;j++) { // for all nodes but the last + //fprintf(stderr," %d",t->nodes[j]); int i=node2path[t->nodes[j]]; capacity[i]+=val; } @@ -7448,7 +7727,7 @@ int nt=no2gnode[path[i]]->trf.Count(); for(int j=0;jtrf[j]; - if(transfrag[t]->abundance && (istranscript[t] || ((pathpat & transfrag[t]->pattern)==transfrag[t]->pattern))) { + if(transfrag[t]->abundance && (istranscript[t] || ((pathpat & transfrag[t]->pattern)==transfrag[t]->pattern))) { // this needs to be adjusted istranscript[t]=1; if(transfrag[t]->nodes[0]==path[i]) { // transfrag starts at this node int n1=i; @@ -7564,61 +7843,263 @@ return(flux); } +float long_max_flow(int gno,GVec& path,GBitVec& istranscript,GPVec& transfrag,GPVec& no2gnode, + GVec& nodecapacity,GBitVec& pathpat) { -// version of push_max_flow where I weight the incomplete transfrags -float push_max_flow(int gno,GVec& path,GBitVec& istranscript,GPVec& transfrag,GPVec& no2gnode, - GVec& nodeflux,GBitVec& pathpat, GIntHash &gpos, bool &full) { - + float flux=0; int n=path.Count(); + GVec *capacity=new GVec[n]; // capacity of edges in network + GVec *flow=new GVec[n]; // flow in network + GVec *link=new GVec[n]; // for each node remembers it's neighbours + GVec pred; // this stores the augmenting path + pred.Resize(n,-1); GVec node2path; + GVec noderate; node2path.Resize(gno,-1); - for(int i=0;i capacityleft; // how many transcripts compatible to path enter node - GVec capacityright; // how many transcripts compatible to path exit node - capacityleft.Resize(n,0); - capacityright.Resize(n,0); - GVec sumleft; // how many transcripts enter node - GVec sumright; // how many transcripts exit node - sumleft.Resize(n,0); - sumright.Resize(n,0); - - //bool full=true; - //if(longreads && path.Count()>3) full=false; /* { // DEBUG ONLY - //printTime(stderr); - fprintf(stderr,"Start push max flow algorithm for path "); - //printBitVec(pathpat); + printTime(stderr); + fprintf(stderr,"Start max flow algorithm for path "); + printBitVec(pathpat); fprintf(stderr," :"); for(int i=0;iabundance); - //fprintf(stderr,"\n"); + fprintf(stderr,"Used transcripts:"); + for(int i=0;itrf.Count(); + float sumleft=0; + float sumright=0; for(int j=0;jtrf[j]; - if(transfrag[t]->abundance) { - bool keeptr=false; - if(istranscript[t]) keeptr=true; - else if(!transfrag[t]->nodes[0]) { - if(transfrag[t]->nodes.Last()==path[1]) keeptr=true; - } - else if(transfrag[t]->nodes.Last()==path.Last()) { - if(transfrag[t]->nodes[0]==path[n-2]) keeptr=true; + if(transfrag[t]->nodes[0]==path[i] && transfrag[t]->abundance && (istranscript[t] || ((pathpat & transfrag[t]->pattern)==transfrag[t]->pattern))) { + bool keeptr=true; + if(i==0) max_fl=transfrag[t]->abundance; // this is the flow from source on this path + else { + int ti=1; + int pi=i+1; + int lenp=0; + while(tinodes.Count()) { + if(path[pi]!=transfrag[t]->nodes[ti]) { // I found a gap in transfrag => I need to check if it's not too big and can not support path + // node on path is coming before transfrag; otherwise I wouldn't have the match above + lenp+=no2gnode[path[pi]]->len(); + if(lenp>CHI_WIN) { + keeptr=false; + break; + } + } + ti++; + pi++; + } } - else if((pathpat & transfrag[t]->pattern)==transfrag[t]->pattern) { + if(keeptr) { + istranscript[t]=1; + int n1=i; + int n2=node2path[transfrag[t]->nodes.Last()]; + //fprintf(stderr,"t=%d n1=%d n2=%d(%d) ",t,n1,n2,transfrag[t]->nodes.Last()); + if(!capacity[n1][n2]) { // haven't seen this link before + link[n1].Add(n2); + link[n2].Add(n1); + } + capacity[n1][n2]+=transfrag[t]->abundance; + //fprintf(stderr,"capacity[%d][%d]=%f after adding transfrag[%d]->abundance=%f\n",n1,n2,capacity[n1][n2],t,transfrag[t]->abundance); + } + } + if(i>1 && inodes[0]) sumright+=transfrag[t]->abundance; // how many transfrags exit node + if(path[i]==transfrag[t]->nodes.Last()) sumleft+=transfrag[t]->abundance; // how many transfrags enter node + } + } + if(sumright && sumleft) { + noderate[i]=sumright/sumleft; + } + } + + /* + fprintf(stderr,"Used transcripts:"); + for(int i=0;iabundance); + fprintf(stderr,"\n"); + */ + + for(int i=0;i rate; + rate.Resize(n,1); + + while(bfs(n,capacity,flow,link,pred)) { + int r=0; + float increment=max_fl; + rate[r++]=1; + for(int u=n-1;pred[u]>=0;u=pred[u]) { + float adjflux=(capacity[pred[u]][u]-flow[pred[u]][u])*rate[r-1]; + increment = increment < adjflux ? increment : adjflux; // minimum flux increment on the path + if(pred[pred[u]]>=0) { + if(pred[u]rate; + else rate[r]=rate[r-1]; + } + else { + if(pred[pred[u]]rate; + } + r++; + } + } + r=0; + for(int u=n-1;pred[u]>=0;u=pred[u]) { + flow[pred[u]][u]+=increment/rate[r]; + flow[u][pred[u]]-=increment/rate[r]; + r++; + } + flux+=increment; + } + + /* + { // DEBUG ONLY + fprintf(stderr,"Flow:"); + for(int n1=0;n1trf.Count(); + float sumout=0; + int pos=-1; + for(int j=0;jtrf[j]; + if(istranscript[t] && transfrag[t]->abundance) { + if(transfrag[t]->nodes[0]==path[i]) { // transfrag starts at this node + int n1=i; + int n2=node2path[transfrag[t]->nodes.Last()]; + if(flow[n1][n2]>0) { + if(flow[n1][n2]abundance) { + if(!i) sumout+=flow[n1][n2]; + //fprintf(stderr,"Update capacity of transfrag[%d] with value %f to %f\n",t,transfrag[t]->abundance, transfrag[t]->abundance-flow[n1][n2]); + update_capacity(0,transfrag[t],flow[n1][n2],nodecapacity,node2path); + //if(path[i] && transfrag[t]->nodes.Last()!=gno-1) fragno+=flow[n1][n2]; + flow[n1][n2]=0; + } + else { + if(!i) sumout+=transfrag[t]->abundance; + flow[n1][n2]-=transfrag[t]->abundance; + //if(path[i] && transfrag[t]->nodes.Last()!=gno-1) fragno+=transfrag[t]->abundance; + //fprintf(stderr,"Update capacity of transfrag[%d] with value=%f to 0\n",t,transfrag[t]->abundance); + update_capacity(0,transfrag[t],transfrag[t]->abundance,nodecapacity,node2path); + } + } + } + else if(!i && transfrag[t]->nodes.Last()==path[i]) pos=j; + } + } + if(!i && pos>-1) { // this is first node -> adjust entering transfrag + int t=no2gnode[path[i]]->trf[pos]; + float val=sumout/noderate[i]; //no2gnode[path[i]]->rate; + transfrag[t]->abundance-=val; + if(transfrag[t]->abundanceabundance=0; + } + } + + // clean up + delete [] capacity; + delete [] flow; + delete [] link; + + return(flux); +} + + +// version of push_max_flow where I weight the incomplete transfrags +float push_max_flow(int gno,GVec& path,GBitVec& istranscript,GPVec& transfrag,GPVec& no2gnode, + GVec& nodeflux,GBitVec& pathpat, GIntHash &gpos, bool &full) { + + int n=path.Count(); + GVec node2path; + node2path.Resize(gno,-1); + for(int i=0;i capacityleft; // how many transcripts compatible to path enter node + GVec capacityright; // how many transcripts compatible to path exit node + capacityleft.Resize(n,0); + capacityright.Resize(n,0); + GVec sumleft; // how many transcripts enter node + GVec sumright; // how many transcripts exit node + sumleft.Resize(n,0); + sumright.Resize(n,0); + + //bool full=true; + //if(longreads && path.Count()>3) full=false; + + /* + { // DEBUG ONLY + //printTime(stderr); + fprintf(stderr,"Start push max flow algorithm for path "); + //printBitVec(pathpat); + fprintf(stderr," :"); + for(int i=0;iabundance); + //fprintf(stderr,"\n"); + } + */ + + // compute capacities and sums for all nodes + for(int i=1;itrf.Count(); + for(int j=0;jtrf[j]; + if(transfrag[t]->abundance) { + bool keeptr=false; + if(istranscript[t]) keeptr=true; + else if(!transfrag[t]->nodes[0]) { + if(transfrag[t]->nodes.Last()==path[1]) keeptr=true; + } + else if(transfrag[t]->nodes.Last()==path.Last()) { + if(transfrag[t]->nodes[0]==path[n-2]) keeptr=true; + } + else if(transfrag[t]->nodes[0]==path[i] && ((pathpat & transfrag[t]->pattern)==transfrag[t]->pattern)) { // only need to check transfrag the first time I encounter it keeptr=true; - if(!full) { // check if transcript fully supports path (full is false means I have not found any transcript to fully support path) + + if(longreads) { // an extremely gapped transcript should not be considered to support path (I am doing this for longreads but it might work for paired reads too + int ti=1; + int pi=i+1; + int lenp=0; + while(tinodes.Count()) { + if(path[pi]!=transfrag[t]->nodes[ti]) { // I found a gap in transfrag => I need to check if it's not too big and can not support path + // node on path is coming before transfrag; otherwise I wouldn't have the match above + lenp+=no2gnode[path[pi]]->len(); + if(lenp>CHI_WIN) { + keeptr=false; + break; + } + } + ti++; + pi++; + } + } + + if(keeptr && !full) { // check if transcript fully supports path (full is false means I have not found any transcript to fully support path) full=true; int p=1; if(!transfrag[t]->longstart || !transfrag[t]->longend) full=false; @@ -8564,7 +9045,6 @@ - float weight_max_flow(int gno,GVec& path,GBitVec& istranscript,GPVec& transfrag,GPVec& no2gnode, GVec& nodecapacity,GBitVec& pathpat) {//,float& fragno) { @@ -9005,15 +9485,422 @@ return(np); } +int best_trf_match(CTransfrag *t,GVec& keeptrf,GPVec& no2gnode,int gno) { + int mineditdist=no2gnode[gno-2]->end-no2gnode[1]->start+1; + int mininternaldist=mineditdist; + int maxintersect=0; + int mink=-1; + for(int k=0;knodes[0]<=keeptrf[k].nodes[keeptrf[k].nodes.Count()-2] && keeptrf[k].nodes[1]<=t->nodes.Last()){ + int editdist=0; + int internaldist=0; + int intersect=0; + int i=0; // t index + int j=1; // k index + + while(jnodes[i]>keeptrf[k].nodes[j]) { + editdist+=no2gnode[keeptrf[k].nodes[j]]->len(); + j++; + } + + while(inodes.Count() && t->nodes[i]nodes[i]nodes[i]]->len(); + if(j>1) internaldist+=no2gnode[t->nodes[i]]->len(); + i++; + } + else if(t->nodes[i]>keeptrf[k].nodes[j]) { + editdist+=no2gnode[keeptrf[k].nodes[j]]->len(); + internaldist+=no2gnode[keeptrf[k].nodes[j]]->len(); + j++; + } + else { // nodes are equal + i++; + j++; + intersect++; + } + } + + if(!intersect || internaldist>CHI_THR) continue; + + if(intersect>maxintersect) { + mink=k; + while(inodes.Count()) { // i>=1 because t intersects k + editdist+=no2gnode[t->nodes[i]]->len(); + if(no2gnode[t->nodes[i-1]]->end+1nodes[i]]->start) internaldist+=no2gnode[t->nodes[i]]->len(); + i++; + } + while(jlen(); + j++; + } + mineditdist=editdist; + mininternaldist=internaldist; + maxintersect=intersect; + } + else if(intersect==maxintersect) { + if(internaldistnodes.Count()) { // i>=1 because t intersects k + editdist+=no2gnode[t->nodes[i]]->len(); + if(no2gnode[t->nodes[i-1]]->end+1nodes[i]]->start) internaldist+=no2gnode[t->nodes[i]]->len(); + i++; + } + while(jlen(); + j++; + } + if(internaldistnodes.Count()) { + editdist+=no2gnode[t->nodes[i]]->len(); + i++; + } + while(jlen(); + j++; + } + if(editdist &gpos,GPVec& no2gnode,GPVec& transfrag, + int& geneno,int strand,GList& pred,GVec& trflong,BundleData *bdata) { + + GPVec& guides = bdata->keepguides; + + GVec nodecov; + GVec noderate; + for(int i=0;itrf.Count();j++) { // for all transfrags going through node + int t=inode->trf[j]; + if(transfrag[t]->nodes[0]abundance; + } + if(nodecov[i]) rate=nodecov[i]; + if(rate<=0) rate=1; // this shouldn't happen + //fprintf(stderr,"rate=%f\n",rate); + rate=inode->cov/rate; + } + noderate.Add(rate); + //fprintf(stderr,"Node[%d] no2gnode->cov=%f nodecov=%f noderate=%f\n",i,inode->cov,nodecov[i],noderate[i]); + } + + GBitVec istranscript(transfrag.Count()); + + GVec path; + GBitVec pathpat(gno+edgeno); + int minp; + int maxp; + int maxi; + + char sign='-'; + if(strand) { sign='+';} + int npred=pred.Count(); + + GVec keeptrf; + GVec checktrf; + for(int f=trflong.Count()-1;f>=0;f--) { // if this is a guide it should be reflected in the prediction downstream + path.Clear(); + int t=trflong[f]; + pathpat=transfrag[t]->pattern; + minp=transfrag[t]->nodes[0]; + maxp=transfrag[t]->nodes.Last(); + //if(no2gnode[transfrag[t]->nodes[0]]->hardstart) { + int *pos=gpos[edge(0,minp,gno)]; + if(pos) pathpat[*pos]=1; + //minp=0; + //} + //if(no2gnode[transfrag[t]->nodes.Last()]->hardend) { + pos=gpos[edge(maxp,gno-1,gno)]; + if(pos) pathpat[*pos]=1; + //maxp=gno-1; + //} + maxi=minp; + path.Add(maxi); + pathpat[maxi]=1; + + istranscript.reset(); + + float flux=0; + //float fragno=0; + GVec nodeflux; + + /* + { // DEBUG ONLY + fprintf(stderr,"\n\n***Start parse_trf with maxi=%d and transcript:",maxi); + for(int i=0;inodes.Count();i++) fprintf(stderr," %d",transfrag[t]->nodes[i]); + fprintf(stderr," pathpat="); + printBitVec(pathpat); + fprintf(stderr,"\n"); + +#ifdef GMEMTRACE + double vm,rsm; + get_mem_usage(vm, rsm); + GMessage("\t\tM(s):parse_trf memory usage: rsm=%6.1fMB vm=%6.1fMB\n",rsm/1024,vm/1024); +#endif + } + */ + + bool tocheck=true; + if(back_to_source_fast_long(maxi,path,minp,maxp,pathpat,transfrag,no2gnode,nodecov,gno,gpos)) { + path.cAdd(0); + path.Reverse(); // back to source adds the nodes at the end to avoid pushing the list all the time + + if(fwd_to_sink_fast_long(maxi,path,minp,maxp,pathpat,transfrag,no2gnode,nodecov,gno,gpos)) { + + flux=long_max_flow(gno,path,istranscript,transfrag,no2gnode,nodeflux,pathpat); + + /* + { // DEBUG ONLY + //printTime(stderr); + fprintf(stderr,"flux=%g Path:",flux); + for(int i=0;i exons; + GVec exoncov; + int j=1; + int len=0; + float cov=0; + while(jstart; + int nodeend=no2gnode[path[j]]->end; + nodecov[path[j]]-=nodeflux[j]; + len+=nodeend-nodestart+1; + float excov=nodeflux[j]*noderate[path[j]]; + while(j+1end+1==no2gnode[path[j+1]]->start) { + j++; + len+=no2gnode[path[j]]->len(); + nodeend=no2gnode[path[j]]->end; + excov+=nodeflux[j]*noderate[path[j]]; + } + GSeg exon(nodestart,nodeend); + exons.Add(exon); + cov+=excov; + exoncov.Add(excov); + j++; + } + if(transfrag[t]->nodes.Count()==1) transfrag[t]->abundance=0; + //fprintf(stderr,"Store prediction %d with abundance=%f len=%d\n",pred.Count(),cov/len,len); + GffObj *g=NULL; + if(transfrag[t]->guide) { + g=guides[int(transfrag[t]->usepath)]; + if (g && g->uptr) { + RC_TData &td = *(RC_TData*) (g->uptr); + td.in_bundle=3; + //fprintf(stderr,"sg guide %s is stored\n",g->getID()); + } + } + CPrediction *p=new CPrediction(geneno, g,exons[0].start , exons.Last().end, cov, sign, len); + p->exons=exons; + p->exoncov=exoncov; + p->mergename='.'; // I should not delete this prediction + pred.Add(p); + + CTransfrag u(path,pathpat,cov/len); + keeptrf.Add(u); + } + else if(transfrag[t]->guide) checktrf.Add(t); + } + } + + if(tocheck) // try to see if you can rescue transfrag -> they are stored from more abundant to least + checktrf.Add(t); + } + + //keeptrf.Sort(longtrCmp); // most abundant transfrag in the graph come first, then the ones with most nodes, then the ones more complete + + for(int c=0;cguide || transfrag[checktrf[c]]->abundance>=readthr) { // only in this case it is worth considering it as a potential prediction + int t=checktrf[c]; + int mink=best_trf_match(transfrag[t],keeptrf,no2gnode,gno); + if(mink>=0) { // found good match + int p=0; + int i=0; + int np=npred+mink; + while(inodes.Count() && pexons.Count()) { + if(no2gnode[transfrag[t]->nodes[i]]->endexons[p].start) i++; + else if(pred[np]->exons[p].endnodes[i]]->start) p++; + else { // the two intersect (I can only have the full node included in exon) + float addcov=transfrag[t]->abundance*noderate[transfrag[t]->nodes[i]]; + pred[np]->exoncov[p]+=transfrag[t]->abundance*noderate[transfrag[t]->nodes[i]]; + pred[np]->cov+=addcov; + i++; + } + } + } + else { // store it as an independent prediction + pathpat=transfrag[t]->pattern; // not used right now but maybe in the future? + path.Clear(); + path.cAdd(0); + path.Add(transfrag[t]->nodes[0]); + for(int j=1;jnodes.Count();j++) { + if(transfrag[t]->nodes[j]!=1+transfrag[t]->nodes[j-1] || + no2gnode[transfrag[t]->nodes[j]]->start-1!=no2gnode[transfrag[t]->nodes[j-1]]->end) { + // check if transfrag t1 is incomplete between node[n-1] and node [n] + int *pos=gpos[edge(transfrag[t]->nodes[j-1],transfrag[t]->nodes[j],gno)]; + if(!pos || !transfrag[t]->pattern[*pos]) { // incomplete transfrag + break; + } + if(pos) pathpat[*pos]=1; + path.Add(transfrag[t]->nodes[j]); + } + else path.Add(transfrag[t]->nodes[j]); + } + if(path.Last()==transfrag[t]->nodes.Last()) { // this transfrag is complete, might be worth rescuing + int sink=gno-1; + path.Add(sink); + + GVec exons; + GVec exoncov; + int j=1; + int len=0; + float cov=0; + while(jstart; + int nodeend=no2gnode[path[j]]->end; + nodecov[path[j]]-=transfrag[t]->abundance; + len+=nodeend-nodestart+1; + float excov=transfrag[t]->abundance*noderate[path[j]]; + while(j+1end+1==no2gnode[path[j+1]]->start) { + j++; + len+=no2gnode[path[j]]->len(); + nodeend=no2gnode[path[j]]->end; + excov+=transfrag[t]->abundance*noderate[path[j]]; + } + GSeg exon(nodestart,nodeend); + exons.Add(exon); + cov+=excov; + exoncov.Add(excov); + j++; + } + GffObj *g=NULL; + if(transfrag[t]->guide) { + g=guides[int(transfrag[t]->usepath)]; + if (g && g->uptr) { + RC_TData &td = *(RC_TData*) (g->uptr); + td.in_bundle=3; + //fprintf(stderr,"sg guide %s is stored\n",g->getID()); + } + } + //fprintf(stderr,"Store prediction %d with abundance=%f\n",pred.Count(),cov/len); + CPrediction *p=new CPrediction(geneno, NULL,exons[0].start , exons.Last().end, cov, sign, len); + p->exons=exons; + p->exoncov=exoncov; + pred.Add(p); + + CTransfrag u(path,pathpat,cov/len); + keeptrf.Add(u); + } + } + transfrag[t]->abundance=0; + } + + for(int t=0;tabundance>epsilon && transfrag[t]->nodes[0] && transfrag[t]->nodes.Last()!=gno-1 ) { + /*fprintf(stderr,"Consider transfrag[%d]->abundance=%f with nodes:",t,transfrag[t]->abundance); + for(int i=0;inodes.Count();i++) fprintf(stderr," %d",transfrag[t]->nodes[i]); + fprintf(stderr,"\n");*/ + int mink=best_trf_match(transfrag[t],keeptrf,no2gnode,gno); + + if(mink>=0) { // mink gives the prediction's position too + /*fprintf(stderr,"Added to prediction:"); + for(int i=0;inodes.Count() && pexons.Count()) { + if(no2gnode[transfrag[t]->nodes[i]]->endexons[p].start) i++; + else if(pred[np]->exons[p].endnodes[i]]->start) p++; + else { // the two intersect (I can only have the full node included in exon) + float addcov=transfrag[t]->abundance*noderate[transfrag[t]->nodes[i]]; + pred[np]->exoncov[p]+=transfrag[t]->abundance*noderate[transfrag[t]->nodes[i]]; + pred[np]->cov+=addcov; + i++; + } + } + //fprintf(stderr," new abundance=%f\n",pred[np]->cov/pred[np]->tlen); + } + } + } + + for(int p=npred;pcov) { + pred[p]->cov/=pred[p]->tlen; + for(int i=0;iexons.Count();i++) + pred[p]->exoncov[i]/=pred[p]->exons[i].len(); + } + else pred.Delete(p); + } + +} + void parse_trf_long(int maxi,int gno,int edgeno, GIntHash &gpos,GPVec& no2gnode,GPVec& transfrag, int& geneno,bool first,int strand,GList& pred,GVec& nodecov, - GBitVec& istranscript,GBitVec& removable,GBitVec& usednode,float maxcov,GBitVec& prevpath) { + GBitVec& istranscript,GBitVec& removable,GBitVec& usednode,float maxcov,GBitVec& prevpath,GVec& trflong) { GVec path; - path.Add(maxi); GBitVec pathpat(gno+edgeno); + int minp=maxi; + int maxp=maxi; + + bool usetrflong=false; + + if(trflong.Count()) { + /**** this version still maintains maxi in the center + int i=trflong.Count()-1; + int t=-1; + while(i>=0) { + if(transfrag[trflong[i]]->pattern[maxi]) { + t=trflong[i]; + trflong.Delete(i); + fprintf(stderr,"found transcript t=%d\n",t); + break; + } + i--; + } + if(t<0) {****/ + int t=trflong.Pop(); + //} + usetrflong=true; + pathpat=pathpat | transfrag[t]->pattern; + minp=transfrag[t]->nodes[0]; + maxp=transfrag[t]->nodes.Last(); + int *pos=gpos[edge(0,minp,gno)]; + if(pos) pathpat[*pos]=1; + pos=gpos[edge(maxp,gno-1,gno)]; + if(pos) pathpat[*pos]=1; + maxi=minp; + } + //else return; + + path.Add(maxi); pathpat[maxi]=1; + + + bool full=false; istranscript.reset(); float flux=0; @@ -9035,9 +9922,6 @@ } */ - int minp=maxi; - int maxp=maxi; - bool full=false; if(back_to_source_fast_long(maxi,path,minp,maxp,pathpat,transfrag,no2gnode,nodecov,gno,gpos)) { if(includesource) path.cAdd(0); @@ -9047,14 +9931,14 @@ flux=push_max_flow(gno,path,istranscript,transfrag,no2gnode,nodeflux,pathpat,gpos,full); - /* + /* { // DEBUG ONLY //printTime(stderr); fprintf(stderr,"flux=%g Path:",flux); for(int i=0;i covered(ng,false); - GVec gcount; + GVec gcount; // count number of bits set in each guide's pattern for(int g=0;gpattern.count(); gcount.Add(gc); @@ -9485,7 +10365,7 @@ maxcovscore=covscore; //mingscore=gcount[g]-covscore; } - /*else if(covscore==maxcovscore){ + else if(covscore==maxcovscore){ int gcov=gcount[g]-covscore; if(gcov=0 && !covered[maxg]) { @@ -9509,7 +10389,7 @@ guidetrf.Delete(g); } } - } + }*/ // print guide coverages if(c_out) { @@ -9573,7 +10453,7 @@ // **** NEXT PORTION NEEDS TO BE CHECKED!!! - guidetrf.Sort(guideCmp); //longest guide comes first + guidetrf.Sort(guideCmp); // guide with most bits set in pattern comes first uint refstart=(uint)bdata->start; int g=0; while(gnodes.Insert(0,0); // I need to comment this if I need path not to include the source - guidetrf[g].trf->pattern[0]=1; - guidetrf[g].trf->pattern[*pos]=1; + guidetrf[g].trf->pattern[0]=1; + guidetrf[g].trf->pattern[*pos]=1; + } bool sourcestart=false; // assume there is no extension to source CGraphnode *inode=no2gnode[nodei]; @@ -9672,8 +10553,6 @@ nodei=guidetrf[g].trf->nodes.Last(); if(nodei != gno-1) { // sink is not already present in guide pattern int sink=gno-1; - guidetrf[g].trf->nodes.Add(sink); - guidetrf[g].trf->pattern[sink]=1; int *pos=gpos[edge(nodei,sink,gno)]; if(!pos) { //fprintf(stderr,"Add sink link from position %d for guide=%d\n",no2gnode[nodei]->end,guidetrf[g].g); @@ -9682,7 +10561,11 @@ lastgpos++; pos=gpos[key]; } - guidetrf[g].trf->pattern[*pos]=1; + if(!longreads) { + guidetrf[g].trf->nodes.Add(sink); + guidetrf[g].trf->pattern[sink]=1; + guidetrf[g].trf->pattern[*pos]=1; + } bool sinkend=false; CGraphnode *inode=no2gnode[nodei]; @@ -9744,8 +10627,6 @@ } - - } bool is_reference_transcript(GVec& guidetrf,GBitVec& pattern) { @@ -10472,14 +11353,15 @@ } -int find_transcripts(int gno,int edgeno, GIntHash &gpos,GPVec& no2gnode,GPVec& transfrag, - int geneno,int strand,GVec& guidetrf,GList& pred,GPVec& guides,GVec& guidepred,BundleData* bdata) { +int find_transcripts(int gno,int edgeno, GIntHash &gpos,GPVec& no2gnode,GPVec& transfrag,int geneno,int strand, + GVec& guidetrf,GPVec& guides,GVec& guidepred,BundleData* bdata,GVec& trflong) { + + GList& pred = bdata->pred; // process in and out coverages for each node int maxi=0; // node with maximum coverage GVec nodecov; // node coverages - for(int i=0;i=1) { // sensitive mode only; otherwise >=readthr + if(nodecov[maxi]>=1) { // sensitive mode only; otherwise >=readthr - // process rest of the transfrags - if(!eonly && nodecov[maxi]>=1) { - GBitVec removable(transfrag.Count(),true); + // process rest of the transfrags + if(!eonly && nodecov[maxi]>=1) { + GBitVec removable(transfrag.Count(),true); + + // 1: + // parse_trf_weight_max_flow(gno,no2gnode,transfrag,geneno,strand,pred,nodecov,pathpat); + // 2: + GBitVec usednode(gno+edgeno); + /*if(longreads) { + get_trf_long(gno,edgeno, gpos,no2gnode,transfrag,geneno,strand,pred,trflong); + //parse_trf_long(maxi,gno,edgeno,gpos,no2gnode,transfrag,geneno,first,strand,pred,nodecov,istranscript,removable,usednode,0,pathpat,trflong); + } + else {*/ + parse_trf(maxi,gno,edgeno,gpos,no2gnode,transfrag,geneno,first,strand,pred,nodecov,istranscript,removable,usednode,0,pathpat); + //} - // 1: - // parse_trf_weight_max_flow(gno,no2gnode,transfrag,geneno,strand,pred,nodecov,pathpat); - // 2: - GBitVec usednode(gno+edgeno); - if(longreads) { - parse_trf_long(maxi,gno,edgeno,gpos,no2gnode,transfrag,geneno,first,strand,pred,nodecov,istranscript,removable,usednode,0,pathpat); - } - else { - parse_trf(maxi,gno,edgeno,gpos,no2gnode,transfrag,geneno,first,strand,pred,nodecov,istranscript,removable,usednode,0,pathpat); } - } } @@ -10731,7 +11616,7 @@ // ^ KEEP THIS IN MIND IF WE CHANGE HOW WE USE GOOD_JUNC /*** 3. don't keep if it's below threshold ***/ - if (jd.nreads_goodlongintron && jd.nreads_good& junction = bdata->junction; GPVec& guides = bdata->keepguides; GVec* bpcov = bdata->bpcov; // I might want to use a different type of data for bpcov to save memory in the case of very long bundles - GList& pred = bdata->pred; - // form groups on strands: all groups below are like this: 0 = negative strand; 1 = unknown strand; 2 = positive strand - GPVec group; - CGroup *currgroup[3]={NULL,NULL,NULL}; // current group of each type - CGroup *startgroup[3]={NULL,NULL,NULL}; // start group of each type - int color=0; // next color to assign - GVec merge; // remembers merged groups - GVec equalcolor; // remembers colors for the same bundle - GVec *readgroup=new GVec[readlist.Count()]; // remebers groups for each read; don't forget to delete it when no longer needed - GVec guidepred; // for eonly keeps the prediction number associated with a guide - GArray guideedge; // 0: negative starts; 1 positive starts - - //fprintf(stderr,"build_graphs with %d guides\n",guides.Count()); - - /* - GVec starts[3]; - GVec ends[3]; - int edgedist=longintronanchor; - - if(bpcov->Count()>2*edgedist+CHI_WIN) { - GVec *diffval=new GVec[3]; - float sumstartleft[3]={0,0,0}; - float sumendleft[3]={0,0,0}; - float sumstartright[3]={0,0,0}; - float sumendright[3]={0,0,0}; - float lastcov[3]; - for(int s=0;s<3;s++) - lastcov[s]=get_cov(s,edgedist-1,edgedist-1,bpcov); - lastcov[1]-=lastcov[0]+lastcov[2]; - if(lastcov[1]<0) lastcov[1]=0; - for(int i=edgedist;i0) sumstartleft[s]+=diff; - else sumendleft[s]-=diff; - lastcov[s]=icov[s]; - } - } - for(int i=CHI_THR+edgedist;i0) sumstartright[s]+=diff; - else sumendright[s]-=diff; - lastcov[s]=icov[s]; - } - } - - for(int i=CHI_WIN+edgedist;iCount()-edgedist;i++) { - float icov[3]; - int m=(i-CHI_THR+edgedist)%CHI_WIN; - fprintf(stderr,"m=%d ",i-CHI_THR+refstart); - for(int s=0;s<3;s++) { - if(diffval[s][m]>1/ERROR_PERC) { // starts here - if(sumstartleft[s]& pred = bdata->pred; + // form groups on strands: all groups below are like this: 0 = negative strand; 1 = unknown strand; 2 = positive strand + GPVec group; + CGroup *currgroup[3]={NULL,NULL,NULL}; // current group of each type + CGroup *startgroup[3]={NULL,NULL,NULL}; // start group of each type + int color=0; // next color to assign + GVec merge; // remembers merged groups + GVec equalcolor; // remembers colors for the same bundle + GVec *readgroup=new GVec[readlist.Count()]; // remebers groups for each read; don't forget to delete it when no longer needed + GVec guidepred; // for eonly keeps the prediction number associated with a guide + GArray guideedge; // 0: negative starts; 1 positive starts + /*GPVec& feature = bdata->ptfs; // these are point features (confirmed starts/stops) - if(s!=1) {fprintf(stderr," [%d](sL=%.1f sR=%.1f sL/sR=",s,sumstartleft[s],sumstartright[s]); - if(diffval[s][m]>1/ERROR_PERC) fprintf(stderr,"%.1f",sumstartleft[s]/sumstartright[s]); - else fprintf(stderr,"*");} + for(int i=0;iftype==GPFT_TSS) + fprintf(stderr,"TSS at position %d on strand %d\n",feature[i]->coord,feature[i]->strand); + if(feature[i]->ftype==GPFT_CPAS) + fprintf(stderr,"CPAS at position %d on strand %d\n",feature[i]->coord,feature[i]->strand); + }*/ - if(diffval[s][p]>0) { - sumstartleft[s]-=diffval[s][p]; - } - else { - sumendleft[s]+=diffval[s][p]; - } - if(diffval[s][m]>0) { - sumstartleft[s]+=diffval[s][m]; - sumstartright[s]-=diffval[s][m]; - } - else { - sumendleft[s]-=diffval[s][m]; - sumendright[s]+=diffval[s][m]; - } - diffval[s][p]=diff; - if(diff>0) sumstartright[s]+=diff; - else sumendright[s]-=diff; - lastcov[s]=icov[s]; - if(s!=1) fprintf(stderr," eL=%.1f eR=%.1f eR/eL=",sumendleft[s],sumendright[s]); - if(diffval[s][m]<-1/ERROR_PERC) { // ends here - if(sumendleft[s]*ERROR_PERC>sumendright[s]) { - CPred p(refstart+i-CHI_THR-1,-diffval[s][m]); - ends[s].Add(p); - } - if(s!=1) fprintf(stderr,"%.1f)",sumendright[s]/sumendleft[s]); - } - else if(s!=1) fprintf(stderr,"*)"); - } - fprintf(stderr,"\n"); - } + //fprintf(stderr,"build_graphs with %d guides\n",guides.Count()); - for(int s=0;s<3;s++) { - int i=0; - while(istarts[s][j].cov) { - starts[s][j].cov=0; - } - else if(starts[s][i].covends[s][j].cov) { - ends[s][j].cov=0; - } - else if(ends[s][i].cov=0 && ejunction[j]->end==end) { ejunction[j]->consright=rightcons; if(!ejunction[j]->guide_match && !rightcons && ejunction[j]->nreads_goodstrand=0; - if(ejunction[j]->strand>0) ejunction[j]->rightsupport=rightsupport[(1+ejunction[j]->strand)/2]; + if(ejunction[j]->strand) ejunction[j]->rightsupport=rightsupport[(1+ejunction[j]->strand)/2]; j--; } // I do not check here for possible missalignments @@ -11244,341 +11991,116 @@ //fprintf(stderr,"In higherr!\n"); - /* - int lastleftjunc=-1; // last left junction that is kept - int lastrightjunc=-1; - for(int i=0;istart,junction[i]->end,junction[i]->strand,junction[i]->rightsupport,junction[i]->nm,junction[i]->nreads); + //fprintf(stderr,"junct[%d]:%d-%d:%d lefttsupport=%f nm=%f mm=%f nreads=%f nreads_good=%f\n",i,junction[i]->start,junction[i]->end,junction[i]->strand,junction[i]->leftsupport,junction[i]->nm,junction[i]->mm,junction[i]->nreads,junction[i]->nreads_good); if(junction[i]->strand && junction[i]->nm && !junction[i]->guide_match && junction[i]->nm>=junction[i]->nreads) { // this is a bad junction -> check if it's maximal; - if(junction[i]->nreads_good<1.25*junctionthr) { // threshold for bad junctions is higher; (should I also add that too short junctions not to be accepted?) - junction[i]->strand=0; // just delete junction if it's low count + if(junction[i]->nreads_good>0 && (junction[i]->nreads_good<1.25*junctionthr || !good_junc(*junction[i],refstart,bpcov))) { // threshold for bad junctions is higher; (should I also add that too short junctions not to be accepted?) + //junction[i]->strand=0; // just delete junction if it's low count + junction[i]->mm=-1; + //fprintf(stderr,"...delete due to being under threshold\n"); } - else { // junction passes higher threshold - int j=i-1; - //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d leftsupport=%f dist=%d\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[i]->start-junction[j]->start); - while(j>=0 && junction[i]->start-junction[j]->startstart!=junction[i]->start && junction[j]->leftsupport>junction[i]->leftsupport) { - //fprintf(stderr,"...compare to junct:%d-%d:%d leftsupport=%f\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); - junction[i]->strand=0; // delete junction if I have a better junction nearby on the left + + int j=i-1; + float support=0; + bool searchjunc=true; + //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d leftsupport=%f dist=%d\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[i]->start-junction[j]->start); + while(j>=0 && junction[i]->start-junction[j]->startstrand==junction[i]->strand) { + if(junction[j]->start==junction[i]->start) { // found a junction with the same start -> I have already searched it + if(junction[j]->nreads<0) { + junction[i]->nreads=junction[j]->nreads; + searchjunc=false; + } break; } - j--; - } - if(junction[i]->strand) { - j=i+1; - //if(jstart,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[j]->start-junction[i]->start); - while(jstart-junction[i]->startstart,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); - if(junction[j]->start!=junction[i]->start && junction[j]->leftsupport>junction[i]->leftsupport) { // junction is not best within window - junction[i]->strand=0; // delete junction if I have a better junction nearby on the right - break; + else if(junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { + //if(junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { + //fprintf(stderr,"...1 compare to [%d]:%d-%d:%d leftsupport=%f\n",j,junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); + //junction[i]->strand=0; // delete junction if I have a better junction nearby on the left + //break; + if(junction[j]->leftsupport>support) { + junction[i]->nreads=-j; + support=junction[j]->leftsupport; } - j++; } } - } - } - if(junction[i]->strand) { // update closest left junctions' starts back to lastleftjunc's start - int k=lastleftjunc+1; - if(lastleftjunc>-1) { - uint laststart=junction[lastleftjunc]->start; - uint mid=laststart+(junction[i]->start-laststart)/2; - //while(junction[k]->startstart-junction[lastleftjunc]->startleftsupport==1))) { // shorten exon - while(junction[k]->startstart-junction[lastleftjunc]->startstartstart=junction[lastleftjunc]->start; // junction is closer to last junction - junction[k]->nreads=lastleftjunc; - //fprintf(stderr,"update junction[%d]->start to junction[%d]->start=%d\n",k,lastleftjunc,junction[lastleftjunc]->start); - k++; - } - } - int j=i-1; - //while(j>k && (junction[i]->start-junction[j]->startleftsupport==1))) { //lengthen exon - while(j>k && junction[i]->start-junction[j]->startk) { //lengthen exon - junction[j]->start=junction[i]->start; // junction is closer to last junction - //fprintf(stderr,"update junction[%d]->start to junction[%d]->start=%d\n",k,i,junction[i]->start); - junction[j]->nreads=i; j--; } - - lastleftjunc=i; - } - - - //fprintf(stderr,"ejunct:%d-%d:%d rightsupport=%f nm=%f nreads=%f\n",ejunction[i]->start,ejunction[i]->end,ejunction[i]->strand,ejunction[i]->rightsupport,ejunction[i]->nm,ejunction[i]->nreads); - - if(ejunction[i]->strand && ejunction[i]->nm && !ejunction[i]->guide_match && ejunction[i]->nm>=ejunction[i]->nreads) { // this is a bad junction -> check if it's maximal - if(ejunction[i]->nreads_good<1.25*junctionthr) { // threshold for bad junctions is higher - ejunction[i]->strand=0; - } - else { - int j=i-1; - //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d rightsupport=%f dist=%d\n",ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[i]->end-ejunction[j]->end); - while(j>=0 && ejunction[i]->end-ejunction[j]->endstart,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); - if(ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport>ejunction[i]->rightsupport) { - ejunction[i]->strand=0; - break; - } - j--; - } - if(ejunction[i]->strand) { - j=i+1; - //if(jstart,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[j]->end-ejunction[i]->end); - while(jend-ejunction[i]->endstart,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); - if(ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport>ejunction[i]->rightsupport) { // junction is not best within window - ejunction[i]->strand=0; - break; + if(searchjunc) { + j=i+1; + //if(jstart,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[j]->start-junction[i]->start); + while(jstart-junction[i]->startstrand==junction[i]->strand && junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { // junction is not best within window + //if(junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { // junction is not best within window + //fprintf(stderr,"...2 compare to [%d]:%d-%d:%d leftsupport=%f\n",j,junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); + //junction[i]->strand=0; // delete junction if I have a better junction nearby on the right + //break; + if(junction[j]->leftsupport>support) { + junction[i]->nreads=-j; + support=junction[j]->leftsupport; } - j++; } - } - } - } - if(ejunction[i]->strand) { // update closest right junctions back to lastrightjunc - if(lastrightjunc<0) { - int k=i-1; - //while(k>=0 && (ejunction[i]->end-ejunction[k]->endrightsupport==1))) { - while(k>=0 && ejunction[i]->end-ejunction[k]->end=0) { - ejunction[k]->end=ejunction[i]->end; - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",k,i,ejunction[i]->end); - ejunction[k]->nreads_good=i; - k--; - } - } - else { - uint lastend=ejunction[lastrightjunc]->end; - uint mid=lastend+(ejunction[i]->end-lastend)/2; - int k=i-1; - //while(ejunction[k]->end>mid && (ejunction[i]->end-ejunction[k]->endrightsupport==1))) { - while(ejunction[k]->end>mid && ejunction[i]->end-ejunction[k]->endend>mid) { - ejunction[k]->end=ejunction[i]->end; // junction is closer to i junction - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",k,i,ejunction[i]->end); - ejunction[k]->nreads_good=i; - k--; - } - int j=lastrightjunc+1; - //while(jend-ejunction[lastrightjunc]->endrightsupport==1))) { - while(jend-ejunction[lastrightjunc]->endend=ejunction[lastrightjunc]->end; - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",j,lastrightjunc,ejunction[lastrightjunc]->end); - ejunction[j]->nreads_good=lastrightjunc; j++; } } - lastrightjunc=i; - } - } - - if(lastleftjunc>-1) { - int i=lastleftjunc+1; - //while(istart-junction[lastleftjunc]->startleftsupport==1))) { - while(istart-junction[lastleftjunc]->startstart=junction[lastleftjunc]->start; - junction[i]->nreads=lastleftjunc; - //fprintf(stderr,"end update junction[%d]->start to junction[%d]->start=%d\n",i,lastleftjunc,junction[lastleftjunc]->start); - i++; - } - } - - if(lastrightjunc>-1) { - int i=lastrightjunc+1; - //while(iend-ejunction[lastrightjunc]->endrightsupport==1))) { - while(iend-ejunction[lastrightjunc]->endend=ejunction[lastrightjunc]->end; - ejunction[i]->nreads_good=lastrightjunc; - //fprintf(stderr,"end update ejunction[%d]->end to ejunction[%d]->end=%d\n",i,lastrightjunc,ejunction[lastrightjunc]->end); - i++; } - }*/ - - float tolerance=1-ERROR_PERC; - // strand based version - int lastleftjunc[2]={-1,-1}; // last left junction that is kept - int lastrightjunc[2]={-1,-1}; - for(int i=0;istart,ejunction[i]->end,ejunction[i]->strand,ejunction[i]->rightsupport,ejunction[i]->nm,ejunction[i]->nreads); - //fprintf(stderr,"junct:%d-%d:%d lefttsupport=%f nm=%f nreads=%f\n",junction[i]->start,junction[i]->end,junction[i]->strand,junction[i]->leftsupport,junction[i]->nm,junction[i]->nreads); - - if(junction[i]->strand && junction[i]->nm && !junction[i]->guide_match && junction[i]->nm>=junction[i]->nreads) { // this is a bad junction -> check if it's maximal; - if(junction[i]->nreads_good<1.25*junctionthr) { // threshold for bad junctions is higher; (should I also add that too short junctions not to be accepted?) - junction[i]->strand=0; // just delete junction if it's low count + if(ejunction[i]->strand && ejunction[i]->nm && !ejunction[i]->guide_match && ejunction[i]->nm>=ejunction[i]->nreads) { // this is a bad junction -> check if it's maximal + if(ejunction[i]->nreads_good>0 && (ejunction[i]->nreads_good<1.25*junctionthr || !good_junc(*ejunction[i],refstart,bpcov))) { // threshold for bad junctions is higher + //ejunction[i]->strand=0; + ejunction[i]->mm=-1; + //fprintf(stderr,"...delete due to being under threshold\n"); } - else { // junction passes higher threshold - int j=i-1; - //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d leftsupport=%f dist=%d\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[i]->start-junction[j]->start); - while(j>=0 && junction[i]->start-junction[j]->startstrand==junction[i]->strand && junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { - //if(junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { - //fprintf(stderr,"...compare to junct:%d-%d:%d leftsupport=%f\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); - junction[i]->strand=0; // delete junction if I have a better junction nearby on the left + int j=i-1; + float support=0; + bool searchjunc=true; + //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d rightsupport=%f dist=%d\n",ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[i]->end-ejunction[j]->end); + while(j>=0 && ejunction[i]->end-ejunction[j]->endstrand==ejunction[i]->strand) { + if(ejunction[j]->end==ejunction[i]->end) { + if(ejunction[j]->nreads_good<0) { + ejunction[i]->nreads_good=ejunction[j]->nreads_good; + searchjunc=false; + } break; } - j--; - } - if(junction[i]->strand) { - j=i+1; - //if(jstart,junction[j]->end,junction[j]->strand,junction[j]->leftsupport,junction[j]->start-junction[i]->start); - while(jstart-junction[i]->startstrand==junction[i]->strand && junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { // junction is not best within window - //if(junction[j]->start!=junction[i]->start && junction[j]->leftsupport*tolerance>junction[i]->leftsupport) { // junction is not best within window - //fprintf(stderr,"...compare to junct:%d-%d:%d leftsupport=%f\n",junction[j]->start,junction[j]->end,junction[j]->strand,junction[j]->leftsupport); - junction[i]->strand=0; // delete junction if I have a better junction nearby on the right - break; + else if(ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { + //if(ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { + //fprintf(stderr,"...1 compare to [%d]:%d-%d:%d rightsupport=%f\n",j,ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); + //ejunction[i]->strand=0; + //break; + if(ejunction[j]->rightsupport>support) { + ejunction[i]->nreads_good=-j; + support=ejunction[j]->rightsupport; } - j++; - } - } - } - } - if(junction[i]->strand) { // update closest left junctions' starts back to lastleftjunc's start - int s=(1+junction[i]->strand)/2; - int k=lastleftjunc[s]+1; - if(lastleftjunc[s]>-1) { - uint laststart=junction[lastleftjunc[s]]->start; - uint mid=laststart+(junction[i]->start-laststart)/2; - //while(junction[k]->startstart-junction[lastleftjunc]->startleftsupport==1))) { // shorten exon - while(junction[k]->startstart-junction[lastleftjunc[s]]->startstartstrand==junction[i]->strand) { - junction[k]->start=junction[lastleftjunc[s]]->start; // junction is closer to last junction - junction[k]->nreads=lastleftjunc[s]; - //fprintf(stderr,"update junction[%d]->start to junction[%d]->start=%d\n",k,lastleftjunc,junction[lastleftjunc]->start); } - k++; - } - } - int j=i-1; - //while(j>k && (junction[i]->start-junction[j]->startleftsupport==1))) { //lengthen exon - while(j>k && junction[i]->start-junction[j]->startk) { //lengthen exon - if(junction[j]->strand==junction[i]->strand) { - junction[j]->start=junction[i]->start; // junction is closer to last junction - //fprintf(stderr,"update junction[%d]->start to junction[%d]->start=%d\n",k,i,junction[i]->start); - junction[j]->nreads=i; } j--; } - - lastleftjunc[s]=i; - } - - - //fprintf(stderr,"ejunct:%d-%d:%d rightsupport=%f nm=%f nreads=%f\n",ejunction[i]->start,ejunction[i]->end,ejunction[i]->strand,ejunction[i]->rightsupport,ejunction[i]->nm,ejunction[i]->nreads); - - if(ejunction[i]->strand && ejunction[i]->nm && !ejunction[i]->guide_match && ejunction[i]->nm>=ejunction[i]->nreads) { // this is a bad junction -> check if it's maximal - if(ejunction[i]->nreads_good<1.25*junctionthr) { // threshold for bad junctions is higher - ejunction[i]->strand=0; - } - else { - int j=i-1; - //if(j>=0) fprintf(stderr,"...start at junct:%d-%d:%d rightsupport=%f dist=%d\n",ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[i]->end-ejunction[j]->end); - while(j>=0 && ejunction[i]->end-ejunction[j]->endstrand==ejunction[i]->strand && ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { - //if(ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { - //fprintf(stderr,"...compare to junct:%d-%d:%d rightsupport=%f\n",ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); - ejunction[i]->strand=0; - break; - } - j--; - } - if(ejunction[i]->strand) { - j=i+1; - //if(jstart,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[j]->end-ejunction[i]->end); - while(jend-ejunction[i]->endstrand==ejunction[i]->strand && ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { // junction is not best within window + if(searchjunc) { + j=i+1; + //if(jstart,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport,ejunction[j]->end-ejunction[i]->end); + while(jend-ejunction[i]->endstrand==ejunction[i]->strand && ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { // junction is not best within window //if(ejunction[j]->end!=ejunction[i]->end && ejunction[j]->rightsupport*tolerance>ejunction[i]->rightsupport) { // junction is not best within window - //fprintf(stderr,"...compare to junct:%d-%d:%d rightsupport=%f\n",ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); - ejunction[i]->strand=0; - break; + //fprintf(stderr,"...2 compare to [%d]:%d-%d:%d rightsupport=%f\n",j,ejunction[j]->start,ejunction[j]->end,ejunction[j]->strand,ejunction[j]->rightsupport); + //ejunction[i]->strand=0; + //break; + if(ejunction[j]->rightsupport>support) { + ejunction[i]->nreads_good=-j; + support=ejunction[j]->rightsupport; } - j++; - } - } - } - } - if(ejunction[i]->strand) { // update closest right junctions back to lastrightjunc - int s=(1+ejunction[i]->strand)/2; - if(lastrightjunc[s]<0) { - int k=i-1; - //while(k>=0 && (ejunction[i]->end-ejunction[k]->endrightsupport==1))) { - while(k>=0 && ejunction[i]->end-ejunction[k]->end=0) { - if(ejunction[k]->strand==ejunction[i]->strand) { - ejunction[k]->end=ejunction[i]->end; - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",k,i,ejunction[i]->end); - ejunction[k]->nreads_good=i; - } - k--; - } - } - else { - uint lastend=ejunction[lastrightjunc[s]]->end; - uint mid=lastend+(ejunction[i]->end-lastend)/2; - int k=i-1; - //while(ejunction[k]->end>mid && (ejunction[i]->end-ejunction[k]->endrightsupport==1))) { - while(ejunction[k]->end>mid && ejunction[i]->end-ejunction[k]->endend>mid) { - if(ejunction[k]->strand==ejunction[i]->strand) { - ejunction[k]->end=ejunction[i]->end; // junction is closer to i junction - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",k,i,ejunction[i]->end); - ejunction[k]->nreads_good=i; - } - k--; - } - int j=lastrightjunc[s]+1; - //while(jend-ejunction[lastrightjunc]->endrightsupport==1))) { - while(jend-ejunction[lastrightjunc[s]]->endstrand==ejunction[i]->strand) { - ejunction[j]->end=ejunction[lastrightjunc[s]]->end; - //fprintf(stderr,"update ejunction[%d]->end to ejunction[%d]->end=%d\n",j,lastrightjunc,ejunction[lastrightjunc]->end); - ejunction[j]->nreads_good=lastrightjunc[s]; } j++; } } - lastrightjunc[s]=i; - } - } - - for(int s=0;s<2;s++) { - if(lastleftjunc[s]>-1) { - int i=lastleftjunc[s]+1; - //while(istart-junction[lastleftjunc]->startleftsupport==1))) { - while(istart-junction[lastleftjunc[s]]->startstrand==junction[lastleftjunc[s]]->strand) { - junction[i]->start=junction[lastleftjunc[s]]->start; - junction[i]->nreads=lastleftjunc[s]; - //fprintf(stderr,"end update junction[%d]->start to junction[%d]->start=%d\n",i,lastleftjunc,junction[lastleftjunc]->start); - } - i++; - } - } - - if(lastrightjunc[s]>-1) { - int i=lastrightjunc[s]+1; - //while(iend-ejunction[lastrightjunc]->endrightsupport==1))) { - while(iend-ejunction[lastrightjunc[s]]->endstrand==ejunction[lastrightjunc[s]]->strand) { - ejunction[i]->end=ejunction[lastrightjunc[s]]->end; - ejunction[i]->nreads_good=lastrightjunc[s]; - //fprintf(stderr,"end update ejunction[%d]->end to ejunction[%d]->end=%d\n",i,lastrightjunc,ejunction[lastrightjunc]->end); - } - i++; - } } } @@ -11587,8 +12109,8 @@ /* { // DEBUG ONLY for(int i=0;istrand) - fprintf(stderr,"Junction %d-%d:%d has nm=%f mm=%f nreads=%f nreads_good=%f leftsupport=%f and rightsupport=%f\n",junction[i]->start,junction[i]->end,junction[i]->strand, + if(junction[i]->strand && junction[i]->nreads>0 && junction[i]->nreads_good>0 && junction[i]->mm>=0) + fprintf(stderr,"Junction[%d] %d-%d:%d has nm=%f mm=%f nreads=%f nreads_good=%f leftsupport=%f and rightsupport=%f\n",i,junction[i]->start,junction[i]->end,junction[i]->strand, junction[i]->nm,junction[i]->mm,junction[i]->nreads, junction[i]->nreads_good,junction[i]->leftsupport,junction[i]->rightsupport); } @@ -11613,18 +12135,11 @@ GHash boundaryleft; GHash boundaryright; - //GIntHash starts[3]; - //GIntHash ends[3]; - + bool resort=false; + int njunc=junction.Count(); for (int n=0;nstrand; - if(!starts[s][readlist[n]->start]) starts[s].Add(readlist[n]->start,readlist[n]->read_count); - else starts[s][readlist[n]->start]+=readlist[n]->read_count; - if(!ends[s][readlist[n]->end]) ends[s].Add(readlist[n]->end,readlist[n]->read_count); - else ends[s][readlist[n]->end]+=readlist[n]->read_count;*/ - // version that does not adjust read but discards it completely bool keep=true; int i=0; @@ -11634,76 +12149,142 @@ fprintf(stderr," %d-%d\n",rd.segs[i].start,rd.segs[i].end); i=0;*/ - //int leftlen=0; + while(i just delete junction and make sure that add_read_to_group adds read to multiple groups //jd.nreads-=rd.read_count; - bool changeleft=false; - bool changeright=false; - // I need to adjust ends of read to match the adjusted junctions - if(rd.segs[i].end!=jd.start && jd.start>=rd.segs[i].start) { // if start is before adjusted junction start there is nothing I can do - if(jd.startjunctionsupport || (i && rd.juncs[i-1]->strand))) { // only extend if the exon shouldn't just get erased - rd.segs[i].end=jd.start; - changeleft=true; - } - } - - if(rd.segs[i+1].start!=jd.end && jd.end<=rd.segs[i+1].end) { // if end is after end of exon there is nothing I can do - if(jd.end>rd.segs[i+1].start) { - rd.segs[i+1].start=jd.end; - changeright=true; + if(jd.mm>=0 && (changeleft || changeright)) { + uint newstart=rd.segs[i].end; + uint newend=rd.segs[i+1].start; + int jk=-1; + int ek=-1; + if(changeleft) { + jk=abs(int(jd.nreads)); + if(junction[jk]->nreads<0) { // not a valid left support -> delete junction + newstart=rd.segs[i].start-1; + } + else { + newstart=junction[jk]->start; // junction jk is good + //fprintf(stderr,"junction has start from junction[jk=%d]\n",jk); + } } - else if(jd.end>rd.segs[i].end && (rd.segs[i+1].len()>junctionsupport || (i+1strand))) { - rd.segs[i+1].start=jd.end; - changeright=true; + if(changeright) { + ek=abs(int(jd.nreads_good)); + if(ejunction[ek]->nreads_good<0) { + newend=rd.segs[i+1].end+1; + } + else { + newend=ejunction[ek]->end; + //fprintf(stderr,"junction has end from junction[ek=%d]\n",ek); + } } - } - - if(changeleft) { - int k=(int)jd.nreads; - if(junction[k]->end==rd.segs[i+1].start && junction[k]->strand==rd.strand) rd.juncs.Put(i,junction.Get(k)); - else { - while(k>=0 && junction[k]->start==rd.segs[i].end) k--; - k++; - while(kstart==rd.segs[i].end) { - if(junction[k]->end==rd.segs[i+1].start && junction[k]->strand==rd.strand) { // found the right junction - rd.juncs.Put(i,junction.Get(k)); - //fprintf(stderr,"left junction[%d]->strand=%d vs this rd.juncs[%d]->strand=%d\n",k,junction[k]->strand,i,rd.juncs[i]->strand); - break; + if(newstart>=rd.segs[i].start && newend<=rd.segs[i+1].end) { // junction inside read boundaries + bool searchjunc=true; + rd.segs[i].end=newstart; + rd.segs[i+1].start=newend; + if(jk>0) { // search junctions + int k=jk; + while(k>0 && junction[k]->start==newstart) { + if(rd.strand==junction[k]->strand && junction[k]->end==newend) { + rd.juncs.Put(i,junction[k]); + searchjunc=false; + break; + } + k--; + } + if(searchjunc) { + k=jk+1; + while(kstart==newstart) { + if(rd.strand==junction[k]->strand && junction[k]->end==newend) { + rd.juncs.Put(i,junction[k]); + searchjunc=false; + break; + } + k++; + } + } + if(searchjunc) { // I did not find junction -> I need to create a new one + // first check if I already added such a junction + for(k=njunc;kstrand && junction[k]->start==newstart && junction[k]->end==newend) { + rd.juncs.Put(i,junction[k]); + searchjunc=false; + break; + } + } + if(searchjunc) { + if(!resort) { + junction.setSorted(false); + ejunction.setSorted(false); + resort=true; + } + //fprintf(stderr,"Add new junction:%d-%d:%d at position %d njunc=%d\n",newstart,newend,rd.strand,junction.Count(),njunc); + CJunction *junc=new CJunction(newstart,newend,rd.strand); + junction.Add(junc); + ejunction.Add(junc); + rd.juncs.Put(i,junc); + } } - k++; } - } - } - else if(changeright) { // why do I have else here? and not just if? - int k=(int)jd.nreads_good; - if(ejunction[k]->start==rd.segs[i].end && ejunction[k]->strand==rd.strand) rd.juncs.Put(i,ejunction.Get(k)); - else { - while(k>=0 && ejunction[k]->end==rd.segs[i+1].start) k--; - k++; - while(kend==rd.segs[i+1].start) { - if(ejunction[k]->start==rd.segs[i].end && junction[k]->strand==rd.strand) { // found the right junction - rd.juncs.Put(i,ejunction.Get(k)); - //fprintf(stderr,"right ejunction[%d]->strand=%d vs this rd.juncs[%d]->strand=%d\n",k,ejunction[k]->strand,i,rd.juncs[i]->strand); - break; + else { // ek>0 => search ejunctions + int k=ek; + while(k>0 && ejunction[k]->end==newend) { + if(rd.strand==ejunction[k]->strand && ejunction[k]->start==newstart) { + rd.juncs.Put(i,ejunction[k]); + searchjunc=false; + break; + } + k--; + } + if(searchjunc) { + k=ek+1; + while(kend==newend) { + if(rd.strand==ejunction[k]->strand && ejunction[k]->start==newstart) { + rd.juncs.Put(i,ejunction[k]); + searchjunc=false; + break; + } + k++; + } + } + if(searchjunc) { // I did not find junction -> I need to create a new one + // first check if I already added such a junction + for(k=njunc;kstrand && ejunction[k]->start==newstart && ejunction[k]->end==newend) { + rd.juncs.Put(i,ejunction[k]); + searchjunc=false; + break; + } + } + if(searchjunc) { + if(!resort) { + junction.setSorted(false); + ejunction.setSorted(false); + resort=true; + } + //fprintf(stderr,"Add new junction:%d-%d:%d at position %d njunc=%d\n",newstart,newend,rd.strand,junction.Count(),njunc); + CJunction *junc=new CJunction(newstart,newend,rd.strand); + junction.Add(junc); + ejunction.Add(junc); + rd.juncs.Put(i,ejunction[k]); + } } - k++; } } + // fprintf(stderr, "read[%d] adjusted to junction:%d-%d\n",n,rd.segs[i].end,rd.segs[i+1].start); } - //if(changeleft || changeright) fprintf(stderr, "read[%d] adjusted to junction:%d-%d\n",n,rd.segs[i].end,rd.segs[i+1].start); - - // because read might be poorly mapped I also have to unpair it if(keep) { // this is the first time I unpair read -> remove strand if pair has single exon? for(int p=0;pcolor=grcol; - //print STDERR "nextgr=$nextgr grcol=$grcol current group is at coords: ",$currgroup[$nextgr][0],"-",$currgroup[$nextgr][1],"\n"; //fprintf(stderr,"group %d id=%d: %u-%u col=%d from col=%d\n",nextgr,currgroup[nextgr]->grid,currgroup[nextgr]->start,currgroup[nextgr]->end,grcol,prevcol); if(nextgr == 1) { // unknown strand group @@ -12179,7 +12764,9 @@ int g=0; - for(int b=0;b TODO bool guide_ovlp=false; while(gexons.Count()>1 || guides[g]->endstartnode]->start)) { @@ -12474,8 +13061,10 @@ if(guides.Count()) process_refguides(graphno[s][b],edgeno[s][b],gpos[s][b],lastgpos[s][b],no2gnode[s][b],transfrag[s][b],s,guidetrf,bdata); + GVec trflong; // non-redundant long transfrags that I can use to guide the long read assemblies //process transfrags to eliminate noise, and set compatibilities, and node memberships - process_transfrags(s,graphno[s][b],edgeno[s][b],no2gnode[s][b],transfrag[s][b],tr2no[s][b],gpos[s][b],guidetrf,pred); + process_transfrags(s,graphno[s][b],edgeno[s][b],no2gnode[s][b],transfrag[s][b],tr2no[s][b],gpos[s][b],guidetrf,pred,trflong); + //get_trf_long(graphno[s][b],edgeno[s][b], gpos[s][b],no2gnode[s][b],transfrag[s][b],geneno,s,pred,trflong); /* { //DEBUG ONLY @@ -12514,10 +13103,11 @@ //fprintf(stderr,"guidetrf no=%d\n",guidetrf.Count()); + //if(!longreads) { // find transcripts now if(!rawreads) geneno=find_transcripts(graphno[s][b],edgeno[s][b],gpos[s][b],no2gnode[s][b],transfrag[s][b], - geneno,s,guidetrf,pred,guides,guidepred,bdata); - + geneno,s,guidetrf,guides,guidepred,bdata,trflong); + //} for(int g=0;gstrand=='+') s=1; // guide on positive strand else if(guides[g]->strand=='-') s=-1; // guide on negative strand - for(int i=1;iexons.Count();i++) add_junction((int)guides[g]->exons[i-1]->end,(int)guides[g]->exons[i]->start,gjunc,s); + for(int i=1;iexons.Count();i++) { //add_junction((int)guides[g]->exons[i-1]->end,(int)guides[g]->exons[i]->start,gjunc,s); + gjunc.AddIfNew(new CJunction(guides[g]->exons[i-1]->end,guides[g]->exons[i]->start,s),true); + //fprintf(stderr,"Add guide junction=%d-%d:%d from reference %s\n",guides[g]->exons[i-1]->end,guides[g]->exons[i]->start,s,guides[g]->getID()); + } } if(gjunc.Count()) { @@ -13305,7 +13898,7 @@ egjunc.setSorted(juncCmpEnd); int s=0; int e=0; - for(int i=0;istart,junction[i]->end,junction[i]->strand,junction[i]->rightsupport,junction[i]->nm,junction[i]->nreads); @@ -13459,7 +14052,6 @@ //if(nex>1) fprintf(stderr," %d-%d",rd.segs[i].start,rd.segs[i].end); if(i) { //fprintf(stderr,":%d",rd.juncs[i-1]->strand); - CJunction *nullj=NULL; if(modified) { // see if read uses modified junction -> correct it sprintf(sbuf, "%p", rd.juncs[i-1]); @@ -13470,16 +14062,9 @@ if(rd.segs[i-1].start<=jp->start) rd.segs[i-1].end=jp->start; if(rd.segs[i].end>=jp->end) rd.segs[i].start=jp->end; - if(!nullj) { - if(!junction[0]->start && !junction[0]->end) nullj=junction[0]; - else { - nullj=new CJunction(0, 0, 0); - junction.Add(nullj); - } - } rd.juncs[i-1]->nreads-=rd.read_count; //if(rd.juncs[i-1]->nreadsstrand=0; // this approach removes a perfectly valid (guide) junction which might not be the desired effect - rd.juncs[i-1]=nullj; + rd.juncs[i-1]=junction[0]; } else { @@ -13495,16 +14080,9 @@ if(rd.segs[i-1].start>rd.juncs[i-1]->start || rd.segs[i].endend) { if(rd.segs[i-1].end!=rd.juncs[i-1]->start && rd.segs[i-1].start<=rd.juncs[i-1]->start) rd.segs[i-1].end=rd.juncs[i-1]->start; if(rd.segs[i].start!=rd.juncs[i-1]->end && rd.segs[i].end>=rd.juncs[i-1]->end) rd.segs[i].start=rd.juncs[i-1]->end; - if(!nullj) { - if(!junction[0]->start && !junction[0]->end) nullj=junction[0]; - else { - nullj=new CJunction(0, 0, 0); - junction.Add(nullj); - } - } rd.juncs[i-1]->nreads-=rd.read_count; //if(rd.juncs[i-1]->nreadsstrand=0; // this approach removes a perfectly valid (guide) junction which might not be the desired effect - rd.juncs[i-1]=nullj; + rd.juncs[i-1]=junction[0]; } else { //if(rd.segs[i-1].end!=rd.juncs[i-1]->start || rd.segs[i].start!=rd.juncs[i-1]->end) fprintf(stderr," [chg rd from %d-%d to %d-%d]",rd.segs[i-1].end,rd.segs[i].start,rd.juncs[i-1]->start,rd.juncs[i-1]->end); @@ -14314,7 +14892,9 @@ } } - CExon ex(0,0,pred[0]->cov*pred[0]->len()); // this keeps the exon flow based on per bp coverage + float excov=pred[0]->cov; + if(!longreads) excov*=pred[0]->tlen; // it was len() before but that doesn't make any sense + CExon ex(0,0,excov); // this keeps the exon flow based on per bp coverage //CExon ex(0,0,pred[0]->exoncov[0]*pred[0]->exons[0].len()); // this keeps the exon flow based on read coverage (elen factor) //pred[0]->exoncov[0]=0; maxint->node.Add(ex); @@ -14357,7 +14937,7 @@ //fprintf(stderr,"pred[%d] intron[%d]:%d-%d introncov=%f exoncov=%f\n",0,j-1,pred[0]->exons[j-1].end,pred[0]->exons[j].start,introncov,exoncov); } } - nextmaxint=add_exon_to_maxint(nextmaxint,pred[0]->exons[j].start,pred[0]->exons[j].end,0,j,pred[0]->cov*pred[0]->len(),pred,overlap); // per bp coverage + nextmaxint=add_exon_to_maxint(nextmaxint,pred[0]->exons[j].start,pred[0]->exons[j].end,0,j,excov,pred,overlap); // per bp coverage //nextmaxint=add_exon_to_maxint(nextmaxint,pred[0]->exons[j].start,pred[0]->exons[j].end,0,j,pred[0]->exoncov[j]*pred[0]->exons[j].len(),pred,overlap); // per read coverage //pred[0]->exoncov[j]=0; } @@ -14380,10 +14960,12 @@ color.Add(n); + excov=pred[n]->cov; + if(!longreads) excov*=pred[n]->tlen; //CPred p(n,pred[n]->cov); // priority based on cov/bp CPred p(n,pred[n]->tlen*pred[n]->cov); // priority based on number of bases covered predord.Add(p); - nextmaxint=add_exon_to_maxint(nextmaxint,pred[n]->exons[0].start,pred[n]->exons[0].end,n,0,pred[n]->cov*pred[n]->len(),pred,overlap); // per bp coverage + nextmaxint=add_exon_to_maxint(nextmaxint,pred[n]->exons[0].start,pred[n]->exons[0].end,n,0,excov,pred,overlap); // per bp coverage //nextmaxint=add_exon_to_maxint(nextmaxint,pred[n]->exons[0].start,pred[n]->exons[0].end,n,0,pred[n]->exoncov[0]*pred[n]->exons[0].len(),pred,overlap); // read coverage //pred[n]->exoncov[0]=0; intron.clear(); @@ -14430,7 +15012,7 @@ } } } - nextintv=add_exon_to_maxint(nextintv,pred[n]->exons[j].start,pred[n]->exons[j].end,n,j,pred[n]->cov*pred[n]->len(),pred,overlap); // per bp coverage + nextintv=add_exon_to_maxint(nextintv,pred[n]->exons[j].start,pred[n]->exons[j].end,n,j,excov,pred,overlap); // per bp coverage //nextintv=add_exon_to_maxint(nextintv,pred[n]->exons[j].start,pred[n]->exons[j].end,n,j,pred[n]->exoncov[j]*pred[n]->exons[j].len(),pred,overlap); // read coverage //pred[n]->exoncov[j]=0; } @@ -14493,12 +15075,12 @@ uint anchor=longintronanchor; if(longreads && pred[n2]->cov>readthr) anchor=junctionsupport; if(pred[n2]->exons[0].len()exons.Last().len()flag=false; } else if(retainedintron(pred,n1,n2,lowintron)) { //if(ret>1 || pred[n2]->covcov) { - //fprintf(stderr,"falseflag: n2=%d has low intron coverage\n",n2); + //fprintf(stderr,"falseflag: pred[%d] n2=%d has low intron coverage\n",n2,n2); pred[n2]->flag=false; //} } @@ -14506,12 +15088,12 @@ //fprintf(stderr,"diff strand\n"); if(pred[n2]->exons.Count()==1) { // why was I restricting it to single exons? - //fprintf(stderr,"falseflag: ...strand elmination of n2=%d by n1=%d\n",n2,n1); + //fprintf(stderr,"falseflag: ...strand elmination of pred[%d] n2=%d by n1=%d\n",n2,n2,n1); pred[n2]->flag=false; } else if(!pred[n1]->t_eq && pred[n1]->exons.Count()==1) { if(pred[n1]->cov < pred[n2]->cov+singlethr) { - //fprintf(stderr,"flaseflag: n1=%d with 1 exon eliminated by n2=%d\n",n1,n2); + //fprintf(stderr,"flaseflag: pred[%d] n1=%d with 1 exon eliminated by n2=%d\n",n1,n1,n2); pred[n1]->flag=false; break; } @@ -14522,7 +15104,7 @@ }*/ } else if(pred[n2]->exons.Count()<=pred[n1]->exons.Count() && included_pred(pred,n1,n2,(uint)bundleData->start,bpcov)) { - //fprintf(stderr,"falseflag: n2=%d is included in n1=%d\n",n2,n1); + //fprintf(stderr,"falseflag: pred[%d] n2=%d is included in n1=%d\n",n2,n2,n1); pred[n2]->flag=false; } else if(!longreads){ @@ -14534,36 +15116,42 @@ break; } } - if(lowcov) pred[n2]->flag=false; + if(lowcov) { + pred[n2]->flag=false; + //fprintf(stderr,"falseflag: pred[%d] n2=%d has lowcov\n",n2,n2); + } } } // end pred[n1]->strand != pred[n2]->strand //else if(pred[n2]->covcov) pred[n2]->flag=false; // I am only considering inclusions here since this might change allocations else if(pred[n2]->exons.Count()<=pred[n1]->exons.Count() && pred[n1]->cov>pred[n2]->cov*DROP && included_pred(pred,n1,n2,(uint)bundleData->start,bpcov)) { pred[n2]->flag=false; - //fprintf(stderr,"falseflag: ...included elmination of n2=%d by n1=%d\n",n2,n1); + //fprintf(stderr,"falseflag: ...included elmination of pred[%d] n2=%d by n1=%d\n",n2,n2,n1); } else if(!pred[n1]->t_eq && n1 && ((!longreads && pred[n1]->exons.Count()<=2) || pred[n1]->cov < pred[n2]->cov+singlethr) && pred[n1]->exons.Count()exons.Count() && included_pred(pred,n2,n1,(uint)bundleData->start,bpcov)) { pred[n1]->flag=false; - //fprintf(stderr,"falseflag: ...included elmination of n1=%d by n2=%d\n",n1,n2); + //fprintf(stderr,"falseflag: ...included elmination of pred[%d] n1=%d by n2=%d\n",n1,n1,n2); break; } } else if(!pred[n1]->t_eq && n1 && ((!longreads && pred[n1]->exons.Count()<=2) || pred[n1]->cov < pred[n2]->cov+singlethr) && pred[n1]->exons.Count()<=pred[n2]->exons.Count() && included_pred(pred,n2,n1,(uint)bundleData->start,bpcov)) { - //fprintf(stderr,"falseflag: ...included elmination of n1=%d(%f) by n2=%d(%f)\n",n1,pred[n1]->cov,n2,pred[n2]->cov); + //fprintf(stderr,"falseflag: ...included elmination of pred[%d] n1=%d(%f) by n2=%d(%f)\n",n1,n1,pred[n1]->cov,n2,pred[n2]->cov); pred[n1]->flag=false; break; } - else if(longreads && pred[n2]->t_eq && pred[n2]->cov < DROP && pred[n2]->exons.Count()<=pred[n1]->exons.Count() && included_pred(pred,n1,n2,(uint)bundleData->start,bpcov,false)) pred[n2]->flag=false; + else if(longreads && !pred[n2]->t_eq && pred[n2]->cov < DROP && pred[n2]->exons.Count()<=pred[n1]->exons.Count() && included_pred(pred,n1,n2,(uint)bundleData->start,bpcov,false)) { + pred[n2]->flag=false; + //fprintf(stderr,"falseflag: pred[%d] n2=%d is included in n1=%d\n",n2,n2,n1); + } } //else if(!pred[n2]->t_eq && pred[n2]->exons.Count()==1 && pred[n2]->covcov && pred[n1]->startstart && pred[n2]->endend) { else if(!pred[n2]->t_eq && pred[n2]->exons.Count()==1 && ((!longreads && pred[n2]->covcov) || pred[n2]->covcov) && pred[n1]->start<=pred[n2]->start && pred[n2]->end<=pred[n1]->end) { - //fprintf(stderr,"falseflag: ...single exon elmination of n2=%d by n1=%d\n",n2,n1); + //fprintf(stderr,"falseflag: ...single exon elmination of pred[%d] n2=%d by n1=%d\n",n2,n2,n1); pred[n2]->flag=false; // delete intronic prediction if single exon and at realtively low coverage } else if(!longreads) { if(!pred[n1]->t_eq && pred[n1]->exons.Count()==1 && pred[n1]->covcov && pred[n2]->start<=pred[n1]->start && pred[n1]->end<=pred[n2]->end) { - //fprintf(stderr,"falseflag: ...single exon elmination of n1=%d by n2=%d\n",n1,n2); + //fprintf(stderr,"falseflag: ...single exon elmination of pred[%d] n1=%d by n2=%d\n",n1, n1,n2); pred[n1]->flag=false; break; } @@ -14636,126 +15224,155 @@ } */ - // recompute coverages - bool adjust=true; - while(adjust) { // by here I eliminated inclusions and intronic single exons - adjust=false; - - for(int n=0;nflag) { - /*if(!pred[n]->t_eq && !keep[n] && pred[n]->covflag=false; // this was commented and it was almost there Pr 29.8 for 534291 - else*/ - for(int i=0;iexons.Count();i++) - pred[n]->exoncov[i]=0; - } + if(longreads) { // eliminate predictions under threshold without doing the coverage adjustment nextmaxint=maxint; while(nextmaxint) { if((int)nextmaxint->start-bundleData->start exord; for(int i=0;inode.Count();i++) if(pred[nextmaxint->node[i].predno]->flag) { - //CPred ex(i,pred[nextmaxint->node[i].predno]->cov); CPred ex(i,pred[nextmaxint->node[i].predno]->tlen*pred[nextmaxint->node[i].predno]->cov); exord.Add(ex); //exord only remembers the non-false predictions - - //covsum+=pred[nextmaxint->node[i].predno]->cov; // priority based on cov/bp - //covsum+=pred[nextmaxint->node[i].predno]->tlen*pred[nextmaxint->node[i].predno]->cov; // priority based on number of bases covered - covsum+=nextmaxint->node[i].exoncov; // priority based on exoncov } - - if(covsum) { - float totalcov=get_cov(1,nextmaxint->start-bundleData->start,nextmaxint->end-bundleData->start,bpcov); + if(exord.Count()) { exord.Sort(predordCmp); // sort exons from the most highest coverage to lowest int i=exord[0].predno; int n=nextmaxint->node[i].predno; - int e=nextmaxint->node[i].exonno; - //float exoncov=totalcov*pred[n]->tlen*pred[n]->cov/covsum; // priority based on number of bases covered - //float exoncov=totalcov*pred[n]->cov/covsum; // priority based on per bp cov - float exoncov=totalcov*nextmaxint->node[i].exoncov/covsum; // priority based on per bp cov - pred[n]->exoncov[e]+=exoncov; - float usedcov=pred[n]->cov; - float strandcov[2]={0,0}; - if(longreads) { - if(pred[n]->strand=='+') strandcov[1]+=pred[n]->cov; - else if(pred[n]->strand=='-') strandcov[0]+=pred[n]->cov; - else { - strandcov[0]+=pred[n]->cov; - strandcov[1]+=pred[n]->cov; - } - } - - //fprintf(stderr,"maxint=%d-%d has first pred[%d] usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,usedcov); + float usedcov[2]={0,0}; + float multicov[2]={0,0}; + int s=0; + if(pred[n]->strand=='+') s=1; + usedcov[s]=pred[n]->cov; + if(pred[n]->exons.Count()>1) multicov[s]=usedcov[s]; + //fprintf(stderr,"maxint=%d-%d has first pred[%d] with cov=%f usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,pred[n]->cov,usedcov[s]); for(int j=1;jnode[i].predno; bool longunder=false; - if(longreads) { + if(pred[n]->strand=='+') s=1; + else s=0; + if(!pred[n]->t_eq) { float isofraclong=isofrac; if(pred[n]->cov>CHI_WIN) isofraclong=isofrac*ERROR_PERC*DROP; // more abundant transcript have higher error tollerance - if(pred[n]->cov>CHI_THR) isofraclong=isofrac*DROP; - if(pred[n]->strand=='+') { if(pred[n]->covstrand=='-') { if(pred[n]->covcovcovt_eq && (longunder || (!longreads && (pred[n]->covcovcovexons.Count()==1 && pred[n]->covcov<1 || (pred[n]->exons.Count()==2 && (pred[n]->covcovcov,usedcov); + else if(pred[n]->cov>CHI_THR) isofraclong=isofrac*DROP; + //else if(pred[n]->cov>1/ERROR_PERC) isofraclong=isofrac*DROP; + if(pred[n]->exons.Count()>1) { + if((!multicov[s] && pred[n]->covcovcovcovcov,usedcov[s]); pred[n]->flag=false; } else { - e=nextmaxint->node[i].exonno; - //exoncov=totalcov*pred[n]->cov/covsum; // priority based on per bp cov - //float exoncov=totalcov*pred[n]->tlen*pred[n]->cov/covsum; // priority based on number of bases covered - exoncov=totalcov*nextmaxint->node[i].exoncov/covsum; // priority based on per bp cov - pred[n]->exoncov[e]+=exoncov; - usedcov+=pred[n]->cov; - if(longreads) { - if(pred[n]->strand=='+') strandcov[1]+=pred[n]->cov; - else if(pred[n]->strand=='-') strandcov[0]+=pred[n]->cov; - else { - strandcov[0]+=pred[n]->cov; - strandcov[1]+=pred[n]->cov; - } - } - //fprintf(stderr,"maxint=%d-%d add pred[%d]->cov=%f to usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,pred[n]->cov,usedcov); + usedcov[s]+=pred[n]->cov; + if(pred[n]->exons.Count()>1) multicov[s]+=pred[n]->cov; + //fprintf(stderr,"maxint=%d-%d add pred[%d]->cov=%f to usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,pred[n]->cov,usedcov[s]); } } - } } nextmaxint=nextmaxint->next; } + } + else { // recompute coverages -> mostly for short reads; long reads should be more reliable in the way coverages are estimated + bool adjust=true; + while(adjust) { // by here I eliminated inclusions and intronic single exons + adjust=false; + + for(int n=0;nflag) { + /*if(!pred[n]->t_eq && !keep[n] && pred[n]->covflag=false; // this was commented and it was almost there Pr 29.8 for 534291 + else*/ + for(int i=0;iexons.Count();i++) + pred[n]->exoncov[i]=0; + } - //for(int p=0;pflag) { + nextmaxint=maxint; + while(nextmaxint) { + if((int)nextmaxint->start-bundleData->start exord; + for(int i=0;inode.Count();i++) if(pred[nextmaxint->node[i].predno]->flag) { + //CPred ex(i,pred[nextmaxint->node[i].predno]->cov); + CPred ex(i,pred[nextmaxint->node[i].predno]->tlen*pred[nextmaxint->node[i].predno]->cov); + exord.Add(ex); //exord only remembers the non-false predictions - // compute prediction coverage first - pred[n]->cov=0; - for(int i=0;iexons.Count();i++) { - pred[n]->cov+=pred[n]->exoncov[i]; - pred[n]->exoncov[i]/=pred[n]->exons[i].len(); + //covsum+=pred[nextmaxint->node[i].predno]->cov; // priority based on cov/bp + //covsum+=pred[nextmaxint->node[i].predno]->tlen*pred[nextmaxint->node[i].predno]->cov; // priority based on number of bases covered + covsum+=nextmaxint->node[i].exoncov; // priority based on exoncov + } + + if(covsum) { + float totalcov=get_cov(1,nextmaxint->start-bundleData->start,nextmaxint->end-bundleData->start,bpcov); + exord.Sort(predordCmp); // sort exons from the most highest coverage to lowest + int i=exord[0].predno; + int n=nextmaxint->node[i].predno; + int e=nextmaxint->node[i].exonno; + //float exoncov=totalcov*pred[n]->tlen*pred[n]->cov/covsum; // priority based on number of bases covered + //float exoncov=totalcov*pred[n]->cov/covsum; // priority based on per bp cov + float exoncov=totalcov*nextmaxint->node[i].exoncov/covsum; // priority based on per bp cov + pred[n]->exoncov[e]+=exoncov; + //fprintf(stderr,"pred[%d]->exoncov[%d]=%f exoncov=%f totalcov=%f covsum=%f\n",n,e,pred[n]->exoncov[e],exoncov,totalcov,covsum); + float usedcov=pred[n]->cov; + + //fprintf(stderr,"maxint=%d-%d has first pred[%d] with cov=%f usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,pred[n]->cov,usedcov); + + for(int j=1;jnode[i].predno; + if(!pred[n]->t_eq && (pred[n]->covcovcovexons.Count()==1 && pred[n]->covcov<1 || + (pred[n]->exons.Count()==2 && (pred[n]->covcovcov,usedcov); + pred[n]->flag=false; + } + else { + e=nextmaxint->node[i].exonno; + //exoncov=totalcov*pred[n]->cov/covsum; // priority based on per bp cov + //float exoncov=totalcov*pred[n]->tlen*pred[n]->cov/covsum; // priority based on number of bases covered + exoncov=totalcov*nextmaxint->node[i].exoncov/covsum; // priority based on per bp cov + pred[n]->exoncov[e]+=exoncov; + usedcov+=pred[n]->cov; + //fprintf(stderr,"maxint=%d-%d add pred[%d]->cov=%f to usedcov=%f\n",nextmaxint->start,nextmaxint->end,n,pred[n]->cov,usedcov); + } + } + } } - pred[n]->cov/=pred[n]->tlen; - if(!pred[n]->t_eq) { - if(pred[n]->cov<1 || (pred[n]->exons.Count()==1 && pred[n]->covcovexons.Count()==1 && pred[n]->covnext; + } + + //for(int p=0;pflag) { + + // compute prediction coverage first + pred[n]->cov=0; + for(int i=0;iexons.Count();i++) { + pred[n]->cov+=pred[n]->exoncov[i]; + pred[n]->exoncov[i]/=pred[n]->exons[i].len(); + } + pred[n]->cov/=pred[n]->tlen; + if(!pred[n]->t_eq) { + if(pred[n]->cov<1 || (pred[n]->exons.Count()==1 && pred[n]->covcovexons.Count()==1 && pred[n]->covflag=false; + //fprintf(stderr,"falseflag: pred[%d]->cov=%f eliminated due to low coverage\n",n,pred[n]->cov); + adjust=true; + } + } + else if(longreads && pred[n]->covflag=false; - //fprintf(stderr,"falseflag: pred[%d]->cov=%f eliminated due to low coverage\n",n,pred[n]->cov); - adjust=true; + //fprintf(stderr,"falseflag: too low cov of pred[%d]:%f\n",n,pred[n]->cov); } + //predord[p].cov=pred[n]->tlen*pred[n]->cov; ** use this + //predord[p].cov=pred[n]->cov; + //fprintf(stderr,"coverage of pred[%d]=%f\n",n,pred[n]->cov); } - else if(longreads && pred[n]->covflag=false; - //fprintf(stderr,"falseflag: too low cov of pred[%d]:%f\n",n,pred[n]->cov); - } - //predord[p].cov=pred[n]->tlen*pred[n]->cov; ** use this - //predord[p].cov=pred[n]->cov; - //fprintf(stderr,"coverage of pred[%d]=%f\n",n,pred[n]->cov); } } } @@ -15230,6 +15847,20 @@ // print transcripts including the necessary isoform fraction cleanings GList& pred = bundleData->pred; + + /* + { // DEBUG ONLY + fprintf(stderr,"Pred set before sorting:\n"); + for(int i=0;it_eq) fprintf(stderr,"%s ",pred[i]->t_eq->getID()); + fprintf(stderr,"pred[%d]:%d-%d (cov=%f, readcov=%f, strand=%c):",i,pred[i]->start,pred[i]->end,pred[i]->cov,pred[i]->tlen*pred[i]->cov,pred[i]->strand); + for(int j=0;jexons.Count();j++) fprintf(stderr," %d-%d",pred[i]->exons[j].start,pred[i]->exons[j].end); + fprintf(stderr,"\n"); + } + fprintf(stderr,"\n"); + } + */ + int npred=pred.Count(); pred.setSorted(predCmp); @@ -15430,7 +16061,7 @@ for(int i=0;it_eq) fprintf(stderr,"%s ",pred[i]->t_eq->getID()); fprintf(stderr,"pred[%d]:%d-%d (cov=%f, readcov=%f, strand=%c):",i,pred[i]->start,pred[i]->end,pred[i]->cov,pred[i]->tlen*pred[i]->cov,pred[i]->strand); - for(int j=0;jexons.Count();j++) fprintf(stderr," %d-%d",pred[i]->exons[j].start,pred[i]->exons[j].end); + for(int j=0;jexons.Count();j++) fprintf(stderr," %d-%d(%f)",pred[i]->exons[j].start,pred[i]->exons[j].end,pred[i]->exoncov[j]); fprintf(stderr,"\n"); } fprintf(stderr,"\n"); @@ -15608,7 +16239,7 @@ if(pred[n]->t_eq && pred[m]->t_eq && pred[n]->t_eq!=pred[m]->t_eq) { m++; continue;} // both are equal but represent different transcripts if(pred[n]->exons.Count()==1) { - if(pred[m]->cov>ERROR_PERC*pred[n]->cov && pred[n]->cov>ERROR_PERC*pred[m]->cov) { // predictions close to each other in abundance + if(pred[m]->cov>ERROR_PERC*pred[n]->cov && pred[n]->cov>ERROR_PERC*pred[m]->cov) { // predictions close to each other in abundance -> otherwise I just delete the less abundant overlapping single exon gene if(!pred[m]->t_eq && pred[n]->end>pred[m]->end) pred[m]->end=pred[n]->end; @@ -15914,7 +16545,7 @@ for(int i=0;it_eq) fprintf(stderr,"%s ",pred[i]->t_eq->getID()); fprintf(stderr,"pred[%d]:%d-%d (cov=%f, readcov=%f, strand=%c):",i,pred[i]->start,pred[i]->end,pred[i]->cov,pred[i]->tlen*pred[i]->cov,pred[i]->strand); - for(int j=0;jexons.Count();j++) fprintf(stderr," %d-%d",pred[i]->exons[j].start,pred[i]->exons[j].end); + for(int j=0;jexons.Count();j++) fprintf(stderr," %d-%d(%f)",pred[i]->exons[j].start,pred[i]->exons[j].end,pred[i]->exoncov[j]); fprintf(stderr,"\n"); } } diff -Nru stringtie-2.1.1+ds/rlink.h stringtie-2.1.2+ds/rlink.h --- stringtie-2.1.1+ds/rlink.h 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/rlink.h 2020-04-21 20:54:10.000000000 +0000 @@ -49,10 +49,8 @@ //collect all refguide transcripts for a single genomic sequence struct GRefData { GList rnas; //all transcripts on this genomic seq - //GList loci; int gseq_id; const char* gseq_name; - //GList tdata; //transcript data (uptr holder for all rnas loaded here) GRefData(int gid=-1):rnas(false,true,false),gseq_id(gid),gseq_name(NULL) { gseq_id=gid; if (gseq_id>=0) @@ -93,6 +91,48 @@ cov(_cov),bid(_bid),nextnode(_nextnode) {} }; + +enum GPFType { + GPFT_NONE=0, + GPFT_TSS, + GPFT_CPAS +}; //on 4 bits: maximum 15 types + +struct GPtFeature { //point feature (single coordinate) + GPFType ftype : 4; + int ref_id: 26; //index in a reftable[] with reference names, max 67,108,863 + int strand: 2; //-1=-, 0=unstranded, +1=+ + uint coord; //genomic coordinate for this feature + GPtFeature(GPFType ft=GPFT_NONE, int rid=-1, int _strand=0, uint loc=0):ftype(ft), + ref_id(rid), strand(_strand), coord(loc) {} + bool operator<(const GPtFeature &o) { return coordgseqs + GList pfs; //all point feature on this genomic seq, sorted + GRefPtData(int gid=-1):ref_id(gid), pfs(true,false,false) { } + + void add(GPtFeature* t) { //adds a fully formed GPtFeature record + if (ref_id!=t->ref_id || ref_id<0 || t->ref_id<0) + GError("Error: invalid call to GRefPtData::add() - cannot add feature with ref id %d to ref data id %d!\n", + t->ref_id, ref_id); + pfs.Add(t); + } + //sorting by unique ref_id + bool operator==(GRefPtData& d){ + return ref_id==d.ref_id; + } + bool operator<(GRefPtData& d){ + return (ref_id trf; // transfrags that pass the node + bool hardstart:1; // verified/strong start + bool hardend:1; // verified/strong end //CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0,float f=0):GSeg(s,e),nodeid(id),cov(nodecov),capacity(cap),rate(r),frag(f),child(),parent(),childpat(),parentpat(),trf(){} CGraphnode(int s=0,int e=0,unsigned int id=MAX_NODE,float nodecov=0,float cap=0,float r=0):GSeg(s,e), - nodeid(id),cov(nodecov),capacity(cap),rate(r),child(),parent(),childpat(),parentpat(),trf(){} + nodeid(id),cov(nodecov),capacity(cap),rate(r),child(),parent(),childpat(),parentpat(),trf(),hardstart(false),hardend(false){} }; // # 0: strand; 1: start; 2: end; 3: nreads; 4: nreads_good; @@ -586,7 +628,7 @@ int idx; //index in the main bundles array int start; int end; - unsigned long numreads; // number of reads in bundles + unsigned long numreads; // number of reads in this bundle /* float wnumreads; // NEW: weighted numreads; a multi-mapped read mapped in 2 places will contribute only 0.5 double sumreads; // sum of all reads' lengths in bundle @@ -608,7 +650,7 @@ GVec bpcov[3]; // this needs to be changed to a more inteligent way of storing the data GList junction; GPVec keepguides; - //GPVec covguides; + GPVec ptfs; //point features for this bundle GList pred; RC_BundleData* rc_data; BundleData():status(BUNDLE_STATUS_CLEAR), idx(0), start(0), end(0), @@ -616,7 +658,7 @@ num_fragments(0), frag_len(0),sum_cov(0),covflags(0), refseq(), gseq(NULL), readlist(false,true), //bpcov(1024), junction(true, true, true), - keepguides(false), pred(false), rc_data(NULL) { + keepguides(false), ptfs(false), pred(false), rc_data(NULL) { for(int i=0;i<3;i++) bpcov[i].setCapacity(1024); } @@ -655,6 +697,7 @@ void Clear() { keepguides.Clear(); + ptfs.Clear(); pred.Clear(); pred.setSorted(false); readlist.Clear(); diff -Nru stringtie-2.1.1+ds/stringtie.cpp stringtie-2.1.2+ds/stringtie.cpp --- stringtie-2.1.1+ds/stringtie.cpp 2020-01-30 15:09:03.000000000 +0000 +++ stringtie-2.1.2+ds/stringtie.cpp 2020-04-21 20:54:10.000000000 +0000 @@ -11,7 +11,7 @@ #include "proc_mem.h" #endif -#define VERSION "2.1.1" +#define VERSION "2.1.2" //#define DEBUGPRINT 1 @@ -32,8 +32,8 @@ #define USAGE "StringTie v" VERSION " usage:\n\ stringtie [-G ] [-l