diff -Nru spaln-2.4.7+dfsg/debian/changelog spaln-2.4.9a+dfsg/debian/changelog --- spaln-2.4.7+dfsg/debian/changelog 2022-01-29 09:41:31.000000000 +0000 +++ spaln-2.4.9a+dfsg/debian/changelog 2022-08-24 14:45:27.000000000 +0000 @@ -1,3 +1,12 @@ +spaln (2.4.9a+dfsg-1) unstable; urgency=medium + + * Team upload. + * Avoid using makeidx.pl in autopkgtest + * Standards-Version: 4.6.1 (routine-update) + * Set upstream metadata fields: Repository-Browse. + + -- Andreas Tille Wed, 24 Aug 2022 16:45:27 +0200 + spaln (2.4.7+dfsg-1) unstable; urgency=medium * Team upload. diff -Nru spaln-2.4.7+dfsg/debian/control spaln-2.4.9a+dfsg/debian/control --- spaln-2.4.7+dfsg/debian/control 2022-01-29 09:41:31.000000000 +0000 +++ spaln-2.4.9a+dfsg/debian/control 2022-08-24 14:45:27.000000000 +0000 @@ -5,7 +5,7 @@ Priority: optional Build-Depends: debhelper-compat (= 13), zlib1g-dev -Standards-Version: 4.6.0 +Standards-Version: 4.6.1 Vcs-Browser: https://salsa.debian.org/med-team/spaln Vcs-Git: https://salsa.debian.org/med-team/spaln.git Homepage: https://github.com/ogotoh/spaln diff -Nru spaln-2.4.7+dfsg/debian/tests/run-unit-test spaln-2.4.9a+dfsg/debian/tests/run-unit-test --- spaln-2.4.7+dfsg/debian/tests/run-unit-test 2022-01-29 09:41:31.000000000 +0000 +++ spaln-2.4.9a+dfsg/debian/tests/run-unit-test 2022-08-24 14:45:27.000000000 +0000 @@ -15,7 +15,12 @@ #set -x #export ALN_DBS="${AUTOPKGTEST_TMP}" -perl makeidx.pl -v -inp dictdisc_g.gf.gz +NCPUS=`grep '^cpu cores' /proc/cpuinfo | head -n 1 | sed 's/cpu cores[[:space:]:]\+//'` + +# Avoid using makeidx.pl (see https://github.com/ogotoh/spaln/issues/46#issuecomment-1025331901) +#perl makeidx.pl -v -inp dictdisc_g.gf.gz +spaln -W -KD -t${NCPUS} dictdisc_g.gf.gz +spaln -W -KP -t${NCPUS} dictdisc_g.gf.gz spaln -Q7 -d dictdisc_g -T dictdisc dictdisc.faa.gz spaln -Q7 -d dictdisc_g -yS -T dictdisc -O12 -g dictdisc.cf.gz sortgrcd -O15 -F2 dictdisc.cf.grd.gz diff -Nru spaln-2.4.7+dfsg/debian/upstream/metadata spaln-2.4.9a+dfsg/debian/upstream/metadata --- spaln-2.4.7+dfsg/debian/upstream/metadata 2022-01-29 09:41:31.000000000 +0000 +++ spaln-2.4.9a+dfsg/debian/upstream/metadata 2022-08-24 14:45:27.000000000 +0000 @@ -31,3 +31,4 @@ Entry: OMICS_21171 - Name: bio.tools Entry: NA +Repository-Browse: https://github.com/ogotoh/spaln diff -Nru spaln-2.4.7+dfsg/README.md spaln-2.4.9a+dfsg/README.md --- spaln-2.4.7+dfsg/README.md 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/README.md 2022-05-12 01:54:43.000000000 +0000 @@ -1,8 +1,8 @@ # SPALN information ### Map and align a set of cDNA/EST or protein sequences onto a genome -#### Present Version 2.4.7 -#### Last updated: 2022-01-28 +#### Present Version 2.4.9a +#### Last updated: 2022-05-12 - [Overview](#Ov) - [Install](#Inst) @@ -108,7 +108,7 @@ not specified, *MAX_GENE* is also estimated from the genome size. Don't forget to specify *MAX_GENE* if xxxgnm.gf represents only a part of the genome!! Otherwise, *MAX_GENE* may be seriously underestimated. * Options : (default value) - * -g: The outputs are gzipped. + * -g: The outputs except for X.grp are gzipped. * -t*N*: Number of threads. (1) * ~~-E: Generate local lookup table.~~ * -yX: Format for remote queries (more sensitive but less economic than default) @@ -218,9 +218,12 @@ * -pa*N*: Terminal polyA or polyT sequence longer than *N* (12) is trimmed off and the orientation is fixed accordingly. If *N* = 0 or empty, these functionalities are disabled. * -pi: Mark exon-intron junctions by color in the alignment (-O1). + * -pn: Prohibit ovewrite of existing output files (ask). + * -po: Allow ovewrite of existing output files (ask). * -pq: Suppress warning messages sent to *stderr*. * -pw: Report result even if alignment score is below threshold value. * -px: Suppress self-comparisons in the execution mode (C) or (D). + * -pT: Exclude termination codon from output (include). * -u *N*: Gap-extension penalty (3, 2, 2) * -v *N*: Gap-opening penalty (8, 6, 9) * -xB *S*: Bit pattern of seeds used for HSP search at level 1 @@ -309,6 +312,19 @@ ``` ## Changes from previous version +## Changes in version 2.4.9a +1. Change several codes which might cause compiler-dependent compilation errors. + +## Changes in version 2.4.9 +1. The -g option at the format time compresses all output files except for X.grp. According to this change, the format of X.grp is slightly modified, which does not necessitate reformat of existing files. +2. To guard existing files from accidental loss, the propriety of overwrite of each existing file is asked by default. The -pn (always skip) or -po (always overwrite) option evades this inquiry. +3. The default memory size used for core sort in *sortgrcd* has been changed from 1MB to 16MB. + +## Changes in version 2.4.8 +1. By default, termination codon is included in the output of any form with a protein query. Use -pT option to get the previous forms (without termination codon) of outputs. +2. Enable to read long (>= 2**31B) gzipped files by *spaln* and *sortgrcd*. +3. Fix a bug at the format time with -g option. + ## Changes in version 2.4.7 1. The lower limit of query length has been halved, which improved mappability of short queries. 2. A serious bug for DNA queries with -LS option has been fixed. @@ -402,4 +418,4 @@ * * * -Copyright (c) 1997-2021 Osamu Gotoh (o.gotoh@aist.go.jp) All Rights Reserved. +Copyright (c) 1997-2022 Osamu Gotoh (o.gotoh@aist.go.jp) All Rights Reserved. diff -Nru spaln-2.4.7+dfsg/src/adddef.h spaln-2.4.9a+dfsg/src/adddef.h --- spaln-2.4.7+dfsg/src/adddef.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/adddef.h 2022-05-12 01:54:43.000000000 +0000 @@ -28,6 +28,7 @@ #include #include +#include typedef unsigned char CHAR; typedef unsigned short SHORT; @@ -88,16 +89,28 @@ #ifndef USE_ZLIB #define USE_ZLIB 1 #endif + #if USE_ZLIB #include static const char gz_ext[] = ".gz"; #if ZLIB_VERNUM < 0x12c0 + +extern size_t lgzread(gzFile gzfd, char* buf, size_t sz); +extern size_t lgzwrite(gzFile gzfd, const char* buf, size_t sz); + inline size_t fread(void* buf, size_t s, size_t c, gzFile gzfd) { - int rv = gzread(gzfd, buf, s * c); - return (rv < 0? 0: rv / s); -} -inline size_t fwrite(const void* buf, size_t s, size_t c, gzFile gzfd) { - return (gzwrite(gzfd, buf, s * c) / s); + size_t sz = s * c; + size_t rv = (sz > size_t(INT_MAX))? + lgzread(gzfd, (char*) buf, sz): + gzread(gzfd, buf, sz); + return (rv <= 0? rv: rv / s); +} +inline size_t fwrite(const void* buf, size_t s, size_t c, gzFile gzfd) { + size_t sz = s * c; + size_t rv = (sz > size_t(INT_MAX))? + lgzwrite(gzfd, (const char*) buf, sz): + gzwrite(gzfd, buf, sz); + return (rv <= 0? rv: rv / s); } inline int fputs(const char* buf, gzFile gzfd) { return (gzputs(gzfd, buf)); @@ -158,6 +171,16 @@ extern const char* not_found; extern const char* read_error; extern const char* write_error; +inline void fatal(const char* format,...) +{ + va_list args; + + va_start(args, format); + (void) vfprintf(stderr, format, args); + putc('\n', stderr); + va_end(args); + exit(1); +} /* etc. */ diff -Nru spaln-2.4.7+dfsg/src/aln2.cc spaln-2.4.9a+dfsg/src/aln2.cc --- spaln-2.4.7+dfsg/src/aln2.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/aln2.cc 2022-05-12 01:54:43.000000000 +0000 @@ -21,7 +21,6 @@ *****************************************************************************/ #include "aln.h" -#include "utilseq.h" #include static int estimlen(int na, int nb, const SKL* skl); @@ -91,6 +90,7 @@ setSimmtxes((ComPmt) dvsp, use_mdm); // default if (alprm2.sss < 0.) alprm2.sss = defSss[algmode.crs]; if (alprm2.z < 0) alprm2.z = dvsp? def_alprm2z: 0; + if (dvsp == 1) alprm2.jneibr /= 2; return (dvsp); } @@ -106,22 +106,18 @@ LongGEP((VTYPE) (-alp->u1 * Vab)), diffu(LongGEP - BasicGEP), LongGOP(BasicGOP - diffu * alp->k1), - MaxGapL(0), ExtraGOP(0), GapE1(0), GapE2(0), - GapW1(0), GapW2(0), GapW3(0), GapW3L(0), - pmt(0), codepot(0), exinpot(0), eijpat(0), IntPen(0) -{ - GOP[0] = 0; - GOP[1] = BasicGOP; - GOP[2] = LongGOP; + GOP{0, BasicGOP, LongGOP} +{ int step = (DvsP == 3)? 1: 3; codonk1 = alp->ls == 3? step * alp->k1: LARGEN; if (!seqs[0]->inex.intr && !seqs[1]->inex.intr) return; // without splice - eijpat = new EijPat(DvsP); // boundary signal - if (DvsP || algmode.mns == 0) codepot = new CodePot(); // coding potential - if (!DvsP) { // C vs G - int zZ = (alprm2.z > 0) + 2 * (alprm2.Z > 0); - if (zZ) exinpot = new ExinPot(zZ); - } else { // A vs G + eijpat = new EijPat(DvsP); // boundary signal + if (DvsP || algmode.mns == 0) + codepot = new ExinPot(static_cast(Iefp::CP)); // coding potential + if (!DvsP) { // C vs G + if (alprm2.z > 0) exonpot = new ExinPot(static_cast(Iefp::EP)); + if (alprm2.Z > 0) intnpot = new ExinPot(static_cast(Iefp::IP)); + } else if (DvsP != 3) { // A vs G ++Nrow; ++Nrwb; MaxGapL = 100; ExtraGOP = (VTYPE) (-alprm2.x * Vab); @@ -132,18 +128,19 @@ GapW3 = BasicGOP + BasicGEP; GapW3L = LongGOP + LongGEP; pmt = new Premat(seqs); - if (alprm2.Z > 0) exinpot = new ExinPot(2); + if (alprm2.Z > 0) intnpot = new ExinPot(static_cast(Iefp::IP), 4); } - IntPen = new IntronPenalty(Vab, DvsP, eijpat, exinpot); + if (DvsP != 3) IntPen = new IntronPenalty(Vab, DvsP, eijpat, intnpot); } PwdB::~PwdB() { delete pmt; - delete IntPen; delete codepot; - delete exinpot; + delete intnpot; + delete exonpot; delete eijpat; + delete IntPen; } void putvar(VTYPE x) @@ -355,12 +352,12 @@ int mlb = cc->mlb + OutPrm.AllowdOverlap; while ((--cw)->mrb > mlb) { - if (cw->mark > 0) continue; /* active */ + if (cw->mark > 0) continue; // active if (cc->mrb - cw->mlb > OutPrm.AllowdOverlap && cc->nrb - cw->nlb > OutPrm.AllowdOverlap && cw->nrb - cc->nlb > OutPrm.AllowdOverlap) { if (cc->val < cw->val) cc->mark = -1; - else cw->mark = -1; /* delete */ + else cw->mark = -1; // delete } } } @@ -379,11 +376,11 @@ clny->mlb = clny->mrb = 0; qsort((UPTR) (clny + 1), (INT) no_clny, sizeof(COLONY), (CMPF) bymrb); for (cc = cix; cc > clny + 1; --cc) - if (cc->mark == 0) detectoverlap(cc); /* non-active */ + if (cc->mark == 0) detectoverlap(cc); // non-active qsort((UPTR) (clny + 1), (INT) no_clny, sizeof(COLONY), (CMPF) byclno); for (COLONY* cw = cc = clny + 1; cw <= cix; ++cw) { int nc = 0; - if (cw->mark >= 0) { /* retain */ + if (cw->mark >= 0) { // retain nc = cc - clny; *cc++ = *cw; } @@ -401,11 +398,11 @@ clny->mlb = clny->mrb = 0; qsort((UPTR) (clny + 1), (INT) no_clny, sizeof(COLONY), (CMPF) byscr); for (cc = cix; cc > clny + OutPrm.NoOut; --cc) - if (cc->mark == 0) cc->mark = -1; /* non-active */ + if (cc->mark == 0) cc->mark = -1; // non-active qsort((UPTR) (clny + 1), (INT) no_clny, sizeof(COLONY), (CMPF) byclno); for (COLONY* cw = cc = clny + 1; cw <= cix; ++cw) { int nc = 0; - if (cw->mark >= 0) { /* retain */ + if (cw->mark >= 0) { // retain nc = cc - clny; *cc++ = *cw; } @@ -415,7 +412,7 @@ no_clny = cc - clny; } -Colonies::Colonies(int n) : no_clny(0) +Colonies::Colonies(int n) { if (n == 0) n = OutPrm.NoOut; OutPrm.MaxOut = n + MAX_COLONY; diff -Nru spaln-2.4.7+dfsg/src/aln.h spaln-2.4.9a+dfsg/src/aln.h --- spaln-2.4.7+dfsg/src/aln.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/aln.h 2022-05-12 01:54:43.000000000 +0000 @@ -254,8 +254,9 @@ VTYPE GapW3 = 0; VTYPE GapW3L = 0; Premat* pmt = 0; - CodePot* codepot = 0; - ExinPot* exinpot = 0; + ExinPot* codepot = 0; + ExinPot* intnpot = 0; + ExinPot* exonpot = 0; EijPat* eijpat = 0; IntronPenalty* IntPen = 0; @@ -282,8 +283,8 @@ if (d <= codonk1) return unp; return (unp + diffu * (d - codonk1)); } - VTYPE GapPenalty3(int i) const {return GapPenalty3(i, BasicGOP);} VTYPE GapPenalty3(int i, VTYPE bgop) const; + VTYPE GapPenalty3(int i) const {return GapPenalty3(i, BasicGOP);} VTYPE UnpPenalty3(int i) const { int d = i / 3; VTYPE unp = d * BasicGEP; @@ -307,7 +308,7 @@ public: FILE** fds = 0; void getopt(const char* arg); - void setup(const char* prefix); + int setup(const char* prefix); void alnoutput(Seq** sqs, Gsinfo* GsI); int end() {return (n_out_modes);} AlnOutModes() { diff -Nru spaln-2.4.7+dfsg/src/blksrc.cc spaln-2.4.9a+dfsg/src/blksrc.cc --- spaln-2.4.7+dfsg/src/blksrc.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/blksrc.cc 2022-05-12 01:54:43.000000000 +0000 @@ -184,26 +184,24 @@ *****************************************************/ MakeDbs::MakeDbs(const char* dbn, int molc) - : recnbr(0), cridxf(false), isaa(molc == PROTEIN), - b(0), fgrp(0), fseq(0), fidx(0), fent(0) -#if USE_ZLIB - , gzseq(0) -#endif + : isaa(molc == PROTEIN) { entryprv[0] = '\0'; dbname = strrealloc(0, dbn); #if USE_ZLIB if (OutPrm.gzipped) { gzseq = gzopenpbe("", dbname, SGZ_EXT, "w", 2); + gzidx = gzopenpbe("", dbname, IDZ_EXT, "w", 2); + gzent = gzopenpbe("", dbname, ENZ_EXT, "w", 2); } else #endif { fseq = fopenpbe("", dbname, SEQ_EXT, "w", 2); + fidx = fopenpbe("", dbname, IDX_EXT, "w+", 2); + fent = fopenpbe("", dbname, ENT_EXT, "w+", 2); } vclear(&rec); - fidx = fopenpbe("", dbname, IDX_EXT, "w+", 2); fgrp = fopenpbe("", dbname, GRP_EXT, "w", 2); - fent = fopenpbe("", dbname, ENT_EXT, "w+", 2); } static DbsRec* rbuf; @@ -214,26 +212,72 @@ return(strcmp(cbuf + rbuf[*a].entptr, cbuf + rbuf[*b].entptr)); } -void MakeDbs::mkidx() +template +void MakeDbs::read_ent(file_t fd) { - if (!cridxf) return; // has been sorted - size_t flen = ftell(fent); + size_t flen = ftell(fd); cbuf = new char[flen]; - rewind(fent); - if (fread(cbuf, sizeof(char), flen, fent) != flen) + if (gzent) { + fclose(fd); + gzent = gzopenpbe("", dbname, ENZ_EXT, "r", 2); + } else rewind(fd); + if (fread(cbuf, sizeof(char), flen, fd) != flen) fatal("Corrupted entry file !\n"); - rbuf = new DbsRec[recnbr]; - rewind(fidx); - if (fread(rbuf, sizeof(DbsRec), recnbr, fidx) != recnbr) +} + +template +void MakeDbs::read_idx(file_t fd) +{ + if (gzidx) { + fclose(fd); + gzidx = gzopenpbe("", dbname, IDZ_EXT, "r", 2); + } else + rewind(fd); + if (fread(rbuf, sizeof(DbsRec), recnbr, fd) != recnbr) fatal("Corrupted index file !\n"); +} + +template +void MakeDbs::write_idx(file_t fd) +{ + if (fwrite(&rec, sizeof(DbsRec), 1, fd) != 1) + fatal(write_error, "dbs.rec"); + ++recnbr; +} + +template +void MakeDbs::write_odr(file_t fd, const INT* order, const char* str) +{ + if (fwrite(order, sizeof(INT), recnbr, fd) != recnbr) + fatal(write_error, str); + fclose(fd); +} + +void MakeDbs::mkidx() +{ + if (!cridxf) return; // has been sorted + if (gzent) read_ent(gzent); + else read_ent(fent); + rbuf = new DbsRec[recnbr]; + if (gzidx) read_idx(gzidx); + else read_idx(fidx); INT* order = new INT[recnbr]; for (INT i = 0; i < recnbr; ++i) order[i] = i; qsort((UPTR) order, recnbr, sizeof(INT), (CMPF) cmpkey); char str[LINE_MAX]; - FILE* fodr = fopenpbe("", dbname, ODR_EXT, "w", 2, str); - if (fwrite(order, sizeof(INT), recnbr, fodr) != recnbr) - fatal(write_error, str); - fclose(fodr); + FILE* fodr = 0; +#if USE_ZLIB + gzFile gzodr = 0; + if (OutPrm.gzipped) + gzodr = gzopenpbe("", dbname, ODZ_EXT, "w", 2, str); + else + fodr = fopenpbe("", dbname, ODR_EXT, "w", 2, str); + if (fodr) write_odr(fodr, order, str); + else write_odr(gzodr, order, str); +#else + fodr = fopenpbe("", dbname, ODR_EXT, "w", 2, str); + write_odr(fodr, str); +#endif delete[] cbuf; delete[] rbuf; delete[] order; @@ -244,19 +288,14 @@ { putseq(SEQ_DELIM); if (rec.seqlen) { - if (fwrite(&rec, sizeof(DbsRec), 1, fidx) != 1) - fatal(write_error, "dbs.rec"); - ++recnbr; + if (gzidx) write_idx(gzidx); + else write_idx(fidx); } if (c != EOF) { - rec.entptr = ftell(fent); -#if USE_ZLIB - if (fseq) rec.seqptr = ftell(fseq); - else rec.seqptr = ftell(gzseq); -#else - rec.seqptr = ftell(fseq); -#endif - rec.seqptr = ftell(fseq); + if (gzent) rec.entptr = ftell(gzent); + else rec.entptr = ftell(fent); + if (gzseq) rec.seqptr = ftell(gzseq); + else rec.seqptr = ftell(fseq); char* ps = entrystr; char* pe = ps; char* pb = ps; @@ -272,7 +311,11 @@ else *ps = '\0'; if (wordcmp(pe, entryprv) < 0) cridxf = true; strcpy(entryprv, pe); - fputs(pe, fent); fputc('\0', fent); + if (gzent) { + fputs(pe, gzent); fputc('\0', gzent); + } else { + fputs(pe, fent); fputc('\0', fent); + } } rec.seqlen = b = 0; return (c); @@ -652,6 +695,12 @@ static bool newer(char* str, const char** argv) { bool update = !file_size(str); +#if USE_ZLIB + if (update) { + strcat(str, gz_ext); + update = !file_size(str); + } +#endif if (update) return (true); // absent time_t exist_time = modify_time(str); for (const char** av = argv; *av; ++av) @@ -667,9 +716,6 @@ case DNA: strcat(str, BKN_EXT); break; case TRON: strcat(str, BKP_EXT); break; } -#if USE_ZLIB - if (!file_size(str)) strcat(str, gz_ext); -#endif return (newer(str, argv)); } @@ -1012,9 +1058,6 @@ case DNA: strcat(str, LUN_EXT); break; case TRON: strcat(str, LUP_EXT); break; } -#if USE_ZLIB - if (!file_size(str)) strcat(str, gz_ext); -#endif return (newer(str, argv)); } @@ -1092,14 +1135,14 @@ mkdbs->stamp21(); mkdbs->mkidx(); } - fclose(fd); + if (fd) fclose(fd); } void MakeBlk::idxblk(int argc, const char** argv) { // first phase - const char** seqdb = argv; +const char** seqdb = argv; for (int ac = 0; ac++ < argc && *seqdb; ++seqdb) { if (mkdbs) mkdbs->wrtgrp(*argv); @@ -2890,7 +2933,7 @@ const CHAR* at = is_shorquery? query->at(query->right): 0; const CHAR* ab = is_shorquery? query->at(query->left): 0; bool meet[2] = {false, false}; - Dhash hh(2 * pbwc->MaxBlk); + Dhash hh(2 * pbwc->MaxBlk, 0); INT nmmc = 0; int notry = 0; int maxbscr[4]; @@ -3091,10 +3134,10 @@ { Seq*& b = sqs[0]; // query Seq*& a = sqs[1]; // translated - Dhash hh(2 * pbwc->MaxBlk); + Dhash hh(2 * pbwc->MaxBlk, 0); ORF* orf = b->getorf(); if (!orf) return (ERROR); - Dhash shash(2 * MaxNref); + Dhash shash(2 * MaxNref, 0); Qwords qwd(kk, DRNA, ConvTab, pbwc, bpp); INT norf = 0; int c; @@ -3186,7 +3229,7 @@ INT testword[2] = {0, 0}; bool meet = false; BLKTYPE maxb = 0; - Dhash hh(int(2 * pbwc->MaxBlk)); + Dhash hh(int(2 * pbwc->MaxBlk), 0); Qwords qwd(kk, DRNA, ConvTab, pbwc, bpp, query); const CHAR* rms = query->at(query->right) - wcp.Nshift * ((c < ptpl)? c: ptpl); #if TESTRAN @@ -3256,7 +3299,7 @@ if (bh2->sigm) { // heap sort in the order of block score INT w = bh2->hsort(); if (w < bh2->sigm && OutPrm.supself) ++w; - Dhash shash(2 * w); + Dhash shash(2 * w, 0); int j = 1; for (INT i = 0; i < w; ++i) { c = findChrNo((*bh2->prqueue_b)[i].key); diff -Nru spaln-2.4.7+dfsg/src/blksrc.h spaln-2.4.9a+dfsg/src/blksrc.h --- spaln-2.4.7+dfsg/src/blksrc.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/blksrc.h 2022-05-12 01:54:43.000000000 +0000 @@ -86,43 +86,57 @@ class MakeDbs { DbsRec rec; - size_t recnbr; - bool cridxf; + size_t recnbr = 0; + bool cridxf = false; const bool isaa; - int b; + int b = 0; INT maxlen; char entrystr[MAXL]; char entryprv[MAXL]; - FILE* fgrp; - FILE* fseq; - FILE* fidx; - FILE* fent; + FILE* fgrp = 0; + FILE* fseq = 0; + FILE* fidx = 0; + FILE* fent = 0; #if USE_ZLIB - gzFile gzseq; + gzFile gzseq = 0; + gzFile gzidx = 0; + gzFile gzent = 0; +#else + bool gzseq = false; + bool gzidx = false; + bool gzent = false; #endif - char* dbname; + char* dbname = 0; void putsq(int c) { - if (fseq) fputc(c, fseq); -#if USE_ZLIB - else fputc(c, gzseq); -#endif + if (gzseq) fputc(c, gzseq); + else fputc(c, fseq); } public: MakeDbs(const char* dbname, int molc); ~MakeDbs() { fclose(fgrp); if (fseq) fclose(fseq); + if (fidx) fclose(fidx); + if (fent) fclose(fent); #if USE_ZLIB if (gzseq) fclose(gzseq); + if (gzidx) fclose(gzidx); + if (gzent) fclose(gzent); #endif - fclose(fidx); - fclose(fent); delete[] dbname; } INT max_dbseq_len() {return maxlen;} void mkidx(); template int write_recrd(file_t fd, int c = 0); +template + void read_ent(file_t fd); +template + void read_idx(file_t fd); +template + void write_idx(file_t fd); +template + void write_odr(file_t fd, const INT* order, const char* str); void putseq(int c) { if (isaa) putsq(c); else if (rec.seqlen & 1) putsq(c + b); @@ -134,15 +148,18 @@ } void wrtgrp(const char* ps) { long fpos = 0L; - if (fseq) fpos = ftell(fseq); -#if USE_ZLIB - else fpos = ftell(gzseq); -#endif - fprintf(fgrp, "%8ld %u %s\n", fpos, (INT) recnbr, ps); + if (gzseq) fpos = ftell(gzseq); + else fpos = ftell(fseq); + long epos = 0L; + if (gzent) epos = ftell(gzent); + else epos = ftell(fent); + fprintf(fgrp, "%8ld %u %u %s\n", + fpos, (INT) recnbr, (INT) epos, ps); } void stamp21() { DbsRec rec21 = {magicver21, false, 0}; - fwrite(&rec21, sizeof(DbsRec), 1, fidx); // header record + if (fidx) fwrite(&rec21, sizeof(DbsRec), 1, fidx); // header record + else fwrite(&rec21, sizeof(DbsRec), 1, gzidx); } }; @@ -209,8 +226,7 @@ class Chash : public Dhash { public: - Chash(int n) - : Dhash(n) {} + Chash(int n) : Dhash(n, 0) {} ~Chash() {} void countBlk(ContBlk& cntblk); void registBlk(INT m, ContBlk& cntblk); diff -Nru spaln-2.4.7+dfsg/src/clib.cc spaln-2.4.9a+dfsg/src/clib.cc --- spaln-2.4.7+dfsg/src/clib.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/clib.cc 2022-05-12 01:54:43.000000000 +0000 @@ -7,7 +7,7 @@ * ipower lpower * next_wd wordcmp * isBlankLine car cdr -* chop replace +* chop replace rm_end_spc * fbisrch * comb2 comb3 decomb * @@ -194,6 +194,22 @@ return (c); } +char* rm_end_spc(char* str) +{ + char* ps = str; + char* pd = str; + int ns = 0; + while (*ps && isspace(*ps)) ++ps; + while (*ps) { + if (isspace(*ps)) ++ns; + else ns = 0; + *pd++ = *ps++; + } + pd -= ns; + *pd = '\0'; + return (str); +} + char* next_wd(char** sp) { char c; @@ -319,78 +335,66 @@ } Strlist::Strlist(int m, int len) - : totallen(0), lastlen(0), maxlen(0), many(0), filled(false) { - if (m == 0) { - sunitsize = 0; strbuf = 0; - } else { + if (m > 0) { sunitsize = (len + defsunit - 1) / defsunit * defsunit; strbuf = new char[sunitsize]; *strbuf = '\0'; - } - if (m > 1) { - punitsize = (m + defpunit - 1) / defpunit * defpunit; - idxlst = new INT[punitsize]; - *idxlst = 0; - } else { - punitsize = 0; idxlst = 0; - } -} - -void Strlist::reset(int m) -{ - if (m > 1 && (INT) m > punitsize) { - punitsize = (m + defpunit - 1) / defpunit * defpunit; - delete[] idxlst; + punitsize = (m < defpunit)? m: + (m + defpunit - 1) / defpunit * defpunit; idxlst = new INT[punitsize]; *idxlst = 0; } - *strbuf = '\0'; - many = totallen = lastlen = maxlen = 0; - filled = false; } -Strlist::Strlist(const char* str) - : idxlst(0), punitsize(0), many(1), filled(true) +Strlist::Strlist(const Strlist& src) { - totallen = lastlen = maxlen = strlen(str) + 1; - sunitsize = (totallen + defsunit - 1) / defsunit * defsunit; - strbuf = new char[sunitsize]; - strcpy(strbuf, str); + *this = src; + strbuf = new char[src.totallen]; + idxlst = new INT[src.many]; + vcopy(strbuf, src.strbuf, totallen); + vcopy(idxlst, src.idxlst, many); } -Strlist::Strlist(const char* str, const char* indelim) - : lastlen(0), maxlen(0), many(0) +Strlist::Strlist(const char* str, const char* delim) { - char* delim = new char[strlen(indelim) + 1]; - bool spc = false; // space is a delim - char* pd = delim; - for (const char* ps = indelim; *ps; ++ps) { - if (isspace(*ps)) spc = true; - else *pd++ = *ps; - } - *pd = '\0'; + if (!delim || !*delim) { // single member + totallen = lastlen = maxlen = strlen(str) + 1; + sunitsize = (totallen + defsunit - 1) / defsunit * defsunit; + strbuf = new char[sunitsize]; + strcpy(strbuf, str); + many = 1; + filled = true; + return; + } + bool spc = false; // space is a delim + for (const char* ps = delim; *ps; ++ps) + if (*ps == ' ') spc = true; +const char dchar = *delim; -const char dchar = *delim? *delim: ' '; totallen = sunitsize = strlen(str) + 2; char* pb = strbuf = new char[totallen]; int state = 1; // isspc = 2, isdlm = 1 for (const char* ps = str; *ps; ++ps) { - if (spc && isspace(*ps)) { - if (!state) *pb++ = dchar; + if (!strchr(delim, *ps)) { // not delim + *pb++ = *ps; + state = 0; + } else if (spc) { // pack continuous delims + if (!state) { + *pb++ = dchar; + ++many; + } state = 2; - } else if (strchr(delim, *ps)) { - if (state == 2) --pb; // cancel previous space + } else { // individual delims *pb++ = dchar; + ++many; state = 1; - } else { - *pb++ = *ps; - if (state) ++many; - state = 0; } } - if (state == 2) --pb; // cancel tailing space - *pb++ = dchar; + if (state == 0) { + *pb++ = dchar; + ++many; + } *pb = '\0'; punitsize = (many + defpunit - 1) / defpunit * defpunit; @@ -407,34 +411,9 @@ } else ++ps; } filled = true; - delete[] delim; -} - -void Strlist::format() -{ - char* ps = strbuf; - if (!many) { - for (INT ttl = 0; ttl < totallen; ++many) { - lastlen = strlen(ps) + 1; - ttl += lastlen; - ps += lastlen; - } - ps = strbuf; - } - punitsize = (many + defpunit - 1) / defpunit * defpunit; - idxlst = new INT[punitsize]; - for (INT ttl = many = 0; ttl < totallen; ++many) { - idxlst[many] = ttl; - lastlen = strlen(ps) + 1; - if (lastlen > maxlen) maxlen = lastlen; - ttl += lastlen; - ps += lastlen; - } - filled = true; } -Strlist::Strlist(FILE* fd, int m) - : lastlen(0), maxlen(0), many(m) +Strlist::Strlist(FILE* fd, int m) : many(m) { fseek(fd, 0L, SEEK_END); totallen = ftell(fd); @@ -442,13 +421,12 @@ strbuf = new char[sunitsize]; fseek(fd, 0L, SEEK_SET); if (fread(strbuf, sizeof(char), totallen, fd) != (INT) totallen) - fatal("Strlist file may be corupped !\n"); + fatal(fread_error, "Strlist"); format(); } #if USE_ZLIB -Strlist::Strlist(gzFile gzfd, int m) - : totallen(0), lastlen(0), maxlen(0), many(m) +Strlist::Strlist(gzFile gzfd, int m) : many(m) { int c; while ((c = gzgetc(gzfd)) != EOF) ++totallen; @@ -461,6 +439,42 @@ } #endif +void Strlist::reset(int m) +{ + if (m > 1 && (INT) m > punitsize) { + punitsize = (m + defpunit - 1) / defpunit * defpunit; + delete[] idxlst; + idxlst = new INT[punitsize]; + *idxlst = 0; + } + *strbuf = '\0'; + many = totallen = lastlen = maxlen = 0; + filled = false; +} + +void Strlist::format() +{ + char* ps = strbuf; + if (!many) { + for (INT ttl = 0; ttl < totallen; ++many) { + lastlen = strlen(ps) + 1; + ttl += lastlen; + ps += lastlen; + } + ps = strbuf; + } + punitsize = (many + defpunit - 1) / defpunit * defpunit; + idxlst = new INT[punitsize]; + for (INT ttl = many = 0; ttl < totallen; ++many) { + idxlst[many] = ttl; + lastlen = strlen(ps) + 1; + if (lastlen > maxlen) maxlen = lastlen; + ttl += lastlen; + ps += lastlen; + } + filled = true; +} + char* Strlist::assign(const Strlist& src) { if (sunitsize < src.sunitsize) { @@ -483,15 +497,6 @@ return (strbuf); } -Strlist::Strlist(Strlist& src) -{ - *this = src; - strbuf = new char[sunitsize]; - memcpy(strbuf, src.strbuf, totallen); - idxlst = punitsize? new INT[punitsize]: 0; - if (src.idxlst) vcopy(idxlst, src.idxlst, many); -} - char* Strlist::assign(const char* str) { totallen = lastlen = maxlen = strlen(str) + 1; @@ -539,6 +544,58 @@ return (rv); } +void Strlist::write_binary(FILE* fd) +{ + if (fwrite(this, sizeof(Strlist), 1, fd) != 1) + fatal(fwrite_error, "Strlist"); + if (fwrite(strbuf, sizeof(char), totallen, fd) != totallen) + fatal(fwrite_error, "Strlist"); + if (fwrite(idxlst, sizeof(int), many, fd) != many) + fatal(fwrite_error, "Strlist"); +} + +AddExt::AddExt(int argc, const char** argv, const char* ext) + : argv_(argv), arg_ext(0) +{ + int tl = strlen(argv[0]); + int woext = 0; + for (int i = 1; i < argc; ++i) { + if (argv[i][0] == OPTCHAR) tl += strlen(argv[i]); + else { + const char* dot = strrchr(argv[i], '.'); + if (!dot) { + tl += strlen(argv[i]) + 4; + ++woext; + } else if (strcmp(dot, ext)) { + tl += dot - argv[i] + 4; + ++woext; + } else tl += strlen(argv[i]); + } + } + if (!woext) return; + arg_ext = new char*[argc + 1]; + char* ps = new char[tl + argc]; + arg_ext[0] = strcpy(ps, argv[0]); + for (int i = 1; i < argc; ++i) { + ps += strlen(ps) + 1; + if (argv[i][0] == OPTCHAR) { + arg_ext[i] = strcpy(ps, argv[i]); + } else { + const char* dot = strrchr(argv[i], '.'); + if (!dot) { + arg_ext[i] = strcpy(ps, argv[i]); + strcat(ps, ext); + } else if (strcmp(dot, ext)) { + arg_ext[i] = strncpy(ps, argv[i], dot - argv[i]); + strcat(ps, ext); + } else { + arg_ext[i] = strcpy(ps, argv[i]); + } + } + } + arg_ext[argc] = 0; +} + // distribute into bins // Input must be sorted on x diff -Nru spaln-2.4.7+dfsg/src/clib.h spaln-2.4.9a+dfsg/src/clib.h --- spaln-2.4.7+dfsg/src/clib.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/clib.h 2022-05-12 01:54:43.000000000 +0000 @@ -6,7 +6,7 @@ * max3 min3 max4 min4 * ipower * next_wd wordcmp isBlankLine -* car cdr prefix chop replace +* car carn cdr chop replace rm_end_spc * queue stack fbisrch insort * comb2 comb3 decomb * Strlist @@ -79,6 +79,8 @@ static const int HashovLS = 12; static const int def_un_def = (INT_MIN / 8 * 7); static const float FACT_QUEUE = 1.5; +static const char* fread_error = "Fread error: %s !\n"; +static const char* fwrite_error = "Fwrite error: %s !\n"; template X* vcopy(X* dst, const X* src, int n) { @@ -181,16 +183,23 @@ template class Dhash { protected: - INT size1; - INT size2; - KVpair* hash; - KVpair* hz; - val_t un_def; + INT size1 = 0; + INT size2 = SecondHS; + KVpair* hash = 0; + KVpair* hz = 0; + val_t un_def = 0; public: - Dhash(int n = 0, val_t ud = 0, float hf = DefHashFact); + Dhash() {} + Dhash(int n, val_t ud, float hf = DefHashFact); +#if USE_ZLIB +template + Dhash(file_t fd); +#else + Dhash(FILE* fd); +#endif ~Dhash() {delete[] hash;} - KVpair* begin() {return hash;} - KVpair* end() {return hz;} + KVpair* begin() const {return hash;} + KVpair* end() const {return hz;} INT size() const {return (size1);} KVpair* map(key_t ky, bool record = true); KVpair* find(key_t ky) { @@ -232,16 +241,14 @@ return (hash[pos]); } val_t undef() {return (un_def);} + void write_binary(FILE* fd); }; template Dhash::Dhash(int n, val_t ud, float hf) - : size2(SecondHS), un_def(ud) + : un_def(ud) { - if (n == 0) { - size1 = 0; - hash = hz = 0; - } else { + if (n) { n = int(hf * n); size1 = (n < int(min_size))? min_size: supprime(n); hash = new KVpair[size1]; @@ -251,6 +258,33 @@ } template +#if USE_ZLIB +template +Dhash::Dhash(file_t fd) +#else +Dhash::Dhash(FILE* fd) +#endif +{ + if (fread(this, sizeof(*this), 1, fd) != 1) + fatal(fread_error, "Dhash"); + INT n = hz - hash; + hash = new KVpair[n]; + if (fread(hash, sizeof(KVpair), n, fd) != n) + fatal(fread_error, "Dhash"); + hz = hash + n; +} + +template +void Dhash::write_binary(FILE* fd) +{ + if (fwrite(this, sizeof(*this), 1, fd) != 1) + fatal(fwrite_error, "Dhash"); + INT n = hz - hash; + if (fwrite(hash, sizeof(KVpair), n, fd) != n) + fatal(fwrite_error, "Dhash"); +} + +template KVpair* Dhash::map(key_t key, bool record) { INT v = key % size1; @@ -402,9 +436,9 @@ template class PrQueue { private: - val_t* data; - int capacity; - int front; + val_t* data = 0; + int capacity = 0; + int front = 0; bool dec_order; // default is ascending order bool replace; // may replace older value public: // initial data size, ascending order, never replace data @@ -829,30 +863,34 @@ class Strlist { protected: - char* strbuf; - INT* idxlst; - INT sunitsize; - INT punitsize; - INT totallen; - INT lastlen; - INT maxlen; - INT many; - bool filled; + char* strbuf = 0; + INT* idxlst = 0; + INT sunitsize = 0; + INT punitsize = 0; + INT totallen = 0; + INT lastlen = 0; + INT maxlen = 0; + INT many = 0; + bool filled = false; void format(); public: Strlist(int m = 1, int len = defsunit); - Strlist(const char* str); + Strlist(const Strlist& src); Strlist(const char* str, const char* delim); - Strlist(FILE* fd, int m = 0); #if USE_ZLIB - Strlist(gzFile gzfd, int m = 0); +template + Strlist(file_t fd); // read binary + Strlist(gzFile gzfd, int m); +#else + Strlist(FILE* fd); #endif - Strlist(Strlist& src); + Strlist(FILE* fd, int m); ~Strlist() {delete[] strbuf; delete[] idxlst;} char* operator[](INT n) const { return ((idxlst && n < many)? strbuf + idxlst[n]: strbuf); } INT size() const {return (many);} + INT space() const {return (totallen);} char* assign(const Strlist& src); char* assign(const char* str); char* push(const char* str); @@ -863,23 +901,48 @@ void undo() {--many; totallen -= lastlen;} char* squeeze() {char* tmp = strbuf; strbuf = 0; return (tmp);} INT longest() const {return (maxlen? maxlen - 1: 0);} + void write_binary(FILE* fd); }; +#if USE_ZLIB +template +Strlist::Strlist(file_t fd) +#else +Strlist::Strlist(FILE* fd) +#endif +{ + if (fread(this, sizeof(Strlist), 1, fd) != 1) + fatal(fread_error, "Strlist"); + strbuf = new char[totallen]; + idxlst = new INT[many]; + if (fread(strbuf, sizeof(char), totallen, fd) != totallen) + fatal(fread_error, "Strlist"); + if (fread(idxlst, sizeof(int), many, fd) != many) + fatal(fread_error, "Strlist"); +} + // hash for string key template class StrHash { protected: - INT size1; - INT size2; - KVpair* hash; - KVpair* hz; - Strlist* sl; + INT size1 = 0; + INT size2 = 0; + KVpair* hash = 0; + KVpair* hz = 0; val_t un_def; public: - StrHash(int n = 0, val_t udf = def_un_def, float hf = DefHashFact); - StrHash(const char* fname); - StrHash(StrHash& src) {*this = src;} + Strlist* sl = 0; + StrHash(int n = 0, val_t udf = def_un_def, + float hf = DefHashFact, int strbufsize = defsunit); +#if USE_ZLIB +template + StrHash(file_t fd); +#else + StrHash(FILE* fd); +#endif +// StrHash(const char* fname); + StrHash(const StrHash& src); ~StrHash() {delete sl; delete[] hash;} KVpair* begin() {return hash;} KVpair* end() {return hz;} @@ -928,18 +991,17 @@ void erase_sl() {sl = 0;} char* squeeze() {return (sl->squeeze());} val_t undef() {return (un_def);} + void write_binary(FILE* fd); }; template -StrHash::StrHash(int n, val_t udf, float hf) : un_def(udf) +StrHash::StrHash(int n, val_t udf, float hf, int strbufsize) + : un_def(udf) { - if (n == 0) { - size1 = size2 = 0; - hash = hz = 0; - sl = 0; - } else { + if (n) { size1 = supprime(int(hf * n)); - sl = new Strlist(size1); + sl = new Strlist(size1, strbufsize); + sl->push(""); hash = new KVpair[size1]; hz = hash + size1; if (un_def) { @@ -951,6 +1013,36 @@ } template +#if USE_ZLIB +template +StrHash::StrHash(file_t fd) +#else +StrHash::StrHash(FILE* fd) +#endif +{ + if (fread(this, sizeof(*this), 1, fd) != 1) + fatal(fread_error, "StrHash"); + INT n = hz - hash; + hash = new KVpair[n]; + if (fread(hash, sizeof(KVpair), n, fd) != n) + fatal(fread_error, "StrHash"); + hz = hash + n; + sl = new Strlist(fd); + sl->push(""); // dummy to skip 0-th entry +} + +template +StrHash::StrHash(const StrHash& src) +{ + *this = src; + INT n = hz - hash; + hash = new KVpair[n]; + vcopy(hash, src.hash, n); + hz = hash + n; + sl = new Strlist(*src.sl); +}; + +template KVpair* StrHash::map(const char* skey, int record) { INT key = str2uint(skey); @@ -958,8 +1050,9 @@ INT u = size2 - key % size2; INT v0 = v; KVpair* sh = hash + v; + int ne_key = 1; - while (sh->key && strcmp(strkey(sh->key), skey)) { + while (sh->key && (ne_key = strcmp(strkey(sh->key), skey))) { v = (v + u) % size1; if (v == v0) { resize(); @@ -973,7 +1066,7 @@ sl->push(skey); } else if (!record) sh = 0; - } + } else if (ne_key) sh = 0; return (sh); } @@ -1009,6 +1102,7 @@ size1 = supprime(s? s: 2 * size1); if (!size2) for (size2 = SecondHS; size2 > size1; size2 /= 2) ; + delete[] hash; hash = new KVpair[size1]; hz = hash + size1; clear(); @@ -1018,7 +1112,17 @@ *kv = *hw; } } - org.erase_sl(); +} + +template +void StrHash::write_binary(FILE* fd) +{ + if (fwrite(this, sizeof(*this), 1, fd) != 1) + fatal(fwrite_error, "StrHash"); + INT n = hz - hash; + if (fwrite(hash, sizeof(KVpair), n, fd) != n) + fatal(fwrite_error, "StrHash"); + sl->write_binary(fd); } // insert sort; for any 0 <= i < num @@ -1100,6 +1204,15 @@ int get() {return buf[int(resol * drand48())];} }; +class AddExt { + const char** argv_; + char** arg_ext; +public: + AddExt(int argc, const char** argv, const char* ext); + ~AddExt() {if (arg_ext) {delete[] arg_ext[0]; delete[] arg_ext;}} + const char** add_ext() {return arg_ext? (const char**) arg_ext: argv_;} +}; + extern int ipower(int x, int n); extern long lpower(long x, int n); extern char* strrealloc(char* dst, const char* src); @@ -1113,6 +1226,7 @@ extern char* cdr(const char* ps); extern char* cdr(const char* ps, const char* delim); extern int chop(char* ps); +extern char* rm_end_spc(char* str); extern char* next_wd(char** sp); extern int wordcmp(const char* a, const char* b, int n = INT_MAX); extern UPTR fbisrch(UPTR found, const UPTR key, FILE* fd, long left, long right, int width, CMPF cmpf); diff -Nru spaln-2.4.7+dfsg/src/codepot.cc spaln-2.4.9a+dfsg/src/codepot.cc --- spaln-2.4.7+dfsg/src/codepot.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/codepot.cc 2022-05-12 01:54:43.000000000 +0000 @@ -339,7 +339,8 @@ Exinon::Exinon(Seq* sd_, const PwdB* pwd_, bool bo) : sd(sd_), pwd(pwd_), both_ori(bo), size(sd->right - sd->left + 2), bias(sd->left - 1), - exinpot(pwd? pwd->exinpot: 0), + intnpot(pwd? pwd->intnpot: 0), + exonpot(pwd? pwd->exonpot: 0), fact((FTYPE) (pwd->Vab / sd->many)), fS(alprm2.y * fact) { @@ -374,7 +375,7 @@ STYPE Exinon::sig53(int m, int n, INTENDS c) const { STYPE sig = 0; - switch (c) { // canonical signal + switch (c) { // acanonical signal case IE5: sig = data[m].sig5; break; case IE3: sig = data[n].sig3; break; case IE53: sig = data[n].sig3 + (1. - alprm2.sss) * @@ -391,8 +392,8 @@ break; } if (alprm2.Z > 0) { - if (c == IE53 || c == IE5P3) sig += exinpot->intpot(data + m, data + n); - else if (c == IE35) sig += exinpot->intpot(data + n, data + m); + if (c == IE53 || c == IE5P3) sig += intnpot->intpot(data + m, data + n); + else if (c == IE35) sig += intnpot->intpot(data + n, data + m); } return (sig); } @@ -456,9 +457,9 @@ float* prefS = pwd->eijpat->patternI? pwd->eijpat->patternI->calcPatMat(sd): 0; float* prefT = pwd->eijpat->patternT? pwd->eijpat->patternT->calcPatMat(sd): 0; float* prefB = pwd->eijpat->patternB? pwd->eijpat->patternB->calcPatMat(sd): 0; - float* prefE = pwd->codepot? pwd->codepot->calcPrefCodePot(sd, 0): 0; - float* prefI = pwd->exinpot? pwd->exinpot->calcExinPot(sd, false): 0; - if (!prefE && pwd->exinpot) prefE = pwd->exinpot->calcExinPot(sd, true); + float* prefE = pwd->codepot? pwd->codepot->calcScr(sd): 0; + float* prefI = pwd->intnpot? pwd->intnpot->calcScr(sd): 0; + if (!prefE && pwd->exonpot) prefE = pwd->exonpot->calcScr(sd); float* prf5 = pref5; // 5'ss signal float* prf3 = pref3; // 3'ss signal float* prfS = prefS; // start codon diff -Nru spaln-2.4.7+dfsg/src/codepot.h spaln-2.4.9a+dfsg/src/codepot.h --- spaln-2.4.7+dfsg/src/codepot.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/codepot.h 2022-05-12 01:54:43.000000000 +0000 @@ -68,7 +68,8 @@ INT53* int53 = 0; STYPE** sig53tab = 0; bool statictab; - ExinPot* exinpot = 0; + ExinPot* intnpot = 0; + ExinPot* exonpot = 0; public: FTYPE fact; FTYPE fS; @@ -228,7 +229,6 @@ static const char INT3PAT[] = "Intron3"; static const char INT53PAT[] = "Intron53"; static const char INT35PAT[] = "Intron35"; -static const char GNM2TAB[] = "gnm2tab"; static const int BoundRng = 20; static const int Ip_equ_k = 3; // gap length equivalen to Intron Penalty static const char ipstat[] = "IldModel.txt"; diff -Nru spaln-2.4.7+dfsg/src/dbs.cc spaln-2.4.9a+dfsg/src/dbs.cc --- spaln-2.4.7+dfsg/src/dbs.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/dbs.cc 2022-05-12 01:54:43.000000000 +0000 @@ -21,8 +21,6 @@ #include "seq.h" -static int cmpodr_r20(const int* a, const int* b); - DbsDt* dbs_dt[MAX_DBS]; static DbsDt* defdbf; @@ -349,114 +347,37 @@ defdbf = dbf; } -void DbsDt::readseq(const char* fn) -{ - fseek(fseq, 0L, SEEK_END); - long fp = ftell(fseq); - dbsseq = new CHAR[fp]; - if (!dbsseq) fatal("No memory to accomodate seqdb!\n"); - rewind(fseq); - if (fread(dbsseq, sizeof(CHAR), fp, fseq) != (size_t) fp) - fatal("%s: Bad seq file !\n", fn); - fclose(fseq); - fseq = 0; -} - size_t DbsDt::readgrp(FILE* fgrp) { -static const char frmt[] = "%ld %ld %s"; char str[MAXL]; Mfile mfd(sizeof(DbsGrp)); DbsGrp grp; grplbl = new Strlist(); size_t ress = 0; - while (fscanf(fgrp, frmt, &grp.seqptr, &grp.recnbr, str) == 3) { - grplbl->push(str); + numidx = 0; + while (fgets(str, MAXL, fgrp)) { + Strlist stl(str, stddelim); + grp.seqptr = atol(stl[0]); + grp.recnbr = atoi(stl[1]); + if (stl.size() == 3) { // ver21 + grp.entspc = 0; + grplbl->push(stl[2]); + } else if (stl.size() == 4) { // ver22 + grp.entspc = atoi(stl[2]); + grplbl->push(stl[3]); + } else + continue; mfd.write(&grp); ress += grp.seqptr; + numidx += grp.recnbr; + ent_space += grp.entspc; } numgrp = (INT) mfd.size() - 1; dbsgrp = (DbsGrp*) mfd.flush(); return (ress); } -static int cmpodr_r20(const int* a, const int* b) -{ - return (wordcmp(defdbf->entname(*a), defdbf->entname(*b), ENTLEN)); -} - -void DbsDt::readidx20(FILE* fd, const char* fn) -{ - fseek(fd, 0L, SEEK_END); - long fp = ftell(fd); - if (fp % sizeof(DbsRec20)) fatal("%s: Index file may be corrupted!\n", fn); - numidx = (INT) (fp / sizeof(DbsRec20)); - DbsRec20* reched20 = new DbsRec20[numidx]; - rewind(fd); - if (fread(reched20, sizeof(DbsRec20), numidx, fd) != numidx) - fatal("%s: Bad index file!", fn); - DbsRec20* rec20 = reched20; - DbsRec* rec = recidx = new DbsRec[numidx]; - entry = new char[numidx * (ENTLEN + 1)]; - recodr = 0; - size_t entidx = 0; - bool sorted = true; - for (INT i = 0; i < numidx; ++i, ++rec, ++rec20) { - rec->seqptr = rec20->seqptr; - if (i && strncmp(rec20[-1].entry, rec20->entry, ENTLEN) > 0) - sorted = false; - rec->seqlen = (INT) rec20->seqlen; - rec->entptr = entidx; - entry[entidx + ENTLEN] = '\0'; - strncpy(entry + entidx, rec20->entry, ENTLEN); - entidx += strlen(entry + entidx) + 1; - } - delete[] reched20; - if (sorted) return; - recodr = new INT[numidx]; - for (INT i = 0; i < numidx; ++i) recodr[i] = i; - qsort((UPTR) recodr, numidx, sizeof(INT), (CMPF) cmpodr_r20); -} - -void DbsDt::readentry(FILE* fent, const char* fn) -{ - fseek(fent, 0L, SEEK_END); - size_t fp = ftell(fent); - entry = new char[fp]; - rewind(fent); - if (fread(entry, sizeof(char), fp, fent) != fp) - fatal("%s: Bad entry file!", fn); -} - -void DbsDt::readodr(FILE* fodr, const char* fn) -{ - fseek(fodr, 0L, SEEK_END); - long fp = ftell(fodr); - recodr = new INT[fp / sizeof(INT)]; - rewind(fodr); - if (fread(recodr, sizeof(INT), numidx, fodr) != numidx) - fatal("%s: Bad order file!", fn); -} - -DbsRec* DbsDt::readidx(FILE* fidx, const char* fn) -{ - fseek(fidx, 0L, SEEK_END); - long fp = ftell(fidx); - if (fp % sizeof(DbsRec)) - fatal("%s: Index file may be corrupted!\n", fn); - numidx = (INT) (fp / sizeof(DbsRec)); - recidx = new DbsRec[numidx]; - rewind(fidx); - if (fread(recidx, sizeof(DbsRec), numidx, fidx) != numidx) - fatal("%s: Index file may be corrupted!\n", fn); - if (recidx[--numidx].seqptr != magicver21) { - delete[] recidx; recidx = 0; - return (0); - } - return (recidx); -} - DbsGrp* DbsDt::finddbsgrp(const char* name) const { DbsGrp* dgrp = dbsgrp; @@ -494,17 +415,18 @@ { dbsseq = new CHAR[seq_space]; recidx = new DbsRec[numidx = num]; - entry = new char[entry_space]; + entry = new char[ent_space = entry_space]; if (gsi_space) { gsipool = new int[gsi_space + 1]; gsiidx = new int[num + 1]; } } -void DbsDt::prepare(Strlist& sname, int num, size_t space, size_t gsi_space) +void DbsDt::prepare(Strlist& sname, int num, size_t seq_space, size_t gsi_space) { - dbsseq = new CHAR[space]; + dbsseq = new CHAR[seq_space]; recidx = new DbsRec[numidx = num]; + ent_space = sname.space(); entry = sname.squeeze(); if (gsi_space) { gsipool = new int[gsi_space + 1]; @@ -514,7 +436,6 @@ DbsDt::DbsDt(int c, int molc) { - clean(); switch (tolower(c)) { case 'e': curdb = SeqDBs + EMBL; break; case 'd': curdb = SeqDBs + NBRF; break; @@ -563,7 +484,6 @@ char str[LINE_MAX]; char buf[LINE_MAX]; - clean(); if (form == 0) form = progets(buf, openmsg); if (strlen(form) <= 1) { switch (tolower(*form)) { @@ -584,57 +504,77 @@ for (int i = 0; i < 3; ++i) { const char* path = dbstab[i]; if (!path) continue; - FILE* fd = fopenpbe(path, form, IDX_EXT, "r", -1, str); - if (fd) { + +// read "grp" file + FILE* fd = fopenpbe(path, form, GRP_EXT, "r", -1, str); + if (!fd) continue; + size_t rss = readgrp(fd); + fclose(fd); + // read "index" file - DbsRec* ridx = readidx(fd, str); - fclose(fd); - if (ridx) { -// read "entry" and "order" files - fd = fopenpbe(path, form, ENT_EXT, "r", 1, str); - readentry(fd, str); - fclose(fd); - fd = fopenpbe(path, form, ODR_EXT, "r", -1, str); - if (fd) { - readodr(fd, str); - fclose(fd); - } - } else { -// read "header" file - setdefdbf(this); - fd = fopenpbe(path, form, HED_EXT, "r", -1, str); - if (!fd) fd = fopenpbe(path, form, IDX_EXT, "r", 1, str); - readidx20(fd, str); - fclose(fd); - } -// read "group" file - fd = fopenpbe(path, form, GRP_EXT, "r", 1, str); + fd = fopenpbe(path, form, IDX_EXT, "r", -1, str); #if USE_ZLIB - size_t rss = readgrp(fd); + gzFile gzfd = 0; +#else + bool gzfd = false; #endif - fclose(fd); -// assign "seq" file - if ((fseq = fopenpbe(path, form, SEQ_EXT, "r", -1, str))) { - pseq = strrealloc(0, str); - if (algmode.dim) readseq(str); - } else { + if (!fd) { #if USE_ZLIB - gzFile gzfd = gzopenpbe(path, form, SGZ_EXT, "r", 1, str); - if (!gzfd) continue; - pseq = strrealloc(0, str); - dbsseq = new CHAR[rss]; - if (!dbsseq) fatal(no_space); - if (gzread(gzfd, dbsseq, rss) <= 0) - fatal("fail to read %s!\n", str); - fclose(gzfd); + gzfd = gzopenpbe(path, form, IDZ_EXT, "r", -1, str); + if (!gzfd) continue; #else - continue; + continue; +#endif + } + DbsRec* ridx = gzfd? readidx(gzfd, str): readidx(fd, str); + if (!ridx) continue; + +// read "entry" file + fd = fopenpbe(path, form, ENT_EXT, "r", -1, str); + if (!fd) { +#if USE_ZLIB + gzfd = gzopenpbe(path, form, ENZ_EXT, "r", -1, str); + if (!gzfd) continue; +#else + continue; +#endif + } else gzfd = 0; + if (gzfd) readentry(gzfd, str); + else readentry(fd, str); + +// read "order" file + fd = fopenpbe(path, form, ODR_EXT, "r", -1, str); + if (fd) readodr(fd, str); +#if USE_ZLIB + else { + gzfd = gzopenpbe(path, form, ODZ_EXT, "r", -1, str); + if (gzfd) readodr(gzfd, str); + } +#endif + +// read "seq" file + fseq = fopenpbe(path, form, SEQ_EXT, "r", -1, str); + if (fseq) { + pseq = strrealloc(0, str); + if (algmode.dim) readseq(str); // seq dat in memory + } else { +#if USE_ZLIB + gzfd = gzopenpbe(path, form, SGZ_EXT, "r", -1, str); + if (!gzfd) continue; + pseq = strrealloc(0, str); + + dbsseq = new CHAR[rss]; + if (!dbsseq) fatal(no_space); + if (fread(dbsseq, sizeof(CHAR), rss, gzfd) <= 0) + fatal("Fail to read .seq.gz file !\n"); + fclose(gzfd); +#else + continue; #endif - } - if (curdb->defmolc == UNKNOWN) - curdb->defmolc = guessmolc(); - return; } + if (curdb->defmolc == UNKNOWN) + curdb->defmolc = guessmolc(); + return; } fatal("%s was not found !\n", str); } @@ -649,6 +589,19 @@ delete[] gsiidx; delete[] gsipool; } +void DbsDt::readseq(const char* fn) +{ + fseek(fseq, 0L, SEEK_END); + long fp = ftell(fseq); + dbsseq = new CHAR[fp]; + if (!dbsseq) fatal(no_space); + if (fread(dbsseq, sizeof(CHAR), fp, fseq) != (size_t) fp) + fatal("%s: Bad seq file !\n", fn); + fclose(fseq); + fseq = 0; +} + + template CHAR* Seq::read_dbres(file_t fd, const RANGE* rng) { @@ -712,7 +665,7 @@ dbs += rng->left / 2; // set the start site int n = rng->right - rng->left; if (n && rng->left % 2) { // odd start - int h = *dbs & 15; + int h = *dbs++ & 15; *ps = h? h + bias: N; if (isAmb(*ps++)) inex.ambs = 1; --n; @@ -809,7 +762,7 @@ char token[MAXL]; car(token, code); if (sname) sname->assign(token); - else sname = new Strlist(token); + else sname = new Strlist(&token[0], ""); record = dbf->findcode(token); if (!record) return(0); } diff -Nru spaln-2.4.7+dfsg/src/dbs.src spaln-2.4.9a+dfsg/src/dbs.src --- spaln-2.4.7+dfsg/src/dbs.src 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/dbs.src 2022-05-12 01:54:43.000000000 +0000 @@ -41,6 +41,9 @@ static const char LXP_EXT[] = ".lxp"; #if USE_ZLIB static const char SGZ_EXT[] = ".seq.gz"; +static const char IDZ_EXT[] = ".idx.gz"; +static const char ENZ_EXT[] = ".ent.gz"; +static const char ODZ_EXT[] = ".odr.gz"; static const char AGZ_EXT[] = ".bka.gz"; static const char NGZ_EXT[] = ".bkn.gz"; static const char PGZ_EXT[] = ".bkp.gz"; @@ -60,10 +63,6 @@ #define ALN_DBS "ALN_DBS" #endif -#ifndef ALN_SDBS -#define ALN_SDBS "ALN_SDBS" -#endif - static const char DBSID = '$'; static const int MAX_DBS = 2; static const int ENTLEN = 14; @@ -101,22 +100,6 @@ long dbnextentry(FILE* fd, char* ps) const; }; -struct DbsRec20 { - char entry[ENTLEN]; - CHAR master; - CHAR subset; - long recnbr; - long srcptr; - long seqptr; - long seqlen; -}; - -struct DbsRec25 { - long seqptr; - long seqlen; - size_t entptr; -}; - struct DbsRec { long seqptr; long seqlen; @@ -125,36 +108,46 @@ struct DbsGrp { long seqptr; - long recnbr; + INT recnbr; + INT entspc; }; class DbsDt { friend class MakeBlk; friend class SrchBlk; - char* pseq; - CHAR* dbsseq; - Strlist* grplbl; - bool comment; - DbsRec* recidx; - int* gsiidx; - DbsRec20* recidx20; - INT* recodr; - char* entry; - int* gsipool; + char* pseq = 0; + CHAR* dbsseq = 0; + Strlist* grplbl = 0; + bool comment = false; + DbsRec* recidx = 0; + int* gsiidx = 0; + INT* recodr = 0; + char* entry = 0; + int* gsipool = 0; DbsRec* bisearch(const char* key) const; void readseq(const char* fn); size_t readgrp(FILE* fgrp); - void readidx20(FILE* fd, const char* fn); - DbsRec* readidx(FILE* fidx, const char* fn); +#if USE_ZLIB +template + void readentry(file_t fent, const char* fn); +template + void readodr(file_t fodr, const char* fn); +template + DbsRec* readidx(file_t fidx, const char* fn); +#else void readentry(FILE* fent, const char* fn); void readodr(FILE* fodr, const char* fn); + DbsRec* readidx(FILE* fidx, const char* fn); +#endif // USE_ZLIB + public: -const char* dbsid; - SeqDb* curdb; - FILE* fseq; - DbsGrp* dbsgrp; - int numgrp; - INT numidx; +const char* dbsid = 0; + SeqDb* curdb = 0; + FILE* fseq = 0; + DbsGrp* dbsgrp = 0; + int numgrp = 0; + INT numidx = 0; + INT ent_space = 0; void clean(); DbsDt(int c = 0, int molc = UNKNOWN); DbsDt(const char* form); @@ -195,8 +188,63 @@ extern const char* finddbfpath(const char* fn, const char* ext = 0); extern SeqDb* whichdb(const char* ps); +#if USE_ZLIB +template +void DbsDt::readentry(file_t fent, const char* fn) +#else +void DbsDt::readentry(FILE* fent, const char* fn) +#endif +{ + if (ent_space == 0) { + fseek(fent, 0L, SEEK_END); + ent_space = (INT) ftell(fent); + } + if (ent_space == 0) { + int c; + while ((c = fgetc(fent)) != EOF) ++ent_space; + } + rewind(fent); + entry = new char[ent_space]; + if (fread(entry, sizeof(char), ent_space, fent) != ent_space) + fatal("%s: Bad entry file!", fn); + fclose(fent); +} + +#if USE_ZLIB +template +void DbsDt::readodr(file_t fodr, const char* fn) +#else +void DbsDt::readodr(FILE* fodr, const char* fn) +#endif +{ + recodr = new INT[numidx]; + rewind(fodr); + if (fread(recodr, sizeof(INT), numidx, fodr) != numidx) + fatal("%s: Bad order file!", fn); + fclose(fodr); +} + +#if USE_ZLIB +template +DbsRec* DbsDt::readidx(file_t fidx, const char* fn) +#else +DbsRec* DbsDt::readidx(FILE* fidx, const char* fn) +#endif +{ + recidx = new DbsRec[numidx]; + rewind(fidx); + if (fread(recidx, sizeof(DbsRec), numidx, fidx) != numidx) + fatal("%s: Index file may be corrupted!\n", fn); + fclose(fidx); + return (recidx); +} + +#if USE_ZLIB template char* dbs_header(char* str, file_t fin) +#else +char* dbs_header(char* str, FILE* fin) +#endif { char* ps = str; while (*ps == _LCOMM || space_digit(ps)) @@ -207,9 +255,10 @@ template SeqDb* whichdb(char* ps, file_t fd = 0) +#else +SeqDb* whichdb(char* ps, FILE* fd = 0) +#endif { if (fd) ps = dbs_header(ps, fd); return (whichdb(ps)); } - -#endif diff -Nru spaln-2.4.7+dfsg/src/eijunc.cc spaln-2.4.9a+dfsg/src/eijunc.cc --- spaln-2.4.7+dfsg/src/eijunc.cc 1970-01-01 00:00:00.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/eijunc.cc 2022-05-12 01:54:43.000000000 +0000 @@ -0,0 +1,238 @@ +/***************************************************************************** +* +* Read exon/intron sequences according to coordinates in xxx.eij +* +* Osamu Gotoh, ph.D. (-2001) +* Saitama Cancer Center Research Institute +* 818 Komuro, Ina-machi, Saitama 362-0806, Japan +* +* Osamu Gotoh, Ph.D. (2001-) +* National Institute of Advanced Industrial Science and Technology +* Computational Biology Research Center (CBRC) +* 2-41-6 Aomi, Koutou-ku, Tokyo 135-0064, Japan +* +* Osamu Gotoh, Ph.D. (2003-) +* Department of Intelligence Science and Technology +* Graduate School of Informatics, Kyoto University +* Yoshida Honmachi, Sakyo-ku, Kyoto 606-8501, Japan +* +* Copyright(c) Osamu Gotoh <> +*****************************************************************************/ + +#include "eijunc.h" + +#ifdef MAIN + +void usage() +{ + fputs("Usage:\n", stdout); + fputs("\teijunc -[3|5|b|c|e|i|j] [other otptions] xxx.eij\n", stdout); + fputs("Options:\n", stdout); + fputs("\t-3:\tacceptor\n", stdout); + fputs("\t-5:\tdonor\n", stdout); + fputs("\t-a:\tinclude sequences with ambiguous nucleotide\n", stdout); + fputs("\t-b:\tbranch point\n", stdout); + fputs("\t-c:\tcDNA\n", stdout); + fputs("\t-dS:\tgenome dbs\n", stdout); + fputs("\t-e:\texon\n", stdout); + fputs("\t-h:\tprint help\n", stdout); + fputs("\t-i:\tintron\n", stdout); + fputs("\t-j:\tjoin ends\n", stdout); + fputs("\t-oS:\toutput (stdout)\n", stdout); + fputs("\t-wN:\twing size (bp)\n", stdout); + exit (1); +} + +int main(int argc, const char* argv[]) +{ + const char* pn; + const char* gnm = 0; + const char* outf = 0; + int wing = 0; + Eijmode mode = DONOR; + FILE* fo = stdout; + const char* ext = 0; + bool cdna = false; + bool no_amb = true; + + while (--argc && **++argv == '-') { + switch (argv[0][1]) { + case 'd': pn = getarg(argc, argv); + gnm = pn? pn: ""; break; + case '3': mode = ACCPTR; ext = ".SP3"; break; + case '5': mode = DONOR; ext = ".SP5"; break; + case 'a': no_amb = false; break; + case 'b': mode = BRANCH; break; + case 'c': cdna = true; + case 'e': mode = EXON; break; + case 'h': usage(); + case 'i': mode = INTRON; break; + case 'j': mode = JOIN; ext = ".SPb"; break; + case 'o': outf = getarg(argc, argv); + if (outf) fo = fopen(outf, "w"); + if (!fo) fatal("Can't write to %s\n", outf); + break; + case 's': pn = getarg(argc, argv); + if (pn) setdfn(pn); break; + case 'w': pn = getarg(argc, argv); + if (pn) wing = atoi(pn); break; + default: break; + } + } + + EiJuncSeq eijseq(mode, *argv, gnm, wing); + int nrcd = 0; + setup_output(4); + do { + Seq* sd = eijseq.nextseq(); + if (!sd || (no_amb && sd->inex.ambs)) continue; + if (!ext) { + if (!cdna) + fprintf(fo, ">%s.%d\n", sd->sqname(), ++nrcd); + else if (eijseq.new_entry) + fprintf(fo, ">%s %d\n", eijseq.transcript,++nrcd); + sd->typeseq(fo); + } else if (sd->len == eijseq.width) ++nrcd; + } while (eijseq.goahead()); + + if (ext) { + if (!outf) { + strcpy(eijseq.genspc + 8, ext); + outf = eijseq.genspc; + } + fprintf(fo, ">%s [%d]\n\n", outf, nrcd); + eijseq.reset(); + nrcd = 0; + while (eijseq.goahead()) { + Seq* sd = eijseq.nextseq(); + if (!sd || sd->inex.ambs || sd->len != eijseq.width) continue; + fputs(" 1 ", fo); + sd->typeseq(fo, true); + fprintf(fo, "| %s.%d\n", sd->sqname(), ++nrcd); + } + } + return (0); +} + +#endif // MAIN + +EiJuncSeq::EiJuncSeq(Eijmode mode, const char* eij, const char* gnmdb, int w) + : eori(mode), wing(w) +{ + if (eij) { + eijfd = fopen(eij, "r"); + if (!eijfd) fatal(not_found, eij); + } + + switch (eori) { + case INTRON: case EXON: setlpw(60); break; + case BRANCH: setlpw(50); + wms1 = (wing? wing: def_brch) - 1; wing = 0; break; + case JOIN: + if (!wing) wing = 30; + wms1 = wing - 1; + setlpw(width = 4 * wing); + break; + default: + if (!wing) wing = def_wing; + wms1 = wing - 1; + setlpw(width = 2 * wing); + break; + } + +// prepare dbs and background + + *str = *transcript = '\0'; + while (fgets(str, MAXL, eijfd) && (*str == '#' || *str == 0)); + char genome[MAXL]; + if (gnmdb) strcpy(genome, gnmdb); + if (!gnmdb || !*genome) { + strncpy(genome, str, 8); + strcpy(genome + 8, "_g"); + } + *genome = tolower(*genome); + strncpy(genspc, genome, 8); + dbs = new DbsDt(genome); + if (gnmdb && !dbs) fatal(no_file, genome); + eijseq = new Seq(1); +} + +Seq* EiJuncSeq::nextseq() +{ +static const char seqargfrm[] = "%s %d %d %c"; +static const char seqargfrm2[] = "%s %d %d %d %d %c"; + Strlist stl(str, stddelim); + new_entry = strcmp(transcript, stl[Ref_Column]); + if (new_entry) strcpy(transcript, stl[Ref_Column]); + char* chr = stl[0]; + int l = atoi(stl[2]); + int r = atoi(stl[3]); + bool rvs = *stl[1] == '-'; + char sgn = rvs? '<': ' '; + if (rvs) std::swap(l, r); + eijseq->tlen = l; + int ll, lr, rl, rr; + if (rvs && eori != INTRON) { + ll = r - wms1; + lr = r + wing; + rl = l - wing; + rr = l + wms1; + if (eori == BRANCH && rr > r) rr = r; + if (rl < 0) return (0); + } else { + ll = l - wing; + lr = l + wms1; + rl = r - wms1; + if (eori == BRANCH && rl < l) rl = l; + rr = r + wing; + if (lr < 0) return (0); + } + if (eori == EXON) { + if (new_entry) { + if (rvs) { + ll = atoi(stl[6]); + rr = atoi(stl[3]) - 1; + } else { + ll = atoi(stl[5]); + rr = atoi(stl[2]) - 1; + } + } else { + if (rvs) { + ll = atoi(stl[2]) + 1; + rr = atoi(stl[5]); + } else { + ll = atoi(stl[3]) + 1; + rr = atoi(stl[6]); + } + } + } + + char seqarg[MAXL] = "$"; + char sid[MAXL]; + char* seqid = dbs? seqarg + 1: seqarg; + switch (eori) { + case DONOR: + sprintf(seqid, seqargfrm, chr, ll, lr, sgn); + sprintf(sid, "%s%d", chr, l); + break; + case ACCPTR: case BRANCH: + sprintf(seqid, seqargfrm, chr, rl, rr, sgn); + sprintf(sid, "%s%d", chr, r); + break; + case JOIN: + if (rvs) + sprintf(seqid, seqargfrm2, chr, rl, rr, ll, lr, sgn); + else + sprintf(seqid, seqargfrm2, chr, ll, lr, rl, rr, sgn); + sprintf(sid, "%s%d.%d", chr, l, r); + break; + case INTRON: case EXON: + sprintf(seqid, seqargfrm, chr, ll, rr, sgn); + sprintf(sid, "%s%d.%d", chr, l, r); + break; + } + Seq* tmp = eijseq->getseq(seqarg, dbs); + if (tmp) tmp->tlen = rl - l; // from 5' end to left + return (tmp); +} + diff -Nru spaln-2.4.7+dfsg/src/eijunc.h spaln-2.4.9a+dfsg/src/eijunc.h --- spaln-2.4.7+dfsg/src/eijunc.h 1970-01-01 00:00:00.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/eijunc.h 2022-05-12 01:54:43.000000000 +0000 @@ -0,0 +1,67 @@ +/***************************************************************************** +* +* header to read exon/intron sequences indicated in xxx.eij +* +* Osamu Gotoh, ph.D. (-2001) +* Saitama Cancer Center Research Institute +* 818 Komuro, Ina-machi, Saitama 362-0806, Japan +* +* Osamu Gotoh, Ph.D. (2001-) +* National Institute of Advanced Industrial Science and Technology +* Computational Biology Research Center (CBRC) +* 2-41-6 Aomi, Koutou-ku, Tokyo 135-0064, Japan +* +* Osamu Gotoh, Ph.D. (2003-) +* Department of Intelligence Science and Technology +* Graduate School of Informatics, Kyoto University +* Yoshida Honmachi, Sakyo-ku, Kyoto 606-8501, Japan +* +* Copyright(c) Osamu Gotoh <> +*****************************************************************************/ + +#ifndef _EIJUNC_H_ +#define _EIJUNC_H_ + +#include +#include "seq.h" + +enum Eijmode {DONOR, ACCPTR, JOIN, INTRON, EXON, BRANCH}; + +static const int Ref_Column = 7; +static const int def_wing = 25; +static const int def_brch = 120; + +class EiJuncSeq { + Eijmode eori = INTRON; + DbsDt* dbs = 0; + Seq* eijseq = 0; + FILE* eijfd = 0; + char str[MAXL]; + int wing = 0; + int wms1 = 0; +public: + int width = 0; + bool new_entry = false; + char genspc[MAXL]; + char transcript[MAXL]; + Seq* nextseq(); + void typeseq(FILE* fo) {eijseq->typeseq(fo);} + void reset() { + new_entry = false; + if (eijfd) rewind(eijfd); + *str = *transcript = '\0'; + } + char* goahead() { + if (eori == EXON && new_entry) return (str); + return (fgets(str, MAXL, eijfd)); + } + EiJuncSeq(Eijmode mode, const char* eij, + const char* genome = 0, int w = 0); + ~EiJuncSeq() { + delete eijseq; + if (eijfd) fclose(eijfd); + delete dbs; + } +}; + +#endif // _EIJUNC_H_ diff -Nru spaln-2.4.7+dfsg/src/fwd2h1.cc spaln-2.4.9a+dfsg/src/fwd2h1.cc --- spaln-2.4.7+dfsg/src/fwd2h1.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/fwd2h1.cc 2022-05-12 01:54:43.000000000 +0000 @@ -624,7 +624,7 @@ SKL* wsk = gsi->skl; int num = (wsk++)->n; SpJunc* spjcs = new SpJunc(b, pwd); - VTYPE h = 0; // + VTYPE h = 0; VTYPE hi = NEVSEL; // intron-containing path VTYPE ha = 0; VTYPE hb = 0; @@ -639,11 +639,12 @@ int preint = 0; int phs = 0; int nb = 0; + int psp = 0; // post splicing position int maxexon = IntronPrm.rlmt; EISCR rbuf; FSTAT* fst = &gsi->fstat; FSTAT pst; - Eijnc* eijnc = gsi->eijnc = new Eijnc(); + Eijnc* eijnc = gsi->eijnc = new Eijnc(true); Cigar* cigar = gsi->cigar = 0; Vulgar* vlgar = gsi->vlgar = 0; switch (algmode.nsa) { @@ -655,20 +656,22 @@ vclear(&pst); vclear(&rbuf); gsi->noeij = 0; -const CHAR* cs = b->at(wsk[num - 1].n - 2); - int termcodon = (*cs == TRM || *cs == TRM2); - cs = 0; - if (termcodon) wsk[num-1].n -= 3; + int termcodon = 0; + if (OutPrm.supTcodon) { +const CHAR* cs = b->at(wsk[num - 1].n - 2); + termcodon = (*cs == TRM || *cs == TRM2); + if (termcodon) wsk[num-1].n -= 3; + } if (wsk[1].n == wsk->n && b->inex.exgl) {++wsk; --num;} int m = wsk->m; int n = wsk->n; const CHAR* as = a->at(m); const CHAR* bs = b->at(n); +const CHAR* cs = 0; const EXIN* bb = b->exin->score(n); const EXIN* bb_last = b->exin->score(b->right); PfqItr api(a, m); - int api_size = api.size(); - bool usespb = api_size && use_spb(); +const bool usespb = api.size() && use_spb(); if ((algmode.lcl & (16 + 1)) && bb[1].sigS > h) h = bb[1].sigS; if ((algmode.lcl & (16 + 4)) && bb->sig3 > h) h = bb->sig3; rbuf.left = n; @@ -713,14 +716,18 @@ } hi = NEVSEL; if (insert) { // post-intron gap - if (term && IsTerm(bs[insert - 1])) insert -= 3; + if (term && IsTerm(bs[-1])) insert -= 3; if (cigar) cigar->push('D', insert); phs = insert % 3; insert -= phs; if (!((a->inex.exgl && m == a->left) || (a->inex.exgr && m == a->right))) fst->gap += gop; - fst->unp += insert; + for (int j = 0; j < insert; j += 3, psp += 3) { + if (eijnc) eijnc->shift(rbuf, *fst, + psp / 3 == alprm2.jneibr); + fst->unp += 3; + } if (phs) { // insertion frame shift rbuf.right = n - phs; rbuf.rright = m; @@ -743,7 +750,6 @@ if (!(b->inex.exgl && n == b->left)) { h += pwd->GapPenalty3(deletn); fst->gap += 1; - fst->unp += deletn; } as += deletn / 3; if ((phs = deletn % 3)) { // deletion frame shift @@ -773,7 +779,9 @@ if (vlgar) vlgar->push('M', d / 3, d); n += d; m += d / 3; - for ( ; d > 2; d -= 3, ++as, bs += 3, bb += 3) { + for ( ; d > 2; d -= 3, ++as, bs += 3, bb += 3, psp += 3) { + if (eijnc) eijnc->shift(rbuf, *fst, + psp / 3 == alprm2.jneibr); const CHAR* gs = (cs? cs: bs) + 1; hvl = pwd->sim2(as, gs); fst->val += hvl; @@ -788,6 +796,11 @@ cs = 0; deletn += i; if (cigar) cigar->push('I', i); + for (int j = 0; j < i; j += 3, psp += 3) { + if (eijnc) eijnc->shift(rbuf, *fst, + psp / 3 == alprm2.jneibr); + fst->unp += 3; + } } else if (i < 0) { const EXIN* b3 = bb + (i = -i); if (hi <= NEVSEL && i >= IntronPrm.minl && wsk->n < b->right) { // intron? @@ -849,11 +862,11 @@ rbuf.escr = h + pwd->GapPenalty3(insert); ha = rbuf.escr + xi - sig3; rbuf.escr += sig5 - hb; - rbuf.mch = (int) (fst->mch - pst.mch); - rbuf.mmc = (int) (fst->mmc - pst.mmc); - rbuf.gap = (int) (fst->gap - pst.gap); - rbuf.unp = (int) (fst->unp - pst.unp) / 3; - pst = *fst; + if (eijnc) { + eijnc->store(rbuf, *fst, pst, psp < alprm2.jneibr); + pst = *fst; + psp = 0; + } } } else ++gop; if (i < maxexon) h += SumCodePot(bb, i, 0, pwd); @@ -882,10 +895,7 @@ rbuf.sig5 = sig5; rbuf.right = n; rbuf.rright = m; - rbuf.mch = (int) (fst->mch - pst.mch); - rbuf.mmc = (int) (fst->mmc - pst.mmc); - rbuf.gap = (int) (fst->gap - pst.gap); - rbuf.unp = (int) (fst->unp - pst.unp) / 3; + eijnc->store(rbuf, *fst, pst, n - rbuf.left <= alprm2.jneibr); eijnc->push(&rbuf); rbuf.left = endrng.left; rbuf.right = endrng.right; diff -Nru spaln-2.4.7+dfsg/src/fwd2s1.cc spaln-2.4.9a+dfsg/src/fwd2s1.cc --- spaln-2.4.7+dfsg/src/fwd2s1.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/fwd2s1.cc 2022-05-12 01:54:43.000000000 +0000 @@ -455,6 +455,7 @@ EISCR rbuf; FSTAT* fst = &gsi->fstat; FSTAT pst; + int psp = 0; // post splicing position Eijnc* eijnc = gsi->eijnc = new Eijnc(true); Cigar* cigar = gsi->cigar = 0; Vulgar* vlgar = gsi->vlgar = 0; @@ -534,7 +535,6 @@ if (!(b->inex.exgl && n == b->left)) { h += pwd->GapPenalty(deletn); fst->gap += 1; - fst->unp += deletn; } as += deletn; deletn = 0; @@ -551,15 +551,17 @@ VTYPE x = 0; #if CigarM for ( ; d; --d, ++as, ++bs, ++n) { + if (eijnc) eijnc->shift(rbuf, *fst, + ++pst == alprm2.jneibr); x += pwd->sim2(as, bs); if (*as == *bs) ++fst->mch; else ++fst->mmc; - if (eijnc) eijnc->shift(rbuf, *fst, pst, - n - rbuf.left == alprm2.jneibr); } #else int run = 0; for ( ; d; --d, ++as, ++bs, ++n) { + if (eijnc) eijnc->shift(rbuf, *fst, + psp++ == alprm2.jneibr); x += pwd->sim2(as, bs); if (*as == *bs) { if (run < 0) { @@ -576,8 +578,6 @@ ++fst->mmc; --run; } - if (eijnc) eijnc->shift(rbuf, *fst, pst, - n - rbuf.left == alprm2.jneibr); if (samfm) { if (run > 0) samfm->push('=', run); else if (run < 0) samfm->push('X', -run); @@ -592,6 +592,11 @@ if (cigar) cigar->push('I', i); if (samfm) samfm->push('I', i); if (vlgar) vlgar->push('G', i, 0); + for (int j = 0; j < i; ++j) { + if (eijnc) + eijnc->shift(rbuf, *fst, psp++ == alprm2.jneibr); + ++fst->unp; + } } else if (i < 0) { const int n3 = n + (i = -i); if (algmode.lsg && i > IntronPrm.minl) { @@ -611,15 +616,16 @@ ha = h + xi - sig3; if (eijnc) { eijnc->store(rbuf, *fst, pst, - n - rbuf.left < alprm2.jneibr); + psp < alprm2.jneibr); pst = *fst; + psp = 0; } } else if (!a->inex.exgl || m != a->left) { if (!insert) fst->gap += 1; for (int j = 0; j < i; ++j, ++n) { + if (eijnc) eijnc->shift(rbuf, *fst, + psp++ == alprm2.jneibr); ++fst->unp; - if (eijnc) eijnc->shift(rbuf, *fst, pst, - n - rbuf.left == alprm2.jneibr); } } bs += i; @@ -651,7 +657,6 @@ rbuf.sig5 = 0; rbuf.right = n; rbuf.rright = m; - eijnc->unshift(); eijnc->store(rbuf, *fst, pst, n - rbuf.left <= alprm2.jneibr); eijnc->push(&rbuf); rbuf.left = endrng.left; diff -Nru spaln-2.4.7+dfsg/src/gsinfo.cc spaln-2.4.9a+dfsg/src/gsinfo.cc --- spaln-2.4.7+dfsg/src/gsinfo.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/gsinfo.cc 2022-05-12 01:54:43.000000000 +0000 @@ -65,7 +65,7 @@ { PfqItr api(sd, sd->left); if (api.size()) { - cip_hash = new Dhash(api.pfqnum); + cip_hash = new Dhash(api.pfqnum, 0); for ( ; !api.end(); ++api) { #if USE_WEIGHT VTYPE val = SpbFact * api.wfq->dns; @@ -1224,12 +1224,12 @@ } } -Eijnc::Eijnc(bool que) +Eijnc::Eijnc(bool que) : qsize(que? alprm2.jneibr: 0) { emfd = new Mfile(sizeof(EISCR)); if (que) { - fstque = new FSTAT[alprm2.jneibr]; - vclear(fstque, alprm2.jneibr); + fstque = new FSTAT[qsize]; + vclear(fstque, qsize); } } @@ -1249,18 +1249,16 @@ rbuf.mmc3 = (int) (now.mmc - fstque[q].mmc); rbuf.unp3 = (int) (now.unp - fstque[q].unp); rbuf.gap3 = (int) (now.gap - fstque[q].gap); - vclear(fstque, alprm2.jneibr); - q = 0; } -void Eijnc::shift(EISCR& rbuf, FSTAT& now, FSTAT& prv, bool nearjnc) +void Eijnc::shift(EISCR& rbuf, FSTAT& now, bool nearjnc) { if (nearjnc) { - rbuf.mch5 = (int) (now.mch - prv.mch); - rbuf.mmc5 = (int) (now.mmc - prv.mmc); - rbuf.unp5 = (int) (now.unp - prv.unp); - rbuf.gap5 = (int) (now.gap - prv.gap); + rbuf.mch5 = (int) (now.mch - fstque[q].mch); + rbuf.mmc5 = (int) (now.mmc - fstque[q].mmc); + rbuf.unp5 = (int) (now.unp - fstque[q].unp); + rbuf.gap5 = (int) (now.gap - fstque[q].gap); } fstque[q] = now; - if (++q == alprm2.jneibr) q = 0; + if (++q == qsize) q = 0; } diff -Nru spaln-2.4.7+dfsg/src/gsinfo.h spaln-2.4.9a+dfsg/src/gsinfo.h --- spaln-2.4.7+dfsg/src/gsinfo.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/gsinfo.h 2022-05-12 01:54:43.000000000 +0000 @@ -340,6 +340,7 @@ int num = 0; Mfile* emfd = 0; EISCR* rec = 0; +const int qsize = 0; FSTAT* fstque = 0; int q = 0; public: @@ -358,7 +359,7 @@ int refleft() const {return rec? rec->rleft: 0;} int refright() const {return rec? rec[num - 2].rright: 0;} void store(EISCR& rbuf, FSTAT& now, FSTAT& prv, bool nearjnc); - void shift(EISCR& rbuf, FSTAT& now, FSTAT& prv, bool nearjnc); + void shift(EISCR& rbuf, FSTAT& now, bool nearjnc); void unshift() {if (q) --q; else q = alprm2.jneibr - 1;} }; diff -Nru spaln-2.4.7+dfsg/src/iolib.cc spaln-2.4.9a+dfsg/src/iolib.cc --- spaln-2.4.7+dfsg/src/iolib.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/iolib.cc 2022-05-12 01:54:43.000000000 +0000 @@ -44,8 +44,14 @@ const char* read_error = "\'%s\' read error!\n"; const char* write_error = "\'%s\' write error!\n"; const char* gz_unsupport = "compressed %s is not supported!\n"; +const char* bad_file = "Bad binary file : %s !\n"; +const char* gnm2tab = "gnm2tab"; +const char* esc_code = "\x1b["; +const char* font_end = ""; + + //lpw blk Nout Ncolony eij ovl fnm rm trim lg lbl dsc odr spj olr color self term +OUTPRM OutPrm = {60, 0, 16, 1, 4, 10, 5, 0, 1, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}; -static const char* gnm2tab = "gnm2tab"; static const char* font_tag[3] = { "", "", @@ -60,7 +66,7 @@ static void display(const char* s, va_list args); #if LEAKTRACE -LeakTrace leak_trace; +//leaktracer::LeakTrace leak_trace; #endif char* fgets_wocr(char* str, INT maxl, FILE* fd) @@ -112,14 +118,14 @@ char const* sg[5]; char const* sd[5]; char buf[LINE_MAX]; - int i, l; if (fnam == result) fnam = strcpy(buf, fnam); make_fn(fnam, sg); make_fn(defn, sd); *result = '\0'; - for (i = 0; i < 4; i++) { - if ((l = sg[i+1] - sg[i])) + for (int i = 0; i < 4; ++i) { +const int l = sg[i+1] - sg[i]; + if (l) strncat(result, sg[i], l); else strncat(result, sd[i], sd[i+1] - sd[i]); @@ -141,25 +147,26 @@ else while (*fname++ != ':'); } - while (strchr(fname, PATHDELM)) + while (strchr(fname, PATHDELM)) { if (strchr(where, 'p')) while ((*part++ = *fname++) != PATHDELM); else while (*fname++ != PATHDELM); - if (strchr(fname, '.')) { + } +const char* dot = strrchr(fname, '.'); + if (dot) { if (strchr(where, 'b')) - while (*fname != '.') *part++ = *fname++; + while (fname != dot) *part++ = *fname++; else - while (*fname != '.') fname++; + while (fname != dot) fname++; if (strchr(where, 'e')) { while ((*part++ = *fname++)); } else *part = '\0'; - } - else { - if (strchr(where, 'b')) { + } else { + if (strchr(where, 'b')) while ((*part++ = *fname++)); - } else *part = '\0'; + else *part = '\0'; } return (top); } @@ -174,6 +181,19 @@ return (fs); } +// sizeof(buf) must be larger than strlen(fn) + strlen(ext) + +char* add_ext(const char* fn, const char* ext, char* buf) +{ + strcpy(buf, fn); + if (!ext || !*ext) return (buf); + if (*ext == '.') ++ext; + char* dot = strrchr(buf, '.'); + if (dot && !strcmp(dot + 1, ext)) return (buf); + if (!dot || strlen(dot) > 1) strcat(buf, "."); + return (strcat(buf, ext)); +} + FILE* fopenpbe(const char* path, const char* name, const char* extent, const char* opt, int lvl, char* str) { @@ -209,6 +229,19 @@ return (0); } +FILE* wfopen(const char* name, const char* mode) +{ + if (is_file(name)) { + int c = 'y'; + if (OutPrm.overwrite == 0) + c = progetc("Overwrite existing file \"%s\"? [y/n] ", name); + else if (OutPrm.overwrite > 1) + c = 'n'; + if (tolower(c) != 'y') return (0); + } + return (fopen(name, mode)); +} + FILE* qopen(const char* dfname, const char* mode) { char fname[LINE_MAX]; @@ -294,7 +327,6 @@ if (fd) return (fd); } } - fatal("%s not found! Confirm whether ALN_TAB is correctly set!\n", fname); return (0); } @@ -406,7 +438,6 @@ * *********************************************************************/ - static INTERACT crt = {1, 0}; void setprompt(int prom, int ech) @@ -565,17 +596,6 @@ } } -void fatal(const char* format,...) -{ - va_list args; - - va_start(args, format); - (void) vfprintf(stderr, format, args); - putc('\n', stderr); - va_end(args); - exit(1); -} - void prompt(const char* format,...) { va_list args; @@ -831,6 +851,22 @@ #if USE_ZLIB +gzFile wgzopen(const char* name, const char* mode) +{ + char str[LINE_MAX]; + strcpy(str, name); + if (!is_gz(name)) strcat(str, gz_ext); + if (is_file(str)) { + int c = 'y'; + if (OutPrm.overwrite == 0) + c = progetc("Overwrite existing file \"%s\"? [y/n] ", str); + else if (OutPrm.overwrite > 1) + c = 'n'; + if (tolower(c) != 'y') return (0); + } + return (gzopen(str, mode)); +} + gzFile gzopenpbe(const char* path, const char* name, const char* extent, const char* opt, int lvl, char* str) { @@ -851,12 +887,11 @@ if (ps > str && ps[-1] != PATHDELM) *ps++ = PATHDELM; *ps = '\0'; + strcat(str, name); if (extent) { - partfnam(str + strlen(str), name, "b"); if (*extent && *extent != '.') strcat(str, "."); strcat(str, extent); - } else - strcat(str, name); + } if ((fd = gzopen(str, opt))) return (fd); } while (*pt); @@ -866,5 +901,76 @@ return (0); } -#endif +size_t lgzread(gzFile gzfd, char* buf, size_t sz) +{ + char* tq = buf + sz; + for (char* sq = buf; sq < tq; sq += INT_MAX) { +const int n = std::min(long(INT_MAX), tq - sq); + int actural_read = gzread(gzfd, sq, n); + if (actural_read <= 0) { + int z_errnum = 0; + fatal("gz_read_error %s!\n", gzerror(gzfd, &z_errnum)); + } + } + return (sz); +} + +size_t lgzwrite(gzFile gzfd, const char* buf, size_t sz) +{ +const char* tq = buf + sz; + for (const char* sq = buf; sq < tq; sq += INT_MAX) { +const int n = std::min(long(INT_MAX), tq - sq); + int actural_write = gzwrite(gzfd, sq, n); + if (actural_write <= 0) { + int z_errnum = 0; + fatal("gz_write_error %s!\n", gzerror(gzfd, &z_errnum)); + } + } + return (sz); +} + +gzFile Ftable::gzopen(const char* fname, const char* mode) +{ + char str[LINE_MAX]; + gzFile fd = 0; +const char* ext = is_gz(fname)? 0: gz_ext; + for (int i = 0; i < n_tabpath; ++i) { + if (tabpath[i]) { + strcpy(str, tabpath[i]); + if (subdir) { + strcat(str, "/"); + strcat(str, subdir); + fd = gzopenpbe(str, fname, ext, mode, -1); + if (fd) return (fd); + } + fd = gzopenpbe(tabpath[i], fname, ext, mode, -1); + if (fd) return (fd); + } + } + fatal("%s not found! Confirm whether ALN_TAB is correctly set!\n", fname); + return (0); +} + +gzFile Ftable::gzopen(const char* fname, const char* envpath, const char* defpath) +{ + gzFile fd = this->gzopen(fname, "r"); + if (fd) return (fd); +const char* ext = is_gz(fname)? 0: gz_ext; + + char str[LINE_MAX]; + char* path = envpath? getenv(envpath): 0; + if (path) { + strcpy(str, path); + fd = gzopenpbe(str, fname, ext, "r", -1); + if (fd) return (fd); + } + if (defpath) { + strcpy(str, defpath); + fd = gzopenpbe(str, fname, ext, "r", -1); + if (fd) return (fd); + } + return (0); +} + +#endif // USE_ZLIB diff -Nru spaln-2.4.7+dfsg/src/iolib.src spaln-2.4.9a+dfsg/src/iolib.src --- spaln-2.4.7+dfsg/src/iolib.src 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/iolib.src 2022-05-12 01:54:43.000000000 +0000 @@ -29,7 +29,6 @@ #define _IOLIB_ #include "adddef.h" -#include #include #include @@ -70,8 +69,11 @@ static const int n_tabpath = 3; static const float FSILENT = FQUERY + 1.L; static const float FPOPUP = FQUERY + 2.L; -static const char* esc_code = "\x1b["; -static const char* font_end = ""; + +extern const char* gnm2tab; +extern const char* bad_file; +extern const char* esc_code; +extern const char* font_end; enum CharColorCode { CMC_BLACK = '0', CMC_RED, CMC_GREEN, CMC_YELLOW, @@ -87,6 +89,37 @@ INT echo : 1; }; +struct OUTPRM { + int lpw; + INT BlkSz; + int NoOut; + INT MaxOut; + INT MaxOut2; // intermediate state + int EijMergin; + int AllowdOverlap; +const char* out_file; + INT RemoveTmp: 1; + INT trimend: 1; + INT SkipLongGap: 2; // 0: don't skip, 1: skip, 3: run time + INT deflbl: 2; // 0: sname, 1: fname, 2: tally, 3: new + INT fastanno: 1; // 0: add annotation in fasta output + INT descrp: 1; // output description + INT sortodr: 2; // 0: input, 1: bytree, 2: shorter, 3: longer + INT spjinf: 1; + INT olrsum: 1; + INT ColorEij: 2; // color mark intron position + INT supself: 2; // suppress result of self comparison + INT noseqline: 1; // additional prefix line in m-fasta + INT asterisk: 1; // add asterisk as the eos mark + INT trimendgap: 1; // suppress tail gap characters + INT taxoncode: 3; // add taxinomic code field X in gnm2tab + INT printweight: 1; // output seq weights in MSA + INT gzipped: 1; // gzipped output for spaln + INT debug: 1; // print debug lines + INT supTcodon: 1; // don't show termination codon + INT overwrite: 2; // overwrite existing file +}; + class EscCharCtl { FILE* fd; void putctl(int maincode, int subcode = 0, int attr = 0); @@ -137,8 +170,8 @@ char* subdir; Gnm2tab* g2t; public: - Ftable() : subdir(0), g2t(0) { - tabpath[0] = "."; + Ftable(const bool cur_dir = true) : subdir(0), g2t(0) { + tabpath[0] = cur_dir? ".": 0; tabpath[1] = getenv(ALN_TAB); tabpath[2] = DEF_TAB_PATH; } @@ -156,6 +189,10 @@ if (!g2t) g2t = new Gnm2tab(field); return (g2t); } +#if USE_ZLIB + gzFile gzopen(const char* fname, const char* mode); + gzFile gzopen(const char* fname, const char* envpath, const char* defpath); +#endif }; #ifdef BSD @@ -164,6 +201,7 @@ #endif #endif +extern OUTPRM OutPrm; extern Ftable ftable; extern char* fgets_wocr(char* str, INT maxl, FILE* fd); extern char* topath( char* res, const char* org); @@ -171,6 +209,7 @@ extern char* partfnam( char* part, const char* fname, const char* where); extern FILE* fopenpbe(const char* path, const char* name, const char* extent, const char* opt, int lvl = 0, char* buf = 0); +extern FILE* wfopen(const char* name, const char* mode); extern FILE* qopen(const char* dfname, const char* mode); extern FILE* qout(const char* dfn); extern void qclose(); @@ -179,7 +218,6 @@ extern long diskspc(char* drive); extern void setprompt(int prom, int ech); extern int getprompt(); -extern void fatal(const char* format,...); extern void prompt(const char* s,...); extern int progetc(const char* frmt,...); extern char* progets(char* str, const char* frmt,...); @@ -191,6 +229,7 @@ extern size_t file_size(FILE* fd); extern FILE* out_fd; extern const char* gz_unsupport; +extern char* add_ext(const char* fn, const char* ext, char* buf); inline size_t file_size(const char* fn) { struct stat buf; @@ -225,13 +264,14 @@ } #if USE_ZLIB +extern gzFile wgzopen(const char* name, const char* mode); extern gzFile gzopenpbe(const char* path, const char* name, const char* extent, const char* opt, int lvl = 0, char* buf = 0); #endif inline FILE* outfd(const char* fn) { - if (!fn || !(out_fd = fopen(fn, "w"))) out_fd = stdout; + if (!fn || !(out_fd = wfopen(fn, "w"))) out_fd = stdout; return (out_fd); } diff -Nru spaln-2.4.7+dfsg/src/Makefile.in spaln-2.4.9a+dfsg/src/Makefile.in --- spaln-2.4.7+dfsg/src/Makefile.in 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/Makefile.in 2022-05-12 01:54:43.000000000 +0000 @@ -103,6 +103,7 @@ vmf.o: vmf.cc vmf.h $(STDH) dbs.o: dbs.cc dbs.h $(STDH) gaps.o: gaps.cc gaps.h seq.h +eijunc.o: eijunc.cc eijunc.h seq.h divseq.o: divseq.cc aln.h fwd2b1.o: fwd2b1.cc aln.h seq.h codepot.h fwd2h1.o: fwd2h1.cc aln.h seq.h codepot.h @@ -111,10 +112,10 @@ boyer_moore.o: boyer_moore.h boyer_moore.cc sblib.a: aln2.o dbs.o gaps.o codepot.o divseq.o gsinfo.o \ - fwd2b1.o fwd2d1.o fwd2h1.o fwd2s1.o bitpat.o \ + fwd2b1.o fwd2d1.o fwd2h1.o fwd2s1.o bitpat.o eijunc.o \ seq.o simmtx.o sqpr.o utilseq.o vmf.o wln.o boyer_moore.o $(AR) sblib.a aln2.o dbs.o gaps.o codepot.o divseq.o gsinfo.o \ - fwd2b1.o fwd2d1.o fwd2h1.o fwd2s1.o bitpat.o \ + fwd2b1.o fwd2d1.o fwd2h1.o fwd2s1.o bitpat.o eijunc.o \ seq.o simmtx.o sqpr.o utilseq.o vmf.o wln.o boyer_moore.o $(RANLIB) sblib.a diff -Nru spaln-2.4.7+dfsg/src/seq.cc spaln-2.4.9a+dfsg/src/seq.cc --- spaln-2.4.7+dfsg/src/seq.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/seq.cc 2022-05-12 01:54:43.000000000 +0000 @@ -1558,18 +1558,18 @@ return (dst); } -void Seq::estimate_len(FILE* fd, const int& nos) +void Seq::estimate_len(FILE* fd, const int& nos, const SeqDb* dbf) { long fpos = ftell(fd); if (nos == 1) { // FASTA single sequence char str[MAXL]; while (fgets(str, MAXL, fd)) { if (*str == _NHEAD || *str == _CHEAD || *str == _EOS) { - flush_line(fd); + if ((strlen(str) + 1) == MAXL) flush_line(fd); break; } if (*str == _COMM || *str == _LCOMM || *str == _WGHT) { - flush_line(fd); + if ((strlen(str) + 1) == MAXL) flush_line(fd); continue; } len += strlen(str) - 1; @@ -1583,30 +1583,26 @@ } #if USE_ZLIB -void Seq::estimate_len(gzFile gzfd, const int& nos) +void Seq::estimate_len(gzFile gzfd, const int& nos, const SeqDb* dbf) { long fpos = ftell(gzfd); char str[MAXL]; - int many = 1; while (fgets(str, MAXL, gzfd)) { - if (nos == 1 && - (*str == _NHEAD || *str == _CHEAD || *str == _EOS)) { - flush_line(gzfd); - break; - } - if (*str == _NHEAD || *str == _CHEAD) { - flush_line(gzfd); - ++many; + if (*str == _NHEAD || *str == _CHEAD || *str == _EOS || + *str == _COMM || *str == _LCOMM || *str == _WGHT) { + if ((strlen(str) + 1) == MAXL) flush_line(gzfd); continue; } - if (*str == _COMM || *str == _LCOMM || *str == _WGHT || *str == _EOS) { - flush_line(gzfd); - continue; + if (nos == 1 || (dbf && dbf->FormID == FASTA)) { + len += strlen(str) - 1; + } else { + Strlist stl(str, stddelim); + if (stl.size() > 1) + len += strlen(stl[1]) - 1; } - len += strlen(str) - 1; } area_ = len; - len /= many; + len /= nos; fseek(gzfd, fpos, SEEK_SET); } #endif @@ -1647,7 +1643,7 @@ sprintf(input, "std%d", sid); spath = strrealloc(spath, input); if (sname) sname->assign(input); - else sname = new Strlist(input); + else sname = new Strlist(&input[0], ""); } else { #if USE_ZLIB fd = openseq(str, &gzfd); @@ -2041,7 +2037,7 @@ inex.molc = molc; sprintf(name, "RAND%d", sid); if (sname) sname->assign(name); - else sname = new Strlist(name); + else sname = new Strlist(&name[0], ""); CHAR* s = at(0); RandNumGen rn(pcmp, isprotein()? 20: 4); for (int n = 0; n < len; ++n) diff -Nru spaln-2.4.7+dfsg/src/seq.h spaln-2.4.9a+dfsg/src/seq.h --- spaln-2.4.7+dfsg/src/seq.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/seq.h 2022-05-12 01:54:43.000000000 +0000 @@ -184,36 +184,6 @@ extern int cmpPfqPos(const PFQ* a, const PFQ* b); extern int en_code(int c, const SEQ_CODE* code); -struct OUTPRM { - int lpw; - INT BlkSz; - int NoOut; - INT MaxOut; - INT MaxOut2; // intermediate state - int EijMergin; - int AllowdOverlap; -const char* out_file; - INT RemoveTmp: 1; - INT trimend: 1; - INT SkipLongGap: 2; // 0: don't skip, 1: skip, 3: run time - INT deflbl: 2; // 0: sname, 1: fname, 2: tally, 3: new - INT fastanno: 1; // 0: add annotation in fasta output - INT descrp: 1; // output description - INT sortodr: 2; // 0: input, 1: bytree, 2: shorter, 3: longer - INT spjinf: 1; - INT olrsum: 1; - INT ColorEij: 2; // color mark intron position - INT supself: 2; // suppress result of self comparison - INT noseqline: 1; // additional prefix line in m-fasta - INT asterisk: 1; // add asterisk as the eos mark - INT trimendgap: 1; // suppress tail gap characters - INT taxoncode: 3; // add taxinomic code field X in gnm2tab - INT printweight: 1; // output seq weights in MSA - INT gzipped: 1; // gzipped output for spaln - INT debug: 1; // print debug lines -}; - -extern OUTPRM OutPrm; extern DefSetup def_setup; extern SEQ_CODE* setSeqCode(Seq* sd, const int& molc); @@ -265,9 +235,9 @@ RANGE* setrange(const char* pa, int* ncr = 0) const; template char* readanno(file_t fd, char* str, SeqDb* db, Mfile& gapmfd); - void estimate_len(FILE* fd, const int& nos); + void estimate_len(FILE* fd, const int& nos, const SeqDb* dbf = 0); #if USE_ZLIB - void estimate_len(gzFile fd, const int& nos); + void estimate_len(gzFile fd, const int& nos, const SeqDb* dbf = 0); #endif void header_nat_aln(const int& n, const FTYPE& sumwt); CHAR* seq_realloc(); @@ -558,7 +528,7 @@ // infer input sequence format SeqDb* dbf = seq_NandL(nos, len, mode, str, fd, dm); if (nos) { - if (!len) estimate_len(fd, nos); + if (!len) estimate_len(fd, nos, dbf); refresh(nos, len); } setSeqCode(this, dm); @@ -598,7 +568,7 @@ if (dm < right) right = dm; } if (spath) { - if (!sname) sname = new Strlist(path2fn(spath)); + if (!sname) sname = new Strlist(path2fn(spath), ""); else if (sname->empty()) sname->assign(path2fn(spath)); } return attrseq(attr); diff -Nru spaln-2.4.7+dfsg/src/sortgrcd.cc spaln-2.4.9a+dfsg/src/sortgrcd.cc --- spaln-2.4.7+dfsg/src/sortgrcd.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/sortgrcd.cc 2022-05-12 01:54:43.000000000 +0000 @@ -67,11 +67,12 @@ static void usage() { - fputs("sortgrcd version 2.2: read binary grds and erds and sort them\n", stderr); + fputs("sortgrcd version 2.2.1: read binary .?rds and sort them\n", stderr); fputs("\tin the order of chromosomal location in each direction\n", stderr); fputs("Usage: sortgrcd [options] *.grd\n", stderr); fputs("Note: version 2 supports outputs from spaln 2.1.0 or later\n", stderr); fputs("Note: version 2.1 supports -O3, 6, 7, and 8 options\n", stderr); + fputs("Note: version 2.2 supports gzipped inputs\n", stderr); fputs("Options:\n", stderr); fputs("\t-CN:\tMinimum % of coverage (0-100)\n", stderr); fputs("\t-EN:\tReport only the best (N=1) or all (N=2) results per gene locus (1)\n", stderr); @@ -84,16 +85,17 @@ fputs("\t-ON:\tOutput format. 0:Gff3, 3:BED, 4:Native, 5:Intron, \n", stderr); fputs("\t\t\t6:cDNA, 7:translate, 8:CDS, 15:unique intron\n", stderr); fputs("\t-PN:\tMinimum Overall % identity (0-100)\n", stderr); + fputs("\t-Sa:\tsort chromosomes in the alphabetical order of identifier (default)\n", stderr); + fputs("\t-Sb:\tsort chromosomes in the order of abundance mapped on them\n", stderr); + fputs("\t-Sc:\tsort chromosomes in the order of apparence in the genome db\n", stderr); + fputs("\t-Sr:\tsort records mapped on minus strand in the reverse order of genomic positions\n", stderr); fputs("\t-UN:\tMaximum total number of unpaired bases in gaps\n", stderr); + fputs("\t-VN:\ttMaximum memory size used for core sort (16M)\n", stderr); + fputs("\t-gS:\tSpecify the .grp file name\n", stderr); fputs("\t-lN:\tNumber of residues per line for -O6 or -O7 (60)\n", stderr); fputs("\t-mN:\tMaximum allowed missmatches at both exon boundaries\n", stderr); fputs("\t-nN:\tallow non-canonical boundary? [0: no; 1: AT-AN; 2: 1bp mismatch; 3: any]\n", stderr); fputs("\t-uN:\tMaximum allowed unpaired bases in gaps at both exon boundaries\n", stderr); - fputs("\t-gS:\tSpecify the .grp file name\n", stderr); - fputs("\t-Sa:\tsort chromosomes in the alphabetical order of identifier (default)\n", stderr); - fputs("\t-Sb:\tsort chromosomes in the order of abundance mapped on them\n", stderr); - fputs("\t-Sc:\tsort chromosomes in the order of apparence in the genome db\n", stderr); - fputs("\t-Sr:\tsort records mapped on minus strand in the reverse order of genomic positions\n", stderr); exit(1); } @@ -997,8 +999,11 @@ fclose(gzfd); return (nr); } + char* dot = strrchr(str, '.'); + if (dot) *dot = '\0'; + strcat(str, "(.gz)"); #endif - fatal(errmsg, str); + fatal(not_found, str); return (0); } @@ -1134,7 +1139,7 @@ // Get options while (--argc && (++argv)[0][0] == OPTCHAR) { - const char* val = argv[0] + 2; +const char* val = argv[0] + 2; int flevel = 0; switch (argv[0][1]) { case 'F': diff -Nru spaln-2.4.7+dfsg/src/sortgrcd.h spaln-2.4.9a+dfsg/src/sortgrcd.h --- spaln-2.4.7+dfsg/src/sortgrcd.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/sortgrcd.h 2022-05-12 01:54:43.000000000 +0000 @@ -41,7 +41,7 @@ enum InOrder {INPUT_ODR, ALPHABETIC, ABUNDANCE}; static const int max_no_exon = 1024; -static const int MAXRECD = 1024 * 1024; +static const int MAXRECD = 16 * 1024 * 1024; static const char* RGB_RED = "255,0,0"; static const char* RGB_BLUE = "0,255,255"; diff -Nru spaln-2.4.7+dfsg/src/spaln.cc spaln-2.4.9a+dfsg/src/spaln.cc --- spaln-2.4.7+dfsg/src/spaln.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/spaln.cc 2022-05-12 01:54:43.000000000 +0000 @@ -144,8 +144,8 @@ static int q_mns = 3; static int no_seqs = 3; static bool gsquery = QRYvsDB == GvsA || QRYvsDB == GvsC; -static const char* version = "2.4.7"; -static const int date = 220128; +static const char* version = "2.4.9a"; +static const int date = 220512; static AlnOutModes outputs; static void usage(const char* messg) @@ -198,8 +198,11 @@ fputs("\t-l#\tNumber of characters per line in alignment (60)\n", stderr); fputs("\t-o$\tFile/directory/prefix where results are written (stdout)\n", stderr); fputs("\t-pa#\tRemove 3' poly A >= # (0: don't remove)\n", stderr); + fputs("\t-pn\tRetain existing output file\n", stderr); + fputs("\t-po\tOverwrite existing output file\n", stderr); fputs("\t-pw\tReport results even if the score is below the threshold\n", stderr); - fputs("\t-pq\tQuiet mode\n", stderr); + fputs("\t-pT\tExclude termination codon from CDS\n", stderr); + fputs("\t-r$\tReport information about block data file\n", stderr); fputs("\t-r$\tReport information about block data file\n", stderr); fputs("\t-u#\tGap-extension penalty (3)\n", stderr); fputs("\t-v#\tGap-open penalty (8)\n", stderr); @@ -237,10 +240,10 @@ DbsDt** dbs = dbs_dt; while (--argc > 0 && **++argv == OPTCHAR) { - const char* opt = argv[0] + 1; +const char* opt = argv[0] + 1; int c = *opt; if (!c) break; - const char* val = argv[0] + 2; +const char* val = argv[0] + 2; int k = 0; int rv = 0; switch (c) { @@ -397,6 +400,8 @@ case 'f': OutPrm.deflbl = 1; break; case 'i': OutPrm.ColorEij = 1; break; case 'j': OutPrm.spjinf = 1 - OutPrm.spjinf; break; + case 'n': OutPrm.overwrite = 2; break; + case 'o': OutPrm.overwrite = 1; break; case 'q': setprompt(0, 0); break; case 'Q': setprompt(0, 1); break; case 'r': algmode.mlt = 1; break; @@ -405,6 +410,7 @@ case 'x': OutPrm.supself = (val[1] == '2')? 2: 1; break; case 'J': case 'm': case 'u': case 'v': setexprm_z(argc, argv); break; + case 'T': OutPrm.supTcodon = 1; break; default: break; } break; @@ -453,7 +459,7 @@ #endif case 'T': if ((val = getarg(argc, argv))) - ftable.setpath(val, GNM2TAB); + ftable.setpath(val, gnm2tab); readargs(); break; case 'u': case 'v': case 'w': @@ -547,6 +553,7 @@ gene->inex.sens? '<': ' ', sqs[1]->sqname()); } for (int n = 0; n < n_out_modes; ++n) { + if (!fds[n]) continue; if (QRYvsDB == AvsG || QRYvsDB == GvsA) { // DNA vs protein if (out_mode[n] == ALN_FORM) { if (gene) { @@ -1479,11 +1486,12 @@ set_max_extend_gene_rng(def_max_extend_gene_rng); if (bprm->pwd->DvsP == 3) OutPrm.SkipLongGap = 0; if (algmode.nsa == SAM_FORM) put_genome_entries(); - outputs.setup(a->spath); + if (outputs.setup(a->spath)) { #if M_THREAD - if (thread_num) MasterWorker(seqs, &svr, (void*) bprm); else + if (thread_num) MasterWorker(seqs, &svr, (void*) bprm); else #endif - all_in_func(seqs, &svr, (void*) bprm); + all_in_func(seqs, &svr, (void*) bprm); + } delete bprm; } else { if (svr.nextseq(b, 1) == IS_END) { @@ -1498,11 +1506,12 @@ if (pwd->DvsP == 3) OutPrm.SkipLongGap = 0; if (a->inex.intr || b->inex.intr) makeStdSig53(); set_max_extend_gene_rng(0); - outputs.setup(a->spath); + if (outputs.setup(a->spath)) { #if M_THREAD - if (thread_num) MasterWorker(seqs, &svr, (void*) pwd); else + if (thread_num) MasterWorker(seqs, &svr, (void*) pwd); else #endif - all_in_func(seqs, &svr, (void*) pwd); + all_in_func(seqs, &svr, (void*) pwd); + } delete pwd; } diff -Nru spaln-2.4.7+dfsg/src/sqpr.cc spaln-2.4.9a+dfsg/src/sqpr.cc --- spaln-2.4.7+dfsg/src/sqpr.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/sqpr.cc 2022-05-12 01:54:43.000000000 +0000 @@ -52,8 +52,6 @@ static int checksum(int* checks, const GAPS* gaps, const Seq* sd); static void gcg_out(const GAPS** gaps, Seq* seqs[], int seqnum, FILE* fd); - //lpw blk Nout Ncolony eij ovl fnm rm trim lg lbl dsc odr spj olr color self -OUTPRM OutPrm = {60, 0, MAX_COLONY, 1, 4, 10, 5, 0, 1, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0}; static char ditto = _IBID; static Row_Mode prmode = Row_Last; static int prtrc = TRON; @@ -101,7 +99,7 @@ if (!(OutPrm.out_file && *OutPrm.out_file) && def_fn) OutPrm.out_file = def_fn; if (OutPrm.out_file && !is_dir(OutPrm.out_file)) { - if (!(out_fd = fopen(OutPrm.out_file, "w"))) + if (!(out_fd = wfopen(OutPrm.out_file, "w"))) fatal("Can't write to %s\n", OutPrm.out_file); } } @@ -827,6 +825,7 @@ if (gene->exin && gene->exin->fact) scale *= gene->exin->fact; float fscr = scr / alprm.scale; VTYPE iscr = 0; +const int bbt = qry->isprotein()? 3: 1; const EISCR* fst = eijnc->begin(); const EISCR* wkr = fst; const EISCR* prv = wkr; @@ -861,28 +860,28 @@ #if USE_ZLIB if (OutPrm.gzipped) { strcat(str, gz_ext); - if (!(gzfg = gzopen(str, "wb"))) fatal(efmt, str, 0); + if (!(gzfg = wgzopen(str, "wb"))) fatal(efmt, str, 0); } else #endif - if (!(fg = fopen(str, "wb"))) fatal(efmt, str, 0); + if (!(fg = wfopen(str, "wb"))) fatal(efmt, str, 0); strcpy(bdy, erext); #if USE_ZLIB if (OutPrm.gzipped) { strcat(str, gz_ext); - if (!(gzfe = gzopen(str, "wb"))) fatal(efmt, str, 0); + if (!(gzfe = wgzopen(str, "wb"))) fatal(efmt, str, 0); } else #endif - if (!(fe = fopen(str, "wb"))) fatal(efmt, str, 0); + if (!(fe = wfopen(str, "wb"))) fatal(efmt, str, 0); strcpy(bdy, qrext); #if USE_ZLIB if (OutPrm.gzipped) { strcat(str, gz_ext); - if (!(gzfq = gzopen(str, "wb"))) fatal(efmt, str, 0); + if (!(gzfq = wgzopen(str, "wb"))) fatal(efmt, str, 0); if ((fputs(dbs_dt[0]->dbsid, gzfq) == EOF) || (fputc('\0', gzfq) == EOF)) fatal(efmt, qrext, gr.Nrecord); } else #endif - if (!(fq = fopen(str, "wb"))) fatal(efmt, str, 0); + if (!(fq = wfopen(str, "wb"))) fatal(efmt, str, 0); if (fq && ((fputs(dbs_dt[0]->dbsid, fq) == EOF) || (fputc('\0', fq) == EOF))) fatal(efmt, qrext, gr.Nrecord); ++qryidx; // o-th q-recode contains the database name @@ -894,7 +893,7 @@ } while (neoeij(wkr)) { if (wkr->iscr > NEVSEL) { // normal - if (qry->isdrna() && gr.nexn) { + if (gr.nexn) { gr.bmmc += prv->mmc3 + wkr->mmc5; gr.bunp += prv->unp3 + wkr->unp5; } @@ -912,7 +911,8 @@ er.Gleft = gene->SiteNo(skp->left); er.Gright = gene->SiteNo(wkr->right - 1); er.Bmmc = prv->mmc3 + wkr->mmc5; - er.Bunp = prv->unp3 + wkr->unp5; + if (prv->unp3 % bbt || wkr->unp5 % bbt) er.Bunp = 9; + else er.Bunp = (prv->unp3 + wkr->unp5) / bbt; er.Escore = (float) wkr->escr / scale; er.Iscore = (float) iscr / scale; er.Sig3 = (float) wkr->sig3 / scale; @@ -951,7 +951,7 @@ er.miss = wkr->left - skp->right; } } - if (qry->isprotein()) cds /= 3; + cds /= bbt; er.Ilen = qry->right - qry->left; gr.Gstart = gene->SiteNo(fst->left); gr.Gend = gene->SiteNo(prv->right - 1); @@ -1341,10 +1341,11 @@ delete[] arg; } -void AlnOutModes::setup(const char* spath) +int AlnOutModes::setup(const char* spath) { if (!n_out_modes) ++n_out_modes; fds = new FILE*[n_out_modes]; // directory + vclear(fds, n_out_modes); const char* sl = strrchr(spath, '/'); if (sl) ++sl; else sl = spath; @@ -1372,26 +1373,33 @@ if (OutPrm.out_file) { if (isdir) { OutPrm.out_file = 0; - if (algmode.nsa == BIN_FORM) return; + if (algmode.nsa == BIN_FORM) { + fds[0] = stdout; + return (1); + } sprintf(fn, ".O%d", algmode.nsa); + } else if (algmode.nsa == BIN_FORM) { + fds[0] = stdout; + return (1); } - fds[0] = fopen(str, "w"); - if (!fds[0]) fatal(write_error, str); + fds[0] = wfopen(str, "w"); + if (!fds[0]) return (0); } else fds[0] = stdout; - return; + return (1); } OutPrm.out_file = 0; - for (int i = 0, n = 0; i < N_Out_Modes; ++i) { + int n = 0; + for (int i = 0; i < N_Out_Modes; ++i) { if (out_mode[i] < 0) continue; if (out_mode[i] == BIN_FORM) fds[n] = 0; else { sprintf(fn, ".O%d", i); - fds[n] = fopen(str, "w"); - if (!fds[n]) fatal(write_error, str); + fds[n] = wfopen(str, "w"); } - out_mode[n++] = out_mode[i]; + if (fds[n]) out_mode[n++] = out_mode[i]; } + return (n); } enum AaProp {GAP, SML, PLS, MNS, HPO, ARO = 6, NEU, PHI, PHO}; diff -Nru spaln-2.4.7+dfsg/src/utilseq.cc spaln-2.4.9a+dfsg/src/utilseq.cc --- spaln-2.4.7+dfsg/src/utilseq.cc 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/utilseq.cc 2022-05-12 01:54:43.000000000 +0000 @@ -19,7 +19,8 @@ * Copyright(c) Osamu Gotoh <> *****************************************************************************/ -#include "aln.h" +#include "seq.h" +#include "eijunc.h" enum Triplet {AAA = 0, AGA = 8, AGG = 10, AUA = 12, CUA = 28, CUC, CUG, CUU, @@ -112,7 +113,6 @@ static CHAR* de_codon(CHAR* ncs, int n); static int lcomp(ORF* a, ORF* b); static int evenusage(); -static int kmer(const CHAR* s, int m, const CHAR* elements, int k, int* x = 0); static int curcode = 0; static int ChangeCodon = 1; @@ -346,7 +346,9 @@ return (b->len - a->len); } -/* DFA to find ORF +/************************************* + +* DFA to find ORF fromic == 0: Term< --- >Term fromic == 1: Term @@ -355,26 +357,26 @@ fromic == 4: ATG - AG< --- GT - >Term fromic == 5: Term -*/ +*************************************/ ORF* Seq::getorf() const { static const int tm[14][4] = { -/* A C G T */ - {1, 0, 2, 3}, /* 0: C */ - {1, 0, 4, 5}, /* 1: A */ - {1, 6, 2, 7}, /* 2: G */ - {8, 0, 9, 3}, /* 3: T */ - {1, 6, 2, 7}, /* 4: AG */ - {8, 0,10, 3}, /* 5: AT */ - {1, 0, 2, 3}, /* 6: GC */ - {8, 0, 9, 3}, /* 7: GT */ - {11,0,12, 5}, /* 8: TA */ - {13,6, 2, 7}, /* 9: TG */ - {13,6, 2, 7}, /* 10: ATG */ - {1, 0, 4, 5}, /* 11: TAA */ - {1, 6, 2, 7}, /* 12: TAG */ - {1, 0, 4, 5}}; /* 13: TGA */ +// A C G T + {1, 0, 2, 3}, // 0: C + {1, 0, 4, 5}, // 1: A + {1, 6, 2, 7}, // 2: G + {8, 0, 9, 3}, // 3: T + {1, 6, 2, 7}, // 4: AG + {8, 0,10, 3}, // 5: AT + {1, 0, 2, 3}, // 6: GC + {8, 0, 9, 3}, // 7: GT + {11,0,12, 5}, // 8: TA + {13,6, 2, 7}, // 9: TG + {13,6, 2, 7}, // 10: ATG + {1, 0, 4, 5}, // 11: TAA + {1, 6, 2, 7}, // 12: TAG + {1, 0, 4, 5}}; // 13: TGA ORF cur[3]; ORF* orfs; @@ -382,7 +384,7 @@ int n = left; for (f = 0; f < 3; ++f) { cur[f].pos = n; - cur[f].frm = fromic? 0: 1; /* close/open */ + cur[f].frm = fromic? 0: 1; // close/open cur[f].len = 0; } const CHAR* ts = at(right); @@ -394,8 +396,8 @@ if (c < 4) { state = tm[state][c]; switch (state) { - case 4: /* AG */ - if (fromic > 1) { /* exon start */ + case 4: // AG + if (fromic > 1) { // exon start for (f = 0; f < 3; ++f) { if (!cur[f].frm || (fromic % 2 == 0 && cur[f].frm == 1)) { cur[f].pos = n + 1; @@ -404,30 +406,30 @@ } } break; - case 6: /* GC */ - case 7: /* GT */ - if (fromic > 1) { /* exon end */ + case 6: // GC + case 7: // GT + if (fromic > 1) { // exon end for (f = 0; f < 3; ++f) { if (cur[f].frm) { cur[f].len = n - 1 - cur[f].pos; if (fromic == 2 || fromic == 3) - cur[f].frm = 4; /* mark */ + cur[f].frm = 4; // mark } } } break; - case 10: /* ATG */ + case 10: // ATG f = (n - 2) % 3; if (!cur[f].frm) { cur[f].pos = n - 2; cur[f].frm = 1; } break; - case 11: /* TAA */ + case 11: // TAA if (gencode[UAA] != TRM) break; - case 12: /* TAG */ + case 12: // TAG if (gencode[UAG] != TRM) break; - case 13: /* TGA */ + case 13: // TGA if (gencode[UGA] != TRM2) break; f = (n - 2) % 3; if (cur[f].frm == 1 || cur[f].frm == 2) @@ -439,7 +441,7 @@ cur[f].pos = n + 1; cur[f].frm = fromic? 0: 1; cur[f].len = 0; - if (state == 12 && fromic > 1) { /* AG */ + if (state == 12 && fromic > 1) { // AG for (f = 0; f < 3; ++f) { if (!cur[f].frm || (fromic % 2 == 0 && cur[f].frm == 1)) { cur[f].pos = n + 1; @@ -450,7 +452,7 @@ break; default: break; } - } else { /* ambiguous base */ + } else { // ambiguous base for (f = 0; f < 3; ++f) { cur[f].pos = n + 1; cur[f].frm = fromic? 0: 1; @@ -706,7 +708,7 @@ sscanf(str, "%*s %*s %*s %f", CodonUsage + i++); if (i == 64) return (OK); } while (fgets(str, LINE_MAX, fd)); -/* readerr */ +// readerr fputs(errmsg, stderr); fprintf(stderr, "%s was incomplete!\n", fname); for (i = 0; i < 64; ++i) CodonUsage[i] = Human50CDU[i]; @@ -736,8 +738,6 @@ int t = 0; int skip = 0; char str[MAXL]; - double maxval = -FLT_MAX; - double minval = FLT_MAX; vclear(&mmm); do { @@ -749,83 +749,92 @@ rows <= 0 || cols <= 0) return; tonic = mmm.min; if (-tonic > maxtonic) tonic = -maxtonic; - mtx = new double[rows * cols]; + mtx = new float[rows * cols]; while (skip-- > 0) { int rc; while ((rc = fgetc(fd)) != EOF && rc != '\n') ; } - double* wk = mtx; + float* wk = mtx; for (int rc = 0; rc < rows * cols; ++rc, ++wk) { - if (fscanf(fd, "%lf", wk) <= 0) { + if (fscanf(fd, "%f", wk) <= 0) { fputs("Insufficient data!\n", stderr); delete[] mtx; mtx = 0; return; } - if (*wk > maxval) {maxval = *wk; maxidx = rc;} - if (*wk < minval) {minval = *wk; minidx = rc;} + if (*wk < min_elem) min_elem = *wk; } - if (t) swap(rows, cols); + if (t) std::swap(rows, cols); if (rows % 23 == 0) nalpha = 23; else if (rows % 4 == 0) nalpha = 4; else nalpha = rows; + morder = 0; + for (int d = nalpha; d < rows; d = d * (d + 1)) + ++morder; while ((skip = getc(fd)) != EOF && skip != '\n'); } +void PatMat::readBinPatMat(FILE* fd) +{ + if (fread(this, sizeof(PatMat), 1, fd) != 1) + fatal("incompatible !\n"); + if (transvers) std::swap(rows, cols); + tonic = mmm.min; + if (-tonic > maxtonic) tonic = -maxtonic; + INT dszie = rows * cols; + mtx = new float[dszie]; + if (fread(mtx, sizeof(float), dszie, fd) != dszie) + fatal("incompatible !\n"); +} + PatMat& PatMat::operator=(const PatMat& src) { rows = src.rows; cols = src.cols; offset = src.offset; nalpha = src.nalpha; - maxidx = src.maxidx; minidx = src.minidx; nsupport = src.nsupport; + nsupport = src.nsupport; tonic = src.tonic; mmm = src.mmm; int mtxsize = rows * cols; if (mtxsize != (src.rows * src.cols)) { delete[] mtx; - mtx = new double[mtxsize]; + mtx = new float[mtxsize]; } vcopy(mtx, src.mtx, rows * cols); return (*this); } PatMat::PatMat(const PatMat& src) : - maxidx(src.maxidx), minidx(src.minidx), nsupport(src.nsupport), - nalpha(src.nalpha), rows(src.rows), cols(src.cols), offset(src.offset), - tonic(src.tonic), mmm(src.mmm) + rows(src.rows), cols(src.cols), offset(src.offset), + tonic(src.tonic), min_elem(src.min_elem), + nsupport(src.nsupport), nalpha(src.nalpha), morder(src.morder), + mmm(src.mmm) { int mtxsize = rows * cols; - mtx = new double[mtxsize]; + mtx = new float[mtxsize]; vcopy(mtx, src.mtx, mtxsize); } PatMat::PatMat(const int r, const int c, const int o, float* m) - : maxidx(0), minidx(0), nsupport(0), - rows(r), cols(c), offset(o), tonic(0), mtx(0) + : rows(r), cols(c), offset(o) { if (r <= 0 || c <= 0) return; mmm.min = mmm.mean = mmm.max = 0; - double maxval = -FLT_MAX; - double minval = FLT_MAX; - mtx = new double[r * c]; - if (!m) return; - for (int i = 0; i < r * c; ++i) { - mtx[i] = m[i]; - if (mtx[i] > maxval) {maxval = mtx[i]; maxidx = i;} - if (mtx[i] < minval) {minval = mtx[i]; minidx = i;} - } if (rows % 4 == 0) nalpha = 4; else if (rows == 23) nalpha = 23; - else nalpha = cols; + else nalpha = rows; + for (int d = nalpha; d < rows; d = d * (d + 1)) + ++morder; + mtx = new float[r * c]; + if (!m) return; + vcopy(mtx, m, r * c); + min_elem = *vmin(mtx, r * c); } -PatMat::PatMat(FILE* fd) : - maxidx(0), minidx(0), nsupport(0), - rows(0), cols(0), offset(0), tonic(0), mtx(0) +PatMat::PatMat(FILE* fd, bool binary) { - readPatMat(fd); + if (binary) readBinPatMat(fd); + else readPatMat(fd); } -PatMat::PatMat(const char* fname) : - maxidx(0), minidx(0), nsupport(0), - rows(0), cols(0), offset(0), tonic(0), mtx(0) +PatMat::PatMat(const char* fname) { char str[LINE_MAX]; FILE* fd = 0; @@ -836,10 +845,14 @@ } else progets(str, "pattern file name? "); if (!*str || *str == '\n') return; - fd = ftable.fopen(str, "r"); - if (fd) readPatMat(fd); - else goto retry; - fclose(fd); +const char* dot = strrchr(str, '.'); +const bool binary = dot && !strcmp(dot, patmat_ext); + fd = ftable.fopen(str, "rb"); + if (fd) { + if (binary) readBinPatMat(fd); + else readPatMat(fd); + } else goto retry; + if (fd) fclose(fd); } CHAR* PatMat::setredctab(const Seq* sd) const @@ -860,7 +873,7 @@ { const CHAR* ps = sd->at(pos); const CHAR* ts = sd->at(pos + cols); - double* ptn = mtx; + float* ptn = mtx; for ( ; ps < ts; ++ps, ptn += rows) { int k = redctab? redctab[*ps]: (*ps - sd->code->base_code); if (k >= 0 && k < rows) ++ptn[k]; @@ -869,9 +882,9 @@ float PatMat::pwm_score(const Seq* sd, const CHAR* ps, const CHAR* redctab) const { -const double* ptn = mtx; +const float* ptn = mtx; const CHAR* ts = min((const CHAR*) sd->at(sd->right), ps + cols); - double fit = 0; + float fit = 0; for ( ; ps < ts; ++ps, ptn += rows) { int k = redctab? redctab[*ps]: (*ps - sd->code->base_code); if (k >= 0 && k < rows) fit += ptn[k]; @@ -882,16 +895,13 @@ float* PatMat::calcPatMat(const Seq* sd) const { int k = sd->right - sd->left; - int Mrkv = 0; + int Mrkv = order(); const CHAR* redctab = setredctab(sd); - if (rows == 20) Mrkv = 1; // 1st-order Markov/ - else if (rows == 84) Mrkv = 2; // 2nd-order Markov/ - else if (rows == 67) Mrkv = 3; // Should be MODIFIED !!! float* result = new float[k]; float* rest = result; float* last = result + k; - double minval = cols * mtx[minidx]; + float minval = cols * min_elem; const CHAR* aa = sd->at(0); // left limit const CHAR* zz = sd->at(sd->len - Mrkv); // right limit int n = sd->left - offset; // start point @@ -900,8 +910,8 @@ const CHAR* ss = sd->at(n); const CHAR* tt = sd->at(n + cols); if (tt > zz) tt = zz; - double fit = 0; -const double* ptn = mtx; + float fit = 0; +const float* ptn = mtx; if (n < 0) {ptn -= n * rows; ss = aa;} int q = n + cols >= sd->len; // number of bad chars for (int m = 0; ss < tt; ptn += rows, ++m) { @@ -916,8 +926,8 @@ if (j < 0 || j >= nalpha) ++q; k = nalpha * k + j + nalpha; } - double tabval = q? 0: ptn[k]; -// double tabval = q? minval: ptn[k]; + float tabval = q? 0: ptn[k]; +// float tabval = q? minval: ptn[k]; fit += tabval; } } @@ -928,8 +938,8 @@ const CHAR* ss = sd->at(n); const CHAR* tt = sd->at(n + cols); if (tt > zz) tt = zz; - double fit = 0; -const double* ptn = mtx; + float fit = 0; +const float* ptn = mtx; if (n < 0) {ptn -= n * rows; ss = aa;} int q = n + cols >= sd->len; // number of bad chars for (int m = 0; ss < tt; ptn += rows, ++m) { @@ -983,12 +993,12 @@ const CHAR* ss = sd->at(n); const CHAR* tt = sd->at(n + cols); if (tt > zz) tt = zz; -const double* ptn = mtx; +const float* ptn = mtx; if (n < 0) {ptn -= n * rows; ss = aa;} const CHAR* sss = ss; int flag = 0; - double fit = 0; + float fit = 0; for (int m = 0; ss < tt; ptn += rows, ++m) { const CHAR* rr = ss + sd->many; for ( ; ss < rr; ++ss) { @@ -1005,7 +1015,7 @@ fit += ptn[16 * k + nalpha * i + j + 3]; } if (flag == 1) { - fit = 3 * ptn[minidx]; + fit = 3 * min_elem; break; } } @@ -1015,296 +1025,405 @@ return (result); } -// read coding potential from file - -CodePot::CodePot() +int fname2exin(const char* fname, int& file_type) { - char str[MAXL+1]; - int d = 0; - int trows = TSIMD * TSIMD; - double buf[CP_NTERM]; - FILE* fd = ftable.fopen(CODEPOT, "r"); +const int ng = static_cast(Iefp::NG); +const int write_mode = file_type; + char str[MAXL]; + strcpy(str, fname); + file_type = 0; + char* dot = strrchr(str, '.'); + if (dot) { + if (!strcmp(dot, text_ext)) { + *dot = '\0'; + file_type = 1; // text + dot = strrchr(str, '.'); + } + for (int exn = 0; exn < ng; ++exn) { + if (!strcmp(dot, iefp_ext[exn])) { + if (!file_type) file_type = 2; + return (exn); // binary + } + } + if (write_mode) { + fputs("Error: extension must be one of:\n\t", stderr); + for (int exn = 0; exn < ng; ++exn) + fprintf(stderr, "%s ", iefp_ext[exn]); + fputc('\n', stderr); + exit (1); + } + } + file_type = 0; + FILE* fd = ftable.fopen(fname, "r"); if (!fd) { - fatal("CodePotTab can't open!\n", stderr); - return; + prompt(not_found, fname); + return (ng); } - if (!fgets(str, MAXL, fd)) goto fail_to_read; - if (!wordcmp(str, "CodePotTab")) { /* header line */ - sscanf(str, "%*s %d %d", &CodePotClmn, &d); - if (!fgets(str, MAXL, fd)) goto fail_to_read; - if (d % 23) { - CodePotType = DNA; - trows = d; - } else - CodePotType = TRON; - } - if (CodePotClmn > CP_NTERM) CodePotClmn = CP_NTERM; - CodePotBuf = new FTYPE[CodePotClmn * trows]; - d = 0; - for (int i = 0; i < CodePotClmn; d += trows) - CodePotTab[i++] = CodePotBuf + d; - d = 0; - do { - char* ps = str; - if (isalpha(*ps)) ps = cdr(str); - for (int i = 0; ps && i < CodePotClmn; ps = cdr(ps)) - buf[i++] = atof(ps); - if (CodePotType == TRON && isalpha(*str) && isalpha(str[1])) { - d = TSIMD * trccode[*str - 'A'] + trccode[str[1] - 'A']; - for (int i = 0; i < CodePotClmn; ++i) CodePotTab[i][d] = buf[i]; - } else { - for (int i = 0; i < CodePotClmn; ++i) CodePotTab[i][d] = buf[i]; - ++d; + if (!fgets(str, MAXL, fd)) return (ng); + int exn = 0; + for ( ; exn < static_cast(Iefp::NG); ++exn) { + if (!wordcmp(str, iefp_tid[exn])) { + file_type = 1; + break; // text } - } while (fgets(str, MAXL, fd)); + } + fclose(fd); + return (exn); +} + +bool ExinPot::readFile(const char* fname) +{ + int file_type = 0; + exin = fname2exin(fname, file_type); + if (file_type == 2) // binary + return readBinary(fname); + else if (file_type == 0) { + prompt(incompatible, fname); + return (false); + } + FILE* fd = ftable.fopen(fname, "r"); + if (!fd) { + prompt(not_found, fname); + return false; + } + char str[MAXL]; + int sz = 1; + float* pot = 0; + if (!fgets(str, MAXL, fd)) goto fail_to_read; + if (sscanf(str, "%*s %d %d %*f %f %*f %*d %d %d %f", + &nphase, &ndata, &avpot, &lm, &rm, &avlen) < 1) goto fail_to_read; + nphase = (nphase >= 3)? 3: 1; + avlen -= (lm + rm); + pot = data = new float[dsize()]; + for (morder = -1; sz < ndata; sz *= CP_NTERM) ++morder; + if (sz != ndata) goto fail_to_read; + while (fgets(str, MAXL, fd)) { + int i = 0; + char* ps = isalpha(*str)? cdr(str): str; + for ( ; i++ < 3 && ps; ps = cdr(ps)) + *pot++ = atof(ps); + if (i < 3) break; + } fclose(fd); - return; + if ((pot - data) == dsize()) return true; fail_to_read: - delete[] CodePotBuf; CodePotBuf = 0; - fputs("CodePotTab can't be read in !\n", stderr); fclose(fd); + prompt(incompatible, fname); + return false; } -static int kmer(const CHAR* s, int m, const CHAR* elements, int k, int* x) +// from wdfq file specific to nphase == 1 + +float* ExinPot::getKmers(const char* wdfq, const bool foregrd) { - int c = elements[*s]; - int d = c; - if (c >= 4) { - d = 0; - if (x) *x = 0; + FILE* fd = fopen(wdfq, "r"); + if (!fd) { + prompt(not_found, wdfq); + return (0); + } +const int plus = (foregrd && isfpp())? ndata: 0; + float* frq = new float[ndata + plus]; + float* fq = frq + plus; + if (foregrd) data = frq; + char str[MAXL]; + char kmer[16]; + vclear(fq, ndata); + int n = 0; + while (fgets(str, MAXL, fd)) { + int freq; + if (sscanf(str, "%s %d", kmer, &freq) < 2) break; + int k = strlen(kmer); + if (--k > morder) break; + if (k < morder) continue; + *fq++ = freq; + if (n++ == ndata) break; } -const CHAR* t = s + k * m; + fclose(fd); + return (frq); +} - while ((s += m) < t) { - c = elements[*s]; +void ExinPot::count_kmers_1(const Seq* sd, float* fq) +{ +const CHAR* ss = sd->at(sd->left + lm); +const CHAR* tt = sd->at(sd->right - rm - morder); +const CHAR* redctab = sd->istron()? tnredctab: ncredctab; + + int x = morder + 1; + int w = 0; + + while (ss < tt) { + int c = redctab[*ss++]; if (c < 4) { - d = CP_NTERM * d + c; - ++*x; + if (x) --x; + w = (4 * w + c) % ndata; } else { - d = 0; - if (x) *x = 0; + w = 0; + x = morder + 1; } + if (!x) ++fq[w]; } - return (d); } -/* Calculate coding potential from DNA seq based on the 5th-order Markov model */ - -float* CodePot::calc5MMCodePot(const Seq* sd, int phase) const +void ExinPot::count_kmers_3(const Seq* sd, float* fq) { - CHAR* redctab = sd->inex.molc == TRON? tnredctab: ncredctab; - CHAR* elements = sd->inex.molc == TRON? trelements: ncelements; - int k = sd->right - sd->left; - FTYPE prf; const CHAR* ss = sd->at(sd->left); -const CHAR* tt = sd->at(sd->right); +const CHAR* tt = sd->at(sd->right - 5); +const CHAR* redctab = sd->istron()? tnredctab: ncredctab; - float* result = new float[k]; - float* rest = result; - int x = 0; - if (sd->left < 0) *rest++ = 0; /* fill pat */ - else ss -= sd->many; - if (sd->many == 1) { /* in most cases */ - int d = kmer(ss, 1, redctab, 6, &x); /* dummy when phase != 0 */ - int dm = d / CP_NTERM; - ss += 6; - while (ss < tt) { - int c = redctab[*ss++]; - FTYPE val = 0; - if (phase == 0) { - if (c < 4) { - val = CodePotTab[2][dm] + CodePotTab[0][d]; /* -1 + 0 */ - d = (CP_NTERM * (dm = d) + c) % 4096; - ++x; - } else d = dm = x = 0; - if (x >= 6) val += CodePotTab[1][d]; /* +1 */ - else val = 0; - } else { - if (x >= 6) { - if (phase <= CodePotClmn) { - val = CodePotTab[phase - 1][d]; - } else if (phase == CP_NTERM + 1 && CodePotClmn >= CP_NTERM) { - val = CodePotTab[0][d] - CodePotTab[1][d] - - CodePotTab[2][d] - CodePotTab[3][d]; - } else { - val = CodePotTab[0][d] - CodePotTab[1][d] - CodePotTab[2][d]; - } - } - if (c < 4) { - d = (CP_NTERM * d + c) % 4096; - ++x; - } else d = x = 0; - } - *rest++ = val; + int x = 6; + int w = 0; + + for (int p = 1; ss < tt; p = next_p[p]) { + int c = redctab[*ss++]; + if (c < 4) { + if (x) --x; + w = (4 * w + c) % ndata; + } else { + w = 0; + x = 6; } - } else { - tt -= 6 * sd->many; - while (ss < tt) { - FTYPE val = 0; - for (int i = 0; i++ < sd->many; ++ss) { - int d = kmer(ss, sd->many, elements, 6); - if (phase == 0) { - prf = CodePotTab[0][d]; - d = (ss < sd->at(0) - sd->many)? d / CP_NTERM: - kmer(ss - sd->many, sd->many, elements, 6); - prf += CodePotTab[2][d]; - d = kmer(ss + sd->many, sd->many, elements, 6); - prf += CodePotTab[1][d]; - } else if (phase <= CodePotClmn) { - prf = CodePotTab[phase - 1][d]; - } else if (phase == CP_NTERM + 1 && CodePotClmn >= CP_NTERM) { - prf = CodePotTab[0][d] - CodePotTab[1][d] - - CodePotTab[2][d] - CodePotTab[3][d]; - } else { - prf = CodePotTab[0][d] - CodePotTab[1][d] - CodePotTab[2][d]; + if (!x) ++fq[3 * w + p]; + } +} + +float* ExinPot::getKmers(EiJuncSeq* eijseq) +{ +const int vsize = ndata * nphase; +const int plus = isfpp()? vsize: 0; + data = new float[vsize + plus]; + float* fq = data + plus; + vclear(fq, vsize); + do { + Seq* sd = eijseq->nextseq(); + if (!sd) continue; + if (nphase == 1) count_kmers_1(sd, fq); + else count_kmers_3(sd, fq); + } while (eijseq->goahead()); + return (data); +} + +float* ExinPot::getKmers(int argc, const char** argv) +{ +const int vsize = ndata * nphase; +const int plus = isfpp()? vsize: 0; + data = new float[vsize + plus]; + float* fq = data + plus; + vclear(fq, vsize); + Seq sd(1); + SeqServer sqsvr(argc, argv, IM_SNGL); + InSt inst; + while ((inst = sqsvr.nextseq(&sd, 0)) != IS_END) { + if (inst == IS_ERR) continue; + if (nphase == 1) count_kmers_1(&sd, fq); + else count_kmers_3(&sd, fq); + } + return (data); +} + +void ExinPot::reform_1(float* bkg) +{ + int i = 0; + float s = 0; + float* frq = bkg? bkg: fbegin(); + float* fre = frq + ndata; + float* pot = bkg? bkg: begin(); +const bool cpb = ispot(); // conditional probability +const bool fpp = bkg? false: isfpp(); + while (frq < fre) { + if (cpb) { + s += ++frq[i++]; // psuedo count 1 + if (i == 4) { + for (int j = 0; j++ < 4; ++frq) { + *pot++ = *frq / s; + if (fpp) *frq /= total; } - val += prf; + s = i = 0; } - *rest++ = val; - } + } else *frq++ /= total; } - while (rest < result + k) *rest++ = 0.; - return (result); } -/* Calculate coding potential from DiTRON (1st-order Markov) model */ +void ExinPot::reform_3() +{ + int i = 0; + float s[3]; + vclear(s, 3); + float* frq = fbegin(); + float* fre = fend(); + float* pot = begin(); +const bool cpb = ispot(); // conditional probability +const bool fpp = isfpp(); + for (int p = 1; frq < fre; p = next_p[p]) { + if (cpb) { + s[p] += frq[i++] + 1; // psuedo count 1 + if (i == 12) { + for (int j = 0, q = 1; j++ < 12; ++frq, q = next_p[q]) { + *pot++ = (*frq + 1)/ s[q]; + if (fpp) *frq /= total; + } + i = 0; + vclear(s, 3); + } + } else *frq++ /= total; + } +} -float* CodePot::calcDitCodePot(Seq* sd, int phase) const +void ExinPot::reform(float* bkg) { - int i = sd->right; - int k = sd->right - sd->left; - int d = sd->len - 3; - int n2t = 0; -const CHAR* ss = sd->at(sd->left); -const CHAR* uu = ss + 3 * sd->many; -const CHAR* tt = sd->at(min(i, d)); + total = 0; +const float* fw = bkg? bkg: fbegin(); +const float* fe = bkg? (fw + ndata): fend(); + while (fw < fe) total += *fw++; + if (bkg || nphase == 1) reform_1(bkg); + else reform_3(); +} - if (sd->isdrna()) {sd->nuc2tron(); n2t = 1;} - if (sd->inex.molc != TRON) { - prompt("%s must be DNA or TRON code!\n", sd->sqname()); - return (0); - } - float* result = new float[k]; - float* rest = result; - if (phase == 0 && sd->left < 0) { - *rest++ = 0; - ss += sd->many; - uu += sd->many; - } - FTYPE val = 0; - while (ss < tt) { - for (int i = 0; i++ < sd->many; ++ss, ++uu) { - FTYPE prf; - d = TSIMD * *ss + *uu; - if (phase == 0) { - prf = CodePotTab[0][d]; - d = TSIMD * ss[-sd->many] + uu[-sd->many]; - prf += CodePotTab[2][d]; - d = TSIMD * ss[sd->many] + uu[sd->many]; - prf += CodePotTab[1][d]; - } else if (phase <= CP_NTERM) { - prf = CodePotTab[phase - 1][d]; - } else if (phase == CP_NTERM + 1) { - prf = CodePotTab[0][d] - CodePotTab[1][d] - - CodePotTab[2][d] - CodePotTab[3][d]; - } else { - prf = CodePotTab[0][d] - CodePotTab[1][d] - CodePotTab[2][d]; - } - val += prf; +bool ExinPot::makeExinPot(const float* bkg) +{ + if (!bkg) return (false); + float* pot = begin(); + float* frq = fbegin(); + float* fed = fend(); + int p = 0; + while (frq < fed) { +const float freq = *frq++; + *pot = log10(*pot / *bkg); + if (isfpp()) ess += freq * *pot++; + if (++p == nphase) { + ++bkg; + p = 0; } - *rest++ = val; } - for (tt = sd->at(sd->right); ss < tt; ss += sd->many) - *rest++ = 0.; - if (n2t) sd->tron2nuc(0); - return (result); + if (isfpp()) ess *= 1000; // mean score per 1000 bp + return (true); } -float* CodePot::calcPrefCodePot(Seq* sd, int phase) const +bool ExinPot::readBinary(const char* fname) { - if (!CodePotBuf) return (0); - switch (CodePotType) { - case DNA: return calc5MMCodePot(sd, phase); - case TRON: return calcDitCodePot(sd, phase); - default: return (0); + FILE* fd = fopen(fname, "rb"); + if (!fd) { + prompt(not_found, fname); + return (false); } + if (fread(this, sizeof(ExinPot), 1, fd) != 1) { + prompt(incompatible, fname); + return (false); + } + data = new float[dsize()]; + if (fread(data, sizeof(float), dsize(), fd) != size_t(dsize())) { + prompt(incompatible, fname); + return (false); + } + fclose(fd); + return (true); } -bool ExinPot::readExinPot(const char* fname) +bool ExinPot::writeBinary(const char* oname) { - FILE* fd = ftable.fopen(fname, "r"); + int file_type = 0; + int exn = fname2exin(oname, file_type); + if (file_type == 0) exn = exin; + else std::swap(exn, exin); + char str[MAXL]; +const char* fn = file_type? oname: add_ext(oname, iefp_ext[exin], str); + FILE* fd = fopen(fn, "wb"); if (!fd) { - prompt("%s can't open!\n", fname); - return false; + prompt(no_file, str); + return (false); } - char str[MAXL+1]; - int exin = -1; - int sz = 1; - FTYPE* pot = 0; - if (!fgets(str, MAXL, fd)) goto fail_to_read; - if (!wordcmp(str, EXONPOT)) exin = 1; /* header line */ - else if (!wordcmp(str, INTRONPOT)) exin = 0; - else goto fail_to_read; - if (sscanf(str, "%*s %*d %d %*f %f %*f %*d %d %d %f", - &size, &avpot, &lm, &rm, &avlen) < 1) goto fail_to_read; - avlen -= (lm + rm); - pot = new FTYPE[size]; - for (morder = 0; sz < size; sz *= CP_NTERM) ++morder; - if (sz != size) goto fail_to_read; - if (exin) ExonPot = pot; - else IntronPot = pot; - while (fgets(str, MAXL, fd)) { - char* ps = str; - if (isalpha(*ps)) ps = cdr(str); - *pot++ = atof(ps); + if (fwrite(this, sizeof(ExinPot), 1, fd) != 1) { + prompt(no_file, str); + return (false); + } + float* pot = begin(); + if (fwrite(pot, sizeof(float), dsize(), fd) != size_t(dsize())) { + prompt(no_file, str); + return (false); } fclose(fd); - return true; -fail_to_read: - delete[] ExonPot; ExonPot = 0; - delete[] IntronPot; IntronPot = 0; - prompt("%s has wrong exinpot format !\n", fname); - fclose(fd); - return false; + std::swap(exn, exin); + return (true); } -ExinPot::ExinPot(int zZ, const char* fname) - : morder(0), size(0), lm(0), rm(0), - avpot(0), avlen(0), ExonPot(0), IntronPot(0) +// calculate "instaneous" exonic or "cumulative" intronic potential + +float* ExinPot::calcScr_1(const Seq* sd, float* scr) const { - if (zZ & 1 && !readExinPot(fname? fname: EXONPOT)) return; - if (zZ & 2) readExinPot(fname? fname: INTRONPOT); +const float* pot = begin(); +const int len = sd->right - sd->left; +const CHAR* ss = sd->at(sd->left); +const CHAR* tt = sd->at(sd->right); +const CHAR* redctab = sd->istron()? tnredctab: ncredctab; +const int kk = morder + 1; + + if (sd->left >= 0) --ss; + float* result = new float[len]; + float* rest = result - morder; + int x = kk; + int w = 0; +const bool epot = (exin == static_cast(Iefp::EP)); + double acc = 0.; // cumulative + for ( ; ss < tt; ++rest) { + INT c = redctab[*ss++]; + if (c < 4) { + if (x) --x; + w = (4 * w + c) % ndata; + } else { + w = 0; + x = kk; + } +const float pt = x? 0: pot[w]; + if (rest >= result) { + acc += pt; + *rest = epot? pt: float(acc); + } + } + vclear(rest, result + len - rest); + if (scr) + *scr = epot? float(acc): (result[len - 1 - rm] - result[lm]); + return (result); } -// calculate "instaneous" exonic or "cumulative" intronic potential +// Calculate coding potential from DNA seq based on the 5th-order Markov model -float* ExinPot::calcExinPot(const Seq* sd, bool exon) const +float* ExinPot::calcScr_3(const Seq* sd, float* scr) const { - FTYPE* pot = exon? ExonPot: IntronPot; - if (!pot) return 0; - double acc = 0.; + CHAR* redctab = sd->inex.molc == TRON? tnredctab: ncredctab; +const int len = sd->right - sd->left; const CHAR* ss = sd->at(sd->left); const CHAR* tt = sd->at(sd->right); -const CHAR* redctab = sd->inex.molc == TRON? tnredctab: ncredctab; - float* result = new float[sd->right - sd->left]; - float* rest = result; - if (sd->left < 0) *rest++ = 0; /* fill pat */ - else --ss; - INT c = redctab[*ss]; - int x = c < 4; - int d = x? c: 0; - while (++ss < tt) { - if ((c = redctab[*ss]) < 4) { - ++x; - d = (CP_NTERM * d + c) % size; +const float* cdp = begin(); + float* result = new float[len]; + float* rest = result - 5; +const int kk = morder + 1; + int x = kk; + int buf[3] = {0, 0, 0}; + int w = 0; +double acc = 0.; + for (int p = 1; ss < tt; p = next_p[p], ++rest) { + int c = redctab[*ss++]; + if (c < 4) { + buf[p] = 3 * (w = (4 * w + c) % ndata); + if (x) --x; } else { - d = x = 0; + w = 0; + x = kk; + } + float val = 0; + if (!x) { + val += cdp[buf[next_p[p]] + 2]; // -1 + val += cdp[buf[prev_p[p]]]; // 0 + val += cdp[buf[p] + 1]; // +1 + } + if (rest >= result) { + *rest = val; + if (scr && p == 1) acc += val; } - float pt = (x < morder)? 0: pot[d]; - *rest++ = exon? pt: acc += pt; } + vclear(rest, result + len - rest); + if (scr) *scr = float(acc); return (result); } diff -Nru spaln-2.4.7+dfsg/src/utilseq.h spaln-2.4.9a+dfsg/src/utilseq.h --- spaln-2.4.7+dfsg/src/utilseq.h 2022-01-28 07:55:49.000000000 +0000 +++ spaln-2.4.9a+dfsg/src/utilseq.h 2022-05-12 01:54:43.000000000 +0000 @@ -22,35 +22,62 @@ #ifndef _UTILSEQ_H_ #define _UTILSEQ_H_ 1 -#define CODEPOT "CodePotTab" -#define INTRONPOT "IntronPotTab" -#define EXONPOT "ExonPotTab" - extern CHAR gencode[]; +class EiJuncSeq; + +enum class Iefp {IF, IP, IB, EF, EP, EB, CF, CP, CB, GF, NG}; +static const int itn_lm = 6; // intron 5' immune margin +static const int itn_rm = 16; // intron 3' immune margin +static const char CODEPOT[] = "CodePotTab"; +static const char INTRONPOT[] = "IntronPotTab"; +static const char EXONPOT[] = "ExonPotTab"; static const int CP_NTERM = 4; static const int STOP = INT_MAX; static const float maxtonic = 5.; +static const CHAR max_add_size = 24; // capacity of additional bytes +static const char text_ext[] = ".txt"; // text file +static const char patmat_ext[] = ".psm"; // extension to binary pssm file +static const char iefp_ext[11][8] = + {".ifq", ".ipt", ".ifp", ".efq", ".ept", ".efp", + ".cfq", ".cpt", ".cfp", ".gfq", ""}; +static const char iefp_tid[11][16] = + {"IntronFrqTab", "IntronPotTab", "IntronFpPTab", + "ExonFrqTab", "ExonPotTab", "ExonFpPTab", + "CodeFrqTab", "CodePotTab", "CodeFpPTab", "GenomeFrqTab", ""}; +static const char incompatible[] = "%s is incompatible !\n"; + +struct PAT_MATRIX { + int rows, cols, offset; + float* mtx; +}; -struct MATRIX {int rows, cols, offset; int maxidx, minidx; float tonic; double* mtx;}; struct ExpectMmm {float min, mean, max;}; +struct PatMatHead { + CHAR vtype, vsize, nelm, add; // 0/1:int/float, sizeof, 4/20/23 + int rows, cols; +}; + class PatMat { - void readPatMat(FILE* fd); -protected: - int maxidx, minidx, nsupport, nalpha; public: - int rows, cols, offset; - float tonic; - ExpectMmm mmm; // min, mean, max - double* mtx; + int rows = 0, cols = 0, offset = 0; + float tonic = 0, min_elem = 0; + int transvers = 0, skip = 0; // used in reading + int nsupport = 0, nalpha = 0, morder = 0; + ExpectMmm mmm; // min, mean, max + float* mtx = 0; + PatMat(const PatMat& src); // copy constructor - PatMat(FILE* fd); + PatMat(FILE* fd, bool binary = false); PatMat(const char* fname = 0); PatMat(const int r, const int c, const int o = 0, float* m = 0); ~PatMat() {delete[] mtx;} + void readPatMat(FILE* fd); + void readBinPatMat(FILE* fd); float pwm_score(const Seq* sd, const CHAR* ps, const CHAR* redctab = 0) const; float* calcPatMat(const Seq* sd) const; + int order() const {return (morder);} int columns() const {return cols;} void clearmtx() {vclear(mtx, rows * cols);} CHAR* setredctab(const Seq* sd) const; @@ -58,35 +85,78 @@ PatMat& operator=(const PatMat& src); }; -class CodePot { - FTYPE* CodePotTab[CP_NTERM]; - FTYPE* CodePotBuf; - int CodePotType; - int CodePotClmn; - float* calc5MMCodePot(const Seq* sd, int phase) const; - float* calcDitCodePot(Seq* sd, int phase) const; -public: - CodePot(); - ~CodePot() {delete[] CodePotBuf;} - float* calcPrefCodePot(Seq* sd, int phase) const; -}; - class ExinPot { - int morder; - int size; - int lm; - int rm; - float avpot; - float avlen; - FTYPE* ExonPot; - FTYPE* IntronPot; - bool readExinPot(const char* fname); +protected: + int exin = static_cast(Iefp::IP); + int morder = 0; + int nphase = 1; // 1 or 3 + int ndata = 0; + int nsupport = 0; + int lm = 0; + int rm = 0; + float total = 0; // total # of foreground kmers + float avpot = 0; // average of nsupport self scores + float avlen = 0; // average length of nsupport introns + float ess = 0; // expected mean of self scores + float* data = 0; + float* read_wdfq(const char* wdfq, const bool& conditional); + void count_kmers_1(const Seq* sd, float* fq); + void count_kmers_3(const Seq* sd, float* fq); + void reform_1(float* bkg = 0); + void reform_3(); + float* calcScr_1(const Seq* sd, float* scr = 0) const; + float* calcScr_3(const Seq* sd, float* scr = 0) const; public: - ExinPot(int zZ, const char* fname = 0); - ~ExinPot() {delete[] ExonPot; delete[] IntronPot;} - float* calcExinPot(const Seq* sd, bool exon) const; + ExinPot(const int& ein, const int& mo, const int& nf = 1) : + exin(ein), morder(mo), nphase(nf), ndata(ipower(4, mo + 1)), + lm(ein / 3? 0: itn_lm), rm(ein / 3? 0: itn_rm) {} + ExinPot(const char*& fname) { + readFile(fname); + } + ExinPot(const int& exn) : exin(exn) { + if (static_cast(exin) != Iefp::NG) + readFile(iefp_tid[exin]); + else + fatal("Invalid exin code (%d) !\n", exin); + } + ~ExinPot() {delete[] data;} + bool ispot() const { +const Iefp iefp = static_cast(exin); + return (iefp == Iefp::IP || iefp == Iefp::IB || + iefp == Iefp::EP || iefp == Iefp::EB || + iefp == Iefp::CP || iefp == Iefp::CB); + } + bool isfpp() const { +const Iefp iefp = static_cast(exin); + return (iefp == Iefp::IB || iefp == Iefp::EB || + iefp == Iefp::CB); + } + int size() const {return (ndata);} + int dsize() const { + return (nphase * ndata + (isfpp()? nphase * ndata: 0)); + } + bool readFile(const char* fname); + float* getKmers(const char* wdfq, const bool foregrd = true); + float* getKmers(EiJuncSeq* eijseq); + float* getKmers(int argc, const char** argv); + void reform(float* background = 0); + bool makeExinPot(const float* gfq); + bool readBinary(const char* fname); + bool writeBinary(const char* oname); + float* begin() const {return (data);} + float* end() const {return (data + nphase * ndata);} + float* fbegin() const { + return (data + nphase * (isfpp()? ndata: 0)); + } + float* fend() const { + return (data + dsize()); + } + float* calcScr(const Seq* sd, float* scr = 0) const { + return ((nphase == 1)? calcScr_1(sd, scr): calcScr_3(sd, scr)); + } VTYPE intpot(const EXIN* b5,const EXIN* b3) const; - VTYPE avrpot(float f) const {return (VTYPE) (f * avpot);} + VTYPE avrpot(float f = 1.) const {return (VTYPE) (f * avpot);} + VTYPE self_score() const {return (ess);} }; struct EijPat { @@ -95,20 +165,13 @@ PatMat* patternI; PatMat* patternT; PatMat* patternB; - FTYPE tonic3; - FTYPE tonic5; - FTYPE tonicB; + float tonic3; + float tonic5; + float tonicB; EijPat(int hh); ~EijPat(); }; -struct SpbInfo { - CodePot* codepot; - EijPat* eijpat; - SpbInfo(); - ~SpbInfo() {delete codepot; delete eijpat;} -}; - extern void setorf(int len, int ic = SILENT); extern int setcodon(int c); extern int codon_id(const CHAR* s, int byte); @@ -121,5 +184,6 @@ extern void mkinvtab(); extern int getCodonUsage(const char* fname); extern int setCodonUsage(int gc); +extern int fname2exin(const char* fname, int& file_type); #endif