* Notes: * (1) This builds a horizontal disparity model (HDM), but * does not check it against constraints for validity. * Constraint checking is done at rendering time. - * (2) This is not required for a successful model; only the vertical - * disparity is required. This will not be called if the - * function to build the vertical disparity fails. + * (2) Horizontal disparity is not required for a successful model; + * only the vertical disparity is required. This will not be + * called if the function to build the vertical disparity fails. * (3) This sets the hsuccess flag to 1 on success. * (4) Internally in ptal1, ptar1, ptal2, ptar2: x and y are reversed, * so the 'y' value is horizontal distance across the image width. @@ -621,16 +627,15 @@ dewarpFindHorizDisparity(L_DEWARP *dew, PTAA *ptaa) { -l_int32 i, j, w, h, nx, ny, sampling, ret; +l_int32 i, j, n, w, h, nx, ny, sampling, ret; l_float32 c0, c1, cl0, cl1, cl2, cr0, cr1, cr2; -l_float32 x, y, ymin, ymax, refl, refr; +l_float32 x, y, refl, refr; l_float32 val, mederr; NUMA *nald, *nard; PIX *pix1; PTA *ptal1, *ptar1; /* left/right end points of lines; initial */ PTA *ptal2, *ptar2; /* left/right end points; after filtering */ -PTA *ptal3, *ptar3; /* left/right end points; long lines only */ -PTA *ptal4, *ptar4; /* left and right block, fitted, uniform spacing */ +PTA *ptal3, *ptar3; /* left and right block, fitted, uniform spacing */ PTA *pta, *ptat, *pta1, *pta2; PTAA *ptaah; FPIX *fpix; @@ -645,9 +650,9 @@ if (dew->debug) L_INFO("finding horizontal disparity\n", procName); - /* Get the endpoints of the lines */ + /* Get the endpoints of the lines, and sort from top to bottom */ h = pixGetHeight(dew->pixs); - ret = dewarpGetLineEndpoints(h, ptaa, &ptal1, &ptar1); + ret = dewarpGetLineEndPoints(h, ptaa, &ptal1, &ptar1); if (ret) { L_INFO("Horiz disparity not built\n", procName); return 1; @@ -662,22 +667,14 @@ /* Filter the points by x-location to prevent 2-column images * from getting confused about left 
and right endpoints. We * require valid left points to not be farther than - * 0.15 * (remaining distance to the right edge of the image) + * 0.20 * (remaining distance to the right edge of the image) * to the right of the leftmost endpoint, and similarly for - * the right endpoints. (Note: x and y are reversed in the pta.) */ - w = pixGetWidth(dew->pixs); - ptaGetMinMax(ptal1, NULL, &ymin, NULL, NULL); - ptal2 = ptaSelectByValue(ptal1, 0, ymin + 0.15 * (w - ymin), - L_SELECT_YVAL, L_SELECT_IF_LT); - ptaGetMinMax(ptar1, NULL, NULL, NULL, &ymax); - ptar2 = ptaSelectByValue(ptar1, 0, 0.85 * ymax, L_SELECT_YVAL, - L_SELECT_IF_GT); + * the right endpoints. (Note: x and y are reversed in the pta.) + * Also require end points to be near the medians in the + * upper and lower halves. */ + ret = dewarpFilterLineEndPoints(dew, ptal1, ptar1, &ptal2, &ptar2); ptaDestroy(&ptal1); ptaDestroy(&ptar1); - if (dew->debug) { - ptaWrite("/tmp/lept/dewdebug/endpts_left2.pta", ptal2, 1); - ptaWrite("/tmp/lept/dewdebug/endpts_right2.pta", ptar2, 1); - } /* Do a quadratic fit to the left and right endpoints of the * longest lines. Each line is represented by 3 coefficients: @@ -688,48 +685,36 @@ nx = dew->nx; ny = dew->ny; - /* Find the top and bottom set of long lines, defined by being - * at least 0.95 of the length of the longest line in each set. - * Quit if there are not at least 3 lines in each set. */ - ptal3 = ptar3 = NULL; /* end points of longest lines */ - ret = dewarpFindLongLines(ptal2, ptar2, 0.95, &ptal3, &ptar3); - if (ret) { - L_INFO("Horiz disparity not built\n", procName); - ptaDestroy(&ptal2); - ptaDestroy(&ptar2); - return 1; - } - /* Fit the left side, using quadratic LSF on the set of long * lines. It is not necessary to use the noisy LSF fit * function, because we've removed outlier end points by * selecting the long lines. Then uniformly sample along * this fitted curve. 
*/ - dewarpQuadraticLSF(ptal3, &cl2, &cl1, &cl0, &mederr); + dewarpQuadraticLSF(ptal2, &cl2, &cl1, &cl0, &mederr); dew->leftslope = lept_roundftoi(1000. * cl1); /* milli-units */ dew->leftcurv = lept_roundftoi(1000000. * cl2); /* micro-units */ L_INFO("Left quad LSF median error = %5.2f\n", procName, mederr); L_INFO("Left edge slope = %d\n", procName, dew->leftslope); L_INFO("Left edge curvature = %d\n", procName, dew->leftcurv); - ptal4 = ptaCreate(ny); + ptal3 = ptaCreate(ny); for (i = 0; i < ny; i++) { /* uniformly sampled in y */ y = i * sampling; applyQuadraticFit(cl2, cl1, cl0, y, &x); - ptaAddPt(ptal4, x, y); + ptaAddPt(ptal3, x, y); } /* Fit the right side in the same way. */ - dewarpQuadraticLSF(ptar3, &cr2, &cr1, &cr0, &mederr); + dewarpQuadraticLSF(ptar2, &cr2, &cr1, &cr0, &mederr); dew->rightslope = lept_roundftoi(1000.0 * cr1); /* milli-units */ dew->rightcurv = lept_roundftoi(1000000. * cr2); /* micro-units */ L_INFO("Right quad LSF median error = %5.2f\n", procName, mederr); L_INFO("Right edge slope = %d\n", procName, dew->rightslope); L_INFO("Right edge curvature = %d\n", procName, dew->rightcurv); - ptar4 = ptaCreate(ny); + ptar3 = ptaCreate(ny); for (i = 0; i < ny; i++) { /* uniformly sampled in y */ y = i * sampling; applyQuadraticFit(cr2, cr1, cr0, y, &x); - ptaAddPt(ptar4, x, y); + ptaAddPt(ptar3, x, y); } if (dew->debug) { @@ -745,15 +730,15 @@ } pix1 = pixDisplayPta(NULL, dew->pixs, pta1); pixDisplayPta(pix1, pix1, pta2); - pixRenderHorizEndPoints(pix1, ptal3, ptar3, 0xff000000); + pixRenderHorizEndPoints(pix1, ptal2, ptar2, 0xff000000); pixDisplay(pix1, 600, 800); pixWrite("/tmp/lept/dewmod/0051.png", pix1, IFF_PNG); pixDestroy(&pix1); pix1 = pixDisplayPta(NULL, dew->pixs, pta1); pixDisplayPta(pix1, pix1, pta2); - ptalft = ptaTranspose(ptal4); - ptarft = ptaTranspose(ptar4); + ptalft = ptaTranspose(ptal3); + ptarft = ptaTranspose(ptar3); pixRenderHorizEndPoints(pix1, ptalft, ptarft, 0x0000ff00); pixDisplay(pix1, 800, 800); 
pixWrite("/tmp/lept/dewmod/0052.png", pix1, IFF_PNG); @@ -769,19 +754,19 @@ } /* Find the x value at the midpoints (in y) of the two vertical lines, - * ptal4 and ptar4. These are the reference values for each of the + * ptal3 and ptar3. These are the reference values for each of the * lines. Then use the difference between the these midpoint * values and the actual x coordinates of the lines to represent * the horizontal disparity (nald, nard) on the vertical lines * for the sampled y values. */ - ptaGetPt(ptal4, ny / 2, &refl, NULL); - ptaGetPt(ptar4, ny / 2, &refr, NULL); + ptaGetPt(ptal3, ny / 2, &refl, NULL); + ptaGetPt(ptar3, ny / 2, &refr, NULL); nald = numaCreate(ny); nard = numaCreate(ny); for (i = 0; i < ny; i++) { - ptaGetPt(ptal4, i, &x, NULL); + ptaGetPt(ptal3, i, &x, NULL); numaAddNumber(nald, refl - x); - ptaGetPt(ptar4, i, &x, NULL); + ptaGetPt(ptar3, i, &x, NULL); numaAddNumber(nard, refr - x); } @@ -818,13 +803,10 @@ } dew->samphdispar = fpix; dew->hsuccess = 1; - ptaDestroy(&ptal2); ptaDestroy(&ptar2); ptaDestroy(&ptal3); ptaDestroy(&ptar3); - ptaDestroy(&ptal4); - ptaDestroy(&ptar4); ptaaDestroy(&ptaah); return 0; } @@ -888,9 +870,9 @@ if (debugflag) { lept_mkdir("lept/dewmod"); - pixWrite("/tmp/lept/dewmod/0011.png", pix1, IFF_PNG); + pixWrite("/tmp/lept/dewmod/0011.tif", pix1, IFF_TIFF_G4); pixDisplayWithTitle(pix1, 0, 600, "pix1", 1); - pixWrite("/tmp/lept/dewmod/0012.png", pix2, IFF_PNG); + pixWrite("/tmp/lept/dewmod/0012.tif", pix2, IFF_TIFF_G4); pixDisplayWithTitle(pix2, 0, 800, "pix2", 1); } pixDestroy(&pix1); @@ -914,7 +896,7 @@ } if (debugflag) { pix2 = pixaDisplay(pixa2, w, h); - pixWrite("/tmp/lept/dewmod/0013.png", pix2, IFF_PNG); + pixWrite("/tmp/lept/dewmod/0013.tif", pix2, IFF_TIFF_G4); pixDisplayWithTitle(pix2, 0, 1000, "pix2", 1); pixDestroy(&pix2); } @@ -933,7 +915,7 @@ if (debugflag) { pix1 = pixCreateTemplate(pixs); pix2 = pixDisplayPtaa(pix1, ptaa); - pixWrite("/tmp/lept/dewmod/0014.png", pix2, IFF_PNG); + 
pixWrite("/tmp/lept/dewmod/0014.tif", pix2, IFF_PNG); pixDisplayWithTitle(pix2, 0, 1200, "pix3", 1); pixDestroy(&pix1); pixDestroy(&pix2); @@ -1061,12 +1043,12 @@ /*! - * \brief dewarpGetLineEndpoints() + * \brief dewarpGetLineEndPoints() * - * \param[in] h height of pixs - * \param[in] ptaa lines - * \param[out] pptal left end points of each line - * \param[out] pptar right end points of each line + * \param[in] h height of pixs + * \param[in] ptaa lines + * \param[out] pptal left end points of each line + * \param[out] pptar right end points of each line * \return 0 if OK, 1 on error. * *

@@ -1075,22 +1057,24 @@
  *          height of the input image, to insure good coverage and
  *          avoid extrapolating the curvature too far beyond the
  *          actual textlines.  Large extrapolations are particularly
- *          dangerous if used as a reference model.
- *      (2) For fitting the endpoints, x = f(y), we transpose x and y.
+ *          dangerous if used as a reference model.  We also require
+ *          at least 10 lines of text.
+ *      (2) We sort the lines from top to bottom (sort by x in the ptas).
+ *      (3) For fitting the endpoints, x = f(y), we transpose x and y.
  *          Thus all these ptas have x and y swapped!
  *

*/ static l_int32 -dewarpGetLineEndpoints(l_int32 h, +dewarpGetLineEndPoints(l_int32 h, PTAA *ptaa, PTA **pptal, PTA **pptar) { l_int32 i, n, npt, x, y; l_float32 miny, maxy, ratio; -PTA *pta, *ptal, *ptar; +PTA *pta, *ptal1, *ptar1; - PROCNAME("dewarpGetLineEndpoints"); + PROCNAME("dewarpGetLineEndPoints"); if (!pptal || !pptar) return ERROR_INT("&ptal and &ptar not both defined", procName, 1); @@ -1098,165 +1082,213 @@ if (!ptaa) return ERROR_INT("ptaa undefined", procName, 1); + /* Are there at least 10 lines? */ n = ptaaGetCount(ptaa); - ptal = ptaCreate(n); - ptar = ptaCreate(n); + if (n < L_MIN_LINES_FOR_HORIZ_1) { + L_INFO("only %d lines; too few\n", procName, n); + return 1; + } + + /* Extract the line end points, and transpose x and y values */ + ptal1 = ptaCreate(n); + ptar1 = ptaCreate(n); for (i = 0; i < n; i++) { pta = ptaaGetPta(ptaa, i, L_CLONE); ptaGetIPt(pta, 0, &x, &y); - ptaAddPt(ptal, y, x); + ptaAddPt(ptal1, y, x); /* transpose */ npt = ptaGetCount(pta); ptaGetIPt(pta, npt - 1, &x, &y); - ptaAddPt(ptar, y, x); + ptaAddPt(ptar1, y, x); /* transpose */ ptaDestroy(&pta); } /* Use the min and max of the y value on the left side. */ - ptaGetRange(ptal, &miny, &maxy, NULL, NULL); + ptaGetRange(ptal1, &miny, &maxy, NULL, NULL); ratio = (maxy - miny) / (l_float32)h; - if (ratio < MIN_RATIO_LINES_TO_HEIGHT) { + if (ratio < L_MIN_RATIO_LINES_TO_HEIGHT) { L_INFO("ratio lines to height, %f, too small\n", procName, ratio); - ptaDestroy(&ptal); - ptaDestroy(&ptar); + ptaDestroy(&ptal1); + ptaDestroy(&ptar1); return 1; } - *pptal = ptal; - *pptar = ptar; + /* Sort from top to bottom */ + *pptal = ptaSort(ptal1, L_SORT_BY_X, L_SORT_INCREASING, NULL); + *pptar = ptaSort(ptar1, L_SORT_BY_X, L_SORT_INCREASING, NULL); + ptaDestroy(&ptal1); + ptaDestroy(&ptar1); return 0; } /*! 
- * \brief dewarpFindLongLines() + * \brief dewarpFilterLineEndPoints() * - * \param[in] ptal left end points of lines - * \param[in] ptar right end points of lines - * \param[in] minfract minimum allowed fraction of longest line - * \param[out] pptald left end points of longest lines - * \param[out] pptard right end points of longest lines - * \return 0 if OK, 1 on error or if there aren't enough long lines + * \param[in] dew + * \param[in] ptal input left end points of each line + * \param[in] ptar input right end points of each line + * \param[out] pptalf filtered left end points + * \param[out] pptarf filtered right end points + * \return 0 if OK, 1 on error. * *

  * Notes:
- *      (1) We do the following:
- *         (a) Sort the lines from top to bottom, and divide equally
- *             into Top and Bottom sets.
- *         (b) For each set, select the lines that are at least %minfract
- *             of the length of the longest line in the set.
- *             Typically choose %minfract around 0.95.
- *         (c) Accumulate the left and right end points from both
- *             sets into the two returned ptas.
+ *      (1) Avoid confusion with multiple columns by requiring that line
+ *          end points be close enough to leftmost and rightmost end points.
+ *          Must have at least 8 points on left and right after this step.
+ *      (2) Apply second filtering step, find the median positions in
+ *          top and bottom halves, and remove end points that are
+ *          displaced too much from these in the x direction.
+ *          Must have at least 6 points on left and right after this step.
+ *      (3) Reminder: x and y in the pta are transposed; think x = f(y).
  *

*/ static l_int32 -dewarpFindLongLines(PTA *ptal, - PTA *ptar, - l_float32 minfract, - PTA **pptald, - PTA **pptard) +dewarpFilterLineEndPoints(L_DEWARP *dew, + PTA *ptal, + PTA *ptar, + PTA **pptalf, + PTA **pptarf) { -l_int32 i, n, ntop, nt, nb; -l_float32 xl, xr, yl, yr, len, maxtoplen, maxbotlen, tbratio; -NUMA *nalen, *naindex; -PTA *ptals, *ptars, *ptald, *ptard; - - PROCNAME("dewarpFindLongLines"); - - if (!pptald || !pptard) - return ERROR_INT("&ptald and &ptard are not both defined", procName, 1); - *pptald = *pptard = NULL; +l_int32 w, i, n; +l_float32 ymin, ymax, xvall, xvalr, yvall, yvalr; +PTA *ptal1, *ptar1, *ptal2, *ptar2; + + PROCNAME("dewarpFilterLineEndPoints"); if (!ptal || !ptar) - return ERROR_INT("ptal and ptar are not both defined", procName, 1); - if (minfract < 0.8 || minfract > 1.0) - return ERROR_INT("typ minfract is in [0.90 - 0.95]", procName, 1); - - /* Sort from top to bottom, remembering that x <--> y in the pta */ - n = ptaGetCount(ptal); - ptaGetSortIndex(ptal, L_SORT_BY_X, L_SORT_INCREASING, &naindex); - ptals = ptaSortByIndex(ptal, naindex); - ptars = ptaSortByIndex(ptar, naindex); - numaDestroy(&naindex); + return ERROR_INT("ptal or ptar not defined", procName, 1); + *pptalf = *pptarf = NULL; - ptald = ptaCreate(n); /* output of long lines */ - ptard = ptaCreate(n); /* ditto */ + /* First filter for lines near left and right margins */ + w = pixGetWidth(dew->pixs); + ptaGetMinMax(ptal, NULL, &ymin, NULL, NULL); + ptaGetMinMax(ptar, NULL, NULL, NULL, &ymax); + n = ptaGetCount(ptal); /* ptar is the same size; at least 10 */ + ptal1 = ptaCreate(n); + ptar1 = ptaCreate(n); + for (i = 0; i < n; i++) { + ptaGetPt(ptal, i, &xvall, &yvall); + ptaGetPt(ptar, i, &xvalr, &yvalr); + if (yvall < ymin + 0.20 * (w - ymin) && + yvalr > 0.80 * ymax) { + ptaAddPt(ptal1, xvall, yvall); + ptaAddPt(ptar1, xvalr, yvalr); + } + } + if (dew->debug) { + ptaWrite("/tmp/lept/dewdebug/endpts_left2.pta", ptal1, 1); + 
ptaWrite("/tmp/lept/dewdebug/endpts_right2.pta", ptar1, 1); + } - /* Find all lines in the top half that are within typically - * about 5 percent of the length of the longest line in that set. */ - ntop = n / 2; - nalen = numaCreate(n / 2); /* lengths of top lines */ - for (i = 0; i < ntop; i++) { - ptaGetPt(ptals, i, NULL, &xl); - ptaGetPt(ptars, i, NULL, &xr); - numaAddNumber(nalen, xr - xl); - } - numaGetMax(nalen, &maxtoplen, NULL); - L_INFO("Top: maxtoplen = %8.3f\n", procName, maxtoplen); - for (i = 0; i < ntop; i++) { - numaGetFValue(nalen, i, &len); - if (len >= minfract * maxtoplen) { - ptaGetPt(ptals, i, &yl, &xl); - ptaAddPt(ptald, yl, xl); - ptaGetPt(ptars, i, &yr, &xr); - ptaAddPt(ptard, yr, xr); - } - } - numaDestroy(&nalen); - - nt = ptaGetCount(ptald); - if (nt < 3) { - L_INFO("too few long lines at top: %d\n", procName, nt); - ptaDestroy(&ptals); - ptaDestroy(&ptars); - ptaDestroy(&ptald); - ptaDestroy(&ptard); + n = L_MIN(ptaGetCount(ptal1), ptaGetCount(ptar1)); + if (n < L_MIN_LINES_FOR_HORIZ_1 - 2) { + ptaDestroy(&ptal1); + ptaDestroy(&ptar1); + L_INFO("First filter: only %d endpoints; needed 8\n", procName, n); return 1; } - /* Find all lines in the bottom half that are within 8 percent - * of the length of the longest line in that set. */ - nalen = numaCreate(0); /* lengths of bottom lines */ - for (i = ntop; i < n; i++) { - ptaGetPt(ptals, i, NULL, &xl); - ptaGetPt(ptars, i, NULL, &xr); - numaAddNumber(nalen, xr - xl); - } - numaGetMax(nalen, &maxbotlen, NULL); - L_INFO("Bottom: maxbotlen = %8.3f\n", procName, maxbotlen); - for (i = 0; i < n - ntop; i++) { - numaGetFValue(nalen, i, &len); - if (len >= minfract * maxbotlen) { - ptaGetPt(ptals, ntop + i, &yl, &xl); - ptaAddPt(ptald, yl, xl); - ptaGetPt(ptars, ntop + i, &yr, &xr); - ptaAddPt(ptard, yr, xr); - } - } - numaDestroy(&nalen); - ptaDestroy(&ptals); - ptaDestroy(&ptars); - - /* Impose another condition: the top and bottom max lengths must - * be within 15% of each other. 
*/ - tbratio = (maxtoplen >= maxbotlen) ? maxbotlen / maxtoplen : - maxtoplen / maxbotlen; - nb = ptaGetCount(ptald) - nt; - if (nb < 3 || tbratio < 0.85) { - if (nb < 3) L_INFO("too few long lines at bottom: %d\n", procName, nb); - if (tbratio < 0.85) L_INFO("big length diff: ratio = %4.2f\n", - procName, tbratio); - ptaDestroy(&ptald); - ptaDestroy(&ptard); + /* Remove outlier points */ + ptal2 = dewarpRemoveBadEndPoints(w, ptal1); + ptar2 = dewarpRemoveBadEndPoints(w, ptar1); + ptaDestroy(&ptal1); + ptaDestroy(&ptar1); + if (!ptal2 || !ptar2) { + ptaDestroy(&ptal2); + ptaDestroy(&ptar2); + L_INFO("Second filter: too few endpoints left after outliers removed\n", + procName); return 1; - } else { - *pptald = ptald; - *pptard = ptard; } + if (dew->debug) { + ptaWrite("/tmp/lept/dewdebug/endpts_left3.pta", ptal2, 1); + ptaWrite("/tmp/lept/dewdebug/endpts_right3.pta", ptar2, 1); + } + + *pptalf = ptal2; + *pptarf = ptar2; return 0; } /*! + * \brief dewarpRemoveBadEndPoints() + * + * \param[in] w width of input image + * \param[in] ptas left or right line end points + * \return ptad filtered left or right end points, or NULL on error. + * + *

+ * Notes:
+ *      (1) The input set is sorted by line position (x value).
+ *          Break into two (upper and lower); for each find the median
+ *          horizontal (y value), and remove all points farther than
+ *          a fraction of the image width from this.  Make sure each
+ *          part still has at least 3 points, and join the two sections
+ *          before returning.
+ *      (2) Reminder: x and y in the pta are transposed; think x = f(y).
+ *

+ */ +static PTA * +dewarpRemoveBadEndPoints(l_int32 w, + PTA *ptas) +{ +l_int32 i, n, nu, nd; +l_float32 rval, xval, yval, delta; +PTA *ptau1, *ptau2, *ptad1, *ptad2; + + PROCNAME("dewarpRemoveBadEndPoints"); + + if (!ptas) + return (PTA *)ERROR_PTR("ptas not defined", procName, NULL); + + delta = w * L_ALLOWED_W_FRACT; + n = ptaGetCount(ptas); /* will be at least 8 */ + + /* Check the upper half */ + ptau1 = ptaSelectRange(ptas, 0, n / 2); + ptaGetRankValue(ptau1, 0.5, NULL, L_SORT_BY_Y, &rval); + nu = ptaGetCount(ptau1); + ptau2 = ptaCreate(nu); + for (i = 0; i < nu; i++) { + ptaGetPt(ptau1, i, &xval, &yval); /* transposed */ + if (L_ABS(rval - yval) <= delta) + ptaAddPt(ptau2, xval, yval); + } + ptaDestroy(&ptau1); + if (ptaGetCount(ptau2) < L_MIN_LINES_FOR_HORIZ_2) { + ptaDestroy(&ptau2); + L_INFO("Second filter: upper set is too small after outliers removed\n", + procName); + return NULL; + } + + /* Check the lower half */ + ptad1 = ptaSelectRange(ptas, n / 2 + 1, 0); + ptaGetRankValue(ptad1, 0.5, NULL, L_SORT_BY_Y, &rval); + nd = ptaGetCount(ptad1); + ptad2 = ptaCreate(nd); + for (i = 0; i < nd; i++) { + ptaGetPt(ptad1, i, &xval, &yval); /* transposed */ + if (L_ABS(rval - yval) <= delta) + ptaAddPt(ptad2, xval, yval); + } + ptaDestroy(&ptad1); + if (ptaGetCount(ptad2) < L_MIN_LINES_FOR_HORIZ_2) { + ptaDestroy(&ptau2); + ptaDestroy(&ptad2); + L_INFO("Second filter: lower set is too small after outliers removed\n", + procName); + return NULL; + } + + ptaJoin(ptau2, ptad2, 0, -1); + ptaDestroy(&ptad2); + return ptau2; +} + + +/*! * \brief dewarpIsLineCoverageValid() * * \param[in] ptaa of validated lines @@ -1370,6 +1402,243 @@ return 0; } +/*----------------------------------------------------------------------* + * Build disparity model for slope near binding * + *----------------------------------------------------------------------*/ +/*! 
+ * \brief dewarpFindHorizSlopeDisparity() + * + * \param[in] dew + * \param[in] pixb (1 bpp, with vertical and horizontal disparity removed) + * \param[in] fractthresh (threshold fractional difference in density) + * \param[in] parity (0 if even page, 1 if odd page) + * \return 0 if OK, 1 on error + * + *

+ * Notes:
+ *      (1) %fractthresh is a threshold on the fractional difference in stroke
+ *          density between left and right sides.  Process this
+ *          disparity only if the absolute value of the fractional
+ *          difference equals or exceeds this threshold.
+ *      (2) %parity indicates where the binding is: on the left for
+ *          %parity == 0 and on the right for %parity == 1.
+ *      (3) This takes a 1 bpp %pixb where both vertical and horizontal
+ *          disparity have been applied, so the text lines are straight and,
+ *          more importantly, the line end points are vertically aligned.
+ *          It estimates the foreshortening of the characters on the
+ *          binding side, and if significant, computes a one-dimensional
+ *          horizontal disparity function to compensate.
+ *      (4) The first attempt was to use the average width of the
+ *          connected components (c.c.) in vertical slices.  This does not work
+ *          reliably, because the horizontal compression of the text is
+ *          often accompanied by horizontal joining of c.c.
+ *      (5) We use the density of vertical strokes, measured by first using
+ *          a vertical opening, which improves the signal.  The result
+ *          is relatively insensitive to the size of the opening; we use
+ *          a 10-pixel opening.  The relative density is measured by
+ *          finding the number of c.c. in a full height sliding window
+ *          of width 50 pixels, computed every 25 pixels.  Similar results
+ *          are obtained counting c.c. that either intersect the window
+ *          or are fully contained within it.
+ *      (6) Debug output goes to /tmp/lept/dewmod/ for collection into a pdf.
+ *

+ */ +l_int32 +dewarpFindHorizSlopeDisparity(L_DEWARP *dew, + PIX *pixb, + l_float32 fractthresh, + l_int32 parity) +{ +l_int32 i, j, x, n1, n2, nb, ne, count, w, h, ival, prev; +l_int32 istart, iend, first, last, x0, x1, nx, ny; +l_float32 fract, delta, sum, aveval, fval, del, denom; +l_float32 ca, cb, cc, cd, ce, y; +BOX *box; +BOXA *boxa1, *boxa2; +NUMA *na1, *na2, *na3, *na4, *nasum; +PIX *pix1; +PTA *pta1; +FPIX *fpix; + + PROCNAME("dewarpFindHorizSlopeDisparity"); + + if (!dew) + return ERROR_INT("dew not defined", procName, 1); + if (!dew->vvalid || !dew->hvalid) + return ERROR_INT("invalid vert or horiz disparity model", procName, 1); + if (!pixb || pixGetDepth(pixb) != 1) + return ERROR_INT("pixb not defined or not 1 bpp", procName, 1); + + if (dew->debug) L_INFO("finding slope horizontal disparity\n", procName); + + /* Find the bounding boxes of the vertical strokes; remove noise */ + pix1 = pixMorphSequence(pixb, "o1.10", 0); + pixDisplay(pix1, 100, 100); + boxa1 = pixConnCompBB(pix1, 4); + boxa2 = boxaSelectBySize(boxa1, 0, 5, L_SELECT_HEIGHT, L_SELECT_IF_GT, + NULL); + nb = boxaGetCount(boxa2); + fprintf(stderr, "number of components: %d\n", nb); + boxaDestroy(&boxa1); + + /* Estimate the horizontal density of vertical strokes */ + na1 = numaCreate(0); + numaSetParameters(na1, 0, 25); + pixGetDimensions(pixb, &w, &h, NULL); + for (x = 0; x + 50 < w; x += 25) { + box = boxCreate(x, 0, 50, h); + boxaContainedInBoxCount(boxa2, box, &count); + numaAddNumber(na1, count); + boxDestroy(&box); + } + if (dew->debug) { + lept_mkdir("lept/dew"); + gplotSimple1(na1, GPLOT_PNG, "/tmp/lept/dew/0091", NULL); + lept_mv("/tmp/lept/dew/0091.png", "lept/dewmod", NULL, NULL); + pixWrite("/tmp/lept/dewmod/0090.png", pix1, IFF_PNG); + } + pixDestroy(&pix1); + boxaDestroy(&boxa2); + + /* Find the left and right end local maxima; if the difference + * is small, quit. 
*/ + n1 = numaGetCount(na1); + prev = 0; + istart = 0; + first = 0; + for (i = 0; i < n1; i++) { + numaGetIValue(na1, i, &ival); + if (ival >= prev) { + prev = ival; + continue; + } else { + first = prev; + istart = i - 1; + break; + } + } + prev = 0; + last = 0; + iend = n1 - 1; + for (i = n1 - 1; i >= 0; i--) { + numaGetIValue(na1, i, &ival); + if (ival >= prev) { + prev = ival; + continue; + } else { + last = prev; + iend = i + 1; + break; + } + } + na2 = numaClipToInterval(na1, istart, iend); + numaDestroy(&na1); + n2 = numaGetCount(na2); + delta = (parity == 0) ? last - first : first - last; + denom = L_MAX(1.0, (l_float32)(L_MIN(first, last))); + fract = (l_float32)delta / denom; + if (dew->debug) { + L_INFO("Slope-disparity: first = %d, last = %d, fract = %7.3f\n", + procName, first, last, fract); + gplotSimple1(na2, GPLOT_PNG, "/tmp/lept/dew/0092", NULL); + lept_mv("/tmp/lept/dew/0092.png", "lept/dewmod", NULL, NULL); + } + if (fract < fractthresh) { + L_INFO("Small slope-disparity: first = %d, last = %d, fract = %7.3f\n", + procName, first, last, fract); + numaDestroy(&na2); + return 0; + } + + /* Find the density far from the binding, and normalize to 1. 
*/ + ne = n2 - n2 % 2; + if (parity == 0) + numaGetSumOnInterval(na2, 0, ne / 2 - 1, &sum); + else /* parity == 1 */ + numaGetSumOnInterval(na2, ne / 2, ne - 1, &sum); + denom = L_MAX(1.0, (l_float32)(ne / 2)); + aveval = sum / denom; + na3 = numaMakeConstant(aveval, n2); + numaArithOp(na2, na2, na3, L_ARITH_DIVIDE); + numaDestroy(&na3); + if (dew->debug) { + L_INFO("Average background density: %5.1f\n", procName, aveval); + gplotSimple1(na2, GPLOT_PNG, "/tmp/lept/dew/0093", NULL); + lept_mv("/tmp/lept/dew/0093.png", "lept/dewmod", NULL, NULL); + } + + /* Fit the normalized density curve to a quartic */ + pta1 = numaConvertToPta1(na2); + ptaWriteStream(stderr, pta1, 0); +/* ptaGetQuadraticLSF(pta1, NULL, NULL, NULL, &na3); */ + ptaGetQuarticLSF(pta1, &ca, &cb, &cc, &cd, &ce, &na3); + ptaGetArrays(pta1, &na4, NULL); + if (dew->debug) { + gplotSimpleXY1(na4, na3, GPLOT_LINES, GPLOT_PNG, + "/tmp/lept/dew/0094", NULL); + lept_mv("/tmp/lept/dew/0094.png", "lept/dewmod", NULL, NULL); + } + ptaDestroy(&pta1); + + /* Integrate from the high point down to 1 (or v.v) to get the + * disparity needed to make the density constant. 
*/ + nasum = numaMakeConstant(0, w); /* area under the curve above 1.0 */ + if (parity == 0) { + for (i = n2 - 1; i >= 0; i--) { + numaGetFValue(na3, i, &fval); + if (fval < 1.0) break; + } + numaGetIValue(na4, i + 1, &x0); + numaGetIValue(na4, n2 - 1, &x1); + numaSetParameters(nasum, x0, 1); + sum = 0.0; + for (x = x0; x < x1; x++) { + applyQuarticFit(ca, cb, cc, cd, ce, (l_float32)x, &y); + sum += (y - 1.0); + numaReplaceNumber(nasum, x, sum); + } + for (x = x1; x < w; x++) + numaReplaceNumber(nasum, x, sum); + } else { /* parity == 1 */ + for (i = 0; i < n2; i++) { + numaGetFValue(na3, i, &fval); + if (fval < 1.0) break; + } + numaGetIValue(na4, 0, &x0); + numaGetIValue(na4, i - 1, &x1); + numaSetParameters(nasum, x0, 1); + sum = 0.0; + for (x = x1; x >= x0; x--) { + applyQuarticFit(ca, cb, cc, cd, ce, (l_float32)x, &y); + sum += (y - 1.0); + numaReplaceNumber(nasum, x, sum); + } + for (x = x0; x >= 0; x--) + numaReplaceNumber(nasum, x, sum); + } + + /* Save the result in a fpix at the specified subsampling */ + nx = dew->nx; + ny = dew->ny; + fpix = fpixCreate(nx, ny); + del = (l_float32)w / (l_float32)nx; + for (i = 0; i < ny; i++) { + for (j = 0; j < nx; j++) { + x = del * j; + numaGetFValue(nasum, x, &fval); + fpixSetPixel(fpix, j, i, fval); + } + } + dew->sampydispar = fpix; + dew->ysuccess = 1; + + numaDestroy(&na2); + numaDestroy(&na3); + numaDestroy(&na4); + numaDestroy(&nasum); + return 0; +} + /*----------------------------------------------------------------------* * Build line disparity model * @@ -1394,7 +1663,7 @@ * of pre-processing here to insure that. * (3) %opensize is typically about 8. It must be larger than * the thickness of the lines to be extracted. This is the - * default value, which is applied if %opensize \< 3. + * default value, which is applied if %opensize < 3. * (4) Sets vsuccess = 1 and hsuccess = 1 if the vertical and/or * horizontal disparity arrays build. 
* (5) Similar to dewarpBuildPageModel(), except here the vertical @@ -1505,6 +1774,7 @@ /* Remove all lines that are not at least 0.75 times the length * of the longest line. */ +/* WILLUS MOD */ /* ptaa2 = dewarpRemoveShortLines(pix, ptaa1, 0.75, DEBUG_SHORT_LINES); if (debugfile) { @@ -1515,9 +1785,9 @@ pixDestroy(&pix1); pixDestroy(&pix2); } - ptaaDestroy(&ptaa1); */ ptaa2=ptaa1; + ptaaDestroy(&ptaa1); nlines = ptaaGetCount(ptaa2); if (nlines < dew->minlines) { pixDestroy(&pix); diff -Nru k2pdfopt-2.42+ds/leptonica_mod/leptwin.c k2pdfopt-2.51+ds/leptonica_mod/leptwin.c --- k2pdfopt-2.42+ds/leptonica_mod/leptwin.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/leptonica_mod/leptwin.c 2018-01-13 23:27:32.000000000 +0000 @@ -0,0 +1,372 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! + * \file leptwin.c + *

+ *
+ *    This file contains Leptonica routines needed only on Microsoft Windows
+ *
+ *    Currently it only contains one public function
+ *    (based on dibsectn.c by jmh, 03-30-98):
+ *
+ *      HBITMAP    pixGetWindowsHBITMAP(PIX *pix)
+ *

+ */ + +#ifdef _WIN32 +#include +#include +#include "allheaders.h" +#include "leptwin.h" + +/* Macro to determine the number of bytes per line in the DIB bits. + * This accounts for DWORD alignment by adding 31 bits, + * then dividing by 32, then rounding up to the next highest + * count of 4-bytes. Then, we multiply by 4 to get the total byte count. */ +#define BYTESPERLINE(Width, BPP) ((l_int32)((((DWORD)(Width) * (DWORD)(BPP) + 31) >> 5)) << 2) + + +/* ********************************************************************** + DWORD DSImageBitsSize(LPBITMAPINFO pbmi) + + PARAMETERS: + LPBITMAPINFO - pointer to a BITMAPINFO describing a DIB + + RETURNS: + DWORD - the size, in bytes, of the DIB's image bits + + REMARKS: + Calculates and returns the size, in bytes, of the image bits for + the DIB described by the BITMAPINFO. +********************************************************************** */ +static DWORD +DSImageBitsSize(LPBITMAPINFO pbmi) +{ + switch(pbmi->bmiHeader.biCompression) + { + case BI_RLE8: /* wrong if haven't called DSCreateDIBSection or + * CreateDIBSection with this pbmi */ + case BI_RLE4: + return pbmi->bmiHeader.biSizeImage; + break; + default: /* should not have to use "default" */ + case BI_RGB: + case BI_BITFIELDS: + return BYTESPERLINE(pbmi->bmiHeader.biWidth, \ + pbmi->bmiHeader.biBitCount * pbmi->bmiHeader.biPlanes) * + pbmi->bmiHeader.biHeight; + break; + } + return 0; +} + +/* ********************************************************************** + DWORD ImageBitsSize(HBITMAP hbitmap) + + PARAMETERS: + HBITMAP - hbitmap + + RETURNS: + DWORD - the size, in bytes, of the HBITMAP's image bits + + REMARKS: + Calculates and returns the size, in bytes, of the image bits for + the DIB described by the HBITMAP. 
+********************************************************************** */ +static DWORD +ImageBitsSize(HBITMAP hBitmap) +{ + DIBSECTION ds; + + GetObject(hBitmap, sizeof(DIBSECTION), &ds); + switch( ds.dsBmih.biCompression ) + { + case BI_RLE8: /* wrong if haven't called DSCreateDIBSection or + * CreateDIBSection with this pbmi */ + case BI_RLE4: + return ds.dsBmih.biSizeImage; + break; + default: /* should not have to use "default" */ + case BI_RGB: + case BI_BITFIELDS: + return BYTESPERLINE(ds.dsBmih.biWidth, \ + ds.dsBmih.biBitCount * ds.dsBmih.biPlanes) * + ds.dsBmih.biHeight; + break; + } + return 0; +} + +/*! + * \brief setColormap(LPBITMAPINFO pbmi, PIXCMAP *cmap) + * + * \param[in] pbmi pointer to a BITMAPINFO describing a DIB + * \param[in] cmap leptonica colormap + * \return number of colors in cmap + */ +static int +setColormap(LPBITMAPINFO pbmi, + PIXCMAP *cmap) +{ +l_int32 i, nColors, rval, gval, bval; + + nColors = pixcmapGetCount(cmap); + for (i = 0; i < nColors; i++) { + pixcmapGetColor(cmap, i, &rval, &gval, &bval); + pbmi->bmiColors[i].rgbRed = rval; + pbmi->bmiColors[i].rgbGreen = gval; + pbmi->bmiColors[i].rgbBlue = bval; + pbmi->bmiColors[i].rgbReserved = 0; + } + pbmi->bmiHeader.biClrUsed = nColors; + return nColors; +} + +/* ********************************************************************** + HBITMAP DSCreateBitmapInfo(l_int32 width, l_int32 height, l_int32 depth, + PIXCMAP *cmap) + + PARAMETERS: + l_int32 width - Desired width of the DIBSection + l_int32 height - Desired height of the DIBSection + l_int32 depth - Desired bit-depth of the DIBSection + PIXCMAP cmap - leptonica colormap for depths < 16 + + RETURNS: + LPBITMAPINFO - a ptr to BITMAPINFO of the desired size and bit-depth + NULL on failure + + REMARKS: + Creates a BITMAPINFO based on the criteria passed in as parameters. 
+ +********************************************************************** */ +static LPBITMAPINFO +DSCreateBitmapInfo(l_int32 width, + l_int32 height, + l_int32 depth, + PIXCMAP *cmap) +{ +l_int32 nInfoSize; +LPBITMAPINFO pbmi; +LPDWORD pMasks; + + nInfoSize = sizeof(BITMAPINFOHEADER); + if( depth <= 8 ) + nInfoSize += sizeof(RGBQUAD) * (1 << depth); + if((depth == 16) || (depth == 32)) + nInfoSize += (3 * sizeof(DWORD)); + + /* Create the header big enough to contain color table and + * bitmasks if needed. */ + pbmi = (LPBITMAPINFO)malloc(nInfoSize); + if (!pbmi) + return NULL; + + ZeroMemory(pbmi, nInfoSize); + pbmi->bmiHeader.biSize = sizeof(BITMAPINFOHEADER); + pbmi->bmiHeader.biWidth = width; + pbmi->bmiHeader.biHeight = height; + pbmi->bmiHeader.biPlanes = 1; + pbmi->bmiHeader.biBitCount = depth; + + /* override below for 16 and 32 bpp */ + pbmi->bmiHeader.biCompression = BI_RGB; + + /* ?? not sure if this is right? */ + pbmi->bmiHeader.biSizeImage = DSImageBitsSize(pbmi); + + pbmi->bmiHeader.biXPelsPerMeter = 0; + pbmi->bmiHeader.biYPelsPerMeter = 0; + pbmi->bmiHeader.biClrUsed = 0; /* override below */ + pbmi->bmiHeader.biClrImportant = 0; + + switch(depth) + { + case 24: + /* 24bpp requires no special handling */ + break; + case 16: + /* if it's 16bpp, fill in the masks and override the + * compression. These are the default masks -- you + * could change them if needed. 
*/ + pMasks = (LPDWORD)(pbmi->bmiColors); + pMasks[0] = 0x00007c00; + pMasks[1] = 0x000003e0; + pMasks[2] = 0x0000001f; + pbmi->bmiHeader.biCompression = BI_BITFIELDS; + break; + case 32: + /* if it's 32 bpp, fill in the masks and override + * the compression */ + pMasks = (LPDWORD)(pbmi->bmiColors); + /*pMasks[0] = 0x00ff0000; */ + /*pMasks[1] = 0x0000ff00; */ + /*pMasks[2] = 0x000000ff; */ + pMasks[0] = 0xff000000; + pMasks[1] = 0x00ff0000; + pMasks[2] = 0x0000ff00; + + pbmi->bmiHeader.biCompression = BI_BITFIELDS; + break; + case 8: + case 4: + case 1: + setColormap(pbmi, cmap); + break; + } + return pbmi; +} + +/* ********************************************************************** + HBITMAP DSCreateDIBSection(l_int32 width, l_int32 height, l_int32 depth, + PIXCMAP *cmap) + + PARAMETERS: + l_int32 width - Desired width of the DIBSection + l_int32 height - Desired height of the DIBSection + l_int32 depth - Desired bit-depth of the DIBSection + PIXCMAP cmap - leptonica colormap for depths < 16 + + RETURNS: + HBITMAP - a DIBSection HBITMAP of the desired size and bit-depth + NULL on failure + + REMARKS: + Creates a DIBSection based on the criteria passed in as parameters. + +********************************************************************** */ +static HBITMAP +DSCreateDIBSection(l_int32 width, + l_int32 height, + l_int32 depth, + PIXCMAP *cmap) +{ +HBITMAP hBitmap; +l_int32 nInfoSize; +LPBITMAPINFO pbmi; +HDC hRefDC; +LPBYTE pBits; + + pbmi = DSCreateBitmapInfo (width, height, depth, cmap); + if (!pbmi) + return NULL; + + hRefDC = GetDC(NULL); + hBitmap = CreateDIBSection(hRefDC, pbmi, DIB_RGB_COLORS, + (void **) &pBits, NULL, 0); + nInfoSize = GetLastError(); + ReleaseDC(NULL, hRefDC); + free(pbmi); + + return hBitmap; +} + + +/*! + * \brief pixGetWindowsHBITMAP() + * + * \param[in] pix + * \return Windows hBitmap, or NULL on error + * + *

+ * Notes:
+ *      (1) It's the responsibility of the caller to destroy the
+ *          returned hBitmap with a call to DeleteObject (or with
+ *          something that eventually calls DeleteObject).
+ *

+ */ +HBITMAP +pixGetWindowsHBITMAP(PIX *pix) +{ +l_int32 width, height, depth; +l_uint32 *data; +HBITMAP hBitmap = NULL; +BITMAP bm; +DWORD imageBitsSize; +PIX *pixt = NULL; +PIXCMAP *cmap; + + PROCNAME("pixGetWindowsHBITMAP"); + if (!pix) + return (HBITMAP)ERROR_PTR("pix not defined", procName, NULL); + + pixGetDimensions(pix, &width, &height, &depth); + cmap = pixGetColormap(pix); + + if (depth == 24) depth = 32; + if (depth == 2) { + pixt = pixConvert2To8(pix, 0, 85, 170, 255, TRUE); + if (!pixt) + return (HBITMAP)ERROR_PTR("unable to convert pix from 2bpp to 8bpp", + procName, NULL); + depth = pixGetDepth(pixt); + cmap = pixGetColormap(pixt); + } + + if (depth < 16) { + if (!cmap) + cmap = pixcmapCreateLinear(depth, 1< */ +/* +#include "lcms2art.h" +#include "lcms2art_plugin.h" +*/ +#include +#include "colorspace-imp.h" + +#define LCMS_BYTES_MASK 0x7 +/* #define DEBUG_LCMS_MEM(A) do { printf A; fflush(stdout); } while (0) */ +#define DEBUG_LCMS_MEM(A) do { } while (0) + +static void +fz_lcms_log_error(cmsContext id, cmsUInt32Number error_code, const char *error_text) +{ + fz_context *ctx = (fz_context *)cmsGetContextUserData(id); + fz_warn(ctx, "lcms error: %s", error_text); +} + +static void +*fz_lcms_malloc(cmsContext id, unsigned int size) +{ + void *result; + fz_context *ctx = (fz_context *)cmsGetContextUserData(id); + result = fz_malloc_no_throw(ctx, size); + DEBUG_LCMS_MEM(("Allocation:: mupdf ctx = %p lcms ctx = %p allocation = %p \n", (void*) ctx, (void*) id, (void*) result)); + return result; +} + +static void +fz_lcms_free(cmsContext id, void *ptr) +{ + fz_context *ctx = (fz_context *)cmsGetContextUserData(id); + DEBUG_LCMS_MEM(("Free:: mupdf ctx = %p lcms ctx = %p allocation = %p \n", (void*) ctx, (void*) id, (void*) ptr)); + fz_free(ctx, ptr); +} + +static void* +fz_lcms_realloc(cmsContext id, void *ptr, unsigned int size) +{ + fz_context *ctx = (fz_context *)cmsGetContextUserData(id); + DEBUG_LCMS_MEM(("Realloc:: mupdf ctx = %p lcms ctx = %p 
allocation = %p \n", (void*) ctx, (void*) id, (void*) ptr)); + if (ptr == 0) + return fz_lcms_malloc(id, size); + if (size == 0) + { + fz_lcms_free(id, ptr); + return NULL; + } + return fz_resize_array_no_throw(ctx, ptr, size, 1); +} + +static cmsPluginMemHandler fz_lcms_memhandler = +{ + { + cmsPluginMagicNumber, + LCMS_VERSION, + cmsPluginMemHandlerSig, + NULL + }, + fz_lcms_malloc, + fz_lcms_free, + fz_lcms_realloc, + NULL, + NULL, + NULL, +}; + +static int +fz_lcms_num_devcomps(cmsContext cmm_ctx, fz_iccprofile *profile) +{ + return cmsChannelsOf(cmm_ctx, cmsGetColorSpace(cmm_ctx, profile->cmm_handle)); +} + +static char * +fz_lcms_description(cmsContext cmm_ctx, fz_iccprofile *profile) +{ + fz_context *ctx = (fz_context *)cmsGetContextUserData(cmm_ctx); + cmsMLU *descMLU; + char *desc; + size_t size; + + descMLU = cmsReadTag(cmm_ctx, profile->cmm_handle, cmsSigProfileDescriptionTag); + size = cmsMLUgetASCII(cmm_ctx, descMLU, "en", "US", NULL, 0); + desc = fz_malloc(ctx, size); + cmsMLUgetASCII(cmm_ctx, descMLU, "en", "US", desc, size); + return desc; +} + +static void +fz_lcms_premultiply_row(fz_context *ctx, int n, int c, int w, unsigned char *s) +{ + unsigned char a; + int k; + int n1 = n-1; + + for (; w > 0; w--) + { + a = s[n1]; + for (k = 0; k < c; k++) + s[k] = fz_mul255(s[k], a); + s += n; + } +} + +static void +fz_lcms_unmultiply_row(fz_context *ctx, int n, int c, int w, unsigned char *s, const unsigned char *in) +{ + int a, inva; + int k; + int n1 = n-1; + + for (; w > 0; w--) + { + a = in[n1]; + inva = a ? 
255 * 256 / a : 0; + for (k = 0; k < c; k++) + s[k] = (in[k] * inva) >> 8; + for (;k < n1; k++) + s[k] = in[k]; + s[n1] = a; + s += n; + in += n; + } +} + +/* Transform pixmap */ +void +fz_lcms_transform_pixmap(fz_cmm_instance *instance, fz_icclink *link, fz_pixmap *dst, fz_pixmap *src) +{ + cmsContext cmm_ctx = (cmsContext)instance; + fz_context *ctx = (fz_context *)cmsGetContextUserData(cmm_ctx); + cmsHTRANSFORM hTransform = (cmsHTRANSFORM)link->cmm_handle; + int cmm_num_src, cmm_num_des, cmm_extras; + unsigned char *inputpos, *outputpos, *buffer; + int ss = src->stride; + int ds = dst->stride; + int sw = src->w; + int dw = dst->w; + int sn = src->n; + int dn = dst->n; + int sa = src->alpha; + int da = dst->alpha; + int ssp = src->s; + int dsp = dst->s; + int sc = sn - ssp - sa; + int dc = dn - dsp - da; + int h = src->h; + cmsUInt32Number src_format, dst_format; + DEBUG_LCMS_MEM(("@@@@@@@ Transform Pixmap Start:: mupdf ctx = %p lcms ctx = %p link = %p \n", (void*)ctx, (void*)cmm_ctx, (void*)link->cmm_handle)); + + /* check the channels. 
*/ + src_format = cmsGetTransformInputFormat(cmm_ctx, hTransform); + dst_format = cmsGetTransformOutputFormat(cmm_ctx, hTransform); + cmm_num_src = T_CHANNELS(src_format); + cmm_num_des = T_CHANNELS(dst_format); + cmm_extras = T_EXTRA(src_format); + if (cmm_num_src != sc || cmm_num_des != dc || cmm_extras != ssp+sa || sa != da || (link->copy_spots && ssp != dsp)) + fz_throw(ctx, FZ_ERROR_GENERIC, "Mismatching color setup in cmm pixmap transformation: src: %d vs %d+%d+%d, dst: %d vs %d+%d+%d", cmm_num_src, sc, ssp, sa, cmm_num_des, dc, dsp, da); + + /* Transform */ + inputpos = src->samples; + outputpos = dst->samples; + if (sa) + { + /* Allow for premultiplied alpha */ + buffer = fz_malloc(ctx, ss); + for (; h > 0; h--) + { + fz_lcms_unmultiply_row(ctx, sn, sc, sw, buffer, inputpos); + cmsDoTransform(cmm_ctx, hTransform, buffer, outputpos, sw); + fz_lcms_premultiply_row(ctx, dn, dc, dw, outputpos); + inputpos += ss; + outputpos += ds; + } + fz_free(ctx, buffer); + } + else + { + for (; h > 0; h--) + { + cmsDoTransform(cmm_ctx, hTransform, inputpos, outputpos, sw); + inputpos += ss; + outputpos += ds; + } + } + DEBUG_LCMS_MEM(("@@@@@@@ Transform Pixmap End:: mupdf ctx = %p lcms ctx = %p link = %p \n", (void*)ctx, (void*)cmm_ctx, (void*)link->cmm_handle)); +} + +/* Transform a single color. 
*/ +void +fz_lcms_transform_color(fz_cmm_instance *instance, fz_icclink *link, unsigned short *dst, const unsigned short *src) +{ + cmsContext cmm_ctx = (cmsContext)instance; + cmsHTRANSFORM hTransform = (cmsHTRANSFORM) link->cmm_handle; + + cmsDoTransform(cmm_ctx, hTransform, src, dst, 1); +} + +void +fz_lcms_init_link(fz_cmm_instance *instance, fz_icclink *link, const fz_iccprofile *dst, int dst_extras, const fz_iccprofile *src, int src_extras, const fz_iccprofile *prf, const fz_color_params *rend, int cmm_flags, int num_bytes, int copy_spots) +{ + cmsContext cmm_ctx = (cmsContext)instance; + fz_context *ctx = (fz_context *)cmsGetContextUserData(cmm_ctx); + + cmsUInt32Number src_data_type, des_data_type; + cmsColorSpaceSignature src_cs, des_cs; + int src_num_chan, des_num_chan; + int lcms_src_cs, lcms_des_cs; + unsigned int flag = cmsFLAGS_LOWRESPRECALC | cmm_flags; + + DEBUG_LCMS_MEM(("@@@@@@@ Create Link Start:: mupdf ctx = %p lcms ctx = %p src = %p des = %p \n", (void*)ctx, (void*)cmm_ctx, (void*)src->cmm_handle, (void*)dst->cmm_handle)); + + /* src */ + src_cs = cmsGetColorSpace(cmm_ctx, src->cmm_handle); + lcms_src_cs = _cmsLCMScolorSpace(cmm_ctx, src_cs); + if (lcms_src_cs < 0) + lcms_src_cs = 0; + src_num_chan = cmsChannelsOf(cmm_ctx, src_cs); + src_data_type = (COLORSPACE_SH(lcms_src_cs) | CHANNELS_SH(src_num_chan) | DOSWAP_SH(src->bgr) | SWAPFIRST_SH(src->bgr && (src_extras != 0)) | BYTES_SH(num_bytes) | EXTRA_SH(src_extras)); + + /* dst */ + des_cs = cmsGetColorSpace(cmm_ctx, dst->cmm_handle); + lcms_des_cs = _cmsLCMScolorSpace(cmm_ctx, des_cs); + if (lcms_des_cs < 0) + lcms_des_cs = 0; + des_num_chan = cmsChannelsOf(cmm_ctx, des_cs); + des_data_type = (COLORSPACE_SH(lcms_des_cs) | CHANNELS_SH(des_num_chan) | DOSWAP_SH(dst->bgr) | SWAPFIRST_SH(dst->bgr && (dst_extras != 0)) | BYTES_SH(num_bytes) | EXTRA_SH(dst_extras)); + + /* flags */ + if (rend->bp) + flag |= cmsFLAGS_BLACKPOINTCOMPENSATION; + + if (copy_spots) + flag |= cmsFLAGS_COPY_ALPHA; + + 
link->depth = num_bytes; + link->src_extras = src_extras; + link->dst_extras = dst_extras; + link->copy_spots = copy_spots; + + if (prf == NULL) + { + link->cmm_handle = cmsCreateTransformTHR(cmm_ctx, src->cmm_handle, src_data_type, dst->cmm_handle, des_data_type, rend->ri, flag); + if (!link->cmm_handle) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateTransform failed"); + } + else + { + /* littleCMS proof creation links don't work properly with the Ghent + * test files. Handle this in a brutish manner. + */ + if (src == prf) + { + link->cmm_handle = cmsCreateTransformTHR(cmm_ctx, src->cmm_handle, src_data_type, dst->cmm_handle, des_data_type, INTENT_RELATIVE_COLORIMETRIC, flag); + if (!link->cmm_handle) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateTransform failed"); + } + else if (prf == dst) + { + link->cmm_handle = cmsCreateTransformTHR(cmm_ctx, src->cmm_handle, src_data_type, prf->cmm_handle, des_data_type, rend->ri, flag); + if (!link->cmm_handle) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateTransform failed"); + } + else + { + cmsHPROFILE src_to_prf_profile; + cmsHTRANSFORM src_to_prf_link; + cmsColorSpaceSignature prf_cs; + int prf_num_chan; + int lcms_prf_cs; + cmsUInt32Number prf_data_type; + cmsHPROFILE hProfiles[3]; + + prf_cs = cmsGetColorSpace(cmm_ctx, prf->cmm_handle); + lcms_prf_cs = _cmsLCMScolorSpace(cmm_ctx, prf_cs); + if (lcms_prf_cs < 0) + lcms_prf_cs = 0; + prf_num_chan = cmsChannelsOf(cmm_ctx, prf_cs); + prf_data_type = (COLORSPACE_SH(lcms_prf_cs) | CHANNELS_SH(prf_num_chan) | BYTES_SH(num_bytes)); + src_to_prf_link = cmsCreateTransformTHR(cmm_ctx, src->cmm_handle, src_data_type, prf->cmm_handle, prf_data_type, rend->ri, flag); + if (!src_to_prf_link) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateTransform failed"); + src_to_prf_profile = cmsTransform2DeviceLink(cmm_ctx, src_to_prf_link, 3.4, flag); + cmsDeleteTransform(cmm_ctx, src_to_prf_link); + if (!src_to_prf_profile) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsTransform2DeviceLink failed"); + + 
hProfiles[0] = src_to_prf_profile; + hProfiles[1] = prf->cmm_handle; + hProfiles[2] = dst->cmm_handle; + link->cmm_handle = cmsCreateMultiprofileTransformTHR(cmm_ctx, hProfiles, 3, src_data_type, des_data_type, INTENT_RELATIVE_COLORIMETRIC, flag); + cmsCloseProfile(cmm_ctx, src_to_prf_profile); + if (!link->cmm_handle) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateMultiprofileTransform failed"); + } + } + + DEBUG_LCMS_MEM(("@@@@@@@ Create Link End:: mupdf ctx = %p lcms ctx = %p link = %p link_cmm = %p src = %p des = %p \n", (void*)ctx, (void*)cmm_ctx, (void*)link, (void*)link->cmm_handle, (void*)src->cmm_handle, (void*)dst->cmm_handle)); +} + +void +fz_lcms_fin_link(fz_cmm_instance *instance, fz_icclink *link) +{ + cmsContext cmm_ctx = (cmsContext)instance; + DEBUG_LCMS_MEM(("Free Link:: link = %p \n", (void*)link->cmm_handle)); + if (link->cmm_handle != NULL) + cmsDeleteTransform(cmm_ctx, link->cmm_handle); + link->cmm_handle = NULL; +} + +static fz_cmm_instance * +fz_lcms_new_instance(fz_context *ctx) +{ + cmsContext cmm_ctx; + + cmm_ctx = cmsCreateContext(&fz_lcms_memhandler, ctx); + DEBUG_LCMS_MEM(("Context Creation:: mupdf ctx = %p lcms ctx = %p \n", (void*) ctx, (void*) cmm_ctx)); + if (cmm_ctx == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsCreateContext failed"); + cmsSetLogErrorHandlerTHR(cmm_ctx, fz_lcms_log_error); + return (fz_cmm_instance *)cmm_ctx; +} + +static void +fz_lcms_drop_instance(fz_cmm_instance *instance) +{ + DEBUG_LCMS_MEM(("Context Destruction:: lcms ctx = %p \n", (void*)instance)); + if (instance == NULL) + return; + cmsDeleteContext((cmsContext)instance); +} + +static void +fz_lcms_init_profile(fz_cmm_instance *instance, fz_iccprofile *profile) +{ + cmsContext cmm_ctx = (cmsContext)instance; + fz_context *ctx = (fz_context *)cmsGetContextUserData(cmm_ctx); + size_t size; + unsigned char *data; + + DEBUG_LCMS_MEM(("@@@@@@@ Create Profile Start:: mupdf ctx = %p lcms ctx = %p \n", (void*)ctx, (void*)cmm_ctx)); + + size = 
fz_buffer_storage(ctx, profile->buffer, &data); + profile->cmm_handle = cmsOpenProfileFromMemTHR(cmm_ctx, data, (cmsUInt32Number)size); + if (profile->cmm_handle == NULL) + { + profile->num_devcomp = 0; + fz_throw(ctx, FZ_ERROR_GENERIC, "cmsOpenProfileFromMem failed"); + } + profile->num_devcomp = fz_lcms_num_devcomps(cmm_ctx, profile); + profile->desc = fz_lcms_description(cmm_ctx, profile); + + DEBUG_LCMS_MEM(("@@@@@@@ Create Profile End:: mupdf ctx = %p lcms ctx = %p profile = %p profile_cmm = %p \n", (void*)ctx, (void*)cmm_ctx, (void*)profile, (void*)profile->cmm_handle)); +} + +static void +fz_lcms_fin_profile(fz_cmm_instance *instance, fz_iccprofile *profile) +{ + cmsContext cmm_ctx = (cmsContext)instance; + fz_context *ctx = (fz_context *)cmsGetContextUserData(cmm_ctx); + DEBUG_LCMS_MEM(("Free Profile:: profile = %p \n", (void*) profile->cmm_handle)); + if (profile->cmm_handle != NULL) + cmsCloseProfile(cmm_ctx, profile->cmm_handle); + fz_free(ctx, profile->desc); + profile->cmm_handle = NULL; +} + +fz_cmm_engine fz_cmm_engine_lcms = { + fz_lcms_new_instance, + fz_lcms_drop_instance, + fz_lcms_transform_pixmap, + fz_lcms_transform_color, + fz_lcms_init_link, + fz_lcms_fin_link, + fz_lcms_init_profile, + fz_lcms_fin_profile, +}; +#endif diff -Nru k2pdfopt-2.42+ds/mupdf_mod/filter-basic.c k2pdfopt-2.51+ds/mupdf_mod/filter-basic.c --- k2pdfopt-2.42+ds/mupdf_mod/filter-basic.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/filter-basic.c 2018-12-22 23:32:44.000000000 +0000 @@ -0,0 +1,877 @@ +#include "mupdf/fitz.h" + +#include + +/* The null filter reads a specified amount of data from the substream. 
*/ + +struct null_filter +{ + fz_stream *chain; + size_t remain; + int64_t offset; + unsigned char buffer[4096]; +}; + +static int +next_null(fz_context *ctx, fz_stream *stm, size_t max) +{ + struct null_filter *state = stm->state; + size_t n; + + if (state->remain == 0) + return EOF; + + fz_seek(ctx, state->chain, state->offset, 0); + n = fz_available(ctx, state->chain, max); + if (n == 0) + return EOF; + if (n > state->remain) + n = state->remain; + if (n > sizeof(state->buffer)) + n = sizeof(state->buffer); + + memcpy(state->buffer, state->chain->rp, n); + stm->rp = state->buffer; + stm->wp = stm->rp + n; + state->chain->rp += n; + state->remain -= n; + state->offset += n; + stm->pos += n; + return *stm->rp++; +} + +static void +close_null(fz_context *ctx, void *state_) +{ + struct null_filter *state = (struct null_filter *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_null_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset) +{ + struct null_filter *state = fz_malloc_struct(ctx, struct null_filter); + state->chain = fz_keep_stream(ctx, chain); + state->remain = len; + state->offset = offset; + return fz_new_stream(ctx, state, next_null, close_null); +} + +/* The range filter copies data from specified ranges of the chained stream */ + +struct range_filter +{ + fz_stream *chain; + fz_range *ranges; + int nranges; + int next_range; + size_t remain; + int64_t offset; + unsigned char buffer[4096]; +}; + +static int +next_range(fz_context *ctx, fz_stream *stm, size_t max) +{ + struct range_filter *state = stm->state; + size_t n; + + while (state->remain == 0 && state->next_range < state->nranges) + { + fz_range *range = &state->ranges[state->next_range++]; + state->remain = range->length; + state->offset = range->offset; + } + + if (state->remain == 0) + return EOF; + fz_seek(ctx, state->chain, state->offset, 0); + n = fz_available(ctx, state->chain, max); + if (n > state->remain) + n = state->remain; + 
if (n > sizeof(state->buffer)) + n = sizeof(state->buffer); + memcpy(state->buffer, state->chain->rp, n); + stm->rp = state->buffer; + stm->wp = stm->rp + n; + if (n == 0) + return EOF; + state->chain->rp += n; + state->remain -= n; + state->offset += n; + stm->pos += n; + return *stm->rp++; +} + +static void +close_range(fz_context *ctx, void *state_) +{ + struct range_filter *state = (struct range_filter *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state->ranges); + fz_free(ctx, state); +} + +fz_stream * +fz_open_range_filter(fz_context *ctx, fz_stream *chain, fz_range *ranges, int nranges) +{ + struct range_filter *state = NULL; + + state = fz_malloc_struct(ctx, struct range_filter); + fz_try(ctx) + { + if (nranges > 0) + { + state->ranges = fz_calloc(ctx, nranges, sizeof(*ranges)); + memcpy(state->ranges, ranges, nranges * sizeof(*ranges)); + state->nranges = nranges; + state->next_range = 1; + state->remain = ranges[0].length; + state->offset = ranges[0].offset; + } + else + { + state->ranges = NULL; + state->nranges = 0; + state->next_range = 1; + state->remain = 0; + state->offset = 0; + } + state->chain = fz_keep_stream(ctx, chain); + } + fz_catch(ctx) + { + fz_free(ctx, state->ranges); + fz_free(ctx, state); + fz_rethrow(ctx); + } + + return fz_new_stream(ctx, state, next_range, close_range); +} + +/* + * The endstream filter reads a PDF substream, and starts to look for an 'endstream' token + * after the specified length. 
+ */ + +#define END_CHECK_SIZE 32 + +struct endstream_filter +{ + fz_stream *chain; + size_t remain, extras, size; + int64_t offset; + int warned; + unsigned char buffer[4096]; +}; + +static int +next_endstream(fz_context *ctx, fz_stream *stm, size_t max) +{ + struct endstream_filter *state = stm->state; + size_t n, nbytes_in_buffer, size; + unsigned char *rp; + + if (state->remain == 0) + goto look_for_endstream; + + fz_seek(ctx, state->chain, state->offset, 0); + n = fz_available(ctx, state->chain, max); + if (n == 0) + return EOF; + if (n > state->remain) + n = state->remain; + if (n > sizeof(state->buffer)) + n = sizeof(state->buffer); + memcpy(state->buffer, state->chain->rp, n); + stm->rp = state->buffer; + stm->wp = stm->rp + n; + state->chain->rp += n; + state->remain -= n; + state->offset += n; + stm->pos += n; + return *stm->rp++; + +look_for_endstream: + /* We should distrust the stream length, and check for end + * marker before terminating the stream - this is to cope + * with files with duff "Length" values. */ + + /* Move any data left over in our buffer down to the start. + * Ordinarily, there won't be any, but this allows for the + * case where we were part way through matching a stream end + * marker when the buffer filled before. */ + nbytes_in_buffer = state->extras; + if (nbytes_in_buffer) + memmove(state->buffer, stm->rp, nbytes_in_buffer); + stm->rp = state->buffer; + stm->wp = stm->rp + nbytes_in_buffer; + + /* In most sane files, we'll get "\nendstream" instantly. We + * should only need (say) 32 bytes to be sure. For crap files + * where we overread regularly, don't harm performance by + * working in small chunks. */ + size = state->size * 2; + if (size > sizeof(state->buffer)) + size = sizeof(state->buffer); + state->size = size; + + /* Read enough data into our buffer to start looking for the 'endstream' token. 
*/ + fz_seek(ctx, state->chain, state->offset, 0); + while (nbytes_in_buffer < size) + { + n = fz_available(ctx, state->chain, size - nbytes_in_buffer); + if (n == 0) + break; + if (n > size - nbytes_in_buffer) + n = size - nbytes_in_buffer; + memcpy(stm->wp, state->chain->rp, n); + stm->wp += n; + state->chain->rp += n; + nbytes_in_buffer += n; + state->offset += n; + } + + /* Look for the 'endstream' token. */ + rp = fz_memmem(state->buffer, nbytes_in_buffer, "endstream", 9); + if (rp) + { + /* Include newline (CR|LF|CRLF) before 'endstream' token */ + if (rp > state->buffer && rp[-1] == '\n') --rp; + if (rp > state->buffer && rp[-1] == '\r') --rp; + n = rp - state->buffer; + stm->eof = 1; /* We're done, don't call us again! */ + } + else if (nbytes_in_buffer > 11) /* 11 covers enough data to detect "\r?\n?endstream" */ + n = nbytes_in_buffer - 11; /* no endstream, but there is more data */ + else + n = nbytes_in_buffer; /* no endstream, but at the end of the file */ + + /* We have at least n bytes before we hit an end marker */ + state->extras = nbytes_in_buffer - n; + stm->wp = stm->rp + n; + stm->pos += n; + + if (n == 0) + return EOF; + + if (!state->warned) + { + state->warned = 1; +/* willus mod -- no warning */ +/* + fz_warn(ctx, "PDF stream Length incorrect"); +*/ + } + return *stm->rp++; +} + +static void +close_endstream(fz_context *ctx, void *state_) +{ + struct endstream_filter *state = (struct endstream_filter *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_endstream_filter(fz_context *ctx, fz_stream *chain, int len, int64_t offset) +{ + struct endstream_filter *state; + + if (len < 0) + len = 0; + + state = fz_malloc_struct(ctx, struct endstream_filter); + state->chain = fz_keep_stream(ctx, chain); + state->remain = len; + state->offset = offset; + state->extras = 0; + state->size = END_CHECK_SIZE >> 1; /* size is doubled first thing when used */ + + return fz_new_stream(ctx, state, next_endstream, 
close_endstream); +} + +/* Concat filter concatenates several streams into one */ + +struct concat_filter +{ + int max; + int count; + int current; + int pad; /* 1 if we should add whitespace padding between streams */ + unsigned char ws_buf; + fz_stream *chain[1]; +}; + +static int +next_concat(fz_context *ctx, fz_stream *stm, size_t max) +{ + struct concat_filter *state = (struct concat_filter *)stm->state; + size_t n; + + while (state->current < state->count) + { + /* Read the next block of underlying data. */ + if (stm->wp == state->chain[state->current]->wp) + state->chain[state->current]->rp = stm->wp; + n = fz_available(ctx, state->chain[state->current], max); + if (n) + { + stm->rp = state->chain[state->current]->rp; + stm->wp = state->chain[state->current]->wp; + stm->pos += n; + return *stm->rp++; + } + else + { + if (state->chain[state->current]->error) + { + stm->error = 1; + break; + } + state->current++; + fz_drop_stream(ctx, state->chain[state->current-1]); + if (state->pad) + { + stm->rp = (&state->ws_buf)+1; + stm->wp = stm->rp + 1; + stm->pos++; + return 32; + } + } + } + + stm->rp = stm->wp; + + return EOF; +} + +static void +close_concat(fz_context *ctx, void *state_) +{ + struct concat_filter *state = (struct concat_filter *)state_; + int i; + + for (i = state->current; i < state->count; i++) + { + fz_drop_stream(ctx, state->chain[i]); + } + fz_free(ctx, state); +} + +fz_stream * +fz_open_concat(fz_context *ctx, int len, int pad) +{ + struct concat_filter *state; + + state = fz_calloc(ctx, 1, sizeof(struct concat_filter) + (len-1)*sizeof(fz_stream *)); + state->max = len; + state->count = 0; + state->current = 0; + state->pad = pad; + state->ws_buf = 32; + + return fz_new_stream(ctx, state, next_concat, close_concat); +} + +void +fz_concat_push_drop(fz_context *ctx, fz_stream *concat, fz_stream *chain) +{ + struct concat_filter *state = (struct concat_filter *)concat->state; + + if (state->count == state->max) + { + fz_drop_stream(ctx, chain); 
+ fz_throw(ctx, FZ_ERROR_GENERIC, "Concat filter size exceeded"); + } + + state->chain[state->count++] = chain; +} + +/* ASCII Hex Decode */ + +typedef struct fz_ahxd_s fz_ahxd; + +struct fz_ahxd_s +{ + fz_stream *chain; + int eod; + unsigned char buffer[256]; +}; + +static inline int iswhite(int a) +{ + switch (a) { + case '\n': case '\r': case '\t': case ' ': + case '\0': case '\f': case '\b': case 0177: + return 1; + } + return 0; +} + +static inline int ishex(int a) +{ + return (a >= 'A' && a <= 'F') || + (a >= 'a' && a <= 'f') || + (a >= '0' && a <= '9'); +} + +static inline int unhex(int a) +{ + if (a >= 'A' && a <= 'F') return a - 'A' + 0xA; + if (a >= 'a' && a <= 'f') return a - 'a' + 0xA; + if (a >= '0' && a <= '9') return a - '0'; + return 0; +} + +static int +next_ahxd(fz_context *ctx, fz_stream *stm, size_t max) +{ + fz_ahxd *state = stm->state; + unsigned char *p = state->buffer; + unsigned char *ep; + int a, b, c, odd; + + if (max > sizeof(state->buffer)) + max = sizeof(state->buffer); + ep = p + max; + + odd = 0; + + while (p < ep) + { + if (state->eod) + break; + + c = fz_read_byte(ctx, state->chain); + if (c < 0) + break; + + if (ishex(c)) + { + if (!odd) + { + a = unhex(c); + odd = 1; + } + else + { + b = unhex(c); + *p++ = (a << 4) | b; + odd = 0; + } + } + else if (c == '>') + { + if (odd) + *p++ = (a << 4); + state->eod = 1; + break; + } + else if (!iswhite(c)) + { + fz_throw(ctx, FZ_ERROR_GENERIC, "bad data in ahxd: '%c'", c); + } + } + stm->rp = state->buffer; + stm->wp = p; + stm->pos += p - state->buffer; + + if (stm->rp != p) + return *stm->rp++; + return EOF; +} + +static void +close_ahxd(fz_context *ctx, void *state_) +{ + fz_ahxd *state = (fz_ahxd *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_ahxd(fz_context *ctx, fz_stream *chain) +{ + fz_ahxd *state = fz_malloc_struct(ctx, fz_ahxd); + state->chain = fz_keep_stream(ctx, chain); + state->eod = 0; + return fz_new_stream(ctx, state, 
next_ahxd, close_ahxd); +} + +/* ASCII 85 Decode */ + +typedef struct fz_a85d_s fz_a85d; + +struct fz_a85d_s +{ + fz_stream *chain; + unsigned char buffer[256]; + int eod; +}; + +static int +next_a85d(fz_context *ctx, fz_stream *stm, size_t max) +{ + fz_a85d *state = stm->state; + unsigned char *p = state->buffer; + unsigned char *ep; + int count = 0; + int word = 0; + int c; + + if (state->eod) + return EOF; + + if (max > sizeof(state->buffer)) + max = sizeof(state->buffer); + + ep = p + max; + while (p < ep) + { + c = fz_read_byte(ctx, state->chain); + if (c < 0) + break; + + if (c >= '!' && c <= 'u') + { + if (count == 4) + { + word = word * 85 + (c - '!'); + + *p++ = (word >> 24) & 0xff; + *p++ = (word >> 16) & 0xff; + *p++ = (word >> 8) & 0xff; + *p++ = (word) & 0xff; + + word = 0; + count = 0; + } + else + { + word = word * 85 + (c - '!'); + count ++; + } + } + + else if (c == 'z' && count == 0) + { + *p++ = 0; + *p++ = 0; + *p++ = 0; + *p++ = 0; + } + + else if (c == '~') + { + c = fz_read_byte(ctx, state->chain); + if (c != '>') + fz_warn(ctx, "bad eod marker in a85d"); + + switch (count) { + case 0: + break; + case 1: + /* Specifically illegal in the spec, but adobe + * and gs both cope. See normal_87.pdf for a + * case where this matters. 
*/ + fz_warn(ctx, "partial final byte in a85d"); + break; + case 2: + word = word * (85 * 85 * 85) + 0xffffff; + *p++ = word >> 24; + break; + case 3: + word = word * (85 * 85) + 0xffff; + *p++ = word >> 24; + *p++ = word >> 16; + break; + case 4: + word = word * 85 + 0xff; + *p++ = word >> 24; + *p++ = word >> 16; + *p++ = word >> 8; + break; + } + state->eod = 1; + break; + } + + else if (!iswhite(c)) + { + fz_throw(ctx, FZ_ERROR_GENERIC, "bad data in a85d: '%c'", c); + } + } + + stm->rp = state->buffer; + stm->wp = p; + stm->pos += p - state->buffer; + + if (p == stm->rp) + return EOF; + + return *stm->rp++; +} + +static void +close_a85d(fz_context *ctx, void *state_) +{ + fz_a85d *state = (fz_a85d *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_a85d(fz_context *ctx, fz_stream *chain) +{ + fz_a85d *state = fz_malloc_struct(ctx, fz_a85d); + state->chain = fz_keep_stream(ctx, chain); + state->eod = 0; + return fz_new_stream(ctx, state, next_a85d, close_a85d); +} + +/* Run Length Decode */ + +typedef struct fz_rld_s fz_rld; + +struct fz_rld_s +{ + fz_stream *chain; + int run, n, c; + unsigned char buffer[256]; +}; + +static int +next_rld(fz_context *ctx, fz_stream *stm, size_t max) +{ + fz_rld *state = stm->state; + unsigned char *p = state->buffer; + unsigned char *ep; + + if (state->run == 128) + return EOF; + + if (max > sizeof(state->buffer)) + max = sizeof(state->buffer); + ep = p + max; + + while (p < ep) + { + if (state->run == 128) + break; + + if (state->n == 0) + { + state->run = fz_read_byte(ctx, state->chain); + if (state->run < 0) + { + state->run = 128; + break; + } + if (state->run < 128) + state->n = state->run + 1; + if (state->run > 128) + { + state->n = 257 - state->run; + state->c = fz_read_byte(ctx, state->chain); + if (state->c < 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "premature end of data in run length decode"); + } + } + + if (state->run < 128) + { + while (p < ep && state->n) + { + int c = 
fz_read_byte(ctx, state->chain); + if (c < 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "premature end of data in run length decode"); + *p++ = c; + state->n--; + } + } + + if (state->run > 128) + { + while (p < ep && state->n) + { + *p++ = state->c; + state->n--; + } + } + } + + stm->rp = state->buffer; + stm->wp = p; + stm->pos += p - state->buffer; + + if (p == stm->rp) + return EOF; + + return *stm->rp++; +} + +static void +close_rld(fz_context *ctx, void *state_) +{ + fz_rld *state = (fz_rld *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_rld(fz_context *ctx, fz_stream *chain) +{ + fz_rld *state = fz_malloc_struct(ctx, fz_rld); + state->chain = fz_keep_stream(ctx, chain); + state->run = 0; + state->n = 0; + state->c = 0; + return fz_new_stream(ctx, state, next_rld, close_rld); +} + +/* RC4 Filter */ + +typedef struct fz_arc4c_s fz_arc4c; + +struct fz_arc4c_s +{ + fz_stream *chain; + fz_arc4 arc4; + unsigned char buffer[256]; +}; + +static int +next_arc4(fz_context *ctx, fz_stream *stm, size_t max) +{ + fz_arc4c *state = stm->state; + size_t n = fz_available(ctx, state->chain, max); + + if (n == 0) + return EOF; + if (n > sizeof(state->buffer)) + n = sizeof(state->buffer); + + stm->rp = state->buffer; + stm->wp = state->buffer + n; + fz_arc4_encrypt(&state->arc4, stm->rp, state->chain->rp, n); + state->chain->rp += n; + stm->pos += n; + + return *stm->rp++; +} + +static void +close_arc4(fz_context *ctx, void *state_) +{ + fz_arc4c *state = (fz_arc4c *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_arc4(fz_context *ctx, fz_stream *chain, unsigned char *key, unsigned keylen) +{ + fz_arc4c *state = fz_malloc_struct(ctx, fz_arc4c); + state->chain = fz_keep_stream(ctx, chain); + fz_arc4_init(&state->arc4, key, keylen); + return fz_new_stream(ctx, state, next_arc4, close_arc4); +} + +/* AES Filter */ + +typedef struct fz_aesd_s fz_aesd; + +struct fz_aesd_s +{ + fz_stream *chain; 
+ fz_aes aes; + unsigned char iv[16]; + int ivcount; + unsigned char bp[16]; + unsigned char *rp, *wp; + unsigned char buffer[256]; +}; + +static int +next_aesd(fz_context *ctx, fz_stream *stm, size_t max) +{ + fz_aesd *state = stm->state; + unsigned char *p = state->buffer; + unsigned char *ep; + + if (max > sizeof(state->buffer)) + max = sizeof(state->buffer); + ep = p + max; + + while (state->ivcount < 16) + { + int c = fz_read_byte(ctx, state->chain); + if (c < 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "premature end in aes filter"); + state->iv[state->ivcount++] = c; + } + + while (state->rp < state->wp && p < ep) + *p++ = *state->rp++; + + while (p < ep) + { + size_t n = fz_read(ctx, state->chain, state->bp, 16); + if (n == 0) + break; + else if (n < 16) + fz_throw(ctx, FZ_ERROR_GENERIC, "partial block in aes filter"); + + fz_aes_crypt_cbc(&state->aes, FZ_AES_DECRYPT, 16, state->iv, state->bp, state->bp); + state->rp = state->bp; + state->wp = state->bp + 16; + + /* strip padding at end of file */ + if (fz_is_eof(ctx, state->chain)) + { + int pad = state->bp[15]; + if (pad < 1 || pad > 16) + fz_throw(ctx, FZ_ERROR_GENERIC, "aes padding out of range: %d", pad); + state->wp -= pad; + } + + while (state->rp < state->wp && p < ep) + *p++ = *state->rp++; + } + + stm->rp = state->buffer; + stm->wp = p; + stm->pos += p - state->buffer; + + if (p == stm->rp) + return EOF; + + return *stm->rp++; +} + +static void +close_aesd(fz_context *ctx, void *state_) +{ + fz_aesd *state = (fz_aesd *)state_; + fz_drop_stream(ctx, state->chain); + fz_free(ctx, state); +} + +fz_stream * +fz_open_aesd(fz_context *ctx, fz_stream *chain, unsigned char *key, unsigned keylen) +{ + fz_aesd *state = fz_malloc_struct(ctx, fz_aesd); + if (fz_aes_setkey_dec(&state->aes, key, keylen * 8)) + { + fz_free(ctx, state); + fz_throw(ctx, FZ_ERROR_GENERIC, "AES key init failed (keylen=%d)", keylen * 8); + } + state->ivcount = 0; + state->rp = state->bp; + state->wp = state->bp; + state->chain = 
fz_keep_stream(ctx, chain); + return fz_new_stream(ctx, state, next_aesd, close_aesd); +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/font.c k2pdfopt-2.51+ds/mupdf_mod/font.c --- k2pdfopt-2.42+ds/mupdf_mod/font.c 2017-02-25 05:39:41.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/font.c 2018-11-21 00:33:57.000000000 +0000 @@ -1,17 +1,23 @@ +#include "mupdf/fitz.h" +#include "mupdf/ucdn.h" #include "fitz-imp.h" - #include "font-imp.h" +#include "draw-imp.h" #include -/* willus mod */ +/* willus mod -- remove hb includes */ /* #include "hb.h" #include "hb-ft.h" */ +#include + #include FT_FREETYPE_H #include FT_ADVANCES_H +#include FT_MODULE_H #include FT_STROKER_H +#include FT_SYSTEM_H #include FT_TRUETYPE_TABLES_H #include FT_TRUETYPE_TAGS_H @@ -45,7 +51,6 @@ font->flags.ft_substitute = 0; font->flags.fake_bold = 0; font->flags.fake_italic = 0; - font->flags.force_hinting = 0; font->flags.has_opentype = 0; font->t3matrix = fz_identity; @@ -64,7 +69,6 @@ font->glyph_count = glyph_count; - font->flags.use_glyph_bbox = !!use_glyph_bbox; if (use_glyph_bbox && glyph_count <= MAX_BBOX_TABLE_SIZE) { font->bbox_table = fz_malloc_array(ctx, glyph_count, sizeof(fz_rect)); @@ -73,11 +77,6 @@ } else { -/* willus.com mod -- no warning */ -/* - if (use_glyph_bbox) - fz_warn(ctx, "not building glyph bbox table for font '%s' with %d glyphs", font->name, glyph_count); -*/ font->bbox_table = NULL; } @@ -174,12 +173,17 @@ { if (xmin >= xmax || ymin >= ymax) { - /* Invalid bbox supplied. It would be prohibitively slow to - * measure the true one, so make one up. */ - font->bbox.x0 = -1; - font->bbox.y0 = -1; - font->bbox.x1 = 2; - font->bbox.y1 = 2; + /* Invalid bbox supplied. */ + if (font->t3procs) + { + /* For type3 fonts we use the union of all the glyphs' bboxes. */ + font->bbox = fz_empty_rect; + } + else + { + /* For other fonts it would be prohibitively slow to measure the true one, so make one up. 
*/ + font->bbox = fz_unit_rect; + } font->flags.invalid_bbox = 1; } else @@ -191,6 +195,28 @@ } } +float fz_font_ascender(fz_context *ctx, fz_font *font) +{ + if (font->t3procs) + return font->bbox.y1; + else + { + FT_Face face = font->ft_face; + return (float)face->ascender / face->units_per_EM; + } +} + +float fz_font_descender(fz_context *ctx, fz_font *font) +{ + if (font->t3procs) + return font->bbox.y0; + else + { + FT_Face face = font->ft_face; + return (float)face->descender / face->units_per_EM; + } +} + /* * Freetype hooks */ @@ -199,13 +225,17 @@ { int ctx_refs; FT_Library ftlib; + struct FT_MemoryRec_ ftmemory; int ftlib_refs; - fz_load_system_font_func load_font; - fz_load_system_cjk_font_func load_cjk_font; + fz_load_system_font_fn *load_font; + fz_load_system_cjk_font_fn *load_cjk_font; + fz_load_system_fallback_font_fn *load_fallback_font; /* Cached fallback fonts */ + fz_font *base14[14]; + fz_font *cjk[4]; struct { fz_font *serif, *sans; } fallback[256]; - fz_font *symbol; + fz_font *symbol1, *symbol2; fz_font *emoji; }; @@ -220,6 +250,33 @@ char *str; }; +static void *ft_alloc(FT_Memory memory, long size) +{ + fz_context *ctx = (fz_context *) memory->user; + return fz_malloc_no_throw(ctx, size); +} + +static void ft_free(FT_Memory memory, void *block) +{ + fz_context *ctx = (fz_context *) memory->user; + fz_free(ctx, block); +} + +static void *ft_realloc(FT_Memory memory, long cur_size, long new_size, void *block) +{ + fz_context *ctx = (fz_context *) memory->user; + void *newblock = NULL; + if (new_size == 0) + { + fz_free(ctx, block); + return newblock; + } + if (block == NULL) + return ft_alloc(memory, new_size); + return fz_resize_array_no_throw(ctx, block, 1, new_size); +} + + void fz_new_font_context(fz_context *ctx) { ctx->font = fz_malloc_struct(ctx, fz_font_context); @@ -227,6 +284,10 @@ ctx->font->ftlib = NULL; ctx->font->ftlib_refs = 0; ctx->font->load_font = NULL; + ctx->font->ftmemory.user = ctx; + ctx->font->ftmemory.alloc = 
ft_alloc; + ctx->font->ftmemory.free = ft_free; + ctx->font->ftmemory.realloc = ft_realloc; } fz_font_context * @@ -246,22 +307,31 @@ { int i; + for (i = 0; i < nelem(ctx->font->base14); ++i) + fz_drop_font(ctx, ctx->font->base14[i]); + for (i = 0; i < nelem(ctx->font->cjk); ++i) + fz_drop_font(ctx, ctx->font->cjk[i]); for (i = 0; i < nelem(ctx->font->fallback); ++i) { fz_drop_font(ctx, ctx->font->fallback[i].serif); fz_drop_font(ctx, ctx->font->fallback[i].sans); } - fz_drop_font(ctx, ctx->font->symbol); + fz_drop_font(ctx, ctx->font->symbol1); + fz_drop_font(ctx, ctx->font->symbol2); fz_drop_font(ctx, ctx->font->emoji); fz_free(ctx, ctx->font); ctx->font = NULL; } } -void fz_install_load_system_font_funcs(fz_context *ctx, fz_load_system_font_func f, fz_load_system_cjk_font_func f_cjk) +void fz_install_load_system_font_funcs(fz_context *ctx, + fz_load_system_font_fn *f, + fz_load_system_cjk_font_fn *f_cjk, + fz_load_system_fallback_font_fn *f_back) { ctx->font->load_font = f; ctx->font->load_cjk_font = f_cjk; + ctx->font->load_fallback_font = f_back; } fz_font *fz_load_system_font(fz_context *ctx, const char *name, int bold, int italic, int needs_exact_metrics) @@ -271,13 +341,9 @@ if (ctx->font->load_font) { fz_try(ctx) - { font = ctx->font->load_font(ctx, name, bold, italic, needs_exact_metrics); - } fz_catch(ctx) - { font = NULL; - } } return font; @@ -290,13 +356,24 @@ if (ctx->font->load_cjk_font) { fz_try(ctx) - { font = ctx->font->load_cjk_font(ctx, name, ros, serif); - } fz_catch(ctx) - { font = NULL; - } + } + + return font; +} + +fz_font *fz_load_system_fallback_font(fz_context *ctx, int script, int language, int serif, int bold, int italic) +{ + fz_font *font = NULL; + + if (ctx->font->load_fallback_font) + { + fz_try(ctx) + font = ctx->font->load_fallback_font(ctx, script, language, serif, bold, italic); + fz_catch(ctx) + font = NULL; } return font; @@ -304,8 +381,10 @@ fz_font *fz_load_fallback_font(fz_context *ctx, int script, int language, int 
serif, int bold, int italic) { - const char *data; + fz_font **fontp; + const unsigned char *data; int index; + int subfont; int size; if (script < 0 || script > nelem(ctx->font->fallback)) @@ -320,8 +399,8 @@ { case FZ_LANG_ja: index = UCDN_LAST_SCRIPT + 1; break; case FZ_LANG_ko: index = UCDN_LAST_SCRIPT + 2; break; - case FZ_LANG_zh_Hant: index = UCDN_LAST_SCRIPT + 3; break; - case FZ_LANG_zh_Hans: index = UCDN_LAST_SCRIPT + 4; break; + case FZ_LANG_zh_Hans: index = UCDN_LAST_SCRIPT + 3; break; + case FZ_LANG_zh_Hant: index = UCDN_LAST_SCRIPT + 4; break; } } if (script == UCDN_SCRIPT_ARABIC) @@ -331,45 +410,53 @@ } if (serif) + fontp = &ctx->font->fallback[index].serif; + else + fontp = &ctx->font->fallback[index].sans; + + if (!*fontp) { - if (ctx->font->fallback[index].serif) - return ctx->font->fallback[index].serif; - data = fz_lookup_noto_font(ctx, script, language, 1, &size); - if (data) + *fontp = fz_load_system_fallback_font(ctx, script, language, serif, bold, italic); + if (!*fontp) { - ctx->font->fallback[index].serif = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); - return ctx->font->fallback[index].serif; + data = fz_lookup_noto_font(ctx, script, language, &size, &subfont); + if (data) + *fontp = fz_new_font_from_memory(ctx, NULL, data, size, subfont, 0); } } - if (ctx->font->fallback[index].sans) - return ctx->font->fallback[index].sans; - data = fz_lookup_noto_font(ctx, script, language, 0, &size); - if (data) + return *fontp; +} + +static fz_font *fz_load_fallback_symbol1_font(fz_context *ctx) +{ + const unsigned char *data; + int size; + if (!ctx->font->symbol1) { - ctx->font->fallback[index].sans = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); - return ctx->font->fallback[index].sans; + data = fz_lookup_noto_symbol1_font(ctx, &size); + if (data) + ctx->font->symbol1 = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); } - - return NULL; + return ctx->font->symbol1; } -fz_font *fz_load_fallback_symbol_font(fz_context *ctx) 
+static fz_font *fz_load_fallback_symbol2_font(fz_context *ctx) { - const char *data; + const unsigned char *data; int size; - if (!ctx->font->symbol) + if (!ctx->font->symbol2) { - data = fz_lookup_noto_symbol_font(ctx, &size); + data = fz_lookup_noto_symbol2_font(ctx, &size); if (data) - ctx->font->symbol = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); + ctx->font->symbol2 = fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); } - return ctx->font->symbol; + return ctx->font->symbol2; } -fz_font *fz_load_fallback_emoji_font(fz_context *ctx) +static fz_font *fz_load_fallback_emoji_font(fz_context *ctx) { - const char *data; + const unsigned char *data; int size; if (!ctx->font->emoji) { @@ -411,7 +498,7 @@ return; } - fterr = FT_Init_FreeType(&fct->ftlib); + fterr = FT_New_Library(&fct->ftmemory, &fct->ftlib); if (fterr) { const char *mess = ft_error_string(fterr); @@ -419,10 +506,12 @@ fz_throw(ctx, FZ_ERROR_GENERIC, "cannot init freetype: %s", mess); } + FT_Add_Default_Modules(fct->ftlib); + FT_Library_Version(fct->ftlib, &maj, &min, &pat); if (maj == 2 && min == 1 && pat < 7) { - fterr = FT_Done_FreeType(fct->ftlib); + fterr = FT_Done_Library(fct->ftlib); if (fterr) fz_warn(ctx, "freetype finalizing: %s", ft_error_string(fterr)); fz_unlock(ctx, FZ_LOCK_FREETYPE); @@ -442,7 +531,7 @@ fz_lock(ctx, FZ_LOCK_FREETYPE); if (--fct->ftlib_refs == 0) { - fterr = FT_Done_FreeType(fct->ftlib); + fterr = FT_Done_Library(fct->ftlib); if (fterr) fz_warn(ctx, "freetype finalizing: %s", ft_error_string(fterr)); fct->ftlib = NULL; @@ -458,6 +547,7 @@ fz_font *font; int fterr; FT_ULong tag, size, i, n; + char namebuf[sizeof(font->name)]; fz_keep_freetype(ctx); @@ -471,7 +561,27 @@ } if (!name) - name = face->family_name; + { + if (!face->family_name) + { + name = face->style_name; + } + else if (!face->style_name) + { + name = face->family_name; + } + else if (strstr(face->style_name, face->family_name) == face->style_name) + { + name = face->style_name; + } + else + { 
+ fz_strlcpy(namebuf, face->family_name, sizeof(namebuf)); + fz_strlcat(namebuf, " ", sizeof(namebuf)); + fz_strlcat(namebuf, face->style_name, sizeof(namebuf)); + name = namebuf; + } + } font = fz_new_font(ctx, name, use_glyph_bbox, face->num_glyphs); font->ft_face = face; @@ -501,16 +611,30 @@ } } + if (name) + { + if (!font->flags.is_bold) + { + if (strstr(name, "Semibold")) font->flags.is_bold = 1; + if (strstr(name, "Bold")) font->flags.is_bold = 1; + } + if (!font->flags.is_italic) + { + if (strstr(name, "Italic")) font->flags.is_italic = 1; + if (strstr(name, "Oblique")) font->flags.is_italic = 1; + } + } + font->buffer = fz_keep_buffer(ctx, buffer); return font; } fz_font * -fz_new_font_from_memory(fz_context *ctx, const char *name, const char *data, int len, int index, int use_glyph_bbox) +fz_new_font_from_memory(fz_context *ctx, const char *name, const unsigned char *data, int len, int index, int use_glyph_bbox) { fz_buffer *buffer = fz_new_buffer_from_shared_data(ctx, data, len); - fz_font *font; + fz_font *font = NULL; fz_try(ctx) font = fz_new_font_from_buffer(ctx, name, buffer, index, use_glyph_bbox); fz_always(ctx) @@ -524,7 +648,7 @@ fz_new_font_from_file(fz_context *ctx, const char *name, const char *path, int index, int use_glyph_bbox) { fz_buffer *buffer = fz_read_file(ctx, path); - fz_font *font; + fz_font *font = NULL; fz_try(ctx) font = fz_new_font_from_buffer(ctx, name, buffer, index, use_glyph_bbox); fz_always(ctx) @@ -534,21 +658,95 @@ return font; } +static int +find_base14_index(const char *name) +{ + if (!strcmp(name, "Courier")) return 0; + if (!strcmp(name, "Courier-Oblique")) return 1; + if (!strcmp(name, "Courier-Bold")) return 2; + if (!strcmp(name, "Courier-BoldOblique")) return 3; + if (!strcmp(name, "Helvetica")) return 4; + if (!strcmp(name, "Helvetica-Oblique")) return 5; + if (!strcmp(name, "Helvetica-Bold")) return 6; + if (!strcmp(name, "Helvetica-BoldOblique")) return 7; + if (!strcmp(name, "Times-Roman")) return 8; + if 
(!strcmp(name, "Times-Italic")) return 9; + if (!strcmp(name, "Times-Bold")) return 10; + if (!strcmp(name, "Times-BoldItalic")) return 11; + if (!strcmp(name, "Symbol")) return 12; + if (!strcmp(name, "ZapfDingbats")) return 13; + return -1; +} + +fz_font * +fz_new_base14_font(fz_context *ctx, const char *name) +{ + const unsigned char *data; + int size; + int x = find_base14_index(name); + if (x >= 0) + { + if (ctx->font->base14[x]) + return fz_keep_font(ctx, ctx->font->base14[x]); + data = fz_lookup_base14_font(ctx, name, &size); + if (data) + { + ctx->font->base14[x] = fz_new_font_from_memory(ctx, name, data, size, 0, 1); + ctx->font->base14[x]->flags.is_serif = (name[0] == 'T'); /* Times-Roman */ + return fz_keep_font(ctx, ctx->font->base14[x]); + } + } + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find builtin font with name '%s'", name); +} + +fz_font * +fz_new_cjk_font(fz_context *ctx, int ordering) +{ + const unsigned char *data; + int size, index; + if (ordering >= 0 && ordering < nelem(ctx->font->cjk)) + { + if (ctx->font->cjk[ordering]) + return fz_keep_font(ctx, ctx->font->cjk[ordering]); + data = fz_lookup_cjk_font(ctx, ordering, &size, &index); + if (data) + { + ctx->font->cjk[ordering] = fz_new_font_from_memory(ctx, NULL, data, size, index, 0); + return fz_keep_font(ctx, ctx->font->cjk[ordering]); + } + } + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find builtin CJK font"); +} + +fz_font * +fz_new_builtin_font(fz_context *ctx, const char *name, int is_bold, int is_italic) +{ + const unsigned char *data; + int size; + data = fz_lookup_builtin_font(ctx, name, is_bold, is_italic, &size); + if (!data) + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find builtin font with name '%s'", name); + return fz_new_font_from_memory(ctx, NULL, data, size, 0, 0); +} + static fz_matrix * fz_adjust_ft_glyph_width(fz_context *ctx, fz_font *font, int gid, fz_matrix *trm) { /* Fudge the font matrix to stretch the glyph if we've substituted the font. 
*/ if (font->flags.ft_stretch && font->width_table /* && font->wmode == 0 */) { - FT_Fixed adv; + FT_Error fterr; + FT_Fixed adv = 0; float subw; float realw; fz_lock(ctx, FZ_LOCK_FREETYPE); - FT_Get_Advance(font->ft_face, gid, FT_LOAD_NO_SCALE | FT_LOAD_NO_HINTING | FT_LOAD_IGNORE_TRANSFORM, &adv); + fterr = FT_Get_Advance(font->ft_face, gid, FT_LOAD_NO_SCALE | FT_LOAD_NO_HINTING | FT_LOAD_IGNORE_TRANSFORM, &adv); fz_unlock(ctx, FZ_LOCK_FREETYPE); + if (fterr) + fz_warn(ctx, "freetype getting character advance: %s", ft_error_string(fterr)); - realw = (float)adv * 1000 / ((FT_Face)font->ft_face)->units_per_EM; + realw = adv * 1000.0f / ((FT_Face)font->ft_face)->units_per_EM; if (gid < font->width_count) subw = font->width_table[gid]; else @@ -556,7 +754,7 @@ /* Sanity check scaling in case of broken metrics. */ if (realw > 0 && subw > 0) - fz_pre_scale(trm, subw / realw, 1); + *trm = fz_pre_scale(*trm, subw / realw, 1); } return trm; @@ -582,20 +780,19 @@ /* Takes the freetype lock, and returns with it held */ static FT_GlyphSlot -do_ft_render_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, int aa) +do_ft_render_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, int aa) { FT_Face face = font->ft_face; FT_Matrix m; FT_Vector v; FT_Error fterr; - fz_matrix local_trm = *trm; float strength = fz_matrix_expansion(trm) * 0.02f; - fz_adjust_ft_glyph_width(ctx, font, gid, &local_trm); + fz_adjust_ft_glyph_width(ctx, font, gid, &trm); if (font->flags.fake_italic) - fz_pre_shear(&local_trm, SHEAR, 0); + trm = fz_pre_shear(trm, SHEAR, 0); /* Freetype mutilates complex glyphs if they are loaded @@ -605,12 +802,12 @@ into FT_Set_Char_Size instead */ - m.xx = local_trm.a * 64; /* should be 65536 */ - m.yx = local_trm.b * 64; - m.xy = local_trm.c * 64; - m.yy = local_trm.d * 64; - v.x = local_trm.e * 64; - v.y = local_trm.f * 64; + m.xx = trm.a * 64; /* should be 65536 */ + m.yx = trm.b * 64; + m.xy = trm.c * 64; + m.yy = trm.d * 64; + v.x = trm.e 
* 64; + v.y = trm.f * 64; fz_lock(ctx, FZ_LOCK_FREETYPE); fterr = FT_Set_Char_Size(face, 65536, 65536, 72, 72); /* should be 64, 64 */ @@ -621,11 +818,11 @@ if (aa == 0) { /* enable grid fitting for non-antialiased rendering */ - float scale = fz_matrix_expansion(&local_trm); - m.xx = local_trm.a * 65536 / scale; - m.yx = local_trm.b * 65536 / scale; - m.xy = local_trm.c * 65536 / scale; - m.yy = local_trm.d * 65536 / scale; + float scale = fz_matrix_expansion(trm); + m.xx = trm.a * 65536 / scale; + m.yx = trm.b * 65536 / scale; + m.xy = trm.c * 65536 / scale; + m.yy = trm.d * 65536 / scale; v.x = 0; v.y = 0; @@ -639,21 +836,6 @@ goto retry_unhinted; } } - else if (font->flags.force_hinting) - { - /* - Enable hinting, but keep the huge char size so that - it is hinted for a character. This will in effect nullify - the effect of grid fitting. This form of hinting should - only be used for DynaLab and similar tricky TrueType fonts, - so that we get the correct outline shape. - */ - fterr = FT_Load_Glyph(face, gid, FT_LOAD_NO_BITMAP); - if (fterr) { - fz_warn(ctx, "freetype load hinted glyph (gid %d): %s", gid, ft_error_string(fterr)); - goto retry_unhinted; - } - } else { retry_unhinted: @@ -671,7 +853,7 @@ FT_Outline_Translate(&face->glyph->outline, -strength * 32, -strength * 32); } - fterr = FT_Render_Glyph(face->glyph, fz_text_aa_level(ctx) > 0 ? FT_RENDER_MODE_NORMAL : FT_RENDER_MODE_MONO); + fterr = FT_Render_Glyph(face->glyph, aa > 0 ? 
FT_RENDER_MODE_NORMAL : FT_RENDER_MODE_MONO); if (fterr) { fz_warn(ctx, "freetype render glyph (gid %d): %s", gid, ft_error_string(fterr)); @@ -681,10 +863,10 @@ } fz_pixmap * -fz_render_ft_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, int aa) +fz_render_ft_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, int aa) { FT_GlyphSlot slot = do_ft_render_glyph(ctx, font, gid, trm, aa); - fz_pixmap *pixmap; + fz_pixmap *pixmap = NULL; if (slot == NULL) { @@ -710,10 +892,10 @@ /* The glyph cache lock is always taken when this is called. */ fz_glyph * -fz_render_ft_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, int aa) +fz_render_ft_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, int aa) { FT_GlyphSlot slot = do_ft_render_glyph(ctx, font, gid, trm, aa); - fz_glyph *glyph; + fz_glyph *glyph = NULL; if (slot == NULL) { @@ -739,7 +921,7 @@ /* Takes the freetype lock, and returns with it held */ static FT_Glyph -do_render_ft_stroked_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, const fz_matrix *ctm, const fz_stroke_state *state) +do_render_ft_stroked_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_matrix ctm, const fz_stroke_state *state, int aa) { FT_Face face = font->ft_face; float expansion = fz_matrix_expansion(ctm); @@ -751,19 +933,18 @@ FT_Glyph glyph; FT_Stroker_LineJoin line_join; FT_Stroker_LineCap line_cap; - fz_matrix local_trm = *trm; - fz_adjust_ft_glyph_width(ctx, font, gid, &local_trm); + fz_adjust_ft_glyph_width(ctx, font, gid, &trm); if (font->flags.fake_italic) - fz_pre_shear(&local_trm, SHEAR, 0); + trm = fz_pre_shear(trm, SHEAR, 0); - m.xx = local_trm.a * 64; /* should be 65536 */ - m.yx = local_trm.b * 64; - m.xy = local_trm.c * 64; - m.yy = local_trm.d * 64; - v.x = local_trm.e * 64; - v.y = local_trm.f * 64; + m.xx = trm.a * 64; /* should be 65536 */ + m.yx = trm.b * 64; + m.xy = trm.c * 64; + m.yy = trm.d * 64; + v.x = trm.e * 64; 
+ v.y = trm.f * 64; fz_lock(ctx, FZ_LOCK_FREETYPE); fterr = FT_Set_Char_Size(face, 65536, 65536, 72, 72); /* should be 64, 64 */ @@ -822,7 +1003,7 @@ FT_Stroker_Done(stroker); - fterr = FT_Glyph_To_Bitmap(&glyph, fz_text_aa_level(ctx) > 0 ? FT_RENDER_MODE_NORMAL : FT_RENDER_MODE_MONO, 0, 1); + fterr = FT_Glyph_To_Bitmap(&glyph, aa > 0 ? FT_RENDER_MODE_NORMAL : FT_RENDER_MODE_MONO, 0, 1); if (fterr) { fz_warn(ctx, "FT_Glyph_To_Bitmap: %s", ft_error_string(fterr)); @@ -833,11 +1014,11 @@ } fz_pixmap * -fz_render_ft_stroked_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, const fz_matrix *ctm, const fz_stroke_state *state) +fz_render_ft_stroked_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_matrix ctm, const fz_stroke_state *state, int aa) { - FT_Glyph glyph = do_render_ft_stroked_glyph(ctx, font, gid, trm, ctm, state); + FT_Glyph glyph = do_render_ft_stroked_glyph(ctx, font, gid, trm, ctm, state, aa); FT_BitmapGlyph bitmap = (FT_BitmapGlyph)glyph; - fz_pixmap *pixmap; + fz_pixmap *pixmap = NULL; if (bitmap == NULL) { @@ -863,11 +1044,11 @@ } fz_glyph * -fz_render_ft_stroked_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, const fz_matrix *ctm, const fz_stroke_state *state) +fz_render_ft_stroked_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_matrix ctm, const fz_stroke_state *state, int aa) { - FT_Glyph glyph = do_render_ft_stroked_glyph(ctx, font, gid, trm, ctm, state); + FT_Glyph glyph = do_render_ft_stroked_glyph(ctx, font, gid, trm, ctm, state, aa); FT_BitmapGlyph bitmap = (FT_BitmapGlyph)glyph; - fz_glyph *result; + fz_glyph *result = NULL; if (bitmap == NULL) { @@ -900,37 +1081,27 @@ FT_BBox cbox; FT_Matrix m; FT_Vector v; - int ft_flags; fz_rect *bounds = &font->bbox_table[gid]; // TODO: refactor loading into fz_load_ft_glyph // TODO: cache results const int scale = face->units_per_EM; - const float recip = 1 / (float)scale; + const float recip = 1.0f / scale; const float 
strength = 0.02f; - fz_matrix local_trm = fz_identity; + fz_matrix trm = fz_identity; - fz_adjust_ft_glyph_width(ctx, font, gid, &local_trm); + fz_adjust_ft_glyph_width(ctx, font, gid, &trm); if (font->flags.fake_italic) - fz_pre_shear(&local_trm, SHEAR, 0); + trm = fz_pre_shear(trm, SHEAR, 0); - m.xx = local_trm.a * 65536; - m.yx = local_trm.b * 65536; - m.xy = local_trm.c * 65536; - m.yy = local_trm.d * 65536; - v.x = local_trm.e * 65536; - v.y = local_trm.f * 65536; - - if (font->flags.force_hinting) - { - ft_flags = FT_LOAD_NO_BITMAP; - } - else - { - ft_flags = FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING; - } + m.xx = trm.a * 65536; + m.yx = trm.b * 65536; + m.xy = trm.c * 65536; + m.yy = trm.d * 65536; + v.x = trm.e * 65536; + v.y = trm.f * 65536; fz_lock(ctx, FZ_LOCK_FREETYPE); /* Set the char size to scale=face->units_per_EM to effectively give @@ -941,20 +1112,20 @@ fz_warn(ctx, "freetype setting character size: %s", ft_error_string(fterr)); FT_Set_Transform(face, &m, &v); - fterr = FT_Load_Glyph(face, gid, ft_flags); + fterr = FT_Load_Glyph(face, gid, FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING); if (fterr) { fz_warn(ctx, "freetype load glyph (gid %d): %s", gid, ft_error_string(fterr)); fz_unlock(ctx, FZ_LOCK_FREETYPE); - bounds->x0 = bounds->x1 = local_trm.e; - bounds->y0 = bounds->y1 = local_trm.f; + bounds->x0 = bounds->x1 = trm.e; + bounds->y0 = bounds->y1 = trm.f; return bounds; } if (font->flags.fake_bold) { FT_Outline_Embolden(&face->glyph->outline, strength * scale); - FT_Outline_Translate(&face->glyph->outline, -strength * 0.5 * scale, -strength * 0.5 * scale); + FT_Outline_Translate(&face->glyph->outline, -strength * 0.5f * scale, -strength * 0.5f * scale); } FT_Outline_Get_CBox(&face->glyph->outline, &cbox); @@ -964,10 +1135,10 @@ bounds->x1 = cbox.xMax * recip; bounds->y1 = cbox.yMax * recip; - if (fz_is_empty_rect(bounds)) + if (fz_is_empty_rect(*bounds)) { - bounds->x0 = bounds->x1 = local_trm.e; - bounds->y0 = bounds->y1 = local_trm.f; + bounds->x0 = 
bounds->x1 = trm.e; + bounds->y0 = bounds->y1 = trm.f; } return bounds; @@ -988,7 +1159,7 @@ fz_path *path = cc->path; fz_point pt; - fz_transform_point_xy(&pt, &cc->trm, p->x, p->y); + pt = fz_transform_point_xy(p->x, p->y, cc->trm); fz_moveto(ctx, path, pt.x, pt.y); return 0; } @@ -1000,7 +1171,7 @@ fz_path *path = cc->path; fz_point pt; - fz_transform_point_xy(&pt, &cc->trm, p->x, p->y); + pt = fz_transform_point_xy(p->x, p->y, cc->trm); fz_lineto(ctx, path, pt.x, pt.y); return 0; } @@ -1012,8 +1183,8 @@ fz_path *path = cc->path; fz_point ct, pt; - fz_transform_point_xy(&ct, &cc->trm, c->x, c->y); - fz_transform_point_xy(&pt, &cc->trm, p->x, p->y); + ct = fz_transform_point_xy(c->x, c->y, cc->trm); + pt = fz_transform_point_xy(p->x, p->y, cc->trm); fz_quadto(ctx, path, ct.x, ct.y, pt.x, pt.y); return 0; @@ -1026,9 +1197,9 @@ fz_path *path = cc->path; fz_point c1t, c2t, pt; - fz_transform_point_xy(&c1t, &cc->trm, c1->x, c1->y); - fz_transform_point_xy(&c2t, &cc->trm, c2->x, c2->y); - fz_transform_point_xy(&pt, &cc->trm, p->x, p->y); + c1t = fz_transform_point_xy(c1->x, c1->y, cc->trm); + c2t = fz_transform_point_xy(c2->x, c2->y, cc->trm); + pt = fz_transform_point_xy(p->x, p->y, cc->trm); fz_curveto(ctx, path, c1t.x, c1t.y, c2t.x, c2t.y, pt.x, pt.y); return 0; @@ -1039,38 +1210,24 @@ }; fz_path * -fz_outline_ft_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm) +fz_outline_ft_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm) { struct closure cc; FT_Face face = font->ft_face; int fterr; - fz_matrix local_trm = *trm; - int ft_flags; const int scale = face->units_per_EM; - const float recip = 1 / (float)scale; + const float recip = 1.0f / scale; const float strength = 0.02f; - fz_adjust_ft_glyph_width(ctx, font, gid, &local_trm); + fz_adjust_ft_glyph_width(ctx, font, gid, &trm); if (font->flags.fake_italic) - fz_pre_shear(&local_trm, SHEAR, 0); + trm = fz_pre_shear(trm, SHEAR, 0); fz_lock(ctx, FZ_LOCK_FREETYPE); - if 
(font->flags.force_hinting) - { - ft_flags = FT_LOAD_NO_BITMAP | FT_LOAD_IGNORE_TRANSFORM; - fterr = FT_Set_Char_Size(face, scale, scale, 72, 72); - if (fterr) - fz_warn(ctx, "freetype setting character size: %s", ft_error_string(fterr)); - } - else - { - ft_flags = FT_LOAD_NO_SCALE | FT_LOAD_IGNORE_TRANSFORM; - } - - fterr = FT_Load_Glyph(face, gid, ft_flags); + fterr = FT_Load_Glyph(face, gid, FT_LOAD_NO_SCALE | FT_LOAD_IGNORE_TRANSFORM); if (fterr) { fz_warn(ctx, "freetype load glyph (gid %d): %s", gid, ft_error_string(fterr)); @@ -1081,7 +1238,7 @@ if (font->flags.fake_bold) { FT_Outline_Embolden(&face->glyph->outline, strength * scale); - FT_Outline_Translate(&face->glyph->outline, -strength * 0.5 * scale, -strength * 0.5 * scale); + FT_Outline_Translate(&face->glyph->outline, -strength * 0.5f * scale, -strength * 0.5f * scale); } cc.path = NULL; @@ -1089,7 +1246,7 @@ { cc.ctx = ctx; cc.path = fz_new_path(ctx); - fz_concat(&cc.trm, fz_scale(&cc.trm, recip, recip), &local_trm); + cc.trm = fz_concat(fz_scale(recip, recip), trm); fz_moveto(ctx, cc.path, cc.trm.e, cc.trm.f); FT_Outline_Decompose(&face->glyph->outline, &outline_funcs, &cc); fz_closepath(ctx, cc.path); @@ -1113,7 +1270,7 @@ */ fz_font * -fz_new_type3_font(fz_context *ctx, const char *name, const fz_matrix *matrix) +fz_new_type3_font(fz_context *ctx, const char *name, fz_matrix matrix) { fz_font *font; @@ -1131,7 +1288,7 @@ fz_rethrow(ctx); } - font->t3matrix = *matrix; + font->t3matrix = matrix; return font; } @@ -1152,7 +1309,7 @@ dev = fz_new_bbox_device(ctx, &font->bbox_table[gid]); fz_try(ctx) { - fz_run_display_list(ctx, list, dev, &font->t3matrix, &fz_infinite_rect, NULL); + fz_run_display_list(ctx, list, dev, font->t3matrix, fz_infinite_rect, NULL); fz_close_device(ctx, dev); } fz_always(ctx) @@ -1163,10 +1320,14 @@ { fz_rethrow(ctx); } + + /* Update font bbox with glyph's computed bbox if the font bbox is invalid */ + if (font->flags.invalid_bbox) + font->bbox = fz_union_rect(font->bbox, 
font->bbox_table[gid]); } void -fz_prepare_t3_glyph(fz_context *ctx, fz_font *font, int gid, int nested_depth) +fz_prepare_t3_glyph(fz_context *ctx, fz_font *font, int gid) { fz_buffer *contents; fz_device *dev; @@ -1179,7 +1340,7 @@ /* We've not already loaded this one! */ assert(font->t3lists[gid] == NULL); - font->t3lists[gid] = fz_new_display_list(ctx, &font->bbox); + font->t3lists[gid] = fz_new_display_list(ctx, font->bbox); dev = fz_new_list_device(ctx, font->t3lists[gid]); dev->flags = FZ_DEVFLAG_FILLCOLOR_UNDEFINED | @@ -1190,29 +1351,33 @@ FZ_DEVFLAG_LINEJOIN_UNDEFINED | FZ_DEVFLAG_MITERLIMIT_UNDEFINED | FZ_DEVFLAG_LINEWIDTH_UNDEFINED; - font->t3run(ctx, font->t3doc, font->t3resources, contents, dev, &fz_identity, NULL, 0); - fz_close_device(ctx, dev); - font->t3flags[gid] = dev->flags; - d1_rect = dev->d1_rect; - fz_drop_device(ctx, dev); - dev = NULL; + fz_try(ctx) + { + font->t3run(ctx, font->t3doc, font->t3resources, contents, dev, fz_identity, NULL, NULL); + fz_close_device(ctx, dev); + font->t3flags[gid] = dev->flags; + d1_rect = dev->d1_rect; + } + fz_always(ctx) + fz_drop_device(ctx, dev); + fz_catch(ctx) + fz_rethrow(ctx); if (fz_display_list_is_empty(ctx, font->t3lists[gid])) { /* If empty, no need for a huge bbox, especially as the logic * in the 'else if' can make it huge. 
*/ font->bbox_table[gid].x0 = font->bbox.x0; font->bbox_table[gid].y0 = font->bbox.y0; - font->bbox_table[gid].x1 = font->bbox.x0 + .00001; - font->bbox_table[gid].y1 = font->bbox.y0 + .00001; + font->bbox_table[gid].x1 = font->bbox.x0 + .00001f; + font->bbox_table[gid].y1 = font->bbox.y0 + .00001f; } else if (font->t3flags[gid] & FZ_DEVFLAG_BBOX_DEFINED) { assert(font->bbox_table != NULL); assert(font->glyph_count > gid); - font->bbox_table[gid] = d1_rect; - fz_transform_rect(&font->bbox_table[gid], &font->t3matrix); + font->bbox_table[gid] = fz_transform_rect(d1_rect, font->t3matrix); - if (font->flags.invalid_bbox || !fz_contains_rect(&font->bbox, &d1_rect)) + if (font->flags.invalid_bbox || !fz_contains_rect(font->bbox, d1_rect)) { /* Either the font bbox is invalid, or the d1_rect returned is * incompatible with it. Either way, don't trust the d1 rect @@ -1220,10 +1385,15 @@ fz_bound_t3_glyph(ctx, font, gid); } } + else + { + /* No bbox has been defined for this glyph, so compute it. 
*/ + fz_bound_t3_glyph(ctx, font, gid); + } } void -fz_run_t3_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, fz_device *dev) +fz_run_t3_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_device *dev) { fz_display_list *list; fz_matrix ctm; @@ -1232,19 +1402,19 @@ if (!list) return; - fz_concat(&ctm, &font->t3matrix, trm); - fz_run_display_list(ctx, list, dev, &ctm, &fz_infinite_rect, NULL); + ctm = fz_concat(font->t3matrix, trm); + fz_run_display_list(ctx, list, dev, ctm, fz_infinite_rect, NULL); } fz_pixmap * -fz_render_t3_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, fz_colorspace *model, const fz_irect *scissor) +fz_render_t3_glyph_pixmap(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_colorspace *model, const fz_irect *scissor, int aa) { fz_display_list *list; fz_rect bounds; fz_irect bbox; - fz_device *dev; + fz_device *dev = NULL; fz_pixmap *glyph; - fz_pixmap *result; + fz_pixmap *result = NULL; if (gid < 0 || gid > 255) return NULL; @@ -1270,17 +1440,18 @@ model = NULL; /* Treat as masked */ } - fz_expand_rect(fz_bound_glyph(ctx, font, gid, trm, &bounds), 1); - fz_irect_from_rect(&bbox, &bounds); - fz_intersect_irect(&bbox, scissor); + bounds = fz_expand_rect(fz_bound_glyph(ctx, font, gid, trm), 1); + bbox = fz_irect_from_rect(bounds); + bbox = fz_intersect_irect(bbox, *scissor); /* Glyphs must always have alpha */ - glyph = fz_new_pixmap_with_bbox(ctx, model, &bbox, 1); - fz_clear_pixmap(ctx, glyph); + glyph = fz_new_pixmap_with_bbox(ctx, model, bbox, NULL/* FIXME */, 1); - dev = fz_new_draw_device_type3(ctx, NULL, glyph); + fz_var(dev); fz_try(ctx) { + fz_clear_pixmap(ctx, glyph); + dev = fz_new_draw_device_type3(ctx, fz_identity, glyph); fz_run_t3_glyph(ctx, font, gid, trm, dev); fz_close_device(ctx, dev); } @@ -1290,6 +1461,7 @@ } fz_catch(ctx) { + fz_drop_pixmap(ctx, glyph); fz_rethrow(ctx); } @@ -1315,14 +1487,14 @@ } fz_glyph * -fz_render_t3_glyph(fz_context *ctx, fz_font 
*font, int gid, const fz_matrix *trm, fz_colorspace *model, const fz_irect *scissor) +fz_render_t3_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm, fz_colorspace *model, const fz_irect *scissor, int aa) { - fz_pixmap *pixmap = fz_render_t3_glyph_pixmap(ctx, font, gid, trm, model, scissor); + fz_pixmap *pixmap = fz_render_t3_glyph_pixmap(ctx, font, gid, trm, model, scissor, aa); return fz_new_glyph_from_pixmap(ctx, pixmap); } void -fz_render_t3_glyph_direct(fz_context *ctx, fz_device *dev, fz_font *font, int gid, const fz_matrix *trm, void *gstate, int nested_depth) +fz_render_t3_glyph_direct(fz_context *ctx, fz_device *dev, fz_font *font, int gid, fz_matrix trm, void *gstate, fz_default_colorspaces *def_cs) { fz_matrix ctm; void *contents; @@ -1347,42 +1519,17 @@ fz_warn(ctx, "type3 glyph doesn't specify masked or colored"); } - fz_concat(&ctm, &font->t3matrix, trm); - font->t3run(ctx, font->t3doc, font->t3resources, contents, dev, &ctm, gstate, nested_depth); -} - -void -fz_print_font(fz_context *ctx, fz_output *out, fz_font *font) -{ - fz_printf(ctx, out, "font '%s' {\n", font->name); - - if (font->ft_face) - { - fz_printf(ctx, out, "\tfreetype face %p\n", font->ft_face); - if (font->flags.ft_substitute) - fz_printf(ctx, out, "\tsubstitute font\n"); - } - - if (font->t3procs) - { - fz_printf(ctx, out, "\ttype3 matrix [%g %g %g %g]\n", - font->t3matrix.a, font->t3matrix.b, - font->t3matrix.c, font->t3matrix.d); - - fz_printf(ctx, out, "\ttype3 bbox [%g %g %g %g]\n", - font->bbox.x0, font->bbox.y0, - font->bbox.x1, font->bbox.y1); - } - - fz_printf(ctx, out, "}\n"); + ctm = fz_concat(font->t3matrix, trm); + font->t3run(ctx, font->t3doc, font->t3resources, contents, dev, ctm, gstate, def_cs); } -fz_rect * -fz_bound_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *trm, fz_rect *rect) +fz_rect +fz_bound_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix trm) { + fz_rect rect; if (font->bbox_table && gid < font->glyph_count) { - if 
(fz_is_infinite_rect(&font->bbox_table[gid])) + if (fz_is_infinite_rect(font->bbox_table[gid])) { if (font->ft_face) fz_bound_ft_glyph(ctx, font, gid); @@ -1391,21 +1538,20 @@ else font->bbox_table[gid] = fz_empty_rect; } - *rect = font->bbox_table[gid]; + rect = font->bbox_table[gid]; if (fz_is_empty_rect(rect)) - *rect = font->bbox; + rect = font->bbox; } else { /* fall back to font bbox */ - *rect = font->bbox; + rect = font->bbox; } - return fz_transform_rect(rect, trm); } fz_path * -fz_outline_glyph(fz_context *ctx, fz_font *font, int gid, const fz_matrix *ctm) +fz_outline_glyph(fz_context *ctx, fz_font *font, int gid, fz_matrix ctm) { if (!font->ft_face) return NULL; @@ -1422,7 +1568,8 @@ static float fz_advance_ft_glyph(fz_context *ctx, fz_font *font, int gid, int wmode) { - FT_Fixed adv; + FT_Error fterr; + FT_Fixed adv = 0; int mask; /* Substitute font widths. */ @@ -1437,8 +1584,10 @@ if (wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; fz_lock(ctx, FZ_LOCK_FREETYPE); - FT_Get_Advance(font->ft_face, gid, mask, &adv); + fterr = FT_Get_Advance(font->ft_face, gid, mask, &adv); fz_unlock(ctx, FZ_LOCK_FREETYPE); + if (fterr) + fz_warn(ctx, "freetype getting character advance: %s", ft_error_string(fterr)); return (float) adv / ((FT_Face)font->ft_face)->units_per_EM; } @@ -1457,7 +1606,11 @@ if (face) { if (FT_HAS_GLYPH_NAMES(face)) - FT_Get_Glyph_Name(face, glyph, buf, size); + { + int fterr = FT_Get_Glyph_Name(face, glyph, buf, size); + if (fterr) + fz_warn(ctx, "freetype get glyph name (gid %d): %s", glyph, ft_error_string(fterr)); + } else fz_snprintf(buf, size, "%d", glyph); } @@ -1549,7 +1702,49 @@ return *out_font = font, gid; } - font = fz_load_fallback_symbol_font(ctx); +#ifndef TOFU_CJK_LANG + if (script == UCDN_SCRIPT_HAN) + { + font = fz_load_fallback_font(ctx, script, FZ_LANG_zh_Hant, user_font->flags.is_serif, user_font->flags.is_bold, user_font->flags.is_italic); + if (font) + { + gid = fz_encode_character(ctx, font, unicode); + if (gid > 0) + return 
*out_font = font, gid; + } + font = fz_load_fallback_font(ctx, script, FZ_LANG_ja, user_font->flags.is_serif, user_font->flags.is_bold, user_font->flags.is_italic); + if (font) + { + gid = fz_encode_character(ctx, font, unicode); + if (gid > 0) + return *out_font = font, gid; + } + font = fz_load_fallback_font(ctx, script, FZ_LANG_ko, user_font->flags.is_serif, user_font->flags.is_bold, user_font->flags.is_italic); + if (font) + { + gid = fz_encode_character(ctx, font, unicode); + if (gid > 0) + return *out_font = font, gid; + } + font = fz_load_fallback_font(ctx, script, FZ_LANG_zh_Hans, user_font->flags.is_serif, user_font->flags.is_bold, user_font->flags.is_italic); + if (font) + { + gid = fz_encode_character(ctx, font, unicode); + if (gid > 0) + return *out_font = font, gid; + } + } +#endif + + font = fz_load_fallback_symbol1_font(ctx); + if (font) + { + gid = fz_encode_character(ctx, font, unicode); + if (gid > 0) + return *out_font = font, gid; + } + + font = fz_load_fallback_symbol2_font(ctx); if (font) { gid = fz_encode_character(ctx, font, unicode); @@ -1568,6 +1763,26 @@ return *out_font = user_font, 0; } +int fz_font_is_bold(fz_context *ctx, fz_font *font) +{ + return font ? font->flags.is_bold : 0; +} + +int fz_font_is_italic(fz_context *ctx, fz_font *font) +{ + return font ? font->flags.is_italic : 0; +} + +int fz_font_is_serif(fz_context *ctx, fz_font *font) +{ + return font ? font->flags.is_serif : 0; +} + +int fz_font_is_monospaced(fz_context *ctx, fz_font *font) +{ + return font ? font->flags.is_mono : 0; +} + const char *fz_font_name(fz_context *ctx, fz_font *font) { return font ? font->name : ""; @@ -1578,9 +1793,9 @@ return font ? font->t3procs : NULL; } -fz_rect *fz_font_bbox(fz_context *ctx, fz_font *font) +fz_rect fz_font_bbox(fz_context *ctx, fz_font *font) { - return font ? &font->bbox : NULL; + return font->bbox; } void *fz_font_ft_face(fz_context *ctx, fz_font *font) @@ -1597,3 +1812,15 @@ { return font ? 
&font->shaper_data : NULL; } + +void fz_font_digest(fz_context *ctx, fz_font *font, unsigned char digest[16]) +{ + if (!font->buffer) + fz_throw(ctx, FZ_ERROR_GENERIC, "no font file for digest"); + if (!font->has_digest) + { + fz_md5_buffer(ctx, font->buffer, font->digest); + font->has_digest = 1; + } + memcpy(digest, font->digest, 16); +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/font-win32.c k2pdfopt-2.51+ds/mupdf_mod/font-win32.c --- k2pdfopt-2.42+ds/mupdf_mod/font-win32.c 2016-10-21 18:56:36.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/font-win32.c 2018-11-21 02:39:32.000000000 +0000 @@ -815,10 +815,10 @@ { switch (ros) { - case FZ_ADOBE_CNS_1: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break; - case FZ_ADOBE_GB_1: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break; - case FZ_ADOBE_JAPAN_1: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break; - case FZ_ADOBE_KOREA_1: font = pdf_load_windows_font_by_name(ctx, "Batang"); break; + case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "MingLiU"); break; + case FZ_ADOBE_GB: font = pdf_load_windows_font_by_name(ctx, "SimSun"); break; + case FZ_ADOBE_JAPAN: font = pdf_load_windows_font_by_name(ctx, "MS-Mincho"); break; + case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Batang"); break; default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid serif ros"); } } @@ -826,8 +826,8 @@ { switch (ros) { - case FZ_ADOBE_CNS_1: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break; - case FZ_ADOBE_GB_1: + case FZ_ADOBE_CNS: font = pdf_load_windows_font_by_name(ctx, "DFKaiShu-SB-Estd-BF"); break; + case FZ_ADOBE_GB: fz_try(ctx) { font = pdf_load_windows_font_by_name(ctx, "KaiTi"); @@ -837,8 +837,8 @@ font = pdf_load_windows_font_by_name(ctx, "KaiTi_GB2312"); } break; - case FZ_ADOBE_JAPAN_1: font = pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break; - case FZ_ADOBE_KOREA_1: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break; + case FZ_ADOBE_JAPAN: font = 
pdf_load_windows_font_by_name(ctx, "MS-Gothic"); break; + case FZ_ADOBE_KOREA: font = pdf_load_windows_font_by_name(ctx, "Gulim"); break; default: fz_throw(ctx, FZ_ERROR_GENERIC, "invalid sans-serif ros"); } } @@ -861,6 +861,6 @@ void pdf_install_load_system_font_funcs(fz_context *ctx) { #ifdef _WIN32 - fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font); + fz_install_load_system_font_funcs(ctx, pdf_load_windows_font, pdf_load_windows_cjk_font, NULL); #endif } diff -Nru k2pdfopt-2.42+ds/mupdf_mod/mupdf/fitz/config.h k2pdfopt-2.51+ds/mupdf_mod/mupdf/fitz/config.h --- k2pdfopt-2.42+ds/mupdf_mod/mupdf/fitz/config.h 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/mupdf/fitz/config.h 2018-11-25 04:42:54.000000000 +0000 @@ -0,0 +1,172 @@ +#ifndef FZ_CONFIG_H + +#define FZ_CONFIG_H + +/* willus mod: make sure SHARE_JPEG is defined */ +#ifndef SHARE_JPEG +#define SHARE_JPEG +#endif + +/* + Enable the following for spot (and hence overprint/overprint + simulation) capable rendering. This forces FZ_PLOTTERS_N on. +*/ +#define FZ_ENABLE_SPOT_RENDERING 1 + +/* + Choose which plotters we need. + By default we build all the plotters in. To avoid building + plotters in that aren't needed, define the unwanted + FZ_PLOTTERS_... define to 0. +*/ +/* #define FZ_PLOTTERS_G 1 */ +/* #define FZ_PLOTTERS_RGB 1 */ +/* #define FZ_PLOTTERS_CMYK 1 */ +/* #define FZ_PLOTTERS_N 1 */ + +/* + Choose which document agents to include. + By default all but GPRF are enabled. To avoid building unwanted + ones, define FZ_ENABLE_... to 0. +*/ +/* #define FZ_ENABLE_PDF 1 */ +/* #define FZ_ENABLE_XPS 1 */ +#define FZ_ENABLE_SVG 0 +/* #define FZ_ENABLE_CBZ 1 */ +/* #define FZ_ENABLE_IMG 1 */ +#define FZ_ENABLE_HTML 0 +#define FZ_ENABLE_EPUB 0 +#define FZ_ENABLE_GPRF 0 + +/* + Choose whether to enable JPEG2000 decoding. 
+ By default, it is enabled, but due to frequent security + issues with the third party libraries we support disabling + it with this flag. +*/ +#define FZ_ENABLE_JPX 1 + +/* + Choose whether to enable JavaScript. + By default JavaScript is enabled both for mutool and PDF interactivity. +*/ +#define FZ_ENABLE_JS 0 + +/* + Choose which fonts to include. + By default we include the base 14 PDF fonts, + DroidSansFallback from Android for CJK, and + Charis SIL from SIL for epub/html. + Enable the following defines to AVOID including + unwanted fonts. +*/ +/* To avoid all noto fonts except CJK, enable: */ +#define TOFU + +/* To skip the CJK font, enable: (this implicitly enables TOFU_CJK_EXT and TOFU_CJK_LANG) */ +/* #define TOFU_CJK */ + +/* To skip CJK Extension A, enable: (this implicitly enables TOFU_CJK_LANG) */ +/* #define TOFU_CJK_EXT */ + +/* To skip CJK language specific fonts, enable: */ +#define TOFU_CJK_LANG + +/* To skip the Emoji font, enable: */ +/* #define TOFU_EMOJI */ + +/* To skip the ancient/historic scripts, enable: */ +/* #define TOFU_HISTORIC */ + +/* To skip the symbol font, enable: */ +/* #define TOFU_SYMBOL */ + +/* To skip the SIL fonts, enable: */ +/* #define TOFU_SIL */ + +/* To skip the ICC profiles, enable: */ +/* #define NO_ICC */ + +/* To skip the Base14 fonts, enable: */ +/* #define TOFU_BASE14 */ +/* (You probably really don't want to do that except for measurement purposes!) 
*/ + +/* ---------- DO NOT EDIT ANYTHING UNDER THIS LINE ---------- */ + +#ifndef FZ_ENABLE_SPOT_RENDERING +#undef FZ_PLOTTERS_N +#define FZ_PLOTTERS_N 1 +#endif /* FZ_ENABLE_SPOT_RENDERING */ + +#ifndef FZ_PLOTTERS_G +#define FZ_PLOTTERS_G 1 +#endif /* FZ_PLOTTERS_G */ + +#ifndef FZ_PLOTTERS_RGB +#define FZ_PLOTTERS_RGB 1 +#endif /* FZ_PLOTTERS_RGB */ + +#ifndef FZ_PLOTTERS_CMYK +#define FZ_PLOTTERS_CMYK 1 +#endif /* FZ_PLOTTERS_CMYK */ + +#ifndef FZ_PLOTTERS_N +#define FZ_PLOTTERS_N 1 +#endif /* FZ_PLOTTERS_N */ + +/* We need at least 1 plotter defined */ +#if FZ_PLOTTERS_G == 0 && FZ_PLOTTERS_RGB == 0 && FZ_PLOTTERS_CMYK == 0 +#undef FZ_PLOTTERS_N +#define FZ_PLOTTERS_N 1 +#endif + +#ifndef FZ_ENABLE_PDF +#define FZ_ENABLE_PDF 1 +#endif /* FZ_ENABLE_PDF */ + +#ifndef FZ_ENABLE_XPS +#define FZ_ENABLE_XPS 1 +#endif /* FZ_ENABLE_XPS */ + +#ifndef FZ_ENABLE_SVG +#define FZ_ENABLE_SVG 1 +#endif /* FZ_ENABLE_SVG */ + +#ifndef FZ_ENABLE_CBZ +#define FZ_ENABLE_CBZ 1 +#endif /* FZ_ENABLE_CBZ */ + +#ifndef FZ_ENABLE_IMG +#define FZ_ENABLE_IMG 1 +#endif /* FZ_ENABLE_IMG */ + +#ifndef FZ_ENABLE_HTML +#define FZ_ENABLE_HTML 1 +#endif /* FZ_ENABLE_HTML */ + +#ifndef FZ_ENABLE_EPUB +#define FZ_ENABLE_EPUB 1 +#endif /* FZ_ENABLE_EPUB */ + +#ifndef FZ_ENABLE_GPRF +#define FZ_ENABLE_GPRF 0 +#endif /* FZ_ENABLE_GPRF */ + +#ifndef FZ_ENABLE_JPX +#define FZ_ENABLE_JPX 1 +#endif /* FZ_ENABLE_JPX */ + +#ifndef FZ_ENABLE_JS +#define FZ_ENABLE_JS 1 +#endif /* FZ_ENABLE_JS */ + +/* If Epub and HTML are both disabled, disable SIL fonts */ +#if FZ_ENABLE_HTML == 0 && FZ_ENABLE_EPUB == 0 +#undef TOFU_SIL +#define TOFU_SIL +#endif + +/* PTHREAD --affects mu-thread helper tool */ +#define HAVE_PTHREAD + +#endif /* FZ_CONFIG_H */ diff -Nru k2pdfopt-2.42+ds/mupdf_mod/mupdf/helpers/mu-threads.h k2pdfopt-2.51+ds/mupdf_mod/mupdf/helpers/mu-threads.h --- k2pdfopt-2.42+ds/mupdf_mod/mupdf/helpers/mu-threads.h 1970-01-01 00:00:00.000000000 +0000 +++ 
k2pdfopt-2.51+ds/mupdf_mod/mupdf/helpers/mu-threads.h 2018-11-25 04:45:36.000000000 +0000 @@ -0,0 +1,262 @@ +#ifndef MUPDF_HELPERS_MU_THREADS_H +#define MUPDF_HELPERS_MU_THREADS_H + +/* + Simple threading helper library. + Includes implementations for Windows, pthreads, + and "no threads". + + The "no threads" implementation simply provides types + and stub functions so that things will build, but abort + if we try to call them. This simplifies the job for + calling functions. + + To build this library on a platform with no threading, + define DISABLE_MUTHREADS (or extend the ifdeffery below + so that it does so). + + To build this library on a platform that uses a + threading model other than windows threads or pthreads, + extend the #ifdeffery below to set MUTHREAD_IMPL_TYPE + to an unused value, and modify mu-threads.c + appropriately. +*/ + +/* willus mod */ +#ifndef HAVE_PTHREAD +#define HAVE_PTHREAD +#endif +#if !defined(DISABLE_MUTHREADS) +#ifdef _WIN32 +#define MU_THREAD_IMPL_TYPE 1 +#elif defined(HAVE_PTHREAD) +#define MU_THREAD_IMPL_TYPE 2 +#else +#define DISABLE_MUTHREADS +#endif +#endif + +/* + Types +*/ +typedef struct mu_thread_s mu_thread; +typedef struct mu_semaphore_s mu_semaphore; +typedef struct mu_mutex_s mu_mutex; + +/* + Semaphores + + Created with a value of 0. Triggering a semaphore + increments the value. Waiting on a semaphore reduces + the value, blocking if it would become negative. + + Never increment the value of a semaphore above 1, as + this has undefined meaning in this implementation. +*/ + +/* + mu_create_semaphore: Create a semaphore. + + sem: Pointer to a mu_semaphore to populate. + + Returns non-zero for error. +*/ +int mu_create_semaphore(mu_semaphore *sem); + +/* + mu_destroy_semaphore: Destroy a semaphore. + Semaphores may safely be destroyed multiple + times. Any semaphore initialised to zeros is + safe to destroy. 
+ + Never destroy a semaphore that may be being waited + upon, as this has undefined meaning in this + implementation. + + sem: Pointer to a mu_semaphore to destroy. +*/ +void mu_destroy_semaphore(mu_semaphore *sem); + +/* + mu_trigger_semaphore: Increment the value of the + semaphore. Never blocks. + + sem: The semaphore to increment. + + Returns non-zero on error. +*/ +int mu_trigger_semaphore(mu_semaphore *sem); + +/* + mu_wait_semaphore: Decrement the value of the + semaphore, blocking if this would involve making + the value negative. + + sem: The semaphore to decrement. + + Returns non-zero on error. +*/ +int mu_wait_semaphore(mu_semaphore *sem); + +/* + Threads +*/ + +/* + The type for the function that a thread runs. + + arg: User supplied data. +*/ +typedef void (mu_thread_fn)(void *arg); + +/* + mu_create_thread: Create a thread to run the + supplied function with the supplied argument. + + th: Pointer to mu_thread to populate with created + threads information. + + fn: The function for the thread to run. + + arg: The argument to pass to fn. +*/ +int mu_create_thread(mu_thread *th, mu_thread_fn *fn, void *arg); + +/* + mu_destroy_thread: Destroy a thread. This function + blocks until a thread has terminated normally, and + destroys its storage. A mu_thread may safely be destroyed + multiple times, as may any mu_thread initialised with + zeros. + + th: Pointer to mu_thread to destroy. +*/ +void mu_destroy_thread(mu_thread *th); + +/* + Mutexes + + This implementation does not specify whether + mutexes are recursive or not. +*/ + +/* + mu_create_mutex: Create a mutex. + + mutex: pointer to a mu_mutex to populate. + + Returns non-zero on error. +*/ +int mu_create_mutex(mu_mutex *mutex); + +/* + mu_destroy_mutex: Destroy a mutex. A mu_mutex may + safely be destroyed several times, as may a mu_mutex + initialised with zeros. Never destroy locked mu_mutex. + + mutex: Pointer to mu_mutex to destroy. 
+*/ +void mu_destroy_mutex(mu_mutex *mutex); + +/* + mu_lock_mutex: Lock a mutex. + + mutex: Mutex to lock. +*/ +void mu_lock_mutex(mu_mutex *mutex); + +/* + mu_unlock_mutex: Unlock a mutex. + + mutex: Mutex to unlock. +*/ +void mu_unlock_mutex(mu_mutex *mutex); + +/* + Everything under this point is implementation specific. + Only people looking to extend the capabilities of this + helper module should need to look below here. +*/ + +#ifdef DISABLE_MUTHREADS + +/* Null implementation */ +struct mu_semaphore_s +{ + int dummy; +}; + +struct mu_thread_s +{ + int dummy; +}; + +struct mu_mutex_s +{ + int dummy; +}; + +#elif MU_THREAD_IMPL_TYPE == 1 + +#include + +/* Windows threads */ +struct mu_semaphore_s +{ + HANDLE handle; +}; + +struct mu_thread_s +{ + HANDLE handle; + mu_thread_fn *fn; + void *arg; +}; + +struct mu_mutex_s +{ + CRITICAL_SECTION mutex; +}; + +#elif MU_THREAD_IMPL_TYPE == 2 + +/* + PThreads - without working unnamed semaphores. + + Neither ios nor OSX supports unnamed semaphores. + Named semaphores are a pain to use, so we implement + our own sempahores using condition variables and + mutexes. +*/ + +#include + +struct mu_semaphore_s +{ + int count; + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +struct mu_thread_s +{ + pthread_t thread; + mu_thread_fn *fn; + void *arg; +}; + +struct mu_mutex_s +{ + pthread_mutex_t mutex; +}; + +/* + Add new threading implementations here, with + #elif MU_THREAD_IMPL_TYPE == 3... etc. 
+*/ + +#else +#error Unknown MU_THREAD_IMPL_TYPE setting +#endif + +#endif /* MUPDF_HELPERS_MU_THREADS_H */ diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-annot.c k2pdfopt-2.51+ds/mupdf_mod/pdf-annot.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-annot.c 2017-02-25 08:08:28.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-annot.c 2018-12-24 06:10:37.000000000 +0000 @@ -1,531 +1,1526 @@ +#include "mupdf/fitz.h" #include "mupdf/pdf.h" -static pdf_obj * -resolve_dest_rec(fz_context *ctx, pdf_document *doc, pdf_obj *dest, int depth) +#include +#include + +/* willus mod--don't use _mkgmtime--not available in Win XP */ +#ifdef _WIN32 +static time_t timegm(struct tm *date); +static time_t timegm(struct tm *date) + + { + time_t t,z; + struct tm gmz; + + z=(time_t)0; + gmz=(*gmtime(&z)); + t=mktime(date)-mktime(&gmz); + return(t); + } +#endif + +#define TEXT_ANNOT_SIZE (25.0f) + +#define isdigit(c) (c >= '0' && c <= '9') + +static void +pdf_drop_annot_imp(fz_context *ctx, pdf_annot *annot) +{ + pdf_drop_obj(ctx, annot->ap); + pdf_drop_obj(ctx, annot->obj); +} + +void +pdf_drop_annots(fz_context *ctx, pdf_annot *annot) +{ + while (annot) + { + pdf_annot *next = annot->next; + fz_drop_annot(ctx, &annot->super); + annot = next; + } +} + +/* Create transform to fit appearance stream to annotation Rect */ +fz_matrix +pdf_annot_transform(fz_context *ctx, pdf_annot *annot) +{ + fz_rect bbox, rect; + fz_matrix matrix; + float w, h, x, y; + + rect = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)); + bbox = pdf_xobject_bbox(ctx, annot->ap); + matrix = pdf_xobject_matrix(ctx, annot->ap); + + bbox = fz_transform_rect(bbox, matrix); + if (bbox.x1 == bbox.x0) + w = 0; + else + w = (rect.x1 - rect.x0) / (bbox.x1 - bbox.x0); + if (bbox.y1 == bbox.y0) + h = 0; + else + h = (rect.y1 - rect.y0) / (bbox.y1 - bbox.y0); + x = rect.x0 - bbox.x0; + y = rect.y0 - bbox.y0; + + return fz_pre_scale(fz_translate(x, y), w, h); +} + +pdf_annot *pdf_new_annot(fz_context *ctx, pdf_page *page, pdf_obj *obj) +{ + 
pdf_annot *annot; + + annot = fz_new_derived_annot(ctx, pdf_annot); + + annot->super.drop_annot = (fz_annot_drop_fn*)pdf_drop_annot_imp; + annot->super.bound_annot = (fz_annot_bound_fn*)pdf_bound_annot; + annot->super.run_annot = (fz_annot_run_fn*)pdf_run_annot; + annot->super.next_annot = (fz_annot_next_fn*)pdf_next_annot; + + annot->page = page; /* only borrowed, as the page owns the annot */ + annot->obj = pdf_keep_obj(ctx, obj); + + return annot; +} + +void +pdf_load_annots(fz_context *ctx, pdf_page *page, pdf_obj *annots) +{ + pdf_document *doc = page->doc; + pdf_annot *annot; + pdf_obj *subtype; + int i, n; + + n = pdf_array_len(ctx, annots); + for (i = 0; i < n; ++i) + { + pdf_obj *obj = pdf_array_get(ctx, annots, i); + if (obj) + { + subtype = pdf_dict_get(ctx, obj, PDF_NAME(Subtype)); + if (pdf_name_eq(ctx, subtype, PDF_NAME(Link))) + continue; + if (pdf_name_eq(ctx, subtype, PDF_NAME(Popup))) + continue; + + annot = pdf_new_annot(ctx, page, obj); + fz_try(ctx) + { + pdf_update_annot(ctx, annot); + annot->has_new_ap = 0; + } + fz_catch(ctx) + fz_warn(ctx, "could not update appearance for annotation"); + + if (doc->focus_obj == obj) + doc->focus = annot; + + *page->annot_tailp = annot; + page->annot_tailp = &annot->next; + } + } +} + +pdf_annot * +pdf_first_annot(fz_context *ctx, pdf_page *page) +{ + return page ? page->annots : NULL; +} + +pdf_annot * +pdf_next_annot(fz_context *ctx, pdf_annot *annot) +{ + return annot ? 
annot->next : NULL; +} + +fz_rect +pdf_bound_annot(fz_context *ctx, pdf_annot *annot) +{ + fz_matrix page_ctm; + fz_rect rect; + int flags; + + rect = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)); + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + + flags = pdf_dict_get_int(ctx, annot->obj, PDF_NAME(F)); + if (flags & PDF_ANNOT_IS_NO_ROTATE) + { + int rotate = pdf_to_int(ctx, pdf_dict_get_inheritable(ctx, annot->page->obj, PDF_NAME(Rotate))); + fz_point tp = fz_transform_point_xy(rect.x0, rect.y1, page_ctm); + page_ctm = fz_concat(page_ctm, fz_translate(-tp.x, -tp.y)); + page_ctm = fz_concat(page_ctm, fz_rotate(-rotate)); + page_ctm = fz_concat(page_ctm, fz_translate(tp.x, tp.y)); + } + + return fz_transform_rect(rect, page_ctm); +} + +void +pdf_dirty_annot(fz_context *ctx, pdf_annot *annot) +{ + annot->needs_new_ap = 1; + if (annot->page && annot->page->doc) + annot->page->doc->dirty = 1; +} + +const char * +pdf_string_from_annot_type(fz_context *ctx, enum pdf_annot_type type) +{ + switch (type) + { + case PDF_ANNOT_TEXT: return "Text"; + case PDF_ANNOT_LINK: return "Link"; + case PDF_ANNOT_FREE_TEXT: return "FreeText"; + case PDF_ANNOT_LINE: return "Line"; + case PDF_ANNOT_SQUARE: return "Square"; + case PDF_ANNOT_CIRCLE: return "Circle"; + case PDF_ANNOT_POLYGON: return "Polygon"; + case PDF_ANNOT_POLY_LINE: return "PolyLine"; + case PDF_ANNOT_HIGHLIGHT: return "Highlight"; + case PDF_ANNOT_UNDERLINE: return "Underline"; + case PDF_ANNOT_SQUIGGLY: return "Squiggly"; + case PDF_ANNOT_STRIKE_OUT: return "StrikeOut"; + case PDF_ANNOT_STAMP: return "Stamp"; + case PDF_ANNOT_CARET: return "Caret"; + case PDF_ANNOT_INK: return "Ink"; + case PDF_ANNOT_POPUP: return "Popup"; + case PDF_ANNOT_FILE_ATTACHMENT: return "FileAttachment"; + case PDF_ANNOT_SOUND: return "Sound"; + case PDF_ANNOT_MOVIE: return "Movie"; + case PDF_ANNOT_WIDGET: return "Widget"; + case PDF_ANNOT_SCREEN: return "Screen"; + case PDF_ANNOT_PRINTER_MARK: return "PrinterMark"; + case 
PDF_ANNOT_TRAP_NET: return "TrapNet"; + case PDF_ANNOT_WATERMARK: return "Watermark"; + case PDF_ANNOT_3D: return "3D"; + default: return "UNKNOWN"; + } +} + +int +pdf_annot_type_from_string(fz_context *ctx, const char *subtype) +{ + if (!strcmp("Text", subtype)) return PDF_ANNOT_TEXT; + if (!strcmp("Link", subtype)) return PDF_ANNOT_LINK; + if (!strcmp("FreeText", subtype)) return PDF_ANNOT_FREE_TEXT; + if (!strcmp("Line", subtype)) return PDF_ANNOT_LINE; + if (!strcmp("Square", subtype)) return PDF_ANNOT_SQUARE; + if (!strcmp("Circle", subtype)) return PDF_ANNOT_CIRCLE; + if (!strcmp("Polygon", subtype)) return PDF_ANNOT_POLYGON; + if (!strcmp("PolyLine", subtype)) return PDF_ANNOT_POLY_LINE; + if (!strcmp("Highlight", subtype)) return PDF_ANNOT_HIGHLIGHT; + if (!strcmp("Underline", subtype)) return PDF_ANNOT_UNDERLINE; + if (!strcmp("Squiggly", subtype)) return PDF_ANNOT_SQUIGGLY; + if (!strcmp("StrikeOut", subtype)) return PDF_ANNOT_STRIKE_OUT; + if (!strcmp("Stamp", subtype)) return PDF_ANNOT_STAMP; + if (!strcmp("Caret", subtype)) return PDF_ANNOT_CARET; + if (!strcmp("Ink", subtype)) return PDF_ANNOT_INK; + if (!strcmp("Popup", subtype)) return PDF_ANNOT_POPUP; + if (!strcmp("FileAttachment", subtype)) return PDF_ANNOT_FILE_ATTACHMENT; + if (!strcmp("Sound", subtype)) return PDF_ANNOT_SOUND; + if (!strcmp("Movie", subtype)) return PDF_ANNOT_MOVIE; + if (!strcmp("Widget", subtype)) return PDF_ANNOT_WIDGET; + if (!strcmp("Screen", subtype)) return PDF_ANNOT_SCREEN; + if (!strcmp("PrinterMark", subtype)) return PDF_ANNOT_PRINTER_MARK; + if (!strcmp("TrapNet", subtype)) return PDF_ANNOT_TRAP_NET; + if (!strcmp("Watermark", subtype)) return PDF_ANNOT_WATERMARK; + if (!strcmp("3D", subtype)) return PDF_ANNOT_3D; + return PDF_ANNOT_UNKNOWN; +} + +static int is_allowed_subtype(fz_context *ctx, pdf_annot *annot, pdf_obj *property, pdf_obj **allowed) +{ + pdf_obj *subtype = pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)); + while (*allowed) { + if (pdf_name_eq(ctx, 
subtype, *allowed)) + return 1; + allowed++; + } + + return 0; +} + +static void check_allowed_subtypes(fz_context *ctx, pdf_annot *annot, pdf_obj *property, pdf_obj **allowed) +{ + pdf_obj *subtype = pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)); + if (!is_allowed_subtype(ctx, annot, property, allowed)) + fz_throw(ctx, FZ_ERROR_GENERIC, "%s annotations have no %s property", pdf_to_name(ctx, subtype), pdf_to_name(ctx, property)); +} + +pdf_annot * +pdf_create_annot(fz_context *ctx, pdf_page *page, enum pdf_annot_type type) +{ + pdf_annot *annot = NULL; + pdf_document *doc = page->doc; + pdf_obj *annot_obj = pdf_new_dict(ctx, doc, 0); + pdf_obj *ind_obj = NULL; + + fz_var(annot); + fz_var(ind_obj); + fz_try(ctx) + { + int ind_obj_num; + const char *type_str; + pdf_obj *annot_arr; + + type_str = pdf_string_from_annot_type(ctx, type); + if (type == PDF_ANNOT_UNKNOWN) + fz_throw(ctx, FZ_ERROR_GENERIC, "cannot create unknown annotation"); + + annot_arr = pdf_dict_get(ctx, page->obj, PDF_NAME(Annots)); + if (annot_arr == NULL) + { + annot_arr = pdf_new_array(ctx, doc, 0); + pdf_dict_put_drop(ctx, page->obj, PDF_NAME(Annots), annot_arr); + } + + pdf_dict_put(ctx, annot_obj, PDF_NAME(Type), PDF_NAME(Annot)); + pdf_dict_put_name(ctx, annot_obj, PDF_NAME(Subtype), type_str); + + /* Make printable as default */ + pdf_dict_put_int(ctx, annot_obj, PDF_NAME(F), PDF_ANNOT_IS_PRINT); + + /* + Both annotation object and annotation structure are now created. + Insert the object in the hierarchy and the structure in the + page's array. + */ + ind_obj_num = pdf_create_object(ctx, doc); + pdf_update_object(ctx, doc, ind_obj_num, annot_obj); + ind_obj = pdf_new_indirect(ctx, doc, ind_obj_num, 0); + pdf_array_push(ctx, annot_arr, ind_obj); + + annot = pdf_new_annot(ctx, page, ind_obj); + annot->ap = NULL; + + /* + Linking must be done after any call that might throw because + pdf_drop_annots below actually frees a list. 
Put the new annot + at the end of the list, so that it will be drawn last. + */ + *page->annot_tailp = annot; + page->annot_tailp = &annot->next; + + doc->dirty = 1; + } + fz_always(ctx) + { + pdf_drop_obj(ctx, annot_obj); + pdf_drop_obj(ctx, ind_obj); + } + fz_catch(ctx) + { + pdf_drop_annots(ctx, annot); + fz_rethrow(ctx); + } + + return annot; +} + +void +pdf_delete_annot(fz_context *ctx, pdf_page *page, pdf_annot *annot) +{ + pdf_document *doc = annot->page->doc; + pdf_annot **annotptr; + pdf_obj *annot_arr; + int i; + + if (annot == NULL) + return; + + /* Remove annot from page's list */ + for (annotptr = &page->annots; *annotptr; annotptr = &(*annotptr)->next) + { + if (*annotptr == annot) + break; + } + + /* Check the passed annotation was of this page */ + if (*annotptr == NULL) + return; + + *annotptr = annot->next; + + /* If the removed annotation was the last in the list adjust the end pointer */ + if (*annotptr == NULL) + page->annot_tailp = annotptr; + + /* If the removed annotation has the focus, blur it. */ + if (doc->focus == annot) + { + doc->focus = NULL; + doc->focus_obj = NULL; + } + + /* Remove the annot from the "Annots" array. */ + annot_arr = pdf_dict_get(ctx, page->obj, PDF_NAME(Annots)); + i = pdf_array_find(ctx, annot_arr, annot->obj); + if (i >= 0) + pdf_array_delete(ctx, annot_arr, i); + + /* The garbage collection pass when saving will remove the annot object, + * removing it here may break files if multiple pages use the same annot. */ + + /* And free it. 
*/ + fz_drop_annot(ctx, &annot->super); + + doc->dirty = 1; +} + +int +pdf_annot_type(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *obj = annot->obj; + pdf_obj *subtype = pdf_dict_get(ctx, obj, PDF_NAME(Subtype)); + return pdf_annot_type_from_string(ctx, pdf_to_name(ctx, subtype)); +} + +int +pdf_annot_flags(fz_context *ctx, pdf_annot *annot) +{ + return pdf_dict_get_int(ctx, annot->obj, PDF_NAME(F)); +} + +void +pdf_set_annot_flags(fz_context *ctx, pdf_annot *annot, int flags) +{ + pdf_dict_put_int(ctx, annot->obj, PDF_NAME(F), flags); + pdf_dirty_annot(ctx, annot); +} + +fz_rect +pdf_annot_rect(fz_context *ctx, pdf_annot *annot) +{ + fz_matrix page_ctm; + fz_rect annot_rect; + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + annot_rect = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)); + return fz_transform_rect(annot_rect, page_ctm); +} + +void +pdf_set_annot_rect(fz_context *ctx, pdf_annot *annot, fz_rect rect) +{ + fz_matrix page_ctm, inv_page_ctm; + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + rect = fz_transform_rect(rect, inv_page_ctm); + + pdf_dict_put_rect(ctx, annot->obj, PDF_NAME(Rect), rect); + pdf_dirty_annot(ctx, annot); +} + +const char * +pdf_annot_contents(fz_context *ctx, pdf_annot *annot) +{ + return pdf_dict_get_text_string(ctx, annot->obj, PDF_NAME(Contents)); +} + +void +pdf_set_annot_contents(fz_context *ctx, pdf_annot *annot, const char *text) +{ + pdf_dict_put_text_string(ctx, annot->obj, PDF_NAME(Contents), text); + pdf_dict_del(ctx, annot->obj, PDF_NAME(RC)); /* not supported */ + pdf_dirty_annot(ctx, annot); +} + +static pdf_obj *open_subtypes[] = { + PDF_NAME(Popup), + PDF_NAME(Text), + NULL, +}; + +int +pdf_annot_has_open(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(Open), open_subtypes); +} + +int +pdf_annot_is_open(fz_context *ctx, pdf_annot *annot) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(Open), open_subtypes); 
+ return pdf_dict_get_bool(ctx, annot->obj, PDF_NAME(Open)); +} + +void +pdf_set_annot_is_open(fz_context *ctx, pdf_annot *annot, int is_open) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(Open), open_subtypes); + pdf_dict_put_bool(ctx, annot->obj, PDF_NAME(Open), is_open); + pdf_dirty_annot(ctx, annot); +} + +static pdf_obj *icon_name_subtypes[] = { + PDF_NAME(FileAttachment), + PDF_NAME(Sound), + PDF_NAME(Stamp), + PDF_NAME(Text), + NULL, +}; + +int +pdf_annot_has_icon_name(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(Name), icon_name_subtypes); +} + +const char * +pdf_annot_icon_name(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *name; + check_allowed_subtypes(ctx, annot, PDF_NAME(Name), icon_name_subtypes); + name = pdf_dict_get(ctx, annot->obj, PDF_NAME(Name)); + if (!name) + { + pdf_obj *subtype = pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)); + if (pdf_name_eq(ctx, subtype, PDF_NAME(Text))) + return "Note"; + if (pdf_name_eq(ctx, subtype, PDF_NAME(Stamp))) + return "Draft"; + if (pdf_name_eq(ctx, subtype, PDF_NAME(FileAttachment))) + return "PushPin"; + if (pdf_name_eq(ctx, subtype, PDF_NAME(Sound))) + return "Speaker"; + } + return pdf_to_name(ctx, name); +} + +void +pdf_set_annot_icon_name(fz_context *ctx, pdf_annot *annot, const char *name) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(Name), icon_name_subtypes); + pdf_dict_put_name(ctx, annot->obj, PDF_NAME(Name), name); + pdf_dirty_annot(ctx, annot); +} + +enum pdf_line_ending pdf_line_ending_from_name(fz_context *ctx, pdf_obj *end) +{ + if (pdf_name_eq(ctx, end, PDF_NAME(None))) return PDF_ANNOT_LE_NONE; + else if (pdf_name_eq(ctx, end, PDF_NAME(Square))) return PDF_ANNOT_LE_SQUARE; + else if (pdf_name_eq(ctx, end, PDF_NAME(Circle))) return PDF_ANNOT_LE_CIRCLE; + else if (pdf_name_eq(ctx, end, PDF_NAME(Diamond))) return PDF_ANNOT_LE_DIAMOND; + else if (pdf_name_eq(ctx, end, PDF_NAME(OpenArrow))) return PDF_ANNOT_LE_OPEN_ARROW; + else if 
(pdf_name_eq(ctx, end, PDF_NAME(ClosedArrow))) return PDF_ANNOT_LE_CLOSED_ARROW; + else if (pdf_name_eq(ctx, end, PDF_NAME(Butt))) return PDF_ANNOT_LE_BUTT; + else if (pdf_name_eq(ctx, end, PDF_NAME(ROpenArrow))) return PDF_ANNOT_LE_R_OPEN_ARROW; + else if (pdf_name_eq(ctx, end, PDF_NAME(RClosedArrow))) return PDF_ANNOT_LE_R_CLOSED_ARROW; + else if (pdf_name_eq(ctx, end, PDF_NAME(Slash))) return PDF_ANNOT_LE_SLASH; + else return PDF_ANNOT_LE_NONE; +} + +enum pdf_line_ending pdf_line_ending_from_string(fz_context *ctx, const char *end) +{ + if (!strcmp(end, "None")) return PDF_ANNOT_LE_NONE; + else if (!strcmp(end, "Square")) return PDF_ANNOT_LE_SQUARE; + else if (!strcmp(end, "Circle")) return PDF_ANNOT_LE_CIRCLE; + else if (!strcmp(end, "Diamond")) return PDF_ANNOT_LE_DIAMOND; + else if (!strcmp(end, "OpenArrow")) return PDF_ANNOT_LE_OPEN_ARROW; + else if (!strcmp(end, "ClosedArrow")) return PDF_ANNOT_LE_CLOSED_ARROW; + else if (!strcmp(end, "Butt")) return PDF_ANNOT_LE_BUTT; + else if (!strcmp(end, "ROpenArrow")) return PDF_ANNOT_LE_R_OPEN_ARROW; + else if (!strcmp(end, "RClosedArrow")) return PDF_ANNOT_LE_R_CLOSED_ARROW; + else if (!strcmp(end, "Slash")) return PDF_ANNOT_LE_SLASH; + else return PDF_ANNOT_LE_NONE; +} + +pdf_obj *pdf_name_from_line_ending(fz_context *ctx, enum pdf_line_ending end) +{ + switch (end) + { + default: + case PDF_ANNOT_LE_NONE: return PDF_NAME(None); + case PDF_ANNOT_LE_SQUARE: return PDF_NAME(Square); + case PDF_ANNOT_LE_CIRCLE: return PDF_NAME(Circle); + case PDF_ANNOT_LE_DIAMOND: return PDF_NAME(Diamond); + case PDF_ANNOT_LE_OPEN_ARROW: return PDF_NAME(OpenArrow); + case PDF_ANNOT_LE_CLOSED_ARROW: return PDF_NAME(ClosedArrow); + case PDF_ANNOT_LE_BUTT: return PDF_NAME(Butt); + case PDF_ANNOT_LE_R_OPEN_ARROW: return PDF_NAME(ROpenArrow); + case PDF_ANNOT_LE_R_CLOSED_ARROW: return PDF_NAME(RClosedArrow); + case PDF_ANNOT_LE_SLASH: return PDF_NAME(Slash); + } +} + +const char *pdf_string_from_line_ending(fz_context *ctx, enum 
pdf_line_ending end) +{ + switch (end) + { + default: + case PDF_ANNOT_LE_NONE: return "None"; + case PDF_ANNOT_LE_SQUARE: return "Square"; + case PDF_ANNOT_LE_CIRCLE: return "Circle"; + case PDF_ANNOT_LE_DIAMOND: return "Diamond"; + case PDF_ANNOT_LE_OPEN_ARROW: return "OpenArrow"; + case PDF_ANNOT_LE_CLOSED_ARROW: return "ClosedArrow"; + case PDF_ANNOT_LE_BUTT: return "Butt"; + case PDF_ANNOT_LE_R_OPEN_ARROW: return "ROpenArrow"; + case PDF_ANNOT_LE_R_CLOSED_ARROW: return "RClosedArrow"; + case PDF_ANNOT_LE_SLASH: return "Slash"; + } +} + +static pdf_obj *line_ending_subtypes[] = { + PDF_NAME(FreeText), + PDF_NAME(Line), + PDF_NAME(PolyLine), + PDF_NAME(Polygon), + NULL, +}; + +int +pdf_annot_has_line_ending_styles(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(LE), line_ending_subtypes); +} + +void +pdf_annot_line_ending_styles(fz_context *ctx, pdf_annot *annot, + enum pdf_line_ending *start_style, + enum pdf_line_ending *end_style) +{ + pdf_obj *style; + check_allowed_subtypes(ctx, annot, PDF_NAME(LE), line_ending_subtypes); + style = pdf_dict_get(ctx, annot->obj, PDF_NAME(LE)); + *start_style = pdf_line_ending_from_name(ctx, pdf_array_get(ctx, style, 0)); + *end_style = pdf_line_ending_from_name(ctx, pdf_array_get(ctx, style, 1)); +} + +enum pdf_line_ending +pdf_annot_line_start_style(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *le = pdf_dict_get(ctx, annot->obj, PDF_NAME(LE)); + return pdf_line_ending_from_name(ctx, pdf_array_get(ctx, le, 0)); +} + +enum pdf_line_ending +pdf_annot_line_end_style(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *le = pdf_dict_get(ctx, annot->obj, PDF_NAME(LE)); + return pdf_line_ending_from_name(ctx, pdf_array_get(ctx, le, 1)); +} + +void +pdf_set_annot_line_ending_styles(fz_context *ctx, pdf_annot *annot, + enum pdf_line_ending start_style, + enum pdf_line_ending end_style) +{ + pdf_document *doc = annot->page->doc; + pdf_obj *style; + check_allowed_subtypes(ctx, annot, PDF_NAME(LE), 
line_ending_subtypes); + style = pdf_new_array(ctx, doc, 2); + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(LE), style); + pdf_array_put_drop(ctx, style, 0, pdf_name_from_line_ending(ctx, start_style)); + pdf_array_put_drop(ctx, style, 1, pdf_name_from_line_ending(ctx, end_style)); + pdf_dirty_annot(ctx, annot); +} + +void +pdf_set_annot_line_start_style(fz_context *ctx, pdf_annot *annot, enum pdf_line_ending s) +{ + enum pdf_line_ending e = pdf_annot_line_end_style(ctx, annot); + pdf_set_annot_line_ending_styles(ctx, annot, s, e); +} + +void +pdf_set_annot_line_end_style(fz_context *ctx, pdf_annot *annot, enum pdf_line_ending e) +{ + enum pdf_line_ending s = pdf_annot_line_start_style(ctx, annot); + pdf_set_annot_line_ending_styles(ctx, annot, s, e); +} + +float +pdf_annot_border(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *bs, *bs_w; + bs = pdf_dict_get(ctx, annot->obj, PDF_NAME(BS)); + bs_w = pdf_dict_get(ctx, bs, PDF_NAME(W)); + if (pdf_is_number(ctx, bs_w)) + return pdf_to_real(ctx, bs_w); + return 1; +} + +void +pdf_set_annot_border(fz_context *ctx, pdf_annot *annot, float w) +{ + pdf_obj *bs = pdf_dict_get(ctx, annot->obj, PDF_NAME(BS)); + if (!pdf_is_dict(ctx, bs)) + bs = pdf_dict_put_dict(ctx, annot->obj, PDF_NAME(BS), 1); + pdf_dict_put_real(ctx, bs, PDF_NAME(W), w); + + pdf_dict_del(ctx, annot->obj, PDF_NAME(Border)); /* deprecated */ + pdf_dict_del(ctx, annot->obj, PDF_NAME(BE)); /* not supported */ + + pdf_dirty_annot(ctx, annot); +} + +int +pdf_annot_quadding(fz_context *ctx, pdf_annot *annot) +{ + int q = pdf_dict_get_int(ctx, annot->obj, PDF_NAME(Q)); + return (q < 0 || q > 2) ? 0 : q; +} + +void +pdf_set_annot_quadding(fz_context *ctx, pdf_annot *annot, int q) +{ + q = (q < 0 || q > 2) ? 
0 : q; + pdf_dict_put_int(ctx, annot->obj, PDF_NAME(Q), q); + pdf_dirty_annot(ctx, annot); +} + +float pdf_annot_opacity(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *ca = pdf_dict_get(ctx, annot->obj, PDF_NAME(CA)); + if (pdf_is_number(ctx, ca)) + return pdf_to_real(ctx, ca); + return 1; +} + +void pdf_set_annot_opacity(fz_context *ctx, pdf_annot *annot, float opacity) +{ + if (opacity != 1) + pdf_dict_put_real(ctx, annot->obj, PDF_NAME(CA), opacity); + else + pdf_dict_del(ctx, annot->obj, PDF_NAME(CA)); + pdf_dirty_annot(ctx, annot); +} + +static void pdf_annot_color_imp(fz_context *ctx, pdf_obj *arr, int *n, float color[4]) +{ + switch (pdf_array_len(ctx, arr)) + { + case 0: + if (n) + *n = 0; + break; + case 1: + case 2: + if (n) + *n = 1; + if (color) + color[0] = pdf_array_get_real(ctx, arr, 0); + break; + case 3: + if (n) + *n = 3; + if (color) + { + color[0] = pdf_array_get_real(ctx, arr, 0); + color[1] = pdf_array_get_real(ctx, arr, 1); + color[2] = pdf_array_get_real(ctx, arr, 2); + } + break; + case 4: + default: + if (n) + *n = 4; + if (color) + { + color[0] = pdf_array_get_real(ctx, arr, 0); + color[1] = pdf_array_get_real(ctx, arr, 1); + color[2] = pdf_array_get_real(ctx, arr, 2); + color[3] = pdf_array_get_real(ctx, arr, 3); + } + break; + } +} + +static void pdf_set_annot_color_imp(fz_context *ctx, pdf_annot *annot, pdf_obj *key, int n, const float color[4], pdf_obj **allowed) +{ + pdf_document *doc = annot->page->doc; + pdf_obj *arr; + + if (allowed) + check_allowed_subtypes(ctx, annot, key, allowed); + if (n != 0 && n != 1 && n != 3 && n != 4) + fz_throw(ctx, FZ_ERROR_GENERIC, "color must be 0, 1, 3 or 4 components"); + if (!color) + fz_throw(ctx, FZ_ERROR_GENERIC, "no color given"); + + arr = pdf_new_array(ctx, doc, n); + fz_try(ctx) + { + switch (n) + { + case 1: + pdf_array_push_real(ctx, arr, color[0]); + break; + case 3: + pdf_array_push_real(ctx, arr, color[0]); + pdf_array_push_real(ctx, arr, color[1]); + pdf_array_push_real(ctx, arr, 
color[2]); + break; + case 4: + pdf_array_push_real(ctx, arr, color[0]); + pdf_array_push_real(ctx, arr, color[1]); + pdf_array_push_real(ctx, arr, color[2]); + pdf_array_push_real(ctx, arr, color[3]); + break; + } + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, arr); + fz_rethrow(ctx); + } + + pdf_dict_put_drop(ctx, annot->obj, key, arr); + pdf_dirty_annot(ctx, annot); +} + +void +pdf_annot_color(fz_context *ctx, pdf_annot *annot, int *n, float color[4]) +{ + pdf_obj *c = pdf_dict_get(ctx, annot->obj, PDF_NAME(C)); + pdf_annot_color_imp(ctx, c, n, color); +} + +void +pdf_annot_MK_BG(fz_context *ctx, pdf_annot *annot, int *n, float color[4]) +{ + pdf_obj *mk_bg = pdf_dict_get(ctx, pdf_dict_get(ctx, annot->obj, PDF_NAME(MK)), PDF_NAME(BG)); + pdf_annot_color_imp(ctx, mk_bg, n, color); +} + +void +pdf_annot_MK_BC(fz_context *ctx, pdf_annot *annot, int *n, float color[4]) +{ + pdf_obj *mk_bc = pdf_dict_get(ctx, pdf_dict_get(ctx, annot->obj, PDF_NAME(MK)), PDF_NAME(BC)); + pdf_annot_color_imp(ctx, mk_bc, n, color); +} + +void +pdf_set_annot_color(fz_context *ctx, pdf_annot *annot, int n, const float color[4]) +{ + pdf_set_annot_color_imp(ctx, annot, PDF_NAME(C), n, color, NULL); +} + +static pdf_obj *interior_color_subtypes[] = { + PDF_NAME(Circle), + PDF_NAME(Line), + PDF_NAME(PolyLine), + PDF_NAME(Polygon), + PDF_NAME(Square), + NULL, +}; + +int +pdf_annot_has_interior_color(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(IC), interior_color_subtypes); +} + +void +pdf_annot_interior_color(fz_context *ctx, pdf_annot *annot, int *n, float color[4]) { - if (depth > 10) /* Arbitrary to avoid infinite recursion */ - return NULL; + pdf_obj *ic = pdf_dict_get(ctx, annot->obj, PDF_NAME(IC)); + pdf_annot_color_imp(ctx, ic, n, color); +} - if (pdf_is_name(ctx, dest) || pdf_is_string(ctx, dest)) - { - dest = pdf_lookup_dest(ctx, doc, dest); - dest = resolve_dest_rec(ctx, doc, dest, depth+1); - return dest; - } +void 
+pdf_set_annot_interior_color(fz_context *ctx, pdf_annot *annot, int n, const float color[4]) +{ + pdf_set_annot_color_imp(ctx, annot, PDF_NAME(IC), n, color, interior_color_subtypes); +} - else if (pdf_is_array(ctx, dest)) - { - return dest; - } +static pdf_obj *line_subtypes[] = { + PDF_NAME(Line), + NULL, +}; - else if (pdf_is_dict(ctx, dest)) - { - dest = pdf_dict_get(ctx, dest, PDF_NAME_D); - return resolve_dest_rec(ctx, doc, dest, depth+1); - } +int +pdf_annot_has_line(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(L), line_subtypes); +} + +void +pdf_annot_line(fz_context *ctx, pdf_annot *annot, fz_point *a, fz_point *b) +{ + fz_matrix page_ctm; + pdf_obj *line; + + check_allowed_subtypes(ctx, annot, PDF_NAME(L), line_subtypes); - else if (pdf_is_indirect(ctx, dest)) - return dest; + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); - return NULL; + line = pdf_dict_get(ctx, annot->obj, PDF_NAME(L)); + a->x = pdf_array_get_real(ctx, line, 0); + a->y = pdf_array_get_real(ctx, line, 1); + b->x = pdf_array_get_real(ctx, line, 2); + b->y = pdf_array_get_real(ctx, line, 3); + *a = fz_transform_point(*a, page_ctm); + *b = fz_transform_point(*b, page_ctm); } -static pdf_obj * -resolve_dest(fz_context *ctx, pdf_document *doc, pdf_obj *dest) +void +pdf_set_annot_line(fz_context *ctx, pdf_annot *annot, fz_point a, fz_point b) { - return resolve_dest_rec(ctx, doc, dest, 0); + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *line; + + check_allowed_subtypes(ctx, annot, PDF_NAME(L), line_subtypes); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + a = fz_transform_point(a, inv_page_ctm); + b = fz_transform_point(b, inv_page_ctm); + + line = pdf_new_array(ctx, annot->page->doc, 4); + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(L), line); + pdf_array_push_real(ctx, line, a.x); + pdf_array_push_real(ctx, line, a.y); + pdf_array_push_real(ctx, line, b.x); + 
pdf_array_push_real(ctx, line, b.y); + + pdf_dirty_annot(ctx, annot); } -char * -pdf_parse_link_dest(fz_context *ctx, pdf_document *doc, pdf_obj *dest) +static pdf_obj *vertices_subtypes[] = { + PDF_NAME(PolyLine), + PDF_NAME(Polygon), + NULL, +}; + +int +pdf_annot_has_vertices(fz_context *ctx, pdf_annot *annot) { - pdf_obj *obj; - char buf[256]; - char *ld; - int page; - int x, y; + return is_allowed_subtype(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); +} - dest = resolve_dest(ctx, doc, dest); - if (dest == NULL) - { - fz_warn(ctx, "undefined link destination"); - return NULL; - } +int +pdf_annot_vertex_count(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *vertices; + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); + vertices = pdf_dict_get(ctx, annot->obj, PDF_NAME(Vertices)); + return pdf_array_len(ctx, vertices) / 2; +} - if (pdf_is_name(ctx, dest)) - { - ld = pdf_to_name(ctx, dest); - return fz_strdup(ctx, ld); - } - else if (pdf_is_string(ctx, dest)) - { - ld = pdf_to_str_buf(ctx, dest); - return fz_strdup(ctx, ld); - } +fz_point +pdf_annot_vertex(fz_context *ctx, pdf_annot *annot, int i) +{ + pdf_obj *vertices; + fz_matrix page_ctm; + fz_point point; - obj = pdf_array_get(ctx, dest, 0); - if (pdf_is_int(ctx, obj)) - page = pdf_to_int(ctx, obj); - else - { - fz_try(ctx) - page = pdf_lookup_page_number(ctx, doc, obj); - fz_catch(ctx) - page = -1; - } + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); - x = y = 0; - obj = pdf_array_get(ctx, dest, 1); - if (pdf_name_eq(ctx, obj, PDF_NAME_XYZ)) - { - x = pdf_to_int(ctx, pdf_array_get(ctx, dest, 2)); - y = pdf_to_int(ctx, pdf_array_get(ctx, dest, 3)); - } - else if (pdf_name_eq(ctx, obj, PDF_NAME_FitR)) - { - x = pdf_to_int(ctx, pdf_array_get(ctx, dest, 2)); - y = pdf_to_int(ctx, pdf_array_get(ctx, dest, 5)); - } - else if (pdf_name_eq(ctx, obj, PDF_NAME_FitH) || pdf_name_eq(ctx, obj, PDF_NAME_FitBH)) - y = pdf_to_int(ctx, pdf_array_get(ctx, dest, 2)); - 
else if (pdf_name_eq(ctx, obj, PDF_NAME_FitV) || pdf_name_eq(ctx, obj, PDF_NAME_FitBV)) - x = pdf_to_int(ctx, pdf_array_get(ctx, dest, 2)); + vertices = pdf_dict_get(ctx, annot->obj, PDF_NAME(Vertices)); - if (page >= 0) - { - if (x != 0 || y != 0) - fz_snprintf(buf, sizeof buf, "#%d,%d,%d", page + 1, x, y); - else - fz_snprintf(buf, sizeof buf, "#%d", page + 1); - return fz_strdup(ctx, buf); + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + + point.x = pdf_array_get_real(ctx, vertices, i * 2); + point.y = pdf_array_get_real(ctx, vertices, i * 2 + 1); + return fz_transform_point(point, page_ctm); +} + +void +pdf_set_annot_vertices(fz_context *ctx, pdf_annot *annot, int n, const fz_point *v) +{ + pdf_document *doc = annot->page->doc; + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *vertices; + fz_point point; + int i; + + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); + if (n <= 0 || !v) + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid number of vertices"); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + vertices = pdf_new_array(ctx, doc, n * 2); + for (i = 0; i < n; ++i) + { + point = fz_transform_point(v[i], inv_page_ctm); + pdf_array_push_real(ctx, vertices, point.x); + pdf_array_push_real(ctx, vertices, point.y); } + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(Vertices), vertices); + pdf_dirty_annot(ctx, annot); +} - return NULL; +void pdf_clear_annot_vertices(fz_context *ctx, pdf_annot *annot) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); + pdf_dict_del(ctx, annot->obj, PDF_NAME(Vertices)); + pdf_dirty_annot(ctx, annot); } -char * -pdf_parse_file_spec(fz_context *ctx, pdf_document *doc, pdf_obj *file_spec, pdf_obj *dest) +void pdf_add_annot_vertex(fz_context *ctx, pdf_annot *annot, fz_point p) { - pdf_obj *filename=NULL; - char *path = NULL; - char *uri = NULL; - char buf[256]; - size_t n; + pdf_document *doc = annot->page->doc; + fz_matrix 
page_ctm, inv_page_ctm; + pdf_obj *vertices; - if (pdf_is_string(ctx, file_spec)) - filename = file_spec; + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); - if (pdf_is_dict(ctx, file_spec)) { -#if defined(_WIN32) || defined(_WIN64) - filename = pdf_dict_get(ctx, file_spec, PDF_NAME_DOS); -#else - filename = pdf_dict_get(ctx, file_spec, PDF_NAME_Unix); -#endif - if (!filename) - filename = pdf_dict_geta(ctx, file_spec, PDF_NAME_UF, PDF_NAME_F); - } + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); - if (!pdf_is_string(ctx, filename)) + vertices = pdf_dict_get(ctx, annot->obj, PDF_NAME(Vertices)); + if (!pdf_is_array(ctx, vertices)) { - fz_warn(ctx, "cannot parse file specification"); - return NULL; + vertices = pdf_new_array(ctx, doc, 32); + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(Vertices), vertices); } - path = pdf_to_utf8(ctx, filename); -#if defined(_WIN32) || defined(_WIN64) - if (strcmp(pdf_to_name(ctx, pdf_dict_gets(ctx, file_spec, "FS")), "URL") != 0) - { - /* move the file name into the expected place and use the expected path separator */ - char *c; - if (path[0] == '/' && (('A' <= path[1] && path[1] <= 'Z') || ('a' <= path[1] && path[1] <= 'z')) && path[2] == '/') - { - path[0] = path[1]; - path[1] = ':'; - } - for (c = path; *c; c++) - { - if (*c == '/') - *c = '\\'; - } - } -#endif + p = fz_transform_point(p, inv_page_ctm); + pdf_array_push_real(ctx, vertices, p.x); + pdf_array_push_real(ctx, vertices, p.y); - if (pdf_is_array(ctx, dest)) - fz_snprintf(buf, sizeof buf, "#page=%d", pdf_to_int(ctx, pdf_array_get(ctx, dest, 0)) + 1); - else if (pdf_is_name(ctx, dest)) - fz_snprintf(buf, sizeof buf, "#%s", pdf_to_name(ctx, dest)); - else if (pdf_is_stream(ctx, dest)) - fz_snprintf(buf, sizeof buf, "#%s", pdf_to_str_buf(ctx, dest)); - else - buf[0] = 0; + pdf_dirty_annot(ctx, annot); +} + +void pdf_set_annot_vertex(fz_context *ctx, pdf_annot *annot, int i, fz_point p) +{ + 
fz_matrix page_ctm, inv_page_ctm; + pdf_obj *vertices; + + check_allowed_subtypes(ctx, annot, PDF_NAME(Vertices), vertices_subtypes); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); - n = 7 + strlen(path) + strlen(buf) + 1; - uri = fz_malloc(ctx, n); - fz_strlcpy(uri, "file://", n); - fz_strlcat(uri, path, n); - fz_strlcat(uri, buf, n); - fz_free(ctx, path); - return uri; + p = fz_transform_point(p, inv_page_ctm); + + vertices = pdf_dict_get(ctx, annot->obj, PDF_NAME(Vertices)); + pdf_array_put_drop(ctx, vertices, i * 2 + 0, pdf_new_real(ctx, p.x)); + pdf_array_put_drop(ctx, vertices, i * 2 + 1, pdf_new_real(ctx, p.y)); } -char * -pdf_parse_link_action(fz_context *ctx, pdf_document *doc, pdf_obj *action) +static pdf_obj *quad_point_subtypes[] = { + PDF_NAME(Highlight), + PDF_NAME(Link), + PDF_NAME(Squiggly), + PDF_NAME(StrikeOut), + PDF_NAME(Underline), + NULL, +}; + +int +pdf_annot_has_quad_points(fz_context *ctx, pdf_annot *annot) { - pdf_obj *obj, *dest, *file_spec; + return is_allowed_subtype(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); +} - if (!action) - return NULL; +int +pdf_annot_quad_point_count(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *quad_points; + check_allowed_subtypes(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); + quad_points = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints)); + return pdf_array_len(ctx, quad_points) / 8; +} - obj = pdf_dict_get(ctx, action, PDF_NAME_S); - if (pdf_name_eq(ctx, PDF_NAME_GoTo, obj)) - { - dest = pdf_dict_get(ctx, action, PDF_NAME_D); - return pdf_parse_link_dest(ctx, doc, dest); +void +pdf_annot_quad_point(fz_context *ctx, pdf_annot *annot, int idx, float v[8]) +{ + pdf_obj *quad_points; + fz_matrix page_ctm; + int i; + + check_allowed_subtypes(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); + quad_points = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints)); + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + + for 
(i = 0; i < 8; i += 2) + { + fz_point point; + point.x = pdf_array_get_real(ctx, quad_points, idx * 8 + i + 0); + point.y = pdf_array_get_real(ctx, quad_points, idx * 8 + i + 1); + point = fz_transform_point(point, page_ctm); + v[i+0] = point.x; + v[i+1] = point.y; } - else if (pdf_name_eq(ctx, PDF_NAME_URI, obj)) +} + +void +pdf_set_annot_quad_points(fz_context *ctx, pdf_annot *annot, int n, const float *v) +{ + pdf_document *doc = annot->page->doc; + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *quad_points; + fz_point point; + int i, k; + + check_allowed_subtypes(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); + if (n <= 0 || !v) + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid number of quadrilaterals"); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + quad_points = pdf_new_array(ctx, doc, n * 8); + for (i = 0; i < n; ++i) { - /* URI entries are ASCII strings */ - const char *uri = pdf_to_str_buf(ctx, pdf_dict_get(ctx, action, PDF_NAME_URI)); - if (!fz_is_external_link(ctx, uri)) + for (k = 0; k < 4; ++k) { - pdf_obj *uri_base_obj = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/URI/Base"); - const char *uri_base = uri_base_obj ? 
pdf_to_str_buf(ctx, uri_base_obj) : "file://"; - char *new_uri = fz_malloc(ctx, strlen(uri_base) + strlen(uri) + 1); - strcpy(new_uri, uri_base); - strcat(new_uri, uri); - return new_uri; + point.x = v[i * 8 + k * 2 + 0]; + point.y = v[i * 8 + k * 2 + 1]; + point = fz_transform_point(point, inv_page_ctm); + pdf_array_push_real(ctx, quad_points, point.x); + pdf_array_push_real(ctx, quad_points, point.y); } - return fz_strdup(ctx, uri); - } - else if (pdf_name_eq(ctx, PDF_NAME_Launch, obj)) - { - file_spec = pdf_dict_get(ctx, action, PDF_NAME_F); - return pdf_parse_file_spec(ctx, doc, file_spec, NULL); } - else if (pdf_name_eq(ctx, PDF_NAME_GoToR, obj)) - { - dest = pdf_dict_get(ctx, action, PDF_NAME_D); - file_spec = pdf_dict_get(ctx, action, PDF_NAME_F); - return pdf_parse_file_spec(ctx, doc, file_spec, dest); - } - - return NULL; + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(QuadPoints), quad_points); + pdf_dirty_annot(ctx, annot); } -static fz_link * -pdf_load_link(fz_context *ctx, pdf_document *doc, pdf_obj *dict, const fz_matrix *page_ctm) +void +pdf_clear_annot_quad_points(fz_context *ctx, pdf_annot *annot) { - pdf_obj *action; - pdf_obj *obj; - fz_rect bbox; - char *uri; - fz_link *link; + check_allowed_subtypes(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); + pdf_dict_del(ctx, annot->obj, PDF_NAME(QuadPoints)); + pdf_dirty_annot(ctx, annot); +} - obj = pdf_dict_get(ctx, dict, PDF_NAME_Subtype); - if (!pdf_name_eq(ctx, obj, PDF_NAME_Link)) - return NULL; +void +pdf_add_annot_quad_point(fz_context *ctx, pdf_annot *annot, fz_quad quad) +{ + pdf_document *doc = annot->page->doc; + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *quad_points; + + check_allowed_subtypes(ctx, annot, PDF_NAME(QuadPoints), quad_point_subtypes); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + quad_points = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints)); + if (!pdf_is_array(ctx, quad_points)) + { + quad_points = 
pdf_new_array(ctx, doc, 8); + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(QuadPoints), quad_points); + } + + /* Contrary to the specification, the points within a QuadPoint are NOT ordered + * in a counterclockwise fashion. Experiments with Adobe's implementation + * indicates a cross-wise ordering is intended: ul, ur, ll, lr. + */ + quad = fz_transform_quad(quad, inv_page_ctm); + pdf_array_push_real(ctx, quad_points, quad.ul.x); + pdf_array_push_real(ctx, quad_points, quad.ul.y); + pdf_array_push_real(ctx, quad_points, quad.ur.x); + pdf_array_push_real(ctx, quad_points, quad.ur.y); + pdf_array_push_real(ctx, quad_points, quad.ll.x); + pdf_array_push_real(ctx, quad_points, quad.ll.y); + pdf_array_push_real(ctx, quad_points, quad.lr.x); + pdf_array_push_real(ctx, quad_points, quad.lr.y); - obj = pdf_dict_get(ctx, dict, PDF_NAME_Rect); - if (!obj) - return NULL; + pdf_dirty_annot(ctx, annot); +} - pdf_to_rect(ctx, obj, &bbox); - fz_transform_rect(&bbox, page_ctm); +static pdf_obj *ink_list_subtypes[] = { + PDF_NAME(Ink), + NULL, +}; - obj = pdf_dict_get(ctx, dict, PDF_NAME_Dest); - if (obj) - uri = pdf_parse_link_dest(ctx, doc, obj); - else - { - action = pdf_dict_get(ctx, dict, PDF_NAME_A); - /* fall back to additional action button's down/up action */ - if (!action) - action = pdf_dict_geta(ctx, pdf_dict_get(ctx, dict, PDF_NAME_AA), PDF_NAME_U, PDF_NAME_D); - uri = pdf_parse_link_action(ctx, doc, action); - } +int +pdf_annot_has_ink_list(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); +} - if (!uri) - return NULL; +int +pdf_annot_ink_list_count(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *ink_list; + check_allowed_subtypes(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); + ink_list = pdf_dict_get(ctx, annot->obj, PDF_NAME(InkList)); + return pdf_array_len(ctx, ink_list); +} - link = fz_new_link(ctx, &bbox, doc, uri); - fz_free(ctx, uri); - return link; +int 
+pdf_annot_ink_list_stroke_count(fz_context *ctx, pdf_annot *annot, int i) +{ + pdf_obj *ink_list; + pdf_obj *stroke; + check_allowed_subtypes(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); + ink_list = pdf_dict_get(ctx, annot->obj, PDF_NAME(InkList)); + stroke = pdf_array_get(ctx, ink_list, i); + return pdf_array_len(ctx, stroke) / 2; } -fz_link * -pdf_load_link_annots(fz_context *ctx, pdf_document *doc, pdf_obj *annots, const fz_matrix *page_ctm) +fz_point +pdf_annot_ink_list_stroke_vertex(fz_context *ctx, pdf_annot *annot, int i, int k) { - fz_link *link, *head, *tail; - pdf_obj *obj; - int i, n; + pdf_obj *ink_list; + pdf_obj *stroke; + fz_matrix page_ctm; + fz_point point; - head = tail = NULL; - link = NULL; + check_allowed_subtypes(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); - n = pdf_array_len(ctx, annots); - for (i = 0; i < n; i++) - { - /* FIXME: Move the try/catch out of the loop for performance? */ - fz_try(ctx) - { - obj = pdf_array_get(ctx, annots, i); - link = pdf_load_link(ctx, doc, obj, page_ctm); - } - fz_catch(ctx) - { - fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); - link = NULL; - } + ink_list = pdf_dict_get(ctx, annot->obj, PDF_NAME(InkList)); + stroke = pdf_array_get(ctx, ink_list, i); - if (link) - { - if (!head) - head = tail = link; - else - { - tail->next = link; - tail = link; - } - } - } + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); - return head; + point.x = pdf_array_get_real(ctx, stroke, k * 2 + 0); + point.y = pdf_array_get_real(ctx, stroke, k * 2 + 1); + return fz_transform_point(point, page_ctm); } -int -pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp, float *yp) +void +pdf_set_annot_ink_list(fz_context *ctx, pdf_annot *annot, int n, const int *count, const fz_point *v) { - if (uri && uri[0] == '#') + pdf_document *doc = annot->page->doc; + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *ink_list, *stroke; + fz_point point; + int i, k; + + check_allowed_subtypes(ctx, annot, 
PDF_NAME(InkList), ink_list_subtypes); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + // TODO: update Rect (in update appearance perhaps?) + + ink_list = pdf_new_array(ctx, doc, n); + for (i = 0; i < n; ++i) { - int page = fz_atoi(uri + 1) - 1; - if (xp || yp) + stroke = pdf_new_array(ctx, doc, count[i] * 2); + for (k = 0; k < count[i]; ++k) { - const char *x = strchr(uri, ','); - const char *y = strrchr(uri, ','); - if (x && y) - { - pdf_obj *obj; - fz_matrix ctm; - fz_point p; - - p.x = x ? fz_atoi(x + 1) : 0; - p.y = y ? fz_atoi(y + 1) : 0; - obj = pdf_lookup_page_obj(ctx, doc, page); - pdf_page_obj_transform(ctx, obj, NULL, &ctm); - fz_transform_point(&p, &ctm); - - if (xp) *xp = p.x; - if (yp) *yp = p.y; - } + point = fz_transform_point(*v++, inv_page_ctm); + pdf_array_push_real(ctx, stroke, point.x); + pdf_array_push_real(ctx, stroke, point.y); } - return page; + pdf_array_push_drop(ctx, ink_list, stroke); } -/* willus mod -- be quiet */ -/* - fz_warn(ctx, "unknown link uri '%s'", uri); -*/ - return -1; + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(InkList), ink_list); + pdf_dirty_annot(ctx, annot); } -static void -pdf_drop_annot_imp(fz_context *ctx, pdf_annot *annot) +void +pdf_clear_annot_ink_list(fz_context *ctx, pdf_annot *annot) { - pdf_drop_xobject(ctx, annot->ap); - pdf_drop_obj(ctx, annot->obj); + check_allowed_subtypes(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); + pdf_dict_del(ctx, annot->obj, PDF_NAME(InkList)); + pdf_dirty_annot(ctx, annot); } void -pdf_drop_annots(fz_context *ctx, pdf_annot *annot) +pdf_add_annot_ink_list(fz_context *ctx, pdf_annot *annot, int n, fz_point p[]) { - while (annot) + pdf_document *doc = annot->page->doc; + fz_matrix page_ctm, inv_page_ctm; + pdf_obj *ink_list, *stroke; + int i; + + check_allowed_subtypes(ctx, annot, PDF_NAME(InkList), ink_list_subtypes); + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = 
fz_invert_matrix(page_ctm); + + ink_list = pdf_dict_get(ctx, annot->obj, PDF_NAME(InkList)); + if (!pdf_is_array(ctx, ink_list)) { - pdf_annot *next = annot->next; - fz_drop_annot(ctx, &annot->super); - annot = next; + ink_list = pdf_new_array(ctx, doc, 10); + pdf_dict_put_drop(ctx, annot->obj, PDF_NAME(InkList), ink_list); + } + + stroke = pdf_new_array(ctx, doc, n * 2); + fz_try(ctx) + { + for (i = 0; i < n; ++i) + { + fz_point tp = fz_transform_point(p[i], inv_page_ctm); + pdf_array_push_real(ctx, stroke, tp.x); + pdf_array_push_real(ctx, stroke, tp.y); + } } + fz_catch(ctx) + { + pdf_drop_obj(ctx, stroke); + fz_rethrow(ctx); + } + + pdf_array_push_drop(ctx, ink_list, stroke); + + pdf_dirty_annot(ctx, annot); } -/* Create transform to fit appearance stream to annotation Rect */ void -pdf_annot_transform(fz_context *ctx, pdf_annot *annot, fz_matrix *annot_ctm) +pdf_set_text_annot_position(fz_context *ctx, pdf_annot *annot, fz_point pt) { - fz_rect bbox, rect; - fz_matrix matrix; - float w, h, x, y; - - pdf_to_rect(ctx, pdf_dict_get(ctx, annot->obj, PDF_NAME_Rect), &rect); - pdf_xobject_bbox(ctx, annot->ap, &bbox); - pdf_xobject_matrix(ctx, annot->ap, &matrix); + fz_matrix page_ctm, inv_page_ctm; + fz_rect rect; + int flags; + + pdf_page_transform(ctx, annot->page, NULL, &page_ctm); + inv_page_ctm = fz_invert_matrix(page_ctm); + + rect.x0 = pt.x; + rect.x1 = pt.x + TEXT_ANNOT_SIZE; + rect.y0 = pt.y; + rect.y1 = pt.y + TEXT_ANNOT_SIZE; + rect = fz_transform_rect(rect, inv_page_ctm); + + pdf_dict_put_rect(ctx, annot->obj, PDF_NAME(Rect), rect); + + flags = pdf_dict_get_int(ctx, annot->obj, PDF_NAME(F)); + flags |= (PDF_ANNOT_IS_NO_ZOOM|PDF_ANNOT_IS_NO_ROTATE); + pdf_dict_put_int(ctx, annot->obj, PDF_NAME(F), flags); +} - fz_transform_rect(&bbox, &matrix); - if (bbox.x1 == bbox.x0) - w = 0; - else - w = (rect.x1 - rect.x0) / (bbox.x1 - bbox.x0); - if (bbox.y1 == bbox.y0) - h = 0; +static void +pdf_format_date(fz_context *ctx, char *s, int n, time_t secs) +{ +#ifdef 
_POSIX_SOURCE + struct tm tmbuf, *tm = gmtime_r(&secs, &tmbuf); +#else + struct tm *tm = gmtime(&secs); +#endif + if (!tm) + fz_strlcpy(s, "D:19700101000000Z", n); else - h = (rect.y1 - rect.y0) / (bbox.y1 - bbox.y0); - x = rect.x0 - bbox.x0; - y = rect.y0 - bbox.y0; - - fz_pre_scale(fz_translate(annot_ctm, x, y), w, h); + strftime(s, n, "D:%Y%m%d%H%M%SZ", tm); } -pdf_annot *pdf_new_annot(fz_context *ctx, pdf_page *page) +static int64_t +pdf_parse_date(fz_context *ctx, const char *s) { - pdf_annot *annot = fz_new_annot(ctx, sizeof(pdf_annot)); - - annot->super.drop_annot = (fz_annot_drop_fn*)pdf_drop_annot_imp; - annot->super.bound_annot = (fz_annot_bound_fn*)pdf_bound_annot; - annot->super.run_annot = (fz_annot_run_fn*)pdf_run_annot; - annot->super.next_annot = (fz_annot_next_fn*)pdf_next_annot; + int tz_sign, tz_hour, tz_min, tz_adj; + struct tm tm; + time_t utc; - annot->page = page; + if (!s) + return 0; - return annot; -} + memset(&tm, 0, sizeof tm); + tm.tm_mday = 1; -void -pdf_load_annots(fz_context *ctx, pdf_page *page, pdf_obj *annots) -{ - pdf_document *doc = page->doc; - pdf_annot *annot, **itr; - pdf_obj *obj, *ap, *as, *n; - int i, len, keep_annot; + tz_sign = 1; + tz_hour = 0; + tz_min = 0; - fz_var(annot); - fz_var(itr); - fz_var(keep_annot); + if (s[0] == 'D' && s[1] == ':') + s += 2; - itr = &page->annots; + if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2]) || !isdigit(s[3])) + { + fz_warn(ctx, "invalid date format (missing year)"); + return 0; + } + tm.tm_year = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0') - 1900; + s += 4; - len = pdf_array_len(ctx, annots); - /* - Create an initial linked list of pdf_annot structures with only the obj field - filled in. We do this because update_appearance has the potential to change - the annot array, so we don't want to be iterating through the array while - that happens. 
- */ - fz_try(ctx) + if (isdigit(s[0]) && isdigit(s[1])) { - for (i = 0; i < len; i++) + tm.tm_mon = (s[0]-'0')*10 + (s[1]-'0') - 1; /* month is 0-11 in struct tm */ + s += 2; + if (isdigit(s[0]) && isdigit(s[1])) { - obj = pdf_array_get(ctx, annots, i); - - annot = pdf_new_annot(ctx, page); - *itr = annot; - annot->obj = pdf_keep_obj(ctx, obj); - itr = &annot->next; + tm.tm_mday = (s[0]-'0')*10 + (s[1]-'0'); + s += 2; + if (isdigit(s[0]) && isdigit(s[1])) + { + tm.tm_hour = (s[0]-'0')*10 + (s[1]-'0'); + s += 2; + if (isdigit(s[0]) && isdigit(s[1])) + { + tm.tm_min = (s[0]-'0')*10 + (s[1]-'0'); + s += 2; + if (isdigit(s[0]) && isdigit(s[1])) + { + tm.tm_sec = (s[0]-'0')*10 + (s[1]-'0'); + s += 2; + } + } + } } } - fz_catch(ctx) + + if (s[0] == 'Z') { - pdf_drop_annots(ctx, page->annots); - page->annots = NULL; - fz_rethrow(ctx); + s += 1; } - - /* - Iterate through the newly created annot linked list, using a double pointer to - facilitate deleting broken annotations. - */ - itr = &page->annots; - while (*itr) + else if ((s[0] == '-' || s[0] == '+') && isdigit(s[1]) && isdigit(s[2])) { - annot = *itr; - - fz_try(ctx) + tz_sign = (s[0] == '-') ? -1 : 1; + tz_hour = (s[1]-'0')*10 + (s[2]-'0'); + s += 3; + if (s[0] == '\'' && isdigit(s[1]) && isdigit(s[2])) { - pdf_hotspot *hp = &doc->hotspot; + tz_min = (s[1]-'0')*10 + (s[2]-'0'); + s += 3; + if (s[0] == '\'') + s += 1; + } + } - n = NULL; + if (s[0] != 0) + fz_warn(ctx, "invalid date format (garbage at end)"); - if (doc->update_appearance) - doc->update_appearance(ctx, doc, annot); + utc = timegm(&tm); + if (utc == (time_t)-1) + { + fz_warn(ctx, "date overflow error"); + return 0; + } - obj = annot->obj; - ap = pdf_dict_get(ctx, obj, PDF_NAME_AP); - as = pdf_dict_get(ctx, obj, PDF_NAME_AS); + tz_adj = tz_sign * (tz_hour * 3600 + tz_min * 60); + return utc - tz_adj; +} - /* We only collect annotations with an appearance - * stream into this list, so remove any that don't - * (such as links) and continue. 
*/ - keep_annot = pdf_is_dict(ctx, ap); - if (!keep_annot) - break; - if (hp->num == pdf_to_num(ctx, obj) && (hp->state & HOTSPOT_POINTER_DOWN)) - { - n = pdf_dict_get(ctx, ap, PDF_NAME_D); /* down state */ - } +static pdf_obj *markup_subtypes[] = { + PDF_NAME(Text), + PDF_NAME(FreeText), + PDF_NAME(Line), + PDF_NAME(Square), + PDF_NAME(Circle), + PDF_NAME(Polygon), + PDF_NAME(PolyLine), + PDF_NAME(Highlight), + PDF_NAME(Underline), + PDF_NAME(Squiggly), + PDF_NAME(StrikeOut), + PDF_NAME(Stamp), + PDF_NAME(Caret), + PDF_NAME(Ink), + PDF_NAME(FileAttachment), + PDF_NAME(Sound), + NULL, +}; - if (n == NULL) - n = pdf_dict_get(ctx, ap, PDF_NAME_N); /* normal state */ +int64_t +pdf_annot_modification_date(fz_context *ctx, pdf_annot *annot) +{ + pdf_obj *date = pdf_dict_get(ctx, annot->obj, PDF_NAME(M)); + return date ? pdf_parse_date(ctx, pdf_to_str_buf(ctx, date)) : 0; +} - /* lookup current state in sub-dictionary */ - if (!pdf_is_stream(ctx, n)) - n = pdf_dict_get(ctx, n, as); +void +pdf_set_annot_modification_date(fz_context *ctx, pdf_annot *annot, int64_t secs) +{ + char s[40]; - annot->ap = NULL; + check_allowed_subtypes(ctx, annot, PDF_NAME(M), markup_subtypes); - if (pdf_is_stream(ctx, n)) - { - annot->ap = pdf_load_xobject(ctx, doc, n); - annot->ap_iteration = annot->ap->iteration; - } - else - fz_warn(ctx, "no appearance stream for annotation %d 0 R", pdf_to_num(ctx, annot->obj)); + pdf_format_date(ctx, s, sizeof s, secs); + pdf_dict_put_string(ctx, annot->obj, PDF_NAME(M), s, strlen(s)); + pdf_dirty_annot(ctx, annot); +} - if (obj == doc->focus_obj) - doc->focus = annot; +int +pdf_annot_has_author(fz_context *ctx, pdf_annot *annot) +{ + return is_allowed_subtype(ctx, annot, PDF_NAME(T), markup_subtypes); +} + +const char * +pdf_annot_author(fz_context *ctx, pdf_annot *annot) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(T), markup_subtypes); + return pdf_dict_get_text_string(ctx, annot->obj, PDF_NAME(T)); +} + +void +pdf_set_annot_author(fz_context *ctx, 
pdf_annot *annot, const char *author) +{ + check_allowed_subtypes(ctx, annot, PDF_NAME(T), markup_subtypes); + pdf_dict_put_text_string(ctx, annot->obj, PDF_NAME(T), author); + pdf_dirty_annot(ctx, annot); +} - /* Move to next item in the linked list */ - itr = &annot->next; +void +pdf_parse_default_appearance(fz_context *ctx, const char *da, const char **font, float *size, float color[3]) +{ + char buf[100], *p = buf, *tok, *end; + float stack[3] = { 0, 0, 0 }; + int top = 0; + + *font = "Helv"; + *size = 12; + color[0] = color[1] = color[2] = 0; + + fz_strlcpy(buf, da, sizeof buf); + while ((tok = fz_strsep(&p, " \n\r\t")) != NULL) + { + if (tok[0] == 0) + ; + else if (tok[0] == '/') + { + if (!strcmp(tok+1, "Cour")) *font = "Cour"; + if (!strcmp(tok+1, "Helv")) *font = "Helv"; + if (!strcmp(tok+1, "TiRo")) *font = "TiRo"; + if (!strcmp(tok+1, "Symb")) *font = "Symb"; + if (!strcmp(tok+1, "ZaDb")) *font = "ZaDb"; } - fz_catch(ctx) + else if (!strcmp(tok, "Tf")) { - if (fz_caught(ctx) == FZ_ERROR_TRYLATER) - { - pdf_drop_annots(ctx, page->annots); - page->annots = NULL; - fz_rethrow(ctx); - } - keep_annot = 0; - fz_warn(ctx, "ignoring broken annotation"); + *size = stack[0]; + top = 0; } - if (!keep_annot) + else if (!strcmp(tok, "g")) { - /* Move to next item in the linked list, dropping this one */ - *itr = annot->next; - annot->next = NULL; /* Required because pdf_drop_annots follows the "next" chain */ - pdf_drop_annots(ctx, annot); + color[0] = color[1] = color[2] = stack[0]; + top = 0; + } + else if (!strcmp(tok, "rg")) + { + color[0] = stack[0]; + color[1] = stack[1]; + color[2] = stack[2]; + top=0; + } + else + { + if (top < 3) + stack[top] = fz_strtof(tok, &end); + if (*end == 0) + ++top; + else + top = 0; } } - - page->annot_tailp = itr; } -pdf_annot * -pdf_first_annot(fz_context *ctx, pdf_page *page) +void +pdf_print_default_appearance(fz_context *ctx, char *buf, int nbuf, const char *font, float size, const float color[3]) { - return page ? 
page->annots : NULL; + if (color[0] > 0 || color[1] > 0 || color[2] > 0) + fz_snprintf(buf, nbuf, "/%s %g Tf %g %g %g rg", font, size, color[0], color[1], color[2]); + else + fz_snprintf(buf, nbuf, "/%s %g Tf", font, size); } -pdf_annot * -pdf_next_annot(fz_context *ctx, pdf_annot *annot) +void +pdf_annot_default_appearance(fz_context *ctx, pdf_annot *annot, const char **font, float *size, float color[3]) { - return annot ? annot->next : NULL; + pdf_obj *da = pdf_dict_get_inheritable(ctx, annot->obj, PDF_NAME(DA)); + if (!da) + { + pdf_obj *trailer = pdf_trailer(ctx, annot->page->doc); + da = pdf_dict_getl(ctx, trailer, PDF_NAME(Root), PDF_NAME(AcroForm), PDF_NAME(DA), NULL); + } + pdf_parse_default_appearance(ctx, pdf_to_str_buf(ctx, da), font, size, color); } -fz_rect * -pdf_bound_annot(fz_context *ctx, pdf_annot *annot, fz_rect *rect) +void +pdf_set_annot_default_appearance(fz_context *ctx, pdf_annot *annot, const char *font, float size, const float color[3]) { - pdf_obj *obj = pdf_dict_get(ctx, annot->obj, PDF_NAME_Rect); - fz_rect mediabox; - fz_matrix page_ctm; - pdf_to_rect(ctx, obj, rect); - pdf_page_transform(ctx, annot->page, &mediabox, &page_ctm); - fz_transform_rect(rect, &page_ctm); - return rect; + char buf[100]; + + pdf_print_default_appearance(ctx, buf, sizeof buf, font, size, color); + + pdf_dict_put_string(ctx, annot->obj, PDF_NAME(DA), buf, strlen(buf)); + + pdf_dict_del(ctx, annot->obj, PDF_NAME(DS)); /* not supported */ + pdf_dict_del(ctx, annot->obj, PDF_NAME(RC)); /* not supported */ + + pdf_dirty_annot(ctx, annot); } diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-colorspace.c k2pdfopt-2.51+ds/mupdf_mod/pdf-colorspace.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-colorspace.c 2017-02-25 05:40:53.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-colorspace.c 2018-11-21 02:41:50.000000000 +0000 @@ -1,122 +1,237 @@ +#include "mupdf/fitz.h" #include "mupdf/pdf.h" -/* willus mod */ +/* willus mod -- remove ../fitz/... 
*/ #include "colorspace-imp.h" -/* ICCBased */ +#include +/* ICCBased */ static fz_colorspace * -load_icc_based(fz_context *ctx, pdf_document *doc, pdf_obj *dict) +load_icc_based(fz_context *ctx, pdf_obj *dict, int alt) { int n; pdf_obj *obj; + fz_buffer *buffer = NULL; + fz_colorspace *cs = NULL; + fz_colorspace *cs_alt = NULL; + fz_colorspace_clamp_fn *alt_lab_clamping = NULL; + + fz_var(cs); + fz_var(cs_alt); + fz_var(buffer); + + /* + alt => "If ICC unreadable/unsupported, then return the + alternate instead". + + Regardless of whether alt is set or not, we DO read the + alternate space, because we need to know whether it's a + LAB space or not to affect our clamping. We just might + not return it. + */ + fz_try(ctx) + { + obj = pdf_dict_get(ctx, dict, PDF_NAME(Alternate)); + if (obj) + { + cs_alt = pdf_load_colorspace(ctx, obj); + if (fz_colorspace_is_lab_icc(ctx, cs_alt)) + alt_lab_clamping = cs_alt->clamp; + } + } + fz_catch(ctx) + { + fz_drop_colorspace(ctx, cs_alt); + cs_alt = NULL; + } - n = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_N)); - obj = pdf_dict_get(ctx, dict, PDF_NAME_Alternate); - - if (obj) + /* If we're not going to be allowed to return it, drop it! 
*/ + if (!alt) { - fz_colorspace *cs_alt = NULL; + fz_drop_colorspace(ctx, cs_alt); + cs_alt = NULL; + } - fz_try(ctx) + n = pdf_dict_get_int(ctx, dict, PDF_NAME(N)); + + fz_try(ctx) + { + if (fz_get_cmm_engine(ctx)) { - cs_alt = pdf_load_colorspace(ctx, doc, obj); - if (cs_alt->n != n) - { - fz_drop_colorspace(ctx, cs_alt); - fz_throw(ctx, FZ_ERROR_GENERIC, "ICCBased /Alternate colorspace must have %d components", n); - } + enum fz_colorspace_type type; + if (n == 1) type = FZ_COLORSPACE_GRAY; + else if (n == 3) type = FZ_COLORSPACE_RGB; + else if (n == 4) type = FZ_COLORSPACE_CMYK; + else type = FZ_COLORSPACE_NONE; + buffer = pdf_load_stream(ctx, dict); + cs = fz_new_icc_colorspace(ctx, type, buffer); + } + } + fz_always(ctx) + fz_drop_buffer(ctx, buffer); + fz_catch(ctx) + { + if (!alt) { + fz_drop_colorspace(ctx, cs_alt); + fz_rethrow(ctx); } - fz_catch(ctx) + } + + if (cs) + { + if (n != 1 && n != 3 && n != 4) { - cs_alt = NULL; + fz_drop_colorspace(ctx, cs_alt); + fz_drop_colorspace(ctx, cs); + fz_throw(ctx, FZ_ERROR_GENERIC, "ICC Based must have 1, 3 or 4 components"); } - if (cs_alt) - return cs_alt; + /* Override the clamping if the alternate was LAB */ + if (alt_lab_clamping) + cs->clamp = alt_lab_clamping; + fz_drop_colorspace(ctx, cs_alt); + return cs; } - switch (n) + /* Failed to load the ICC profile - either because it was broken, + * or because we aren't in an ICC workflow. If we aren't allowed + * to return the alternate, then that's all she wrote. */ + if (!alt) { - case 1: return fz_device_gray(ctx); - case 3: return fz_device_rgb(ctx); - case 4: return fz_device_cmyk(ctx); + fz_drop_colorspace(ctx, cs_alt); + fz_throw(ctx, FZ_ERROR_GENERIC, "Unable to read ICC workflow"); } - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: ICCBased must have 1, 3 or 4 components"); -} + /* If we have an alternate we are allowed to use, return that. 
*/ + if (cs_alt) + { + if (n != 1 && n != 3 && n != 4) + { + fz_drop_colorspace(ctx, cs_alt); + fz_throw(ctx, FZ_ERROR_GENERIC, "ICC Based must have 1, 3 or 4 components"); + } + return cs_alt; + } -/* Lab */ + switch (n) + { + case 1: + cs = fz_keep_colorspace(ctx, fz_device_gray(ctx)); + break; + case 3: + cs = fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + break; + case 4: + cs = fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); + break; + default: fz_throw(ctx, FZ_ERROR_SYNTAX, "ICCBased must have 1, 3 or 4 components"); + } -/* Separation and DeviceN */ + return cs; +} -struct separation +struct devicen { fz_colorspace *base; - fz_function *tint; + pdf_function *tint; }; static void -separation_to_rgb(fz_context *ctx, fz_colorspace *cs, const float *color, float *rgb) +devicen_to_alt(fz_context *ctx, const fz_colorspace *cs, const float *color, float *alt) +{ + struct devicen *devn = cs->data; + pdf_eval_function(ctx, devn->tint, color, cs->n, alt, devn->base->n); +} + +static void +devicen_to_rgb(fz_context *ctx, const fz_colorspace *cs, const float *color, float *rgb) { - struct separation *sep = cs->data; + struct devicen *devn = cs->data; float alt[FZ_MAX_COLORS]; - fz_eval_function(ctx, sep->tint, color, cs->n, alt, sep->base->n); - fz_convert_color(ctx, fz_device_rgb(ctx), rgb, sep->base, alt); + pdf_eval_function(ctx, devn->tint, color, cs->n, alt, devn->base->n); + fz_convert_color(ctx, fz_default_color_params(ctx), NULL, fz_device_rgb(ctx), rgb, devn->base, alt); } static void -free_separation(fz_context *ctx, fz_colorspace *cs) +free_devicen(fz_context *ctx, fz_colorspace *cs) { - struct separation *sep = cs->data; - fz_drop_colorspace(ctx, sep->base); - fz_drop_function(ctx, sep->tint); - fz_free(ctx, sep); + struct devicen *devn = cs->data; + fz_drop_colorspace(ctx, devn->base); + pdf_drop_function(ctx, devn->tint); + fz_free(ctx, devn); } static fz_colorspace * -load_separation(fz_context *ctx, pdf_document *doc, pdf_obj *array) +base_devicen(const 
fz_colorspace *cs) { - fz_colorspace *cs; - struct separation *sep = NULL; + struct devicen *devn = cs->data; + + return devn->base; +} + +static fz_colorspace * +load_devicen(fz_context *ctx, pdf_obj *array) +{ + fz_colorspace *cs = NULL; + struct devicen *devn = NULL; pdf_obj *nameobj = pdf_array_get(ctx, array, 1); pdf_obj *baseobj = pdf_array_get(ctx, array, 2); pdf_obj *tintobj = pdf_array_get(ctx, array, 3); fz_colorspace *base; - fz_function *tint = NULL; - int n; + pdf_function *tint = NULL; + char *colorspace_name; + int i, n; fz_var(tint); - fz_var(sep); + fz_var(devn); if (pdf_is_array(ctx, nameobj)) + { n = pdf_array_len(ctx, nameobj); + colorspace_name = "DeviceN"; + } else + { n = 1; + colorspace_name = "Separation"; + } + if (n < 1) + fz_throw(ctx, FZ_ERROR_SYNTAX, "insufficient components in colorspace"); if (n > FZ_MAX_COLORS) - fz_throw(ctx, FZ_ERROR_GENERIC, "too many components in colorspace"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "too many components in colorspace"); - base = pdf_load_colorspace(ctx, doc, baseobj); + base = pdf_load_colorspace(ctx, baseobj); fz_try(ctx) { - tint = pdf_load_function(ctx, doc, tintobj, n, base->n); + tint = pdf_load_function(ctx, tintobj, n, base->n); /* RJW: fz_drop_colorspace(ctx, base); * "cannot load tint function (%d 0 R)", pdf_to_num(ctx, tintobj) */ - sep = fz_malloc_struct(ctx, struct separation); - sep->base = base; - sep->tint = tint; + devn = fz_malloc_struct(ctx, struct devicen); + devn->base = fz_keep_colorspace(ctx, base); /* We drop it during the devn free... */ + devn->tint = tint; + + cs = fz_new_colorspace(ctx, colorspace_name, FZ_COLORSPACE_SEPARATION, 0, n, + fz_colorspace_is_icc(ctx, fz_device_rgb(ctx)) ? 
devicen_to_alt : devicen_to_rgb, NULL, base_devicen, NULL, free_devicen, devn, + sizeof(struct devicen) + base->size + pdf_function_size(ctx, tint)); + + devn = NULL; + if (pdf_is_array(ctx, nameobj)) + for (i = 0; i < n; i++) + fz_colorspace_name_colorant(ctx, cs, i, pdf_to_name(ctx, pdf_array_get(ctx, nameobj, i))); + else + fz_colorspace_name_colorant(ctx, cs, 0, pdf_to_name(ctx, nameobj)); - cs = fz_new_colorspace(ctx, n == 1 ? "Separation" : "DeviceN", n, separation_to_rgb, NULL, free_separation, sep, - sizeof(struct separation) + (base ? base->size : 0) + fz_function_size(ctx, tint)); } + fz_always(ctx) + fz_drop_colorspace(ctx, base); fz_catch(ctx) { - fz_drop_colorspace(ctx, base); - fz_drop_function(ctx, tint); - fz_free(ctx, sep); + pdf_drop_function(ctx, tint); + fz_free(ctx, devn); fz_rethrow(ctx); } @@ -126,13 +241,13 @@ int pdf_is_tint_colorspace(fz_context *ctx, fz_colorspace *cs) { - return fz_colorspace_is(ctx, cs, separation_to_rgb); + return cs && cs->free_data == free_devicen; } /* Indexed */ static fz_colorspace * -load_indexed(fz_context *ctx, pdf_document *doc, pdf_obj *array) +load_indexed(fz_context *ctx, pdf_obj *array) { pdf_obj *baseobj = pdf_array_get(ctx, array, 1); pdf_obj *highobj = pdf_array_get(ctx, array, 2); @@ -147,7 +262,7 @@ fz_try(ctx) { - base = pdf_load_colorspace(ctx, doc, baseobj); + base = pdf_load_colorspace(ctx, baseobj); high = pdf_to_int(ctx, highobj); high = fz_clampi(high, 0, 255); @@ -184,14 +299,15 @@ } else { - fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse colorspace lookup table"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse colorspace lookup table"); } cs = fz_new_indexed_colorspace(ctx, base, high, lookup); } + fz_always(ctx) + fz_drop_colorspace(ctx, base); fz_catch(ctx) { - fz_drop_colorspace(ctx, base); fz_free(ctx, lookup); fz_rethrow(ctx); } @@ -199,33 +315,130 @@ return cs; } -/* Parse and create colorspace from PDF object */ +static void +pdf_load_cal_common(fz_context *ctx, pdf_obj *dict, float 
*wp, float *bp, float *gamma) +{ + pdf_obj *obj; + int i; + + obj = pdf_dict_get(ctx, dict, PDF_NAME(WhitePoint)); + if (pdf_array_len(ctx, obj) != 3) + fz_throw(ctx, FZ_ERROR_SYNTAX, "WhitePoint must be a 3-element array"); + + for (i = 0; i < 3; i++) + { + wp[i] = pdf_array_get_real(ctx, obj, i); + if (wp[i] < 0) + fz_throw(ctx, FZ_ERROR_SYNTAX, "WhitePoint numbers must be positive"); + } + if (wp[1] != 1) + fz_throw(ctx, FZ_ERROR_SYNTAX, "WhitePoint Yw must be 1.0"); + + obj = pdf_dict_get(ctx, dict, PDF_NAME(BlackPoint)); + if (pdf_array_len(ctx, obj) == 3) + { + for (i = 0; i < 3; i++) + { + bp[i] = pdf_array_get_real(ctx, obj, i); + if (bp[i] < 0) + fz_throw(ctx, FZ_ERROR_SYNTAX, "BlackPoint numbers must be positive"); + } + } + + obj = pdf_dict_get(ctx, dict, PDF_NAME(Gamma)); + if (pdf_is_number(ctx, obj)) + { + gamma[0] = pdf_to_real(ctx, obj); + gamma[1] = gamma[2]; + if (gamma[0] <= 0) + fz_throw(ctx, FZ_ERROR_SYNTAX, "Gamma must be greater than zero"); + } + else if (pdf_array_len(ctx, obj) == 3) + { + for (i = 0; i < 3; i++) + { + gamma[i] = pdf_array_get_real(ctx, obj, i); + if (gamma[i] <= 0) + fz_throw(ctx, FZ_ERROR_SYNTAX, "Gamma must be greater than zero"); + } + } +} + +static fz_colorspace * +pdf_load_cal_gray(fz_context *ctx, pdf_obj *dict) +{ + float wp[3]; + float bp[3] = { 0, 0, 0 }; + float gamma[3] = { 1, 1, 1 }; + + if (dict == NULL) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + + fz_try(ctx) + { + pdf_load_cal_common(ctx, dict, wp, bp, gamma); + gamma[2] = gamma[1] = gamma[0]; + } + fz_catch(ctx) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + return fz_new_cal_colorspace(ctx, "CalGray", wp, bp, gamma, NULL); +} static fz_colorspace * -pdf_load_colorspace_imp(fz_context *ctx, pdf_document *doc, pdf_obj *obj) +pdf_load_cal_rgb(fz_context *ctx, pdf_obj *dict) { + pdf_obj *obj; + float matrix[9] = { 1, 0, 0, 0, 1, 0, 0, 0, 1 }; + float wp[3]; + float bp[3] = { 0, 0, 0 }; + float gamma[3] = { 1, 1, 1 }; + int i; + + if 
(dict == NULL) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + + fz_try(ctx) + { + pdf_load_cal_common(ctx, dict, wp, bp, gamma); + obj = pdf_dict_get(ctx, dict, PDF_NAME(Matrix)); + if (pdf_array_len(ctx, obj) == 9) + { + for (i = 0; i < 9; i++) + matrix[i] = pdf_array_get_real(ctx, obj, i); + } + } + fz_catch(ctx) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + return fz_new_cal_colorspace(ctx, "CalRGB", wp, bp, gamma, matrix); +} + +/* Parse and create colorspace from PDF object */ + +static fz_colorspace * +pdf_load_colorspace_imp(fz_context *ctx, pdf_obj *obj) +{ if (pdf_obj_marked(ctx, obj)) - fz_throw(ctx, FZ_ERROR_GENERIC, "Recursion in colorspace definition"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "recursion in colorspace definition"); if (pdf_is_name(ctx, obj)) { - if (pdf_name_eq(ctx, obj, PDF_NAME_Pattern)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_G)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_RGB)) - return fz_device_rgb(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_CMYK)) - return fz_device_cmyk(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_DeviceGray)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_DeviceRGB)) - return fz_device_rgb(ctx); - else if (pdf_name_eq(ctx, obj, PDF_NAME_DeviceCMYK)) - return fz_device_cmyk(ctx); + if (pdf_name_eq(ctx, obj, PDF_NAME(Pattern))) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + else if (pdf_name_eq(ctx, obj, PDF_NAME(G))) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + else if (pdf_name_eq(ctx, obj, PDF_NAME(RGB))) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + else if (pdf_name_eq(ctx, obj, PDF_NAME(CMYK))) + return fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); + else if (pdf_name_eq(ctx, obj, PDF_NAME(DeviceGray))) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + else if (pdf_name_eq(ctx, obj, PDF_NAME(DeviceRGB))) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + else if 
(pdf_name_eq(ctx, obj, PDF_NAME(DeviceCMYK))) + return fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); else - fz_throw(ctx, FZ_ERROR_GENERIC, "unknown colorspace: %s", pdf_to_name(ctx, obj)); + fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown colorspace: %s", pdf_to_name(ctx, obj)); } else if (pdf_is_array(ctx, obj)) @@ -235,60 +448,71 @@ if (pdf_is_name(ctx, name)) { /* load base colorspace instead */ - if (pdf_name_eq(ctx, name, PDF_NAME_G)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_RGB)) - return fz_device_rgb(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_CMYK)) - return fz_device_cmyk(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_DeviceGray)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_DeviceRGB)) - return fz_device_rgb(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_DeviceCMYK)) - return fz_device_cmyk(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_CalGray)) - return fz_device_gray(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_CalRGB)) - return fz_device_rgb(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_CalCMYK)) - return fz_device_cmyk(ctx); - else if (pdf_name_eq(ctx, name, PDF_NAME_Lab)) - return fz_device_lab(ctx); + if (pdf_name_eq(ctx, name, PDF_NAME(G))) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(RGB))) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(CMYK))) + return fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(DeviceGray))) + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(DeviceRGB))) + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(DeviceCMYK))) + return fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(CalGray))) + { + if (fz_get_cmm_engine(ctx)) + return pdf_load_cal_gray(ctx, pdf_array_get(ctx, obj, 1)); + 
else + return fz_keep_colorspace(ctx, fz_device_gray(ctx)); + } + else if (pdf_name_eq(ctx, name, PDF_NAME(CalRGB))) + { + if (fz_get_cmm_engine(ctx)) + return pdf_load_cal_rgb(ctx, pdf_array_get(ctx, obj, 1)); + else + return fz_keep_colorspace(ctx, fz_device_rgb(ctx)); + } + else if (pdf_name_eq(ctx, name, PDF_NAME(CalCMYK))) + return fz_keep_colorspace(ctx, fz_device_cmyk(ctx)); + else if (pdf_name_eq(ctx, name, PDF_NAME(Lab))) + return fz_keep_colorspace(ctx, fz_device_lab(ctx)); else { fz_colorspace *cs; fz_try(ctx) { - pdf_mark_obj(ctx, obj); - if (pdf_name_eq(ctx, name, PDF_NAME_ICCBased)) - cs = load_icc_based(ctx, doc, pdf_array_get(ctx, obj, 1)); - - else if (pdf_name_eq(ctx, name, PDF_NAME_Indexed)) - cs = load_indexed(ctx, doc, obj); - else if (pdf_name_eq(ctx, name, PDF_NAME_I)) - cs = load_indexed(ctx, doc, obj); - - else if (pdf_name_eq(ctx, name, PDF_NAME_Separation)) - cs = load_separation(ctx, doc, obj); - - else if (pdf_name_eq(ctx, name, PDF_NAME_DeviceN)) - cs = load_separation(ctx, doc, obj); - else if (pdf_name_eq(ctx, name, PDF_NAME_Pattern)) + if (pdf_mark_obj(ctx, obj)) + fz_throw(ctx, FZ_ERROR_SYNTAX, "recursive colorspace"); + if (pdf_name_eq(ctx, name, PDF_NAME(ICCBased))) + cs = load_icc_based(ctx, pdf_array_get(ctx, obj, 1), 1); + + else if (pdf_name_eq(ctx, name, PDF_NAME(Indexed))) + cs = load_indexed(ctx, obj); + else if (pdf_name_eq(ctx, name, PDF_NAME(I))) + cs = load_indexed(ctx, obj); + + else if (pdf_name_eq(ctx, name, PDF_NAME(Separation))) + cs = load_devicen(ctx, obj); + + else if (pdf_name_eq(ctx, name, PDF_NAME(DeviceN))) + cs = load_devicen(ctx, obj); + else if (pdf_name_eq(ctx, name, PDF_NAME(Pattern))) { pdf_obj *pobj; pobj = pdf_array_get(ctx, obj, 1); if (!pobj) { - cs = fz_device_gray(ctx); + cs = fz_keep_colorspace(ctx, fz_device_gray(ctx)); break; } - cs = pdf_load_colorspace(ctx, doc, pobj); + cs = pdf_load_colorspace(ctx, pobj); } else - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: unknown colorspace %s", 
pdf_to_name(ctx, name)); + fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown colorspace %s", pdf_to_name(ctx, name)); } fz_always(ctx) { @@ -303,11 +527,21 @@ } } - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: could not parse color space (%d 0 R)", pdf_to_num(ctx, obj)); + /* We have seen files where /DefaultRGB is specified as 1 0 R, + * and 1 0 obj << /Length 3144 /Alternate /DeviceRGB /N 3 >> + * stream ...iccprofile... endstream endobj. + * This *should* be [ /ICCBased 1 0 R ], but Acrobat seems to + * handle it, so do our best. */ + else if (pdf_is_dict(ctx, obj)) + { + return load_icc_based(ctx, obj, 1); + } + + fz_throw(ctx, FZ_ERROR_SYNTAX, "could not parse color space (%d 0 R)", pdf_to_num(ctx, obj)); } fz_colorspace * -pdf_load_colorspace(fz_context *ctx, pdf_document *doc, pdf_obj *obj) +pdf_load_colorspace(fz_context *ctx, pdf_obj *obj) { fz_colorspace *cs; @@ -316,9 +550,54 @@ return cs; } - cs = pdf_load_colorspace_imp(ctx, doc, obj); + cs = pdf_load_colorspace_imp(ctx, obj); pdf_store_item(ctx, obj, cs, cs->size); return cs; } + +static fz_colorspace * +pdf_load_output_intent(fz_context *ctx, pdf_document *doc) +{ + pdf_obj *root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); + pdf_obj *intents = pdf_dict_get(ctx, root, PDF_NAME(OutputIntents)); + pdf_obj *intent_dict; + pdf_obj *dest_profile; + fz_colorspace *cs = NULL; + + /* An array of intents */ + if (!intents) + return NULL; + + /* For now, always just use the first intent. 
I have never even seen a file + * with multiple intents but it could happen */ + intent_dict = pdf_array_get(ctx, intents, 0); + if (!intent_dict) + return NULL; + dest_profile = pdf_dict_get(ctx, intent_dict, PDF_NAME(DestOutputProfile)); + if (!dest_profile) + return NULL; + + fz_var(cs); + + fz_try(ctx) + cs = load_icc_based(ctx, dest_profile, 0); + fz_catch(ctx) + { + /* Swallow the error */ + fz_warn(ctx, "Attempt to read Output Intent failed"); + } + + return cs; +} + +fz_colorspace * +pdf_document_output_intent(fz_context *ctx, pdf_document *doc) +{ +#ifndef NOICC + if (!doc->oi) + doc->oi = pdf_load_output_intent(ctx, doc); +#endif + return doc->oi; +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-font.c k2pdfopt-2.51+ds/mupdf_mod/pdf-font.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-font.c 2017-02-25 05:41:15.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-font.c 2018-11-21 02:42:17.000000000 +0000 @@ -1,7 +1,11 @@ +#include "mupdf/fitz.h" #include "mupdf/pdf.h" -/* willus mod */ +/* willus mod -- remove ../fitz/ */ #include "font-imp.h" +#include "fitz-imp.h" + +#include #include #include FT_FREETYPE_H @@ -17,7 +21,8 @@ #define FT_SFNT_HEAD ft_sfnt_head #endif -static void pdf_load_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *dict, char *collection, char *basefont, int iscidfont); +static void pdf_load_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *dict, + const char *collection, const char *basefont, int iscidfont); static const char *base_font_names[][10] = { @@ -50,7 +55,7 @@ { "ZapfDingbats", NULL } }; -const char * +const unsigned char * pdf_lookup_substitute_font(fz_context *ctx, int mono, int serif, int bold, int italic, int *len) { if (mono) { @@ -129,7 +134,7 @@ if (!font->buffer) return 0; fz_buffer_storage(ctx, font->buffer, &data); - return fz_lookup_base14_font(ctx, clean_font_name(font->name), &size) == (char*)data; + return fz_lookup_base14_font(ctx, 
clean_font_name(font->name), &size) == data; } /* @@ -179,9 +184,9 @@ return gid; } -static int ft_name_index(FT_Face face, char *name) +static int ft_name_index(FT_Face face, const char *name) { - int code = FT_Get_Name_Index(face, name); + int code = FT_Get_Name_Index(face, (char*)name); if (code == 0) { int unicode = pdf_lookup_agl(name); @@ -291,19 +296,48 @@ int gid = ft_cid_to_gid(fontdesc, cid); FT_Fixed adv; int fterr; + FT_Face face = fontdesc->font->ft_face; + FT_UShort units_per_EM; - fterr = FT_Get_Advance(fontdesc->font->ft_face, gid, mask, &adv); + fterr = FT_Get_Advance(face, gid, mask, &adv); if (fterr) { fz_warn(ctx, "freetype advance glyph (gid %d): %s", gid, ft_error_string(fterr)); return 0; } - return adv * 1000 / ((FT_Face)fontdesc->font->ft_face)->units_per_EM; -} + units_per_EM = face->units_per_EM; + if (units_per_EM == 0) + units_per_EM = 2048; + + return adv * 1000 / units_per_EM; +} + +static const struct { int code; const char *name; } mre_diff_table[] = +{ + { 173, "notequal" }, + { 176, "infinity" }, + { 178, "lessequal" }, + { 179, "greaterequal" }, + { 182, "partialdiff" }, + { 183, "summation" }, + { 184, "product" }, + { 185, "pi" }, + { 186, "integral" }, + { 189, "Omega" }, + { 195, "radical" }, + { 197, "approxequal" }, + { 198, "Delta" }, + { 215, "lozenge" }, + { 219, "Euro" }, + { 240, "apple" }, +}; -static int lookup_mre_code(char *name) +static int lookup_mre_code(const char *name) { int i; + for (i = 0; i < nelem(mre_diff_table); ++i) + if (!strcmp(name, mre_diff_table[i].name)) + return mre_diff_table[i].code; for (i = 0; i < 256; i++) if (pdf_mac_roman[i] && !strcmp(name, pdf_mac_roman[i])) return i; @@ -315,7 +349,7 @@ */ static void -pdf_load_builtin_font(fz_context *ctx, pdf_font_desc *fontdesc, char *fontname, int has_descriptor) +pdf_load_builtin_font(fz_context *ctx, pdf_font_desc *fontdesc, const char *fontname, int has_descriptor) { FT_Face face; const char *clean_name = clean_font_name(fontname); @@ -323,7 
+357,7 @@ fontdesc->font = fz_load_system_font(ctx, fontname, 0, 0, !has_descriptor); if (!fontdesc->font) { - const char *data; + const unsigned char *data; int len; data = fz_lookup_base14_font(ctx, clean_name, &len); @@ -343,12 +377,12 @@ } static void -pdf_load_substitute_font(fz_context *ctx, pdf_font_desc *fontdesc, char *fontname, int mono, int serif, int bold, int italic) +pdf_load_substitute_font(fz_context *ctx, pdf_font_desc *fontdesc, const char *fontname, int mono, int serif, int bold, int italic) { fontdesc->font = fz_load_system_font(ctx, fontname, bold, italic, 0); if (!fontdesc->font) { - const char *data; + const unsigned char *data; int len; data = pdf_lookup_substitute_font(ctx, mono, serif, bold, italic, &len); @@ -370,21 +404,21 @@ } static void -pdf_load_substitute_cjk_font(fz_context *ctx, pdf_font_desc *fontdesc, char *fontname, int ros, int serif) +pdf_load_substitute_cjk_font(fz_context *ctx, pdf_font_desc *fontdesc, const char *fontname, int ros, int serif) { fontdesc->font = fz_load_system_cjk_font(ctx, fontname, ros, serif); if (!fontdesc->font) { - const char *data; - int len; - int index; + const unsigned char *data; + int size; + int subfont; - data = fz_lookup_cjk_font(ctx, ros, serif, fontdesc->wmode, &len, &index); + data = fz_lookup_cjk_font(ctx, ros, &size, &subfont); if (!data) fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find builtin CJK font"); /* A glyph bbox cache is too big for CJK fonts. 
*/ - fontdesc->font = fz_new_font_from_memory(ctx, fontname, data, len, index, 0); + fontdesc->font = fz_new_font_from_memory(ctx, fontname, data, size, subfont, 0); } fontdesc->font->flags.ft_substitute = 1; @@ -392,7 +426,7 @@ } static void -pdf_load_system_font(fz_context *ctx, pdf_font_desc *fontdesc, char *fontname, char *collection) +pdf_load_system_font(fz_context *ctx, pdf_font_desc *fontdesc, const char *fontname, const char *collection) { int bold = 0; int italic = 0; @@ -418,13 +452,13 @@ if (collection) { if (!strcmp(collection, "Adobe-CNS1")) - pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_CNS_1, serif); + pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_CNS, serif); else if (!strcmp(collection, "Adobe-GB1")) - pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_GB_1, serif); + pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_GB, serif); else if (!strcmp(collection, "Adobe-Japan1")) - pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_JAPAN_1, serif); + pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_JAPAN, serif); else if (!strcmp(collection, "Adobe-Korea1")) - pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_KOREA_1, serif); + pdf_load_substitute_cjk_font(ctx, fontdesc, fontname, FZ_ADOBE_KOREA, serif); else { if (strcmp(collection, "Adobe-Identity") != 0) @@ -439,7 +473,7 @@ } static void -pdf_load_embedded_font(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, char *fontname, pdf_obj *stmref) +pdf_load_embedded_font(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, const char *fontname, pdf_obj *stmref) { fz_buffer *buf; @@ -543,8 +577,59 @@ * Simple fonts (Type1 and TrueType) */ +static FT_CharMap +select_type1_cmap(FT_Face face) +{ + int i; + for (i = 0; i < face->num_charmaps; i++) + if (face->charmaps[i]->platform_id == 7) + return face->charmaps[i]; + if (face->num_charmaps > 0) + return face->charmaps[0]; + return NULL; +} + 
+static FT_CharMap +select_truetype_cmap(FT_Face face, int symbolic) +{ + int i; + + /* First look for a Microsoft symbolic cmap, if applicable */ + if (symbolic) + { + for (i = 0; i < face->num_charmaps; i++) + if (face->charmaps[i]->platform_id == 3 && face->charmaps[i]->encoding_id == 0) + return face->charmaps[i]; + } + + /* Then look for a Microsoft Unicode cmap */ + for (i = 0; i < face->num_charmaps; i++) + if (face->charmaps[i]->platform_id == 3 && face->charmaps[i]->encoding_id == 1) + if (FT_Get_CMap_Format(face->charmaps[i]) != -1) + return face->charmaps[i]; + + /* Finally look for an Apple MacRoman cmap */ + for (i = 0; i < face->num_charmaps; i++) + if (face->charmaps[i]->platform_id == 1 && face->charmaps[i]->encoding_id == 0) + if (FT_Get_CMap_Format(face->charmaps[i]) != -1) + return face->charmaps[i]; + + if (face->num_charmaps > 0) + if (FT_Get_CMap_Format(face->charmaps[0]) != -1) + return face->charmaps[0]; + return NULL; +} + +static FT_CharMap +select_unknown_cmap(FT_Face face) +{ + if (face->num_charmaps > 0) + return face->charmaps[0]; + return NULL; +} + static pdf_font_desc * -pdf_load_simple_font_by_name(fz_context *ctx, pdf_document *doc, pdf_obj *dict, char *basefont) +pdf_load_simple_font_by_name(fz_context *ctx, pdf_document *doc, pdf_obj *dict, const char *basefont) { pdf_obj *descriptor; pdf_obj *encoding; @@ -558,7 +643,7 @@ int kind; int glyph; - char *estrings[256]; + const char *estrings[256]; char ebuffer[256][32]; int i, k, n; int fterr; @@ -573,17 +658,17 @@ { fontdesc = pdf_new_font_desc(ctx); - descriptor = pdf_dict_get(ctx, dict, PDF_NAME_FontDescriptor); + descriptor = pdf_dict_get(ctx, dict, PDF_NAME(FontDescriptor)); if (descriptor) pdf_load_font_descriptor(ctx, doc, fontdesc, descriptor, NULL, basefont, 0); else pdf_load_builtin_font(ctx, fontdesc, basefont, 0); /* Some chinese documents mistakenly consider WinAnsiEncoding to be codepage 936 */ - if (descriptor && pdf_is_string(ctx, pdf_dict_get(ctx, descriptor, 
PDF_NAME_FontName)) && - !pdf_dict_get(ctx, dict, PDF_NAME_ToUnicode) && - pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Encoding), PDF_NAME_WinAnsiEncoding) && - pdf_to_int(ctx, pdf_dict_get(ctx, descriptor, PDF_NAME_Flags)) == 4) + if (descriptor && pdf_is_string(ctx, pdf_dict_get(ctx, descriptor, PDF_NAME(FontName))) && + !pdf_dict_get(ctx, dict, PDF_NAME(ToUnicode)) && + pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Encoding)), PDF_NAME(WinAnsiEncoding)) && + pdf_dict_get_int(ctx, descriptor, PDF_NAME(Flags)) == 4) { char *cp936fonts[] = { "\xCB\xCE\xCC\xE5", "SimSun,Regular", @@ -618,31 +703,12 @@ symbolic = fontdesc->flags & 4; - if (face->num_charmaps > 0) - cmap = face->charmaps[0]; + if (kind == TYPE1) + cmap = select_type1_cmap(face); + else if (kind == TRUETYPE) + cmap = select_truetype_cmap(face, symbolic); else - cmap = NULL; - - for (i = 0; i < face->num_charmaps; i++) - { - FT_CharMap test = face->charmaps[i]; - - if (kind == TYPE1) - { - if (test->platform_id == 7) - cmap = test; - } - - if (kind == TRUETYPE) - { - if (test->platform_id == 1 && test->encoding_id == 0) - cmap = test; - if (test->platform_id == 3 && test->encoding_id == 1) - cmap = test; - if (symbolic && test->platform_id == 3 && test->encoding_id == 0) - cmap = test; - } - } + cmap = select_unknown_cmap(face); if (cmap) { @@ -661,7 +727,7 @@ etable[i] = 0; } - encoding = pdf_dict_get(ctx, dict, PDF_NAME_Encoding); + encoding = pdf_dict_get(ctx, dict, PDF_NAME(Encoding)); if (encoding) { if (pdf_is_name(ctx, encoding)) @@ -671,13 +737,13 @@ { pdf_obj *base, *diff, *item; - base = pdf_dict_get(ctx, encoding, PDF_NAME_BaseEncoding); + base = pdf_dict_get(ctx, encoding, PDF_NAME(BaseEncoding)); if (pdf_is_name(ctx, base)) pdf_load_encoding(estrings, pdf_to_name(ctx, base)); else if (!fontdesc->is_embedded && !symbolic) pdf_load_encoding(estrings, "StandardEncoding"); - diff = pdf_dict_get(ctx, encoding, PDF_NAME_Differences); + diff = pdf_dict_get(ctx, encoding, 
PDF_NAME(Differences)); if (pdf_is_array(ctx, diff)) { n = pdf_array_len(ctx, diff); @@ -704,16 +770,16 @@ has_lock = 1; /* built-in and substitute fonts may be a different type than what the document expects */ - subtype = pdf_dict_get(ctx, dict, PDF_NAME_Subtype); - if (pdf_name_eq(ctx, subtype, PDF_NAME_Type1)) + subtype = pdf_dict_get(ctx, dict, PDF_NAME(Subtype)); + if (pdf_name_eq(ctx, subtype, PDF_NAME(Type1))) kind = TYPE1; - else if (pdf_name_eq(ctx, subtype, PDF_NAME_MMType1)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(MMType1))) kind = TYPE1; - else if (pdf_name_eq(ctx, subtype, PDF_NAME_TrueType)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(TrueType))) kind = TRUETYPE; - else if (pdf_name_eq(ctx, subtype, PDF_NAME_CIDFontType0)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(CIDFontType0))) kind = TYPE1; - else if (pdf_name_eq(ctx, subtype, PDF_NAME_CIDFontType2)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(CIDFontType2))) kind = TRUETYPE; /* encode by glyph name where we can */ @@ -822,7 +888,7 @@ fz_try(ctx) { - pdf_load_to_unicode(ctx, doc, fontdesc, estrings, NULL, pdf_dict_get(ctx, dict, PDF_NAME_ToUnicode)); + pdf_load_to_unicode(ctx, doc, fontdesc, estrings, NULL, pdf_dict_get(ctx, dict, PDF_NAME(ToUnicode))); } fz_catch(ctx) { @@ -836,20 +902,20 @@ pdf_set_default_hmtx(ctx, fontdesc, fontdesc->missing_width); - widths = pdf_dict_get(ctx, dict, PDF_NAME_Widths); + widths = pdf_dict_get(ctx, dict, PDF_NAME(Widths)); if (widths) { int first, last; - first = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_FirstChar)); - last = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_LastChar)); + first = pdf_dict_get_int(ctx, dict, PDF_NAME(FirstChar)); + last = pdf_dict_get_int(ctx, dict, PDF_NAME(LastChar)); if (first < 0 || last > 255 || first > last) first = last = 0; for (i = 0; i < last - first + 1; i++) { - int wid = pdf_to_int(ctx, pdf_array_get(ctx, widths, i)); + int wid = pdf_array_get_int(ctx, widths, i); pdf_add_hmtx(ctx, fontdesc, i + 
first, i + first, wid); } } @@ -876,8 +942,7 @@ static pdf_font_desc * pdf_load_simple_font(fz_context *ctx, pdf_document *doc, pdf_obj *dict) { - char *basefont = pdf_to_name(ctx, pdf_dict_get(ctx, dict, PDF_NAME_BaseFont)); - + const char *basefont = pdf_to_name(ctx, pdf_dict_get(ctx, dict, PDF_NAME(BaseFont))); return pdf_load_simple_font_by_name(ctx, doc, dict, basefont); } @@ -907,18 +972,21 @@ } static void -hail_mary_print_key(fz_context *ctx, fz_output *out, void *key_) +hail_mary_format_key(fz_context *ctx, char *s, int n, void *key_) { - fz_printf(ctx, out, "hail mary "); + fz_strlcpy(s, "(hail mary font)", n); } -static fz_store_type hail_mary_store_type = +static int hail_mary_store_key; /* Dummy */ + +static const fz_store_type hail_mary_store_type = { hail_mary_make_hash_key, hail_mary_keep_key, hail_mary_drop_key, hail_mary_cmp_key, - hail_mary_print_key + hail_mary_format_key, + NULL }; pdf_font_desc * @@ -927,7 +995,7 @@ pdf_font_desc *fontdesc; pdf_font_desc *existing; - if ((fontdesc = fz_find_item(ctx, pdf_drop_font_imp, &hail_mary_store_type, &hail_mary_store_type)) != NULL) + if ((fontdesc = fz_find_item(ctx, pdf_drop_font_imp, &hail_mary_store_key, &hail_mary_store_type)) != NULL) { return fontdesc; } @@ -935,7 +1003,7 @@ /* FIXME: Get someone with a clue about fonts to fix this */ fontdesc = pdf_load_simple_font_by_name(ctx, doc, NULL, "Helvetica"); - existing = fz_store_item(ctx, &hail_mary_store_type, fontdesc, fontdesc->size, &hail_mary_store_type); + existing = fz_store_item(ctx, &hail_mary_store_key, fontdesc, fontdesc->size, &hail_mary_store_type); assert(existing == NULL); (void)existing; /* Silence warning in release builds */ @@ -955,7 +1023,7 @@ pdf_cmap *cmap; FT_Face face; char collection[256]; - char *basefont; + const char *basefont; int i, k, fterr; pdf_obj *cidtogidmap; pdf_obj *obj; @@ -967,42 +1035,26 @@ { /* Get font name and CID collection */ - basefont = pdf_to_name(ctx, pdf_dict_get(ctx, dict, PDF_NAME_BaseFont)); + 
basefont = pdf_to_name(ctx, pdf_dict_get(ctx, dict, PDF_NAME(BaseFont))); { pdf_obj *cidinfo; - char tmpstr[64]; - int tmplen; + const char *reg, *ord; - cidinfo = pdf_dict_get(ctx, dict, PDF_NAME_CIDSystemInfo); + cidinfo = pdf_dict_get(ctx, dict, PDF_NAME(CIDSystemInfo)); if (!cidinfo) - fz_throw(ctx, FZ_ERROR_GENERIC, "cid font is missing info"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "cid font is missing info"); - obj = pdf_dict_get(ctx, cidinfo, PDF_NAME_Registry); - tmplen = fz_mini(sizeof tmpstr - 1, pdf_to_str_len(ctx, obj)); - memcpy(tmpstr, pdf_to_str_buf(ctx, obj), tmplen); - tmpstr[tmplen] = '\0'; - fz_strlcpy(collection, tmpstr, sizeof collection); - - fz_strlcat(collection, "-", sizeof collection); - - obj = pdf_dict_get(ctx, cidinfo, PDF_NAME_Ordering); - tmplen = fz_mini(sizeof tmpstr - 1, pdf_to_str_len(ctx, obj)); - memcpy(tmpstr, pdf_to_str_buf(ctx, obj), tmplen); - tmpstr[tmplen] = '\0'; - fz_strlcat(collection, tmpstr, sizeof collection); + reg = pdf_dict_get_string(ctx, cidinfo, PDF_NAME(Registry), NULL); + ord = pdf_dict_get_string(ctx, cidinfo, PDF_NAME(Ordering), NULL); + fz_snprintf(collection, sizeof collection, "%s-%s", reg, ord); } /* Encoding */ if (pdf_is_name(ctx, encoding)) { - if (pdf_name_eq(ctx, encoding, PDF_NAME_Identity_H)) - cmap = pdf_new_identity_cmap(ctx, 0, 2); - else if (pdf_name_eq(ctx, encoding, PDF_NAME_Identity_V)) - cmap = pdf_new_identity_cmap(ctx, 1, 2); - else - cmap = pdf_load_system_cmap(ctx, pdf_to_name(ctx, encoding)); + cmap = pdf_load_system_cmap(ctx, pdf_to_name(ctx, encoding)); } else if (pdf_is_indirect(ctx, encoding)) { @@ -1010,7 +1062,7 @@ } else { - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: font missing encoding"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "font missing encoding"); } /* Load font file */ @@ -1022,17 +1074,17 @@ pdf_set_font_wmode(ctx, fontdesc, pdf_cmap_wmode(ctx, fontdesc->encoding)); - descriptor = pdf_dict_get(ctx, dict, PDF_NAME_FontDescriptor); + descriptor = pdf_dict_get(ctx, dict, 
PDF_NAME(FontDescriptor)); if (!descriptor) - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: missing font descriptor"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "missing font descriptor"); pdf_load_font_descriptor(ctx, doc, fontdesc, descriptor, collection, basefont, 1); face = fontdesc->font->ft_face; /* Apply encoding */ - cidtogidmap = pdf_dict_get(ctx, dict, PDF_NAME_CIDToGIDMap); - if (pdf_is_indirect(ctx, cidtogidmap)) + cidtogidmap = pdf_dict_get(ctx, dict, PDF_NAME(CIDToGIDMap)); + if (pdf_is_stream(ctx, cidtogidmap)) { fz_buffer *buf; size_t z, len; @@ -1049,6 +1101,10 @@ fz_drop_buffer(ctx, buf); } + else if (cidtogidmap && !pdf_name_eq(ctx, PDF_NAME(Identity), cidtogidmap)) + { + fz_warn(ctx, "ignoring unknown CIDToGIDMap entry"); + } /* if font is external, cidtogidmap should not be identity */ /* so we map from cid to unicode and then map that through the (3 1) */ @@ -1087,12 +1143,12 @@ /* Horizontal */ dw = 1000; - obj = pdf_dict_get(ctx, dict, PDF_NAME_DW); + obj = pdf_dict_get(ctx, dict, PDF_NAME(DW)); if (obj) dw = pdf_to_int(ctx, obj); pdf_set_default_hmtx(ctx, fontdesc, dw); - widths = pdf_dict_get(ctx, dict, PDF_NAME_W); + widths = pdf_dict_get(ctx, dict, PDF_NAME(W)); if (widths) { int c0, c1, w, n, m; @@ -1100,14 +1156,14 @@ n = pdf_array_len(ctx, widths); for (i = 0; i < n; ) { - c0 = pdf_to_int(ctx, pdf_array_get(ctx, widths, i)); + c0 = pdf_array_get_int(ctx, widths, i); obj = pdf_array_get(ctx, widths, i + 1); if (pdf_is_array(ctx, obj)) { m = pdf_array_len(ctx, obj); for (k = 0; k < m; k++) { - w = pdf_to_int(ctx, pdf_array_get(ctx, obj, k)); + w = pdf_array_get_int(ctx, obj, k); pdf_add_hmtx(ctx, fontdesc, c0 + k, c0 + k, w); } i += 2; @@ -1115,7 +1171,7 @@ else { c1 = pdf_to_int(ctx, obj); - w = pdf_to_int(ctx, pdf_array_get(ctx, widths, i + 2)); + w = pdf_array_get_int(ctx, widths, i + 2); pdf_add_hmtx(ctx, fontdesc, c0, c1, w); i += 3; } @@ -1131,16 +1187,16 @@ int dw2y = 880; int dw2w = -1000; - obj = pdf_dict_get(ctx, dict, PDF_NAME_DW2); + 
obj = pdf_dict_get(ctx, dict, PDF_NAME(DW2)); if (obj) { - dw2y = pdf_to_int(ctx, pdf_array_get(ctx, obj, 0)); - dw2w = pdf_to_int(ctx, pdf_array_get(ctx, obj, 1)); + dw2y = pdf_array_get_int(ctx, obj, 0); + dw2w = pdf_array_get_int(ctx, obj, 1); } pdf_set_default_vmtx(ctx, fontdesc, dw2y, dw2w); - widths = pdf_dict_get(ctx, dict, PDF_NAME_W2); + widths = pdf_dict_get(ctx, dict, PDF_NAME(W2)); if (widths) { int c0, c1, w, x, y, n; @@ -1148,16 +1204,16 @@ n = pdf_array_len(ctx, widths); for (i = 0; i < n; ) { - c0 = pdf_to_int(ctx, pdf_array_get(ctx, widths, i)); + c0 = pdf_array_get_int(ctx, widths, i); obj = pdf_array_get(ctx, widths, i + 1); if (pdf_is_array(ctx, obj)) { int m = pdf_array_len(ctx, obj); for (k = 0; k * 3 < m; k ++) { - w = pdf_to_int(ctx, pdf_array_get(ctx, obj, k * 3 + 0)); - x = pdf_to_int(ctx, pdf_array_get(ctx, obj, k * 3 + 1)); - y = pdf_to_int(ctx, pdf_array_get(ctx, obj, k * 3 + 2)); + w = pdf_array_get_int(ctx, obj, k * 3 + 0); + x = pdf_array_get_int(ctx, obj, k * 3 + 1); + y = pdf_array_get_int(ctx, obj, k * 3 + 2); pdf_add_vmtx(ctx, fontdesc, c0 + k, c0 + k, x, y, w); } i += 2; @@ -1165,9 +1221,9 @@ else { c1 = pdf_to_int(ctx, obj); - w = pdf_to_int(ctx, pdf_array_get(ctx, widths, i + 2)); - x = pdf_to_int(ctx, pdf_array_get(ctx, widths, i + 3)); - y = pdf_to_int(ctx, pdf_array_get(ctx, widths, i + 4)); + w = pdf_array_get_int(ctx, widths, i + 2); + x = pdf_array_get_int(ctx, widths, i + 3); + y = pdf_array_get_int(ctx, widths, i + 4); pdf_add_vmtx(ctx, fontdesc, c0, c1, x, y, w); i += 5; } @@ -1195,21 +1251,21 @@ pdf_obj *encoding; pdf_obj *to_unicode; - dfonts = pdf_dict_get(ctx, dict, PDF_NAME_DescendantFonts); + dfonts = pdf_dict_get(ctx, dict, PDF_NAME(DescendantFonts)); if (!dfonts) - fz_throw(ctx, FZ_ERROR_GENERIC, "cid font is missing descendant fonts"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "cid font is missing descendant fonts"); dfont = pdf_array_get(ctx, dfonts, 0); - subtype = pdf_dict_get(ctx, dfont, PDF_NAME_Subtype); - 
encoding = pdf_dict_get(ctx, dict, PDF_NAME_Encoding); - to_unicode = pdf_dict_get(ctx, dict, PDF_NAME_ToUnicode); + subtype = pdf_dict_get(ctx, dfont, PDF_NAME(Subtype)); + encoding = pdf_dict_get(ctx, dict, PDF_NAME(Encoding)); + to_unicode = pdf_dict_get(ctx, dict, PDF_NAME(ToUnicode)); - if (pdf_is_name(ctx, subtype) && pdf_name_eq(ctx, subtype, PDF_NAME_CIDFontType0)) + if (pdf_is_name(ctx, subtype) && pdf_name_eq(ctx, subtype, PDF_NAME(CIDFontType0))) return load_cid_font(ctx, doc, dfont, encoding, to_unicode); - if (pdf_is_name(ctx, subtype) && pdf_name_eq(ctx, subtype, PDF_NAME_CIDFontType2)) + if (pdf_is_name(ctx, subtype) && pdf_name_eq(ctx, subtype, PDF_NAME(CIDFontType2))) return load_cid_font(ctx, doc, dfont, encoding, to_unicode); - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: unknown cid font type"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown cid font type"); } /* @@ -1217,26 +1273,27 @@ */ static void -pdf_load_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *dict, char *collection, char *basefont, int iscidfont) +pdf_load_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *dict, + const char *collection, const char *basefont, int iscidfont) { pdf_obj *obj1, *obj2, *obj3, *obj; - char *fontname; + const char *fontname; FT_Face face; /* Prefer BaseFont; don't bother with FontName */ fontname = basefont; - fontdesc->flags = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Flags)); - fontdesc->italic_angle = pdf_to_real(ctx, pdf_dict_get(ctx, dict, PDF_NAME_ItalicAngle)); - fontdesc->ascent = pdf_to_real(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Ascent)); - fontdesc->descent = pdf_to_real(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Descent)); - fontdesc->cap_height = pdf_to_real(ctx, pdf_dict_get(ctx, dict, PDF_NAME_CapHeight)); - fontdesc->x_height = pdf_to_real(ctx, pdf_dict_get(ctx, dict, PDF_NAME_XHeight)); - fontdesc->missing_width = pdf_to_real(ctx, pdf_dict_get(ctx, dict, 
PDF_NAME_MissingWidth)); - - obj1 = pdf_dict_get(ctx, dict, PDF_NAME_FontFile); - obj2 = pdf_dict_get(ctx, dict, PDF_NAME_FontFile2); - obj3 = pdf_dict_get(ctx, dict, PDF_NAME_FontFile3); + fontdesc->flags = pdf_dict_get_int(ctx, dict, PDF_NAME(Flags)); + fontdesc->italic_angle = pdf_dict_get_real(ctx, dict, PDF_NAME(ItalicAngle)); + fontdesc->ascent = pdf_dict_get_real(ctx, dict, PDF_NAME(Ascent)); + fontdesc->descent = pdf_dict_get_real(ctx, dict, PDF_NAME(Descent)); + fontdesc->cap_height = pdf_dict_get_real(ctx, dict, PDF_NAME(CapHeight)); + fontdesc->x_height = pdf_dict_get_real(ctx, dict, PDF_NAME(XHeight)); + fontdesc->missing_width = pdf_dict_get_real(ctx, dict, PDF_NAME(MissingWidth)); + + obj1 = pdf_dict_get(ctx, dict, PDF_NAME(FontFile)); + obj2 = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2)); + obj3 = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3)); obj = obj1 ? obj1 : obj2 ? obj2 : obj3; if (pdf_is_indirect(ctx, obj)) @@ -1267,8 +1324,9 @@ face = fontdesc->font->ft_face; if (ft_kind(face) == TRUETYPE) { - if (FT_IS_TRICKY(face) || is_dynalab(fontdesc->font->name)) - fontdesc->font->flags.force_hinting = 1; + /* FreeType's own 'tricky' font detection needs a bit of help */ + if (is_dynalab(fontdesc->font->name)) + face->face_flags |= FT_FACE_FLAG_TRICKY; if (fontdesc->ascent == 0.0f) fontdesc->ascent = 1000.0f * face->ascender / face->units_per_EM; @@ -1322,32 +1380,35 @@ } pdf_font_desc * -pdf_load_font(fz_context *ctx, pdf_document *doc, pdf_obj *rdb, pdf_obj *dict, int nested_depth) +pdf_load_font(fz_context *ctx, pdf_document *doc, pdf_obj *rdb, pdf_obj *dict) { pdf_obj *subtype; pdf_obj *dfonts; pdf_obj *charprocs; - pdf_font_desc *fontdesc; + pdf_font_desc *fontdesc = NULL; int type3 = 0; + if (pdf_obj_marked(ctx, dict)) + fz_throw(ctx, FZ_ERROR_SYNTAX, "Recursive Type3 font definition."); + if ((fontdesc = pdf_find_item(ctx, pdf_drop_font_imp, dict)) != NULL) { return fontdesc; } - subtype = pdf_dict_get(ctx, dict, PDF_NAME_Subtype); - dfonts = 
pdf_dict_get(ctx, dict, PDF_NAME_DescendantFonts); - charprocs = pdf_dict_get(ctx, dict, PDF_NAME_CharProcs); + subtype = pdf_dict_get(ctx, dict, PDF_NAME(Subtype)); + dfonts = pdf_dict_get(ctx, dict, PDF_NAME(DescendantFonts)); + charprocs = pdf_dict_get(ctx, dict, PDF_NAME(CharProcs)); - if (pdf_name_eq(ctx, subtype, PDF_NAME_Type0)) + if (pdf_name_eq(ctx, subtype, PDF_NAME(Type0))) fontdesc = pdf_load_type0_font(ctx, doc, dict); - else if (pdf_name_eq(ctx, subtype, PDF_NAME_Type1)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(Type1))) fontdesc = pdf_load_simple_font(ctx, doc, dict); - else if (pdf_name_eq(ctx, subtype, PDF_NAME_MMType1)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(MMType1))) fontdesc = pdf_load_simple_font(ctx, doc, dict); - else if (pdf_name_eq(ctx, subtype, PDF_NAME_TrueType)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(TrueType))) fontdesc = pdf_load_simple_font(ctx, doc, dict); - else if (pdf_name_eq(ctx, subtype, PDF_NAME_Type3)) + else if (pdf_name_eq(ctx, subtype, PDF_NAME(Type3))) { fontdesc = pdf_load_type3_font(ctx, doc, rdb, dict); type3 = 1; @@ -1369,13 +1430,25 @@ fontdesc = pdf_load_simple_font(ctx, doc, dict); } - /* Create glyph width table for stretching substitute fonts and text extraction. */ - pdf_make_width_table(ctx, fontdesc); + pdf_mark_obj(ctx, dict); + fz_try(ctx) + { + /* Create glyph width table for stretching substitute fonts and text extraction. 
*/ + pdf_make_width_table(ctx, fontdesc); - pdf_store_item(ctx, dict, fontdesc, fontdesc->size); + /* Load CharProcs */ + if (type3) + pdf_load_type3_glyphs(ctx, doc, fontdesc); - if (type3) - pdf_load_type3_glyphs(ctx, doc, fontdesc, nested_depth); + pdf_store_item(ctx, dict, fontdesc, fontdesc->size); + } + fz_always(ctx) + pdf_unmark_obj(ctx, dict); + fz_catch(ctx) + { + pdf_drop_font(ctx, fontdesc); + fz_rethrow(ctx); + } return fontdesc; } @@ -1385,75 +1458,34 @@ { int i; - fz_printf(ctx, out, "fontdesc {\n"); + fz_write_printf(ctx, out, "fontdesc {\n"); if (fontdesc->font->ft_face) - fz_printf(ctx, out, "\tfreetype font\n"); + fz_write_printf(ctx, out, "\tfreetype font\n"); if (fontdesc->font->t3procs) - fz_printf(ctx, out, "\ttype3 font\n"); + fz_write_printf(ctx, out, "\ttype3 font\n"); - fz_printf(ctx, out, "\twmode %d\n", fontdesc->wmode); - fz_printf(ctx, out, "\tDW %d\n", fontdesc->dhmtx.w); + fz_write_printf(ctx, out, "\twmode %d\n", fontdesc->wmode); + fz_write_printf(ctx, out, "\tDW %d\n", fontdesc->dhmtx.w); - fz_printf(ctx, out, "\tW {\n"); + fz_write_printf(ctx, out, "\tW {\n"); for (i = 0; i < fontdesc->hmtx_len; i++) - fz_printf(ctx, out, "\t\t<%04x> <%04x> %d\n", + fz_write_printf(ctx, out, "\t\t<%04x> <%04x> %d\n", fontdesc->hmtx[i].lo, fontdesc->hmtx[i].hi, fontdesc->hmtx[i].w); - fz_printf(ctx, out, "\t}\n"); + fz_write_printf(ctx, out, "\t}\n"); if (fontdesc->wmode) { - fz_printf(ctx, out, "\tDW2 [%d %d]\n", fontdesc->dvmtx.y, fontdesc->dvmtx.w); - fz_printf(ctx, out, "\tW2 {\n"); + fz_write_printf(ctx, out, "\tDW2 [%d %d]\n", fontdesc->dvmtx.y, fontdesc->dvmtx.w); + fz_write_printf(ctx, out, "\tW2 {\n"); for (i = 0; i < fontdesc->vmtx_len; i++) - fz_printf(ctx, out, "\t\t<%04x> <%04x> %d %d %d\n", fontdesc->vmtx[i].lo, fontdesc->vmtx[i].hi, + fz_write_printf(ctx, out, "\t\t<%04x> <%04x> %d %d %d\n", fontdesc->vmtx[i].lo, fontdesc->vmtx[i].hi, fontdesc->vmtx[i].x, fontdesc->vmtx[i].y, fontdesc->vmtx[i].w); - fz_printf(ctx, out, "\t}\n"); + 
fz_write_printf(ctx, out, "\t}\n"); } } -fz_rect *pdf_measure_text(fz_context *ctx, pdf_font_desc *fontdesc, unsigned char *buf, size_t len, fz_rect *acc) -{ - size_t i; - int w = 0; - - for (i = 0; i < len; i++) - w += pdf_lookup_hmtx(ctx, fontdesc, buf[i]).w; - - acc->x0 = 0; - acc->x1 = w / 1000.0f; - acc->y0 = fontdesc->descent / 1000.0f; - acc->y1 = fontdesc->ascent / 1000.0f; - - return acc; -} - -float pdf_text_stride(fz_context *ctx, pdf_font_desc *fontdesc, float fontsize, unsigned char *buf, size_t len, float room, size_t *count) -{ - pdf_hmtx h; - size_t i = 0; - float x = 0.0; - - while(i < len) - { - float span; - - h = pdf_lookup_hmtx(ctx, fontdesc, buf[i]); - - span = h.w * fontsize / 1000.0; - - if (x + span > room) - break; - - x += span; - i ++; - } - - if (count) - *count = i; - - return x; -} +/* Font creation */ static pdf_obj* pdf_add_font_file(fz_context *ctx, pdf_document *doc, fz_font *font) @@ -1473,21 +1505,21 @@ { size_t len = fz_buffer_storage(ctx, buf, NULL); obj = pdf_new_dict(ctx, doc, 3); - pdf_dict_put_drop(ctx, obj, PDF_NAME_Length1, pdf_new_int(ctx, doc, (int)len)); + pdf_dict_put_int(ctx, obj, PDF_NAME(Length1), (int)len); switch (ft_font_file_kind(font->ft_face)) { case 1: /* TODO: these may not be the correct values, but I doubt it matters */ - pdf_dict_put_drop(ctx, obj, PDF_NAME_Length2, pdf_new_int(ctx, doc, (int)len)); - pdf_dict_put_drop(ctx, obj, PDF_NAME_Length3, pdf_new_int(ctx, doc, 0)); + pdf_dict_put_int(ctx, obj, PDF_NAME(Length2), len); + pdf_dict_put_int(ctx, obj, PDF_NAME(Length3), 0); break; case 2: break; case 3: if (FT_Get_Sfnt_Table(font->ft_face, FT_SFNT_HEAD)) - pdf_dict_put_drop(ctx, obj, PDF_NAME_Subtype, PDF_NAME_OpenType); + pdf_dict_put(ctx, obj, PDF_NAME(Subtype), PDF_NAME(OpenType)); else - pdf_dict_put_drop(ctx, obj, PDF_NAME_Subtype, PDF_NAME_Type1C); + pdf_dict_put(ctx, obj, PDF_NAME(Subtype), PDF_NAME(Type1C)); break; } ref = pdf_add_object(ctx, doc, obj); @@ -1505,133 +1537,109 @@ return ref; } 
-static pdf_obj* -pdf_add_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *fileref) +static void +pdf_add_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, fz_font *font) { - pdf_obj *ref = NULL; - pdf_obj *fdobj = NULL; - pdf_obj *bbox = NULL; - - fz_font *font = fontdesc->font; FT_Face face = font->ft_face; + pdf_obj *fdobj = NULL; + pdf_obj *fileref; + fz_rect bbox; - fz_var(fdobj); - fz_var(bbox); - fz_var(ref); - + fdobj = pdf_new_dict(ctx, doc, 10); fz_try(ctx) { - fdobj = pdf_new_dict(ctx, doc, 10); - pdf_dict_put(ctx, fdobj, PDF_NAME_Type, PDF_NAME_FontDescriptor); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_FontName, pdf_new_name(ctx, doc, font->name)); - - bbox = pdf_new_array(ctx, doc, 4); - pdf_array_push_drop(ctx, bbox, pdf_new_real(ctx, doc, 1000.0f * font->bbox.x0)); - pdf_array_push_drop(ctx, bbox, pdf_new_real(ctx, doc, 1000.0f * font->bbox.y0)); - pdf_array_push_drop(ctx, bbox, pdf_new_real(ctx, doc, 1000.0f * font->bbox.x1)); - pdf_array_push_drop(ctx, bbox, pdf_new_real(ctx, doc, 1000.0f * font->bbox.y1)); - pdf_dict_put(ctx, fdobj, PDF_NAME_FontBBox, bbox); - - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_ItalicAngle, pdf_new_real(ctx, doc, fontdesc->italic_angle)); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_Ascent, pdf_new_real(ctx, doc, fontdesc->ascent)); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_Descent, pdf_new_real(ctx, doc, fontdesc->descent)); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_CapHeight, pdf_new_real(ctx, doc, fontdesc->cap_height)); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_StemV, pdf_new_real(ctx, doc, 80)); - pdf_dict_put_drop(ctx, fdobj, PDF_NAME_Flags, pdf_new_real(ctx, doc, fontdesc->flags)); + pdf_dict_put(ctx, fdobj, PDF_NAME(Type), PDF_NAME(FontDescriptor)); + pdf_dict_put_name(ctx, fdobj, PDF_NAME(FontName), font->name); + + bbox.x0 = font->bbox.x0 * 1000; + bbox.y0 = font->bbox.y0 * 1000; + bbox.x1 = font->bbox.x1 * 1000; + bbox.y1 = font->bbox.y1 * 1000; + 
pdf_dict_put_rect(ctx, fdobj, PDF_NAME(FontBBox), bbox); + + pdf_dict_put_int(ctx, fdobj, PDF_NAME(ItalicAngle), 0); + pdf_dict_put_int(ctx, fdobj, PDF_NAME(Ascent), face->ascender * 1000.0f / face->units_per_EM); + pdf_dict_put_int(ctx, fdobj, PDF_NAME(Descent), face->descender * 1000.0f / face->units_per_EM); + pdf_dict_put_int(ctx, fdobj, PDF_NAME(StemV), 80); + pdf_dict_put_int(ctx, fdobj, PDF_NAME(Flags), PDF_FD_NONSYMBOLIC); + fileref = pdf_add_font_file(ctx, doc, font); if (fileref) { switch (ft_font_file_kind(face)) { - case 1: pdf_dict_put(ctx, fdobj, PDF_NAME_FontFile, fileref); break; - case 2: pdf_dict_put(ctx, fdobj, PDF_NAME_FontFile2, fileref); break; - case 3: pdf_dict_put(ctx, fdobj, PDF_NAME_FontFile3, fileref); break; + default: + case 1: pdf_dict_put_drop(ctx, fdobj, PDF_NAME(FontFile), fileref); break; + case 2: pdf_dict_put_drop(ctx, fdobj, PDF_NAME(FontFile2), fileref); break; + case 3: pdf_dict_put_drop(ctx, fdobj, PDF_NAME(FontFile3), fileref); break; } } - ref = pdf_add_object(ctx, doc, fdobj); + pdf_dict_put_drop(ctx, fobj, PDF_NAME(FontDescriptor), pdf_add_object(ctx, doc, fdobj)); } fz_always(ctx) - { pdf_drop_obj(ctx, fdobj); - pdf_drop_obj(ctx, bbox); - } fz_catch(ctx) - { - pdf_drop_obj(ctx, ref); fz_rethrow(ctx); - } - return ref; } -static pdf_obj* -pdf_add_simple_font_widths(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, - int *first_char, int *last_char) +static void +pdf_add_simple_font_widths(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, fz_font *font, const char * const encoding[]) { int width_table[256]; - pdf_obj *arr; - int i; + pdf_obj *widths; + int i, first, last; - *first_char = 0; - *last_char = 0; + first = 0; + last = 0; for (i = 0; i < 256; ++i) { - int glyph = fz_encode_character(ctx, fontdesc->font, i); + int glyph = 0; + if (encoding[i]) + { + glyph = ft_name_index(font->ft_face, encoding[i]); + if (glyph == 0) + glyph = ft_char_index(font->ft_face, pdf_lookup_agl(encoding[i])); + } if (glyph > 
0) { - if (!*first_char) - *first_char = i; - *last_char = i; - width_table[i] = fz_advance_glyph(ctx, fontdesc->font, glyph, 0) * 1000; + if (!first) + first = i; + last = i; + width_table[i] = fz_advance_glyph(ctx, font, glyph, 0) * 1000; } else width_table[i] = 0; } - arr = pdf_new_array(ctx, doc, *last_char - *first_char + 1); - fz_try(ctx) - { - for (i = *first_char; i <= *last_char; i++) - pdf_array_push_drop(ctx, arr, pdf_new_int(ctx, doc, width_table[i])); - } - fz_catch(ctx) - { - pdf_drop_obj(ctx, arr); - fz_rethrow(ctx); - } - - return pdf_add_object_drop(ctx, doc, arr); + widths = pdf_new_array(ctx, doc, last - first + 1); + pdf_dict_put_drop(ctx, fobj, PDF_NAME(Widths), widths); + for (i = first; i <= last; ++i) + pdf_array_push_int(ctx, widths, width_table[i]); + pdf_dict_put_int(ctx, fobj, PDF_NAME(FirstChar), first); + pdf_dict_put_int(ctx, fobj, PDF_NAME(LastChar), last); } -static pdf_obj* -pdf_add_cid_system_info(fz_context *ctx, pdf_document *doc) +static void +pdf_add_cid_system_info(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, const char *reg, const char *ord, int supp) { - pdf_obj *fobj = pdf_new_dict(ctx, doc, 3); - fz_try(ctx) - { - pdf_dict_put_drop(ctx, fobj, PDF_NAME_Ordering, pdf_new_string(ctx, doc, "Identity", strlen("Identity"))); - pdf_dict_put_drop(ctx, fobj, PDF_NAME_Registry, pdf_new_string(ctx, doc, "Adobe", strlen("Adobe"))); - pdf_dict_put_drop(ctx, fobj, PDF_NAME_Supplement, pdf_new_int(ctx, doc, 0)); - } - fz_catch(ctx) - { - pdf_drop_obj(ctx, fobj); - fz_rethrow(ctx); - } - return fobj; + pdf_obj *csi = pdf_dict_put_dict(ctx, fobj, PDF_NAME(CIDSystemInfo), 3); + pdf_dict_put_string(ctx, csi, PDF_NAME(Registry), reg, strlen(reg)); + pdf_dict_put_string(ctx, csi, PDF_NAME(Ordering), ord, strlen(ord)); + pdf_dict_put_int(ctx, csi, PDF_NAME(Supplement), supp); } /* Different states of starting, same width as last, or consecutive glyph */ enum { FW_START, FW_SAME, FW_RUN }; /* ToDo: Ignore the default sized characters */ 
-static pdf_obj* -pdf_add_cid_font_widths(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, fz_font *font) +static void +pdf_add_cid_font_widths(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, fz_font *font) { FT_Face face = font->ft_face; pdf_obj *run_obj = NULL; - pdf_obj *fwobj; + pdf_obj *fw; int curr_code; int prev_code; int curr_size; @@ -1644,7 +1652,7 @@ fz_var(run_obj); - fwobj = pdf_new_array(ctx, doc, 10); + fw = pdf_add_new_array(ctx, doc, 10); fz_try(ctx) { prev_code = 0; @@ -1664,7 +1672,8 @@ /* End of same widths for consecutive ids. Current will * be pushed as prev. below during next iteration */ publish = 1; - run_obj = pdf_new_array(ctx, doc, 10); + if (curr_code < face->num_glyphs) + run_obj = pdf_new_array(ctx, doc, 10); new_state = FW_RUN; /* And the new first code is our current code */ new_first_code = curr_code; @@ -1683,7 +1692,7 @@ else { /* Add prev size to run_obj */ - pdf_array_push_drop(ctx, run_obj, pdf_new_int(ctx, doc, prev_size)); + pdf_array_push_int(ctx, run_obj, prev_size); } break; case FW_START: @@ -1695,7 +1704,7 @@ else { run_obj = pdf_new_array(ctx, doc, 10); - pdf_array_push_drop(ctx, run_obj, pdf_new_int(ctx, doc, prev_size)); + pdf_array_push_int(ctx, run_obj, prev_size); state = FW_RUN; } new_first_code = prev_code; @@ -1708,111 +1717,87 @@ { case FW_SAME: /* Add three entries. 
First cid, last cid and width */ - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, first_code)); - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, prev_code)); - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, prev_size)); + pdf_array_push_int(ctx, fw, first_code); + pdf_array_push_int(ctx, fw, prev_code); + pdf_array_push_int(ctx, fw, prev_size); break; case FW_RUN: if (pdf_array_len(ctx, run_obj) > 0) { - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, first_code)); - pdf_array_push(ctx, fwobj, run_obj); + pdf_array_push_int(ctx, fw, first_code); + pdf_array_push(ctx, fw, run_obj); } pdf_drop_obj(ctx, run_obj); run_obj = NULL; break; case FW_START: /* Lone wolf. Not part of a consecutive run */ - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, prev_code)); - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, prev_code)); - pdf_array_push_drop(ctx, fwobj, pdf_new_int(ctx, doc, prev_size)); + pdf_array_push_int(ctx, fw, prev_code); + pdf_array_push_int(ctx, fw, prev_code); + pdf_array_push_int(ctx, fw, prev_size); break; } - state = new_state; - first_code = new_first_code; - publish = 0; + if (curr_code < face->num_glyphs) + { + state = new_state; + first_code = new_first_code; + publish = 0; + } } prev_size = curr_size; prev_code = curr_code; } + + if (font->width_table != NULL) + pdf_dict_put_int(ctx, fobj, PDF_NAME(DW), font->width_default); + if (pdf_array_len(ctx, fw) > 0) + pdf_dict_put(ctx, fobj, PDF_NAME(W), fw); } + fz_always(ctx) + pdf_drop_obj(ctx, fw); fz_catch(ctx) - { - pdf_drop_obj(ctx, fwobj); - pdf_drop_obj(ctx, run_obj); fz_rethrow(ctx); - } - return pdf_add_object_drop(ctx, doc, fwobj); } /* Descendant font construction used for CID font creation from ttf or Adobe type1 */ static pdf_obj* -pdf_add_descendant_font(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc) +pdf_add_descendant_cid_font(fz_context *ctx, pdf_document *doc, fz_font *font) { - pdf_obj *fobj = NULL; - pdf_obj *fref = NULL; - pdf_obj 
*fstr_ref = NULL; - pdf_obj *fsys_ref = NULL; - pdf_obj *fdes_ref = NULL; - pdf_obj *fw = NULL; - - const char *ps_name; - fz_font *font = fontdesc->font; FT_Face face = font->ft_face; + pdf_obj *fobj, *fref; + const char *ps_name; - fz_var(fobj); - fz_var(fref); - fz_var(fstr_ref); - fz_var(fsys_ref); - fz_var(fw); - + fobj = pdf_new_dict(ctx, doc, 3); fz_try(ctx) { - /* refs */ - fstr_ref = pdf_add_font_file(ctx, doc, fontdesc->font); - fdes_ref = pdf_add_font_descriptor(ctx, doc, fontdesc, fstr_ref); - fsys_ref = pdf_add_cid_system_info(ctx, doc); - - /* We may have a cid font already with width info in source font and no cmap in the ft face */ - fw = pdf_add_cid_font_widths(ctx, doc, fontdesc, font); - - /* And now the font */ - fobj = pdf_new_dict(ctx, doc, 3); - pdf_dict_put(ctx, fobj, PDF_NAME_Type, PDF_NAME_Font); + pdf_dict_put(ctx, fobj, PDF_NAME(Type), PDF_NAME(Font)); switch (ft_kind(face)) { - case TYPE1: pdf_dict_put(ctx, fobj, PDF_NAME_Subtype, PDF_NAME_CIDFontType0); break; - case TRUETYPE: pdf_dict_put(ctx, fobj, PDF_NAME_Subtype, PDF_NAME_CIDFontType2); break; + case TYPE1: pdf_dict_put(ctx, fobj, PDF_NAME(Subtype), PDF_NAME(CIDFontType0)); break; + case TRUETYPE: pdf_dict_put(ctx, fobj, PDF_NAME(Subtype), PDF_NAME(CIDFontType2)); break; } + + pdf_add_cid_system_info(ctx, doc, fobj, "Adobe", "Identity", 0); + ps_name = FT_Get_Postscript_Name(face); if (ps_name) - pdf_dict_put_drop(ctx, fobj, PDF_NAME_BaseFont, pdf_new_name(ctx, doc, ps_name)); + pdf_dict_put_name(ctx, fobj, PDF_NAME(BaseFont), ps_name); else - pdf_dict_put_drop(ctx, fobj, PDF_NAME_BaseFont, pdf_new_name(ctx, doc, font->name)); - pdf_dict_put(ctx, fobj, PDF_NAME_CIDSystemInfo, fsys_ref); - pdf_dict_put(ctx, fobj, PDF_NAME_FontDescriptor, fdes_ref); - if (font->width_table != NULL) - pdf_dict_put_drop(ctx, fobj, PDF_NAME_DW, pdf_new_int(ctx, doc, font->width_default)); - if (fw != NULL) - pdf_dict_put(ctx, fobj, PDF_NAME_W, fw); + pdf_dict_put_name(ctx, fobj, PDF_NAME(BaseFont), 
font->name); + + pdf_add_font_descriptor(ctx, doc, fobj, font); + + /* We may have a cid font already with width info in source font and no cmap in the ft face */ + pdf_add_cid_font_widths(ctx, doc, fobj, font); fref = pdf_add_object(ctx, doc, fobj); } fz_always(ctx) - { pdf_drop_obj(ctx, fobj); - pdf_drop_obj(ctx, fstr_ref); - pdf_drop_obj(ctx, fsys_ref); - pdf_drop_obj(ctx, fdes_ref); - pdf_drop_obj(ctx, fw); - } fz_catch(ctx) - { - pdf_drop_obj(ctx, fref); fz_rethrow(ctx); - } return fref; } @@ -1830,12 +1815,10 @@ } /* Create the ToUnicode CMap. */ -static pdf_obj* -pdf_add_to_unicode(fz_context *ctx, pdf_document *doc, fz_font *font) +static void +pdf_add_to_unicode(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, fz_font *font) { FT_Face face = font->ft_face; - pdf_obj *fref = NULL; - pdf_obj *fobj = NULL; fz_buffer *buf; int *table; @@ -1843,9 +1826,6 @@ int num_chr = 0; int n, k; - fz_var(fref); - fz_var(fobj); - /* Populate reverse cmap table */ { FT_ULong ucs; @@ -1877,22 +1857,22 @@ { fz_warn(ctx, "cannot create ToUnicode mapping for %s", font->name); fz_free(ctx, table); - return NULL; + return; } buf = fz_new_buffer(ctx, 0); fz_try(ctx) { /* Header boiler plate */ - fz_buffer_printf(ctx, buf, "/CIDInit /ProcSet findresource begin\n"); - fz_buffer_printf(ctx, buf, "12 dict begin\n"); - fz_buffer_printf(ctx, buf, "begincmap\n"); - fz_buffer_printf(ctx, buf, "/CIDSystemInfo <> def\n"); - fz_buffer_printf(ctx, buf, "/CMapName /Adobe-Identity-UCS def\n"); - fz_buffer_printf(ctx, buf, "/CMapType 2 def\n"); - fz_buffer_printf(ctx, buf, "1 begincodespacerange\n"); - fz_buffer_printf(ctx, buf, "<0000> \n"); - fz_buffer_printf(ctx, buf, "endcodespacerange\n"); + fz_append_string(ctx, buf, "/CIDInit /ProcSet findresource begin\n"); + fz_append_string(ctx, buf, "12 dict begin\n"); + fz_append_string(ctx, buf, "begincmap\n"); + fz_append_string(ctx, buf, "/CIDSystemInfo <> def\n"); + fz_append_string(ctx, buf, "/CMapName /Adobe-Identity-UCS def\n"); + 
fz_append_string(ctx, buf, "/CMapType 2 def\n"); + fz_append_string(ctx, buf, "1 begincodespacerange\n"); + fz_append_string(ctx, buf, "<0000> \n"); + fz_append_string(ctx, buf, "endcodespacerange\n"); /* Note to have a valid CMap, the number of entries in table set can * not exceed 100, so we have to break into multiple tables. Also, note @@ -1906,11 +1886,11 @@ int count = 0; if (num_seq > 100) { - fz_buffer_printf(ctx, buf, "100 beginbfrange\n"); + fz_append_string(ctx, buf, "100 beginbfrange\n"); num_seq -= 100; } else - fz_buffer_printf(ctx, buf, "%d beginbfrange\n", num_seq); + fz_append_printf(ctx, buf, "%d beginbfrange\n", num_seq); for (k = 0; k < face->num_glyphs; k += n) { n = next_range(table, face->num_glyphs, k); @@ -1918,21 +1898,21 @@ { if (count == 100) { - fz_buffer_printf(ctx, buf, "endbfrange\n"); + fz_append_string(ctx, buf, "endbfrange\n"); if (num_seq > 100) { - fz_buffer_printf(ctx, buf, "100 beginbfrange\n"); + fz_append_string(ctx, buf, "100 beginbfrange\n"); num_seq -= 100; } else - fz_buffer_printf(ctx, buf, "%d beginbfrange\n", num_seq); + fz_append_printf(ctx, buf, "%d beginbfrange\n", num_seq); count = 0; } - fz_buffer_printf(ctx, buf, "<%04x> <%04x> <%04x>\n", k, k+n-1, table[k]); + fz_append_printf(ctx, buf, "<%04x> <%04x> <%04x>\n", k, k+n-1, table[k]); ++count; } } - fz_buffer_printf(ctx, buf, "endbfrange\n"); + fz_append_string(ctx, buf, "endbfrange\n"); } /* Then the singles */ @@ -1941,11 +1921,11 @@ int count = 0; if (num_chr > 100) { - fz_buffer_printf(ctx, buf, "100 beginbfchar\n"); + fz_append_string(ctx, buf, "100 beginbfchar\n"); num_chr -= 100; } else - fz_buffer_printf(ctx, buf, "%d beginbfchar\n", num_chr); + fz_append_printf(ctx, buf, "%d beginbfchar\n", num_chr); for (k = 0; k < face->num_glyphs; k += n) { n = next_range(table, face->num_glyphs, k); @@ -1953,44 +1933,37 @@ { if (count == 100) { - fz_buffer_printf(ctx, buf, "endbfchar\n"); + fz_append_string(ctx, buf, "endbfchar\n"); if (num_chr > 100) { - 
fz_buffer_printf(ctx, buf, "100 beginbfchar\n"); + fz_append_string(ctx, buf, "100 beginbfchar\n"); num_chr -= 100; } else - fz_buffer_printf(ctx, buf, "%d beginbfchar\n", num_chr); + fz_append_printf(ctx, buf, "%d beginbfchar\n", num_chr); count = 0; } - fz_buffer_printf(ctx, buf, "<%04x> <%04x>\n", k, table[k]); + fz_append_printf(ctx, buf, "<%04x> <%04x>\n", k, table[k]); ++count; } } - fz_buffer_printf(ctx, buf, "endbfchar\n"); + fz_append_string(ctx, buf, "endbfchar\n"); } /* Trailer boiler plate */ - fz_buffer_printf(ctx, buf, "endcmap\n"); - fz_buffer_printf(ctx, buf, "CMapName currentdict /CMap defineresource pop\n"); - fz_buffer_printf(ctx, buf, "end\nend\n"); + fz_append_string(ctx, buf, "endcmap\n"); + fz_append_string(ctx, buf, "CMapName currentdict /CMap defineresource pop\n"); + fz_append_string(ctx, buf, "end\nend\n"); - fobj = pdf_new_dict(ctx, doc, 3); - fref = pdf_add_object(ctx, doc, fobj); - pdf_update_stream(ctx, doc, fref, buf, 0); + pdf_dict_put_drop(ctx, fobj, PDF_NAME(ToUnicode), pdf_add_stream(ctx, doc, buf, NULL, 0)); } fz_always(ctx) { fz_free(ctx, table); fz_drop_buffer(ctx, buf); - pdf_drop_obj(ctx, fobj); } fz_catch(ctx) - { - pdf_drop_obj(ctx, fref); fz_rethrow(ctx); - } - return fref; } /* Creates CID font with Identity-H CMap and a ToUnicode CMap that is created by @@ -2002,159 +1975,134 @@ { pdf_obj *fobj = NULL; pdf_obj *fref = NULL; - pdf_obj *obj_desc_ref = NULL; - pdf_obj *obj_tounicode_ref = NULL; - pdf_obj *obj_array = NULL; - pdf_font_desc *fontdesc = NULL; - - FT_Face face = font->ft_face; + pdf_obj *dfonts = NULL; unsigned char digest[16]; - fz_var(fobj); - fz_var(fref); - fz_var(obj_desc_ref); - fz_var(obj_tounicode_ref); - fz_var(fontdesc); - fz_var(obj_array); + fref = pdf_find_font_resource(ctx, doc, PDF_CID_FONT_RESOURCE, 0, font, digest); + if (fref) + return fref; + fobj = pdf_add_new_dict(ctx, doc, 10); fz_try(ctx) { - /* Before we add this font as a resource check if the same font - * already exists in our 
resources for this doc. If yes, then - * hand back that reference */ - fref = pdf_find_font_resource(ctx, doc, font->buffer, digest); - if (fref == NULL) - { - /* Set up desc, width, and font file */ - fontdesc = pdf_new_font_desc(ctx); - fontdesc->font = fz_keep_font(ctx, font); - fontdesc->flags = PDF_FD_NONSYMBOLIC; /* ToDo: FixMe. Set non-symbolic always for now */ - fontdesc->ascent = face->ascender * 1000.0f / face->units_per_EM; - fontdesc->descent = face->descender * 1000.0f / face->units_per_EM; - - /* Get the descendant font and the tounicode references */ - obj_desc_ref = pdf_add_descendant_font(ctx, doc, fontdesc); - obj_tounicode_ref = pdf_add_to_unicode(ctx, doc, font); - - /* And now the font */ - fobj = pdf_new_dict(ctx, doc, 10); - pdf_dict_put(ctx, fobj, PDF_NAME_Type, PDF_NAME_Font); - pdf_dict_put(ctx, fobj, PDF_NAME_Subtype, PDF_NAME_Type0); - pdf_dict_put_drop(ctx, fobj, PDF_NAME_BaseFont, pdf_new_name(ctx, doc, font->name)); - pdf_dict_put(ctx, fobj, PDF_NAME_Encoding, PDF_NAME_Identity_H); - - obj_array = pdf_new_array(ctx, doc, 3); - pdf_array_insert(ctx, obj_array, obj_desc_ref, 0); - pdf_dict_put(ctx, fobj, PDF_NAME_DescendantFonts, obj_array); - if (obj_tounicode_ref) - pdf_dict_put(ctx, fobj, PDF_NAME_ToUnicode, obj_tounicode_ref); - fref = pdf_add_object(ctx, doc, fobj); + pdf_dict_put(ctx, fobj, PDF_NAME(Type), PDF_NAME(Font)); + pdf_dict_put(ctx, fobj, PDF_NAME(Subtype), PDF_NAME(Type0)); + pdf_dict_put_name(ctx, fobj, PDF_NAME(BaseFont), font->name); + pdf_dict_put(ctx, fobj, PDF_NAME(Encoding), PDF_NAME(Identity_H)); + pdf_add_to_unicode(ctx, doc, fobj, font); - /* Add ref to our font resource hash table. 
*/ - fref = pdf_insert_font_resource(ctx, doc, digest, fref); - } + dfonts = pdf_dict_put_array(ctx, fobj, PDF_NAME(DescendantFonts), 1); + pdf_array_push_drop(ctx, dfonts, pdf_add_descendant_cid_font(ctx, doc, font)); + + fref = pdf_insert_font_resource(ctx, doc, digest, fobj); } fz_always(ctx) - { - pdf_drop_font(ctx, fontdesc); pdf_drop_obj(ctx, fobj); - pdf_drop_obj(ctx, obj_desc_ref); - pdf_drop_obj(ctx, obj_array); - pdf_drop_obj(ctx, obj_tounicode_ref); - } fz_catch(ctx) - { - pdf_drop_obj(ctx, fref); fz_rethrow(ctx); - } return fref; } -/* Creates simple font */ +/* Create simple (8-bit encoding) fonts */ + +static void +pdf_add_simple_font_encoding_imp(fz_context *ctx, pdf_document *doc, pdf_obj *font, const char *glyph_names[]) +{ + pdf_obj *enc, *diff; + int i, last; + + enc = pdf_dict_put_dict(ctx, font, PDF_NAME(Encoding), 2); + pdf_dict_put(ctx, enc, PDF_NAME(BaseEncoding), PDF_NAME(WinAnsiEncoding)); + diff = pdf_dict_put_array(ctx, enc, PDF_NAME(Differences), 129); + last = 0; + for (i = 128; i < 256; ++i) + { + const char *glyph = glyph_names[i]; + if (glyph) + { + if (last != i-1) + pdf_array_push_int(ctx, diff, i); + last = i; + pdf_array_push_name(ctx, diff, glyph); + } + } +} + +static void +pdf_add_simple_font_encoding(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, int encoding) +{ + switch (encoding) + { + default: + case PDF_SIMPLE_ENCODING_LATIN: + pdf_dict_put(ctx, fobj, PDF_NAME(Encoding), PDF_NAME(WinAnsiEncoding)); + break; + case PDF_SIMPLE_ENCODING_GREEK: + pdf_add_simple_font_encoding_imp(ctx, doc, fobj, pdf_glyph_name_from_iso8859_7); + break; + case PDF_SIMPLE_ENCODING_CYRILLIC: + pdf_add_simple_font_encoding_imp(ctx, doc, fobj, pdf_glyph_name_from_koi8u); + break; + } +} + pdf_obj * -pdf_add_simple_font(fz_context *ctx, pdf_document *doc, fz_font *font) +pdf_add_simple_font(fz_context *ctx, pdf_document *doc, fz_font *font, int encoding) { + FT_Face face = font->ft_face; pdf_obj *fobj = NULL; pdf_obj *fref = NULL; - pdf_obj 
*fstr_ref = NULL; - pdf_obj *fdes_ref = NULL; - pdf_obj *fwidth_ref = NULL; - pdf_font_desc *fontdesc = NULL; - - FT_Face face = font->ft_face; + const char **enc; unsigned char digest[16]; - int first_char, last_char; - fz_var(fobj); - fz_var(fref); - fz_var(fstr_ref); - fz_var(fdes_ref); - fz_var(fwidth_ref); - fz_var(fontdesc); + fref = pdf_find_font_resource(ctx, doc, PDF_SIMPLE_FONT_RESOURCE, encoding, font, digest); + if (fref) + return fref; - fz_try(ctx) + switch (encoding) { - /* Before we add this font as a resource check if the same font - * already exists in our resources for this doc. If yes, then - * hand back that reference */ - fref = pdf_find_font_resource(ctx, doc, font->buffer, digest); - if (fref == NULL) - { - fobj = pdf_new_dict(ctx, doc, 10); - pdf_dict_put_drop(ctx, fobj, PDF_NAME_Type, PDF_NAME_Font); - switch (ft_kind(face)) - { - case TYPE1: pdf_dict_put(ctx, fobj, PDF_NAME_Subtype, PDF_NAME_Type1); break; - case TRUETYPE: pdf_dict_put(ctx, fobj, PDF_NAME_Subtype, PDF_NAME_TrueType); break; - } - pdf_dict_put(ctx, fobj, PDF_NAME_Encoding, PDF_NAME_WinAnsiEncoding); - - if (!is_builtin_font(ctx, font)) - { - const char *ps_name = FT_Get_Postscript_Name(face); - if (!ps_name) - ps_name = font->name; - pdf_dict_put_drop(ctx, fobj, PDF_NAME_BaseFont, pdf_new_name(ctx, doc, ps_name)); - - fontdesc = pdf_new_font_desc(ctx); - fontdesc->font = fz_keep_font(ctx, font); - fontdesc->flags = PDF_FD_NONSYMBOLIC; /* ToDo: FixMe. 
Set non-symbolic always for now */ - fontdesc->ascent = face->ascender * 1000.0f / face->units_per_EM; - fontdesc->descent = face->descender * 1000.0f / face->units_per_EM; - - fstr_ref = pdf_add_font_file(ctx, doc, font); - fdes_ref = pdf_add_font_descriptor(ctx, doc, fontdesc, fstr_ref); - fwidth_ref = pdf_add_simple_font_widths(ctx, doc, fontdesc, &first_char, &last_char); - - pdf_dict_put_drop(ctx, fobj, PDF_NAME_FirstChar, pdf_new_int(ctx, doc, first_char)); - pdf_dict_put_drop(ctx, fobj, PDF_NAME_LastChar, pdf_new_int(ctx, doc, last_char)); - pdf_dict_put(ctx, fobj, PDF_NAME_Widths, fwidth_ref); - pdf_dict_put(ctx, fobj, PDF_NAME_FontDescriptor, fdes_ref); - } - else - { - pdf_dict_put_drop(ctx, fobj, PDF_NAME_BaseFont, pdf_new_name(ctx, doc, clean_font_name(font->name))); - } + default: enc = pdf_win_ansi; break; + case PDF_SIMPLE_ENCODING_LATIN: enc = pdf_win_ansi; break; + case PDF_SIMPLE_ENCODING_GREEK: enc = pdf_glyph_name_from_iso8859_7; break; + case PDF_SIMPLE_ENCODING_CYRILLIC: enc = pdf_glyph_name_from_koi8u; break; + } - fref = pdf_add_object(ctx, doc, fobj); + fobj = pdf_add_new_dict(ctx, doc, 10); + fz_try(ctx) + { + pdf_dict_put(ctx, fobj, PDF_NAME(Type), PDF_NAME(Font)); + switch (ft_kind(face)) + { + case TYPE1: pdf_dict_put(ctx, fobj, PDF_NAME(Subtype), PDF_NAME(Type1)); break; + case TRUETYPE: pdf_dict_put(ctx, fobj, PDF_NAME(Subtype), PDF_NAME(TrueType)); break; + } - /* Add ref to our font resource hash table. 
*/ - fref = pdf_insert_font_resource(ctx, doc, digest, fref); + if (!is_builtin_font(ctx, font)) + { + const char *ps_name = FT_Get_Postscript_Name(face); + if (!ps_name) + ps_name = font->name; + pdf_dict_put_name(ctx, fobj, PDF_NAME(BaseFont), ps_name); + pdf_add_simple_font_encoding(ctx, doc, fobj, encoding); + pdf_add_simple_font_widths(ctx, doc, fobj, font, enc); + pdf_add_font_descriptor(ctx, doc, fobj, font); + } + else + { + pdf_dict_put_name(ctx, fobj, PDF_NAME(BaseFont), clean_font_name(font->name)); + pdf_add_simple_font_encoding(ctx, doc, fobj, encoding); + if (encoding != PDF_SIMPLE_ENCODING_LATIN) + pdf_add_simple_font_widths(ctx, doc, fobj, font, enc); } + + fref = pdf_insert_font_resource(ctx, doc, digest, fobj); } fz_always(ctx) { - pdf_drop_font(ctx, fontdesc); pdf_drop_obj(ctx, fobj); - pdf_drop_obj(ctx, fstr_ref); - pdf_drop_obj(ctx, fdes_ref); - pdf_drop_obj(ctx, fwidth_ref); } fz_catch(ctx) - { - pdf_drop_obj(ctx, fref); fz_rethrow(ctx); - } return fref; } @@ -2170,3 +2118,93 @@ } return 0; } + +/* Add a non-embedded UTF16-encoded CID-font for the CJK scripts: CNS1, GB1, Japan1, or Korea1 */ +pdf_obj * +pdf_add_cjk_font(fz_context *ctx, pdf_document *doc, fz_font *fzfont, int script, int wmode, int serif) +{ + pdf_obj *fref, *font, *subfont, *fontdesc; + pdf_obj *dfonts; + fz_rect bbox = { -200, -200, 1200, 1200 }; + unsigned char digest[16]; + int flags; + + const char *basefont, *encoding, *ordering; + int supplement; + + switch (script) + { + default: + script = FZ_ADOBE_CNS; + /* fall through */ + case FZ_ADOBE_CNS: /* traditional chinese */ + basefont = serif ? "Ming" : "Fangti"; + encoding = wmode ? "UniCNS-UTF16-V" : "UniCNS-UTF16-H"; + ordering = "CNS1"; + supplement = 7; + break; + case FZ_ADOBE_GB: /* simplified chinese */ + basefont = serif ? "Song" : "Heiti"; + encoding = wmode ? "UniGB-UTF16-V" : "UniGB-UTF16-H"; + ordering = "GB1"; + supplement = 5; + break; + case FZ_ADOBE_JAPAN: + basefont = serif ? 
"Mincho" : "Gothic"; + encoding = wmode ? "UniJIS-UTF16-V" : "UniJIS-UTF16-H"; + ordering = "Japan1"; + supplement = 6; + break; + case FZ_ADOBE_KOREA: + basefont = serif ? "Batang" : "Dotum"; + encoding = wmode ? "UniKS-UTF16-V" : "UniKS-UTF16-H"; + ordering = "Korea1"; + supplement = 2; + break; + } + + flags = PDF_FD_SYMBOLIC; + if (serif) + flags |= PDF_FD_SERIF; + + fref = pdf_find_font_resource(ctx, doc, PDF_CJK_FONT_RESOURCE, script, fzfont, digest); + if (fref) + return fref; + + font = pdf_add_new_dict(ctx, doc, 5); + fz_try(ctx) + { + pdf_dict_put(ctx, font, PDF_NAME(Type), PDF_NAME(Font)); + pdf_dict_put(ctx, font, PDF_NAME(Subtype), PDF_NAME(Type0)); + pdf_dict_put_name(ctx, font, PDF_NAME(BaseFont), basefont); + pdf_dict_put_name(ctx, font, PDF_NAME(Encoding), encoding); + dfonts = pdf_dict_put_array(ctx, font, PDF_NAME(DescendantFonts), 1); + pdf_array_push_drop(ctx, dfonts, subfont = pdf_add_new_dict(ctx, doc, 5)); + { + pdf_dict_put(ctx, subfont, PDF_NAME(Type), PDF_NAME(Font)); + pdf_dict_put(ctx, subfont, PDF_NAME(Subtype), PDF_NAME(CIDFontType0)); + pdf_dict_put_name(ctx, subfont, PDF_NAME(BaseFont), basefont); + pdf_add_cid_system_info(ctx, doc, subfont, "Adobe", ordering, supplement); + fontdesc = pdf_add_new_dict(ctx, doc, 8); + pdf_dict_put_drop(ctx, subfont, PDF_NAME(FontDescriptor), fontdesc); + { + pdf_dict_put(ctx, fontdesc, PDF_NAME(Type), PDF_NAME(FontDescriptor)); + pdf_dict_put_text_string(ctx, fontdesc, PDF_NAME(FontName), basefont); + pdf_dict_put_rect(ctx, fontdesc, PDF_NAME(FontBBox), bbox); + pdf_dict_put_int(ctx, fontdesc, PDF_NAME(Flags), flags); + pdf_dict_put_int(ctx, fontdesc, PDF_NAME(ItalicAngle), 0); + pdf_dict_put_int(ctx, fontdesc, PDF_NAME(Ascent), 1000); + pdf_dict_put_int(ctx, fontdesc, PDF_NAME(Descent), -200); + pdf_dict_put_int(ctx, fontdesc, PDF_NAME(StemV), 80); + } + } + + fref = pdf_insert_font_resource(ctx, doc, digest, font); + } + fz_always(ctx) + pdf_drop_obj(ctx, font); + fz_catch(ctx) + fz_rethrow(ctx); 
+ + return fref; +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-graft.c k2pdfopt-2.51+ds/mupdf_mod/pdf-graft.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-graft.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-graft.c 2018-11-21 02:42:38.000000000 +0000 @@ -0,0 +1,199 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" +/* willus mod -- remove ../fitz/ */ +#include "fitz-imp.h" + +#include + +struct pdf_graft_map_s +{ + int refs; + int len; + pdf_document *src; + pdf_document *dst; + int *dst_from_src; +}; + +pdf_graft_map * +pdf_new_graft_map(fz_context *ctx, pdf_document *dst) +{ + pdf_graft_map *map = NULL; + + map = fz_malloc_struct(ctx, pdf_graft_map); + + map->dst = pdf_keep_document(ctx, dst); + map->refs = 1; + return map; +} + +pdf_graft_map * +pdf_keep_graft_map(fz_context *ctx, pdf_graft_map *map) +{ + return fz_keep_imp(ctx, map, &map->refs); +} + +void +pdf_drop_graft_map(fz_context *ctx, pdf_graft_map *map) +{ + if (fz_drop_imp(ctx, map, &map->refs)) + { + pdf_drop_document(ctx, map->src); + pdf_drop_document(ctx, map->dst); + fz_free(ctx, map->dst_from_src); + fz_free(ctx, map); + } +} + +/* Graft object from dst to source */ +pdf_obj * +pdf_graft_object(fz_context *ctx, pdf_document *dst, pdf_obj *obj) +{ + pdf_document *src; + pdf_graft_map *map; + + /* Primitive objects are not bound to a document, so can be re-used as is. 
*/ + src = pdf_get_bound_document(ctx, obj); + if (src == NULL) + return pdf_keep_obj(ctx, obj); + + map = pdf_new_graft_map(ctx, dst); + + fz_try(ctx) + obj = pdf_graft_mapped_object(ctx, map, obj); + fz_always(ctx) + pdf_drop_graft_map(ctx, map); + fz_catch(ctx) + fz_rethrow(ctx); + + return obj; +} + +pdf_obj * +pdf_graft_mapped_object(fz_context *ctx, pdf_graft_map *map, pdf_obj *obj) +{ + pdf_obj *val, *key; + pdf_obj *new_obj = NULL; + pdf_obj *new_dict; + pdf_obj *new_array; + pdf_obj *ref = NULL; + fz_buffer *buffer = NULL; + pdf_document *src; + int new_num, src_num, len, i; + + /* Primitive objects are not bound to a document, so can be re-used as is. */ + src = pdf_get_bound_document(ctx, obj); + if (!src) + return pdf_keep_obj(ctx, obj); + + if (map->src && src != map->src) + fz_throw(ctx, FZ_ERROR_GENERIC, "grafted objects must all belong to the same source document"); + + if (pdf_is_indirect(ctx, obj)) + { + src_num = pdf_to_num(ctx, obj); + + if (map->src == NULL) + { + fz_try(ctx) + { + map->src = pdf_keep_document(ctx, src); + map->len = pdf_xref_len(ctx, src); + map->dst_from_src = fz_calloc(ctx, map->len, sizeof(int)); + } + fz_catch(ctx) + { + pdf_drop_document(ctx, map->src); + map->src = NULL; + fz_rethrow(ctx); + } + } + + if (src_num < 1 || src_num >= map->len) + fz_throw(ctx, FZ_ERROR_GENERIC, "source object number out of range"); + + /* Check if we have done this one. 
If yes, then just + * return our indirect ref */ + if (map->dst_from_src[src_num] != 0) + { + int dest_num = map->dst_from_src[src_num]; + return pdf_new_indirect(ctx, map->dst, dest_num, 0); + } + + fz_var(buffer); + fz_var(ref); + + fz_try(ctx) + { + /* Create new slot for our src object, set the mapping and call again + * using the resolved indirect reference */ + new_num = pdf_create_object(ctx, map->dst); + map->dst_from_src[src_num] = new_num; + new_obj = pdf_graft_mapped_object(ctx, map, pdf_resolve_indirect(ctx, obj)); + + /* Return a ref to the new_obj making sure to attach any stream */ + pdf_update_object(ctx, map->dst, new_num, new_obj); + pdf_drop_obj(ctx, new_obj); + ref = pdf_new_indirect(ctx, map->dst, new_num, 0); + if (pdf_is_stream(ctx, obj)) + { + buffer = pdf_load_raw_stream_number(ctx, src, src_num); + pdf_update_stream(ctx, map->dst, ref, buffer, 1); + } + } + fz_always(ctx) + fz_drop_buffer(ctx, buffer); + fz_catch(ctx) + { + pdf_drop_obj(ctx, ref); + fz_rethrow(ctx); + } + return ref; + } + else if (pdf_is_dict(ctx, obj)) + { + len = pdf_dict_len(ctx, obj); + new_dict = pdf_new_dict(ctx, map->dst, len); + + fz_try(ctx) + { + for (i = 0; i < len; i++) + { + key = pdf_dict_get_key(ctx, obj, i); + val = pdf_dict_get_val(ctx, obj, i); + pdf_dict_put_drop(ctx, new_dict, key, pdf_graft_mapped_object(ctx, map, val)); + } + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, new_dict); + fz_rethrow(ctx); + } + return new_dict; + } + else if (pdf_is_array(ctx, obj)) + { + /* Step through the array items handling indirect refs */ + len = pdf_array_len(ctx, obj); + new_array = pdf_new_array(ctx, map->dst, len); + + fz_try(ctx) + { + for (i = 0; i < len; i++) + { + val = pdf_array_get(ctx, obj, i); + pdf_array_push_drop(ctx, new_array, pdf_graft_mapped_object(ctx, map, val)); + } + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, new_array); + fz_rethrow(ctx); + } + return new_array; + } + else + { + assert("This never happens" == NULL); + return NULL; + } +} diff -Nru 
k2pdfopt-2.42+ds/mupdf_mod/pdf-link.c k2pdfopt-2.51+ds/mupdf_mod/pdf-link.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-link.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-link.c 2018-11-21 00:36:44.000000000 +0000 @@ -0,0 +1,359 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +#include + +static pdf_obj * +resolve_dest_rec(fz_context *ctx, pdf_document *doc, pdf_obj *dest, int depth) +{ + if (depth > 10) /* Arbitrary to avoid infinite recursion */ + return NULL; + + if (pdf_is_name(ctx, dest) || pdf_is_string(ctx, dest)) + { + dest = pdf_lookup_dest(ctx, doc, dest); + dest = resolve_dest_rec(ctx, doc, dest, depth+1); + return dest; + } + + else if (pdf_is_array(ctx, dest)) + { + return dest; + } + + else if (pdf_is_dict(ctx, dest)) + { + dest = pdf_dict_get(ctx, dest, PDF_NAME(D)); + return resolve_dest_rec(ctx, doc, dest, depth+1); + } + + else if (pdf_is_indirect(ctx, dest)) + return dest; + + return NULL; +} + +static pdf_obj * +resolve_dest(fz_context *ctx, pdf_document *doc, pdf_obj *dest) +{ + return resolve_dest_rec(ctx, doc, dest, 0); +} + +char * +pdf_parse_link_dest(fz_context *ctx, pdf_document *doc, pdf_obj *dest) +{ + pdf_obj *obj; + char buf[256]; + const char *ld; + int page; + int x, y; + + dest = resolve_dest(ctx, doc, dest); + if (dest == NULL) + { + fz_warn(ctx, "undefined link destination"); + return NULL; + } + + if (pdf_is_name(ctx, dest)) + { + ld = pdf_to_name(ctx, dest); + return fz_strdup(ctx, ld); + } + else if (pdf_is_string(ctx, dest)) + { + ld = pdf_to_str_buf(ctx, dest); + return fz_strdup(ctx, ld); + } + + obj = pdf_array_get(ctx, dest, 0); + if (pdf_is_int(ctx, obj)) + page = pdf_to_int(ctx, obj); + else + { + fz_try(ctx) + page = pdf_lookup_page_number(ctx, doc, obj); + fz_catch(ctx) + page = -1; + } + + x = y = 0; + obj = pdf_array_get(ctx, dest, 1); + if (pdf_name_eq(ctx, obj, PDF_NAME(XYZ))) + { + x = pdf_array_get_int(ctx, dest, 2); + y = pdf_array_get_int(ctx, dest, 3); + } + else if (pdf_name_eq(ctx, 
obj, PDF_NAME(FitR))) + { + x = pdf_array_get_int(ctx, dest, 2); + y = pdf_array_get_int(ctx, dest, 5); + } + else if (pdf_name_eq(ctx, obj, PDF_NAME(FitH)) || pdf_name_eq(ctx, obj, PDF_NAME(FitBH))) + y = pdf_array_get_int(ctx, dest, 2); + else if (pdf_name_eq(ctx, obj, PDF_NAME(FitV)) || pdf_name_eq(ctx, obj, PDF_NAME(FitBV))) + x = pdf_array_get_int(ctx, dest, 2); + + if (page >= 0) + { + if (x != 0 || y != 0) + fz_snprintf(buf, sizeof buf, "#%d,%d,%d", page + 1, x, y); + else + fz_snprintf(buf, sizeof buf, "#%d", page + 1); + return fz_strdup(ctx, buf); + } + + return NULL; +} + +char * +pdf_parse_file_spec(fz_context *ctx, pdf_document *doc, pdf_obj *file_spec, pdf_obj *dest) +{ + pdf_obj *filename=NULL; + char *path = NULL; + char *uri = NULL; + char buf[256]; + size_t n; + + if (pdf_is_string(ctx, file_spec)) + filename = file_spec; + + if (pdf_is_dict(ctx, file_spec)) { +#ifdef _WIN32 + filename = pdf_dict_get(ctx, file_spec, PDF_NAME(DOS)); +#else + filename = pdf_dict_get(ctx, file_spec, PDF_NAME(Unix)); +#endif + if (!filename) + filename = pdf_dict_geta(ctx, file_spec, PDF_NAME(UF), PDF_NAME(F)); + } + + if (!pdf_is_string(ctx, filename)) + { + fz_warn(ctx, "cannot parse file specification"); + return NULL; + } + + path = fz_strdup(ctx, pdf_to_text_string(ctx, filename)); +#ifdef _WIN32 + if (!pdf_name_eq(ctx, pdf_dict_get(ctx, file_spec, PDF_NAME(FS)), PDF_NAME(URL))) + { + /* move the file name into the expected place and use the expected path separator */ + char *c; + if (path[0] == '/' && (('A' <= path[1] && path[1] <= 'Z') || ('a' <= path[1] && path[1] <= 'z')) && path[2] == '/') + { + path[0] = path[1]; + path[1] = ':'; + } + for (c = path; *c; c++) + { + if (*c == '/') + *c = '\\'; + } + } +#endif + + if (pdf_is_array(ctx, dest)) + fz_snprintf(buf, sizeof buf, "#page=%d", pdf_array_get_int(ctx, dest, 0) + 1); + else if (pdf_is_name(ctx, dest)) + fz_snprintf(buf, sizeof buf, "#%s", pdf_to_name(ctx, dest)); + else if (pdf_is_string(ctx, dest)) + 
fz_snprintf(buf, sizeof buf, "#%s", pdf_to_str_buf(ctx, dest)); + else + buf[0] = 0; + + n = 7 + strlen(path) + strlen(buf) + 1; + uri = fz_malloc(ctx, n); + fz_strlcpy(uri, "file://", n); + fz_strlcat(uri, path, n); + fz_strlcat(uri, buf, n); + fz_free(ctx, path); + return uri; +} + +char * +pdf_parse_link_action(fz_context *ctx, pdf_document *doc, pdf_obj *action, int pagenum) +{ + pdf_obj *obj, *dest, *file_spec; + + if (!action) + return NULL; + + obj = pdf_dict_get(ctx, action, PDF_NAME(S)); + if (pdf_name_eq(ctx, PDF_NAME(GoTo), obj)) + { + dest = pdf_dict_get(ctx, action, PDF_NAME(D)); + return pdf_parse_link_dest(ctx, doc, dest); + } + else if (pdf_name_eq(ctx, PDF_NAME(URI), obj)) + { + /* URI entries are ASCII strings */ + const char *uri = pdf_dict_get_string(ctx, action, PDF_NAME(URI), NULL); + if (!fz_is_external_link(ctx, uri)) + { + pdf_obj *uri_base_obj = pdf_dict_getp(ctx, pdf_trailer(ctx, doc), "Root/URI/Base"); + const char *uri_base = uri_base_obj ? pdf_to_str_buf(ctx, uri_base_obj) : "file://"; + char *new_uri = fz_malloc(ctx, strlen(uri_base) + strlen(uri) + 1); + strcpy(new_uri, uri_base); + strcat(new_uri, uri); + return new_uri; + } + return fz_strdup(ctx, uri); + } + else if (pdf_name_eq(ctx, PDF_NAME(Launch), obj)) + { + file_spec = pdf_dict_get(ctx, action, PDF_NAME(F)); + return pdf_parse_file_spec(ctx, doc, file_spec, NULL); + } + else if (pdf_name_eq(ctx, PDF_NAME(GoToR), obj)) + { + dest = pdf_dict_get(ctx, action, PDF_NAME(D)); + file_spec = pdf_dict_get(ctx, action, PDF_NAME(F)); + return pdf_parse_file_spec(ctx, doc, file_spec, dest); + } + else if (pdf_name_eq(ctx, PDF_NAME(Named), obj)) + { + dest = pdf_dict_get(ctx, action, PDF_NAME(N)); + + if (pdf_name_eq(ctx, PDF_NAME(FirstPage), dest)) + pagenum = 0; + else if (pdf_name_eq(ctx, PDF_NAME(LastPage), dest)) + pagenum = pdf_count_pages(ctx, doc) - 1; + else if (pdf_name_eq(ctx, PDF_NAME(PrevPage), dest) && pagenum >= 0) + { + if (pagenum > 0) + pagenum--; + } + else if 
(pdf_name_eq(ctx, PDF_NAME(NextPage), dest) && pagenum >= 0) + { + if (pagenum < pdf_count_pages(ctx, doc) - 1) + pagenum++; + } + else + return NULL; + + return fz_asprintf(ctx, "#%d", pagenum + 1); + } + + return NULL; +} + +static fz_link * +pdf_load_link(fz_context *ctx, pdf_document *doc, pdf_obj *dict, int pagenum, fz_matrix page_ctm) +{ + pdf_obj *action; + pdf_obj *obj; + fz_rect bbox; + char *uri; + fz_link *link = NULL; + + obj = pdf_dict_get(ctx, dict, PDF_NAME(Subtype)); + if (!pdf_name_eq(ctx, obj, PDF_NAME(Link))) + return NULL; + + obj = pdf_dict_get(ctx, dict, PDF_NAME(Rect)); + if (!obj) + return NULL; + + bbox = pdf_to_rect(ctx, obj); + bbox = fz_transform_rect(bbox, page_ctm); + + obj = pdf_dict_get(ctx, dict, PDF_NAME(Dest)); + if (obj) + uri = pdf_parse_link_dest(ctx, doc, obj); + else + { + action = pdf_dict_get(ctx, dict, PDF_NAME(A)); + /* fall back to additional action button's down/up action */ + if (!action) + action = pdf_dict_geta(ctx, pdf_dict_get(ctx, dict, PDF_NAME(AA)), PDF_NAME(U), PDF_NAME(D)); + uri = pdf_parse_link_action(ctx, doc, action, pagenum); + } + + if (!uri) + return NULL; + + fz_try(ctx) + link = fz_new_link(ctx, bbox, doc, uri); + fz_always(ctx) + fz_free(ctx, uri); + fz_catch(ctx) + fz_rethrow(ctx); + + return link; +} + +fz_link * +pdf_load_link_annots(fz_context *ctx, pdf_document *doc, pdf_obj *annots, int pagenum, fz_matrix page_ctm) +{ + fz_link *link, *head, *tail; + pdf_obj *obj; + int i, n; + + head = tail = NULL; + link = NULL; + + n = pdf_array_len(ctx, annots); + for (i = 0; i < n; i++) + { + /* FIXME: Move the try/catch out of the loop for performance? 
*/ + fz_try(ctx) + { + obj = pdf_array_get(ctx, annots, i); + link = pdf_load_link(ctx, doc, obj, pagenum, page_ctm); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + link = NULL; + } + + if (link) + { + if (!head) + head = tail = link; + else + { + tail->next = link; + tail = link; + } + } + } + + return head; +} + +int +pdf_resolve_link(fz_context *ctx, pdf_document *doc, const char *uri, float *xp, float *yp) +{ + if (uri && uri[0] == '#') + { + int page = fz_atoi(uri + 1) - 1; + if (xp || yp) + { + const char *x = strchr(uri, ','); + const char *y = strrchr(uri, ','); + if (x && y) + { + pdf_obj *obj; + fz_matrix ctm; + fz_point p; + + p.x = x ? fz_atoi(x + 1) : 0; + p.y = y ? fz_atoi(y + 1) : 0; + obj = pdf_lookup_page_obj(ctx, doc, page); + pdf_page_obj_transform(ctx, obj, NULL, &ctm); + p = fz_transform_point(p, ctm); + + if (xp) *xp = p.x; + if (yp) *yp = p.y; + } + } + return page; + } +/* willus mod -- be quiet */ +/* + fz_warn(ctx, "unknown link uri '%s'", uri); +*/ + return -1; +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-object.c k2pdfopt-2.51+ds/mupdf_mod/pdf-object.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-object.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-object.c 2018-11-21 02:42:58.000000000 +0000 @@ -0,0 +1,2370 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" +/* willus mod -- remove ../fitz/ */ +#include "fitz-imp.h" + +#include +#include +#include + +#define PDF_MAKE_NAME(STRING,NAME) STRING, +static const char *PDF_NAME_LIST[] = { + "", "", "", /* dummy slots for null, true, and false */ +#include "mupdf/pdf/name-table.h" +}; +#undef PDF_MAKE_NAME + +typedef enum pdf_objkind_e +{ + PDF_INT = 'i', + PDF_REAL = 'f', + PDF_STRING = 's', + PDF_NAME = 'n', + PDF_ARRAY = 'a', + PDF_DICT = 'd', + PDF_INDIRECT = 'r' +} pdf_objkind; + +struct keyval +{ + pdf_obj *k; + pdf_obj *v; +}; + +enum +{ + PDF_FLAGS_MARKED = 1, + PDF_FLAGS_SORTED = 2, + PDF_FLAGS_DIRTY = 4, + PDF_FLAGS_MEMO_BASE = 8, + 
PDF_FLAGS_MEMO_BASE_BOOL = 16 +}; + +struct pdf_obj_s +{ + short refs; + unsigned char kind; + unsigned char flags; +}; + +typedef struct pdf_obj_num_s +{ + pdf_obj super; + union + { + int64_t i; + float f; + } u; +} pdf_obj_num; + +typedef struct pdf_obj_string_s +{ + pdf_obj super; + char *text; /* utf8 encoded text string */ + unsigned int len; + char buf[1]; +} pdf_obj_string; + +typedef struct pdf_obj_name_s +{ + pdf_obj super; + char n[1]; +} pdf_obj_name; + +typedef struct pdf_obj_array_s +{ + pdf_obj super; + pdf_document *doc; + int parent_num; + int len; + int cap; + pdf_obj **items; +} pdf_obj_array; + +typedef struct pdf_obj_dict_s +{ + pdf_obj super; + pdf_document *doc; + int parent_num; + int len; + int cap; + struct keyval *items; +} pdf_obj_dict; + +typedef struct pdf_obj_ref_s +{ + pdf_obj super; + pdf_document *doc; /* Only needed for arrays, dicts and indirects */ + int num; + int gen; +} pdf_obj_ref; + +#define NAME(obj) ((pdf_obj_name *)(obj)) +#define NUM(obj) ((pdf_obj_num *)(obj)) +#define STRING(obj) ((pdf_obj_string *)(obj)) +#define DICT(obj) ((pdf_obj_dict *)(obj)) +#define ARRAY(obj) ((pdf_obj_array *)(obj)) +#define REF(obj) ((pdf_obj_ref *)(obj)) + +pdf_obj * +pdf_new_int(fz_context *ctx, int64_t i) +{ + pdf_obj_num *obj; + obj = Memento_label(fz_malloc(ctx, sizeof(pdf_obj_num)), "pdf_obj(int)"); + obj->super.refs = 1; + obj->super.kind = PDF_INT; + obj->super.flags = 0; + obj->u.i = i; + return &obj->super; +} + +pdf_obj * +pdf_new_real(fz_context *ctx, float f) +{ + pdf_obj_num *obj; + obj = Memento_label(fz_malloc(ctx, sizeof(pdf_obj_num)), "pdf_obj(real)"); + obj->super.refs = 1; + obj->super.kind = PDF_REAL; + obj->super.flags = 0; + obj->u.f = f; + return &obj->super; +} + +pdf_obj * +pdf_new_string(fz_context *ctx, const char *str, size_t len) +{ + pdf_obj_string *obj; + unsigned int l = (unsigned int)len; + + if ((size_t)l != len) + fz_throw(ctx, FZ_ERROR_GENERIC, "Overflow in pdf string"); + + obj = 
Memento_label(fz_malloc(ctx, offsetof(pdf_obj_string, buf) + len + 1), "pdf_obj(string)"); + obj->super.refs = 1; + obj->super.kind = PDF_STRING; + obj->super.flags = 0; + obj->text = NULL; + obj->len = l; + memcpy(obj->buf, str, len); + obj->buf[len] = '\0'; + return &obj->super; +} + +pdf_obj * +pdf_new_name(fz_context *ctx, const char *str) +{ + pdf_obj_name *obj; + int l = 3; /* skip dummy slots */ + int r = nelem(PDF_NAME_LIST) - 1; + while (l <= r) + { + int m = (l + r) >> 1; + int c = strcmp(str, PDF_NAME_LIST[m]); + if (c < 0) + r = m - 1; + else if (c > 0) + l = m + 1; + else + return (pdf_obj*)(intptr_t)m; + } + + obj = Memento_label(fz_malloc(ctx, offsetof(pdf_obj_name, n) + strlen(str) + 1), "pdf_obj(name)"); + obj->super.refs = 1; + obj->super.kind = PDF_NAME; + obj->super.flags = 0; + strcpy(obj->n, str); + return &obj->super; +} + +pdf_obj * +pdf_new_indirect(fz_context *ctx, pdf_document *doc, int num, int gen) +{ + pdf_obj_ref *obj; + obj = Memento_label(fz_malloc(ctx, sizeof(pdf_obj_ref)), "pdf_obj(indirect)"); + obj->super.refs = 1; + obj->super.kind = PDF_INDIRECT; + obj->super.flags = 0; + obj->doc = doc; + obj->num = num; + obj->gen = gen; + return &obj->super; +} + +#define OBJ_IS_NULL(obj) (obj == PDF_NULL) +#define OBJ_IS_BOOL(obj) (obj == PDF_TRUE || obj == PDF_FALSE) +#define OBJ_IS_NAME(obj) ((obj > PDF_FALSE && obj < PDF_LIMIT) || (obj >= PDF_LIMIT && obj->kind == PDF_NAME)) +#define OBJ_IS_INT(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_INT) +#define OBJ_IS_REAL(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_REAL) +#define OBJ_IS_NUMBER(obj) \ + (obj >= PDF_LIMIT && (obj->kind == PDF_REAL || obj->kind == PDF_INT)) +#define OBJ_IS_STRING(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_STRING) +#define OBJ_IS_ARRAY(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_ARRAY) +#define OBJ_IS_DICT(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_DICT) +#define OBJ_IS_INDIRECT(obj) \ + (obj >= PDF_LIMIT && obj->kind == PDF_INDIRECT) + +#define 
RESOLVE(obj) \ + if (OBJ_IS_INDIRECT(obj)) \ + obj = pdf_resolve_indirect_chain(ctx, obj); \ + +int pdf_is_indirect(fz_context *ctx, pdf_obj *obj) +{ + return OBJ_IS_INDIRECT(obj); +} + +int pdf_is_null(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_NULL(obj); +} + +int pdf_is_bool(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_BOOL(obj); +} + +int pdf_is_int(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_INT(obj); +} + +int pdf_is_real(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_REAL(obj); +} + +int pdf_is_number(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_NUMBER(obj); +} + +int pdf_is_string(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_STRING(obj); +} + +int pdf_is_name(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_NAME(obj); +} + +int pdf_is_array(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_ARRAY(obj); +} + +int pdf_is_dict(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return OBJ_IS_DICT(obj); +} + +int pdf_to_bool(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + return obj == PDF_TRUE; +} + +int pdf_to_int(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return 0; + if (obj->kind == PDF_INT) + return (int)NUM(obj)->u.i; + if (obj->kind == PDF_REAL) + return (int)(NUM(obj)->u.f + 0.5f); /* No roundf in MSVC */ + return 0; +} + +int64_t pdf_to_int64(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return 0; + if (obj->kind == PDF_INT) + return NUM(obj)->u.i; + if (obj->kind == PDF_REAL) + return (NUM(obj)->u.f + 0.5f); /* No roundf in MSVC */ + return 0; +} + +float pdf_to_real(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return 0; + if (obj->kind == PDF_REAL) + return NUM(obj)->u.f; + if (obj->kind == PDF_INT) + return NUM(obj)->u.i; + return 0; +} + +const char *pdf_to_name(fz_context *ctx, pdf_obj *obj) +{ + 
RESOLVE(obj); + if (obj < PDF_LIMIT) + return PDF_NAME_LIST[((intptr_t)obj)]; + if (obj->kind == PDF_NAME) + return NAME(obj)->n; + return ""; +} + +char *pdf_to_str_buf(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (OBJ_IS_STRING(obj)) + return STRING(obj)->buf; + return ""; +} + +int pdf_to_str_len(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (OBJ_IS_STRING(obj)) + return STRING(obj)->len; + return 0; +} + +const char *pdf_to_string(fz_context *ctx, pdf_obj *obj, size_t *sizep) +{ + RESOLVE(obj); + if (OBJ_IS_STRING(obj)) + { + if (sizep) + *sizep = STRING(obj)->len; + return STRING(obj)->buf; + } + if (sizep) + *sizep = 0; + return ""; +} + +const char *pdf_to_text_string(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (OBJ_IS_STRING(obj)) + { + if (!STRING(obj)->text) + STRING(obj)->text = pdf_new_utf8_from_pdf_string(ctx, STRING(obj)->buf, STRING(obj)->len); + return STRING(obj)->text; + } + return ""; +} + +void pdf_set_int(fz_context *ctx, pdf_obj *obj, int64_t i) +{ + if (OBJ_IS_INT(obj)) + NUM(obj)->u.i = i; +} + +/* for use by pdf_crypt_obj_imp to decrypt AES string in place */ +void pdf_set_str_len(fz_context *ctx, pdf_obj *obj, int newlen) +{ + RESOLVE(obj); + if (!OBJ_IS_STRING(obj)) + return; /* This should never happen */ + if (newlen < 0 || (unsigned int)newlen > STRING(obj)->len) + return; /* This should never happen */ + STRING(obj)->len = newlen; +} + +int pdf_to_num(fz_context *ctx, pdf_obj *obj) +{ + if (OBJ_IS_INDIRECT(obj)) + return REF(obj)->num; + return 0; +} + +int pdf_to_gen(fz_context *ctx, pdf_obj *obj) +{ + if (OBJ_IS_INDIRECT(obj)) + return REF(obj)->gen; + return 0; +} + +pdf_document *pdf_get_indirect_document(fz_context *ctx, pdf_obj *obj) +{ + if (OBJ_IS_INDIRECT(obj)) + return REF(obj)->doc; + return NULL; +} + +pdf_document *pdf_get_bound_document(fz_context *ctx, pdf_obj *obj) +{ + if (obj < PDF_LIMIT) + return NULL; + if (obj->kind == PDF_INDIRECT) + return REF(obj)->doc; + if (obj->kind == 
PDF_ARRAY) + return ARRAY(obj)->doc; + if (obj->kind == PDF_DICT) + return DICT(obj)->doc; + return NULL; +} + +int pdf_objcmp_resolve(fz_context *ctx, pdf_obj *a, pdf_obj *b) +{ + RESOLVE(a); + RESOLVE(b); + return pdf_objcmp(ctx, a, b); +} + +int +pdf_objcmp(fz_context *ctx, pdf_obj *a, pdf_obj *b) +{ + int i; + + if (a == b) + return 0; + + /* a or b is null, true, or false */ + if (a <= PDF_FALSE || b <= PDF_FALSE) + return 1; + + /* a is a constant name */ + if (a < PDF_LIMIT) + { + if (b < PDF_LIMIT) + return a != b; + if (b->kind != PDF_NAME) + return 1; + return strcmp(PDF_NAME_LIST[(intptr_t)a], NAME(b)->n); + } + + /* b is a constant name */ + if (b < PDF_LIMIT) + { + if (a->kind != PDF_NAME) + return 1; + return strcmp(NAME(a)->n, PDF_NAME_LIST[(intptr_t)b]); + } + + /* both a and b are allocated objects */ + if (a->kind != b->kind) + return 1; + + switch (a->kind) + { + case PDF_INT: + return NUM(a)->u.i - NUM(b)->u.i; + + case PDF_REAL: + if (NUM(a)->u.f < NUM(b)->u.f) + return -1; + if (NUM(a)->u.f > NUM(b)->u.f) + return 1; + return 0; + + case PDF_STRING: + if (STRING(a)->len < STRING(b)->len) + { + if (memcmp(STRING(a)->buf, STRING(b)->buf, STRING(a)->len) <= 0) + return -1; + return 1; + } + if (STRING(a)->len > STRING(b)->len) + { + if (memcmp(STRING(a)->buf, STRING(b)->buf, STRING(b)->len) >= 0) + return 1; + return -1; + } + return memcmp(STRING(a)->buf, STRING(b)->buf, STRING(a)->len); + + case PDF_NAME: + return strcmp(NAME(a)->n, NAME(b)->n); + + case PDF_INDIRECT: + if (REF(a)->num == REF(b)->num) + return REF(a)->gen - REF(b)->gen; + return REF(a)->num - REF(b)->num; + + case PDF_ARRAY: + if (ARRAY(a)->len != ARRAY(b)->len) + return ARRAY(a)->len - ARRAY(b)->len; + for (i = 0; i < ARRAY(a)->len; i++) + if (pdf_objcmp(ctx, ARRAY(a)->items[i], ARRAY(b)->items[i])) + return 1; + return 0; + + case PDF_DICT: + if (DICT(a)->len != DICT(b)->len) + return DICT(a)->len - DICT(b)->len; + for (i = 0; i < DICT(a)->len; i++) + { + if (pdf_objcmp(ctx, 
DICT(a)->items[i].k, DICT(b)->items[i].k)) + return 1; + if (pdf_objcmp(ctx, DICT(a)->items[i].v, DICT(b)->items[i].v)) + return 1; + } + return 0; + } + return 1; +} + +int pdf_name_eq(fz_context *ctx, pdf_obj *a, pdf_obj *b) +{ + RESOLVE(a); + RESOLVE(b); + if (a <= PDF_FALSE || b <= PDF_FALSE) + return 0; + if (a < PDF_LIMIT || b < PDF_LIMIT) + return (a == b); + if (a->kind == PDF_NAME && b->kind == PDF_NAME) + return !strcmp(NAME(a)->n, NAME(b)->n); + return 0; +} + +static char * +pdf_objkindstr(pdf_obj *obj) +{ + if (obj == PDF_NULL) + return "null"; + if (obj == PDF_TRUE || obj == PDF_FALSE) + return "boolean"; + if (obj < PDF_LIMIT) + return "name"; + switch (obj->kind) + { + case PDF_INT: return "integer"; + case PDF_REAL: return "real"; + case PDF_STRING: return "string"; + case PDF_NAME: return "name"; + case PDF_ARRAY: return "array"; + case PDF_DICT: return "dictionary"; + case PDF_INDIRECT: return "reference"; + } + return ""; +} + +pdf_obj * +pdf_new_array(fz_context *ctx, pdf_document *doc, int initialcap) +{ + pdf_obj_array *obj; + int i; + + obj = Memento_label(fz_malloc(ctx, sizeof(pdf_obj_array)), "pdf_obj(array)"); + obj->super.refs = 1; + obj->super.kind = PDF_ARRAY; + obj->super.flags = 0; + obj->doc = doc; + obj->parent_num = 0; + + obj->len = 0; + obj->cap = initialcap > 1 ? 
initialcap : 6; + + fz_try(ctx) + { + obj->items = Memento_label(fz_malloc_array(ctx, obj->cap, sizeof(pdf_obj*)), "pdf_obj(array items)"); + } + fz_catch(ctx) + { + fz_free(ctx, obj); + fz_rethrow(ctx); + } + for (i = 0; i < obj->cap; i++) + obj->items[i] = NULL; + + return &obj->super; +} + +static void +pdf_array_grow(fz_context *ctx, pdf_obj_array *obj) +{ + int i; + int new_cap = (obj->cap * 3) / 2; + + obj->items = fz_resize_array(ctx, obj->items, new_cap, sizeof(pdf_obj*)); + obj->cap = new_cap; + + for (i = obj->len ; i < obj->cap; i++) + obj->items[i] = NULL; +} + +pdf_obj * +pdf_copy_array(fz_context *ctx, pdf_obj *obj) +{ + pdf_document *doc; + pdf_obj *arr; + int i; + int n; + + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not an array (%s)", pdf_objkindstr(obj)); + + doc = ARRAY(obj)->doc; + + n = pdf_array_len(ctx, obj); + arr = pdf_new_array(ctx, doc, n); + fz_try(ctx) + for (i = 0; i < n; i++) + pdf_array_push(ctx, arr, pdf_array_get(ctx, obj, i)); + fz_catch(ctx) + { + pdf_drop_obj(ctx, arr); + fz_rethrow(ctx); + } + + return arr; +} + +int +pdf_array_len(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + return 0; + return ARRAY(obj)->len; +} + +pdf_obj * +pdf_array_get(fz_context *ctx, pdf_obj *obj, int i) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + return NULL; + if (i < 0 || i >= ARRAY(obj)->len) + return NULL; + return ARRAY(obj)->items[i]; +} + +static void prepare_object_for_alteration(fz_context *ctx, pdf_obj *obj, pdf_obj *val) +{ + pdf_document *doc, *val_doc; + int parent; + + /* + obj should be a dict or an array. We don't care about + any other types, as they aren't 'containers'. 
+ */ + if (obj < PDF_LIMIT) + return; + + switch (obj->kind) + { + case PDF_DICT: + doc = DICT(obj)->doc; + parent = DICT(obj)->parent_num; + break; + case PDF_ARRAY: + doc = ARRAY(obj)->doc; + parent = ARRAY(obj)->parent_num; + break; + default: + return; + } + + if (val) + { + val_doc = pdf_get_bound_document(ctx, val); + if (doc && val_doc && val_doc != doc) + fz_throw(ctx, FZ_ERROR_GENERIC, "container and item belong to different documents"); + } + + /* + parent_num = 0 while an object is being parsed from the file. + No further action is necessary. + */ + if (parent == 0 || doc->freeze_updates) + return; + + /* + Otherwise we need to ensure that the containing hierarchy of objects + has been moved to the incremental xref section and the newly linked + object needs to record the parent_num + */ + pdf_xref_ensure_incremental_object(ctx, doc, parent); + pdf_set_obj_parent(ctx, val, parent); +} + +void +pdf_array_put(fz_context *ctx, pdf_obj *obj, int i, pdf_obj *item) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not an array (%s)", pdf_objkindstr(obj)); + if (i == ARRAY(obj)->len) + { + pdf_array_push(ctx, obj, item); + return; + } + if (i < 0 || i > ARRAY(obj)->len) + fz_throw(ctx, FZ_ERROR_GENERIC, "index out of bounds"); + prepare_object_for_alteration(ctx, obj, item); + pdf_drop_obj(ctx, ARRAY(obj)->items[i]); + ARRAY(obj)->items[i] = pdf_keep_obj(ctx, item); +} + +void +pdf_array_put_drop(fz_context *ctx, pdf_obj *obj, int i, pdf_obj *item) +{ + fz_try(ctx) + pdf_array_put(ctx, obj, i, item); + fz_always(ctx) + pdf_drop_obj(ctx, item); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_array_push(fz_context *ctx, pdf_obj *obj, pdf_obj *item) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not an array (%s)", pdf_objkindstr(obj)); + prepare_object_for_alteration(ctx, obj, item); + if (ARRAY(obj)->len + 1 > ARRAY(obj)->cap) + pdf_array_grow(ctx, ARRAY(obj)); + ARRAY(obj)->items[ARRAY(obj)->len] 
= pdf_keep_obj(ctx, item); + ARRAY(obj)->len++; +} + +void +pdf_array_push_drop(fz_context *ctx, pdf_obj *obj, pdf_obj *item) +{ + fz_try(ctx) + pdf_array_push(ctx, obj, item); + fz_always(ctx) + pdf_drop_obj(ctx, item); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_array_insert(fz_context *ctx, pdf_obj *obj, pdf_obj *item, int i) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not an array (%s)", pdf_objkindstr(obj)); + if (i < 0 || i > ARRAY(obj)->len) + fz_throw(ctx, FZ_ERROR_GENERIC, "index out of bounds"); + prepare_object_for_alteration(ctx, obj, item); + if (ARRAY(obj)->len + 1 > ARRAY(obj)->cap) + pdf_array_grow(ctx, ARRAY(obj)); + memmove(ARRAY(obj)->items + i + 1, ARRAY(obj)->items + i, (ARRAY(obj)->len - i) * sizeof(pdf_obj*)); + ARRAY(obj)->items[i] = pdf_keep_obj(ctx, item); + ARRAY(obj)->len++; +} + +void +pdf_array_insert_drop(fz_context *ctx, pdf_obj *obj, pdf_obj *item, int i) +{ + fz_try(ctx) + pdf_array_insert(ctx, obj, item, i); + fz_always(ctx) + pdf_drop_obj(ctx, item); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_array_delete(fz_context *ctx, pdf_obj *obj, int i) +{ + RESOLVE(obj); + if (!OBJ_IS_ARRAY(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not an array (%s)", pdf_objkindstr(obj)); + if (i < 0 || i >= ARRAY(obj)->len) + fz_throw(ctx, FZ_ERROR_GENERIC, "index out of bounds"); + prepare_object_for_alteration(ctx, obj, NULL); + pdf_drop_obj(ctx, ARRAY(obj)->items[i]); + ARRAY(obj)->items[i] = 0; + ARRAY(obj)->len--; + memmove(ARRAY(obj)->items + i, ARRAY(obj)->items + i + 1, (ARRAY(obj)->len - i) * sizeof(pdf_obj*)); +} + +int +pdf_array_contains(fz_context *ctx, pdf_obj *arr, pdf_obj *obj) +{ + int i, len; + + len = pdf_array_len(ctx, arr); + for (i = 0; i < len; i++) + if (!pdf_objcmp(ctx, pdf_array_get(ctx, arr, i), obj)) + return 1; + + return 0; +} + +int +pdf_array_find(fz_context *ctx, pdf_obj *arr, pdf_obj *obj) +{ + int i, len; + + len = pdf_array_len(ctx, arr); + for (i = 0; i < len; i++) + 
if (!pdf_objcmp(ctx, pdf_array_get(ctx, arr, i), obj)) + return i; + + return -1; +} + +pdf_obj *pdf_new_rect(fz_context *ctx, pdf_document *doc, fz_rect rect) +{ + pdf_obj *arr = pdf_new_array(ctx, doc, 4); + fz_try(ctx) + { + pdf_array_push_real(ctx, arr, rect.x0); + pdf_array_push_real(ctx, arr, rect.y0); + pdf_array_push_real(ctx, arr, rect.x1); + pdf_array_push_real(ctx, arr, rect.y1); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, arr); + fz_rethrow(ctx); + } + return arr; +} + +pdf_obj *pdf_new_matrix(fz_context *ctx, pdf_document *doc, fz_matrix mtx) +{ + pdf_obj *arr = pdf_new_array(ctx, doc, 6); + fz_try(ctx) + { + pdf_array_push_real(ctx, arr, mtx.a); + pdf_array_push_real(ctx, arr, mtx.b); + pdf_array_push_real(ctx, arr, mtx.c); + pdf_array_push_real(ctx, arr, mtx.d); + pdf_array_push_real(ctx, arr, mtx.e); + pdf_array_push_real(ctx, arr, mtx.f); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, arr); + fz_rethrow(ctx); + } + return arr; +} + +/* dicts may only have names as keys! */ + +static int keyvalcmp(const void *ap, const void *bp) +{ + const struct keyval *a = ap; + const struct keyval *b = bp; + const char *an; + const char *bn; + + /* We should never get a->k == NULL or b->k == NULL. If we + * do, then they match. */ + if (a->k < PDF_LIMIT) + an = PDF_NAME_LIST[(intptr_t)a->k]; + else if (a->k >= PDF_LIMIT && a->k->kind == PDF_NAME) + an = NAME(a->k)->n; + else + return 0; + + if (b->k < PDF_LIMIT) + bn = PDF_NAME_LIST[(intptr_t)b->k]; + else if (b->k >= PDF_LIMIT && b->k->kind == PDF_NAME) + bn = NAME(b->k)->n; + else + return 0; + + return strcmp(an, bn); +} + +pdf_obj * +pdf_new_dict(fz_context *ctx, pdf_document *doc, int initialcap) +{ + pdf_obj_dict *obj; + int i; + + obj = Memento_label(fz_malloc(ctx, sizeof(pdf_obj_dict)), "pdf_obj(dict)"); + obj->super.refs = 1; + obj->super.kind = PDF_DICT; + obj->super.flags = 0; + obj->doc = doc; + obj->parent_num = 0; + + obj->len = 0; + obj->cap = initialcap > 1 ? 
initialcap : 10; + + fz_try(ctx) + { + DICT(obj)->items = Memento_label(fz_malloc_array(ctx, DICT(obj)->cap, sizeof(struct keyval)), "pdf_obj(dict items)"); + } + fz_catch(ctx) + { + fz_free(ctx, obj); + fz_rethrow(ctx); + } + for (i = 0; i < DICT(obj)->cap; i++) + { + DICT(obj)->items[i].k = NULL; + DICT(obj)->items[i].v = NULL; + } + + return &obj->super; +} + +static void +pdf_dict_grow(fz_context *ctx, pdf_obj *obj) +{ + int i; + int new_cap = (DICT(obj)->cap * 3) / 2; + + DICT(obj)->items = fz_resize_array(ctx, DICT(obj)->items, new_cap, sizeof(struct keyval)); + DICT(obj)->cap = new_cap; + + for (i = DICT(obj)->len; i < DICT(obj)->cap; i++) + { + DICT(obj)->items[i].k = NULL; + DICT(obj)->items[i].v = NULL; + } +} + +pdf_obj * +pdf_copy_dict(fz_context *ctx, pdf_obj *obj) +{ + pdf_document *doc; + pdf_obj *dict; + int i, n; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + + doc = DICT(obj)->doc; + n = pdf_dict_len(ctx, obj); + dict = pdf_new_dict(ctx, doc, n); + fz_try(ctx) + for (i = 0; i < n; i++) + pdf_dict_put(ctx, dict, pdf_dict_get_key(ctx, obj, i), pdf_dict_get_val(ctx, obj, i)); + fz_catch(ctx) + { + pdf_drop_obj(ctx, dict); + fz_rethrow(ctx); + } + + return dict; +} + +int +pdf_dict_len(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return 0; + return DICT(obj)->len; +} + +pdf_obj * +pdf_dict_get_key(fz_context *ctx, pdf_obj *obj, int i) +{ + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return NULL; + if (i < 0 || i >= DICT(obj)->len) + return NULL; + return DICT(obj)->items[i].k; +} + +pdf_obj * +pdf_dict_get_val(fz_context *ctx, pdf_obj *obj, int i) +{ + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return NULL; + if (i < 0 || i >= DICT(obj)->len) + return NULL; + return DICT(obj)->items[i].v; +} + +void +pdf_dict_put_val_null(fz_context *ctx, pdf_obj *obj, int idx) +{ + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict 
(%s)", pdf_objkindstr(obj)); + if (idx < 0 || idx >= DICT(obj)->len) + fz_throw(ctx, FZ_ERROR_GENERIC, "index out of bounds"); + + prepare_object_for_alteration(ctx, obj, NULL); + pdf_drop_obj(ctx, DICT(obj)->items[idx].v); + DICT(obj)->items[idx].v = PDF_NULL; +} + +/* Returns 0 <= i < len for key found. Returns -1-len < i <= -1 for key + * not found, but with insertion point -1-i. */ +static int +pdf_dict_finds(fz_context *ctx, pdf_obj *obj, const char *key) +{ + int len = DICT(obj)->len; + if ((obj->flags & PDF_FLAGS_SORTED) && len > 0) + { + int l = 0; + int r = len - 1; + + if (strcmp(pdf_to_name(ctx, DICT(obj)->items[r].k), key) < 0) + { + return -1 - (r+1); + } + + while (l <= r) + { + int m = (l + r) >> 1; + int c = -strcmp(pdf_to_name(ctx, DICT(obj)->items[m].k), key); + if (c < 0) + r = m - 1; + else if (c > 0) + l = m + 1; + else + return m; + } + return -1 - l; + } + + else + { + int i; + for (i = 0; i < len; i++) + if (strcmp(pdf_to_name(ctx, DICT(obj)->items[i].k), key) == 0) + return i; + + return -1 - len; + } +} + +static int +pdf_dict_find(fz_context *ctx, pdf_obj *obj, pdf_obj *key) +{ + int len = DICT(obj)->len; + if ((obj->flags & PDF_FLAGS_SORTED) && len > 0) + { + int l = 0; + int r = len - 1; + pdf_obj *k = DICT(obj)->items[r].k; + + if (k == key || (k >= PDF_LIMIT && strcmp(NAME(k)->n, PDF_NAME_LIST[(intptr_t)key]) < 0)) + { + return -1 - (r+1); + } + + while (l <= r) + { + int m = (l + r) >> 1; + int c; + + k = DICT(obj)->items[m].k; + c = (k < PDF_LIMIT ? 
(char *)key-(char *)k : -strcmp(NAME(k)->n, PDF_NAME_LIST[(intptr_t)key])); + if (c < 0) + r = m - 1; + else if (c > 0) + l = m + 1; + else + return m; + } + return -1 - l; + } + else + { + int i; + for (i = 0; i < len; i++) + { + pdf_obj *k = DICT(obj)->items[i].k; + if (k < PDF_LIMIT) + { + if (k == key) + return i; + } + else + { + if (!strcmp(PDF_NAME_LIST[(intptr_t)key], NAME(k)->n)) + return i; + } + } + + return -1 - len; + } +} + +pdf_obj * +pdf_dict_gets(fz_context *ctx, pdf_obj *obj, const char *key) +{ + int i; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return NULL; + if (!key) + return NULL; + + i = pdf_dict_finds(ctx, obj, key); + if (i >= 0) + return DICT(obj)->items[i].v; + return NULL; +} + +pdf_obj * +pdf_dict_getp(fz_context *ctx, pdf_obj *obj, const char *keys) +{ + char buf[256]; + char *k, *e; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return NULL; + if (strlen(keys)+1 > 256) + fz_throw(ctx, FZ_ERROR_GENERIC, "path too long"); + + strcpy(buf, keys); + + e = buf; + while (*e && obj) + { + k = e; + while (*e != '/' && *e != '\0') + e++; + + if (*e == '/') + { + *e = '\0'; + e++; + } + + obj = pdf_dict_gets(ctx, obj, k); + } + + return obj; +} + +pdf_obj * +pdf_dict_getl(fz_context *ctx, pdf_obj *obj, ...) 
+{ + va_list keys; + pdf_obj *key; + + va_start(keys, obj); + + while (obj != NULL && (key = va_arg(keys, pdf_obj *)) != NULL) + { + obj = pdf_dict_get(ctx, obj, key); + } + + va_end(keys); + return obj; +} + +pdf_obj * +pdf_dict_get(fz_context *ctx, pdf_obj *obj, pdf_obj *key) +{ + int i; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return NULL; + if (!OBJ_IS_NAME(key)) + return NULL; + + if (key < PDF_LIMIT) + i = pdf_dict_find(ctx, obj, key); + else + i = pdf_dict_finds(ctx, obj, pdf_to_name(ctx, key)); + if (i >= 0) + return DICT(obj)->items[i].v; + return NULL; +} + +pdf_obj * +pdf_dict_getsa(fz_context *ctx, pdf_obj *obj, const char *key, const char *abbrev) +{ + pdf_obj *v; + v = pdf_dict_gets(ctx, obj, key); + if (v) + return v; + return pdf_dict_gets(ctx, obj, abbrev); +} + +pdf_obj * +pdf_dict_geta(fz_context *ctx, pdf_obj *obj, pdf_obj *key, pdf_obj *abbrev) +{ + pdf_obj *v; + v = pdf_dict_get(ctx, obj, key); + if (v) + return v; + return pdf_dict_get(ctx, obj, abbrev); +} + +static void +pdf_dict_get_put(fz_context *ctx, pdf_obj *obj, pdf_obj *key, pdf_obj *val, pdf_obj **old_val) +{ + int i; + + if (old_val) + *old_val = NULL; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + if (!OBJ_IS_NAME(key)) + fz_throw(ctx, FZ_ERROR_GENERIC, "key is not a name (%s)", pdf_objkindstr(obj)); + + if (DICT(obj)->len > 100 && !(obj->flags & PDF_FLAGS_SORTED)) + pdf_sort_dict(ctx, obj); + + if (key < PDF_LIMIT) + i = pdf_dict_find(ctx, obj, key); + else + i = pdf_dict_finds(ctx, obj, pdf_to_name(ctx, key)); + + prepare_object_for_alteration(ctx, obj, val); + + if (i >= 0 && i < DICT(obj)->len) + { + if (DICT(obj)->items[i].v != val) + { + pdf_obj *d = DICT(obj)->items[i].v; + DICT(obj)->items[i].v = pdf_keep_obj(ctx, val); + if (old_val) + *old_val = d; + else + pdf_drop_obj(ctx, d); + } + } + else + { + if (DICT(obj)->len + 1 > DICT(obj)->cap) + pdf_dict_grow(ctx, obj); + + i = -1-i; + if 
((obj->flags & PDF_FLAGS_SORTED) && DICT(obj)->len > 0) + memmove(&DICT(obj)->items[i + 1], + &DICT(obj)->items[i], + (DICT(obj)->len - i) * sizeof(struct keyval)); + + DICT(obj)->items[i].k = pdf_keep_obj(ctx, key); + DICT(obj)->items[i].v = pdf_keep_obj(ctx, val); + DICT(obj)->len ++; + } +} + +void +pdf_dict_put(fz_context *ctx, pdf_obj *obj, pdf_obj *key, pdf_obj *val) +{ + pdf_dict_get_put(ctx, obj, key, val, NULL); +} + +void +pdf_dict_put_drop(fz_context *ctx, pdf_obj *obj, pdf_obj *key, pdf_obj *val) +{ + fz_try(ctx) + pdf_dict_get_put(ctx, obj, key, val, NULL); + fz_always(ctx) + pdf_drop_obj(ctx, val); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_dict_get_put_drop(fz_context *ctx, pdf_obj *obj, pdf_obj *key, pdf_obj *val, pdf_obj **old_val) +{ + fz_try(ctx) + pdf_dict_get_put(ctx, obj, key, val, old_val); + fz_always(ctx) + pdf_drop_obj(ctx, val); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_dict_puts(fz_context *ctx, pdf_obj *obj, const char *key, pdf_obj *val) +{ + pdf_obj *keyobj; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + + keyobj = pdf_new_name(ctx, key); + + fz_try(ctx) + pdf_dict_put(ctx, obj, keyobj, val); + fz_always(ctx) + pdf_drop_obj(ctx, keyobj); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_dict_puts_drop(fz_context *ctx, pdf_obj *obj, const char *key, pdf_obj *val) +{ + pdf_obj *keyobj; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + + keyobj = pdf_new_name(ctx, key); + + fz_var(keyobj); + + fz_try(ctx) + pdf_dict_put(ctx, obj, keyobj, val); + fz_always(ctx) + { + pdf_drop_obj(ctx, keyobj); + pdf_drop_obj(ctx, val); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} + +void +pdf_dict_putp(fz_context *ctx, pdf_obj *obj, const char *keys, pdf_obj *val) +{ + pdf_document *doc; + char buf[256]; + char *k, *e; + pdf_obj *cobj = NULL; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + 
fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + if (strlen(keys)+1 > 256) + fz_throw(ctx, FZ_ERROR_GENERIC, "buffer overflow in pdf_dict_putp"); + + doc = DICT(obj)->doc; + strcpy(buf, keys); + + e = buf; + while (*e) + { + k = e; + while (*e != '/' && *e != '\0') + e++; + + if (*e == '/') + { + *e = '\0'; + e++; + } + + if (*e) + { + /* Not the last key in the key path. Create subdict if not already there. */ + cobj = pdf_dict_gets(ctx, obj, k); + if (cobj == NULL) + { + cobj = pdf_new_dict(ctx, doc, 1); + fz_try(ctx) + pdf_dict_puts(ctx, obj, k, cobj); + fz_always(ctx) + pdf_drop_obj(ctx, cobj); + fz_catch(ctx) + fz_rethrow(ctx); + } + /* Move to subdict */ + obj = cobj; + } + else + { + /* Last key. Use it to store the value */ + /* Use val = NULL to request delete */ + if (val) + pdf_dict_puts(ctx, obj, k, val); + else + pdf_dict_dels(ctx, obj, k); + } + } +} + +void +pdf_dict_putp_drop(fz_context *ctx, pdf_obj *obj, const char *keys, pdf_obj *val) +{ + fz_try(ctx) + pdf_dict_putp(ctx, obj, keys, val); + fz_always(ctx) + pdf_drop_obj(ctx, val); + fz_catch(ctx) + fz_rethrow(ctx); +} + +static void +pdf_dict_vputl(fz_context *ctx, pdf_obj *obj, pdf_obj *val, va_list keys) +{ + pdf_obj *key; + pdf_obj *next_key; + pdf_obj *next_obj; + pdf_document *doc; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + + doc = DICT(obj)->doc; + + key = va_arg(keys, pdf_obj *); + if (key == NULL) + return; + + while ((next_key = va_arg(keys, pdf_obj *)) != NULL) + { + next_obj = pdf_dict_get(ctx, obj, key); + if (next_obj == NULL) + goto new_obj; + obj = next_obj; + key = next_key; + } + + pdf_dict_put(ctx, obj, key, val); + return; + +new_obj: + /* We have to create entries */ + do + { + next_obj = pdf_new_dict(ctx, doc, 1); + pdf_dict_put_drop(ctx, obj, key, next_obj); + obj = next_obj; + key = next_key; + } + while ((next_key = va_arg(keys, pdf_obj *)) != NULL); + + pdf_dict_put(ctx, 
obj, key, val); + return; +} + +void +pdf_dict_putl(fz_context *ctx, pdf_obj *obj, pdf_obj *val, ...) +{ + va_list keys; + va_start(keys, val); + + fz_try(ctx) + pdf_dict_vputl(ctx, obj, val, keys); + fz_always(ctx) + va_end(keys); + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_dict_putl_drop(fz_context *ctx, pdf_obj *obj, pdf_obj *val, ...) +{ + va_list keys; + va_start(keys, val); + + fz_try(ctx) + pdf_dict_vputl(ctx, obj, val, keys); + fz_always(ctx) + { + pdf_drop_obj(ctx, val); + va_end(keys); + } + fz_catch(ctx) + fz_rethrow(ctx); +} + +void +pdf_dict_dels(fz_context *ctx, pdf_obj *obj, const char *key) +{ + int i; + + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + fz_throw(ctx, FZ_ERROR_GENERIC, "not a dict (%s)", pdf_objkindstr(obj)); + if (!key) + fz_throw(ctx, FZ_ERROR_GENERIC, "key is null"); + + prepare_object_for_alteration(ctx, obj, NULL); + i = pdf_dict_finds(ctx, obj, key); + if (i >= 0) + { + pdf_drop_obj(ctx, DICT(obj)->items[i].k); + pdf_drop_obj(ctx, DICT(obj)->items[i].v); + obj->flags &= ~PDF_FLAGS_SORTED; + DICT(obj)->items[i] = DICT(obj)->items[DICT(obj)->len-1]; + DICT(obj)->len --; + } +} + +void +pdf_dict_del(fz_context *ctx, pdf_obj *obj, pdf_obj *key) +{ + if (!OBJ_IS_NAME(key)) + fz_throw(ctx, FZ_ERROR_GENERIC, "key is not a name (%s)", pdf_objkindstr(key)); + + if (key < PDF_LIMIT) + pdf_dict_dels(ctx, obj, PDF_NAME_LIST[(intptr_t)key]); + else + pdf_dict_dels(ctx, obj, NAME(key)->n); +} + +void +pdf_sort_dict(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (!OBJ_IS_DICT(obj)) + return; + if (!(obj->flags & PDF_FLAGS_SORTED)) + { + qsort(DICT(obj)->items, DICT(obj)->len, sizeof(struct keyval), keyvalcmp); + obj->flags |= PDF_FLAGS_SORTED; + } +} + +pdf_obj * +pdf_deep_copy_obj(fz_context *ctx, pdf_obj *obj) +{ + if (obj < PDF_LIMIT) + { + return obj; + } + if (obj->kind == PDF_DICT) + { + pdf_document *doc = DICT(obj)->doc; + int n = pdf_dict_len(ctx, obj); + pdf_obj *dict = pdf_new_dict(ctx, doc, n); + int i; + + fz_try(ctx) + 
for (i = 0; i < n; i++) + { + pdf_obj *obj_copy = pdf_deep_copy_obj(ctx, pdf_dict_get_val(ctx, obj, i)); + pdf_dict_put_drop(ctx, dict, pdf_dict_get_key(ctx, obj, i), obj_copy); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, dict); + fz_rethrow(ctx); + } + + return dict; + } + else if (obj->kind == PDF_ARRAY) + { + pdf_document *doc = ARRAY(obj)->doc; + int n = pdf_array_len(ctx, obj); + pdf_obj *arr = pdf_new_array(ctx, doc, n); + int i; + + fz_try(ctx) + for (i = 0; i < n; i++) + { + pdf_obj *obj_copy = pdf_deep_copy_obj(ctx, pdf_array_get(ctx, obj, i)); + pdf_array_push_drop(ctx, arr, obj_copy); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, arr); + fz_rethrow(ctx); + } + + return arr; + } + else + { + return pdf_keep_obj(ctx, obj); + } +} + +int +pdf_obj_marked(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return 0; + return !!(obj->flags & PDF_FLAGS_MARKED); +} + +int +pdf_mark_obj(fz_context *ctx, pdf_obj *obj) +{ + int marked; + RESOLVE(obj); + if (obj < PDF_LIMIT) + return 0; + marked = !!(obj->flags & PDF_FLAGS_MARKED); + obj->flags |= PDF_FLAGS_MARKED; + return marked; +} + +void +pdf_unmark_obj(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return; + obj->flags &= ~PDF_FLAGS_MARKED; +} + +void +pdf_set_obj_memo(fz_context *ctx, pdf_obj *obj, int bit, int memo) +{ + if (obj < PDF_LIMIT) + return; + bit <<= 1; + obj->flags |= PDF_FLAGS_MEMO_BASE << bit; + if (memo) + obj->flags |= PDF_FLAGS_MEMO_BASE_BOOL << bit; + else + obj->flags &= ~(PDF_FLAGS_MEMO_BASE_BOOL << bit); +} + +int +pdf_obj_memo(fz_context *ctx, pdf_obj *obj, int bit, int *memo) +{ + if (obj < PDF_LIMIT) + return 0; + bit <<= 1; + if (!(obj->flags & (PDF_FLAGS_MEMO_BASE<flags & (PDF_FLAGS_MEMO_BASE_BOOL<flags & PDF_FLAGS_DIRTY); +} + +void pdf_dirty_obj(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); + if (obj < PDF_LIMIT) + return; + obj->flags |= PDF_FLAGS_DIRTY; +} + +void pdf_clean_obj(fz_context *ctx, pdf_obj *obj) +{ + RESOLVE(obj); 
+ if (obj < PDF_LIMIT) + return; + obj->flags &= ~PDF_FLAGS_DIRTY; +} + +static void +pdf_drop_array(fz_context *ctx, pdf_obj *obj) +{ + int i; + + for (i = 0; i < DICT(obj)->len; i++) + pdf_drop_obj(ctx, ARRAY(obj)->items[i]); + + fz_free(ctx, DICT(obj)->items); + fz_free(ctx, obj); +} + +static void +pdf_drop_dict(fz_context *ctx, pdf_obj *obj) +{ + int i; + + for (i = 0; i < DICT(obj)->len; i++) { + pdf_drop_obj(ctx, DICT(obj)->items[i].k); + pdf_drop_obj(ctx, DICT(obj)->items[i].v); + } + + fz_free(ctx, DICT(obj)->items); + fz_free(ctx, obj); +} + +pdf_obj * +pdf_keep_obj(fz_context *ctx, pdf_obj *obj) +{ + if (obj >= PDF_LIMIT) + return fz_keep_imp16(ctx, obj, &obj->refs); + return obj; +} + +void +pdf_drop_obj(fz_context *ctx, pdf_obj *obj) +{ + if (obj >= PDF_LIMIT) + { + if (fz_drop_imp16(ctx, obj, &obj->refs)) + { + if (obj->kind == PDF_ARRAY) + pdf_drop_array(ctx, obj); + else if (obj->kind == PDF_DICT) + pdf_drop_dict(ctx, obj); + else if (obj->kind == PDF_STRING) + { + fz_free(ctx, STRING(obj)->text); + fz_free(ctx, obj); + } + else + fz_free(ctx, obj); + } + } +} + +void +pdf_set_obj_parent(fz_context *ctx, pdf_obj *obj, int num) +{ + int n, i; + + if (obj < PDF_LIMIT) + return; + + switch (obj->kind) + { + case PDF_ARRAY: + ARRAY(obj)->parent_num = num; + n = pdf_array_len(ctx, obj); + for (i = 0; i < n; i++) + pdf_set_obj_parent(ctx, pdf_array_get(ctx, obj, i), num); + break; + case PDF_DICT: + DICT(obj)->parent_num = num; + n = pdf_dict_len(ctx, obj); + for (i = 0; i < n; i++) + pdf_set_obj_parent(ctx, pdf_dict_get_val(ctx, obj, i), num); + break; + } +} + +int pdf_obj_parent_num(fz_context *ctx, pdf_obj *obj) +{ + if (obj < PDF_LIMIT) + return 0; + + switch (obj->kind) + { + case PDF_INDIRECT: + return REF(obj)->num; + case PDF_ARRAY: + return ARRAY(obj)->parent_num; + case PDF_DICT: + return DICT(obj)->parent_num; + default: + return 0; + } +} + +/* Pretty printing objects */ + +struct fmt +{ + char *buf; + int cap; + int len; + int indent; + int 
tight; + int col; + int sep; + int last; + pdf_crypt *crypt; + int num; + int gen; +}; + +static void fmt_obj(fz_context *ctx, struct fmt *fmt, pdf_obj *obj); + +static inline int iswhite(int ch) +{ + return + ch == '\000' || + ch == '\011' || + ch == '\012' || + ch == '\014' || + ch == '\015' || + ch == '\040'; +} + +static inline int isdelim(int ch) +{ + return + ch == '(' || ch == ')' || + ch == '<' || ch == '>' || + ch == '[' || ch == ']' || + ch == '{' || ch == '}' || + ch == '/' || + ch == '%'; +} + +static inline void fmt_putc(fz_context *ctx, struct fmt *fmt, int c) +{ + if (fmt->sep && !isdelim(fmt->last) && !isdelim(c)) { + fmt->sep = 0; + fmt_putc(ctx, fmt, ' '); + } + fmt->sep = 0; + + if (fmt->buf && fmt->len < fmt->cap) + fmt->buf[fmt->len] = c; + + if (c == '\n') + fmt->col = 0; + else + fmt->col ++; + + fmt->len ++; + + fmt->last = c; +} + +static inline void fmt_indent(fz_context *ctx, struct fmt *fmt) +{ + int i = fmt->indent; + while (i--) { + fmt_putc(ctx, fmt, ' '); + fmt_putc(ctx, fmt, ' '); + } +} + +static inline void fmt_puts(fz_context *ctx, struct fmt *fmt, char *s) +{ + while (*s) + fmt_putc(ctx, fmt, *s++); +} + +static inline void fmt_sep(fz_context *ctx, struct fmt *fmt) +{ + fmt->sep = 1; +} + +static void fmt_str_out(fz_context *ctx, void *fmt_, const unsigned char *s, int n) +{ + struct fmt *fmt = (struct fmt *)fmt_; + int i, c; + + for (i = 0; i < n; i++) + { + c = (unsigned char)s[i]; + if (c == '\n') + fmt_puts(ctx, fmt, "\\n"); + else if (c == '\r') + fmt_puts(ctx, fmt, "\\r"); + else if (c == '\t') + fmt_puts(ctx, fmt, "\\t"); + else if (c == '\b') + fmt_puts(ctx, fmt, "\\b"); + else if (c == '\f') + fmt_puts(ctx, fmt, "\\f"); + else if (c == '(') + fmt_puts(ctx, fmt, "\\("); + else if (c == ')') + fmt_puts(ctx, fmt, "\\)"); + else if (c == '\\') + fmt_puts(ctx, fmt, "\\\\"); + else if (c < 32 || c >= 127) { + fmt_putc(ctx, fmt, '\\'); + fmt_putc(ctx, fmt, '0' + ((c / 64) & 7)); + fmt_putc(ctx, fmt, '0' + ((c / 8) & 7)); + 
fmt_putc(ctx, fmt, '0' + ((c) & 7)); + } + else + fmt_putc(ctx, fmt, c); + } +} + +static void fmt_str(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + unsigned char *s = (unsigned char *)pdf_to_str_buf(ctx, obj); + int n = pdf_to_str_len(ctx, obj); + + fmt_putc(ctx, fmt, '('); + pdf_encrypt_data(ctx, fmt->crypt, fmt->num, fmt->gen, fmt_str_out, fmt, s, n); + fmt_putc(ctx, fmt, ')'); +} + +static void fmt_hex_out(fz_context *ctx, void *arg, const unsigned char *s, int n) +{ + struct fmt *fmt = (struct fmt *)arg; + int i, b, c; + + for (i = 0; i < n; i++) { + b = (unsigned char) s[i]; + c = (b >> 4) & 0x0f; + fmt_putc(ctx, fmt, c < 0xA ? c + '0' : c + 'A' - 0xA); + c = (b) & 0x0f; + fmt_putc(ctx, fmt, c < 0xA ? c + '0' : c + 'A' - 0xA); + } +} + +static void fmt_hex(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + unsigned char *s = (unsigned char *)pdf_to_str_buf(ctx, obj); + int n = pdf_to_str_len(ctx, obj); + + fmt_putc(ctx, fmt, '<'); + pdf_encrypt_data(ctx, fmt->crypt, fmt->num, fmt->gen, fmt_hex_out, fmt, s, n); + fmt_putc(ctx, fmt, '>'); +} + +static void fmt_name(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + unsigned char *s = (unsigned char *) pdf_to_name(ctx, obj); + int i, c; + + fmt_putc(ctx, fmt, '/'); + + for (i = 0; s[i]; i++) + { + if (isdelim(s[i]) || iswhite(s[i]) || + s[i] == '#' || s[i] < 32 || s[i] >= 127) + { + fmt_putc(ctx, fmt, '#'); + c = (s[i] >> 4) & 0xf; + fmt_putc(ctx, fmt, c < 0xA ? c + '0' : c + 'A' - 0xA); + c = s[i] & 0xf; + fmt_putc(ctx, fmt, c < 0xA ? 
c + '0' : c + 'A' - 0xA); + } + else + { + fmt_putc(ctx, fmt, s[i]); + } + } +} + +static void fmt_array(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + int i, n; + + n = pdf_array_len(ctx, obj); + if (fmt->tight) { + fmt_putc(ctx, fmt, '['); + for (i = 0; i < n; i++) { + fmt_obj(ctx, fmt, pdf_array_get(ctx, obj, i)); + fmt_sep(ctx, fmt); + } + fmt_putc(ctx, fmt, ']'); + } + else { + fmt_putc(ctx, fmt, '['); + fmt->indent ++; + for (i = 0; i < n; i++) { + if (fmt->col > 60) { + fmt_putc(ctx, fmt, '\n'); + fmt_indent(ctx, fmt); + } else { + fmt_putc(ctx, fmt, ' '); + } + fmt_obj(ctx, fmt, pdf_array_get(ctx, obj, i)); + } + fmt->indent --; + fmt_putc(ctx, fmt, ' '); + fmt_putc(ctx, fmt, ']'); + fmt_sep(ctx, fmt); + } +} + +static void fmt_dict(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + int i, n; + pdf_obj *key, *val; + + n = pdf_dict_len(ctx, obj); + if (fmt->tight) { + fmt_puts(ctx, fmt, "<<"); + for (i = 0; i < n; i++) { + fmt_obj(ctx, fmt, pdf_dict_get_key(ctx, obj, i)); + fmt_sep(ctx, fmt); + fmt_obj(ctx, fmt, pdf_dict_get_val(ctx, obj, i)); + fmt_sep(ctx, fmt); + } + fmt_puts(ctx, fmt, ">>"); + } + else { + fmt_puts(ctx, fmt, "<<\n"); + fmt->indent ++; + for (i = 0; i < n; i++) { + key = pdf_dict_get_key(ctx, obj, i); + val = pdf_dict_get_val(ctx, obj, i); + fmt_indent(ctx, fmt); + fmt_obj(ctx, fmt, key); + fmt_putc(ctx, fmt, ' '); + if (!pdf_is_indirect(ctx, val) && pdf_is_array(ctx, val)) + fmt->indent ++; + fmt_obj(ctx, fmt, val); + fmt_putc(ctx, fmt, '\n'); + if (!pdf_is_indirect(ctx, val) && pdf_is_array(ctx, val)) + fmt->indent --; + } + fmt->indent --; + fmt_indent(ctx, fmt); + fmt_puts(ctx, fmt, ">>"); + } +} + +static void count_encrypted_data(fz_context *ctx, void *arg, const unsigned char *str, int len) +{ + int *encrypted_len = (int *)arg; + int added = 0; + int i; + unsigned char c; + + for (i = 0; i < len; i++) { + c = (unsigned char)str[i]; + if (c != 0 && strchr("()\\\n\r\t\b\f", c)) + added ++; + else if (c < 32 || c >= 127) + 
added += 3; + } + *encrypted_len += added; +} + +static void fmt_obj(fz_context *ctx, struct fmt *fmt, pdf_obj *obj) +{ + char buf[256]; + + if (obj == PDF_NULL) + fmt_puts(ctx, fmt, "null"); + else if (obj == PDF_TRUE) + fmt_puts(ctx, fmt, "true"); + else if (obj == PDF_FALSE) + fmt_puts(ctx, fmt, "false"); + else if (pdf_is_indirect(ctx, obj)) + { + fz_snprintf(buf, sizeof buf, "%d %d R", pdf_to_num(ctx, obj), pdf_to_gen(ctx, obj)); + fmt_puts(ctx, fmt, buf); + } + else if (pdf_is_int(ctx, obj)) + { + fz_snprintf(buf, sizeof buf, "%d", pdf_to_int(ctx, obj)); + fmt_puts(ctx, fmt, buf); + } + else if (pdf_is_real(ctx, obj)) + { + fz_snprintf(buf, sizeof buf, "%g", pdf_to_real(ctx, obj)); + fmt_puts(ctx, fmt, buf); + } + else if (pdf_is_string(ctx, obj)) + { + unsigned char *str = (unsigned char *)pdf_to_str_buf(ctx, obj); + int len = pdf_to_str_len(ctx, obj); + int encoded_len = 0; + + pdf_encrypt_data(ctx, fmt->crypt, fmt->num, fmt->gen, count_encrypted_data, &encoded_len, str, len); + if (encoded_len < 2*len) + fmt_str(ctx, fmt, obj); + else + fmt_hex(ctx, fmt, obj); + } + else if (pdf_is_name(ctx, obj)) + fmt_name(ctx, fmt, obj); + else if (pdf_is_array(ctx, obj)) + fmt_array(ctx, fmt, obj); + else if (pdf_is_dict(ctx, obj)) + fmt_dict(ctx, fmt, obj); + else + fmt_puts(ctx, fmt, ""); +} + +int +pdf_sprint_encrypted_obj(fz_context *ctx, char *s, int n, pdf_obj *obj, int tight, pdf_crypt *crypt, int num, int gen) +{ + struct fmt fmt; + + fmt.indent = 0; + fmt.col = 0; + fmt.sep = 0; + fmt.last = 0; + + fmt.tight = tight; + fmt.buf = s; + fmt.cap = n; + fmt.len = 0; + fmt.crypt = crypt; + fmt.num = num; + fmt.gen = gen; + fmt_obj(ctx, &fmt, obj); + + if (fmt.buf && fmt.len < fmt.cap) + fmt.buf[fmt.len] = '\0'; + + return fmt.len; +} + +int +pdf_sprint_obj(fz_context *ctx, char *s, int n, pdf_obj *obj, int tight) +{ + return pdf_sprint_encrypted_obj(ctx, s, n, obj, tight, NULL, 0, 0); +} + +int pdf_print_encrypted_obj(fz_context *ctx, fz_output *out, pdf_obj *obj, 
int tight, pdf_crypt *crypt, int num, int gen) +{ + char buf[1024]; + char *ptr; + int n; + + n = pdf_sprint_encrypted_obj(ctx, buf, sizeof buf, obj, tight, crypt, num, gen); + if (n <= sizeof buf) + { + fz_write_data(ctx, out, buf, n); + } + else + { + ptr = fz_malloc(ctx, n + 1); + pdf_sprint_encrypted_obj(ctx, ptr, n + 1, obj, tight, crypt, num, gen); + fz_write_data(ctx, out, ptr, n); + fz_free(ctx, ptr); + } + return n; +} + +int pdf_print_obj(fz_context *ctx, fz_output *out, pdf_obj *obj, int tight) +{ + return pdf_print_encrypted_obj(ctx, out, obj, tight, NULL, 0, 0); +} + +static int pdf_debug_encrypted_obj(fz_context *ctx, pdf_obj *obj, int tight, pdf_crypt *crypt, int num, int gen) +{ + char buf[1024]; + char *ptr; + int n; + + n = pdf_sprint_obj(ctx, NULL, 0, obj, tight); + if ((n + 1) < sizeof buf) + { + pdf_sprint_encrypted_obj(ctx, buf, sizeof buf, obj, tight, crypt, num, gen); + fwrite(buf, 1, n, stdout); + } + else + { + ptr = fz_malloc(ctx, n + 1); + pdf_sprint_encrypted_obj(ctx, ptr, n + 1, obj, tight, crypt, num, gen); + fwrite(ptr, 1, n, stdout); + fz_free(ctx, ptr); + } + return n; +} + +void pdf_debug_obj(fz_context *ctx, pdf_obj *obj) +{ + pdf_debug_encrypted_obj(ctx, obj, 0, NULL, 0, 0); + putchar('\n'); +} + +int pdf_obj_refs(fz_context *ctx, pdf_obj *obj) +{ + if (obj < PDF_LIMIT) + return 0; + return obj->refs; +} + +/* Convenience functions */ + +pdf_obj * +pdf_dict_get_inheritable(fz_context *ctx, pdf_obj *node, pdf_obj *key) +{ + pdf_obj *node2 = node; + pdf_obj *val = NULL; + + fz_var(node); + fz_try(ctx) + { + do + { + val = pdf_dict_get(ctx, node, key); + if (val) + break; + if (pdf_mark_obj(ctx, node)) + fz_throw(ctx, FZ_ERROR_GENERIC, "cycle in tree (parents)"); + node = pdf_dict_get(ctx, node, PDF_NAME(Parent)); + } + while (node); + } + fz_always(ctx) + { + do + { + pdf_unmark_obj(ctx, node2); + if (node2 == node) + break; + node2 = pdf_dict_get(ctx, node2, PDF_NAME(Parent)); + } + while (node2); + } + fz_catch(ctx) + { + 
fz_rethrow(ctx); + } + + return val; +} + +void pdf_dict_put_bool(fz_context *ctx, pdf_obj *dict, pdf_obj *key, int x) +{ + pdf_dict_put(ctx, dict, key, x ? PDF_TRUE : PDF_FALSE); +} + +void pdf_dict_put_int(fz_context *ctx, pdf_obj *dict, pdf_obj *key, int64_t x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_int(ctx, x)); +} + +void pdf_dict_put_real(fz_context *ctx, pdf_obj *dict, pdf_obj *key, double x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_real(ctx, x)); +} + +void pdf_dict_put_name(fz_context *ctx, pdf_obj *dict, pdf_obj *key, const char *x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_name(ctx, x)); +} + +void pdf_dict_put_string(fz_context *ctx, pdf_obj *dict, pdf_obj *key, const char *x, size_t n) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_string(ctx, x, n)); +} + +void pdf_dict_put_text_string(fz_context *ctx, pdf_obj *dict, pdf_obj *key, const char *x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_text_string(ctx, x)); +} + +void pdf_dict_put_rect(fz_context *ctx, pdf_obj *dict, pdf_obj *key, fz_rect x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_rect(ctx, NULL, x)); +} + +void pdf_dict_put_matrix(fz_context *ctx, pdf_obj *dict, pdf_obj *key, fz_matrix x) +{ + pdf_dict_put_drop(ctx, dict, key, pdf_new_matrix(ctx, NULL, x)); +} + +pdf_obj *pdf_dict_put_array(fz_context *ctx, pdf_obj *dict, pdf_obj *key, int initial) +{ + pdf_obj *obj = pdf_new_array(ctx, pdf_get_bound_document(ctx, dict), initial); + pdf_dict_put_drop(ctx, dict, key, obj); + return obj; +} + +pdf_obj *pdf_dict_put_dict(fz_context *ctx, pdf_obj *dict, pdf_obj *key, int initial) +{ + pdf_obj *obj = pdf_new_dict(ctx, pdf_get_bound_document(ctx, dict), initial); + pdf_dict_put_drop(ctx, dict, key, obj); + return obj; +} + +void pdf_array_push_bool(fz_context *ctx, pdf_obj *array, int x) +{ + pdf_array_push(ctx, array, x ? 
PDF_TRUE : PDF_FALSE); +} + +void pdf_array_push_int(fz_context *ctx, pdf_obj *array, int64_t x) +{ + pdf_array_push_drop(ctx, array, pdf_new_int(ctx, x)); +} + +void pdf_array_push_real(fz_context *ctx, pdf_obj *array, double x) +{ + pdf_array_push_drop(ctx, array, pdf_new_real(ctx, x)); +} + +void pdf_array_push_name(fz_context *ctx, pdf_obj *array, const char *x) +{ + pdf_array_push_drop(ctx, array, pdf_new_name(ctx, x)); +} + +void pdf_array_push_string(fz_context *ctx, pdf_obj *array, const char *x, size_t n) +{ + pdf_array_push_drop(ctx, array, pdf_new_string(ctx, x, n)); +} + +void pdf_array_push_text_string(fz_context *ctx, pdf_obj *array, const char *x) +{ + pdf_array_push_drop(ctx, array, pdf_new_text_string(ctx, x)); +} + +pdf_obj *pdf_array_push_array(fz_context *ctx, pdf_obj *array, int initial) +{ + pdf_obj *obj = pdf_new_array(ctx, pdf_get_bound_document(ctx, array), initial); + pdf_array_push_drop(ctx, array, obj); + return obj; +} + +pdf_obj *pdf_array_push_dict(fz_context *ctx, pdf_obj *array, int initial) +{ + pdf_obj *obj = pdf_new_dict(ctx, pdf_get_bound_document(ctx, array), initial); + pdf_array_push_drop(ctx, array, obj); + return obj; +} + +int pdf_dict_get_bool(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_bool(ctx, pdf_dict_get(ctx, dict, key)); +} + +int pdf_dict_get_int(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_int(ctx, pdf_dict_get(ctx, dict, key)); +} + +float pdf_dict_get_real(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_real(ctx, pdf_dict_get(ctx, dict, key)); +} + +const char *pdf_dict_get_name(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_name(ctx, pdf_dict_get(ctx, dict, key)); +} + +const char *pdf_dict_get_string(fz_context *ctx, pdf_obj *dict, pdf_obj *key, size_t *sizep) +{ + return pdf_to_string(ctx, pdf_dict_get(ctx, dict, key), sizep); +} + +const char *pdf_dict_get_text_string(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return 
pdf_to_text_string(ctx, pdf_dict_get(ctx, dict, key)); +} + +fz_rect pdf_dict_get_rect(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_rect(ctx, pdf_dict_get(ctx, dict, key)); +} + +fz_matrix pdf_dict_get_matrix(fz_context *ctx, pdf_obj *dict, pdf_obj *key) +{ + return pdf_to_matrix(ctx, pdf_dict_get(ctx, dict, key)); +} + +int pdf_array_get_bool(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_bool(ctx, pdf_array_get(ctx, array, index)); +} + +int pdf_array_get_int(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_int(ctx, pdf_array_get(ctx, array, index)); +} + +float pdf_array_get_real(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_real(ctx, pdf_array_get(ctx, array, index)); +} + +const char *pdf_array_get_string(fz_context *ctx, pdf_obj *array, int index, size_t *sizep) +{ + return pdf_to_string(ctx, pdf_array_get(ctx, array, index), sizep); +} + +const char *pdf_array_get_text_string(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_text_string(ctx, pdf_array_get(ctx, array, index)); +} + +fz_rect pdf_array_get_rect(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_rect(ctx, pdf_array_get(ctx, array, index)); +} + +fz_matrix pdf_array_get_matrix(fz_context *ctx, pdf_obj *array, int index) +{ + return pdf_to_matrix(ctx, pdf_array_get(ctx, array, index)); +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-parse.c k2pdfopt-2.51+ds/mupdf_mod/pdf-parse.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-parse.c 2017-02-25 05:42:13.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-parse.c 2018-11-21 00:38:13.000000000 +0000 @@ -1,43 +1,48 @@ +#include "mupdf/fitz.h" #include "mupdf/pdf.h" -fz_rect * -pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r) +#include + +fz_rect +pdf_to_rect(fz_context *ctx, pdf_obj *array) { if (!pdf_is_array(ctx, array)) - *r = fz_empty_rect; + return fz_empty_rect; else { - float a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0)); - float b = pdf_to_real(ctx, 
pdf_array_get(ctx, array, 1)); - float c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2)); - float d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3)); - r->x0 = fz_min(a, c); - r->y0 = fz_min(b, d); - r->x1 = fz_max(a, c); - r->y1 = fz_max(b, d); + float a = pdf_array_get_real(ctx, array, 0); + float b = pdf_array_get_real(ctx, array, 1); + float c = pdf_array_get_real(ctx, array, 2); + float d = pdf_array_get_real(ctx, array, 3); + fz_rect r; + r.x0 = fz_min(a, c); + r.y0 = fz_min(b, d); + r.x1 = fz_max(a, c); + r.y1 = fz_max(b, d); + return r; } - return r; } -fz_matrix * -pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m) +fz_matrix +pdf_to_matrix(fz_context *ctx, pdf_obj *array) { if (!pdf_is_array(ctx, array)) - *m = fz_identity; + return fz_identity; else { - m->a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0)); - m->b = pdf_to_real(ctx, pdf_array_get(ctx, array, 1)); - m->c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2)); - m->d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3)); - m->e = pdf_to_real(ctx, pdf_array_get(ctx, array, 4)); - m->f = pdf_to_real(ctx, pdf_array_get(ctx, array, 5)); + fz_matrix m; + m.a = pdf_array_get_real(ctx, array, 0); + m.b = pdf_array_get_real(ctx, array, 1); + m.c = pdf_array_get_real(ctx, array, 2); + m.d = pdf_array_get_real(ctx, array, 3); + m.e = pdf_array_get_real(ctx, array, 4); + m.f = pdf_array_get_real(ctx, array, 5); + return m; } - return m; } static int -rune_from_utf16be(int *out, unsigned char *s, unsigned char *end) +rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end) { if (s + 2 <= end) { @@ -51,12 +56,12 @@ *out = a; return 2; } - *out = 0xFFFD; + *out = FZ_REPLACEMENT_CHARACTER; return 1; } static size_t -skip_language_code_utf16be(unsigned char *s, size_t n, size_t i) +skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i) { /* skip language escape codes */ if (i + 6 <= n && s[i+0] == 0 && s[i+1] == 27 && s[i+4] == 0 && s[i+5] == 27) @@ -67,7 +72,7 @@ } 
static size_t -skip_language_code_utf8(unsigned char *s, size_t n, size_t i) +skip_language_code_utf8(const unsigned char *s, size_t n, size_t i) { /* skip language escape codes */ if (i + 3 <= n && s[i] == 27 && s[i+3]) @@ -77,9 +82,11 @@ return 0; } +/* Convert Unicode/PdfDocEncoding string into utf-8 */ char * -pdf_to_utf8_imp(fz_context *ctx, unsigned char *srcptr, size_t srclen) +pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen) { + const unsigned char *srcptr = (const unsigned char*)ssrcptr; char *dstptr, *dst; size_t dstlen = 0; int ucs; @@ -165,30 +172,29 @@ return dst; } -/* Convert Unicode/PdfDocEncoding string into utf-8 */ +/* Convert text string object to UTF-8 */ char * -pdf_to_utf8(fz_context *ctx, pdf_obj *src) +pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src) { - unsigned char *srcptr; + const char *srcptr; size_t srclen; - srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); - srclen = pdf_to_str_len(ctx, src); - return pdf_to_utf8_imp(ctx, srcptr, srclen); + srcptr = pdf_to_string(ctx, src, &srclen); + return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen); } /* Load text stream and convert to UTF-8 */ char * -pdf_load_stream_as_utf8(fz_context *ctx, pdf_obj *src) +pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src) { fz_buffer *stmbuf; - unsigned char *srcptr; + char *srcptr; size_t srclen; - char *dst; + char *dst = NULL; stmbuf = pdf_load_stream(ctx, src); - srclen = fz_buffer_storage(ctx, stmbuf, &srcptr); + srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr); fz_try(ctx) - dst = pdf_to_utf8_imp(ctx, srcptr, srclen); + dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen); fz_always(ctx) fz_drop_buffer(ctx, stmbuf); fz_catch(ctx) @@ -201,117 +207,48 @@ pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src) { if (pdf_is_stream(ctx, src)) - return pdf_load_stream_as_utf8(ctx, src); - return pdf_to_utf8(ctx, src); -} - -/* Convert Unicode/PdfDocEncoding 
string into ucs-2 */ -unsigned short * -pdf_to_ucs2(fz_context *ctx, pdf_obj *src) -{ - unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); - unsigned short *dstptr, *dst; - int srclen = pdf_to_str_len(ctx, src); - int i; - - if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) - { - dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short)); - for (i = 2; i + 1 < srclen; i += 2) - *dstptr++ = srcptr[i] << 8 | srcptr[i+1]; - } - else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) - { - dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short)); - for (i = 2; i + 1 < srclen; i += 2) - *dstptr++ = srcptr[i] | srcptr[i+1] << 8; - } - else - { - dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short)); - for (i = 0; i < srclen; i++) - *dstptr++ = pdf_doc_encoding[srcptr[i]]; - } - - *dstptr = '\0'; - return dst; -} - -/* allow to convert to UCS-2 without the need for an fz_context */ -/* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */ -void -pdf_to_ucs2_buf(fz_context *ctx, unsigned short *buffer, pdf_obj *src) -{ - unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src); - unsigned short *dstptr = buffer; - int srclen = pdf_to_str_len(ctx, src); - int i; - - if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255) - { - for (i = 2; i + 1 < srclen; i += 2) - *dstptr++ = srcptr[i] << 8 | srcptr[i+1]; - } - else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254) - { - for (i = 2; i + 1 < srclen; i += 2) - *dstptr++ = srcptr[i] | srcptr[i+1] << 8; - } - else - { - for (i = 0; i < srclen; i++) - *dstptr++ = pdf_doc_encoding[srcptr[i]]; - } - - *dstptr = '\0'; + return pdf_new_utf8_from_pdf_stream_obj(ctx, src); + return pdf_new_utf8_from_pdf_string_obj(ctx, src); } -/* Convert UCS-2 string into PdfDocEncoding for authentication */ -char * -pdf_from_ucs2(fz_context *ctx, unsigned short *src) +static pdf_obj * +pdf_new_text_string_utf16be(fz_context *ctx, const char *s) { 
- int i, j, len; - char *docstr; - - len = 0; - while (src[len]) - len++; - - docstr = fz_malloc(ctx, len + 1); - - for (i = 0; i < len; i++) - { - /* shortcut: check if the character has the same code point in both encodings */ - if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) { - docstr[i] = src[i]; - continue; - } - - /* search through pdf_docencoding for the character's code point */ - for (j = 0; j < 256; j++) - if (pdf_doc_encoding[j] == src[i]) - break; - docstr[i] = j; - - /* fail, if a character can't be encoded */ - if (!docstr[i]) - { - fz_free(ctx, docstr); - return NULL; - } + int c, i = 0, n = fz_utflen(s); + unsigned char *p = fz_malloc(ctx, n * 2 + 2); + pdf_obj *obj; + p[i++] = 254; + p[i++] = 255; + while (*s) + { + s += fz_chartorune(&c, s); + p[i++] = (c>>8) & 0xff; + p[i++] = (c) & 0xff; } - docstr[len] = '\0'; - - return docstr; + fz_try(ctx) + obj = pdf_new_string(ctx, (char*)p, i); + fz_always(ctx) + fz_free(ctx, p); + fz_catch(ctx) + fz_rethrow(ctx); + return obj; } +/* + * Create a PDF 'text string' by encoding input string as either ASCII or UTF-16BE. + * In theory, we could also use PDFDocEncoding. 
+ */ pdf_obj * -pdf_to_utf8_name(fz_context *ctx, pdf_document *doc, pdf_obj *src) +pdf_new_text_string(fz_context *ctx, const char *s) { - char *buf = pdf_to_utf8(ctx, src); - pdf_obj *dst = pdf_new_name(ctx, doc, buf); - fz_free(ctx, buf); - return dst; + int i = 0; + while (s[i] != 0) + { + if (((unsigned char)s[i]) >= 128) + return pdf_new_text_string_utf16be(ctx, s); + ++i; + } + return pdf_new_string(ctx, s, i); } pdf_obj * @@ -319,7 +256,7 @@ { pdf_obj *ary = NULL; pdf_obj *obj = NULL; - fz_off_t a = 0, b = 0, n = 0; + int64_t a = 0, b = 0, n = 0; pdf_token tok; pdf_obj *op = NULL; @@ -336,34 +273,24 @@ if (tok != PDF_TOK_INT && tok != PDF_TOK_R) { if (n > 0) - { - obj = pdf_new_int_offset(ctx, doc, a); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; - } + pdf_array_push_int(ctx, ary, a); if (n > 1) - { - obj = pdf_new_int_offset(ctx, doc, b); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; - } + pdf_array_push_int(ctx, ary, b); n = 0; } if (tok == PDF_TOK_INT && n == 2) { - obj = pdf_new_int_offset(ctx, doc, a); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_int(ctx, ary, a); a = b; n --; } switch (tok) { + case PDF_TOK_EOF: + fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file"); + case PDF_TOK_CLOSE_ARRAY: op = ary; goto end; @@ -378,67 +305,43 @@ case PDF_TOK_R: if (n != 2) - fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array"); - obj = pdf_new_indirect(ctx, doc, a, b); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array"); + pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b)); n = 0; break; case PDF_TOK_OPEN_ARRAY: obj = pdf_parse_array(ctx, doc, file, buf); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_drop(ctx, ary, obj); break; case PDF_TOK_OPEN_DICT: obj = 
pdf_parse_dict(ctx, doc, file, buf); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_drop(ctx, ary, obj); break; case PDF_TOK_NAME: - obj = pdf_new_name(ctx, doc, buf->scratch); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_name(ctx, ary, buf->scratch); break; case PDF_TOK_REAL: - obj = pdf_new_real(ctx, doc, buf->f); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_real(ctx, ary, buf->f); break; case PDF_TOK_STRING: - obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_string(ctx, ary, buf->scratch, buf->len); break; case PDF_TOK_TRUE: - obj = pdf_new_bool(ctx, doc, 1); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_bool(ctx, ary, 1); break; case PDF_TOK_FALSE: - obj = pdf_new_bool(ctx, doc, 0); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push_bool(ctx, ary, 0); break; case PDF_TOK_NULL: - obj = pdf_new_null(ctx, doc); - pdf_array_push(ctx, ary, obj); - pdf_drop_obj(ctx, obj); - obj = NULL; + pdf_array_push(ctx, ary, PDF_NULL); break; default: - fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array"); + pdf_array_push(ctx, ary, PDF_NULL); + break; } } end: @@ -446,7 +349,6 @@ } fz_catch(ctx) { - pdf_drop_obj(ctx, obj); pdf_drop_obj(ctx, ary); fz_rethrow(ctx); } @@ -460,7 +362,7 @@ pdf_obj *key = NULL; pdf_obj *val = NULL; pdf_token tok; - fz_off_t a, b; + int64_t a, b; dict = pdf_new_dict(ctx, doc, 8); @@ -481,9 +383,9 @@ break; if (tok != PDF_TOK_NAME) - fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict"); - key = pdf_new_name(ctx, doc, buf->scratch); + key = pdf_new_name(ctx, buf->scratch); tok = pdf_lex(ctx, file, buf); @@ -497,12 +399,12 @@ val = pdf_parse_dict(ctx, doc, file, buf); break; - 
case PDF_TOK_NAME: val = pdf_new_name(ctx, doc, buf->scratch); break; - case PDF_TOK_REAL: val = pdf_new_real(ctx, doc, buf->f); break; - case PDF_TOK_STRING: val = pdf_new_string(ctx, doc, buf->scratch, buf->len); break; - case PDF_TOK_TRUE: val = pdf_new_bool(ctx, doc, 1); break; - case PDF_TOK_FALSE: val = pdf_new_bool(ctx, doc, 0); break; - case PDF_TOK_NULL: val = pdf_new_null(ctx, doc); break; + case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break; + case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break; + case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break; + case PDF_TOK_TRUE: val = PDF_TRUE; break; + case PDF_TOK_FALSE: val = PDF_FALSE; break; + case PDF_TOK_NULL: val = PDF_NULL; break; case PDF_TOK_INT: /* 64-bit to allow for numbers > INT_MAX and overflow */ @@ -511,7 +413,7 @@ if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME || (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))) { - val = pdf_new_int_offset(ctx, doc, a); + val = pdf_new_int(ctx, a); pdf_dict_put(ctx, dict, key, val); pdf_drop_obj(ctx, val); val = NULL; @@ -529,10 +431,13 @@ break; } } - fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict"); + fz_warn(ctx, "invalid indirect reference in dict"); + val = PDF_NULL; + break; default: - fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict"); + val = PDF_NULL; + break; } pdf_dict_put(ctx, dict, key, val); @@ -565,27 +470,28 @@ return pdf_parse_array(ctx, doc, file, buf); case PDF_TOK_OPEN_DICT: return pdf_parse_dict(ctx, doc, file, buf); - case PDF_TOK_NAME: return pdf_new_name(ctx, doc, buf->scratch); break; - case PDF_TOK_REAL: return pdf_new_real(ctx, doc, buf->f); break; - case PDF_TOK_STRING: return pdf_new_string(ctx, doc, buf->scratch, buf->len); break; - case PDF_TOK_TRUE: return pdf_new_bool(ctx, doc, 1); break; - case PDF_TOK_FALSE: return pdf_new_bool(ctx, doc, 0); break; - case PDF_TOK_NULL: return pdf_new_null(ctx, doc); break; - case PDF_TOK_INT: return 
pdf_new_int_offset(ctx, doc, buf->i); break; - default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream"); + case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch); + case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); + case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); + case PDF_TOK_TRUE: return PDF_TRUE; + case PDF_TOK_FALSE: return PDF_FALSE; + case PDF_TOK_NULL: return PDF_NULL; + case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); + default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream"); } } pdf_obj * pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf, - int *onum, int *ogen, fz_off_t *ostmofs, int *try_repair) + int *onum, int *ogen, int64_t *ostmofs, int *try_repair) { pdf_obj *obj = NULL; int num = 0, gen = 0; - fz_off_t stm_ofs; + int64_t stm_ofs; pdf_token tok; - fz_off_t a, b; + int64_t a, b; + int read_next_token = 1; fz_var(obj); @@ -594,16 +500,18 @@ { if (try_repair) *try_repair = 1; - fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number"); + fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number"); } num = buf->i; + if (num < 0 || num > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range"); tok = pdf_lex(ctx, file, buf); if (tok != PDF_TOK_INT) { if (try_repair) *try_repair = 1; - fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num); + fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? 
obj)", num); } gen = buf->i; @@ -612,7 +520,7 @@ { if (try_repair) *try_repair = 1; - fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen); + fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen); } tok = pdf_lex(ctx, file, buf); @@ -627,12 +535,12 @@ obj = pdf_parse_dict(ctx, doc, file, buf); break; - case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break; - case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break; - case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break; - case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break; - case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break; - case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break; + case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break; + case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break; + case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break; + case PDF_TOK_TRUE: obj = PDF_TRUE; break; + case PDF_TOK_FALSE: obj = PDF_FALSE; break; + case PDF_TOK_NULL: obj = PDF_NULL; break; case PDF_TOK_INT: a = buf->i; @@ -640,10 +548,11 @@ if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ) { - obj = pdf_new_int_offset(ctx, doc, a); - goto skip; + obj = pdf_new_int(ctx, a); + read_next_token = 0; + break; } - if (tok == PDF_TOK_INT) + else if (tok == PDF_TOK_INT) { b = buf->i; tok = pdf_lex(ctx, file, buf); @@ -653,58 +562,61 @@ break; } } - fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen); + fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen); case PDF_TOK_ENDOBJ: - obj = pdf_new_null(ctx, doc); - goto skip; + obj = PDF_NULL; + read_next_token = 0; + break; default: - fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen); + fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen); } fz_try(ctx) { - tok = pdf_lex(ctx, file, buf); - } - fz_catch(ctx) - { - pdf_drop_obj(ctx, obj); - 
fz_rethrow(ctx); - } + if (read_next_token) + tok = pdf_lex(ctx, file, buf); -skip: - if (tok == PDF_TOK_STREAM) - { - int c = fz_read_byte(ctx, file); - while (c == ' ') - c = fz_read_byte(ctx, file); - if (c == '\r') + if (tok == PDF_TOK_STREAM) { - c = fz_peek_byte(ctx, file); -/* willus.com -- no warning */ + int c = fz_read_byte(ctx, file); + while (c == ' ') + c = fz_read_byte(ctx, file); + if (c == '\r') + { + c = fz_peek_byte(ctx, file); +/* willus mod -- no warning */ /* - if (c != '\n') - fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); - else + if (c != '\n') + fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen); + else */ if (c=='\n') - fz_read_byte(ctx, file); +/* willus mod -- end */ + fz_read_byte(ctx, file); + } + stm_ofs = fz_tell(ctx, file); + } + else if (tok == PDF_TOK_ENDOBJ) + { + stm_ofs = 0; + } + else + { + fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); + stm_ofs = 0; } - stm_ofs = fz_tell(ctx, file); - } - else if (tok == PDF_TOK_ENDOBJ) - { - stm_ofs = 0; } - else + fz_catch(ctx) { - fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen); - stm_ofs = 0; + pdf_drop_obj(ctx, obj); + fz_rethrow(ctx); } if (onum) *onum = num; if (ogen) *ogen = gen; if (ostmofs) *ostmofs = stm_ofs; + return obj; } diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-signature.c k2pdfopt-2.51+ds/mupdf_mod/pdf-signature.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-signature.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-signature.c 2018-11-21 02:43:31.000000000 +0000 @@ -0,0 +1,118 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" +/* willus mod -- remove ../fitz/ */ +#include "fitz-imp.h" + +#include + + +void pdf_write_digest(fz_context *ctx, fz_output *out, pdf_obj *byte_range, int hexdigest_offset, int hexdigest_length, pdf_pkcs7_signer *signer) +{ + fz_stream *stm = NULL; + fz_stream *in = NULL; + fz_range *brange = NULL; + int brange_len = 
pdf_array_len(ctx, byte_range)/2; + unsigned char *digest = NULL; + int digest_len; + + fz_var(stm); + fz_var(in); + fz_var(brange); + + if (hexdigest_length < 4) + fz_throw(ctx, FZ_ERROR_GENERIC, "Bad parameters to pdf_write_digest"); + + fz_try(ctx) + { + int i, res; + + brange = fz_calloc(ctx, brange_len, sizeof(*brange)); + for (i = 0; i < brange_len; i++) + { + brange[i].offset = pdf_array_get_int(ctx, byte_range, 2*i); + brange[i].length = pdf_array_get_int(ctx, byte_range, 2*i+1); + } + + stm = fz_stream_from_output(ctx, out); + in = fz_open_range_filter(ctx, stm, brange, brange_len); + + digest_len = (hexdigest_length - 2) / 2; + digest = fz_malloc(ctx, digest_len); + res = signer->create_digest(signer, in, digest, &digest_len); + if (!res) + fz_throw(ctx, FZ_ERROR_GENERIC, "pdf_pkcs7_create_digest failed"); + + fz_drop_stream(ctx, in); + in = NULL; + fz_drop_stream(ctx, stm); + stm = NULL; + + fz_seek_output(ctx, out, hexdigest_offset+1, SEEK_SET); + + for (i = 0; i < digest_len; i++) + fz_write_printf(ctx, out, "%02x", digest[i]); + } + fz_always(ctx) + { + fz_free(ctx, digest); + fz_free(ctx, brange); + fz_drop_stream(ctx, stm); + fz_drop_stream(ctx, in); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} + +void pdf_sign_signature(fz_context *ctx, pdf_document *doc, pdf_widget *widget, pdf_pkcs7_signer *signer) +{ + pdf_pkcs7_designated_name *dn = NULL; + fz_buffer *fzbuf = NULL; + + fz_try(ctx) + { + const char *dn_str; + pdf_obj *wobj = ((pdf_annot *)widget)->obj; + fz_rect rect; + + rect = pdf_dict_get_rect(ctx, wobj, PDF_NAME(Rect)); + + /* Create an appearance stream only if the signature is intended to be visible */ + if (!fz_is_empty_rect(rect)) + { + dn = signer->designated_name(signer); + fzbuf = fz_new_buffer(ctx, 256); + if (!dn->cn) + fz_throw(ctx, FZ_ERROR_GENERIC, "Certificate has no common name"); + + fz_append_printf(ctx, fzbuf, "cn=%s", dn->cn); + + if (dn->o) + fz_append_printf(ctx, fzbuf, ", o=%s", dn->o); + + if (dn->ou) + 
fz_append_printf(ctx, fzbuf, ", ou=%s", dn->ou); + + if (dn->email) + fz_append_printf(ctx, fzbuf, ", email=%s", dn->email); + + if (dn->c) + fz_append_printf(ctx, fzbuf, ", c=%s", dn->c); + + dn_str = fz_string_from_buffer(ctx, fzbuf); + pdf_update_signature_appearance(ctx, (pdf_annot *)widget, dn->cn, dn_str, NULL); + } + + pdf_signature_set_value(ctx, doc, wobj, signer); + } + fz_always(ctx) + { + signer->drop_designated_name(signer, dn); + fz_drop_buffer(ctx, fzbuf); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } +} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-type3.c k2pdfopt-2.51+ds/mupdf_mod/pdf-type3.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-type3.c 2017-02-25 05:42:33.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-type3.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,207 +0,0 @@ -#include "mupdf/pdf.h" - -/* willus mod */ -#include "font-imp.h" - -static void -pdf_run_glyph_func(fz_context *ctx, void *doc, void *rdb, fz_buffer *contents, fz_device *dev, const fz_matrix *ctm, void *gstate, int nested_depth) -{ - pdf_run_glyph(ctx, doc, (pdf_obj *)rdb, contents, dev, ctm, gstate, nested_depth); -} - -static void -pdf_t3_free_resources(fz_context *ctx, void *doc, void *rdb_) -{ - pdf_obj *rdb = (pdf_obj *)rdb_; - pdf_drop_obj(ctx, rdb); -} - -pdf_font_desc * -pdf_load_type3_font(fz_context *ctx, pdf_document *doc, pdf_obj *rdb, pdf_obj *dict) -{ - char buf[256]; - char *estrings[256]; - pdf_font_desc *fontdesc = NULL; - pdf_obj *encoding; - pdf_obj *widths; - pdf_obj *charprocs; - pdf_obj *obj; - int first, last; - int i, k, n; - fz_rect bbox; - fz_matrix matrix; - fz_font *font; - - fz_var(fontdesc); - - /* Make a new type3 font entry in the document */ - if (doc->num_type3_fonts == doc->max_type3_fonts) - { - int new_max = doc->max_type3_fonts * 2; - - if (new_max == 0) - new_max = 4; - doc->type3_fonts = fz_resize_array(ctx, doc->type3_fonts, new_max, sizeof(*doc->type3_fonts)); - doc->max_type3_fonts = new_max; - } - - fz_try(ctx) - { - obj = pdf_dict_get(ctx, 
dict, PDF_NAME_Name); - if (pdf_is_name(ctx, obj)) - fz_strlcpy(buf, pdf_to_name(ctx, obj), sizeof buf); - else - fz_strlcpy(buf, "Unnamed-T3", sizeof buf); - - fontdesc = pdf_new_font_desc(ctx); - - obj = pdf_dict_get(ctx, dict, PDF_NAME_FontMatrix); - pdf_to_matrix(ctx, obj, &matrix); - - obj = pdf_dict_get(ctx, dict, PDF_NAME_FontBBox); - fz_transform_rect(pdf_to_rect(ctx, obj, &bbox), &matrix); - - font = fz_new_type3_font(ctx, buf, &matrix); - fontdesc->font = font; - fontdesc->size += sizeof(fz_font) + 256 * (sizeof(fz_buffer*) + sizeof(float)); - - fz_set_font_bbox(ctx, font, bbox.x0, bbox.y0, bbox.x1, bbox.y1); - - /* Encoding */ - - for (i = 0; i < 256; i++) - estrings[i] = NULL; - - encoding = pdf_dict_get(ctx, dict, PDF_NAME_Encoding); - if (!encoding) - { - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: Type3 font missing Encoding"); - } - - if (pdf_is_name(ctx, encoding)) - pdf_load_encoding(estrings, pdf_to_name(ctx, encoding)); - - if (pdf_is_dict(ctx, encoding)) - { - pdf_obj *base, *diff, *item; - - base = pdf_dict_get(ctx, encoding, PDF_NAME_BaseEncoding); - if (pdf_is_name(ctx, base)) - pdf_load_encoding(estrings, pdf_to_name(ctx, base)); - - diff = pdf_dict_get(ctx, encoding, PDF_NAME_Differences); - if (pdf_is_array(ctx, diff)) - { - n = pdf_array_len(ctx, diff); - k = 0; - for (i = 0; i < n; i++) - { - item = pdf_array_get(ctx, diff, i); - if (pdf_is_int(ctx, item)) - k = pdf_to_int(ctx, item); - if (pdf_is_name(ctx, item) && k >= 0 && k < nelem(estrings)) - estrings[k++] = pdf_to_name(ctx, item); - } - } - } - - fontdesc->encoding = pdf_new_identity_cmap(ctx, 0, 1); - fontdesc->size += pdf_cmap_size(ctx, fontdesc->encoding); - - pdf_load_to_unicode(ctx, doc, fontdesc, estrings, NULL, pdf_dict_get(ctx, dict, PDF_NAME_ToUnicode)); - - /* Widths */ - - pdf_set_default_hmtx(ctx, fontdesc, 0); - - first = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_FirstChar)); - last = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_LastChar)); - - if (first 
< 0 || last > 255 || first > last) - first = last = 0; - - widths = pdf_dict_get(ctx, dict, PDF_NAME_Widths); - if (!widths) - { - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: Type3 font missing Widths"); - } - - for (i = first; i <= last; i++) - { - float w = pdf_to_real(ctx, pdf_array_get(ctx, widths, i - first)); - w = font->t3matrix.a * w * 1000; - font->t3widths[i] = w * 0.001f; - pdf_add_hmtx(ctx, fontdesc, i, i, w); - } - - pdf_end_hmtx(ctx, fontdesc); - - /* Resources -- inherit page resources if the font doesn't have its own */ - - font->t3freeres = pdf_t3_free_resources; - font->t3resources = pdf_dict_get(ctx, dict, PDF_NAME_Resources); - if (!font->t3resources) - font->t3resources = rdb; - if (font->t3resources) - pdf_keep_obj(ctx, font->t3resources); - if (!font->t3resources) - fz_warn(ctx, "no resource dictionary for type 3 font!"); - - font->t3doc = doc; - font->t3run = pdf_run_glyph_func; - - /* CharProcs */ - - charprocs = pdf_dict_get(ctx, dict, PDF_NAME_CharProcs); - if (!charprocs) - { - fz_throw(ctx, FZ_ERROR_GENERIC, "syntaxerror: Type3 font missing CharProcs"); - } - - for (i = 0; i < 256; i++) - { - if (estrings[i]) - { - obj = pdf_dict_gets(ctx, charprocs, estrings[i]); - if (pdf_is_stream(ctx, obj)) - { - font->t3procs[i] = pdf_load_stream(ctx, obj); - fz_trim_buffer(ctx, font->t3procs[i]); - fontdesc->size += fz_buffer_storage(ctx, font->t3procs[i], NULL); - fontdesc->size += 0; // TODO: display list size calculation - } - } - } - } - fz_catch(ctx) - { - pdf_drop_font(ctx, fontdesc); - fz_rethrow(ctx); - } - - doc->type3_fonts[doc->num_type3_fonts++] = fz_keep_font(ctx, font); - - return fontdesc; -} - -void pdf_load_type3_glyphs(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, int nested_depth) -{ - int i; - - fz_try(ctx) - { - for (i = 0; i < 256; i++) - { - if (fontdesc->font->t3procs[i]) - { - fz_prepare_t3_glyph(ctx, fontdesc->font, i, nested_depth); - fontdesc->size += 0; // TODO: display list size calculation - } - } - 
} - fz_catch(ctx) - { - fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); - fz_warn(ctx, "Type3 glyph load failed: %s", fz_caught_message(ctx)); - } -} diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pdf-xref.c k2pdfopt-2.51+ds/mupdf_mod/pdf-xref.c --- k2pdfopt-2.42+ds/mupdf_mod/pdf-xref.c 2017-02-25 05:45:26.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pdf-xref.c 2018-11-21 00:33:17.000000000 +0000 @@ -1,5 +1,10 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" #include "pdf-imp.h" -#include "mupdf/fitz/document.h" + +#include +#include +#include #undef DEBUG_PROGESSIVE_ADVANCE @@ -9,6 +14,8 @@ #define DEBUGMESS(A) do { } while (0) #endif +#define isdigit(c) (c >= '0' && c <= '9') + static inline int iswhite(int ch) { return @@ -20,14 +27,14 @@ * xref tables */ -static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc) +static void pdf_drop_xref_sections_imp(fz_context *ctx, pdf_document *doc, pdf_xref *xref_sections, int num_xref_sections) { pdf_unsaved_sig *usig; int x, e; - for (x = 0; x < doc->num_xref_sections; x++) + for (x = 0; x < num_xref_sections; x++) { - pdf_xref *xref = &doc->xref_sections[x]; + pdf_xref *xref = &xref_sections[x]; pdf_xref_subsec *sub = xref->subsec; while (sub != NULL) @@ -36,7 +43,6 @@ for (e = 0; e < sub->len; e++) { pdf_xref_entry *entry = &sub->table[e]; - if (entry->obj) { pdf_drop_obj(ctx, entry->obj); @@ -55,12 +61,21 @@ { xref->unsaved_sigs = usig->next; pdf_drop_obj(ctx, usig->field); - pdf_drop_signer(ctx, usig->signer); + usig->signer->drop(usig->signer); fz_free(ctx, usig); } } - fz_free(ctx, doc->xref_sections); + fz_free(ctx, xref_sections); +} + +static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc) +{ + pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections); + pdf_drop_xref_sections_imp(ctx, doc, doc->xref_sections, doc->num_xref_sections); + + doc->saved_xref_sections = NULL; + doc->saved_num_xref_sections = 0; doc->xref_sections = NULL; doc->num_xref_sections = 0; 
doc->num_incremental_sections = 0; @@ -130,7 +145,7 @@ /* Return the document's final trailer */ pdf_xref *xref = &doc->xref_sections[0]; - return xref->trailer; + return xref ? xref->trailer : NULL; } void pdf_set_populating_xref_trailer(fz_context *ctx, pdf_document *doc, pdf_obj *trailer) @@ -215,8 +230,8 @@ } /* Prevent accidental heap underflow */ - if (num < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "object number must not be negative (%d)", num); + if (num < 0 || num > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_GENERIC, "object number out of range (%d)", num); /* Return the pointer to the entry in the last section. */ xref = &doc->xref_sections[doc->num_xref_sections-1]; @@ -395,7 +410,7 @@ return num < xref->num_objects && sub->table[num].type; } -void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_signer *signer) +void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_pkcs7_signer *signer) { pdf_xref *xref = &doc->xref_sections[0]; pdf_unsaved_sig *unsaved_sig; @@ -405,7 +420,7 @@ * saving time */ unsaved_sig = fz_malloc_struct(ctx, pdf_unsaved_sig); unsaved_sig->field = pdf_keep_obj(ctx, field); - unsaved_sig->signer = pdf_keep_signer(ctx, signer); + unsaved_sig->signer = signer->keep(signer); unsaved_sig->next = NULL; if (xref->unsaved_sigs_end == NULL) xref->unsaved_sigs_end = &xref->unsaved_sigs; @@ -536,6 +551,36 @@ } } +void pdf_forget_xref(fz_context *ctx, pdf_document *doc) +{ + pdf_obj *trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc)); + + if (doc->saved_xref_sections) + pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections); + + doc->saved_xref_sections = doc->xref_sections; + doc->saved_num_xref_sections = doc->num_xref_sections; + + doc->startxref = 0; + doc->num_xref_sections = 0; + doc->num_incremental_sections = 0; + doc->xref_base = 0; + doc->disallow_new_increments = 0; + + fz_try(ctx) + { + 
pdf_get_populating_xref_entry(ctx, doc, 0); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, trailer); + fz_rethrow(ctx); + } + + /* Set the trailer of the final xref section. */ + doc->xref_sections[0].trailer = trailer; +} + /* * magic version tag and startxref */ @@ -547,10 +592,13 @@ fz_seek(ctx, doc->file, 0, SEEK_SET); fz_read_line(ctx, doc->file, buf, sizeof buf); - if (memcmp(buf, "%PDF-", 5) != 0) + if (strlen(buf) < 5 || memcmp(buf, "%PDF-", 5) != 0) fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize version marker"); - doc->version = 10 * (fz_atof(buf+5) + 0.05); + doc->version = 10 * (fz_atof(buf+5) + 0.05f); + if (doc->version < 10 || doc->version > 17) + if (doc->version != 20) + fz_warn(ctx, "unknown PDF version: %d.%d", doc->version / 10, doc->version % 10); } static void @@ -558,13 +606,13 @@ { unsigned char buf[1024]; size_t i, n; - fz_off_t t; + int64_t t; fz_seek(ctx, doc->file, 0, SEEK_END); doc->file_size = fz_tell(ctx, doc->file); - t = fz_maxo(0, doc->file_size - (fz_off_t)sizeof buf); + t = fz_maxi64(0, doc->file_size - (int64_t)sizeof buf); fz_seek(ctx, doc->file, t, SEEK_SET); n = fz_read(ctx, doc->file, buf, sizeof buf); @@ -580,9 +628,9 @@ while (i < n && iswhite(buf[i])) i ++; doc->startxref = 0; - while (i < n && buf[i] >= '0' && buf[i] <= '9') + while (i < n && isdigit(buf[i])) { - if (doc->startxref >= FZ_OFF_MAX/10) + if (doc->startxref >= INT64_MAX/10) fz_throw(ctx, FZ_ERROR_GENERIC, "startxref too large"); doc->startxref = doc->startxref * 10 + (buf[i++] - '0'); } @@ -601,7 +649,7 @@ do { int c = fz_peek_byte(ctx, stm); - if (c > 32 && c != EOF) + if (c == EOF || c > 32) return; (void)fz_read_byte(ctx, stm); } @@ -627,13 +675,13 @@ static int pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf) { - fz_off_t len; + int len; char *s; - fz_off_t t; + int64_t t; pdf_token tok; int c; - int size; - fz_off_t ofs; + int size = 0; + int64_t ofs; pdf_obj *trailer = NULL; size_t n; @@ -650,19 +698,19 @@ while (1) { 
c = fz_peek_byte(ctx, doc->file); - if (!(c >= '0' && c <= '9')) + if (!isdigit(c)) break; fz_read_line(ctx, doc->file, buf->scratch, buf->size); s = buf->scratch; - fz_strsep(&s, " "); /* ignore ofs */ + fz_strsep(&s, " "); /* ignore start */ if (!s) - fz_throw(ctx, FZ_ERROR_GENERIC, "invalid range marker in xref"); - len = fz_atoo(fz_strsep(&s, " ")); -/* willus.com -- no warning */ + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing"); + len = fz_atoi(fz_strsep(&s, " ")); +/* willus mod -- no warning */ /* if (len < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "xref range marker must be positive"); + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive"); */ /* broken pdfs where the section is not on a separate line */ @@ -686,10 +734,10 @@ else n = 20; - if (len > (fz_off_t)((FZ_OFF_MAX - t) / n)) + if (len > (int64_t)((INT64_MAX - t) / n)) fz_throw(ctx, FZ_ERROR_GENERIC, "xref has too many entries"); - fz_seek(ctx, doc->file, (fz_off_t)(t + n * len), SEEK_SET); + fz_seek(ctx, doc->file, t + n * len, SEEK_SET); } fz_try(ctx) @@ -704,9 +752,9 @@ trailer = pdf_parse_dict(ctx, doc, doc->file, buf); - size = pdf_to_int(ctx, pdf_dict_get(ctx, trailer, PDF_NAME_Size)); - if (!size) - fz_throw(ctx, FZ_ERROR_GENERIC, "trailer missing Size entry"); + size = pdf_dict_get_int(ctx, trailer, PDF_NAME(Size)); + if (size < 0 || size > PDF_MAX_OBJECT_NUMBER + 1) + fz_throw(ctx, FZ_ERROR_GENERIC, "trailer Size entry out of range"); } fz_always(ctx) { @@ -723,11 +771,11 @@ } static pdf_xref_entry * -pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, fz_off_t ofs, int len) +pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, int start, int len) { pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections-1]; pdf_xref_subsec *sub; - int new_max; + int num_objects; /* Different cases here. 
Case 1) We might be asking for a * subsection (or a subset of a subsection) that we already @@ -739,15 +787,15 @@ /* Sanity check */ for (sub = xref->subsec; sub != NULL; sub = sub->next) { - if (ofs >= sub->start && ofs + len <= sub->start + sub->len) - return &sub->table[ofs-sub->start]; /* Case 1 */ - if (ofs + len > sub->start && ofs <= sub->start + sub->len) + if (start >= sub->start && start + len <= sub->start + sub->len) + return &sub->table[start-sub->start]; /* Case 1 */ + if (start + len > sub->start && start <= sub->start + sub->len) break; /* Case 3 */ } - new_max = xref->num_objects; - if (new_max < ofs + len) - new_max = ofs + len; + num_objects = xref->num_objects; + if (num_objects < start + len) + num_objects = start + len; if (sub == NULL) { @@ -756,7 +804,7 @@ fz_try(ctx) { sub->table = fz_calloc(ctx, len, sizeof(pdf_xref_entry)); - sub->start = ofs; + sub->start = start; sub->len = len; sub->next = xref->subsec; xref->subsec = sub; @@ -766,35 +814,31 @@ fz_free(ctx, sub); fz_rethrow(ctx); } - xref->num_objects = new_max; - if (doc->max_xref_len < new_max) - extend_xref_index(ctx, doc, new_max); + xref->num_objects = num_objects; + if (doc->max_xref_len < num_objects) + extend_xref_index(ctx, doc, num_objects); } else { /* Case 3 */ - ensure_solid_xref(ctx, doc, new_max, doc->num_xref_sections-1); + ensure_solid_xref(ctx, doc, num_objects, doc->num_xref_sections-1); xref = &doc->xref_sections[doc->num_xref_sections-1]; sub = xref->subsec; } - return &sub->table[ofs-sub->start]; + return &sub->table[start-sub->start]; } static pdf_obj * pdf_read_old_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf) { + int start, len, c, i, xref_len, carried; fz_stream *file = doc->file; - - fz_off_t ofs; - int len; - char *s; - size_t n; - pdf_token tok; - fz_off_t i; - int c; - int xref_len = pdf_xref_size_from_old_trailer(ctx, doc, buf); pdf_xref_entry *table; - int carried; + pdf_token tok; + size_t n; + char *s, *e; + + xref_len = 
pdf_xref_size_from_old_trailer(ctx, doc, buf); fz_skip_space(ctx, doc->file); if (fz_skip_string(ctx, doc->file, "xref")) @@ -804,61 +848,80 @@ while (1) { c = fz_peek_byte(ctx, file); - if (!(c >= '0' && c <= '9')) + if (!isdigit(c)) break; fz_read_line(ctx, file, buf->scratch, buf->size); s = buf->scratch; - ofs = fz_atoo(fz_strsep(&s, " ")); + start = fz_atoi(fz_strsep(&s, " ")); len = fz_atoi(fz_strsep(&s, " ")); /* broken pdfs where the section is not on a separate line */ if (s && *s != '\0') { - fz_warn(ctx, "broken xref section. proceeding anyway."); + fz_warn(ctx, "broken xref subsection. proceeding anyway."); fz_seek(ctx, file, -(2 + (int)strlen(s)), SEEK_CUR); } - if (ofs < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "out of range object num in xref: %d", (int)ofs); - + if (start < 0 || start > PDF_MAX_OBJECT_NUMBER + || len < 0 || len > PDF_MAX_OBJECT_NUMBER + || start + len - 1 > PDF_MAX_OBJECT_NUMBER) + { + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range"); + } /* broken pdfs where size in trailer undershoots entries in xref sections */ - if (ofs + len > xref_len) + if (start + len > xref_len) { - fz_warn(ctx, "broken xref section, proceeding anyway."); + fz_warn(ctx, "broken xref subsection, proceeding anyway."); } - table = pdf_xref_find_subsection(ctx, doc, ofs, len); + table = pdf_xref_find_subsection(ctx, doc, start, len); /* Xref entries SHOULD be 20 bytes long, but we see 19 byte * ones more frequently than we'd like (e.g. PCLm drivers). * Cope with this by 'carrying' data forward. 
*/ carried = 0; - for (i = ofs; i < ofs + len; i++) + for (i = 0; i < len; i++) { - pdf_xref_entry *entry = &table[i-ofs]; + pdf_xref_entry *entry = &table[i]; n = fz_read(ctx, file, (unsigned char *) buf->scratch + carried, 20-carried); if (n != 20-carried) fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected EOF in xref table"); n += carried; + buf->scratch[n] = '\0'; if (!entry->type) { s = buf->scratch; + e = s + n; + + entry->num = start + i; /* broken pdfs where line start with white space */ - while (*s != '\0' && iswhite(*s)) + while (s < e && iswhite(*s)) + s++; + + if (s == e || !isdigit(*s)) + fz_throw(ctx, FZ_ERROR_GENERIC, "xref offset missing"); + while (s < e && isdigit(*s)) + entry->ofs = entry->ofs * 10 + *s++ - '0'; + + while (s < e && iswhite(*s)) + s++; + if (s == e || !isdigit(*s)) + fz_throw(ctx, FZ_ERROR_GENERIC, "xref generation number missing"); + while (s < e && isdigit(*s)) + entry->gen = entry->gen * 10 + *s++ - '0'; + + while (s < e && iswhite(*s)) s++; + if (s == e || (*s != 'f' && *s != 'n' && *s != 'o')) + fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected xref type: 0x%x (%d %d R)", s == e ? 
0 : *s, entry->num, entry->gen); + entry->type = *s++; - entry->ofs = fz_atoo(s); - entry->gen = fz_atoi(s + 11); - entry->num = (int)i; - entry->type = s[17]; - if (s[17] != 'f' && s[17] != 'n' && s[17] != 'o') - fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected xref type: %#x (%d %d R)", s[17], entry->num, entry->gen); /* If the last byte of our buffer isn't an EOL (or space), carry one byte forward */ - carried = s[19] > 32; + carried = buf->scratch[19] > 32; if (carried) - s[0] = s[19]; + buf->scratch[0] = buf->scratch[19]; } } if (carried) @@ -873,26 +936,26 @@ if (tok != PDF_TOK_OPEN_DICT) fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary"); + doc->has_old_style_xrefs = 1; + return pdf_parse_dict(ctx, doc, file, buf); } static void -pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, fz_off_t i0, int i1, int w0, int w1, int w2) +pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, int i0, int i1, int w0, int w1, int w2) { pdf_xref_entry *table; int i, n; - if (i0 < 0 || i1 < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "negative xref stream entry index"); - //if (i0 + i1 > pdf_xref_len(ctx, doc)) - // fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream has too many entries"); + if (i0 < 0 || i0 > PDF_MAX_OBJECT_NUMBER || i1 < 0 || i1 > PDF_MAX_OBJECT_NUMBER || i0 + i1 - 1 > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range"); table = pdf_xref_find_subsection(ctx, doc, i0, i1); for (i = i0; i < i0 + i1; i++) { pdf_xref_entry *entry = &table[i-i0]; int a = 0; - fz_off_t b = 0; + int64_t b = 0; int c = 0; if (fz_is_eof(ctx, stm)) @@ -926,8 +989,8 @@ pdf_obj *trailer = NULL; pdf_obj *index = NULL; pdf_obj *obj = NULL; - int num, gen; - fz_off_t ofs, stm_ofs; + int gen, num = 0; + int64_t ofs, stm_ofs; int size, w0, w1, w2; int t; @@ -949,18 +1012,18 @@ { pdf_xref_entry *entry; - obj = pdf_dict_get(ctx, trailer, PDF_NAME_Size); + obj = pdf_dict_get(ctx, trailer, 
PDF_NAME(Size)); if (!obj) fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing Size entry (%d 0 R)", num); size = pdf_to_int(ctx, obj); - obj = pdf_dict_get(ctx, trailer, PDF_NAME_W); + obj = pdf_dict_get(ctx, trailer, PDF_NAME(W)); if (!obj) fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing W entry (%d R)", num); - w0 = pdf_to_int(ctx, pdf_array_get(ctx, obj, 0)); - w1 = pdf_to_int(ctx, pdf_array_get(ctx, obj, 1)); - w2 = pdf_to_int(ctx, pdf_array_get(ctx, obj, 2)); + w0 = pdf_array_get_int(ctx, obj, 0); + w1 = pdf_array_get_int(ctx, obj, 1); + w2 = pdf_array_get_int(ctx, obj, 2); if (w0 < 0) fz_warn(ctx, "xref stream objects have corrupt type"); @@ -973,7 +1036,7 @@ w1 = w1 < 0 ? 0 : w1; w2 = w2 < 0 ? 0 : w2; - index = pdf_dict_get(ctx, trailer, PDF_NAME_Index); + index = pdf_dict_get(ctx, trailer, PDF_NAME(Index)); stm = pdf_open_stream_with_offset(ctx, doc, num, trailer, stm_ofs); @@ -986,8 +1049,8 @@ int n = pdf_array_len(ctx, index); for (t = 0; t < n; t += 2) { - int i0 = pdf_to_int(ctx, pdf_array_get(ctx, index, t + 0)); - int i1 = pdf_to_int(ctx, pdf_array_get(ctx, index, t + 1)); + int i0 = pdf_array_get_int(ctx, index, t + 0); + int i1 = pdf_array_get_int(ctx, index, t + 1); pdf_read_new_xref_section(ctx, doc, stm, i0, i1, w0, w1, w2); } } @@ -1014,7 +1077,7 @@ } static pdf_obj * -pdf_read_xref(fz_context *ctx, pdf_document *doc, fz_off_t ofs, pdf_lexbuf *buf) +pdf_read_xref(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf) { pdf_obj *trailer; int c; @@ -1027,7 +1090,7 @@ c = fz_peek_byte(ctx, doc->file); if (c == 'x') trailer = pdf_read_old_xref(ctx, doc, buf); - else if (c >= '0' && c <= '9') + else if (isdigit(c)) trailer = pdf_read_new_xref(ctx, doc, buf); else fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize xref format"); @@ -1035,52 +1098,22 @@ return trailer; } -typedef struct ofs_list_s ofs_list; - -struct ofs_list_s -{ - int max; - int len; - fz_off_t *list; -}; - -static fz_off_t -read_xref_section(fz_context *ctx, 
pdf_document *doc, fz_off_t ofs, pdf_lexbuf *buf, ofs_list *offsets) +static int64_t +read_xref_section(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf) { pdf_obj *trailer = NULL; - fz_off_t xrefstmofs = 0; - fz_off_t prevofs = 0; - - fz_var(trailer); + pdf_obj *prevobj; + int64_t xrefstmofs = 0; + int64_t prevofs = 0; + trailer = pdf_read_xref(ctx, doc, ofs, buf); fz_try(ctx) { - int i; - /* Avoid potential infinite recursion */ - for (i = 0; i < offsets->len; i ++) - { - if (offsets->list[i] == ofs) - break; - } - if (i < offsets->len) - { - fz_warn(ctx, "ignoring xref recursion with offset %d", (int)ofs); - break; - } - if (offsets->len == offsets->max) - { - offsets->list = fz_resize_array(ctx, offsets->list, offsets->max*2, sizeof(*offsets->list)); - offsets->max *= 2; - } - offsets->list[offsets->len++] = ofs; - - trailer = pdf_read_xref(ctx, doc, ofs, buf); - pdf_set_populating_xref_trailer(ctx, doc, trailer); /* FIXME: do we overwrite free entries properly? */ /* FIXME: Does this work properly with progression? */ - xrefstmofs = pdf_to_offset(ctx, pdf_dict_get(ctx, trailer, PDF_NAME_XRefStm)); + xrefstmofs = pdf_to_int64(ctx, pdf_dict_get(ctx, trailer, PDF_NAME(XRefStm))); if (xrefstmofs) { if (xrefstmofs < 0) @@ -1094,44 +1127,62 @@ pdf_drop_obj(ctx, pdf_read_xref(ctx, doc, xrefstmofs, buf)); } - /* FIXME: pdf_to_offset? 
*/ - prevofs = pdf_to_offset(ctx, pdf_dict_get(ctx, trailer, PDF_NAME_Prev)); - if (prevofs < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "negative xref stream offset for previous xref stream"); + prevobj = pdf_dict_get(ctx, trailer, PDF_NAME(Prev)); + if (pdf_is_int(ctx, prevobj)) + { + prevofs = pdf_to_int64(ctx, prevobj); + if (prevofs <= 0) + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid offset for previous xref section"); + } } fz_always(ctx) - { pdf_drop_obj(ctx, trailer); - } fz_catch(ctx) - { fz_rethrow(ctx); - } return prevofs; } static void -pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, fz_off_t ofs, pdf_lexbuf *buf, int read_previous) +pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf, int read_previous) { - ofs_list list; + int i, len, cap; + int64_t *offsets; + + len = 0; + cap = 10; + offsets = fz_malloc_array(ctx, cap, sizeof(*offsets)); - list.len = 0; - list.max = 10; - list.list = fz_malloc_array(ctx, 10, sizeof(*list.list)); fz_try(ctx) { while(ofs) { + for (i = 0; i < len; i ++) + { + if (offsets[i] == ofs) + break; + } + if (i < len) + { + fz_warn(ctx, "ignoring xref section recursion at offset %d", (int)ofs); + break; + } + if (len == cap) + { + cap *= 2; + offsets = fz_resize_array(ctx, offsets, cap, sizeof(*offsets)); + } + offsets[len++] = ofs; + pdf_populate_next_xref_level(ctx, doc); - ofs = read_xref_section(ctx, doc, ofs, buf, &list); + ofs = read_xref_section(ctx, doc, ofs, buf); if (!read_previous) break; } } fz_always(ctx) { - fz_free(ctx, list.list); + fz_free(ctx, offsets); } fz_catch(ctx) { @@ -1203,7 +1254,7 @@ xref_len = pdf_xref_len(ctx, doc); for (i = 0; i < xref_len; i++) { - pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i); + entry = pdf_get_xref_entry(ctx, doc, i); if (entry->type == 'n') { /* Special case code: "0000000000 * n" means free, @@ -1218,7 +1269,7 @@ /* Read this into a local variable here, because pdf_get_xref_entry * may solidify the xref, hence invalidating 
"entry", meaning we * need a stashed value for the throw. */ - fz_off_t ofs = entry->ofs; + int64_t ofs = entry->ofs; if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry(ctx, doc, ofs)->type != 'n') fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i); } @@ -1232,7 +1283,7 @@ pdf_obj *hint = NULL; pdf_obj *o; int num, gen, lin, len; - fz_off_t stmofs; + int64_t stmofs; fz_var(dict); fz_var(hint); @@ -1244,29 +1295,29 @@ dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL); if (!pdf_is_dict(ctx, dict)) fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary"); - o = pdf_dict_get(ctx, dict, PDF_NAME_Linearized); + o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized)); if (o == NULL) fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary"); lin = pdf_to_int(ctx, o); if (lin != 1) fz_throw(ctx, FZ_ERROR_GENERIC, "Unexpected version of Linearized tag (%d)", lin); - len = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_L)); + len = pdf_dict_get_int(ctx, dict, PDF_NAME(L)); if (len != doc->file_length) fz_throw(ctx, FZ_ERROR_GENERIC, "File has been updated since linearization"); pdf_read_xref_sections(ctx, doc, fz_tell(ctx, doc->file), &doc->lexbuf.base, 0); - doc->page_count = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_N)); - doc->linear_page_refs = fz_resize_array(ctx, doc->linear_page_refs, doc->page_count, sizeof(pdf_obj *)); - memset(doc->linear_page_refs, 0, doc->page_count * sizeof(pdf_obj*)); + doc->linear_page_count = pdf_dict_get_int(ctx, dict, PDF_NAME(N)); + doc->linear_page_refs = fz_resize_array(ctx, doc->linear_page_refs, doc->linear_page_count, sizeof(pdf_obj *)); + memset(doc->linear_page_refs, 0, doc->linear_page_count * sizeof(pdf_obj*)); doc->linear_obj = dict; doc->linear_pos = fz_tell(ctx, doc->file); - doc->linear_page1_obj_num = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_O)); + doc->linear_page1_obj_num = 
pdf_dict_get_int(ctx, dict, PDF_NAME(O)); doc->linear_page_refs[0] = pdf_new_indirect(ctx, doc, doc->linear_page1_obj_num, 0); doc->linear_page_num = 0; - hint = pdf_dict_get(ctx, dict, PDF_NAME_H); - doc->hint_object_offset = pdf_to_int(ctx, pdf_array_get(ctx, hint, 0)); - doc->hint_object_length = pdf_to_int(ctx, pdf_array_get(ctx, hint, 1)); + hint = pdf_dict_get(ctx, dict, PDF_NAME(H)); + doc->hint_object_offset = pdf_array_get_int(ctx, hint, 0); + doc->hint_object_length = pdf_array_get_int(ctx, hint, 1); entry = pdf_get_populating_xref_entry(ctx, doc, 0); entry->type = 'f'; @@ -1343,8 +1394,8 @@ pdf_prime_xref_index(ctx, doc); } - encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Encrypt); - id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_ID); + encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt)); + id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID)); if (pdf_is_dict(ctx, encrypt)) doc->crypt = pdf_new_crypt(ctx, encrypt, id); @@ -1356,8 +1407,8 @@ int xref_len = pdf_xref_len(ctx, doc); pdf_repair_obj_stms(ctx, doc); - hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root) != NULL); - hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info) != NULL); + hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL); + hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL); for (i = 1; i < xref_len; i++) { @@ -1378,24 +1429,20 @@ if (!hasroot) { - obj = pdf_dict_get(ctx, dict, PDF_NAME_Type); - if (pdf_name_eq(ctx, obj, PDF_NAME_Catalog)) + obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); + if (pdf_name_eq(ctx, obj, PDF_NAME(Catalog))) { nobj = pdf_new_indirect(ctx, doc, i, 0); - pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root, nobj); - pdf_drop_obj(ctx, nobj); - nobj = NULL; + pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj); } } if (!hasinfo) { - if (pdf_dict_get(ctx, dict, PDF_NAME_Creator) || pdf_dict_get(ctx, dict, 
PDF_NAME_Producer)) + if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer))) { nobj = pdf_new_indirect(ctx, doc, i, 0); - pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info, nobj); - pdf_drop_obj(ctx, nobj); - nobj = NULL; + pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj); } } @@ -1411,7 +1458,6 @@ fz_catch(ctx) { pdf_drop_obj(ctx, dict); - pdf_drop_obj(ctx, nobj); fz_rethrow(ctx); } @@ -1426,12 +1472,12 @@ fz_try(ctx) { - char *version_str; - obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root, PDF_NAME_Version, NULL); + const char *version_str; + obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), PDF_NAME(Version), NULL); version_str = pdf_to_name(ctx, obj); if (*version_str) { - int version = 10 * (fz_atof(version_str) + 0.05); + int version = 10 * (fz_atof(version_str) + 0.05f); if (version > doc->version) doc->version = version; } @@ -1444,66 +1490,77 @@ { int i; + fz_defer_reap_start(ctx); + + /* Type3 glyphs in the glyph cache can contain pdf_obj pointers + * that we are about to destroy. Simplest solution is to bin the + * glyph cache at this point. */ fz_try(ctx) + fz_purge_glyph_cache(ctx); + fz_catch(ctx) { - fz_defer_reap_start(ctx); + /* Swallow error, but continue dropping */ + } - /* Type3 glyphs in the glyph cache can contain pdf_obj pointers - * that we are about to destroy. Simplest solution is to bin the - * glyph cache at this point. 
*/ - fz_purge_glyph_cache(ctx); +/* willu smod -- no pdf_drop_js */ +/* + pdf_drop_js(ctx, doc->js); +*/ - /* willus mod */ - /* - pdf_drop_js(ctx, doc->js); - */ + pdf_drop_xref_sections(ctx, doc); + fz_free(ctx, doc->xref_index); - pdf_drop_xref_sections(ctx, doc); - fz_free(ctx, doc->xref_index); + pdf_drop_obj(ctx, doc->focus_obj); + fz_drop_stream(ctx, doc->file); + pdf_drop_crypt(ctx, doc->crypt); - pdf_drop_obj(ctx, doc->focus_obj); - fz_drop_stream(ctx, doc->file); - pdf_drop_crypt(ctx, doc->crypt); + pdf_drop_obj(ctx, doc->linear_obj); + if (doc->linear_page_refs) + { + for (i=0; i < doc->linear_page_count; i++) + pdf_drop_obj(ctx, doc->linear_page_refs[i]); - pdf_drop_obj(ctx, doc->linear_obj); - if (doc->linear_page_refs) - { - for (i=0; i < doc->page_count; i++) - pdf_drop_obj(ctx, doc->linear_page_refs[i]); + fz_free(ctx, doc->linear_page_refs); + } - fz_free(ctx, doc->linear_page_refs); - } - fz_free(ctx, doc->hint_page); - fz_free(ctx, doc->hint_shared_ref); - fz_free(ctx, doc->hint_shared); - fz_free(ctx, doc->hint_obj_offsets); + fz_free(ctx, doc->hint_page); + fz_free(ctx, doc->hint_shared_ref); + fz_free(ctx, doc->hint_shared); + fz_free(ctx, doc->hint_obj_offsets); - for (i=0; i < doc->num_type3_fonts; i++) - { + for (i=0; i < doc->num_type3_fonts; i++) + { + fz_try(ctx) fz_decouple_type3_font(ctx, doc->type3_fonts[i], (void *)doc); + fz_always(ctx) fz_drop_font(ctx, doc->type3_fonts[i]); + fz_catch(ctx) + { + /* Swallow error, but continue dropping */ } - fz_free(ctx, doc->type3_fonts); + } - pdf_drop_ocg(ctx, doc); + fz_free(ctx, doc->type3_fonts); - pdf_empty_store(ctx, doc); + pdf_drop_ocg(ctx, doc); + pdf_drop_portfolio(ctx, doc); - pdf_lexbuf_fin(ctx, &doc->lexbuf.base); + pdf_empty_store(ctx, doc); - pdf_drop_resource_tables(ctx, doc); + pdf_lexbuf_fin(ctx, &doc->lexbuf.base); - for (i = 0; i < doc->orphans_count; i++) - pdf_drop_obj(ctx, doc->orphans[i]); + pdf_drop_resource_tables(ctx, doc); - fz_free(ctx, doc->orphans); - } - 
fz_always(ctx) - { - fz_defer_reap_end(ctx); - } - fz_catch(ctx) - fz_rethrow(ctx); + fz_drop_colorspace(ctx, doc->oi); + + for (i = 0; i < doc->orphans_count; i++) + pdf_drop_obj(ctx, doc->orphans[i]); + + fz_free(ctx, doc->orphans); + + fz_free(ctx, doc->rev_page_map); + + fz_defer_reap_end(ctx); } void @@ -1512,22 +1569,10 @@ fz_drop_document(ctx, &doc->super); } -void -pdf_print_xref(fz_context *ctx, pdf_document *doc) +pdf_document * +pdf_keep_document(fz_context *ctx, pdf_document *doc) { - int i; - int xref_len = pdf_xref_len(ctx, doc); - printf("xref\n0 %d\n", xref_len); - for (i = 0; i < xref_len; i++) - { - pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i); - printf("%05d: %010d %05d %c (stm_ofs=%d; stm_buf=%p)\n", i, - (int)entry->ofs, - entry->gen, - entry->type ? entry->type : '-', - (int)entry->stm_ofs, - entry->stm_buf); - } + return (pdf_document *)fz_keep_document(ctx, &doc->super); } /* @@ -1540,14 +1585,16 @@ fz_stream *stm = NULL; pdf_obj *objstm = NULL; int *numbuf = NULL; - fz_off_t *ofsbuf = NULL; + int64_t *ofsbuf = NULL; pdf_obj *obj; - fz_off_t first; + int64_t first; int count; int i; pdf_token tok; pdf_xref_entry *ret_entry = NULL; + int xref_len; + int found; fz_var(numbuf); fz_var(ofsbuf); @@ -1558,47 +1605,63 @@ { objstm = pdf_load_object(ctx, doc, num); - count = pdf_to_int(ctx, pdf_dict_get(ctx, objstm, PDF_NAME_N)); - first = pdf_to_int(ctx, pdf_dict_get(ctx, objstm, PDF_NAME_First)); + if (pdf_obj_marked(ctx, objstm)) + fz_throw(ctx, FZ_ERROR_GENERIC, "recursive object stream lookup"); + } + fz_catch(ctx) + { + pdf_drop_obj(ctx, objstm); + fz_rethrow(ctx); + } + + fz_try(ctx) + { + pdf_mark_obj(ctx, objstm); - if (count < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "negative number of objects in object stream"); - if (first < 0) - fz_throw(ctx, FZ_ERROR_GENERIC, "first object in object stream resides outside stream"); + count = pdf_dict_get_int(ctx, objstm, PDF_NAME(N)); + first = pdf_dict_get_int(ctx, objstm, PDF_NAME(First)); + + 
if (count < 0 || count > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_GENERIC, "number of objects in object stream out of range"); + if (first < 0 || first > PDF_MAX_OBJECT_NUMBER + || count < 0 || count > PDF_MAX_OBJECT_NUMBER + || first + count - 1 > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_GENERIC, "object stream object numbers are out of range"); numbuf = fz_calloc(ctx, count, sizeof(*numbuf)); ofsbuf = fz_calloc(ctx, count, sizeof(*ofsbuf)); + xref_len = pdf_xref_len(ctx, doc); + + found = 0; + stm = pdf_open_stream_number(ctx, doc, num); for (i = 0; i < count; i++) { tok = pdf_lex(ctx, stm, buf); if (tok != PDF_TOK_INT) fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num); - numbuf[i] = buf->i; + numbuf[found] = buf->i; tok = pdf_lex(ctx, stm, buf); if (tok != PDF_TOK_INT) fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num); - ofsbuf[i] = buf->i; - } + ofsbuf[found] = buf->i; - fz_seek(ctx, stm, first, SEEK_SET); + if (numbuf[found] <= 0 || numbuf[found] >= xref_len) + fz_warn(ctx, "object stream object out of range, skipping"); + else + found++; + } - for (i = 0; i < count; i++) + for (i = 0; i < found; i++) { - int xref_len = pdf_xref_len(ctx, doc); pdf_xref_entry *entry; + fz_seek(ctx, stm, first + ofsbuf[i], SEEK_SET); obj = pdf_parse_stm_obj(ctx, doc, stm, buf); - if (numbuf[i] <= 0 || numbuf[i] >= xref_len) - { - pdf_drop_obj(ctx, obj); - fz_throw(ctx, FZ_ERROR_GENERIC, "object id (%d 0 R) out of range (0..%d)", numbuf[i], xref_len - 1); - } - entry = pdf_get_xref_entry(ctx, doc, numbuf[i]); pdf_set_obj_parent(ctx, obj, numbuf[i]); @@ -1637,6 +1700,7 @@ fz_drop_stream(ctx, stm); fz_free(ctx, ofsbuf); fz_free(ctx, numbuf); + pdf_unmark_obj(ctx, objstm); pdf_drop_obj(ctx, objstm); } fz_catch(ctx) @@ -1650,11 +1714,11 @@ * object loading */ static int -pdf_obj_read(fz_context *ctx, pdf_document *doc, fz_off_t *offset, int *nump, pdf_obj **page) +pdf_obj_read(fz_context *ctx, pdf_document *doc, int64_t *offset, 
int *nump, pdf_obj **page) { pdf_lexbuf *buf = &doc->lexbuf.base; int num, gen, tok; - fz_off_t numofs, genofs, stmofs, tmpofs, newtmpofs; + int64_t numofs, genofs, stmofs, tmpofs, newtmpofs; int xref_len; pdf_xref_entry *entry; @@ -1781,7 +1845,6 @@ static void pdf_load_hinted_page(fz_context *ctx, pdf_document *doc, int pagenum) { - if (!doc->hints_loaded || !doc->linear_page_refs) return; @@ -1792,7 +1855,7 @@ { int num = doc->hint_page[pagenum].number; pdf_obj *page = pdf_load_object(ctx, doc, num); - if (pdf_name_eq(ctx, PDF_NAME_Page, pdf_dict_get(ctx, page, PDF_NAME_Type))) + if (pdf_name_eq(ctx, PDF_NAME(Page), pdf_dict_get(ctx, page, PDF_NAME(Type)))) { /* We have found the page object! */ DEBUGMESS((ctx, "LoadHintedPage pagenum=%d num=%d", pagenum, num)); @@ -1805,7 +1868,6 @@ fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); /* Silently swallow the error and proceed as normal */ } - } static int @@ -1816,7 +1878,7 @@ * there. */ int expected = num; int curr_pos; - fz_off_t start, offset; + int64_t start, offset; while (doc->hint_obj_offsets[expected] == 0 && expected > 0) expected--; @@ -1902,7 +1964,7 @@ if (x->type == 'f') { - x->obj = pdf_new_null(ctx, doc); + x->obj = PDF_NULL; } else if (x->type == 'n') { @@ -1937,6 +1999,7 @@ { pdf_repair_xref(ctx, doc); pdf_prime_xref_index(ctx, doc); + pdf_repair_obj_stms(ctx, doc); } fz_catch(ctx) { @@ -1983,7 +2046,6 @@ pdf_load_object(fz_context *ctx, pdf_document *doc, int num) { pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num); - assert(entry->obj != NULL); return pdf_keep_obj(ctx, entry->obj); } @@ -2049,6 +2111,10 @@ /* TODO: reuse free object slots by properly linking free object chains in the ofs field */ pdf_xref_entry *entry; int num = pdf_xref_len(ctx, doc); + + if (num > PDF_MAX_OBJECT_NUMBER) + fz_throw(ctx, FZ_ERROR_GENERIC, "too many objects stored in pdf"); + entry = pdf_get_incremental_xref_entry(ctx, doc, num); entry->type = 'f'; entry->ofs = -1; @@ -2096,6 +2162,12 @@ return; } + if (!newobj) + { 
+ pdf_delete_object(ctx, doc, num); + return; + } + x = pdf_get_incremental_xref_entry(ctx, doc, num); pdf_drop_obj(ctx, x->obj); @@ -2128,11 +2200,11 @@ fz_drop_buffer(ctx, x->stm_buf); x->stm_buf = fz_keep_buffer(ctx, newbuf); - pdf_dict_puts_drop(ctx, obj, "Length", pdf_new_int(ctx, doc, (int)fz_buffer_storage(ctx, newbuf, NULL))); + pdf_dict_put_int(ctx, obj, PDF_NAME(Length), (int)fz_buffer_storage(ctx, newbuf, NULL)); if (!compressed) { - pdf_dict_dels(ctx, obj, "Filter"); - pdf_dict_dels(ctx, obj, "DecodeParms"); + pdf_dict_del(ctx, obj, PDF_NAME(Filter)); + pdf_dict_del(ctx, obj, PDF_NAME(DecodeParms)); } } @@ -2157,10 +2229,10 @@ if (strstr(key, "info:") == key) { pdf_obj *info; - char *s; + const char *s; int n; - info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info); + info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)); if (!info) return -1; @@ -2168,9 +2240,8 @@ if (!info) return -1; - s = pdf_to_utf8(ctx, info); + s = pdf_to_text_string(ctx, info); n = (int)fz_strlcpy(buf, s, size); - fz_free(ctx, s); return n; } @@ -2190,18 +2261,18 @@ static pdf_document * pdf_new_document(fz_context *ctx, fz_stream *file) { - pdf_document *doc = fz_new_document(ctx, pdf_document); + pdf_document *doc = fz_new_derived_document(ctx, pdf_document); - doc->super.drop_document = (fz_document_drop_fn *)pdf_drop_document_imp; - doc->super.needs_password = (fz_document_needs_password_fn *)pdf_needs_password; - doc->super.authenticate_password = (fz_document_authenticate_password_fn *)pdf_authenticate_password; - doc->super.has_permission = (fz_document_has_permission_fn *)pdf_has_permission; - doc->super.load_outline = (fz_document_load_outline_fn *)pdf_load_outline; - doc->super.resolve_link = (fz_document_resolve_link_fn *)pdf_resolve_link; - doc->super.count_pages = (fz_document_count_pages_fn *)pdf_count_pages; - doc->super.load_page = (fz_document_load_page_fn *)pdf_load_page; - doc->super.lookup_metadata = (fz_document_lookup_metadata_fn 
*)pdf_lookup_metadata; - doc->update_appearance = pdf_update_appearance; + doc->super.drop_document = (fz_document_drop_fn*)pdf_drop_document_imp; + doc->super.get_output_intent = (fz_document_output_intent_fn*)pdf_document_output_intent; + doc->super.needs_password = (fz_document_needs_password_fn*)pdf_needs_password; + doc->super.authenticate_password = (fz_document_authenticate_password_fn*)pdf_authenticate_password; + doc->super.has_permission = (fz_document_has_permission_fn*)pdf_has_permission; + doc->super.load_outline = (fz_document_load_outline_fn*)pdf_load_outline; + doc->super.resolve_link = (fz_document_resolve_link_fn*)pdf_resolve_link; + doc->super.count_pages = (fz_document_count_pages_fn*)pdf_count_pages; + doc->super.load_page = (fz_document_load_page_fn*)pdf_load_page; + doc->super.lookup_metadata = (fz_document_lookup_metadata_fn*)pdf_lookup_metadata; pdf_lexbuf_init(ctx, &doc->lexbuf.base, PDF_LEXBUF_LARGE); doc->file = fz_keep_stream(ctx, file); @@ -2280,12 +2351,12 @@ if (dict == NULL || !pdf_is_dict(ctx, dict)) fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint object"); - shared_hint_offset = pdf_to_int(ctx, pdf_dict_get(ctx, dict, PDF_NAME_S)); + shared_hint_offset = pdf_dict_get_int(ctx, dict, PDF_NAME(S)); /* Malloc the structures (use realloc to cope with the fact we * may try this several times before enough data is loaded) */ - doc->hint_page = fz_resize_array(ctx, doc->hint_page, doc->page_count+1, sizeof(*doc->hint_page)); - memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->page_count+1)); + doc->hint_page = fz_resize_array(ctx, doc->hint_page, doc->linear_page_count+1, sizeof(*doc->hint_page)); + memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->linear_page_count+1)); doc->hint_obj_offsets = fz_resize_array(ctx, doc->hint_obj_offsets, max_object_num, sizeof(*doc->hint_obj_offsets)); memset(doc->hint_obj_offsets, 0, sizeof(*doc->hint_obj_offsets) * max_object_num); doc->hint_obj_offsets_max = max_object_num; @@ -2316,7 
+2387,7 @@ /* We don't care about the number of objects in the first page */ (void)fz_read_bits(ctx, stream, page_obj_num_bits); j = 1; - for (i = 1; i < doc->page_count; i++) + for (i = 1; i < doc->linear_page_count; i++) { int delta_page_objs = fz_read_bits(ctx, stream, page_obj_num_bits); @@ -2327,7 +2398,7 @@ fz_sync_bits(ctx, stream); /* Item 2: Page lengths */ j = doc->hint_page[0].offset; - for (i = 0; i < doc->page_count; i++) + for (i = 0; i < doc->linear_page_count; i++) { int delta_page_len = fz_read_bits(ctx, stream, page_len_num_bits); int old = j; @@ -2341,7 +2412,7 @@ fz_sync_bits(ctx, stream); /* Item 3: Shared references */ shared = 0; - for (i = 0; i < doc->page_count; i++) + for (i = 0; i < doc->linear_page_count; i++) { int num_shared_objs = fz_read_bits(ctx, stream, num_shared_obj_num_bits); doc->hint_page[i].index = shared; @@ -2446,7 +2517,7 @@ { doc->hint_obj_offsets[doc->hint_shared[i].number] = doc->hint_shared[i].offset; } - for (i = 0; i < doc->page_count; i++) + for (i = 0; i < doc->linear_page_count; i++) { doc->hint_obj_offsets[doc->hint_page[i].number] = doc->hint_page[i].offset; } @@ -2460,7 +2531,7 @@ fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); /* Don't try to load hints again */ doc->hints_loaded = 1; - /* We won't use the linearized object any more. */ + /* We won't use the linearized object anymore. 
*/ doc->file_reading_linearly = 0; /* Any other error becomes a TRYLATER */ fz_throw(ctx, FZ_ERROR_TRYLATER, "malformed hints object"); @@ -2472,7 +2543,7 @@ pdf_load_hint_object(fz_context *ctx, pdf_document *doc) { pdf_lexbuf *buf = &doc->lexbuf.base; - fz_off_t curr_pos; + int64_t curr_pos; curr_pos = fz_tell(ctx, doc->file); fz_seek(ctx, doc->file, doc->hint_object_offset, SEEK_SET); @@ -2481,7 +2552,7 @@ while (1) { pdf_obj *page = NULL; - fz_off_t tmpofs; + int64_t tmpofs; int num, tok; tok = pdf_lex(ctx, doc->file, buf); @@ -2513,12 +2584,12 @@ { pdf_lexbuf *buf = &doc->lexbuf.base; int curr_pos; - pdf_obj *page; + pdf_obj *page = NULL; pdf_load_hinted_page(ctx, doc, pagenum); - if (pagenum < 0 || pagenum >= doc->page_count) - fz_throw(ctx, FZ_ERROR_GENERIC, "page load out of range (%d of %d)", pagenum, doc->page_count); + if (pagenum < 0 || pagenum >= doc->linear_page_count) + fz_throw(ctx, FZ_ERROR_GENERIC, "page load out of range (%d of %d)", pagenum, doc->linear_page_count); if (doc->linear_pos == doc->file_length) return doc->linear_page_refs[pagenum]; @@ -2541,7 +2612,6 @@ do { int num; - page = NULL; eof = pdf_obj_read(ctx, doc, &doc->linear_pos, &num, &page); pdf_drop_obj(ctx, page); page = NULL; @@ -2553,8 +2623,8 @@ pdf_obj *pages; doc->linear_pos = doc->file_length; pdf_load_xref(ctx, doc, buf); - catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root); - pages = pdf_dict_get(ctx, catalog, PDF_NAME_Pages); + catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); + pages = pdf_dict_get(ctx, catalog, PDF_NAME(Pages)); if (!pdf_is_dict(ctx, pages)) fz_throw(ctx, FZ_ERROR_GENERIC, "missing page tree"); @@ -2585,17 +2655,17 @@ pdf_document *pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr) { - return (pdf_document *)((ptr && ptr->count_pages == (void*)pdf_count_pages) ? ptr : NULL); + return (pdf_document *)((ptr && ptr->count_pages == (fz_document_count_pages_fn*)pdf_count_pages) ? 
ptr : NULL); } pdf_page *pdf_page_from_fz_page(fz_context *ctx, fz_page *ptr) { - return (pdf_page *)((ptr && ptr->bound_page == (void*)pdf_bound_page) ? ptr : NULL); + return (pdf_page *)((ptr && ptr->bound_page == (fz_page_bound_page_fn*)pdf_bound_page) ? ptr : NULL); } pdf_annot *pdf_annot_from_fz_annot(fz_context *ctx, fz_annot *ptr) { - return (pdf_annot *)((ptr && ptr->bound_annot == (void*)pdf_bound_annot) ? ptr : NULL); + return (pdf_annot *)((ptr && ptr->bound_annot == (fz_annot_bound_fn*)pdf_bound_annot) ? ptr : NULL); } pdf_document *pdf_specifics(fz_context *ctx, fz_document *doc) @@ -2622,7 +2692,7 @@ pdf_obj * pdf_add_object_drop(fz_context *ctx, pdf_document *doc, pdf_obj *obj) { - pdf_obj *ind; + pdf_obj *ind = NULL; fz_try(ctx) ind = pdf_add_object(ctx, doc, obj); fz_always(ctx) @@ -2633,11 +2703,23 @@ } pdf_obj * +pdf_add_new_dict(fz_context *ctx, pdf_document *doc, int initial) +{ + return pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, initial)); +} + +pdf_obj * +pdf_add_new_array(fz_context *ctx, pdf_document *doc, int initial) +{ + return pdf_add_object_drop(ctx, doc, pdf_new_array(ctx, doc, initial)); +} + +pdf_obj * pdf_add_stream(fz_context *ctx, pdf_document *doc, fz_buffer *buf, pdf_obj *obj, int compressed) { pdf_obj *ind; if (!obj) - ind = pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, 4)); + ind = pdf_add_new_dict(ctx, doc, 4); else ind = pdf_add_object(ctx, doc, obj); fz_try(ctx) @@ -2653,12 +2735,10 @@ pdf_document *pdf_create_document(fz_context *ctx) { pdf_document *doc; - pdf_obj *o = NULL; pdf_obj *root; pdf_obj *pages; pdf_obj *trailer = NULL; - fz_var(o); fz_var(trailer); doc = pdf_new_document(ctx, NULL); @@ -2672,53 +2752,50 @@ doc->xref_base = 0; doc->disallow_new_increments = 0; pdf_get_populating_xref_entry(ctx, doc, 0); + trailer = pdf_new_dict(ctx, doc, 2); - pdf_dict_put_drop(ctx, trailer, PDF_NAME_Size, pdf_new_int(ctx, doc, 3)); - o = root = pdf_new_dict(ctx, doc, 2); - pdf_dict_put_drop(ctx, trailer, 
PDF_NAME_Root, pdf_add_object(ctx, doc, o)); - pdf_drop_obj(ctx, o); - o = NULL; - pdf_dict_put_drop(ctx, root, PDF_NAME_Type, PDF_NAME_Catalog); - o = pages = pdf_new_dict(ctx, doc, 3); - pdf_dict_put_drop(ctx, root, PDF_NAME_Pages, pdf_add_object(ctx, doc, o)); - pdf_drop_obj(ctx, o); - o = NULL; - pdf_dict_put_drop(ctx, pages, PDF_NAME_Type, PDF_NAME_Pages); - pdf_dict_put_drop(ctx, pages, PDF_NAME_Count, pdf_new_int(ctx, doc, 0)); - pdf_dict_put_drop(ctx, pages, PDF_NAME_Kids, pdf_new_array(ctx, doc, 1)); + pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), 3); + pdf_dict_put_drop(ctx, trailer, PDF_NAME(Root), root = pdf_add_new_dict(ctx, doc, 2)); + pdf_dict_put(ctx, root, PDF_NAME(Type), PDF_NAME(Catalog)); + pdf_dict_put_drop(ctx, root, PDF_NAME(Pages), pages = pdf_add_new_dict(ctx, doc, 3)); + pdf_dict_put(ctx, pages, PDF_NAME(Type), PDF_NAME(Pages)); + pdf_dict_put_int(ctx, pages, PDF_NAME(Count), 0); + pdf_dict_put_array(ctx, pages, PDF_NAME(Kids), 1); + /* Set the trailer of the final xref section. 
*/ doc->xref_sections[0].trailer = trailer; } fz_catch(ctx) { pdf_drop_obj(ctx, trailer); - pdf_drop_obj(ctx, o); + fz_drop_document(ctx, &doc->super); fz_rethrow(ctx); } return doc; } -int -pdf_recognize(fz_context *doc, const char *magic) +static const char *pdf_extensions[] = { - char *ext = strrchr(magic, '.'); - - if (ext) - { - if (!fz_strcasecmp(ext, ".pdf")) - return 100; - } - if (!strcmp(magic, "pdf") || !strcmp(magic, "application/pdf")) - return 100; + "pdf", + "pclm", + "ai", + NULL +}; - return 1; -} +static const char *pdf_mimetypes[] = +{ + "application/pdf", + "application/PCLm", + NULL +}; fz_document_handler pdf_document_handler = { - (fz_document_recognize_fn *)&pdf_recognize, - (fz_document_open_fn *)&pdf_open_document, - (fz_document_open_with_stream_fn *)&pdf_open_document_with_stream + NULL, + (fz_document_open_fn*)pdf_open_document, + (fz_document_open_with_stream_fn*)pdf_open_document_with_stream, + pdf_extensions, + pdf_mimetypes }; void pdf_mark_xref(fz_context *ctx, pdf_document *doc) diff -Nru k2pdfopt-2.42+ds/mupdf_mod/pkcs7-openssl.c k2pdfopt-2.51+ds/mupdf_mod/pkcs7-openssl.c --- k2pdfopt-2.42+ds/mupdf_mod/pkcs7-openssl.c 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/pkcs7-openssl.c 2018-11-21 02:44:04.000000000 +0000 @@ -0,0 +1,832 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" +/* willus mod -- remove ../../fitz/ */ +#include "fitz-imp.h" /* for fz_keep/drop_imp */ + +#include "mupdf/helpers/pkcs7-openssl.h" + +#ifndef HAVE_LIBCRYPTO + +enum pdf_signature_error +pkcs7_openssl_check_digest(fz_context *ctx, fz_stream *stm, char *sig, int sig_len) +{ + return PDF_SIGNATURE_ERROR_UNKNOWN; +} + +/* Check a singature's certificate is trusted */ +enum pdf_signature_error +pkcs7_openssl_check_certificate(char *sig, int sig_len) +{ + return PDF_SIGNATURE_ERROR_UNKNOWN; +} + +pdf_pkcs7_designated_name * +pkcs7_openssl_designated_name(fz_context *ctx, char *sig, int sig_len) +{ + return NULL; +} + +void 
+pkcs7_openssl_drop_designated_name(fz_context *ctx, pdf_pkcs7_designated_name *dn) +{ +} + +pdf_pkcs7_signer * +pkcs7_openssl_read_pfx(fz_context *ctx, const char *pfile, const char *pw) +{ + fz_throw(ctx, FZ_ERROR_GENERIC, "No OpenSSL support."); +} + +#else + +#include +#include + +/* Generated from resources/certs/AdobeCA.p7c */ +static const char AdobeCA_p7c[] = { +48,130,4,208,6,9,42,134,72,134,247,13,1,7,2,160,130,4,193,48,130,4,189,2, +1,1,49,0,48,11,6,9,42,134,72,134,247,13,1,7,1,160,130,4,165,48,130,4,161, +48,130,3,137,160,3,2,1,2,2,4,62,28,189,40,48,13,6,9,42,134,72,134,247,13, +1,1,5,5,0,48,105,49,11,48,9,6,3,85,4,6,19,2,85,83,49,35,48,33,6,3,85,4,10, +19,26,65,100,111,98,101,32,83,121,115,116,101,109,115,32,73,110,99,111,114, +112,111,114,97,116,101,100,49,29,48,27,6,3,85,4,11,19,20,65,100,111,98,101, +32,84,114,117,115,116,32,83,101,114,118,105,99,101,115,49,22,48,20,6,3,85, +4,3,19,13,65,100,111,98,101,32,82,111,111,116,32,67,65,48,30,23,13,48,51, +48,49,48,56,50,51,51,55,50,51,90,23,13,50,51,48,49,48,57,48,48,48,55,50,51, +90,48,105,49,11,48,9,6,3,85,4,6,19,2,85,83,49,35,48,33,6,3,85,4,10,19,26, +65,100,111,98,101,32,83,121,115,116,101,109,115,32,73,110,99,111,114,112, +111,114,97,116,101,100,49,29,48,27,6,3,85,4,11,19,20,65,100,111,98,101,32, +84,114,117,115,116,32,83,101,114,118,105,99,101,115,49,22,48,20,6,3,85,4, +3,19,13,65,100,111,98,101,32,82,111,111,116,32,67,65,48,130,1,34,48,13,6, +9,42,134,72,134,247,13,1,1,1,5,0,3,130,1,15,0,48,130,1,10,2,130,1,1,0,204, +79,84,132,247,167,162,231,51,83,127,63,156,18,136,107,44,153,71,103,126,15, +30,185,173,20,136,249,195,16,216,29,240,240,213,159,105,10,47,89,53,176,204, +108,169,76,156,21,160,159,206,32,191,160,207,84,226,224,32,102,69,63,57,134, +56,126,156,196,142,7,34,198,36,246,1,18,176,53,223,85,234,105,144,176,219, +133,55,30,226,78,7,178,66,161,106,19,105,160,102,234,128,145,17,89,42,155, +8,121,90,32,68,45,201,189,115,56,139,60,47,224,67,27,93,179,11,240,175,53, 
+26,41,254,239,166,146,221,129,76,157,61,89,142,173,49,60,64,126,155,145,54, +6,252,226,92,141,209,141,38,213,92,69,207,175,101,63,177,170,210,98,150,244, +168,56,234,186,96,66,244,244,28,74,53,21,206,248,78,34,86,15,149,24,197,248, +150,159,159,251,176,183,120,37,233,128,107,189,214,10,240,198,116,148,157, +243,15,80,219,154,119,206,75,112,131,35,141,160,202,120,32,68,92,60,84,100, +241,234,162,48,25,159,234,76,6,77,6,120,75,94,146,223,34,210,201,103,179, +122,210,1,2,3,1,0,1,163,130,1,79,48,130,1,75,48,17,6,9,96,134,72,1,134,248, +66,1,1,4,4,3,2,0,7,48,129,142,6,3,85,29,31,4,129,134,48,129,131,48,129,128, +160,126,160,124,164,122,48,120,49,11,48,9,6,3,85,4,6,19,2,85,83,49,35,48, +33,6,3,85,4,10,19,26,65,100,111,98,101,32,83,121,115,116,101,109,115,32,73, +110,99,111,114,112,111,114,97,116,101,100,49,29,48,27,6,3,85,4,11,19,20,65, +100,111,98,101,32,84,114,117,115,116,32,83,101,114,118,105,99,101,115,49, +22,48,20,6,3,85,4,3,19,13,65,100,111,98,101,32,82,111,111,116,32,67,65,49, +13,48,11,6,3,85,4,3,19,4,67,82,76,49,48,43,6,3,85,29,16,4,36,48,34,128,15, +50,48,48,51,48,49,48,56,50,51,51,55,50,51,90,129,15,50,48,50,51,48,49,48, +57,48,48,48,55,50,51,90,48,11,6,3,85,29,15,4,4,3,2,1,6,48,31,6,3,85,29,35, +4,24,48,22,128,20,130,183,56,74,147,170,155,16,239,128,187,217,84,226,241, +15,251,128,156,222,48,29,6,3,85,29,14,4,22,4,20,130,183,56,74,147,170,155, +16,239,128,187,217,84,226,241,15,251,128,156,222,48,12,6,3,85,29,19,4,5,48, +3,1,1,255,48,29,6,9,42,134,72,134,246,125,7,65,0,4,16,48,14,27,8,86,54,46, +48,58,52,46,48,3,2,4,144,48,13,6,9,42,134,72,134,247,13,1,1,5,5,0,3,130,1, +1,0,50,218,159,67,117,193,250,111,201,111,219,171,29,54,55,62,188,97,25,54, +183,2,60,29,35,89,152,108,158,238,77,133,231,84,200,32,31,167,212,187,226, +191,0,119,125,36,107,112,47,92,193,58,118,73,181,211,224,35,132,42,113,106, +34,243,193,39,41,152,21,246,53,144,228,4,76,195,141,188,159,97,28,231,253, +36,140,209,68,67,140,22,186,155,77,165,212,53,47,188,17,206,189,247,81,55, 
+141,159,144,228,20,241,24,63,190,233,89,18,53,249,51,146,243,158,224,213, +107,154,113,155,153,75,200,113,195,225,177,97,9,196,229,250,145,240,66,58, +55,125,52,249,114,232,205,170,98,28,33,233,213,244,130,16,227,123,5,182,45, +104,86,11,126,126,146,44,111,77,114,130,12,237,86,116,178,157,185,171,45, +43,29,16,95,219,39,117,112,143,253,29,215,226,2,160,121,229,28,229,255,175, +100,64,81,45,158,155,71,219,66,165,124,31,194,166,72,176,215,190,146,105, +77,164,246,41,87,197,120,17,24,220,135,81,202,19,178,98,157,79,43,50,189, +49,165,193,250,82,171,5,136,200,49,0 +}; + +#include "openssl/err.h" +#include "openssl/bio.h" +#include "openssl/asn1.h" +#include "openssl/x509.h" +#include "openssl/x509v3.h" +#include "openssl/err.h" +#include "openssl/objects.h" +#include "openssl/pem.h" +#include "openssl/pkcs7.h" +#include "openssl/pkcs12.h" +#include "openssl/opensslv.h" + +#ifndef OPENSSL_VERSION_NUMBER +#warning detect version of openssl at compile time +#endif + +typedef struct +{ + fz_context *ctx; + fz_stream *stm; +} BIO_stream_data; + +static int stream_read(BIO *b, char *buf, int size) +{ + BIO_stream_data *data = (BIO_stream_data *)BIO_get_data(b); + return fz_read(data->ctx, data->stm, (unsigned char *) buf, size); +} + +static long stream_ctrl(BIO *b, int cmd, long arg1, void *arg2) +{ + BIO_stream_data *data = (BIO_stream_data *)BIO_get_data(b); + switch (cmd) + { + case BIO_C_FILE_SEEK: + fz_seek(data->ctx, data->stm, arg1, SEEK_SET); + return 0; + default: + return 1; + } +} + +static int stream_new(BIO *b) +{ + BIO_stream_data *data = (BIO_stream_data *)malloc(sizeof(BIO_stream_data)); + if (!data) + return 0; + + data->ctx = NULL; + data->stm = NULL; + + BIO_set_init(b, 1); + BIO_set_data(b, data); + BIO_clear_flags(b, INT_MAX); + + return 1; +} + +static int stream_free(BIO *b) +{ + if (b == NULL) + return 0; + + free(BIO_get_data(b)); + BIO_set_data(b, NULL); + BIO_set_init(b, 0); + BIO_clear_flags(b, INT_MAX); + + return 1; +} + +static long 
stream_callback_ctrl(BIO *b, int cmd, bio_info_cb *fp) +{ + return 1; +} + +static BIO *BIO_new_stream(fz_context *ctx, fz_stream *stm) +{ + static BIO_METHOD *methods = NULL; + BIO *bio; + BIO_stream_data *data; + + if (!methods) + { + methods = BIO_meth_new(BIO_TYPE_NONE, "segment reader"); + if (!methods) + return NULL; + + BIO_meth_set_read(methods, stream_read); + BIO_meth_set_ctrl(methods, stream_ctrl); + BIO_meth_set_create(methods, stream_new); + BIO_meth_set_destroy(methods, stream_free); + BIO_meth_set_callback_ctrl(methods, stream_callback_ctrl); + } + + bio = BIO_new(methods); + data = BIO_get_data(bio); + data->ctx = ctx; + data->stm = stm; + + return bio; +} + +static int verify_callback(int ok, X509_STORE_CTX *ctx) +{ + int err, depth; + + err = X509_STORE_CTX_get_error(ctx); + depth = X509_STORE_CTX_get_error_depth(ctx); + + if (!ok && depth >= 6) + { + X509_STORE_CTX_set_error(ctx, X509_V_ERR_CERT_CHAIN_TOO_LONG); + } + + switch (err) + { + case X509_V_ERR_INVALID_PURPOSE: + case X509_V_ERR_CERT_HAS_EXPIRED: + case X509_V_ERR_KEYUSAGE_NO_CERTSIGN: + X509_STORE_CTX_set_error(ctx, X509_V_OK); + ok = 1; + break; + + default: + break; + } + + return ok; +} + +/* Get the certificates from a PKCS7 object */ +static STACK_OF(X509) *pk7_certs(PKCS7 *pk7) +{ + if (pk7 == NULL || pk7->d.ptr == NULL) + return NULL; + + if (PKCS7_type_is_signed(pk7)) + return pk7->d.sign->cert; + else if (PKCS7_type_is_signedAndEnveloped(pk7)) + return pk7->d.signed_and_enveloped->cert; + else + return NULL; +} + +/* Get the signing certificate from a PKCS7 object */ +static X509 *pk7_signer(STACK_OF(X509) *certs, PKCS7_SIGNER_INFO *si) +{ + PKCS7_ISSUER_AND_SERIAL *ias = si->issuer_and_serial; + if (certs == NULL) + return NULL; + + return X509_find_by_issuer_and_serial(certs, ias->issuer, ias->serial); +} + +static enum pdf_signature_error pk7_verify_sig(PKCS7 *p7, BIO *detached) +{ + BIO *p7bio=NULL; + char readbuf[1024*4]; + int res = PDF_SIGNATURE_ERROR_UNKNOWN; + int i; 
+ STACK_OF(PKCS7_SIGNER_INFO) *sk; + + ERR_clear_error(); + + p7bio = PKCS7_dataInit(p7, detached); + if (!p7bio) + goto exit; + + + /* We now have to 'read' from p7bio to calculate digests etc. */ + while (BIO_read(p7bio, readbuf, sizeof(readbuf)) > 0) + ; + + /* We can now verify signatures */ + sk = PKCS7_get_signer_info(p7); + if (sk == NULL || sk_PKCS7_SIGNER_INFO_num(sk) <= 0) + { + /* there are no signatures on this data */ + res = PDF_SIGNATURE_ERROR_NO_SIGNATURES; + goto exit; + } + + for (i=0; i 0) + { + res = PDF_SIGNATURE_ERROR_OKAY; + } + else + { + long err = ERR_GET_REASON(ERR_get_error()); + switch (err) + { + case PKCS7_R_DIGEST_FAILURE: + res = PDF_SIGNATURE_ERROR_DOCUMENT_CHANGED; + break; + default: + break; + } + goto exit; + } + } + +exit: + ERR_free_strings(); + + return res; +} + +static enum pdf_signature_error pk7_verify_cert(X509_STORE *cert_store, PKCS7 *p7) +{ + int res = PDF_SIGNATURE_ERROR_OKAY; + int i; + STACK_OF(PKCS7_SIGNER_INFO) *sk; + X509_STORE_CTX *ctx; + + ctx = X509_STORE_CTX_new(); + if (!ctx) + return PDF_SIGNATURE_ERROR_UNKNOWN; + + ERR_clear_error(); + + X509_STORE_set_verify_cb_func(cert_store, verify_callback); + + /* We can now verify signatures */ + sk = PKCS7_get_signer_info(p7); + if (sk == NULL) + { + /* there are no signatures on this data */ + res = PDF_SIGNATURE_ERROR_NO_SIGNATURES; + goto exit; + } + + for (i=0; i= 0) + { + X509_EXTENSION *ext = X509_get_ext(cert, i); + X509_delete_ext(cert, i); + X509_EXTENSION_free(ext); + } + } + + if (!X509_STORE_CTX_init(ctx, cert_store, cert, certs)) + { + res = PDF_SIGNATURE_ERROR_UNKNOWN; + goto exit; + } + + if (!X509_STORE_CTX_set_purpose(ctx, X509_PURPOSE_SMIME_SIGN)) + { + res = PDF_SIGNATURE_ERROR_UNKNOWN; + goto exit; + } + + /* X509_verify_cert may return an error, but in all such cases + * it sets a context error */ + X509_verify_cert(ctx); + X509_STORE_CTX_cleanup(ctx); + ctx_err = X509_STORE_CTX_get_error(ctx); + switch (ctx_err) + { + case X509_V_OK: + 
break; + case X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT: + res = PDF_SIGNATURE_ERROR_SELF_SIGNED; + goto exit; + case X509_V_ERR_SELF_SIGNED_CERT_IN_CHAIN: + res = PDF_SIGNATURE_ERROR_SELF_SIGNED_IN_CHAIN; + goto exit; + default: + res = PDF_SIGNATURE_ERROR_UNKNOWN; + goto exit; + } + } + +exit: + X509_STORE_CTX_cleanup(ctx); + ERR_free_strings(); + + return res; +} + +enum pdf_signature_error pkcs7_openssl_check_digest(fz_context *ctx, fz_stream *stm, char *sig, int sig_len) +{ + PKCS7 *pk7sig = NULL; + BIO *bsig = NULL; + BIO *bdata = NULL; + int res = PDF_SIGNATURE_ERROR_UNKNOWN; + + bsig = BIO_new_mem_buf(sig, sig_len); + pk7sig = d2i_PKCS7_bio(bsig, NULL); + if (pk7sig == NULL) + goto exit; + + bdata = BIO_new_stream(ctx, stm); + if (bdata == NULL) + goto exit; + + res = pk7_verify_sig(pk7sig, bdata); + +exit: + BIO_free(bsig); + BIO_free(bdata); + PKCS7_free(pk7sig); + + return res; +} + +enum pdf_signature_error pkcs7_openssl_check_certificate(char *sig, int sig_len) +{ + PKCS7 *pk7sig = NULL; + PKCS7 *pk7cert = NULL; + X509_STORE *st = NULL; + BIO *bsig = NULL; + BIO *bcert = NULL; + STACK_OF(X509) *certs = NULL; + int res = 0; + + bsig = BIO_new_mem_buf(sig, sig_len); + pk7sig = d2i_PKCS7_bio(bsig, NULL); + if (pk7sig == NULL) + goto exit; + + /* Find the certificates in the pk7 file */ + bcert = BIO_new_mem_buf((void*)AdobeCA_p7c, sizeof AdobeCA_p7c); + pk7cert = d2i_PKCS7_bio(bcert, NULL); + if (pk7cert == NULL) + goto exit; + + certs = pk7_certs(pk7cert); + + st = X509_STORE_new(); + if (st == NULL) + goto exit; + + /* Add the certificates to the store */ + if (certs != NULL) + { + int i, n = sk_X509_num(certs); + + for (i = 0; i < n; i++) + { + X509 *c = sk_X509_value(certs, i); + X509_STORE_add_cert(st, c); + } + } + + res = pk7_verify_cert(st, pk7sig); + +exit: + BIO_free(bsig); + BIO_free(bcert); + PKCS7_free(pk7sig); + PKCS7_free(pk7cert); + X509_STORE_free(st); + + return res; +} + +typedef struct pdf_pkcs7_designated_name_openssl_s +{ + 
pdf_pkcs7_designated_name base; + char buf[8192]; +} pdf_pkcs7_designated_name_openssl; + +void pkcs7_openssl_drop_designated_name(fz_context *ctx, pdf_pkcs7_designated_name *dn) +{ + fz_free(ctx, dn); +} + +typedef struct +{ + pdf_pkcs7_signer base; + fz_context *ctx; + int refs; + X509 *x509; + EVP_PKEY *pkey; +} openssl_signer; + +static void signer_drop_designated_name(pdf_pkcs7_signer *signer, pdf_pkcs7_designated_name *dn) +{ + openssl_signer *osigner = (openssl_signer *)signer; + fz_free(osigner->ctx, dn); +} + +static void add_from_bags(X509 **pX509, EVP_PKEY **pPkey, const STACK_OF(PKCS12_SAFEBAG) *bags, const char *pw); + +static void add_from_bag(X509 **pX509, EVP_PKEY **pPkey, PKCS12_SAFEBAG *bag, const char *pw) +{ + EVP_PKEY *pkey = NULL; + X509 *x509 = NULL; + switch (M_PKCS12_bag_type(bag)) + { + case NID_keyBag: + { + const PKCS8_PRIV_KEY_INFO *p8 = PKCS12_SAFEBAG_get0_p8inf(bag); + pkey = EVP_PKCS82PKEY(p8); + } + break; + + case NID_pkcs8ShroudedKeyBag: + { + PKCS8_PRIV_KEY_INFO *p8 = PKCS12_decrypt_skey(bag, pw, (int)strlen(pw)); + if (p8) + { + pkey = EVP_PKCS82PKEY(p8); + PKCS8_PRIV_KEY_INFO_free(p8); + } + } + break; + + case NID_certBag: + if (M_PKCS12_cert_bag_type(bag) == NID_x509Certificate) + x509 = PKCS12_certbag2x509(bag); + break; + + case NID_safeContentsBag: + add_from_bags(pX509, pPkey, PKCS12_SAFEBAG_get0_safes(bag), pw); + break; + } + + if (pkey) + { + if (!*pPkey) + *pPkey = pkey; + else + EVP_PKEY_free(pkey); + } + + if (x509) + { + if (!*pX509) + *pX509 = x509; + else + X509_free(x509); + } +} + +static void add_from_bags(X509 **pX509, EVP_PKEY **pPkey, const STACK_OF(PKCS12_SAFEBAG) *bags, const char *pw) +{ + int i; + + for (i = 0; i < sk_PKCS12_SAFEBAG_num(bags); i++) + add_from_bag(pX509, pPkey, sk_PKCS12_SAFEBAG_value(bags, i), pw); +} + +static pdf_pkcs7_signer *keep_signer(pdf_pkcs7_signer *signer) +{ + openssl_signer *osigner = (openssl_signer *)signer; + return fz_keep_imp(osigner->ctx, osigner, &osigner->refs); +} + 
+static void drop_signer(pdf_pkcs7_signer *signer) +{ + openssl_signer *osigner = (openssl_signer *)signer; + if (fz_drop_imp(osigner->ctx, osigner, &osigner->refs)) + { + X509_free(osigner->x509); + EVP_PKEY_free(osigner->pkey); + fz_free(osigner->ctx, osigner); + } +} + +static pdf_pkcs7_designated_name *x509_designated_name(fz_context *ctx, X509 *x509) +{ + pdf_pkcs7_designated_name_openssl *dn = fz_malloc_struct(ctx, pdf_pkcs7_designated_name_openssl); + char *p; + + X509_NAME_oneline(X509_get_subject_name(x509), dn->buf, sizeof(dn->buf)); + p = strstr(dn->buf, "/CN="); + if (p) dn->base.cn = p+4; + p = strstr(dn->buf, "/O="); + if (p) dn->base.o = p+3; + p = strstr(dn->buf, "/OU="); + if (p) dn->base.ou = p+4; + p = strstr(dn->buf, "/emailAddress="); + if (p) dn->base.email = p+14; + p = strstr(dn->buf, "/C="); + if (p) dn->base.c = p+3; + + for (p = dn->buf; *p; p++) + if (*p == '/') + *p = 0; + + return (pdf_pkcs7_designated_name *)dn; +} + +static pdf_pkcs7_designated_name *signer_designated_name(pdf_pkcs7_signer *signer) +{ + openssl_signer *osigner = (openssl_signer *)signer; + return x509_designated_name(osigner->ctx, osigner->x509); +} + +static int signer_create_digest(pdf_pkcs7_signer *signer, fz_stream *in, unsigned char *digest, int *digest_len) +{ + openssl_signer *osigner = (openssl_signer *)signer; + fz_context *ctx = osigner->ctx; + int res = 0; + BIO *bdata = NULL; + BIO *bp7in = NULL; + BIO *bp7 = NULL; + PKCS7 *p7 = NULL; + PKCS7_SIGNER_INFO *si; + + unsigned char *p7_ptr; + int p7_len; + + if (in != NULL) + { + bdata = BIO_new_stream(ctx, in); + if (bdata == NULL) + goto exit; + } + + p7 = PKCS7_new(); + if (p7 == NULL) + goto exit; + + PKCS7_set_type(p7, NID_pkcs7_signed); + si = PKCS7_add_signature(p7, osigner->x509, osigner->pkey, EVP_sha1()); + if (si == NULL) + goto exit; + + PKCS7_add_signed_attribute(si, NID_pkcs9_contentType, V_ASN1_OBJECT, OBJ_nid2obj(NID_pkcs7_data)); + PKCS7_add_certificate(p7, osigner->x509); + + 
PKCS7_content_new(p7, NID_pkcs7_data); + PKCS7_set_detached(p7, 1); + + bp7in = PKCS7_dataInit(p7, NULL); + if (bp7in == NULL) + goto exit; + + while(bdata) /* bdata knowingly not changed in the loop */ + { + char buf[4096]; + int n = BIO_read(bdata, buf, sizeof(buf)); + if (n <= 0) + break; + BIO_write(bp7in, buf, n); + } + + if (!PKCS7_dataFinal(p7, bp7in)) + goto exit; + + BIO_free(bdata); + bdata = NULL; + + bp7 = BIO_new(BIO_s_mem()); + if (bp7 == NULL || !i2d_PKCS7_bio(bp7, p7)) + goto exit; + + p7_len = BIO_get_mem_data(bp7, &p7_ptr); + if (digest && p7_len > *digest_len) + goto exit; + + if (digest) + memcpy(digest, p7_ptr, p7_len); + + *digest_len = p7_len; + res = 1; + +exit: + PKCS7_free(p7); + BIO_free(bdata); + BIO_free(bp7in); + BIO_free(bp7); + return res; +} + +static int max_digest_size(pdf_pkcs7_signer *signer) +{ + /* Perform a test digest generation to find the required size. Size + * is assumed independent of data being hashed */ + int digest_len = 0; + + signer_create_digest(signer, NULL, NULL, &digest_len); + + return digest_len; +} + +pdf_pkcs7_signer *pkcs7_openssl_read_pfx(fz_context *ctx, const char *pfile, const char *pw) +{ + BIO *pfxbio = NULL; + PKCS12 *p12 = NULL; + STACK_OF(PKCS7) *asafes; + openssl_signer *signer = NULL; + int i; + + fz_var(pfxbio); + fz_var(p12); + fz_var(signer); + fz_try(ctx) + { + signer = fz_malloc_struct(ctx, openssl_signer); + signer->base.keep = keep_signer; + signer->base.drop = drop_signer; + signer->base.designated_name = signer_designated_name; + signer->base.drop_designated_name = signer_drop_designated_name; + signer->base.max_digest_size = max_digest_size; + signer->base.create_digest = signer_create_digest; + signer->ctx = ctx; + signer->refs = 1; + + OpenSSL_add_all_algorithms(); + + EVP_add_digest(EVP_md5()); + EVP_add_digest(EVP_sha1()); + + ERR_load_crypto_strings(); + + ERR_clear_error(); + + pfxbio = BIO_new_file(pfile, "rb"); + if (pfxbio == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "Can't open 
pfx file: %s", pfile); + + p12 = d2i_PKCS12_bio(pfxbio, NULL); + if (p12 == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid pfx file: %s", pfile); + + asafes = PKCS12_unpack_authsafes(p12); + if (asafes == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "Invalid pfx file: %s", pfile); + + /* Nothing in this for loop can fz_throw */ + for (i = 0; i < sk_PKCS7_num(asafes); i++) + { + PKCS7 *p7; + STACK_OF(PKCS12_SAFEBAG) *bags; + int bagnid; + + p7 = sk_PKCS7_value(asafes, i); + bagnid = OBJ_obj2nid(p7->type); + switch (bagnid) + { + case NID_pkcs7_data: + bags = PKCS12_unpack_p7data(p7); + break; + case NID_pkcs7_encrypted: + bags = PKCS12_unpack_p7encdata(p7, pw, (int)strlen(pw)); + break; + default: + continue; + } + + if (bags) + { + add_from_bags(&signer->x509, &signer->pkey, bags, pw); + sk_PKCS12_SAFEBAG_pop_free(bags, PKCS12_SAFEBAG_free); + } + } + sk_PKCS7_pop_free (asafes, PKCS7_free); + + if (signer->pkey == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to obtain public key"); + + if (signer->x509 == NULL) + fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to obtain certificate"); + } + fz_always(ctx) + { + BIO_free(pfxbio); + PKCS12_free(p12); + } + fz_catch(ctx) + { + drop_signer(&signer->base); + fz_rethrow(ctx); + } + + return &signer->base; +} + +pdf_pkcs7_designated_name *pkcs7_openssl_designated_name(fz_context *ctx, char *sig, int sig_len) +{ + pdf_pkcs7_designated_name *name = NULL; + PKCS7 *pk7sig = NULL; + BIO *bsig = NULL; + STACK_OF(PKCS7_SIGNER_INFO) *sk = NULL; + X509 *x509 = NULL; + + bsig = BIO_new_mem_buf(sig, sig_len); + pk7sig = d2i_PKCS7_bio(bsig, NULL); + if (pk7sig == NULL) + goto exit; + + sk = PKCS7_get_signer_info(pk7sig); + if (sk == NULL || sk_PKCS7_SIGNER_INFO_num(sk) <= 0) + goto exit; + + x509 = pk7_signer(pk7_certs(pk7sig), sk_PKCS7_SIGNER_INFO_value(sk, 0)); + + name = x509_designated_name(ctx, x509); + +exit: + BIO_free(bsig); + PKCS7_free(pk7sig); + + return name; +} + +#endif diff -Nru k2pdfopt-2.42+ds/mupdf_mod/stext-device.c 
k2pdfopt-2.51+ds/mupdf_mod/stext-device.c --- k2pdfopt-2.42+ds/mupdf_mod/stext-device.c 2017-02-25 06:40:19.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/stext-device.c 2018-11-21 00:35:19.000000000 +0000 @@ -1,589 +1,229 @@ #include "mupdf/fitz.h" +#include "mupdf/ucdn.h" -/* Extract text into an unsorted span soup. */ +#include +#include +#include + +/* Extract text into blocks and lines. */ #define LINE_DIST 0.9f #define SPACE_DIST 0.15f #define SPACE_MAX_DIST 0.8f #define PARAGRAPH_DIST 0.5f -#undef DEBUG_SPANS -#undef DEBUG_INTERNALS -#undef DEBUG_LINE_HEIGHTS -#undef DEBUG_MASKS -#undef DEBUG_ALIGN -#undef DEBUG_INDENTS - -#include -#include FT_FREETYPE_H -#include FT_ADVANCES_H - typedef struct fz_stext_device_s fz_stext_device; -typedef struct span_soup_s span_soup; - struct fz_stext_device_s { fz_device super; - fz_stext_sheet *sheet; fz_stext_page *page; - span_soup *spans; - fz_stext_span *cur_span; + fz_point pen, start; + fz_matrix trm; + int new_obj; + int curdir; int lastchar; int flags; }; const char *fz_stext_options_usage = - "Structured text output options:\n" - "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" - "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" + "Text output options:\n" + "\tpreserve-ligatures: do not expand ligatures into constituent characters\n" + "\tpreserve-whitespace: do not convert all whitespace into space characters\n" + "\tpreserve-images: keep images in output\n" "\n"; -static fz_rect * -add_point_to_rect(fz_rect *a, const fz_point *p) -{ - if (p->x < a->x0) - a->x0 = p->x; - if (p->x > a->x1) - a->x1 = p->x; - if (p->y < a->y0) - a->y0 = p->y; - if (p->y > a->y1) - a->y1 = p->y; - return a; -} - -fz_rect * -fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int i) +fz_stext_page * +fz_new_stext_page(fz_context *ctx, fz_rect mediabox) { - fz_point a, d; - const fz_point *max; - fz_stext_char *ch; - - if (!span || i >= span->len) - { - 
*bbox = fz_empty_rect; - return bbox; - } - ch = &span->text[i]; - if (i == span->len-1) - max = &span->max; - else - max = &span->text[i+1].p; - if (span->wmode == 0) - { - a.x = 0; - a.y = span->ascender_max; - d.x = 0; - d.y = span->descender_min; + fz_pool *pool = fz_new_pool(ctx); + fz_stext_page *page = NULL; + fz_try(ctx) + { + page = fz_pool_alloc(ctx, pool, sizeof(*page)); + page->pool = pool; + page->mediabox = mediabox; + page->first_block = NULL; + page->last_block = NULL; } - else + fz_catch(ctx) { - a.x = span->ascender_max; - a.y = 0; - d.x = span->descender_min; - d.y = 0; + fz_drop_pool(ctx, pool); + fz_rethrow(ctx); } - fz_transform_vector(&a, &span->transform); - fz_transform_vector(&d, &span->transform); - bbox->x0 = bbox->x1 = ch->p.x + a.x; - bbox->y0 = bbox->y1 = ch->p.y + a.y; - a.x += max->x; - a.y += max->y; - add_point_to_rect(bbox, &a); - a.x = ch->p.x + d.x; - a.y = ch->p.y + d.y; - add_point_to_rect(bbox, &a); - a.x = max->x + d.x; - a.y = max->y + d.y; - add_point_to_rect(bbox, &a); - return bbox; + return page; } -static void -add_bbox_to_span(fz_stext_span *span) +void +fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) { - fz_point a, d; - fz_rect *bbox = &span->bbox; - - if (!span) - return; - if (span->wmode == 0) + if (page) { - a.x = 0; - a.y = span->ascender_max; - d.x = 0; - d.y = span->descender_min; + fz_stext_block *block; + for (block = page->first_block; block; block = block->next) + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_drop_image(ctx, block->u.i.image); + fz_drop_pool(ctx, page->pool); } - else - { - a.x = span->ascender_max; - a.y = 0; - d.x = span->descender_min; - d.y = 0; - } - fz_transform_vector(&a, &span->transform); - fz_transform_vector(&d, &span->transform); - bbox->x0 = bbox->x1 = span->min.x + a.x; - bbox->y0 = bbox->y1 = span->min.y + a.y; - a.x += span->max.x; - a.y += span->max.y; - add_point_to_rect(bbox, &a); - a.x = span->min.x + d.x; - a.y = span->min.y + d.y; - add_point_to_rect(bbox, 
&a); - a.x = span->max.x + d.x; - a.y = span->max.y + d.y; - add_point_to_rect(bbox, &a); } -struct span_soup_s +static fz_stext_block * +add_block_to_page(fz_context *ctx, fz_stext_page *page) { - int len, cap; - fz_stext_span **spans; -}; - -static span_soup * -new_span_soup(fz_context *ctx) -{ - span_soup *soup = fz_malloc_struct(ctx, span_soup); - soup->len = 0; - soup->cap = 0; - soup->spans = NULL; - return soup; + fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); + block->prev = page->last_block; + if (!page->first_block) + page->first_block = page->last_block = block; + else + { + page->last_block->next = block; + page->last_block = block; + } + return block; } -static void -free_span_soup(fz_context *ctx, span_soup *soup) +static fz_stext_block * +add_text_block_to_page(fz_context *ctx, fz_stext_page *page) { - int i; - - if (soup == NULL) - return; - for (i = 0; i < soup->len; i++) - { - fz_free(ctx, soup->spans[i]); - } - fz_free(ctx, soup->spans); - fz_free(ctx, soup); + fz_stext_block *block = add_block_to_page(ctx, page); + block->type = FZ_STEXT_BLOCK_TEXT; + return block; } -static void -add_span_to_soup(fz_context *ctx, span_soup *soup, fz_stext_span *span) +static fz_stext_block * +add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image) { - if (span == NULL) - return; - if (soup->len == soup->cap) - { - int newcap = (soup->cap ? 
soup->cap * 2 : 16); - soup->spans = fz_resize_array(ctx, soup->spans, newcap, sizeof(*soup->spans)); - soup->cap = newcap; - } - add_bbox_to_span(span); - soup->spans[soup->len++] = span; + fz_stext_block *block = add_block_to_page(ctx, page); + block->type = FZ_STEXT_BLOCK_IMAGE; + block->u.i.transform = ctm; + block->u.i.image = fz_keep_image(ctx, image); + block->bbox = fz_transform_rect(fz_unit_rect, ctm); + return block; } static fz_stext_line * -push_span(fz_context *ctx, fz_stext_device *tdev, fz_stext_span *span, int new_line, float distance) +add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode) { - fz_stext_line *line; - fz_stext_block *block; - fz_stext_page *page = tdev->page; - int prev_not_text = 0; - - if (page->len == 0 || page->blocks[page->len-1].type != FZ_PAGE_BLOCK_TEXT) - prev_not_text = 1; - - if (new_line || prev_not_text) - { - float size = fz_matrix_expansion(&span->transform); - /* So, a new line. Part of the same block or not? */ - if (distance == 0 || distance > size * 1.5 || distance < -size * PARAGRAPH_DIST || page->len == 0 || prev_not_text) - { - /* New block */ - if (page->len == page->cap) - { - int newcap = (page->cap ? page->cap*2 : 4); - page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); - page->cap = newcap; - } - block = fz_malloc_struct(ctx, fz_stext_block); - page->blocks[page->len].type = FZ_PAGE_BLOCK_TEXT; - page->blocks[page->len].u.text = block; - block->cap = 0; - block->len = 0; - block->lines = 0; - block->bbox = fz_empty_rect; - page->len++; - distance = 0; - } - - /* New line */ - block = page->blocks[page->len-1].u.text; - if (block->len == block->cap) - { - int newcap = (block->cap ? 
block->cap*2 : 4); - block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines)); - block->cap = newcap; - } - block->lines[block->len].first_span = NULL; - block->lines[block->len].last_span = NULL; - block->lines[block->len].distance = distance; - block->lines[block->len].bbox = fz_empty_rect; - block->len++; - } - - /* Find last line and append to it */ - block = page->blocks[page->len-1].u.text; - line = &block->lines[block->len-1]; - - fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox); - fz_union_rect(&block->bbox, &span->bbox); - span->base_offset = (new_line ? 0 : distance); - - if (!line->first_span) - { - line->first_span = line->last_span = span; - span->next = NULL; - } + fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); + line->prev = block->u.t.last_line; + if (!block->u.t.first_line) + block->u.t.first_line = block->u.t.last_line = line; else { - line->last_span->next = span; - line->last_span = span; + block->u.t.last_line->next = line; + block->u.t.last_line = line; } - return line; -} - -#if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS) -static void -dump_span(fz_stext_span *s) -{ - int i; - for (i=0; i < s->len; i++) - { - printf("%c", s->text[i].c); - } -} -#endif + line->dir = *dir; + line->wmode = wmode; -#ifdef DEBUG_ALIGN -static void -dump_line(fz_stext_line *line) -{ - int i; - for (i=0; i < line->len; i++) - { - fz_stext_span *s = line->spans[i]; - if (s->spacing > 1) - printf(" "); - dump_span(s); - } - printf("\n"); + return line; } -#endif -static void -strain_soup(fz_context *ctx, fz_stext_device *tdev) +static fz_stext_char * +add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q) { - span_soup *soup = tdev->spans; - fz_stext_line *last_line = NULL; - fz_stext_span *last_span = NULL; - int span_num; - - if (soup == NULL) - return; + fz_stext_char *ch 
= fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); + fz_point a, d; - /* Really dumb implementation to match what we had before */ - for (span_num=0; span_num < soup->len; span_num++) + if (!line->first_char) + line->first_char = line->last_char = ch; + else { - fz_stext_span *span = soup->spans[span_num]; - int new_line = 1; - float distance = 0; - float spacing = 0; - soup->spans[span_num] = NULL; - if (last_span) - { - /* If we have a last_span, we must have a last_line */ - /* Do span and last_line share the same baseline? */ - fz_point p, q, perp_r; - float dot; - float size = fz_matrix_expansion(&span->transform); - -#ifdef DEBUG_SPANS - { - printf("Comparing: \""); - dump_span(last_span); - printf("\" and \""); - dump_span(span); - printf("\"\n"); - } -#endif - - p.x = last_line->first_span->max.x - last_line->first_span->min.x; - p.y = last_line->first_span->max.y - last_line->first_span->min.y; - fz_normalize_vector(&p); - q.x = span->max.x - span->min.x; - q.y = span->max.y - span->min.y; - fz_normalize_vector(&q); -#ifdef DEBUG_SPANS - printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y); - printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y); -#endif - perp_r.y = last_line->first_span->min.x - span->min.x; - perp_r.x = -(last_line->first_span->min.y - span->min.y); - /* Check if p and q are parallel. If so, then this - * line is parallel with the last one. */ - dot = p.x * q.x + p.y * q.y; - if (fabsf(dot) > 0.9995) - { - /* If we take the dot product of normalised(p) and - * perp(r), we get the perpendicular distance from - * one line to the next (assuming they are parallel). */ - distance = p.x * perp_r.x + p.y * perp_r.y; - /* We allow 'small' distances of baseline changes - * to cope with super/subscript. FIXME: We should - * gather subscript/superscript information here. 
*/ - new_line = (fabsf(distance) > size * LINE_DIST); - } - else - { - new_line = 1; - distance = 0; - } - if (!new_line) - { - fz_point delta; - - delta.x = span->min.x - last_span->max.x; - delta.y = span->min.y - last_span->max.y; - - spacing = (p.x * delta.x + p.y * delta.y); - spacing = fabsf(spacing); - /* Only allow changes in baseline (subscript/superscript etc) - * when the spacing is small. */ - if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f) - { - new_line = 1; - distance = 0; - spacing = 0; - } - else - { - spacing /= size * SPACE_DIST; - /* Apply the same logic here as when we're adding chars to build spans. */ - if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) - spacing = 1; - } - } -#ifdef DEBUG_SPANS - printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing); -#endif - } - span->spacing = spacing; - last_line = push_span(ctx, tdev, span, new_line, distance); - last_span = span; + line->last_char->next = ch; + line->last_char = ch; } -} -fz_stext_sheet * -fz_new_stext_sheet(fz_context *ctx) -{ - fz_stext_sheet *sheet = fz_malloc(ctx, sizeof *sheet); - sheet->maxid = 0; - sheet->style = NULL; - return sheet; -} - -void -fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet) -{ - fz_stext_style *style; + ch->c = c; + ch->origin = *p; + ch->size = size; + ch->font = font; /* TODO: keep and drop */ - if (sheet == NULL) - return; - - style = sheet->style; - while (style) + if (line->wmode == 0) { - fz_stext_style *next = style->next; - fz_drop_font(ctx, style->font); - fz_free(ctx, style); - style = next; + a.x = 0; + d.x = 0; + a.y = fz_font_ascender(ctx, font); + d.y = fz_font_descender(ctx, font); } - fz_free(ctx, sheet); -} - -static fz_stext_style * -fz_lookup_stext_style_imp(fz_context *ctx, fz_stext_sheet *sheet, - float size, fz_font *font, int wmode, int script) -{ - fz_stext_style *style; - - for (style = sheet->style; style; style = style->next) + 
else { - if (style->font == font && - style->size == size && - style->wmode == wmode && - style->script == script) /* FIXME: others */ - { - return style; - } + fz_rect bbox = fz_font_bbox(ctx, font); + a.x = bbox.x1; + d.x = bbox.x0; + a.y = 0; + d.y = 0; } + a = fz_transform_vector(a, trm); + d = fz_transform_vector(d, trm); - /* Better make a new one and add it to our list */ - style = fz_malloc(ctx, sizeof *style); - style->id = sheet->maxid++; - style->font = fz_keep_font(ctx, font); - style->size = size; - style->wmode = wmode; - style->script = script; - style->next = sheet->style; - sheet->style = style; - return style; -} - -static fz_stext_style * -fz_lookup_stext_style(fz_context *ctx, fz_stext_sheet *sheet, fz_text_span *span, const fz_matrix *ctm, - fz_colorspace *colorspace, const float *color, float alpha, const fz_stroke_state *stroke) -{ - float size = 1.0f; - fz_font *font = span ? span->font : NULL; - int wmode = span ? span->wmode : 0; - if (ctm && span) - { - fz_matrix tm = span->trm; - fz_matrix trm; - tm.e = 0; - tm.f = 0; - fz_concat(&trm, &tm, ctm); - size = fz_matrix_expansion(&trm); - } - return fz_lookup_stext_style_imp(ctx, sheet, size, font, wmode, 0); -} + ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y); + ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y); + ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y); + ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y); -fz_stext_page * -fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox) -{ - fz_stext_page *page = fz_malloc(ctx, sizeof(*page)); - page->mediabox = *mediabox; - page->len = 0; - page->cap = 0; - page->blocks = NULL; - page->next = NULL; - return page; + return ch; } -static void -fz_drop_stext_line_contents(fz_context *ctx, fz_stext_line *line) +static int +direction_from_bidi_class(int bidiclass, int curdir) { - fz_stext_span *span, *next; - for (span = line->first_span; span; span=next) + switch (bidiclass) { - next = span->next; - fz_free(ctx, span->text); - 
fz_free(ctx, span); - } -} + /* strong */ + case UCDN_BIDI_CLASS_L: return 1; + case UCDN_BIDI_CLASS_R: return -1; + case UCDN_BIDI_CLASS_AL: return -1; -static void -fz_drop_stext_block(fz_context *ctx, fz_stext_block *block) -{ - fz_stext_line *line; - if (block == NULL) - return; - for (line = block->lines; line < block->lines + block->len; line++) - fz_drop_stext_line_contents(ctx, line); - fz_free(ctx, block->lines); - fz_free(ctx, block); -} + /* weak */ + case UCDN_BIDI_CLASS_EN: + case UCDN_BIDI_CLASS_ES: + case UCDN_BIDI_CLASS_ET: + case UCDN_BIDI_CLASS_AN: + case UCDN_BIDI_CLASS_CS: + case UCDN_BIDI_CLASS_NSM: + case UCDN_BIDI_CLASS_BN: + return curdir; -static void -fz_drop_image_block(fz_context *ctx, fz_image_block *block) -{ - if (block == NULL) - return; - fz_drop_image(ctx, block->image); - fz_drop_colorspace(ctx, block->cspace); - fz_free(ctx, block); -} + /* neutral */ + case UCDN_BIDI_CLASS_B: + case UCDN_BIDI_CLASS_S: + case UCDN_BIDI_CLASS_WS: + case UCDN_BIDI_CLASS_ON: + return curdir; -void -fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) -{ - fz_page_block *block; - if (page == NULL) - return; - for (block = page->blocks; block < page->blocks + page->len; block++) - { - switch (block->type) - { - case FZ_PAGE_BLOCK_TEXT: - fz_drop_stext_block(ctx, block->u.text); - break; - case FZ_PAGE_BLOCK_IMAGE: - fz_drop_image_block(ctx, block->u.image); - break; - } + /* embedding, override, pop ... 
we don't support them */ + default: + return 0; } - fz_free(ctx, page->blocks); - fz_free(ctx, page); } -static fz_stext_span * -fz_new_stext_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm) +static float +vec_dot(const fz_point *a, const fz_point *b) { - fz_stext_span *span = fz_malloc_struct(ctx, fz_stext_span); - span->ascender_max = 0; - span->descender_min = 0; - span->cap = 0; - span->len = 0; - span->min = *p; - span->max = *p; - span->wmode = wmode; - span->transform.a = trm->a; - span->transform.b = trm->b; - span->transform.c = trm->c; - span->transform.d = trm->d; - span->transform.e = 0; - span->transform.f = 0; - span->text = NULL; - span->next = NULL; - return span; + return a->x * b->x + a->y * b->y; } static void -add_char_to_span(fz_context *ctx, fz_stext_span *span, int c, fz_point *p, fz_point *max, fz_stext_style *style) +fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) { - if (span->len == span->cap) - { - int newcap = (span->cap ? 
span->cap * 2 : 16); - span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_stext_char)); - span->cap = newcap; - span->bbox = fz_empty_rect; - } - span->max = *max; - if (style->ascender > span->ascender_max) - span->ascender_max = style->ascender; - if (style->descender < span->descender_min) - span->descender_min = style->descender; - span->text[span->len].c = c; - span->text[span->len].p = *p; - span->text[span->len].style = style; - span->len++; -} + fz_stext_page *page = dev->page; + fz_stext_block *cur_block; + fz_stext_line *cur_line; -static void -fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) -{ - int can_append = 1; + int new_para = 0; + int new_line = 1; int add_space = 0; - fz_point dir, ndir, p, q, r; + fz_point dir, ndir, p, q; float size; fz_point delta; float spacing = 0; float base_offset = 0; + int rtl = 0; + dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir); + + /* dir = direction vector for motion. ndir = normalised(dir) */ if (wmode == 0) { dir.x = 1; @@ -594,20 +234,18 @@ dir.x = 0; dir.y = -1; } - fz_transform_vector(&dir, trm); - ndir = dir; - fz_normalize_vector(&ndir); - /* dir = direction vector for motion. ndir = normalised(dir) */ + dir = fz_transform_vector(dir, trm); + ndir = fz_normalize_vector(dir); size = fz_matrix_expansion(trm); /* We need to identify where glyphs 'start' (p) and 'stop' (q). - * Each glyph holds it's 'start' position, and the next glyph in the - * span (or span->max if there is no next glyph) holds it's 'end' + * Each glyph holds its 'start' position, and the next glyph in the + * span (or span->max if there is no next glyph) holds its 'end' * position. * * For both horizontal and vertical motion, trm->{e,f} gives the - * bottom left corner of the glyph. + * origin (usually the bottom left) of the glyph. * * In horizontal mode: * + p is bottom left. 
@@ -618,50 +256,51 @@ */ if (wmode == 0) { - p.x = trm->e; - p.y = trm->f; - q.x = trm->e + adv * dir.x; - q.y = trm->f + adv * dir.y; + p.x = trm.e; + p.y = trm.f; + q.x = trm.e + adv * dir.x; + q.y = trm.f + adv * dir.y; } else { - p.x = trm->e - adv * dir.x; - p.y = trm->f - adv * dir.y; - q.x = trm->e; - q.y = trm->f; - } - - if (glyph < 0) - { - /* Don't reset 'pen' to start of no-glyph characters in cluster */ - if (dev->cur_span) - q = dev->cur_span->max; - goto no_glyph; - } - - if (dev->cur_span == NULL || - trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b || - trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d || - dev->cur_span->wmode != wmode) - { - /* If the matrix has changed, or the wmode is different (or - * if we don't have a span at all), then we can't append. */ -#ifdef DEBUG_SPANS - printf("Transform/WMode changed\n"); -#endif - can_append = 0; + p.x = trm.e - adv * dir.x; + p.y = trm.f - adv * dir.y; + q.x = trm.e; + q.y = trm.f; + } + + /* Find current position to enter new text. */ + cur_block = page->last_block; + if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) + cur_block = NULL; + cur_line = cur_block ? cur_block->u.t.last_line : NULL; + + if (cur_line && glyph < 0) + { + /* Don't advance pen or break lines for no-glyph characters in a cluster */ + add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen); + dev->lastchar = c; + return; + } + + if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f) + { + /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), + * then we can't append to the current block/line. */ + new_para = 1; + new_line = 1; } else { - delta.x = q.x - dev->cur_span->max.x; - delta.y = q.y - dev->cur_span->max.y; + /* Detect fake bold where text is printed twice in the same place. 
*/ + delta.x = fabsf(q.x - dev->pen.x); + delta.y = fabsf(q.y - dev->pen.y); if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) return; - /* Calculate how far we've moved since the end of the current - * span. */ - delta.x = p.x - dev->cur_span->max.x; - delta.y = p.y - dev->cur_span->max.y; + /* Calculate how far we've moved since the last character. */ + delta.x = p.x - dev->pen.x; + delta.y = p.y - dev->pen.y; /* The transform has not changed, so we know we're in the same * direction. Calculate 2 distances; how far off the previous @@ -670,102 +309,129 @@ spacing = ndir.x * delta.x + ndir.y * delta.y; base_offset = -ndir.y * delta.x + ndir.x * delta.y; - spacing /= size * SPACE_DIST; - if (fabsf(base_offset) < size * 0.1) + /* Only a small amount off the baseline - we'll take this */ + if (fabsf(base_offset) < size * 0.8f) { - /* Only a small amount off the baseline - we'll take this */ - if (fabsf(spacing) < 1.0) + /* LTR or neutral character */ + if (dev->curdir >= 0) { - /* Motion is in line, and small. */ - } - else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) - { - /* Motion is in line, but large enough - * to warrant us adding a space */ - if (dev->lastchar != ' ' && wmode == 0) - add_space = 1; + if (fabsf(spacing) < size * SPACE_DIST) + { + /* Motion is in line, and small. */ + new_line = 0; + } + else if (spacing >= size * SPACE_DIST && spacing < size * SPACE_MAX_DIST) + { + /* Motion is in line, but large enough to warrant us adding a space. 
*/ + if (dev->lastchar != ' ' && wmode == 0) + add_space = 1; + new_line = 0; + } + else + { + /* Motion is in line, but large enough to warrant splitting to a new line */ + new_line = 1; + } } + + /* RTL character -- disable space character and column detection heuristics */ else { - /* Motion is in line, but too large - split to a new span */ - can_append = 0; + new_line = 0; + if (spacing > size * SPACE_DIST || spacing < 0) + rtl = 0; /* backward (or big jump to 'right' side) means logical order */ + else + rtl = 1; /* visual order, we need to reverse in a post process pass */ } } + + /* Enough for a new line, but not enough for a new paragraph */ + else if (fabsf(base_offset) < size * 1.3f) + { + /* Check indent to spot text-indent style paragraphs */ + if (wmode == 0 && cur_line && dev->new_obj) + if (fabsf(p.x - dev->start.x) > size * 0.5f) + new_para = 1; + new_line = 1; + } + + /* Way off the baseline - open a new paragraph */ else { - can_append = 0; -#ifdef DEBUG_SPANS - spacing = 0; -#endif + new_para = 1; + new_line = 1; } } -#ifdef DEBUG_SPANS - printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset); -#endif - - /* Start a new span */ - if (!can_append) - { - add_span_to_soup(ctx, dev->spans, dev->cur_span); - dev->cur_span = NULL; - dev->cur_span = fz_new_stext_span(ctx, &p, wmode, trm); - dev->cur_span->spacing = 0; + /* Start a new block (but only at the beginning of a text object) */ + if (new_para || !cur_block) + { + cur_block = add_text_block_to_page(ctx, page); + cur_line = cur_block->u.t.last_line; } - /* Add synthetic space */ - if (add_space) + /* Start a new line */ + if (new_line || !cur_line) { - /* We know we always have a cur_span here */ - r = dev->cur_span->max; - add_char_to_span(ctx, dev->cur_span, ' ', &r, &p, style); + cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode); + dev->start = p; } -no_glyph: - add_char_to_span(ctx, 
dev->cur_span, c, &p, &q, style); + /* Add synthetic space */ + if (add_space) + add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p); + + add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q); dev->lastchar = c; + dev->pen = q; + + dev->new_obj = 0; + dev->trm = trm; } static void -fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) +fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) { /* ignore when one unicode character maps to multiple glyphs */ if (c == -1) return; if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) + { switch (c) { case 0xFB00: /* ff */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); return; case 0xFB01: /* fi */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); return; case 0xFB02: /* fl */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); return; case 0xFB03: /* ffi */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, 
wmode); + fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); return; case 0xFB04: /* ffl */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); return; case 0xFB05: /* long st */ case 0xFB06: /* st */ - fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); return; } + } if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) + { switch (c) { case 0x0009: /* tab */ @@ -789,294 +455,196 @@ case 0x3000: /* ideographic space */ c = ' '; } + } - fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); } static void -fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm, fz_stext_style *style) +fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm) { fz_font *font = span->font; - FT_Face face = fz_font_ft_face(ctx, font); - fz_buffer **t3procs = fz_font_t3_procs(ctx, font); - fz_rect *bbox = fz_font_bbox(ctx, font); fz_matrix tm = span->trm; fz_matrix trm; float adv; - float ascender = 1; - float descender = 0; - int i, err; + int i; if (span->len == 0) return; - if (dev->spans == NULL) - dev->spans = new_span_soup(ctx); - - if (style->wmode == 0) - { - if (face) - { - fz_lock(ctx, FZ_LOCK_FREETYPE); - err = FT_Set_Char_Size(face, 64, 64, 72, 72); - if (err) - fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); - ascender = 
(float)face->ascender / face->units_per_EM; - descender = (float)face->descender / face->units_per_EM; - fz_unlock(ctx, FZ_LOCK_FREETYPE); - } - else if (t3procs && !fz_is_empty_rect(bbox)) - { - ascender = bbox->y1; - descender = bbox->y0; - } - } - else - { - ascender = bbox->x1; - descender = bbox->x0; - } - style->ascender = ascender; - style->descender = descender; - tm.e = 0; tm.f = 0; - fz_concat(&trm, &tm, ctm); + trm = fz_concat(tm, ctm); for (i = 0; i < span->len; i++) { /* Calculate new pen location and delta */ tm.e = span->items[i].x; tm.f = span->items[i].y; - fz_concat(&trm, &tm, ctm); + trm = fz_concat(tm, ctm); /* Calculate bounding box and new pen position based on font metrics */ if (span->items[i].gid >= 0) - adv = fz_advance_glyph(ctx, font, span->items[i].gid, style->wmode); + adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); else adv = 0; - fz_add_stext_char(ctx, dev, style, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); + fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, trm, adv, span->wmode); } } static void -fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, - fz_colorspace *colorspace, const float *color, float alpha) +fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, + fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void -fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, - fz_colorspace *colorspace, const float 
*color, float alpha) +fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, + fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, stroke); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void -fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor) +fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void -fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor) +fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, stroke); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void -fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm) +fz_stext_ignore_text(fz_context 
*ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } +/* Images and shadings */ + static void -fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, - fz_colorspace *cspace, const float *color, float alpha) +fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, const fz_color_params *color_params) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_page *page = tdev->page; - fz_image_block *block; - /* If the alpha is less than 50% then it's probably a watermark or - * effect or something. Skip it */ - if (alpha < 0.5) + /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ + if (alpha < 0.5f) return; - /* New block */ - if (page->len == page->cap) - { - int newcap = (page->cap ? 
page->cap*2 : 4); - page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); - page->cap = newcap; - } - block = fz_malloc_struct(ctx, fz_image_block); - page->blocks[page->len].type = FZ_PAGE_BLOCK_IMAGE; - page->blocks[page->len].u.image = block; - block->image = fz_keep_image(ctx, img); - block->cspace = fz_keep_colorspace(ctx, cspace); - if (cspace) - memcpy(block->colors, color, sizeof(block->colors[0])*fz_colorspace_n(ctx, cspace)); - block->mat = *ctm; - block->bbox.x0 = 0; - block->bbox.y0 = 0; - block->bbox.x1 = 1; - block->bbox.y1 = 1; - fz_transform_rect(&block->bbox, ctm); - page->len++; + add_image_block_to_page(ctx, tdev->page, ctm, img); } static void -fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha) +fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, + fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params) { - fz_stext_fill_image_mask(ctx, dev, img, ctm, NULL, NULL, alpha); + fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); } -static int -direction_from_bidi_class(int bidiclass, int curdir) +static fz_image * +fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, const fz_color_params *color_params, fz_rect scissor) { - switch (bidiclass) - { - /* strong */ - case UCDN_BIDI_CLASS_L: return 1; - case UCDN_BIDI_CLASS_R: return -1; - case UCDN_BIDI_CLASS_AL: return -1; + fz_matrix ctm = *in_out_ctm; + fz_pixmap *pix; + fz_image *img = NULL; + fz_rect bounds; + fz_irect bbox; - /* weak */ - case UCDN_BIDI_CLASS_EN: - case UCDN_BIDI_CLASS_ES: - case UCDN_BIDI_CLASS_ET: - case UCDN_BIDI_CLASS_AN: - case UCDN_BIDI_CLASS_CS: - case UCDN_BIDI_CLASS_NSM: - case UCDN_BIDI_CLASS_BN: - return curdir; - - /* neutral */ - case UCDN_BIDI_CLASS_B: - case UCDN_BIDI_CLASS_S: - case UCDN_BIDI_CLASS_WS: - case UCDN_BIDI_CLASS_ON: - return curdir; + bounds = fz_bound_shade(ctx, 
shade, ctm); + bounds = fz_intersect_rect(bounds, scissor); + bbox = fz_irect_from_rect(bounds); - /* embedding, override, pop ... we don't support them */ - default: - return 0; - } + pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); + fz_try(ctx) + { + if (shade->use_background) + fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); + else + fz_clear_pixmap(ctx, pix); + fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL); + img = fz_new_image_from_pixmap(ctx, pix, NULL); + } + fz_always(ctx) + fz_drop_pixmap(ctx, pix); + fz_catch(ctx) + fz_rethrow(ctx); + + in_out_ctm->a = pix->w; + in_out_ctm->b = 0; + in_out_ctm->c = 0; + in_out_ctm->d = pix->h; + in_out_ctm->e = pix->x; + in_out_ctm->f = pix->y; + return img; } static void -fz_bidi_reorder_run(fz_stext_span *span, int a, int b, int dir) -{ - if (a < b && dir == -1) - { - fz_stext_char c; - int m = a + (b - a) / 2; - while (a < m) - { - b--; - c = span->text[a]; - span->text[a] = span->text[b]; - span->text[b] = c; - a++; - } - } +fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, const fz_color_params *color_params) +{ + fz_matrix local_ctm = ctm; + fz_rect scissor = fz_device_current_scissor(ctx, dev); + fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); + fz_try(ctx) + fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); + fz_always(ctx) + fz_drop_image(ctx, image); + fz_catch(ctx) + fz_rethrow(ctx); } static void -fz_bidi_reorder_span(fz_stext_span *span) +fz_stext_close_device(fz_context *ctx, fz_device *dev) { - int a, b, dir, curdir; + fz_stext_device *tdev = (fz_stext_device*)dev; + fz_stext_page *page = tdev->page; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; - a = 0; - curdir = 1; - for (b = 0; b < span->len; b++) + for (block = page->first_block; block; block = block->next) { - dir 
= direction_from_bidi_class(ucdn_get_bidi_class(span->text[b].c), curdir); - if (dir != curdir) + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) { - fz_bidi_reorder_run(span, a, b, curdir); - curdir = dir; - a = b; + for (ch = line->first_char; ch; ch = ch->next) + line->bbox = fz_union_rect(line->bbox, fz_rect_from_quad(ch->quad)); + block->bbox = fz_union_rect(block->bbox, line->bbox); } } - fz_bidi_reorder_run(span, a, b, curdir); -} - -static void -fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page) -{ - fz_page_block *pageblock; - fz_stext_block *block; - fz_stext_line *line; - fz_stext_span *span; - - for (pageblock = page->blocks; pageblock < page->blocks + page->len; pageblock++) - if (pageblock->type == FZ_PAGE_BLOCK_TEXT) - for (block = pageblock->u.text, line = block->lines; line < block->lines + block->len; line++) - for (span = line->first_span; span; span = span->next) - fz_bidi_reorder_span(span); -} -static void -fz_stext_close_device(fz_context *ctx, fz_device *dev) -{ - fz_stext_device *tdev = (fz_stext_device*)dev; - - add_span_to_soup(ctx, tdev->spans, tdev->cur_span); - tdev->cur_span = NULL; - - strain_soup(ctx, tdev); - - /* TODO: smart sorting of blocks in reading order */ + /* TODO: smart sorting of blocks and lines in reading order */ /* TODO: unicode NFC normalization */ - - fz_bidi_reorder_stext_page(ctx, tdev->page); } static void fz_stext_drop_device(fz_context *ctx, fz_device *dev) { - fz_stext_device *tdev = (fz_stext_device*)dev; - free_span_soup(ctx, tdev->spans); - tdev->spans = NULL; } fz_stext_options * @@ -1090,16 +658,16 @@ opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes")) opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; + if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes")) + opts->flags |= FZ_STEXT_PRESERVE_IMAGES; return opts; } 
fz_device * -fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts) +fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) { - fz_stext_device *dev = fz_new_device(ctx, sizeof *dev); - - dev->super.hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE; + fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); dev->super.close_device = fz_stext_close_device; dev->super.drop_device = fz_stext_drop_device; @@ -1109,19 +677,26 @@ dev->super.clip_text = fz_stext_clip_text; dev->super.clip_stroke_text = fz_stext_clip_stroke_text; dev->super.ignore_text = fz_stext_ignore_text; - dev->super.fill_image = fz_stext_fill_image; - dev->super.fill_image_mask = fz_stext_fill_image_mask; - dev->sheet = sheet; + if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES)) + { + dev->super.hints |= FZ_MAINTAIN_CONTAINER_STACK; + dev->super.fill_shade = fz_stext_fill_shade; + dev->super.fill_image = fz_stext_fill_image; + dev->super.fill_image_mask = fz_stext_fill_image_mask; + } + dev->page = page; - dev->spans = NULL; - dev->cur_span = NULL; + dev->pen.x = 0; + dev->pen.y = 0; + dev->trm = fz_identity; dev->lastchar = ' '; - /* willus mod -- add dev->flags=0 to else branch */ - if (opts) + dev->curdir = 1; + /* willus mod -- seems like this should be here, but not sure. */ + if (opts) dev->flags = opts->flags; else dev->flags = 0; - return (fz_device*)dev; + return (fz_device*)dev; } diff -Nru k2pdfopt-2.42+ds/mupdf_mod/string.c k2pdfopt-2.51+ds/mupdf_mod/string.c --- k2pdfopt-2.42+ds/mupdf_mod/string.c 2017-02-25 05:47:33.000000000 +0000 +++ k2pdfopt-2.51+ds/mupdf_mod/string.c 2018-11-21 00:31:15.000000000 +0000 @@ -1,5 +1,11 @@ #include "mupdf/fitz.h" +#include +#include +#include +#include +#include + static inline int fz_tolower(int c) { @@ -8,6 +14,13 @@ return c; } +size_t +fz_strnlen(const char *s, size_t n) +{ + const char *p = memchr(s, 0, n); + return p ? 
p - s : n; +} + int fz_strcasecmp(const char *a, const char *b) { @@ -183,7 +196,6 @@ while (i > 0) path[n++] = num[--i]; fz_strlcpy(path + n, p, size - n); - } #define SEP(x) ((x)=='/' || (x) == 0) @@ -282,7 +294,7 @@ fz_chartorune(int *rune, const char *str) { int c, c1, c2, c3; - long l; + int l; /* * one character sequence @@ -357,7 +369,7 @@ fz_runetochar(char *str, int rune) { /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c = (unsigned long)rune; + unsigned int c = (unsigned int)rune; /* * one character sequence @@ -436,11 +448,10 @@ float fz_atof(const char *s) { -/* willus mod: #if-#else-#endif */ +/* willus mod: atof(s), #if-#else-#endif */ #if (!defined(__SSE__)) return(atof(s)); #else - float result; errno = 0; @@ -460,11 +471,11 @@ return atoi(s); } -fz_off_t fz_atoo(const char *s) +int64_t fz_atoi64(const char *s) { if (s == NULL) return 0; - return fz_atoo_imp(s); + return atoll(s); } int fz_is_page_range(fz_context *ctx, const char *s) @@ -513,3 +524,150 @@ return s; } + +/* memmem from musl */ + +#define MAX(a,b) ((a)>(b)?(a):(b)) + +#define BITOP(a,b,op) \ + ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a)))) + +static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint16_t nw = n[0]<<8 | n[1], hw = h[0]<<8 | h[1]; + for (h++, k--; k; k--, hw = hw<<8 | *++h) + if (hw == nw) return (char *)h-1; + return 0; +} + +static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8; + uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8; + for (h+=2, k-=2; k; k--, hw = (hw|*++h)<<8) + if (hw == nw) return (char *)h-2; + return 0; +} + +static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n) +{ + uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3]; + uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3]; + for (h+=3, k-=3; k; k--, hw = hw<<8 | *++h) + if (hw == nw) return 
(char *)h-3; + return 0; +} + +static char *twoway_memmem(const unsigned char *h, const unsigned char *z, const unsigned char *n, size_t l) +{ + size_t i, ip, jp, k, p, ms, p0, mem, mem0; + size_t byteset[32 / sizeof(size_t)] = { 0 }; + size_t shift[256]; + + /* Computing length of needle and fill shift table */ + for (i=0; i n[jp+k]) { + jp += k; + k = 1; + p = jp - ip; + } else { + ip = jp++; + k = p = 1; + } + } + ms = ip; + p0 = p; + + /* And with the opposite comparison */ + ip = -1; jp = 0; k = p = 1; + while (jp+k ms+1) ms = ip; + else p = p0; + + /* Periodic needle? */ + if (memcmp(n, n+p, ms+1)) { + mem0 = 0; + p = MAX(ms, l-ms-1) + 1; + } else mem0 = l-p; + mem = 0; + + /* Search loop */ + for (;;) { + /* If remainder of haystack is shorter than needle, done */ + if (z-h < l) return 0; + + /* Check last byte first; advance by shift on mismatch */ + if (BITOP(byteset, h[l-1], &)) { + k = l-shift[h[l-1]]; + if (k) { + if (mem0 && mem && k < p) k = l-p; + h += k; + mem = 0; + continue; + } + } else { + h += l; + mem = 0; + continue; + } + + /* Compare right half */ + for (k=MAX(ms+1,mem); kmem && n[k-1] == h[k-1]; k--); + if (k <= mem) return (char *)h; + h += p; + mem = mem0; + } +} + +void *fz_memmem(const void *h0, size_t k, const void *n0, size_t l) +{ + const unsigned char *h = h0, *n = n0; + + /* Return immediately on empty needle */ + if (!l) return (void *)h; + + /* Return immediately when needle is longer than haystack */ + if (k +#include FT_FREETYPE_H +#include FT_ADVANCES_H + +static inline int ishex(int a) +{ + return (a >= 'A' && a <= 'F') || + (a >= 'a' && a <= 'f') || + (a >= '0' && a <= '9'); +} + +static inline int unhex(int a) +{ + if (a >= 'A' && a <= 'F') return a - 'A' + 0xA; + if (a >= 'a' && a <= 'f') return a - 'a' + 0xA; + if (a >= '0' && a <= '9') return a - '0'; + return 0; +} + +int +xps_count_font_encodings(fz_context *ctx, fz_font *font) +{ + FT_Face face = fz_font_ft_face(ctx, font); + return face->num_charmaps; +} + +void 
+xps_identify_font_encoding(fz_context *ctx, fz_font *font, int idx, int *pid, int *eid) +{ + FT_Face face = fz_font_ft_face(ctx, font); + *pid = face->charmaps[idx]->platform_id; + *eid = face->charmaps[idx]->encoding_id; +} + +void +xps_select_font_encoding(fz_context *ctx, fz_font *font, int idx) +{ + FT_Face face = fz_font_ft_face(ctx, font); + FT_Set_Charmap(face, face->charmaps[idx]); +} + +int +xps_encode_font_char(fz_context *ctx, fz_font *font, int code) +{ + FT_Face face = fz_font_ft_face(ctx, font); + int gid = FT_Get_Char_Index(face, code); + if (gid == 0 && face->charmap && face->charmap->platform_id == 3 && face->charmap->encoding_id == 0) + gid = FT_Get_Char_Index(face, 0xF000 | code); + return gid; +} + +void +xps_measure_font_glyph(fz_context *ctx, xps_document *doc, fz_font *font, int gid, xps_glyph_metrics *mtx) +{ + int mask = FT_LOAD_NO_SCALE | FT_LOAD_IGNORE_TRANSFORM; + FT_Face face = fz_font_ft_face(ctx, font); + FT_Fixed hadv = 0, vadv = 0; + + fz_lock(ctx, FZ_LOCK_FREETYPE); + FT_Get_Advance(face, gid, mask, &hadv); + FT_Get_Advance(face, gid, mask | FT_LOAD_VERTICAL_LAYOUT, &vadv); + fz_unlock(ctx, FZ_LOCK_FREETYPE); + + mtx->hadv = (float) hadv / face->units_per_EM; + mtx->vadv = (float) vadv / face->units_per_EM; + mtx->vorg = (float) face->ascender / face->units_per_EM; +} + +static fz_font * +xps_lookup_font_imp(fz_context *ctx, xps_document *doc, char *name) +{ + xps_font_cache *cache; + for (cache = doc->font_table; cache; cache = cache->next) + if (!xps_strcasecmp(cache->name, name)) + return fz_keep_font(ctx, cache->font); + return NULL; +} + +static void +xps_insert_font(fz_context *ctx, xps_document *doc, char *name, fz_font *font) +{ + xps_font_cache *cache = fz_malloc_struct(ctx, xps_font_cache); + cache->name = fz_strdup(ctx, name); + cache->font = fz_keep_font(ctx, font); + cache->next = doc->font_table; + doc->font_table = cache; +} + +/* + * Some fonts in XPS are obfuscated by XOR:ing the first 32 bytes of the + * data 
with the GUID in the fontname. + */ +static void +xps_deobfuscate_font_resource(fz_context *ctx, xps_document *doc, xps_part *part) +{ + unsigned char buf[33]; + unsigned char key[16]; + unsigned char *data; + size_t size; + char *p; + int i; + + size = fz_buffer_storage(ctx, part->data, &data); + if (size < 32) + { + fz_warn(ctx, "insufficient data for font deobfuscation"); + return; + } + + p = strrchr(part->name, '/'); + if (!p) + p = part->name; + + for (i = 0; i < 32 && *p; p++) + { + if (ishex(*p)) + buf[i++] = *p; + } + buf[i] = 0; + + if (i != 32) + { + fz_warn(ctx, "cannot extract GUID from obfuscated font part name"); + return; + } + + for (i = 0; i < 16; i++) + key[i] = unhex(buf[i*2+0]) * 16 + unhex(buf[i*2+1]); + + for (i = 0; i < 16; i++) + { + data[i] ^= key[15-i]; + data[i+16] ^= key[15-i]; + } +} + +static void +xps_select_best_font_encoding(fz_context *ctx, xps_document *doc, fz_font *font) +{ + static struct { int pid, eid; } xps_cmap_list[] = + { + { 3, 10 }, /* Unicode with surrogates */ + { 3, 1 }, /* Unicode without surrogates */ + { 3, 5 }, /* Wansung */ + { 3, 4 }, /* Big5 */ + { 3, 3 }, /* Prc */ + { 3, 2 }, /* ShiftJis */ + { 3, 0 }, /* Symbol */ + { 1, 0 }, + { -1, -1 }, + }; + + int i, k, n, pid, eid; + + n = xps_count_font_encodings(ctx, font); + for (k = 0; xps_cmap_list[k].pid != -1; k++) + { + for (i = 0; i < n; i++) + { + xps_identify_font_encoding(ctx, font, i, &pid, &eid); + if (pid == xps_cmap_list[k].pid && eid == xps_cmap_list[k].eid) + { + xps_select_font_encoding(ctx, font, i); + return; + } + } + } + + fz_warn(ctx, "cannot find a suitable cmap"); +} + +fz_font * +xps_lookup_font(fz_context *ctx, xps_document *doc, char *base_uri, char *font_uri, char *style_att) +{ + char partname[1024]; + char fakename[1024]; + char *subfont; + int subfontid = 0; + xps_part *part; + fz_font *font; + + xps_resolve_url(ctx, doc, partname, base_uri, font_uri, sizeof partname); + subfont = strrchr(partname, '#'); + if (subfont) + { + subfontid 
= atoi(subfont + 1); + *subfont = 0; + } + + /* Make a new part name for font with style simulation applied */ + fz_strlcpy(fakename, partname, sizeof fakename); + if (style_att) + { + if (!strcmp(style_att, "BoldSimulation")) + fz_strlcat(fakename, "#Bold", sizeof fakename); + else if (!strcmp(style_att, "ItalicSimulation")) + fz_strlcat(fakename, "#Italic", sizeof fakename); + else if (!strcmp(style_att, "BoldItalicSimulation")) + fz_strlcat(fakename, "#BoldItalic", sizeof fakename); + } + + font = xps_lookup_font_imp(ctx, doc, fakename); + if (!font) + { + fz_buffer *buf = NULL; + fz_var(buf); + + fz_try(ctx) + { + part = xps_read_part(ctx, doc, partname); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_warn(ctx, "cannot find font resource part '%s'", partname); + return NULL; + } + + /* deobfuscate if necessary */ + if (strstr(part->name, ".odttf")) + xps_deobfuscate_font_resource(ctx, doc, part); + if (strstr(part->name, ".ODTTF")) + xps_deobfuscate_font_resource(ctx, doc, part); + + fz_try(ctx) + { + font = fz_new_font_from_buffer(ctx, NULL, part->data, subfontid, 1); + } + fz_always(ctx) + { + xps_drop_part(ctx, doc, part); + } + fz_catch(ctx) + { + fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); + fz_warn(ctx, "cannot load font resource '%s'", partname); + return NULL; + } + + if (style_att) + { + fz_font_flags_t *flags = fz_font_flags(font); + int bold = !!strstr(style_att, "Bold"); + int italic = !!strstr(style_att, "Italic"); + flags->fake_bold = bold; + flags->is_bold = bold; + flags->fake_italic = italic; + flags->is_italic = italic; + } + + xps_select_best_font_encoding(ctx, doc, font); + xps_insert_font(ctx, doc, fakename, font); + } + return font; +} + +/* + * Parse and draw an XPS element. 
+ * + * Indices syntax: + + GlyphIndices = GlyphMapping ( ";" GlyphMapping ) + GlyphMapping = ( [ClusterMapping] GlyphIndex ) [GlyphMetrics] + ClusterMapping = "(" ClusterCodeUnitCount [":" ClusterGlyphCount] ")" + ClusterCodeUnitCount = * DIGIT + ClusterGlyphCount = * DIGIT + GlyphIndex = * DIGIT + GlyphMetrics = "," AdvanceWidth ["," uOffset ["," vOffset]] + AdvanceWidth = ["+"] RealNum + uOffset = ["+" | "-"] RealNum + vOffset = ["+" | "-"] RealNum + RealNum = ((DIGIT ["." DIGIT]) | ("." DIGIT)) [Exponent] + Exponent = ( ("E"|"e") ("+"|"-") DIGIT ) + + */ + +static char * +xps_parse_digits(char *s, int *digit) +{ + *digit = 0; + while (*s >= '0' && *s <= '9') + { + *digit = *digit * 10 + (*s - '0'); + s ++; + } + return s; +} + +static char * +xps_parse_real_num(char *s, float *number, int *override) +{ + char *tail; + float v; + v = fz_strtof(s, &tail); + *override = tail != s; + if (*override) + *number = v; + return tail; +} + +static char * +xps_parse_cluster_mapping(char *s, int *code_count, int *glyph_count) +{ + if (*s == '(') + s = xps_parse_digits(s + 1, code_count); + if (*s == ':') + s = xps_parse_digits(s + 1, glyph_count); + if (*s == ')') + s ++; + return s; +} + +static char * +xps_parse_glyph_index(char *s, int *glyph_index) +{ + if (*s >= '0' && *s <= '9') + s = xps_parse_digits(s, glyph_index); + return s; +} + +static char * +xps_parse_glyph_metrics(char *s, float *advance, float *uofs, float *vofs, int bidi_level) +{ + int override; + if (*s == ',') + { + s = xps_parse_real_num(s + 1, advance, &override); + if (override && (bidi_level & 1)) + *advance = -*advance; + } + if (*s == ',') + s = xps_parse_real_num(s + 1, uofs, &override); + if (*s == ',') + s = xps_parse_real_num(s + 1, vofs, &override); + return s; +} + +/* + * Parse unicode and indices strings and encode glyphs. + * Calculate metrics for positioning. 
+ */ +fz_text * +xps_parse_glyphs_imp(fz_context *ctx, xps_document *doc, fz_matrix ctm, + fz_font *font, float size, float originx, float originy, + int is_sideways, int bidi_level, + char *indices, char *unicode) +{ + xps_glyph_metrics mtx; + fz_text *text; + fz_matrix tm; + float x = originx; + float y = originy; + char *us = unicode; + char *is = indices; + size_t un = 0; + + if (!unicode && !indices) + fz_warn(ctx, "glyphs element with neither characters nor indices"); + + if (us) + { + if (us[0] == '{' && us[1] == '}') + us = us + 2; + un = strlen(us); + } + + if (is_sideways) + tm = fz_pre_scale(fz_rotate(90), -size, size); + else + tm = fz_scale(size, -size); + + text = fz_new_text(ctx); + + while ((us && un > 0) || (is && *is)) + { + int char_code = FZ_REPLACEMENT_CHARACTER; + int code_count = 1; + int glyph_count = 1; + + if (is && *is) + { + is = xps_parse_cluster_mapping(is, &code_count, &glyph_count); + } + + if (code_count < 1) + code_count = 1; + if (glyph_count < 1) + glyph_count = 1; + + /* TODO: add code chars with cluster mappings for text extraction */ + + while (code_count--) + { + if (us && un > 0) + { + int t = fz_chartorune(&char_code, us); + us += t; un -= t; + } + } + + while (glyph_count--) + { + int glyph_index = -1; + float u_offset = 0; + float v_offset = 0; + float advance; + int dir; + + if (is && *is) + is = xps_parse_glyph_index(is, &glyph_index); + + if (glyph_index == -1) + glyph_index = xps_encode_font_char(ctx, font, char_code); + + xps_measure_font_glyph(ctx, doc, font, glyph_index, &mtx); + if (is_sideways) + advance = mtx.vadv * 100; + else if (bidi_level & 1) + advance = -mtx.hadv * 100; + else + advance = mtx.hadv * 100; + + if (fz_font_flags(font)->fake_bold) + advance *= 1.02f; + + if (is && *is) + { + is = xps_parse_glyph_metrics(is, &advance, &u_offset, &v_offset, bidi_level); + if (*is == ';') + is ++; + } + + if (bidi_level & 1) + u_offset = -mtx.hadv * 100 - u_offset; + + u_offset = u_offset * 0.01f * size; + 
v_offset = v_offset * 0.01f * size; + + if (is_sideways) + { + tm.e = x + u_offset + (mtx.vorg * size); + tm.f = y - v_offset + (mtx.hadv * 0.5f * size); + } + else + { + tm.e = x + u_offset; + tm.f = y - v_offset; + } + + dir = bidi_level & 1 ? FZ_BIDI_RTL : FZ_BIDI_LTR; + fz_show_glyph(ctx, text, font, tm, glyph_index, char_code, is_sideways, bidi_level, dir, FZ_LANG_UNSET); + + x += advance * 0.01f * size; + } + } + + return text; +} + +void +xps_parse_glyphs(fz_context *ctx, xps_document *doc, fz_matrix ctm, + char *base_uri, xps_resource *dict, fz_xml *root) +{ + fz_device *dev = doc->dev; + + fz_xml *node; + + char *fill_uri; + char *opacity_mask_uri; + + char *bidi_level_att; + char *fill_att; + char *font_size_att; + char *font_uri_att; + char *origin_x_att; + char *origin_y_att; + char *is_sideways_att; + char *indices_att; + char *unicode_att; + char *style_att; + char *transform_att; + char *clip_att; + char *opacity_att; + char *opacity_mask_att; + + fz_xml *transform_tag = NULL; + fz_xml *clip_tag = NULL; + fz_xml *fill_tag = NULL; + fz_xml *opacity_mask_tag = NULL; + + char *fill_opacity_att = NULL; + + fz_font *font; + + float font_size = 10; + int is_sideways = 0; + int bidi_level = 0; + + fz_text *text; + fz_rect area; + + /* + * Extract attributes and extended attributes. 
+ */ + + bidi_level_att = fz_xml_att(root, "BidiLevel"); + fill_att = fz_xml_att(root, "Fill"); + font_size_att = fz_xml_att(root, "FontRenderingEmSize"); + font_uri_att = fz_xml_att(root, "FontUri"); + origin_x_att = fz_xml_att(root, "OriginX"); + origin_y_att = fz_xml_att(root, "OriginY"); + is_sideways_att = fz_xml_att(root, "IsSideways"); + indices_att = fz_xml_att(root, "Indices"); + unicode_att = fz_xml_att(root, "UnicodeString"); + style_att = fz_xml_att(root, "StyleSimulations"); + transform_att = fz_xml_att(root, "RenderTransform"); + clip_att = fz_xml_att(root, "Clip"); + opacity_att = fz_xml_att(root, "Opacity"); + opacity_mask_att = fz_xml_att(root, "OpacityMask"); + + for (node = fz_xml_down(root); node; node = fz_xml_next(node)) + { + if (fz_xml_is_tag(node, "Glyphs.RenderTransform")) + transform_tag = fz_xml_down(node); + if (fz_xml_is_tag(node, "Glyphs.OpacityMask")) + opacity_mask_tag = fz_xml_down(node); + if (fz_xml_is_tag(node, "Glyphs.Clip")) + clip_tag = fz_xml_down(node); + if (fz_xml_is_tag(node, "Glyphs.Fill")) + fill_tag = fz_xml_down(node); + } + + fill_uri = base_uri; + opacity_mask_uri = base_uri; + + xps_resolve_resource_reference(ctx, doc, dict, &transform_att, &transform_tag, NULL); + xps_resolve_resource_reference(ctx, doc, dict, &clip_att, &clip_tag, NULL); + xps_resolve_resource_reference(ctx, doc, dict, &fill_att, &fill_tag, &fill_uri); + xps_resolve_resource_reference(ctx, doc, dict, &opacity_mask_att, &opacity_mask_tag, &opacity_mask_uri); + + /* + * Check that we have all the necessary information. + */ + + if (!font_size_att || !font_uri_att || !origin_x_att || !origin_y_att) { + fz_warn(ctx, "missing attributes in glyphs element"); + return; + } + + if (!indices_att && !unicode_att) + return; /* nothing to draw */ + + if (is_sideways_att) + is_sideways = !strcmp(is_sideways_att, "true"); + + if (bidi_level_att) + bidi_level = atoi(bidi_level_att); + + /* + * Find and load the font resource. 
+ */ + + font = xps_lookup_font(ctx, doc, base_uri, font_uri_att, style_att); + if (!font) + return; /* bail if we can't find the font */ + + /* + * Set up graphics state. + */ + + ctm = xps_parse_transform(ctx, doc, transform_att, transform_tag, ctm); + + if (clip_att || clip_tag) + xps_clip(ctx, doc, ctm, dict, clip_att, clip_tag); + + font_size = fz_atof(font_size_att); + + text = xps_parse_glyphs_imp(ctx, doc, ctm, font, font_size, + fz_atof(origin_x_att), fz_atof(origin_y_att), + is_sideways, bidi_level, indices_att, unicode_att); + + area = fz_bound_text(ctx, text, NULL, ctm); + + xps_begin_opacity(ctx, doc, ctm, area, opacity_mask_uri, dict, opacity_att, opacity_mask_tag); + + /* If it's a solid color brush fill/stroke do a simple fill */ + + if (fz_xml_is_tag(fill_tag, "SolidColorBrush")) + { + fill_opacity_att = fz_xml_att(fill_tag, "Opacity"); + fill_att = fz_xml_att(fill_tag, "Color"); + fill_tag = NULL; + } + + if (fill_att) + { + float samples[FZ_MAX_COLORS]; + fz_colorspace *colorspace; + + xps_parse_color(ctx, doc, base_uri, fill_att, &colorspace, samples); + if (fill_opacity_att) + samples[0] *= fz_atof(fill_opacity_att); + xps_set_color(ctx, doc, colorspace, samples); + + fz_fill_text(ctx, dev, text, ctm, doc->colorspace, doc->color, doc->alpha, NULL); + } + + /* If it's a complex brush, use the charpath as a clip mask */ + + if (fill_tag) + { + fz_clip_text(ctx, dev, text, ctm, area); + xps_parse_brush(ctx, doc, ctm, area, fill_uri, dict, fill_tag); + fz_pop_clip(ctx, dev); + } + + xps_end_opacity(ctx, doc, opacity_mask_uri, dict, opacity_att, opacity_mask_tag); + + fz_drop_text(ctx, text); + + if (clip_att || clip_tag) + fz_pop_clip(ctx, dev); + + fz_drop_font(ctx, font); +} diff -Nru k2pdfopt-2.42+ds/readme_k2src.txt k2pdfopt-2.51+ds/readme_k2src.txt --- k2pdfopt-2.42+ds/readme_k2src.txt 2017-05-20 23:41:15.000000000 +0000 +++ k2pdfopt-2.51+ds/readme_k2src.txt 2019-01-04 21:43:51.000000000 +0000 @@ -1,7 +1,7 @@ K2pdfopt build help. 
http://willus.com Original: 7 September 2012 -Last updated: 20 May 2017 (v2.42) +Last updated: 4 Jan 2019 (v2.51) This "read me" file describes the source code distribution for k2pdfopt. @@ -37,15 +37,15 @@ REQUIRED -------- 1. Z-lib 1.2.11 (zlib.net) - 2. libpng 1.6.28 (www.libpng.org) - 3. Turbo JPEG lib 1.5.1 (sourceforge.net/projects/libjpeg-turbo/) + 2. libpng 1.6.35 (www.libpng.org) + 3. Turbo JPEG lib 2.0.1 (sourceforge.net/projects/libjpeg-turbo/) TO INCLUDE MuPDF LIBRARY (search for HAVE_MUPDF in k2pdfopt.c) -------------------------------------------------------------- 4. JBIG2Dec 0.11 (jbig2dec.sourceforge.net) - 5. OpenJPEG 2.1.0 (www.openjpeg.org) - 6. FreeType 2.7.1 (freetype.sourceforge.net/index2.html) - 7. Mupdf 1.10a (mupdf.com) -- SEE NOTE 1. + 5. OpenJPEG 2.3.0 (www.openjpeg.org) + 6. FreeType 2.9.1 (freetype.sourceforge.net/index2.html) + 7. Mupdf 1.14 (mupdf.com) -- SEE NOTE 1. TO INCLUDE DjVuLibre LIBRARY (search for HAVE_DJVU in k2pdfopt.c) ----------------------------------------------------------------- @@ -54,8 +54,8 @@ FOR OCR VERSIONS OF K2PDFOPT (search for HAVE_OCR in k2pdfopt.c) ---------------------------------------------------------------- 9. GOCR 0.50 (sourceforge.net/jocr/) - 10. Leptonica 1.74.1 (leptonica.com) - 11. Tesseract 3.05.00 (C++) (code.google.com/tesseract-ocr/) -- SEE NOTE 2. + 10. Leptonica 1.74.4 (leptonica.com) + 11. Tesseract 4.0.0 (C++) (code.google.com/tesseract-ocr/) -- SEE NOTE 2. 12. POSIX threads support (pretty standard with gcc implementations) If you don't include MuPDF, DjVuLibre, or OCR, then k2pdfopt will @@ -67,7 +67,7 @@ 1. Mods to the released MuPDF library are in the mupdf_mod folder. Search for "willus" or "sumatra" or "bugs" in the files to find the mods. -2. Tesseract requires my small C API file plus three custom-modified source files. +2. Tesseract requires my small C API file plus some custom-modified source files. These are in the tesseract_mod folder. 
Search for "willus" in the files to find the mods. To use Tesseract, you'll need to download one of the data packages for it from the Tesseract web site and to point the TESSDATA_PREFIX @@ -98,7 +98,7 @@ to build the project without using these files (I do not use them). -Build Steps for k2pdfopt on Windows (gcc 6.3.0) +Build Steps for k2pdfopt on Windows (gcc 7.3.0) ----------------------------------------------- My compile steps with gcc (MinGW) are as follows (assuming all the libraries are built to libxxx.a files in d:\3rdparty_lib and headers are in d:\3rdparty_include): @@ -125,14 +125,14 @@ 4. g++ -Ofast -m32 -Wall -o k2pdfopt.exe k2pdfopt.o resfile.o -static-libgcc -static-libstdc++ d:\mingw\i386\lib\crt_noglob.o -Ld:\3rdparty_lib -lk2pdfopt -lwillus -lgocr -ltesseract -lleptonica -ldjvu -lmupdf -lfreetype -ljbig2 -ljpeglib -lopenjpeg -lpng -lzlib -lpthread -lgdi32 -luuid -lole32 -lcomdlg32 -lshlwapi -Build Steps on Linux (64-bit, gcc 4.8.5, compiled on CentOS 7.2) ----------------------------------------------------------------- +Build Steps on Linux (64-bit, gcc 8.2.0, compiled on Fedora 29) +--------------------------------------------------------------- 1. gcc -Wall -Ofast -m64 -o k2pdfopt.o -c k2pdfopt.c 2. g++ -Ofast -m64 -o k2pdfopt k2pdfopt.o -static -static-libgcc -static-libstdc++ -lk2pdfopt -lwillus -lgocr -ltesseract -lleptonica -ldjvu -lmupdf -lfreetype -ljbig2 -ljpeglib -lopenjpeg -lpng -lzlib -lpthread -lstdc++ -lc -lm -Build Steps on OS/X (64-bit, gcc 6.2.0, compiled on OSX 10.12 Sierra) +Build Steps on OS/X (64-bit, gcc 8.2.0, compiled on OSX 10.12 Sierra) ---------------------------------------------------------------------- 1. 
gcc -Ofast -Wall -m64 -o k2pdfopt.o -c k2pdfopt.c diff -Nru k2pdfopt-2.42+ds/tesseract_mod/allheaders.h k2pdfopt-2.51+ds/tesseract_mod/allheaders.h --- k2pdfopt-2.42+ds/tesseract_mod/allheaders.h 2017-02-25 04:35:26.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/allheaders.h 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -#include diff -Nru k2pdfopt-2.42+ds/tesseract_mod/ambigs.cpp k2pdfopt-2.51+ds/tesseract_mod/ambigs.cpp --- k2pdfopt-2.42+ds/tesseract_mod/ambigs.cpp 2017-02-25 04:36:20.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/ambigs.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,400 +0,0 @@ -#include "config_auto.h" -/////////////////////////////////////////////////////////////////////// -// File: ambigs.cc -// Description: Functions for dealing with ambiguities -// (training and recognition). -// Author: Daria Antonova -// Created: Mon Feb 5 11:26:43 PDT 2009 -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -/////////////////////////////////////////////////////////////////////// - -#include "ambigs.h" - -#include -#include "helpers.h" -#include "universalambigs.h" -/* -#if defined _WIN32 -#ifndef __GNUC__ -#define strtok_r strtok_s -#else -#include "strtok_r.h" -#endif -#endif -*/ - -namespace tesseract { - -// Maximum line size: -// 10 for sizes of ambigs, tabs, abmig type and newline -// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig -const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1); - -AmbigSpec::AmbigSpec() { - wrong_ngram[0] = INVALID_UNICHAR_ID; - correct_fragments[0] = INVALID_UNICHAR_ID; - correct_ngram_id = INVALID_UNICHAR_ID; - type = NOT_AMBIG; - wrong_ngram_size = 0; -} - -ELISTIZE(AmbigSpec); - -// Initializes the ambigs by adding a NULL pointer to each table. -void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset, - bool use_ambigs_for_adaption) { - for (int i = 0; i < unicharset.size(); ++i) { - replace_ambigs_.push_back(NULL); - dang_ambigs_.push_back(NULL); - one_to_one_definite_ambigs_.push_back(NULL); - if (use_ambigs_for_adaption) { - ambigs_for_adaption_.push_back(NULL); - reverse_ambigs_for_adaption_.push_back(NULL); - } - } -} - -// Loads the universal ambigs that are useful for any language. -void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set, - UNICHARSET* unicharset) { - TFile file; - if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) return; - LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset); -} - -void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set, - TFile *ambig_file, - int debug_level, - bool use_ambigs_for_adaption, - UNICHARSET *unicharset) { - int i, j; - UnicharIdVector *adaption_ambigs_entry; - if (debug_level) tprintf("Reading ambiguities\n"); - - int test_ambig_part_size; - int replacement_ambig_part_size; - // The space for buffer is allocated on the heap to avoid - // GCC frame size warning. 
- const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; - char *buffer = new char[kBufferSize]; - char replacement_string[kMaxAmbigStringSize]; - UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1]; - int line_num = 0; - int type = NOT_AMBIG; - - // Determine the version of the ambigs file. - int version = 0; - ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL && - strlen(buffer) > 0); - if (*buffer == 'v') { - version = static_cast(strtol(buffer+1, NULL, 10)); - ++line_num; - } else { - ambig_file->Rewind(); - } - while (ambig_file->FGets(buffer, kBufferSize) != NULL) { - chomp_string(buffer); - if (debug_level > 2) tprintf("read line %s\n", buffer); - ++line_num; - if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, - buffer, &test_ambig_part_size, test_unichar_ids, - &replacement_ambig_part_size, - replacement_string, &type)) continue; - // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. - AmbigSpec *ambig_spec = new AmbigSpec(); - if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ - : dang_ambigs_, - test_ambig_part_size, test_unichar_ids, - replacement_ambig_part_size, replacement_string, type, - ambig_spec, unicharset)) - continue; - - // Update one_to_one_definite_ambigs_. - if (test_ambig_part_size == 1 && - replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { - if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) { - one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector(); - } - one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back( - ambig_spec->correct_ngram_id); - } - // Update ambigs_for_adaption_. - if (use_ambigs_for_adaption) { - GenericVector encoding; - // Silently ignore invalid strings, as before, so it is safe to use a - // universal ambigs file. 
- if (unicharset->encode_string(replacement_string, true, &encoding, - NULL, NULL)) { - for (i = 0; i < test_ambig_part_size; ++i) { - if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) { - ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector(); - } - adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]]; - for (int r = 0; r < encoding.size(); ++r) { - UNICHAR_ID id_to_insert = encoding[r]; - ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID); - // Add the new unichar id to adaption_ambigs_entry (only if the - // vector does not already contain it) keeping it in sorted order. - for (j = 0; j < adaption_ambigs_entry->size() && - (*adaption_ambigs_entry)[j] > id_to_insert; ++j); - if (j < adaption_ambigs_entry->size()) { - if ((*adaption_ambigs_entry)[j] != id_to_insert) { - adaption_ambigs_entry->insert(id_to_insert, j); - } - } else { - adaption_ambigs_entry->push_back(id_to_insert); - } - } - } - } - } - } - delete[] buffer; - - // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector. - if (use_ambigs_for_adaption) { - for (i = 0; i < ambigs_for_adaption_.size(); ++i) { - adaption_ambigs_entry = ambigs_for_adaption_[i]; - if (adaption_ambigs_entry == NULL) continue; - for (j = 0; j < adaption_ambigs_entry->size(); ++j) { - UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j]; - if (reverse_ambigs_for_adaption_[ambig_id] == NULL) { - reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector(); - } - reverse_ambigs_for_adaption_[ambig_id]->push_back(i); - } - } - } - - // Print what was read from the input file. - if (debug_level > 1) { - for (int tbl = 0; tbl < 2; ++tbl) { - const UnicharAmbigsVector &print_table = - (tbl == 0) ? replace_ambigs_ : dang_ambigs_; - for (i = 0; i < print_table.size(); ++i) { - AmbigSpec_LIST *lst = print_table[i]; - if (lst == NULL) continue; - if (!lst->empty()) { - tprintf("%s Ambiguities for %s:\n", - (tbl == 0) ? 
"Replaceable" : "Dangerous", - unicharset->debug_str(i).string()); - } - AmbigSpec_IT lst_it(lst); - for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { - AmbigSpec *ambig_spec = lst_it.data(); - tprintf("wrong_ngram:"); - UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); - tprintf("correct_fragments:"); - UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); - } - } - } - if (use_ambigs_for_adaption) { - for (int vec_id = 0; vec_id < 2; ++vec_id) { - const GenericVector &vec = (vec_id == 0) ? - ambigs_for_adaption_ : reverse_ambigs_for_adaption_; - for (i = 0; i < vec.size(); ++i) { - adaption_ambigs_entry = vec[i]; - if (adaption_ambigs_entry != NULL) { - tprintf("%sAmbigs for adaption for %s:\n", - (vec_id == 0) ? "" : "Reverse ", - unicharset->debug_str(i).string()); - for (j = 0; j < adaption_ambigs_entry->size(); ++j) { - tprintf("%s ", unicharset->debug_str( - (*adaption_ambigs_entry)[j]).string()); - } - tprintf("\n"); - } - } - } - } - } -} - -bool UnicharAmbigs::ParseAmbiguityLine( - int line_num, int version, int debug_level, const UNICHARSET &unicharset, - char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, - int *replacement_ambig_part_size, char *replacement_string, int *type) { - if (version > 1) { - // Simpler format is just wrong-string correct-string type\n. - STRING input(buffer); - GenericVector fields; - input.split(' ', &fields); - if (fields.size() != 3) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - // Encode wrong-string. - GenericVector unichars; - if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL, - NULL)) { - return false; - } - *test_ambig_part_size = unichars.size(); - if (*test_ambig_part_size > MAX_AMBIG_SIZE) { - if (debug_level) - tprintf("Too many unichars in ambiguity on line %d\n", line_num); - return false; - } - // Copy encoded string to output. 
- for (int i = 0; i < unichars.size(); ++i) - test_unichar_ids[i] = unichars[i]; - test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; - // Encode replacement-string to check validity. - if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL, - NULL)) { - return false; - } - *replacement_ambig_part_size = unichars.size(); - if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { - if (debug_level) - tprintf("Too many unichars in ambiguity on line %d\n", line_num); - return false; - } - if (sscanf(fields[2].string(), "%d", type) != 1) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string()); - return true; - } - int i; - char *token; - char *next_token; - if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || - !sscanf(token, "%d", test_ambig_part_size) || - *test_ambig_part_size <= 0) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - if (*test_ambig_part_size > MAX_AMBIG_SIZE) { - if (debug_level) - tprintf("Too many unichars in ambiguity on line %d\n", line_num); - return false; - } - for (i = 0; i < *test_ambig_part_size; ++i) { - if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; - if (!unicharset.contains_unichar(token)) { - if (debug_level) tprintf(kIllegalUnicharMsg, token); - break; - } - test_unichar_ids[i] = unicharset.unichar_to_id(token); - } - test_unichar_ids[i] = INVALID_UNICHAR_ID; - - if (i != *test_ambig_part_size || - !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || - !sscanf(token, "%d", replacement_ambig_part_size) || - *replacement_ambig_part_size <= 0) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { - if (debug_level) - tprintf("Too many unichars in ambiguity on line %d\n", line_num); - return false; - } - replacement_string[0] = '\0'; - for (i = 0; i < *replacement_ambig_part_size; ++i) { - 
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; - strcat(replacement_string, token); - if (!unicharset.contains_unichar(token)) { - if (debug_level) tprintf(kIllegalUnicharMsg, token); - break; - } - } - if (i != *replacement_ambig_part_size) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - if (version > 0) { - // The next field being true indicates that the abiguity should - // always be substituted (e.g. '' should always be changed to "). - // For such "certain" n -> m ambigs tesseract will insert character - // fragments for the n pieces in the unicharset. AmbigsFound() - // will then replace the incorrect ngram with the character - // fragments of the correct character (or ngram if m > 1). - // Note that if m > 1, an ngram will be inserted into the - // modified word, not the individual unigrams. Tesseract - // has limited support for ngram unichar (e.g. dawg permuter). - if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || - !sscanf(token, "%d", type)) { - if (debug_level) tprintf(kIllegalMsg, line_num); - return false; - } - } - return true; -} - -bool UnicharAmbigs::InsertIntoTable( - UnicharAmbigsVector &table, int test_ambig_part_size, - UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, - const char *replacement_string, int type, - AmbigSpec *ambig_spec, UNICHARSET *unicharset) { - ambig_spec->type = static_cast(type); - if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && - unicharset->to_lower(test_unichar_ids[0]) == - unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) { - ambig_spec->type = CASE_AMBIG; - } - - ambig_spec->wrong_ngram_size = - UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram); - - // Since we need to maintain a constant number of unichar positions in - // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for - // each n->m ambiguity we will have to place n character fragments of the - // correct 
ngram into the corresponding positions in the vector (e.g. given - // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and - // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed - // from fragments by dawg_permute_and_select(). - - // Insert the corresponding correct ngram into the unicharset. - // Unicharset code assumes that the "base" ngram is inserted into - // the unicharset before fragments of this ngram are inserted. - unicharset->unichar_insert(replacement_string); - ambig_spec->correct_ngram_id = - unicharset->unichar_to_id(replacement_string); - if (replacement_ambig_part_size > 1) { - unicharset->set_isngram(ambig_spec->correct_ngram_id, true); - } - // Add the corresponding fragments of the wrong ngram to unicharset. - int i; - for (i = 0; i < test_ambig_part_size; ++i) { - UNICHAR_ID unichar_id; - if (test_ambig_part_size == 1) { - unichar_id = ambig_spec->correct_ngram_id; - } else { - STRING frag_str = CHAR_FRAGMENT::to_string( - replacement_string, i, test_ambig_part_size, false); - unicharset->unichar_insert(frag_str.string()); - unichar_id = unicharset->unichar_to_id(frag_str.string()); - } - ambig_spec->correct_fragments[i] = unichar_id; - } - ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID; - - // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST. - // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram. 
- if (table[test_unichar_ids[0]] == NULL) { - table[test_unichar_ids[0]] = new AmbigSpec_LIST(); - } - if (table[test_unichar_ids[0]]->add_sorted( - AmbigSpec::compare_ambig_specs, true, ambig_spec)) - return true; - delete ambig_spec; - return false; -} - -} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/baseapi.cpp k2pdfopt-2.51+ds/tesseract_mod/baseapi.cpp --- k2pdfopt-2.42+ds/tesseract_mod/baseapi.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/baseapi.cpp 2018-11-22 22:46:15.000000000 +0000 @@ -0,0 +1,3083 @@ +/********************************************************************** + * File: baseapi.cpp + * Description: Simple API for calling tesseract. + * Author: Ray Smith + * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. 
+#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "baseapi.h" +#ifdef __linux__ +#include // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE +#endif + +#if defined(_WIN32) +#if defined(__MINGW32__) +// workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME +#undef __STRICT_ANSI__ +#endif // __MINGW32__ +#include +#include +#else +#include // for closedir, opendir, readdir, DIR, dirent +#include +#include +#include // for stat, S_IFDIR +#include +#endif // _WIN32 + +#include // for LC_ALL, LC_CTYPE, LC_NUMERIC +#include // for round, M_PI +#include // for int32_t +#include // for strcmp, strcpy +#include // for size_t +#include // for std::cin +#include // for std::unique_ptr +#include // for std::pair +#include // for std::vector +#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box... +#include "blobclass.h" // for ExtractFontName +#include "boxword.h" // for BoxWord +#include "config_auto.h" // for PACKAGE_VERSION +#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST +#include "dawg_cache.h" // for DawgCache +#include "dict.h" // for Dict +#include "edgblob.h" // for extract_edges +#include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH +#include "environ.h" // for l_uint8, FALSE, TRUE +#include "equationdetect.h" // for EquationDetect +#include "errcode.h" // for ASSERT_HOST +#include "globaloc.h" // for SavePixForCrash, signal_exit +#include "helpers.h" // for IntCastRounded, chomp_string +#include "host.h" // for BOOL8 +#include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3 +#include "intfx.h" // for INT_FX_RESULT_STRUCT +#include "mutableiterator.h" // for MutableIterator +#include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight +#include "ocrclass.h" // for ETEXT_DESC +#include "openclwrapper.h" // for PERF_COUNT_END, PERF_COUNT_START, PERF... +#include "osdetect.h" // for OSResults, OSBestResult, OrientationId... +#include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE... 
+#include "paragraphs.h" // for DetectParagraphs +#include "params.h" // for BoolParam, IntParam, DoubleParam, Stri... +#include "pdblock.h" // for PDBLK +#include "points.h" // for FCOORD +#include "polyblk.h" // for POLY_BLOCK +#include "rect.h" // for TBOX +#include "renderer.h" // for TessResultRenderer +#include "resultiterator.h" // for ResultIterator +#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST +#include "strngs.h" // for STRING +#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix +#include "tesseractclass.h" // for Tesseract +#include "thresholder.h" // for ImageThresholder +#include "tprintf.h" // for tprintf +#include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP + +BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin"); + +namespace tesseract { + +/** Minimum sensible image size to be worth running tesseract. */ +const int kMinRectSize = 10; +/** Character returned when Tesseract couldn't recognize as anything. */ +const char kTesseractReject = '~'; +/** Character used by UNLV error counter as a reject. */ +const char kUNLVReject = '~'; +/** Character used by UNLV as a suspect marker. */ +const char kUNLVSuspect = '^'; +/** + * Filename used for input image file, from which to derive a name to search + * for a possible UNLV zone file, if none is specified by SetInputName. + */ +const char* kInputFile = "noname.tif"; +/** + * Temp file used for storing current parameters before applying retry values. + */ +const char* kOldVarsFile = "failed_vars.txt"; +/** Max string length of an int. */ +const int kMaxIntSize = 22; + +/* Add all available languages recursively. +*/ +static void addAvailableLanguages(const STRING &datadir, const STRING &base, + GenericVector* langs) +{ + const STRING base2 = (base.string()[0] == '\0') ? 
base : base + "/"; + const size_t extlen = sizeof(kTrainedDataSuffix); +#ifdef _WIN32 + WIN32_FIND_DATA data; + HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data); + if (handle != INVALID_HANDLE_VALUE) { + BOOL result = TRUE; + for (; result;) { + char *name = data.cFileName; + // Skip '.', '..', and hidden files + if (name[0] != '.') { + if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == + FILE_ATTRIBUTE_DIRECTORY) { + addAvailableLanguages(datadir, base2 + name, langs); + } else { + size_t len = strlen(name); + if (len > extlen && name[len - extlen] == '.' && + strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) { + name[len - extlen] = '\0'; + langs->push_back(base2 + name); + } + } + } + result = FindNextFile(handle, &data); + } + FindClose(handle); + } +#else // _WIN32 + DIR* dir = opendir((datadir + base).string()); + if (dir != nullptr) { + dirent *de; + while ((de = readdir(dir))) { + char *name = de->d_name; + // Skip '.', '..', and hidden files + if (name[0] != '.') { + struct stat st; + if (stat((datadir + base2 + name).string(), &st) == 0 && + (st.st_mode & S_IFDIR) == S_IFDIR) { + addAvailableLanguages(datadir, base2 + name, langs); + } else { + size_t len = strlen(name); + if (len > extlen && name[len - extlen] == '.' && + strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) { + name[len - extlen] = '\0'; + langs->push_back(base2 + name); + } + } + } + } + closedir(dir); + } +#endif +} + +// Compare two STRING values (used for sorting). 
+static int CompareSTRING(const void* p1, const void* p2) { + const STRING* s1 = static_cast(p1); + const STRING* s2 = static_cast(p2); + return strcmp(s1->c_str(), s2->c_str()); +} + +TessBaseAPI::TessBaseAPI() + : tesseract_(nullptr), + osd_tesseract_(nullptr), + equ_detect_(nullptr), + reader_(nullptr), + // Thresholder is initialized to nullptr here, but will be set before use by: + // A constructor of a derived API, SetThresholder(), or + // created implicitly when used in InternalSetImage. + thresholder_(nullptr), + paragraph_models_(nullptr), + block_list_(nullptr), + page_res_(nullptr), + input_file_(nullptr), + output_file_(nullptr), + datapath_(nullptr), + language_(nullptr), + last_oem_requested_(OEM_DEFAULT), + recognition_done_(false), + truth_cb_(nullptr), + rect_left_(0), + rect_top_(0), + rect_width_(0), + rect_height_(0), + image_width_(0), + image_height_(0) { + const char *locale; + locale = std::setlocale(LC_ALL, nullptr); +/* willus mod Remove assertions--taken care of in tesscapi.cpp */ +// ASSERT_HOST(!strcmp(locale, "C")); + locale = std::setlocale(LC_CTYPE, nullptr); +// ASSERT_HOST(!strcmp(locale, "C")); + locale = std::setlocale(LC_NUMERIC, nullptr); +// ASSERT_HOST(!strcmp(locale, "C")); +} + +TessBaseAPI::~TessBaseAPI() { + End(); +} + +/** + * Returns the version identifier as a static string. Do not delete. + */ +const char* TessBaseAPI::Version() { + return PACKAGE_VERSION; +} + +/** + * If compiled with OpenCL AND an available OpenCL + * device is deemed faster than serial code, then + * "device" is populated with the cl_device_id + * and returns sizeof(cl_device_id) + * otherwise *device=nullptr and returns 0. 
+ */ +#ifdef USE_OPENCL +#ifdef USE_DEVICE_SELECTION +#include "opencl_device_selection.h" +#endif +#endif +size_t TessBaseAPI::getOpenCLDevice(void **data) { +#ifdef USE_OPENCL +#ifdef USE_DEVICE_SELECTION + ds_device device = OpenclDevice::getDeviceSelection(); + if (device.type == DS_DEVICE_OPENCL_DEVICE) { + *data = new cl_device_id; + memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id)); + return sizeof(cl_device_id); + } +#endif +#endif + + *data = nullptr; + return 0; +} + +/** + * Writes the thresholded image to stderr as a PBM file on receipt of a + * SIGSEGV, SIGFPE, or SIGBUS signal. (Linux/Unix only). + */ +void TessBaseAPI::CatchSignals() { +#ifdef __linux__ + struct sigaction action; + memset(&action, 0, sizeof(action)); + action.sa_handler = &signal_exit; + action.sa_flags = SA_RESETHAND; + sigaction(SIGSEGV, &action, nullptr); + sigaction(SIGFPE, &action, nullptr); + sigaction(SIGBUS, &action, nullptr); +#else + // Warn API users that an implementation is needed. + tprintf("CatchSignals has no non-linux implementation!\n"); +#endif +} + +/** + * Set the name of the input file. Needed only for training and + * loading a UNLV zone file. + */ +void TessBaseAPI::SetInputName(const char* name) { + if (input_file_ == nullptr) + input_file_ = new STRING(name); + else + *input_file_ = name; +} + +/** Set the name of the output files. Needed only for debugging. 
*/ +void TessBaseAPI::SetOutputName(const char* name) { + if (output_file_ == nullptr) + output_file_ = new STRING(name); + else + *output_file_ = name; +} + +bool TessBaseAPI::SetVariable(const char* name, const char* value) { + if (tesseract_ == nullptr) tesseract_ = new Tesseract; + return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, + tesseract_->params()); +} + +bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) { + if (tesseract_ == nullptr) tesseract_ = new Tesseract; + return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, + tesseract_->params()); +} + +bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { + IntParam *p = ParamUtils::FindParam( + name, GlobalParams()->int_params, tesseract_->params()->int_params); + if (p == nullptr) return false; + *value = (int32_t)(*p); + return true; +} + +bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { + BoolParam *p = ParamUtils::FindParam( + name, GlobalParams()->bool_params, tesseract_->params()->bool_params); + if (p == nullptr) return false; + *value = (BOOL8)(*p); + return true; +} + +const char *TessBaseAPI::GetStringVariable(const char *name) const { + StringParam *p = ParamUtils::FindParam( + name, GlobalParams()->string_params, tesseract_->params()->string_params); + return (p != nullptr) ? p->string() : nullptr; +} + +bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { + DoubleParam *p = ParamUtils::FindParam( + name, GlobalParams()->double_params, tesseract_->params()->double_params); + if (p == nullptr) return false; + *value = (double)(*p); + return true; +} + +/** Get value of named variable as a string, if it exists. */ +bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) { + return ParamUtils::GetParamAsString(name, tesseract_->params(), val); +} + +/** Print Tesseract parameters to the given file. 
*/ +void TessBaseAPI::PrintVariables(FILE *fp) const { + ParamUtils::PrintParams(fp, tesseract_->params()); +} + +/** + * The datapath must be the name of the data directory (no ending /) or + * some other file in which the data directory resides (for instance argv[0].) + * The language is (usually) an ISO 639-3 string or nullptr will default to eng. + * If numeric_mode is true, then only digits and Roman numerals will + * be returned. + * @return: 0 on success and -1 on initialization failure. + */ +int TessBaseAPI::Init(const char* datapath, const char* language, + OcrEngineMode oem, char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params) { + return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, + vars_values, set_only_non_debug_params, nullptr); +} + +// In-memory version reads the traineddata file directly from the given +// data[data_size] array. Also implements the version with a datapath in data, +// flagged by data_size = 0. +int TessBaseAPI::Init(const char* data, int data_size, const char* language, + OcrEngineMode oem, char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_non_debug_params, FileReader reader) { + PERF_COUNT_START("TessBaseAPI::Init") + // Default language is "eng". + if (language == nullptr) language = "eng"; + STRING datapath = data_size == 0 ? data : language; + // If the datapath, OcrEngineMode or the language have changed - start again. + // Note that the language_ field stores the last requested language that was + // initialized successfully, while tesseract_->lang stores the language + // actually used. They differ only if the requested language was nullptr, in + // which case tesseract_->lang is set to the Tesseract default ("eng"). 
+ if (tesseract_ != nullptr && + (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath || + last_oem_requested_ != oem || + (*language_ != language && tesseract_->lang != language))) { + delete tesseract_; + tesseract_ = nullptr; + } + // PERF_COUNT_SUB("delete tesseract_") +#ifdef USE_OPENCL + OpenclDevice od; + od.InitEnv(); +#endif + PERF_COUNT_SUB("OD::InitEnv()") + bool reset_classifier = true; + if (tesseract_ == nullptr) { + reset_classifier = false; + tesseract_ = new Tesseract; + if (reader != nullptr) reader_ = reader; + TessdataManager mgr(reader_); + if (data_size != 0) { + mgr.LoadMemBuffer(language, data, data_size); + } + if (tesseract_->init_tesseract( + datapath.string(), + output_file_ != nullptr ? output_file_->string() : nullptr, + language, oem, configs, configs_size, vars_vec, vars_values, + set_only_non_debug_params, &mgr) != 0) { + return -1; + } + } + + PERF_COUNT_SUB("update tesseract_") + // Update datapath and language requested for the last valid initialization. + if (datapath_ == nullptr) + datapath_ = new STRING(datapath); + else + *datapath_ = datapath; + if ((strcmp(datapath_->string(), "") == 0) && + (strcmp(tesseract_->datadir.string(), "") != 0)) + *datapath_ = tesseract_->datadir; + + if (language_ == nullptr) + language_ = new STRING(language); + else + *language_ = language; + last_oem_requested_ = oem; + +#ifndef DISABLED_LEGACY_ENGINE + // PERF_COUNT_SUB("update last_oem_requested_") + // For same language and datapath, just reset the adaptive classifier. + if (reset_classifier) { + tesseract_->ResetAdaptiveClassifier(); + PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()") + } +#endif // ndef DISABLED_LEGACY_ENGINE + PERF_COUNT_END + return 0; +} + +/** + * Returns the languages string used in the last valid initialization. + * If the last initialization specified "deu+hin" then that will be + * returned. If hin loaded eng automatically as well, then that will + * not be included in this list. 
To find the languages actually + * loaded use GetLoadedLanguagesAsVector. + * The returned string should NOT be deleted. + */ +const char* TessBaseAPI::GetInitLanguagesAsString() const { + return (language_ == nullptr || language_->string() == nullptr) ? + "" : language_->string(); +} + +/** + * Returns the loaded languages in the vector of STRINGs. + * Includes all languages loaded by the last Init, including those loaded + * as dependencies of other loaded languages. + */ +void TessBaseAPI::GetLoadedLanguagesAsVector( + GenericVector* langs) const { + langs->clear(); + if (tesseract_ != nullptr) { + langs->push_back(tesseract_->lang); + int num_subs = tesseract_->num_sub_langs(); + for (int i = 0; i < num_subs; ++i) + langs->push_back(tesseract_->get_sub_lang(i)->lang); + } +} + +/** + * Returns the available languages in the sorted vector of STRINGs. + */ +void TessBaseAPI::GetAvailableLanguagesAsVector( + GenericVector* langs) const { + langs->clear(); + if (tesseract_ != nullptr) { + addAvailableLanguages(tesseract_->datadir, "", langs); + langs->sort(CompareSTRING); + } +} + +//TODO(amit): Adapt to lstm +#ifndef DISABLED_LEGACY_ENGINE +/** + * Init only the lang model component of Tesseract. The only functions + * that work after this init are SetVariable and IsValidWord. + * WARNING: temporary! This function will be removed from here and placed + * in a separate API at some future time. + */ +int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { + if (tesseract_ == nullptr) + tesseract_ = new Tesseract; + else + ParamUtils::ResetToDefaults(tesseract_->params()); + TessdataManager mgr; + return tesseract_->init_tesseract_lm(datapath, nullptr, language, &mgr); +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * Init only for page layout analysis. Use only for calls to SetImage and + * AnalysePage. Calls that attempt recognition will generate an error. 
+ */ +void TessBaseAPI::InitForAnalysePage() { + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; + #ifndef DISABLED_LEGACY_ENGINE + tesseract_->InitAdaptiveClassifier(nullptr); + #endif + } +} + +/** + * Read a "config" file containing a set of parameter name, value pairs. + * Searches the standard places: tessdata/configs, tessdata/tessconfigs + * and also accepts a relative or absolute path name. + */ +void TessBaseAPI::ReadConfigFile(const char* filename) { + tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); +} + +/** Same as above, but only set debug params from the given config file. */ +void TessBaseAPI::ReadDebugConfigFile(const char* filename) { + tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); +} + +/** + * Set the current page segmentation mode. Defaults to PSM_AUTO. + * The mode is stored as an IntParam so it can also be modified by + * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). + */ +void TessBaseAPI::SetPageSegMode(PageSegMode mode) { + if (tesseract_ == nullptr) + tesseract_ = new Tesseract; + tesseract_->tessedit_pageseg_mode.set_value(mode); +} + +/** Return the current page segmentation mode. */ +PageSegMode TessBaseAPI::GetPageSegMode() const { + if (tesseract_ == nullptr) + return PSM_SINGLE_BLOCK; + return static_cast( + static_cast(tesseract_->tessedit_pageseg_mode)); +} + +/** + * Recognize a rectangle from an image and return the result as a string. + * May be called many times for a single Init. + * Currently has no error checking. + * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. + * Palette color images will not work properly and must be converted to + * 24 bit. + * Binary images of 1 bit per pixel may also be given but they must be + * byte packed with the MSB of the first byte being the first pixel, and a + * one pixel is WHITE. For binary images set bytes_per_pixel=0. 
+ * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + */ +char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, + int bytes_per_pixel, + int bytes_per_line, + int left, int top, + int width, int height) { + if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) + return nullptr; // Nothing worth doing. + + // Since this original api didn't give the exact size of the image, + // we have to invent a reasonable value. + int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; + SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, + bytes_per_pixel, bytes_per_line); + SetRectangle(left, top, width, height); + + return GetUTF8Text(); +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Call between pages or documents etc to free up memory and forget + * adaptive data. + */ +void TessBaseAPI::ClearAdaptiveClassifier() { + if (tesseract_ == nullptr) + return; + tesseract_->ResetAdaptiveClassifier(); + tesseract_->ResetDocumentDictionary(); +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * Provide an image for Tesseract to recognize. Format is as + * TesseractRect above. Copies the image buffer and converts to Pix. + * SetImage clears all recognition results, and sets the rectangle to the + * full image, so it may be followed immediately by a GetUTF8Text, and it + * will automatically perform recognition. + */ +void TessBaseAPI::SetImage(const unsigned char* imagedata, + int width, int height, + int bytes_per_pixel, int bytes_per_line) { + if (InternalSetImage()) { + thresholder_->SetImage(imagedata, width, height, + bytes_per_pixel, bytes_per_line); + SetInputImage(thresholder_->GetPixRect()); + } +} + +void TessBaseAPI::SetSourceResolution(int ppi) { + if (thresholder_) + thresholder_->SetSourceYResolution(ppi); + else + tprintf("Please call SetImage before SetSourceResolution.\n"); +} + +/** + * Provide an image for Tesseract to recognize. 
As with SetImage above, + * Tesseract takes its own copy of the image, so it need not persist until + * after Recognize. + * Pix vs raw, which to use? + * Use Pix where possible. Tesseract uses Pix as its internal representation + * and it is therefore more efficient to provide a Pix directly. + */ +void TessBaseAPI::SetImage(Pix* pix) { + if (InternalSetImage()) { + if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) { + // remove alpha channel from png + PIX* p1 = pixRemoveAlpha(pix); + pixSetSpp(p1, 3); + pix = pixCopy(nullptr, p1); + pixDestroy(&p1); + } + thresholder_->SetImage(pix); + SetInputImage(thresholder_->GetPixRect()); + } +} + +/** + * Restrict recognition to a sub-rectangle of the image. Call after SetImage. + * Each SetRectangle clears the recogntion results so multiple rectangles + * can be recognized with the same image. + */ +void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { + if (thresholder_ == nullptr) + return; + thresholder_->SetRectangle(left, top, width, height); + ClearResults(); +} + +/** + * ONLY available after SetImage if you have Leptonica installed. + * Get a copy of the internal thresholded image from Tesseract. + */ +Pix* TessBaseAPI::GetThresholdedImage() { + if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr; + if (tesseract_->pix_binary() == nullptr && + !Threshold(tesseract_->mutable_pix_binary())) { + return nullptr; + } + return pixClone(tesseract_->pix_binary()); +} + +/** + * Get the result of page layout analysis as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa* TessBaseAPI::GetRegions(Pixa** pixa) { + return GetComponentImages(RIL_BLOCK, false, pixa, nullptr); +} + +/** + * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. 
+ * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. + * If paraids is not nullptr, the paragraph-id of each line within its block is + * also returned as an array of one element per line. delete [] after use. + */ +Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, + Pixa** pixa, int** blockids, int** paraids) { + return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, + pixa, blockids, paraids); +} + +/** + * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa + * pair, in reading order. Enables downstream handling of non-rectangular + * regions. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. + */ +Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) { + return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids); +} + +/** + * Get the words as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa* TessBaseAPI::GetWords(Pixa** pixa) { + return GetComponentImages(RIL_WORD, true, pixa, nullptr); +} + +/** + * Gets the individual connected (text) components (created + * after pages segmentation step, but before recognition) + * as a leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ +Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) { + return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr); +} + +/** + * Get the given level kind of components (block, textline, word etc.) as a + * leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each component is also returned + * as an array of one element per component. delete [] after use. 
+ * If text_only is true, then only text components are returned. + */ +Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level, + bool text_only, bool raw_image, + const int raw_padding, + Pixa** pixa, int** blockids, + int** paraids) { + PageIterator* page_it = GetIterator(); + if (page_it == nullptr) + page_it = AnalyseLayout(); + if (page_it == nullptr) + return nullptr; // Failed. + + // Count the components to get a size for the arrays. + int component_count = 0; + int left, top, right, bottom; + + TessResultCallback* get_bbox = nullptr; + if (raw_image) { + // Get bounding box in original raw image with padding. + get_bbox = NewPermanentTessCallback(page_it, &PageIterator::BoundingBox, + level, raw_padding, + &left, &top, &right, &bottom); + } else { + // Get bounding box from binarized imaged. Note that this could be + // differently scaled from the original image. + get_bbox = NewPermanentTessCallback(page_it, + &PageIterator::BoundingBoxInternal, + level, &left, &top, &right, &bottom); + } + do { + if (get_bbox->Run() && + (!text_only || PTIsTextType(page_it->BlockType()))) + ++component_count; + } while (page_it->Next(level)); + + Boxa* boxa = boxaCreate(component_count); + if (pixa != nullptr) + *pixa = pixaCreate(component_count); + if (blockids != nullptr) + *blockids = new int[component_count]; + if (paraids != nullptr) + *paraids = new int[component_count]; + + int blockid = 0; + int paraid = 0; + int component_index = 0; + page_it->Begin(); + do { + if (get_bbox->Run() && + (!text_only || PTIsTextType(page_it->BlockType()))) { + Box* lbox = boxCreate(left, top, right - left, bottom - top); + boxaAddBox(boxa, lbox, L_INSERT); + if (pixa != nullptr) { + Pix* pix = nullptr; + if (raw_image) { + pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, + &top); + } else { + pix = page_it->GetBinaryImage(level); + } + pixaAddPix(*pixa, pix, L_INSERT); + pixaAddBox(*pixa, lbox, L_CLONE); + } + if (paraids != nullptr) { + 
(*paraids)[component_index] = paraid; + if (page_it->IsAtFinalElement(RIL_PARA, level)) + ++paraid; + } + if (blockids != nullptr) { + (*blockids)[component_index] = blockid; + if (page_it->IsAtFinalElement(RIL_BLOCK, level)) { + ++blockid; + paraid = 0; + } + } + ++component_index; + } + } while (page_it->Next(level)); + delete page_it; + delete get_bbox; + return boxa; +} + +int TessBaseAPI::GetThresholdedImageScaleFactor() const { + if (thresholder_ == nullptr) { + return 0; + } + return thresholder_->GetScaleFactor(); +} + +/** + * Runs page layout analysis in the mode set by SetPageSegMode. + * May optionally be called prior to Recognize to get access to just + * the page layout results. Returns an iterator to the results. + * If merge_similar_words is true, words are combined where suitable for use + * with a line recognizer. Use if you want to use AnalyseLayout to find the + * textlines, and then want to process textline fragments with an external + * line recognizer. + * Returns nullptr on error or an empty page. + * The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ +PageIterator* TessBaseAPI::AnalyseLayout() { return AnalyseLayout(false); } + +PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) { + if (FindLines() == 0) { + if (block_list_->empty()) + return nullptr; // The page was empty. 
+ page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr); + DetectParagraphs(false); + return new PageIterator( + page_res_, tesseract_, thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), + rect_left_, rect_top_, rect_width_, rect_height_); + } + return nullptr; +} + +/** + * Recognize the tesseract global image and return the result as Tesseract + * internal structures. + */ +int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { + if (tesseract_ == nullptr) + return -1; + if (FindLines() != 0) + return -1; + delete page_res_; + if (block_list_->empty()) { + page_res_ = new PAGE_RES(false, block_list_, + &tesseract_->prev_word_best_choice_); + return 0; // Empty page. + } + + tesseract_->SetBlackAndWhitelist(); + recognition_done_ = true; +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->tessedit_resegment_from_line_boxes) { + page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_); + } else if (tesseract_->tessedit_resegment_from_boxes) { + page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_); + } else +#endif // ndef DISABLED_LEGACY_ENGINE + { + page_res_ = new PAGE_RES(tesseract_->AnyLSTMLang(), + block_list_, &tesseract_->prev_word_best_choice_); + } + + if (page_res_ == nullptr) { + return -1; + } + + if (tesseract_->tessedit_train_line_recognizer) { + tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_); + tesseract_->CorrectClassifyWords(page_res_); + return 0; + } +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->tessedit_make_boxes_from_boxes) { + tesseract_->CorrectClassifyWords(page_res_); + return 0; + } +#endif // ndef DISABLED_LEGACY_ENGINE + + if (truth_cb_ != nullptr) { + tesseract_->wordrec_run_blamer.set_value(true); + PageIterator *page_it = new PageIterator( + page_res_, tesseract_, thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), + rect_left_, rect_top_, rect_width_, rect_height_); + truth_cb_->Run(tesseract_->getDict().getUnicharset(), + 
image_height_, page_it, this->tesseract()->pix_grey()); + delete page_it; + } + + int result = 0; + if (tesseract_->interactive_display_mode) { + #ifndef GRAPHICS_DISABLED + tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); + #endif // GRAPHICS_DISABLED + // The page_res is invalid after an interactive session, so cleanup + // in a way that lets us continue to the next page without crashing. + delete page_res_; + page_res_ = nullptr; + return -1; + #ifndef DISABLED_LEGACY_ENGINE + } else if (tesseract_->tessedit_train_from_boxes) { + STRING fontname; + ExtractFontName(*output_file_, &fontname); + tesseract_->ApplyBoxTraining(fontname, page_res_); + } else if (tesseract_->tessedit_ambigs_training) { + FILE *training_output_file = tesseract_->init_recog_training(*input_file_); + // OCR the page segmented into words by tesseract. + tesseract_->recog_training_segmented( + *input_file_, page_res_, monitor, training_output_file); + fclose(training_output_file); + #endif // ndef DISABLED_LEGACY_ENGINE + } else { + // Now run the main recognition. + bool wait_for_text = true; + GetBoolVariable("paragraph_text_based", &wait_for_text); + if (!wait_for_text) DetectParagraphs(false); + if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) { + if (wait_for_text) DetectParagraphs(true); + } else { + result = -1; + } + } + return result; +} + +#ifndef DISABLED_LEGACY_ENGINE +/** Tests the chopper by exhaustively running chop_one_blob. 
*/ +int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) { + if (tesseract_ == nullptr) + return -1; + if (thresholder_ == nullptr || thresholder_->IsEmpty()) { + tprintf("Please call SetImage before attempting recognition.\n"); + return -1; + } + if (page_res_ != nullptr) + ClearResults(); + if (FindLines() != 0) + return -1; + // Additional conditions under which chopper test cannot be run + if (tesseract_->interactive_display_mode) return -1; + + recognition_done_ = true; + + page_res_ = new PAGE_RES(false, block_list_, + &(tesseract_->prev_word_best_choice_)); + + PAGE_RES_IT page_res_it(page_res_); + + while (page_res_it.word() != nullptr) { + WERD_RES *word_res = page_res_it.word(); + GenericVector boxes; + tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block, + page_res_it.row()->row, word_res); + page_res_it.forward(); + } + return 0; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +// Takes ownership of the input pix. +void TessBaseAPI::SetInputImage(Pix* pix) { tesseract_->set_pix_original(pix); } + +Pix* TessBaseAPI::GetInputImage() { return tesseract_->pix_original(); } + +const char * TessBaseAPI::GetInputName() { + if (input_file_) + return input_file_->c_str(); + return nullptr; +} + +const char * TessBaseAPI::GetDatapath() { + return tesseract_->datadir.c_str(); +} + +int TessBaseAPI::GetSourceYResolution() { + return thresholder_->GetSourceYResolution(); +} + +// If flist exists, get data from there. Otherwise get data from buf. +// Seems convoluted, but is the easiest way I know of to meet multiple +// goals. Support streaming from stdin, and also work on platforms +// lacking fmemopen. +bool TessBaseAPI::ProcessPagesFileList(FILE *flist, + STRING *buf, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number) { + if (!flist && !buf) return false; + int page = (tessedit_page_number >= 0) ? 
tessedit_page_number : 0; + char pagename[MAX_PATH]; + + GenericVector lines; + if (!flist) { + buf->split('\n', &lines); + if (lines.empty()) return false; + } + + // Skip to the requested page number. + for (int i = 0; i < page; i++) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == nullptr) break; + } + } + + // Begin producing output + if (renderer && !renderer->BeginDocument(unknown_title_)) { + return false; + } + + // Loop over all pages - or just the requested one + while (true) { + if (flist) { + if (fgets(pagename, sizeof(pagename), flist) == nullptr) break; + } else { + if (page >= lines.size()) break; + snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); + } + chomp_string(pagename); + Pix *pix = pixRead(pagename); + if (pix == nullptr) { + tprintf("Image file %s cannot be read!\n", pagename); + return false; + } + tprintf("Page %d : %s\n", page, pagename); + bool r = ProcessPage(pix, page, pagename, retry_config, + timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) return false; + if (tessedit_page_number >= 0) break; + ++page; + } + + // Finish producing output + if (renderer && !renderer->EndDocument()) { + return false; + } + return true; +} + +bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, + size_t size, + const char* filename, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number) { +#ifndef ANDROID_BUILD + Pix *pix = nullptr; + int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + size_t offset = 0; + for (; ; ++page) { + if (tessedit_page_number >= 0) + page = tessedit_page_number; + pix = (data) ? 
pixReadMemFromMultipageTiff(data, size, &offset) + : pixReadFromMultipageTiff(filename, &offset); + if (pix == nullptr) break; + tprintf("Page %d\n", page + 1); + char page_str[kMaxIntSize]; + snprintf(page_str, kMaxIntSize - 1, "%d", page); + SetVariable("applybox_page", page_str); + bool r = ProcessPage(pix, page, filename, retry_config, + timeout_millisec, renderer); + pixDestroy(&pix); + if (!r) return false; + if (tessedit_page_number >= 0) break; + if (!offset) break; + } + return true; +#else + return false; +#endif +} + +// Master ProcessPages calls ProcessPagesInternal and then does any post- +// processing required due to being in a training mode. +bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer) { + bool result = + ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); + #ifndef DISABLED_LEGACY_ENGINE + if (result) { + if (tesseract_->tessedit_train_from_boxes && + !tesseract_->WriteTRFile(*output_file_)) { + tprintf("Write of TR file failed: %s\n", output_file_->string()); + return false; + } + } + #endif // ndef DISABLED_LEGACY_ENGINE + return result; +} + +// In the ideal scenario, Tesseract will start working on data as soon +// as it can. For example, if you stream a filelist through stdin, we +// should start the OCR process as soon as the first filename is +// available. This is particularly useful when hooking Tesseract up to +// slow hardware such as a book scanning machine. +// +// Unfortunately there are tradeoffs. You can't seek on stdin. That +// makes automatic detection of datatype (TIFF? filelist? PNG?) +// impractical. So we support a command line flag to explicitly +// identify the scenario that really matters: filelists on +// stdin. We'll still do our best if the user likes pipes. 
+bool TessBaseAPI::ProcessPagesInternal(const char* filename,
+                                       const char* retry_config,
+                                       int timeout_millisec,
+                                       TessResultRenderer* renderer) {
+  PERF_COUNT_START("ProcessPages")
+  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
+  if (stdInput) {
+#ifdef WIN32
+    if (_setmode(_fileno(stdin), _O_BINARY) == -1)
+      tprintf("ERROR: cin to binary: %s", strerror(errno));
+#endif  // WIN32
+  }
+
+  if (stream_filelist) {
+    return ProcessPagesFileList(stdin, nullptr, retry_config,
+                                timeout_millisec, renderer,
+                                tesseract_->tessedit_page_number);
+  }
+
+  // At this point we are officially in autodetection territory.
+  // That means any data in stdin must be buffered, to make it
+  // seekable.
+  std::string buf;
+  const l_uint8 *data = nullptr;
+  if (stdInput) {
+    buf.assign((std::istreambuf_iterator<char>(std::cin)),
+               (std::istreambuf_iterator<char>()));
+    data = reinterpret_cast<const l_uint8 *>(buf.data());
+  } else {
+    // Check whether the input file can be read.
+    if (FILE* file = fopen(filename, "rb")) {
+      fclose(file);
+    } else {
+      fprintf(stderr, "Error, cannot read input file %s: %s\n",
+              filename, strerror(errno));
+      return false;
+    }
+  }
+
+  // Here is our autodetection
+  int format;
+  int r = (stdInput) ?
+      findFileFormatBuffer(data, &format) :
+      findFileFormat(filename, &format);
+
+  // Maybe we have a filelist
+  if (r != 0 || format == IFF_UNKNOWN) {
+    STRING s;
+    if (stdInput) {
+      s = buf.c_str();
+    } else {
+      std::ifstream t(filename);
+      std::string u((std::istreambuf_iterator<char>(t)),
+                    std::istreambuf_iterator<char>());
+      s = u.c_str();
+    }
+    return ProcessPagesFileList(nullptr, &s, retry_config,
+                                timeout_millisec, renderer,
+                                tesseract_->tessedit_page_number);
+  }
+
+  // Maybe we have a TIFF which is potentially multipage
+  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
+               format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
+               format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
+               format == IFF_TIFF_ZIP);
+
+  // Fail early if we can, before producing any output
+  Pix *pix = nullptr;
+  if (!tiff) {
+    pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
+    if (pix == nullptr) {
+      return false;
+    }
+  }
+
+  // Begin the output
+  if (renderer && !renderer->BeginDocument(unknown_title_)) {
+    pixDestroy(&pix);
+    return false;
+  }
+
+  // Produce output
+  r = (tiff) ?
+      ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
+                                timeout_millisec, renderer,
+                                tesseract_->tessedit_page_number) :
+      ProcessPage(pix, 0, filename, retry_config,
+                  timeout_millisec, renderer);
+
+  // Clean up memory as needed
+  pixDestroy(&pix);
+
+  // End the output
+  if (!r || (renderer && !renderer->EndDocument())) {
+    return false;
+  }
+  PERF_COUNT_END
+  return true;
+}
+
+// Runs layout analysis and/or recognition on a single page image, according
+// to the page segmentation mode, and hands the result to renderer (if any).
+// A positive timeout_millisec sets a recognition deadline.
+// If the first attempt fails and retry_config is non-empty, recognition is
+// re-run once with that config and the previous variables are then restored.
+// Returns true if the page was processed (and rendered) successfully.
+bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
+                              const char* retry_config, int timeout_millisec,
+                              TessResultRenderer* renderer) {
+  PERF_COUNT_START("ProcessPage")
+  SetInputName(filename);
+  SetImage(pix);
+  bool failed = false;
+
+  if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
+    // Disabled character recognition
+    PageIterator* it = AnalyseLayout();
+
+    if (it == nullptr) {
+      failed = true;
+    } else {
+      delete it;
+    }
+  } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
+    failed = FindLines() != 0;
+  } else if (timeout_millisec > 0) {
+    // Running with a timeout.
+    ETEXT_DESC monitor;
+    monitor.cancel = nullptr;
+    monitor.cancel_this = nullptr;
+    monitor.set_deadline_msecs(timeout_millisec);
+
+    // Now run the main recognition.
+    failed = Recognize(&monitor) < 0;
+  } else {
+    // Normal layout and character recognition with no timeout.
+    failed = Recognize(nullptr) < 0;
+  }
+
+  if (tesseract_->tessedit_write_images) {
+#ifndef ANDROID_BUILD
+    Pix* page_pix = GetThresholdedImage();
+    pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
+    // GetThresholdedImage hands ownership to the caller; release it so the
+    // image is not leaked once per page.
+    pixDestroy(&page_pix);
+#endif  // ANDROID_BUILD
+  }
+
+  if (failed && retry_config != nullptr && retry_config[0] != '\0') {
+    // Save current config variables before switching modes.
+    FILE* fp = fopen(kOldVarsFile, "wb");
+    if (fp == nullptr) {
+      tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
+    } else {
+      PrintVariables(fp);
+      fclose(fp);
+    }
+    // Switch to alternate mode for retry.
+    ReadConfigFile(retry_config);
+    SetImage(pix);
+    Recognize(nullptr);
+    // Restore saved config variables.
+    ReadConfigFile(kOldVarsFile);
+  }
+
+  if (renderer && !failed) {
+    failed = !renderer->AddImage(this);
+  }
+
+  PERF_COUNT_END
+  return !failed;
+}
+
+/**
+ * Get a left-to-right iterator to the results of LayoutAnalysis and/or
+ * Recognize. The returned iterator must be deleted after use.
+ * Returns nullptr if there are no results yet.
+ */
+LTRResultIterator* TessBaseAPI::GetLTRIterator() {
+  if (tesseract_ == nullptr || page_res_ == nullptr)
+    return nullptr;
+  return new LTRResultIterator(
+      page_res_, tesseract_,
+      thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
+      rect_left_, rect_top_, rect_width_, rect_height_);
+}
+
+/**
+ * Get a reading-order iterator to the results of LayoutAnalysis and/or
+ * Recognize. The returned iterator must be deleted after use.
+ * Returns nullptr if there are no results yet.
+ * WARNING! This class points to data held within the TessBaseAPI class, and
+ * therefore can only be used while the TessBaseAPI class still exists and
+ * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+ * DetectOS, or anything else that changes the internal PAGE_RES.
+ */
+ResultIterator* TessBaseAPI::GetIterator() {
+  if (tesseract_ == nullptr || page_res_ == nullptr)
+    return nullptr;
+  return ResultIterator::StartOfParagraph(LTRResultIterator(
+      page_res_, tesseract_,
+      thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
+      rect_left_, rect_top_, rect_width_, rect_height_));
+}
+
+/**
+ * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
+ * The returned iterator must be deleted after use.
+ * Returns nullptr if there are no results yet.
+ * WARNING! This class points to data held within the TessBaseAPI class, and
+ * therefore can only be used while the TessBaseAPI class still exists and
+ * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+ * DetectOS, or anything else that changes the internal PAGE_RES.
+ */
+MutableIterator* TessBaseAPI::GetMutableIterator() {
+  if (tesseract_ == nullptr || page_res_ == nullptr)
+    return nullptr;
+  return new MutableIterator(page_res_, tesseract_,
+                             thresholder_->GetScaleFactor(),
+                             thresholder_->GetScaledYResolution(),
+                             rect_left_, rect_top_, rect_width_, rect_height_);
+}
+
+/**
+ * Make a text string from the internal data structures.
+ * Runs Recognize first if it has not been run yet.
+ * Returns nullptr on failure; otherwise the returned string must be freed
+ * with the delete [] operator.
+ */
+char* TessBaseAPI::GetUTF8Text() {
+  if (tesseract_ == nullptr ||
+      (!recognition_done_ && Recognize(nullptr) < 0))
+    return nullptr;
+  STRING text("");
+  ResultIterator *it = GetIterator();
+  // recognition_done_ can be set while page_res_ is null (Recognize clears
+  // it after an interactive session), in which case there is no iterator.
+  if (it == nullptr)
+    return nullptr;
+  do {
+    if (it->Empty(RIL_PARA)) continue;
+    const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
+    text += para_text.get();
+  } while (it->Next(RIL_PARA));
+  char* result = new char[text.length() + 1];
+  strncpy(result, text.string(), text.length() + 1);
+  delete it;
+  return result;
+}
+
+/**
+ * Gets the block orientation at the current iterator position.
+ */
+static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
+  tesseract::Orientation orientation;
+  tesseract::WritingDirection writing_direction;
+  tesseract::TextlineOrder textline_order;
+  float deskew_angle;
+  it->Orientation(&orientation, &writing_direction, &textline_order,
+                  &deskew_angle);
+  return orientation;
+}
+
+/**
+ * Fits a line to the baseline at the given level, and appends its coefficients
+ * to the hOCR string.
+ * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
+ * rotated textlines. For this reason, on textlines that are not upright, this
+ * method currently only inserts a 'textangle' property to indicate the rotation
+ * direction and does not add any baseline information to the hocr string.
+ */
+static void AddBaselineCoordsTohOCR(const PageIterator *it,
+                                    PageIteratorLevel level,
+                                    STRING* hocr_str) {
+  tesseract::Orientation orientation = GetBlockTextOrientation(it);
+  if (orientation != ORIENTATION_PAGE_UP) {
+    hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
+    return;
+  }
+
+  int left, top, right, bottom;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+
+  // Try to get the baseline coordinates at this level.
+  int x1, y1, x2, y2;
+  if (!it->Baseline(level, &x1, &y1, &x2, &y2))
+    return;
+  // Following the description of this field of the hOCR spec, we convert the
+  // baseline coordinates so that "the bottom left of the bounding box is the
+  // origin".
+  x1 -= left;
+  x2 -= left;
+  y1 -= bottom;
+  y2 -= bottom;
+
+  // Now fit a line through the points so we can extract coefficients for the
+  // equation:  y = p1 x + p0
+  double p1 = 0;
+  double p0 = 0;
+  if (x1 == x2) {
+    // Problem computing the polynomial coefficients.
+    return;
+  }
+  p1 = (y2 - y1) / static_cast<double>(x2 - x1);
+  p0 = y1 - static_cast<double>(p1 * x1);
+
+  // Emit both coefficients rounded to three decimal places.
+  hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
+  hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
+}
+
+// Appends " id='base_num1_num2'" to hocr_str, or " id='base_num1'" when num2
+// is negative.
+static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
+                        int num2) {
+  const size_t BUFSIZE = 64;
+  char id_buffer[BUFSIZE];
+  if (num2 >= 0) {
+    snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
+  } else {
+    snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
+  }
+  id_buffer[BUFSIZE - 1] = '\0';
+  *hocr_str += " id='";
+  *hocr_str += id_buffer;
+  *hocr_str += "'";
+}
+
+// Appends " id='base_num1_num2_num3'" to hocr_str.
+static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
+                        int num2, int num3) {
+  const size_t BUFSIZE = 64;
+  char id_buffer[BUFSIZE];
+  snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,
+           num3);
+  id_buffer[BUFSIZE - 1] = '\0';
+  *hocr_str += " id='";
+  *hocr_str += id_buffer;
+  *hocr_str += "'";
+}
+
+// Appends the title attribute holding the bounding box of the element at the
+// given level and closes the opening tag. For textlines the baseline and the
+// row size/descender/ascender measures are appended too.
+static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
+                         STRING* hocr_str) {
+  int left, top, right, bottom;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+  // This is the only place we use double quotes instead of single quotes,
+  // but it may be too late to change for consistency
+  hocr_str->add_str_int(" title=\"bbox ", left);
+  hocr_str->add_str_int(" ", top);
+  hocr_str->add_str_int(" ", right);
+  hocr_str->add_str_int(" ", bottom);
+  // Add baseline coordinates & heights for textlines only.
+  if (level == RIL_TEXTLINE) {
+    AddBaselineCoordsTohOCR(it, level, hocr_str);
+    // add custom height measures
+    float row_height, descenders, ascenders;  // row attributes
+    it->RowAttributes(&row_height, &descenders, &ascenders);
+    // TODO(rays): Do we want to limit these to a single decimal place?
+    hocr_str->add_str_double("; x_size ", row_height);
+    hocr_str->add_str_double("; x_descenders ", descenders * -1);
+    hocr_str->add_str_double("; x_ascenders ", ascenders);
+  }
+  *hocr_str += "\">";
+}
+
+// Appends the tab-separated left, top, width and height of the element at the
+// given level to the TSV string (the parameter keeps its historical name).
+static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
+                        STRING* hocr_str) {
+  int left, top, right, bottom;
+  it->BoundingBox(level, &left, &top, &right, &bottom);
+  hocr_str->add_str_int("\t", left);
+  hocr_str->add_str_int("\t", top);
+  hocr_str->add_str_int("\t", right - left);
+  hocr_str->add_str_int("\t", bottom - top);
+}
+
+/**
+ * Make a HTML-formatted string with hOCR markup from the internal
+ * data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Image name/input_file_ can be set by SetInputName before calling
+ * GetHOCRText
+ * STL removed from original patch submission and refactored by rays.
+ * Returned string must be freed with the delete [] operator.
+ */
+char* TessBaseAPI::GetHOCRText(int page_number) {
+  return GetHOCRText(nullptr, page_number);
+}
+
+
+/* willus mod */
+/**
+ * Collects the bounding box, mean baseline height and UTF-8 text of every
+ * non-empty word in the current results.
+ * On return *x00, *y00, *x11, *y11 and *ybaseline0 all point into ONE
+ * malloc'ed array (its start is *x00) holding, per word, the left, top,
+ * right and bottom of the bounding box and the average of the two baseline
+ * end-point y values. *utf8words is a second malloc'ed buffer holding the
+ * words as consecutive NUL-terminated strings.
+ * The caller releases the results with free(*x00) and free(*utf8words).
+ * Returns the number of words. If there are no results, no words, or an
+ * allocation fails, all outputs are set to nullptr and 0 is returned.
+ */
+int TessBaseAPI::GetOCRWords(int **x00, int **y00, int **x11, int **y11,
+                             int **ybaseline0, char **utf8words) {
+  *x00 = *y00 = *x11 = *y11 = *ybaseline0 = nullptr;
+  *utf8words = nullptr;
+
+  ResultIterator *res_it = GetIterator();
+  if (res_it == nullptr)
+    return 0;
+
+  // First pass: count the words and the bytes needed for their text.
+  int nwords = 0;
+  size_t totlen = 0;
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+    const std::unique_ptr<const char[]> word(res_it->GetUTF8Text(RIL_WORD));
+    totlen += (word ? strlen(word.get()) : 0) + 1;
+    nwords++;
+    res_it->Next(RIL_WORD);
+  }
+  if (nwords == 0) {
+    delete res_it;
+    return 0;
+  }
+
+  // One array carries all five per-word columns, laid out back to back.
+  int *x0 = (int *)malloc(sizeof(int) * 5 * nwords);
+  char *tutf8 = (char *)malloc(totlen);
+  if (x0 == nullptr || tutf8 == nullptr) {
+    free(x0);
+    free(tutf8);
+    delete res_it;
+    return 0;
+  }
+  int *y0 = &x0[nwords];
+  int *x1 = &y0[nwords];
+  int *y1 = &x1[nwords];
+  int *ybaseline = &y1[nwords];
+
+  // Second pass: fill in the boxes, baselines and text.
+  int iword = 0;
+  size_t it8 = 0;
+  res_it->Begin();
+  while (iword < nwords && !res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+    const std::unique_ptr<const char[]> word(res_it->GetUTF8Text(RIL_WORD));
+    const char *text = word ? word.get() : "";
+    const size_t len = strlen(text) + 1;
+    if (it8 + len > totlen)
+      break;  // Never write past the buffer sized by the first pass.
+    memcpy(&tutf8[it8], text, len);
+    it8 += len;
+
+    int left, top, right, bottom;
+    int u1, v1, u2, v2;
+    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+    if (!res_it->Baseline(RIL_WORD, &u1, &v1, &u2, &v2)) {
+      // No baseline available: fall back to the bottom of the box rather
+      // than reading uninitialized values.
+      v1 = v2 = bottom;
+    }
+    x0[iword] = left;
+    x1[iword] = right;
+    y0[iword] = top;
+    y1[iword] = bottom;
+    ybaseline[iword] = (v1 + v2) / 2;
+    iword++;
+    res_it->Next(RIL_WORD);
+  }
+  delete res_it;
+
+  *x00 = x0;
+  *y00 = y0;
+  *x11 = x1;
+  *y11 = y1;
+  *ybaseline0 = ybaseline;
+  *utf8words = tutf8;
+  return iword;
+}
+
+/**
+ * Make a HTML-formatted string with hOCR markup from the internal
+ * data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Image name/input_file_ can be set by SetInputName before calling
+ * GetHOCRText
+ * STL removed from original patch submission and refactored by rays.
+ * Returned string must be freed with the delete [] operator.
+ * page_number is 0-based but will appear in the output as 1-based. + * Image name/input_file_ can be set by SetInputName before calling + * GetHOCRText + * STL removed from original patch submission and refactored by rays. + * Returned string must be freed with the delete [] operator. + */ +char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) + return nullptr; + + int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1; + int page_id = page_number + 1; // hOCR uses 1-based page numbers. + bool para_is_ltr = true; // Default direction is LTR + const char* paragraph_lang = nullptr; + bool font_info = false; + GetBoolVariable("hocr_font_info", &font_info); + + STRING hocr_str(""); + + if (input_file_ == nullptr) + SetInputName(nullptr); + +#ifdef _WIN32 + // convert input name from ANSI encoding to utf-8 + int str16_len = + MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); + wchar_t *uni16_str = new WCHAR[str16_len]; + str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, + uni16_str, str16_len); + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, + nullptr, nullptr); + char *utf8_str = new char[utf8_len]; + WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, + utf8_len, nullptr, nullptr); + *input_file_ = utf8_str; + delete[] uni16_str; + delete[] utf8_str; +#endif + + hocr_str += "

string()); + } else { + hocr_str += "unknown"; + } + hocr_str.add_str_int("\"; bbox ", rect_left_); + hocr_str.add_str_int(" ", rect_top_); + hocr_str.add_str_int(" ", rect_width_); + hocr_str.add_str_int(" ", rect_height_); + hocr_str.add_str_int("; ppageno ", page_number); + hocr_str += "'>\n"; + + ResultIterator *res_it = GetIterator(); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + // Open any new block/paragraph/textline. + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + para_is_ltr = true; // reset to default direction + hocr_str += "

IsAtBeginningOf(RIL_PARA)) { + hocr_str += "\n

ParagraphIsLtr(); + if (!para_is_ltr) { + hocr_str += " dir='rtl'"; + } + AddIdTohOCR(&hocr_str, "par", page_id, pcnt); + paragraph_lang = res_it->WordRecognitionLanguage(); + if (paragraph_lang) { + hocr_str += " lang='"; + hocr_str += paragraph_lang; + hocr_str += "'"; + } + AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); + } + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + hocr_str += "\n >>* confidencemap = nullptr; + if (tesseract_->lstm_choice_mode) { + confidencemap = res_it->GetBestLSTMSymbolChoices(); + } + hocr_str += "\n BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, + &monospace, &serif, &smallcaps, + &pointsize, &font_id); + hocr_str.add_str_int(" title='bbox ", left); + hocr_str.add_str_int(" ", top); + hocr_str.add_str_int(" ", right); + hocr_str.add_str_int(" ", bottom); + hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD)); + if (font_info) { + if (font_name) { + hocr_str += "; x_font "; + hocr_str += HOcrEscape(font_name); + } + hocr_str.add_str_int("; x_fsize ", pointsize); + } + hocr_str += "'"; + const char* lang = res_it->WordRecognitionLanguage(); + if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { + hocr_str += " lang='"; + hocr_str += lang; + hocr_str += "'"; + } + switch (res_it->WordDirection()) { + // Only emit direction if different from current paragraph direction + case DIR_LEFT_TO_RIGHT: + if (!para_is_ltr) hocr_str += " dir='ltr'"; + break; + case DIR_RIGHT_TO_LEFT: + if (para_is_ltr) hocr_str += " dir='rtl'"; + break; + case DIR_MIX: + case DIR_NEUTRAL: + default: // Do nothing. 
+ break; + } + hocr_str += ">"; + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + if (bold) hocr_str += ""; + if (italic) hocr_str += ""; + do { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + hocr_str += HOcrEscape(grapheme.get()); + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + if (italic) hocr_str += ""; + if (bold) hocr_str += ""; + // If the lstm choice mode is required it is added here + if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr) { + for (size_t i = 0; i < confidencemap->size(); i++) { + hocr_str += "\n > timestep = (*confidencemap)[i]; + for (std::pair conf : timestep) { + hocr_str += "lstm_choice_mode == 2 && confidencemap != nullptr) { + for (size_t i = 0; i < confidencemap->size(); i++) { + std::vector> timestep = (*confidencemap)[i]; + if (timestep.size() > 0) { + hocr_str += "\n Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + // Add rows for any new block/paragraph/textline. 
+ if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + block_num++; + par_num = 0; + line_num = 0; + word_num = 0; + tsv_str.add_str_int("2\t", page_num); // level 2 - block + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for block + } + if (res_it->IsAtBeginningOf(RIL_PARA)) { + par_num++; + line_num = 0; + word_num = 0; + tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_PARA, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for para + } + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + line_num++; + word_num = 0; + tsv_str.add_str_int("4\t", page_num); // level 4 - line + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str); + tsv_str += "\t-1\t\n"; // end of row for line + } + + // Now, process the word... + int left, top, right, bottom; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + word_num++; + tsv_str.add_str_int("5\t", page_num); // level 5 - word + tsv_str.add_str_int("\t", block_num); + tsv_str.add_str_int("\t", par_num); + tsv_str.add_str_int("\t", line_num); + tsv_str.add_str_int("\t", word_num); + tsv_str.add_str_int("\t", left); + tsv_str.add_str_int("\t", top); + tsv_str.add_str_int("\t", right - left); + tsv_str.add_str_int("\t", bottom - top); + tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD)); + tsv_str += "\t"; + + // Increment counts if at end of block/paragraph/textline. 
+ if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++; + if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++; + if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++; + + do { + tsv_str += + std::unique_ptr(res_it->GetUTF8Text(RIL_SYMBOL)).get(); + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + tsv_str += "\n"; // end of row + wcnt++; + } + + char* ret = new char[tsv_str.length() + 1]; + strcpy(ret, tsv_str.string()); + delete res_it; + return ret; +} + +/** The 5 numbers output for each box (the usual 4 and a page number.) */ +const int kNumbersPerBlob = 5; +/** + * The number of bytes taken by each number. Since we use int16_t for ICOORD, + * assume only 5 digits max. + */ +const int kBytesPerNumber = 5; +/** + * Multiplier for max expected textlength assumes (kBytesPerNumber + space) + * * kNumbersPerBlob plus the newline. Add to this the + * original UTF8 characters, and one kMaxBytesPerLine for safety. + */ +const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; +/** Max bytes in the decimal representation of int64_t. */ +const int kBytesPer64BitNumber = 20; +/** + * A maximal single box could occupy kNumbersPerBlob numbers at + * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a + * space plus the newline and the maximum length of a UNICHAR. + * Test against this on each iteration for safety. + */ +const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + + UNICHAR_LEN; + +/** + * The recognized text is returned as a char* which is coded + * as a UTF8 box file. + * page_number is a 0-base page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. 
+ */ +char* TessBaseAPI::GetBoxText(int page_number) { + if (tesseract_ == nullptr || + (!recognition_done_ && Recognize(nullptr) < 0)) + return nullptr; + int blob_count; + int utf8_length = TextLength(&blob_count); + int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + + kMaxBytesPerLine; + char* result = new char[total_length]; + result[0] = '\0'; + int output_length = 0; + LTRResultIterator* it = GetLTRIterator(); + do { + int left, top, right, bottom; + if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { + const std::unique_ptr text( + it->GetUTF8Text(RIL_SYMBOL)); + // Tesseract uses space for recognition failure. Fix to a reject + // character, kTesseractReject so we don't create illegal box files. + for (int i = 0; text[i] != '\0'; ++i) { + if (text[i] == ' ') + text[i] = kTesseractReject; + } + snprintf(result + output_length, total_length - output_length, + "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom, + right, image_height_ - top, page_number); + output_length += strlen(result + output_length); + // Just in case... + if (output_length + kMaxBytesPerLine > total_length) + break; + } + } while (it->Next(RIL_SYMBOL)); + delete it; + return result; +} + +/** + * Conversion table for non-latin characters. + * Maps characters out of the latin set into the latin set. + * TODO(rays) incorporate this translation into unicharset. + */ +const int kUniChs[] = { + 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 +}; +/** Latin chars corresponding to the unicode chars above. */ +const int kLatinChs[] = { + 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 +}; + +/** + * The recognized text is returned as a char* which is coded + * as UNLV format Latin-1 with specific reject and suspect codes. + * Returned string must be freed with the delete [] operator. 
+ */ +char* TessBaseAPI::GetUNLVText() { + if (tesseract_ == nullptr || + (!recognition_done_ && Recognize(nullptr) < 0)) + return nullptr; + bool tilde_crunch_written = false; + bool last_char_was_newline = true; + bool last_char_was_tilde = false; + + int total_length = TextLength(nullptr); + PAGE_RES_IT page_res_it(page_res_); + char* result = new char[total_length]; + char* ptr = result; + for (page_res_it.restart_page(); page_res_it.word () != nullptr; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + // Process the current word. + if (word->unlv_crunch_mode != CR_NONE) { + if (word->unlv_crunch_mode != CR_DELETE && + (!tilde_crunch_written || + (word->unlv_crunch_mode == CR_KEEP_SPACE && + word->word->space() > 0 && + !word->word->flag(W_FUZZY_NON) && + !word->word->flag(W_FUZZY_SP)))) { + if (!word->word->flag(W_BOL) && + word->word->space() > 0 && + !word->word->flag(W_FUZZY_NON) && + !word->word->flag(W_FUZZY_SP)) { + /* Write a space to separate from preceding good text */ + *ptr++ = ' '; + last_char_was_tilde = false; + } + if (!last_char_was_tilde) { + // Write a reject char. + last_char_was_tilde = true; + *ptr++ = kUNLVReject; + tilde_crunch_written = true; + last_char_was_newline = false; + } + } + } else { + // NORMAL PROCESSING of non tilde crunched words. + tilde_crunch_written = false; + tesseract_->set_unlv_suspects(word); + const char* wordstr = word->best_choice->unichar_string().string(); + const STRING& lengths = word->best_choice->unichar_lengths(); + int length = lengths.length(); + int i = 0; + int offset = 0; + + if (last_char_was_tilde && + word->word->space() == 0 && wordstr[offset] == ' ') { + // Prevent adjacent tilde across words - we know that adjacent tildes + // within words have been removed. + // Skip the first character. 
+ offset = lengths[i++]; + } + if (i < length && wordstr[offset] != 0) { + if (!last_char_was_newline) + *ptr++ = ' '; + else + last_char_was_newline = false; + for (; i < length; offset += lengths[i++]) { + if (wordstr[offset] == ' ' || + wordstr[offset] == kTesseractReject) { + *ptr++ = kUNLVReject; + last_char_was_tilde = true; + } else { + if (word->reject_map[i].rejected()) + *ptr++ = kUNLVSuspect; + UNICHAR ch(wordstr + offset, lengths[i]); + int uni_ch = ch.first_uni(); + for (int j = 0; kUniChs[j] != 0; ++j) { + if (kUniChs[j] == uni_ch) { + uni_ch = kLatinChs[j]; + break; + } + } + if (uni_ch <= 0xff) { + *ptr++ = static_cast(uni_ch); + last_char_was_tilde = false; + } else { + *ptr++ = kUNLVReject; + last_char_was_tilde = true; + } + } + } + } + } + if (word->word->flag(W_EOL) && !last_char_was_newline) { + /* Add a new line output */ + *ptr++ = '\n'; + tilde_crunch_written = false; + last_char_was_newline = true; + last_char_was_tilde = false; + } + } + *ptr++ = '\n'; + *ptr = '\0'; + return result; +} + +#ifndef DISABLED_LEGACY_ENGINE + +/** + * Detect the orientation of the input image and apparent script (alphabet). + * orient_deg is the detected clockwise rotation of the input image in degrees + * (0, 90, 180, 270) + * orient_conf is the confidence (15.0 is reasonably confident) + * script_name is an ASCII string, the name of the script, e.g. 
"Latin" + * script_conf is confidence level in the script + * Returns true on success and writes values to each parameter as an output + */ +bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf, + const char** script_name, + float* script_conf) { + OSResults osr; + + bool osd = DetectOS(&osr); + if (!osd) { + return false; + } + + int orient_id = osr.best_result.orientation_id; + int script_id = osr.get_best_script(orient_id); + if (orient_conf) *orient_conf = osr.best_result.oconfidence; + if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees + + if (script_name) { + const char* script = osr.unicharset->get_script_from_script_id(script_id); + + *script_name = script; + } + + if (script_conf) *script_conf = osr.best_result.sconfidence; + + return true; +} + +/** + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + * page_number is a 0-based page index that will appear in the osd file. + */ +char* TessBaseAPI::GetOsdText(int page_number) { + int orient_deg; + float orient_conf; + const char* script_name; + float script_conf; + + if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, + &script_conf)) + return nullptr; + + // clockwise rotation needed to make the page upright + int rotate = OrientationIdToValue(orient_deg / 90); + + const int kOsdBufsize = 255; + char* osd_buf = new char[kOsdBufsize]; + snprintf(osd_buf, kOsdBufsize, + "Page number: %d\n" + "Orientation in degrees: %d\n" + "Rotate: %d\n" + "Orientation confidence: %.2f\n" + "Script: %s\n" + "Script confidence: %.2f\n", + page_number, orient_deg, rotate, orient_conf, script_name, + script_conf); + + return osd_buf; +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +/** Returns the average word confidence for Tesseract page result. 
*/ +int TessBaseAPI::MeanTextConf() { + int* conf = AllWordConfidences(); + if (!conf) return 0; + int sum = 0; + int *pt = conf; + while (*pt >= 0) sum += *pt++; + if (pt != conf) sum /= pt - conf; + delete [] conf; + return sum; +} + +/** Returns an array of all word confidences, terminated by -1. */ +int* TessBaseAPI::AllWordConfidences() { + if (tesseract_ == nullptr || + (!recognition_done_ && Recognize(nullptr) < 0)) + return nullptr; + int n_word = 0; + PAGE_RES_IT res_it(page_res_); + for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) + n_word++; + + int* conf = new int[n_word+1]; + n_word = 0; + for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { + WERD_RES *word = res_it.word(); + WERD_CHOICE* choice = word->best_choice; + int w_conf = static_cast(100 + 5 * choice->certainty()); + // This is the eq for converting Tesseract confidence to 1..100 + if (w_conf < 0) w_conf = 0; + if (w_conf > 100) w_conf = 100; + conf[n_word++] = w_conf; + } + conf[n_word] = -1; + return conf; +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Applies the given word to the adaptive classifier if possible. + * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can + * tell the boundaries of the graphemes. + * Assumes that SetImage/SetRectangle have been used to set the image + * to the given word. The mode arg should be PSM_SINGLE_WORD or + * PSM_CIRCLE_WORD, as that will be used to control layout analysis. + * The currently set PageSegMode is preserved. + * Returns false if adaption was not possible for some reason. 
+ */ +bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) { + int debug = 0; + GetIntVariable("applybox_debug", &debug); + bool success = true; + PageSegMode current_psm = GetPageSegMode(); + SetPageSegMode(mode); + SetVariable("classify_enable_learning", "0"); + const std::unique_ptr text(GetUTF8Text()); + if (debug) { + tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr); + } + if (text != nullptr) { + PAGE_RES_IT it(page_res_); + WERD_RES* word_res = it.word(); + if (word_res != nullptr) { + word_res->word->set_text(wordstr); + // Check to see if text matches wordstr. + int w = 0; + int t; + for (t = 0; text[t] != '\0'; ++t) { + if (text[t] == '\n' || text[t] == ' ') + continue; + while (wordstr[w] == ' ') ++w; + if (text[t] != wordstr[w]) + break; + ++w; + } + if (text[t] != '\0' || wordstr[w] != '\0') { + // No match. + delete page_res_; + GenericVector boxes; + page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); + tesseract_->ReSegmentByClassification(page_res_); + tesseract_->TidyUp(page_res_); + PAGE_RES_IT pr_it(page_res_); + if (pr_it.word() == nullptr) + success = false; + else + word_res = pr_it.word(); + } else { + word_res->BestChoiceToCorrectText(); + } + if (success) { + tesseract_->EnableLearning = true; + tesseract_->LearnWord(nullptr, word_res); + } + } else { + success = false; + } + } else { + success = false; + } + SetPageSegMode(current_psm); + return success; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** + * Free up recognition results and any stored image data, without actually + * freeing any recognition data that would be time-consuming to reload. + * Afterwards, you must call SetImage or TesseractRect before doing + * any Recognize or Get* operation. + */ +void TessBaseAPI::Clear() { + if (thresholder_ != nullptr) + thresholder_->Clear(); + ClearResults(); + if (tesseract_ != nullptr) SetInputImage(nullptr); +} + +/** + * Close down tesseract and free up all memory. 
End() is equivalent to + * destructing and reconstructing your TessBaseAPI. + * Once End() has been used, none of the other API functions may be used + * other than Init and anything declared above it in the class definition. + */ +void TessBaseAPI::End() { + Clear(); + delete thresholder_; + thresholder_ = nullptr; + delete page_res_; + page_res_ = nullptr; + delete block_list_; + block_list_ = nullptr; + if (paragraph_models_ != nullptr) { + paragraph_models_->delete_data_pointers(); + delete paragraph_models_; + paragraph_models_ = nullptr; + } + if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr; + delete tesseract_; + tesseract_ = nullptr; + delete osd_tesseract_; + osd_tesseract_ = nullptr; + delete equ_detect_; + equ_detect_ = nullptr; + delete input_file_; + input_file_ = nullptr; + delete output_file_; + output_file_ = nullptr; + delete datapath_; + datapath_ = nullptr; + delete language_; + language_ = nullptr; +} + +// Clear any library-level memory caches. +// There are a variety of expensive-to-load constant data structures (mostly +// language dictionaries) that are cached globally -- surviving the Init() +// and End() of individual TessBaseAPI's. This function allows the clearing +// of these caches. +void TessBaseAPI::ClearPersistentCache() { + Dict::GlobalDawgCache()->DeleteUnusedDawgs(); +} + +/** + * Check whether a word is valid according to Tesseract's language model + * returns 0 if the word is invalid, non-zero if valid + */ +int TessBaseAPI::IsValidWord(const char *word) { + return tesseract_->getDict().valid_word(word); +} +// Returns true if utf8_character is defined in the UniCharset. +bool TessBaseAPI::IsValidCharacter(const char *utf8_character) { + return tesseract_->unicharset.contains_unichar(utf8_character); +} + + +// TODO(rays) Obsolete this function and replace with a more aptly named +// function that returns image coordinates rather than tesseract coordinates. 
+bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { + PageIterator* it = AnalyseLayout(); + if (it == nullptr) { + return false; + } + int x1, x2, y1, y2; + it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); + // Calculate offset and slope (NOTE: Kind of ugly) + if (x2 <= x1) x2 = x1 + 1; + // Convert the point pair to slope/offset of the baseline (in image coords.) + *out_slope = static_cast(y2 - y1) / (x2 - x1); + *out_offset = static_cast(y1 - *out_slope * x1); + // Get the y-coord of the baseline at the left and right edges of the + // textline's bounding box. + int left, top, right, bottom; + if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) { + delete it; + return false; + } + int left_y = IntCastRounded(*out_slope * left + *out_offset); + int right_y = IntCastRounded(*out_slope * right + *out_offset); + // Shift the baseline down so it passes through the nearest bottom-corner + // of the textline's bounding box. This is the difference between the y + // at the lowest (max) edge of the box and the actual box bottom. + *out_offset += bottom - std::max(left_y, right_y); + // Switch back to bottom-up tesseract coordinates. Requires negation of + // the slope and height - offset for the offset. + *out_slope = -*out_slope; + *out_offset = rect_height_ - *out_offset; + delete it; + + return true; +} + +/** Sets Dict::letter_is_okay_ function to point to the given function. */ +void TessBaseAPI::SetDictFunc(DictFunc f) { + if (tesseract_ != nullptr) { + tesseract_->getDict().letter_is_okay_ = f; + } +} + +/** + * Sets Dict::probability_in_context_ function to point to the given + * function. + * + * @param f A single function that returns the probability of the current + * "character" (in general a utf-8 string), given the context of a previous + * utf-8 string. 
+ */ +void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { + if (tesseract_ != nullptr) { + tesseract_->getDict().probability_in_context_ = f; + // Set it for the sublangs too. + int num_subs = tesseract_->num_sub_langs(); + for (int i = 0; i < num_subs; ++i) { + tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; + } + } +} + +#ifndef DISABLED_LEGACY_ENGINE +/** Sets Wordrec::fill_lattice_ function to point to the given function. */ +void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) { + if (tesseract_ != nullptr) tesseract_->fill_lattice_ = f; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +/** Common code for setting the image. */ +bool TessBaseAPI::InternalSetImage() { + if (tesseract_ == nullptr) { + tprintf("Please call Init before attempting to set an image.\n"); + return false; + } + if (thresholder_ == nullptr) + thresholder_ = new ImageThresholder; + ClearResults(); + return true; +} + +/** + * Run the thresholder to make the thresholded image, returned in pix, + * which must not be nullptr. *pix must be initialized to nullptr, or point + * to an existing pixDestroyable Pix. + * The usual argument to Threshold is Tesseract::mutable_pix_binary(). + */ +bool TessBaseAPI::Threshold(Pix** pix) { + ASSERT_HOST(pix != nullptr); + if (*pix != nullptr) + pixDestroy(pix); + // Zero resolution messes up the algorithms, so make sure it is credible. 
+ int user_dpi = 0; + bool a = GetIntVariable("user_defined_dpi", &user_dpi); + int y_res = thresholder_->GetScaledYResolution(); + if (user_dpi && (user_dpi < kMinCredibleResolution || + user_dpi > kMaxCredibleResolution)) { + tprintf("Warning: User defined image dpi is outside of expected range " + "(%d - %d)!\n", + kMinCredibleResolution, kMaxCredibleResolution); + } + // Always use user defined dpi + if (user_dpi) { + thresholder_->SetSourceYResolution(user_dpi); + } else if (y_res < kMinCredibleResolution || + y_res > kMaxCredibleResolution) { + tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n", + y_res, kMinCredibleResolution); + thresholder_->SetSourceYResolution(kMinCredibleResolution); + } + PageSegMode pageseg_mode = + static_cast( + static_cast(tesseract_->tessedit_pageseg_mode)); + if (!thresholder_->ThresholdToPix(pageseg_mode, pix)) return false; + thresholder_->GetImageSizes(&rect_left_, &rect_top_, + &rect_width_, &rect_height_, + &image_width_, &image_height_); + if (!thresholder_->IsBinary()) { + tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds()); + tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); + } else { + tesseract_->set_pix_thresholds(nullptr); + tesseract_->set_pix_grey(nullptr); + } + // Set the internal resolution that is used for layout parameters from the + // estimated resolution, rather than the image resolution, which may be + // fabricated, but we will use the image resolution, if there is one, to + // report output point sizes. + int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), + kMinCredibleResolution, + kMaxCredibleResolution); + if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { + tprintf("Estimated internal resolution %d out of range! 
" + "Corrected to %d.\n", + thresholder_->GetScaledEstimatedResolution(), estimated_res); + } + tesseract_->set_source_resolution(estimated_res); + SavePixForCrash(estimated_res, *pix); + return true; +} + +/** Find lines from the image making the BLOCK_LIST. */ +int TessBaseAPI::FindLines() { + if (thresholder_ == nullptr || thresholder_->IsEmpty()) { + tprintf("Please call SetImage before attempting recognition.\n"); + return -1; + } + if (recognition_done_) + ClearResults(); + if (!block_list_->empty()) { + return 0; + } + if (tesseract_ == nullptr) { + tesseract_ = new Tesseract; + #ifndef DISABLED_LEGACY_ENGINE + tesseract_->InitAdaptiveClassifier(nullptr); + #endif + } + if (tesseract_->pix_binary() == nullptr && + !Threshold(tesseract_->mutable_pix_binary())) { + return -1; + } + + tesseract_->PrepareForPageseg(); + +#ifndef DISABLED_LEGACY_ENGINE + if (tesseract_->textord_equation_detect) { + if (equ_detect_ == nullptr && datapath_ != nullptr) { + equ_detect_ = new EquationDetect(datapath_->string(), nullptr); + } + if (equ_detect_ == nullptr) { + tprintf("Warning: Could not set equation detector\n"); + } else { + tesseract_->SetEquationDetect(equ_detect_); + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + Tesseract* osd_tess = osd_tesseract_; + OSResults osr; + if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && + osd_tess == nullptr) { + if (strcmp(language_->string(), "osd") == 0) { + osd_tess = tesseract_; + } else { + osd_tesseract_ = new Tesseract; + TessdataManager mgr(reader_); + if (datapath_ == nullptr) { + tprintf("Warning: Auto orientation and script detection requested," + " but data path is undefined\n"); + delete osd_tesseract_; + osd_tesseract_ = nullptr; + } else if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr, + "osd", OEM_TESSERACT_ONLY, + nullptr, 0, nullptr, nullptr, + false, &mgr) == 0) { + osd_tess = osd_tesseract_; + osd_tesseract_->set_source_resolution( + thresholder_->GetSourceYResolution()); + } else { + 
tprintf("Warning: Auto orientation and script detection requested," + " but osd language failed to load\n"); + delete osd_tesseract_; + osd_tesseract_ = nullptr; + } + } + } + + if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0) + return -1; + + // If Devanagari is being recognized, we use different images for page seg + // and for OCR. + tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); + return 0; +} + +/** Delete the pageres and clear the block list ready for a new page. */ +void TessBaseAPI::ClearResults() { + if (tesseract_ != nullptr) { + tesseract_->Clear(); + } + delete page_res_; + page_res_ = nullptr; + recognition_done_ = false; + if (block_list_ == nullptr) + block_list_ = new BLOCK_LIST; + else + block_list_->clear(); + if (paragraph_models_ != nullptr) { + paragraph_models_->delete_data_pointers(); + delete paragraph_models_; + paragraph_models_ = nullptr; + } + SavePixForCrash(0, nullptr); +} + +/** + * Return the length of the output text string, as UTF8, assuming + * liberally two spacing marks after each word (as paragraphs end with two + * newlines), and assuming a single character reject marker for each rejected + * character. + * Also return the number of recognized blobs in blob_count. + */ +int TessBaseAPI::TextLength(int* blob_count) { + if (tesseract_ == nullptr || page_res_ == nullptr) + return 0; + + PAGE_RES_IT page_res_it(page_res_); + int total_length = 2; + int total_blobs = 0; + // Iterate over the data structures to extract the recognition result. 
+ for (page_res_it.restart_page(); page_res_it.word () != nullptr; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + WERD_CHOICE* choice = word->best_choice; + if (choice != nullptr) { + total_blobs += choice->length() + 2; + total_length += choice->unichar_string().length() + 2; + for (int i = 0; i < word->reject_map.length(); ++i) { + if (word->reject_map[i].rejected()) + ++total_length; + } + } + } + if (blob_count != nullptr) + *blob_count = total_blobs; + return total_length; +} + +#ifndef DISABLED_LEGACY_ENGINE +/** + * Estimates the Orientation And Script of the image. + * Returns true if the image was processed successfully. + */ +bool TessBaseAPI::DetectOS(OSResults* osr) { + if (tesseract_ == nullptr) + return false; + ClearResults(); + if (tesseract_->pix_binary() == nullptr && + !Threshold(tesseract_->mutable_pix_binary())) { + return false; + } + + if (input_file_ == nullptr) + input_file_ = new STRING(kInputFile); + return orientation_and_script_detection(*input_file_, osr, tesseract_) > 0; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +void TessBaseAPI::set_min_orientation_margin(double margin) { + tesseract_->min_orientation_margin.set_value(margin); +} + +/** + * Return text orientation of each block as determined in an earlier page layout + * analysis operation. Orientation is returned as the number of ccw 90-degree + * rotations (in [0..3]) required to make the text in the block upright + * (readable). Note that this may not necessary be the block orientation + * preferred for recognition (such as the case of vertical CJK text). + * + * Also returns whether the text in the block is believed to have vertical + * writing direction (when in an upright page orientation). + * + * The returned array is of length equal to the number of text blocks, which may + * be less than the total number of blocks. The ordering is intended to be + * consistent with GetTextLines(). 
+ */ +void TessBaseAPI::GetBlockTextOrientations(int** block_orientation, + bool** vertical_writing) { + delete[] *block_orientation; + *block_orientation = nullptr; + delete[] *vertical_writing; + *vertical_writing = nullptr; + BLOCK_IT block_it(block_list_); + + block_it.move_to_first(); + int num_blocks = 0; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + if (!block_it.data()->pdblk.poly_block()->IsText()) { + continue; + } + ++num_blocks; + } + if (!num_blocks) { + tprintf("WARNING: Found no blocks\n"); + return; + } + *block_orientation = new int[num_blocks]; + *vertical_writing = new bool[num_blocks]; + block_it.move_to_first(); + int i = 0; + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); + block_it.forward()) { + if (!block_it.data()->pdblk.poly_block()->IsText()) { + continue; + } + FCOORD re_rotation = block_it.data()->re_rotation(); + float re_theta = re_rotation.angle(); + FCOORD classify_rotation = block_it.data()->classify_rotation(); + float classify_theta = classify_rotation.angle(); + double rot_theta = - (re_theta - classify_theta) * 2.0 / M_PI; + if (rot_theta < 0) rot_theta += 4; + int num_rotations = static_cast(rot_theta + 0.5); + (*block_orientation)[i] = num_rotations; + // The classify_rotation is non-zero only if the text has vertical + // writing direction. 
+ (*vertical_writing)[i] = classify_rotation.y() != 0.0f; + ++i; + } +} + + +void TessBaseAPI::DetectParagraphs(bool after_text_recognition) { + int debug_level = 0; + GetIntVariable("paragraph_debug_level", &debug_level); + if (paragraph_models_ == nullptr) + paragraph_models_ = new GenericVector; + MutableIterator *result_it = GetMutableIterator(); + do { // Detect paragraphs for this block + GenericVector models; + ::tesseract::DetectParagraphs(debug_level, after_text_recognition, + result_it, &models); + *paragraph_models_ += models; + } while (result_it->Next(RIL_BLOCK)); + delete result_it; +} + +/** This method returns the string form of the specified unichar. */ +const char* TessBaseAPI::GetUnichar(int unichar_id) { + return tesseract_->unicharset.id_to_unichar(unichar_id); +} + +/** Return the pointer to the i-th dawg loaded into tesseract_ object. */ +const Dawg *TessBaseAPI::GetDawg(int i) const { + if (tesseract_ == nullptr || i >= NumDawgs()) return nullptr; + return tesseract_->getDict().GetDawg(i); +} + +/** Return the number of dawgs loaded into tesseract_ object. */ +int TessBaseAPI::NumDawgs() const { + return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs(); +} + +/** Escape a char string - remove <>&"' with HTML codes. */ +STRING HOcrEscape(const char* text) { + STRING ret; + const char *ptr; + for (ptr = text; *ptr; ptr++) { + switch (*ptr) { + case '<': ret += "<"; break; + case '>': ret += ">"; break; + case '&': ret += "&"; break; + case '"': ret += """; break; + case '\'': ret += "'"; break; + default: ret += *ptr; + } + } + return ret; +} + + +#ifndef DISABLED_LEGACY_ENGINE + + +// ____________________________________________________________________________ +// Ocropus add-ons. + +/** Find lines from the image making the BLOCK_LIST. 
*/ +BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { + ASSERT_HOST(FindLines() == 0); + BLOCK_LIST* result = block_list_; + block_list_ = nullptr; + return result; +} + +/** + * Delete a block list. + * This is to keep BLOCK_LIST pointer opaque + * and let go of including the other headers. + */ +void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { + delete block_list; +} + + +ROW *TessBaseAPI::MakeTessOCRRow(float baseline, + float xheight, + float descender, + float ascender) { + int32_t xstarts[] = {-32000}; + double quad_coeffs[] = {0, 0, baseline}; + return new ROW(1, + xstarts, + quad_coeffs, + xheight, + ascender - (baseline + xheight), + descender - baseline, + 0, + 0); +} + +/** Creates a TBLOB* from the whole pix. */ +TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) { + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height); + + // Create C_BLOBs from the page + extract_edges(pix, &block); + + // Merge all C_BLOBs + C_BLOB_LIST *list = block.blob_list(); + C_BLOB_IT c_blob_it(list); + if (c_blob_it.empty()) + return nullptr; + // Move all the outlines to the first blob. + C_OUTLINE_IT ol_it(c_blob_it.data()->out_list()); + for (c_blob_it.forward(); + !c_blob_it.at_first(); + c_blob_it.forward()) { + C_BLOB *c_blob = c_blob_it.data(); + ol_it.add_list_after(c_blob->out_list()); + } + // Convert the first blob to the output TBLOB. + return TBLOB::PolygonalCopy(false, c_blob_it.data()); +} + +/** + * This method baseline normalizes a TBLOB in-place. The input row is used + * for normalization. The denorm is an optional parameter in which the + * normalization-antidote is returned. 
+ */ +void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) { + TBOX box = tblob->bounding_box(); + float x_center = (box.left() + box.right()) / 2.0f; + float baseline = row->base_line(x_center); + float scale = kBlnXHeight / row->x_height(); + tblob->Normalize(nullptr, nullptr, nullptr, x_center, baseline, scale, scale, + 0.0f, static_cast(kBlnBaselineOffset), false, nullptr); +} + +/** + * Return a TBLOB * from the whole pix. + * To be freed later with delete. + */ +static TBLOB *make_tesseract_blob(float baseline, float xheight, + float descender, float ascender, + bool numeric_mode, Pix* pix) { + TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix); + + // Normalize TBLOB + ROW *row = + TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender); + TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode); + delete row; + return tblob; +} + +/** + * Adapt to recognize the current image as the given character. + * The image must be preloaded into pix_binary_ and be just an image + * of a single character. + */ +void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, + int length, + float baseline, + float xheight, + float descender, + float ascender) { + UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); + TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender, + tesseract_->classify_bln_numeric_mode, + tesseract_->pix_binary()); + float threshold; + float best_rating = -100; + + + // Classify to get a raw choice. 
+ BLOB_CHOICE_LIST choices; + tesseract_->AdaptiveClassifier(blob, &choices); + BLOB_CHOICE_IT choice_it; + choice_it.set_to_list(&choices); + for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); + choice_it.forward()) { + if (choice_it.data()->rating() > best_rating) { + best_rating = choice_it.data()->rating(); + } + } + + threshold = tesseract_->matcher_good_threshold; + + if (blob->outlines) + tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold, + tesseract_->AdaptedTemplates); + delete blob; +} + + +PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { + PAGE_RES *page_res = new PAGE_RES(false, block_list, + &(tesseract_->prev_word_best_choice_)); + tesseract_->recog_all_words(page_res, nullptr, nullptr, nullptr, 1); + return page_res; +} + +PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, + PAGE_RES* pass1_result) { + if (!pass1_result) + pass1_result = new PAGE_RES(false, block_list, + &(tesseract_->prev_word_best_choice_)); + tesseract_->recog_all_words(pass1_result, nullptr, nullptr, nullptr, 2); + return pass1_result; +} + +struct TESS_CHAR : ELIST_LINK { + char *unicode_repr; + int length; // of unicode_repr + float cost; + TBOX box; + + TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { + length = (len == -1 ? strlen(repr) : len); + unicode_repr = new char[length + 1]; + strncpy(unicode_repr, repr, length); + } + + TESS_CHAR() + : unicode_repr(nullptr), + length(0), + cost(0.0f) + { // Satisfies ELISTIZE. 
+ } + ~TESS_CHAR() { + delete [] unicode_repr; + } +}; + +ELISTIZEH(TESS_CHAR) +ELISTIZE(TESS_CHAR) + +static void add_space(TESS_CHAR_IT* it) { + TESS_CHAR *t = new TESS_CHAR(0, " "); + it->add_after_then_move(t); +} + + +static float rating_to_cost(float rating) { + rating = 100 + rating; + // cuddled that to save from coverage profiler + // (I have never seen ratings worse than -100, + // but the check won't hurt) + if (rating < 0) rating = 0; + return rating; +} + +/** + * Extract the OCR results, costs (penalty points for uncertainty), + * and the bounding boxes of the characters. + */ +static void extract_result(TESS_CHAR_IT* out, + PAGE_RES* page_res) { + PAGE_RES_IT page_res_it(page_res); + int word_count = 0; + while (page_res_it.word() != nullptr) { + WERD_RES *word = page_res_it.word(); + const char *str = word->best_choice->unichar_string().string(); + const char *len = word->best_choice->unichar_lengths().string(); + TBOX real_rect = word->word->bounding_box(); + + if (word_count) + add_space(out); + int n = strlen(len); + for (int i = 0; i < n; i++) { + TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()), + str, *len); + tc->box = real_rect.intersection(word->box_word->BlobBox(i)); + out->add_after_then_move(tc); + str += *len; + len++; + } + page_res_it.forward(); + word_count++; + } +} + +/** + * Extract the OCR results, costs (penalty points for uncertainty), + * and the bounding boxes of the characters. 
+ */ +int TessBaseAPI::TesseractExtractResult(char** text, + int** lengths, + float** costs, + int** x0, + int** y0, + int** x1, + int** y1, + PAGE_RES* page_res) { + TESS_CHAR_LIST tess_chars; + TESS_CHAR_IT tess_chars_it(&tess_chars); + extract_result(&tess_chars_it, page_res); + tess_chars_it.move_to_first(); + int n = tess_chars.length(); + int text_len = 0; + *lengths = new int[n]; + *costs = new float[n]; + *x0 = new int[n]; + *y0 = new int[n]; + *x1 = new int[n]; + *y1 = new int[n]; + int i = 0; + for (tess_chars_it.mark_cycle_pt(); + !tess_chars_it.cycled_list(); + tess_chars_it.forward(), i++) { + TESS_CHAR *tc = tess_chars_it.data(); + text_len += (*lengths)[i] = tc->length; + (*costs)[i] = tc->cost; + (*x0)[i] = tc->box.left(); + (*y0)[i] = tc->box.bottom(); + (*x1)[i] = tc->box.right(); + (*y1)[i] = tc->box.top(); + } + char *p = *text = new char[text_len]; + + tess_chars_it.move_to_first(); + for (tess_chars_it.mark_cycle_pt(); + !tess_chars_it.cycled_list(); + tess_chars_it.forward()) { + TESS_CHAR *tc = tess_chars_it.data(); + strncpy(p, tc->unicode_repr, tc->length); + p += tc->length; + } + return n; +} + +/** This method returns the features associated with the input blob. */ +// The resulting features are returned in int_features, which must be +// of size MAX_NUM_INT_FEATURES. The number of features is returned in +// num_features (or 0 if there was a failure). +// On return feature_outline_index is filled with an index of the outline +// corresponding to each feature in int_features. +// TODO(rays) Fix the caller to out outline_counts instead. 
+void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob, + INT_FEATURE_STRUCT* int_features, + int* num_features, + int* feature_outline_index) { + GenericVector outline_counts; + GenericVector bl_features; + GenericVector cn_features; + INT_FX_RESULT_STRUCT fx_info; + tesseract_->ExtractFeatures(*blob, false, &bl_features, + &cn_features, &fx_info, &outline_counts); + if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) { + *num_features = 0; + return; // Feature extraction failed. + } + *num_features = cn_features.size(); + memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0])); + // TODO(rays) Pass outline_counts back and simplify the calling code. + if (feature_outline_index != nullptr) { + int f = 0; + for (int i = 0; i < outline_counts.size(); ++i) { + while (f < outline_counts[i]) + feature_outline_index[f++] = i; + } + } +} + +// This method returns the row to which a box of specified dimensions would +// belong. If no good match is found, it returns nullptr. +ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks, + int left, int top, int right, int bottom) { + TBOX box(left, bottom, right, top); + BLOCK_IT b_it(blocks); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOCK* block = b_it.data(); + if (!box.major_overlap(block->pdblk.bounding_box())) + continue; + ROW_IT r_it(block->row_list()); + for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { + ROW* row = r_it.data(); + if (!box.major_overlap(row->bounding_box())) + continue; + WERD_IT w_it(row->word_list()); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD* word = w_it.data(); + if (box.major_overlap(word->bounding_box())) + return row; + } + } + } + return nullptr; +} + +/** Method to run adaptive classifier on a blob. 
*/ +void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob, + int num_max_matches, + int* unichar_ids, + float* ratings, + int* num_matches_returned) { + BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; + tesseract_->AdaptiveClassifier(blob, choices); + BLOB_CHOICE_IT choices_it(choices); + int& index = *num_matches_returned; + index = 0; + for (choices_it.mark_cycle_pt(); + !choices_it.cycled_list() && index < num_max_matches; + choices_it.forward()) { + BLOB_CHOICE* choice = choices_it.data(); + unichar_ids[index] = choice->unichar_id(); + ratings[index] = choice->rating(); + ++index; + } + *num_matches_returned = index; + delete choices; +} +#endif // ndef DISABLED_LEGACY_ENGINE + +} // namespace tesseract. diff -Nru k2pdfopt-2.42+ds/tesseract_mod/baseapi.h k2pdfopt-2.51+ds/tesseract_mod/baseapi.h --- k2pdfopt-2.42+ds/tesseract_mod/baseapi.h 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/baseapi.h 2018-11-22 20:15:54.000000000 +0000 @@ -0,0 +1,929 @@ +/////////////////////////////////////////////////////////////////////// +// File: baseapi.h +// Description: Simple API for calling tesseract. +// Author: Ray Smith +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_API_BASEAPI_H_ +#define TESSERACT_API_BASEAPI_H_ + +#include +// To avoid collision with other typenames include the ABSOLUTE MINIMUM +// complexity of includes here. Use forward declarations wherever possible +// and hide includes of complex types in baseapi.cpp. +#include "tess_version.h" +#include "apitypes.h" +#include "pageiterator.h" +#include "platform.h" +#include "publictypes.h" +#include "resultiterator.h" +#include "serialis.h" +#include "tesscallback.h" +#include "thresholder.h" +#include "unichar.h" + +template class GenericVector; +class PAGE_RES; +class PAGE_RES_IT; +class ParagraphModel; +struct BlamerBundle; +class BLOCK_LIST; +class DENORM; +class MATRIX; +class ROW; +class STRING; +class WERD; +struct Pix; +struct Box; +struct Pixa; +struct Boxa; +class ETEXT_DESC; +struct OSResults; +class TBOX; +class UNICHARSET; +class WERD_CHOICE_LIST; + +struct INT_FEATURE_STRUCT; +typedef INT_FEATURE_STRUCT *INT_FEATURE; +struct TBLOB; + +namespace tesseract { + +class Dawg; +class Dict; +class EquationDetect; +class PageIterator; +class LTRResultIterator; +class ResultIterator; +class MutableIterator; +class TessResultRenderer; +class Tesseract; +class Trie; +class Wordrec; + +typedef int (Dict::*DictFunc)(void* void_dawg_args, + const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const; +typedef double (Dict::*ProbabilityInContextFunc)(const char* lang, + const char* context, + int context_bytes, + const char* character, + int character_bytes); +typedef float (Dict::*ParamsModelClassifyFunc)( + const char *lang, void *path); +typedef void (Wordrec::*FillLatticeFunc)(const MATRIX &ratings, + const WERD_CHOICE_LIST &best_choices, + const UNICHARSET &unicharset, + BlamerBundle *blamer_bundle); +typedef TessCallback4 + TruthCallback; + +/** + * Base class for all tesseract APIs. 
+ * Specific classes can add ability to work on different inputs or produce + * different outputs. + * This class is mostly an interface layer on top of the Tesseract instance + * class to hide the data types so that users of this class don't have to + * include any other Tesseract headers. + */ +class TESS_API TessBaseAPI { + public: + TessBaseAPI(); + virtual ~TessBaseAPI(); + + /** + * Returns the version identifier as a static string. Do not delete. + */ + static const char* Version(); + + /** + * If compiled with OpenCL AND an available OpenCL + * device is deemed faster than serial code, then + * "device" is populated with the cl_device_id + * and returns sizeof(cl_device_id) + * otherwise *device=nullptr and returns 0. + */ + static size_t getOpenCLDevice(void **device); + + /** + * Writes the thresholded image to stderr as a PBM file on receipt of a + * SIGSEGV, SIGFPE, or SIGBUS signal. (Linux/Unix only). + */ + static void CatchSignals(); + + /** + * Set the name of the input file. Needed for training and + * reading a UNLV zone file, and for searchable PDF output. + */ + void SetInputName(const char* name); + /** + * These functions are required for searchable PDF output. + * We need our hands on the input file so that we can include + * it in the PDF without transcoding. If that is not possible, + * we need the original image. Finally, resolution metadata + * is stored in the PDF so we need that as well. + */ + const char* GetInputName(); + // Takes ownership of the input pix. + void SetInputImage(Pix *pix); + Pix* GetInputImage(); + int GetSourceYResolution(); + const char* GetDatapath(); + + /** Set the name of the bonus output files. Needed only for debugging. */ + void SetOutputName(const char* name); + + /** + * Set the value of an internal "parameter." + * Supply the name of the parameter and the value as a string, just as + * you would in a config file. + * Returns false if the name lookup failed. 
+ * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. + * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. + * SetVariable may be used before Init, but settings will revert to + * defaults on End(). + * + * Note: Must be called after Init(). Only works for non-init variables + * (init variables should be passed to Init()). + */ + bool SetVariable(const char* name, const char* value); + bool SetDebugVariable(const char* name, const char* value); + + /** + * Returns true if the parameter was found among Tesseract parameters. + * Fills in value with the value of the parameter. + */ + bool GetIntVariable(const char *name, int *value) const; + bool GetBoolVariable(const char *name, bool *value) const; + bool GetDoubleVariable(const char *name, double *value) const; + + /** + * Returns the pointer to the string that represents the value of the + * parameter if it was found among Tesseract parameters. + */ + const char *GetStringVariable(const char *name) const; + + /** + * Print Tesseract parameters to the given file. + */ + void PrintVariables(FILE *fp) const; + + /** + * Get value of named variable as a string, if it exists. + */ + bool GetVariableAsString(const char *name, STRING *val); + + /** + * Instances are now mostly thread-safe and totally independent, + * but some global parameters remain. Basically it is safe to use multiple + * TessBaseAPIs in different threads in parallel, UNLESS: + * you use SetVariable on some of the Params in classify and textord. + * If you do, then the effect will be to change it for all your instances. + * + * Start tesseract. Returns zero on success and -1 on failure. + * NOTE that the only members that may be called before Init are those + * listed above here in the class definition. + * + * The datapath must be the name of the parent directory of tessdata and + * must end in / . Any name after the last / will be stripped. 
+ * The language is (usually) an ISO 639-3 string or nullptr will default to eng. + * It is entirely safe (and eventually will be efficient too) to call + * Init multiple times on the same instance to change language, or just + * to reset the classifier. + * The language may be a string of the form [~][+[~]]* indicating + * that multiple languages are to be loaded. Eg hin+eng will load Hindi and + * English. Languages may specify internally that they want to be loaded + * with one or more other languages, so the ~ sign is available to override + * that. Eg if hin were set to load eng by default, then hin+~eng would force + * loading only hin. The number of loaded languages is limited only by + * memory, with the caveat that loading additional languages will impact + * both speed and accuracy, as there is more work to do to decide on the + * applicable language, and there is more chance of hallucinating incorrect + * words. + * WARNING: On changing languages, all Tesseract parameters are reset + * back to their default values. (Which may vary between languages.) + * If you have a rare need to set a Variable that controls + * initialization for a second call to Init you should explicitly + * call End() and then use SetVariable before Init. This is only a very + * rare use case, since there are very few uses that require any parameters + * to be set before Init. + * + * If set_only_non_debug_params is true, only params that do not contain + * "debug" in the name will be set. 
+ */ + int Init(const char* datapath, const char* language, OcrEngineMode mode, + char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params); + int Init(const char* datapath, const char* language, OcrEngineMode oem) { + return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); + } + int Init(const char* datapath, const char* language) { + return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, false); + } + // In-memory version reads the traineddata file directly from the given + // data[data_size] array, and/or reads data via a FileReader. + int Init(const char* data, int data_size, const char* language, + OcrEngineMode mode, char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_non_debug_params, FileReader reader); + + /** + * Returns the languages string used in the last valid initialization. + * If the last initialization specified "deu+hin" then that will be + * returned. If hin loaded eng automatically as well, then that will + * not be included in this list. To find the languages actually + * loaded use GetLoadedLanguagesAsVector. + * The returned string should NOT be deleted. + */ + const char* GetInitLanguagesAsString() const; + + /** + * Returns the loaded languages in the vector of STRINGs. + * Includes all languages loaded by the last Init, including those loaded + * as dependencies of other loaded languages. + */ + void GetLoadedLanguagesAsVector(GenericVector* langs) const; + + /** + * Returns the available languages in the sorted vector of STRINGs. + */ + void GetAvailableLanguagesAsVector(GenericVector* langs) const; + + /** + * Init only the lang model component of Tesseract. The only functions + * that work after this init are SetVariable and IsValidWord. + * WARNING: temporary! This function will be removed from here and placed + * in a separate API at some future time. 
+ */ + int InitLangMod(const char* datapath, const char* language); + + /** + * Init only for page layout analysis. Use only for calls to SetImage and + * AnalysePage. Calls that attempt recognition will generate an error. + */ + void InitForAnalysePage(); + + /** + * Read a "config" file containing a set of param, value pairs. + * Searches the standard places: tessdata/configs, tessdata/tessconfigs + * and also accepts a relative or absolute path name. + * Note: only non-init params will be set (init params are set by Init()). + */ + void ReadConfigFile(const char* filename); + /** Same as above, but only set debug params from the given config file. */ + void ReadDebugConfigFile(const char* filename); + + /** + * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. + * The mode is stored as an IntParam so it can also be modified by + * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). + */ + void SetPageSegMode(PageSegMode mode); + + /** Return the current page segmentation mode. */ + PageSegMode GetPageSegMode() const; + + /** + * Recognize a rectangle from an image and return the result as a string. + * May be called many times for a single Init. + * Currently has no error checking. + * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. + * Palette color images will not work properly and must be converted to + * 24 bit. + * Binary images of 1 bit per pixel may also be given but they must be + * byte packed with the MSB of the first byte being the first pixel, and a + * 1 represents WHITE. For binary images set bytes_per_pixel=0. + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + * + * Note that TesseractRect is the simplified convenience interface. + * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, + * and one or more of the Get*Text functions below. 
+ */ + char* TesseractRect(const unsigned char* imagedata, + int bytes_per_pixel, int bytes_per_line, + int left, int top, int width, int height); + + /** + * Call between pages or documents etc to free up memory and forget + * adaptive data. + */ + void ClearAdaptiveClassifier(); + + /** + * @defgroup AdvancedAPI Advanced API + * The following methods break TesseractRect into pieces, so you can + * get hold of the thresholded image, get the text in different formats, + * get bounding boxes, confidences etc. + */ + /* @{ */ + + /** + * Provide an image for Tesseract to recognize. Format is as + * TesseractRect above. Copies the image buffer and converts to Pix. + * SetImage clears all recognition results, and sets the rectangle to the + * full image, so it may be followed immediately by a GetUTF8Text, and it + * will automatically perform recognition. + */ + void SetImage(const unsigned char* imagedata, int width, int height, + int bytes_per_pixel, int bytes_per_line); + + /** + * Provide an image for Tesseract to recognize. As with SetImage above, + * Tesseract takes its own copy of the image, so it need not persist until + * after Recognize. + * Pix vs raw, which to use? + * Use Pix where possible. Tesseract uses Pix as its internal representation + * and it is therefore more efficient to provide a Pix directly. + */ + void SetImage(Pix* pix); + + /** + * Set the resolution of the source image in pixels per inch so font size + * information can be calculated in results. Call this after SetImage(). + */ + void SetSourceResolution(int ppi); + + /** + * Restrict recognition to a sub-rectangle of the image. Call after SetImage. + * Each SetRectangle clears the recognition results so multiple rectangles + * can be recognized with the same image. + */ + void SetRectangle(int left, int top, int width, int height); + + /** + * In extreme cases only, usually with a subclass of Thresholder, it + * is possible to provide a different Thresholder.
The Thresholder may + * be preloaded with an image, settings etc, or they may be set after. + * Note that Tesseract takes ownership of the Thresholder and will + * delete it when it is replaced or the API is destructed. + */ + void SetThresholder(ImageThresholder* thresholder) { + delete thresholder_; + thresholder_ = thresholder; + ClearResults(); + } + + /** + * Get a copy of the internal thresholded image from Tesseract. + * Caller takes ownership of the Pix and must pixDestroy it. + * May be called any time after SetImage, or after TesseractRect. + */ + Pix* GetThresholdedImage(); + + /** + * Get the result of page layout analysis as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ + Boxa* GetRegions(Pixa** pixa); + + /** + * Get the textlines as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * If raw_image is true, then extract from the original image instead of the + * thresholded image and pad by raw_padding pixels. + * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. + * If paraids is not nullptr, the paragraph-id of each line within its block is + * also returned as an array of one element per line. delete [] after use. + */ + Boxa* GetTextlines(const bool raw_image, const int raw_padding, + Pixa** pixa, int** blockids, int** paraids); + /* + Helper method to extract from the thresholded image. (most common usage) + */ + Boxa* GetTextlines(Pixa** pixa, int** blockids) { + return GetTextlines(false, 0, pixa, blockids, nullptr); + } + + /** + * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa + * pair, in reading order. Enables downstream handling of non-rectangular + * regions. + * Can be called before or after Recognize.
+ * If blockids is not nullptr, the block-id of each line is also returned as an + * array of one element per line. delete [] after use. + */ + Boxa* GetStrips(Pixa** pixa, int** blockids); + + /** + * Get the words as a leptonica-style + * Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + */ + Boxa* GetWords(Pixa** pixa); + + /** + * Gets the individual connected (text) components (created + * after pages segmentation step, but before recognition) + * as a leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * Note: the caller is responsible for calling boxaDestroy() + * on the returned Boxa array and pixaDestroy() on cc array. + */ + Boxa* GetConnectedComponents(Pixa** cc); + + /** + * Get the given level kind of components (block, textline, word etc.) as a + * leptonica-style Boxa, Pixa pair, in reading order. + * Can be called before or after Recognize. + * If blockids is not nullptr, the block-id of each component is also returned + * as an array of one element per component. delete [] after use. + * If paraids is not nullptr, the paragraph-id of each component within its block + * is also returned as an array of one element per component. delete [] after + * use. + * If raw_image is true, then portions of the original image are extracted + * instead of the thresholded image and padded with raw_padding. + * If text_only is true, then only text components are returned. + */ + Boxa* GetComponentImages(const PageIteratorLevel level, + const bool text_only, const bool raw_image, + const int raw_padding, + Pixa** pixa, int** blockids, int** paraids); + // Helper function to get binary images with no padding (most common usage).
+ Boxa* GetComponentImages(const PageIteratorLevel level, + const bool text_only, + Pixa** pixa, int** blockids) { + return GetComponentImages(level, text_only, false, 0, pixa, blockids, nullptr); + } + + /** + * Returns the scale factor of the thresholded image that would be returned by + * GetThresholdedImage() and the various GetX() methods that call + * GetComponentImages(). + * Returns 0 if no thresholder has been set. + */ + int GetThresholdedImageScaleFactor() const; + + /** + * Runs page layout analysis in the mode set by SetPageSegMode. + * May optionally be called prior to Recognize to get access to just + * the page layout results. Returns an iterator to the results. + * If merge_similar_words is true, words are combined where suitable for use + * with a line recognizer. Use if you want to use AnalyseLayout to find the + * textlines, and then want to process textline fragments with an external + * line recognizer. + * Returns nullptr on error or an empty page. + * The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ + PageIterator* AnalyseLayout(); + PageIterator* AnalyseLayout(bool merge_similar_words); + + /** + * Recognize the image from SetAndThresholdImage, generating Tesseract + * internal structures. Returns 0 on success. + * Optional. The Get*Text functions below will call Recognize if needed. + * After Recognize, the output is kept internally until the next SetImage. + */ + int Recognize(ETEXT_DESC* monitor); + + /** + * Methods to retrieve information after SetAndThresholdImage(), + * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) + */ + + #ifndef DISABLED_LEGACY_ENGINE + /** Variant on Recognize used for testing chopper. 
*/ + int RecognizeForChopTest(ETEXT_DESC* monitor); + #endif + + /** + * Turns images into symbolic text. + * + * filename can point to a single image, a multi-page TIFF, + * or a plain text list of image filenames. + * + * retry_config is useful for debugging. If not nullptr, you can fall + * back to an alternate configuration if a page fails for some + * reason. + * + * timeout_millisec terminates processing if any single page + * takes too long. Set to 0 for unlimited time. + * + * renderer is responsible for creating the output. For example, + * use the TessTextRenderer if you want plaintext output, or + * the TessPDFRender to produce searchable PDF. + * + * If tessedit_page_number is non-negative, will only process that + * single page. Works for multi-page tiff file, or filelist. + * + * Returns true if successful, false on error. + */ + bool ProcessPages(const char* filename, const char* retry_config, + int timeout_millisec, TessResultRenderer* renderer); + // Does the real work of ProcessPages. + bool ProcessPagesInternal(const char* filename, const char* retry_config, + int timeout_millisec, TessResultRenderer* renderer); + + /** + * Turn a single image into symbolic text. + * + * The pix is the image processed. filename and page_index are + * metadata used by side-effect processes, such as reading a box + * file or formatting as hOCR. + * + * See ProcessPages for descriptions of other parameters. + */ + bool ProcessPage(Pix* pix, int page_index, const char* filename, + const char* retry_config, int timeout_millisec, + TessResultRenderer* renderer); + + /** + * Get a reading-order iterator to the results of LayoutAnalysis and/or + * Recognize. The returned iterator must be deleted after use. + * WARNING!
This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ + ResultIterator* GetIterator(); + + /** + * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. + * The returned iterator must be deleted after use. + * WARNING! This class points to data held within the TessBaseAPI class, and + * therefore can only be used while the TessBaseAPI class still exists and + * has not been subjected to a call of Init, SetImage, Recognize, Clear, End + * DetectOS, or anything else that changes the internal PAGE_RES. + */ + MutableIterator* GetMutableIterator(); + + /** + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + */ + char* GetUTF8Text(); + + /** + * Make a HTML-formatted string with hOCR markup from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * monitor can be used to + * cancel the recognition + * receive progress callbacks + * Returned string must be freed with the delete [] operator. + */ + char* GetHOCRText(ETEXT_DESC* monitor, int page_number); + +/* willus mod */ +int GetOCRWords(int **x0,int **y0,int **x1,int **y1,int **ybaseline,char **utf8words); + + /** + * Make a HTML-formatted string with hOCR markup from the internal + * data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. + */ + char* GetHOCRText(int page_number); + + /** + * Make a TSV-formatted string from the internal data structures. + * page_number is 0-based but will appear in the output as 1-based. + * Returned string must be freed with the delete [] operator. 
+ */ + char* GetTSVText(int page_number); + + /** + * The recognized text is returned as a char* which is coded in the same + * format as a box file used in training. + * Constructs coordinates in the original image - not just the rectangle. + * page_number is a 0-based page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. + */ + char* GetBoxText(int page_number); + + /** + * The recognized text is returned as a char* which is coded + * as UNLV format Latin-1 with specific reject and suspect codes. + * Returned string must be freed with the delete [] operator. + */ + char* GetUNLVText(); + + /** + * Detect the orientation of the input image and apparent script (alphabet). + * orient_deg is the detected clockwise rotation of the input image in degrees + * (0, 90, 180, 270) + * orient_conf is the confidence (15.0 is reasonably confident) + * script_name is an ASCII string, the name of the script, e.g. "Latin" + * script_conf is confidence level in the script + * Returns true on success and writes values to each parameter as an output + */ + bool DetectOrientationScript(int* orient_deg, float* orient_conf, + const char** script_name, float* script_conf); + + /** + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + * page_number is a 0-based page index that will appear in the osd file. + */ + char* GetOsdText(int page_number); + + /** Returns the (average) confidence value between 0 and 100. */ + int MeanTextConf(); + /** + * Returns all word confidences (between 0 and 100) in an array, terminated + * by -1. The calling function must delete [] after use. + * The number of confidences should correspond to the number of space- + * delimited words in GetUTF8Text. + */ + int* AllWordConfidences(); + +#ifndef DISABLED_LEGACY_ENGINE + /** + * Applies the given word to the adaptive classifier if possible. 
+ * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can + * tell the boundaries of the graphemes. + * Assumes that SetImage/SetRectangle have been used to set the image + * to the given word. The mode arg should be PSM_SINGLE_WORD or + * PSM_CIRCLE_WORD, as that will be used to control layout analysis. + * The currently set PageSegMode is preserved. + * Returns false if adaption was not possible for some reason. + */ + bool AdaptToWordStr(PageSegMode mode, const char* wordstr); +#endif // ndef DISABLED_LEGACY_ENGINE + + /** + * Free up recognition results and any stored image data, without actually + * freeing any recognition data that would be time-consuming to reload. + * Afterwards, you must call SetImage or TesseractRect before doing + * any Recognize or Get* operation. + */ + void Clear(); + + /** + * Close down tesseract and free up all memory. End() is equivalent to + * destructing and reconstructing your TessBaseAPI. + * Once End() has been used, none of the other API functions may be used + * other than Init and anything declared above it in the class definition. + */ + void End(); + + /** + * Clear any library-level memory caches. + * There are a variety of expensive-to-load constant data structures (mostly + * language dictionaries) that are cached globally -- surviving the Init() + * and End() of individual TessBaseAPI's. This function allows the clearing + * of these caches. + **/ + static void ClearPersistentCache(); + + /** + * Check whether a word is valid according to Tesseract's language model + * @return 0 if the word is invalid, non-zero if valid. + * @warning temporary! This function will be removed from here and placed + * in a separate API at some future time. + */ + int IsValidWord(const char *word); + // Returns true if utf8_character is defined in the UniCharset. 
+ bool IsValidCharacter(const char *utf8_character); + + + bool GetTextDirection(int* out_offset, float* out_slope); + + /** Sets Dict::letter_is_okay_ function to point to the given function. */ + void SetDictFunc(DictFunc f); + + /** Sets Dict::probability_in_context_ function to point to the given + * function. + */ + void SetProbabilityInContextFunc(ProbabilityInContextFunc f); + + /** + * Estimates the Orientation And Script of the image. + * @return true if the image was processed successfully. + */ + bool DetectOS(OSResults*); + + /** + * Return text orientation of each block as determined by an earlier run + * of layout analysis. + */ + void GetBlockTextOrientations(int** block_orientation, + bool** vertical_writing); + + + #ifndef DISABLED_LEGACY_ENGINE + + /** Sets Wordrec::fill_lattice_ function to point to the given function. */ + void SetFillLatticeFunc(FillLatticeFunc f); + + /** Find lines from the image making the BLOCK_LIST. */ + BLOCK_LIST* FindLinesCreateBlockList(); + + /** + * Delete a block list. + * This is to keep BLOCK_LIST pointer opaque + * and let go of including the other headers. + */ + static void DeleteBlockList(BLOCK_LIST* block_list); + + /** Returns a ROW object created from the input row specification. */ + static ROW *MakeTessOCRRow(float baseline, float xheight, + float descender, float ascender); + + /** Returns a TBLOB corresponding to the entire input image. */ + static TBLOB *MakeTBLOB(Pix *pix); + + /** + * This method baseline normalizes a TBLOB in-place. The input row is used + * for normalization. The denorm is an optional parameter in which the + * normalization-antidote is returned. + */ + static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode); + + /** This method returns the features associated with the input image. 
*/ + void GetFeaturesForBlob(TBLOB* blob, INT_FEATURE_STRUCT* int_features, + int* num_features, int* feature_outline_index); + + /** + * This method returns the row to which a box of specified dimensions would + * belong. If no good match is found, it returns nullptr. + */ + static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top, + int right, int bottom); + + /** + * Method to run adaptive classifier on a blob. + * It returns at max num_max_matches results. + */ + void RunAdaptiveClassifier(TBLOB* blob, + int num_max_matches, + int* unichar_ids, + float* ratings, + int* num_matches_returned); +#endif // ndef DISABLED_LEGACY_ENGINE + + /** This method returns the string form of the specified unichar. */ + const char* GetUnichar(int unichar_id); + + /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ + const Dawg *GetDawg(int i) const; + + /** Return the number of dawgs loaded into tesseract_ object. */ + int NumDawgs() const; + + Tesseract* tesseract() const { return tesseract_; } + + OcrEngineMode oem() const { return last_oem_requested_; } + + void InitTruthCallback(TruthCallback *cb) { truth_cb_ = cb; } + + void set_min_orientation_margin(double margin); + /* @} */ + + protected: + + /** Common code for setting the image. Returns true if Init has been called. */ + TESS_LOCAL bool InternalSetImage(); + + /** + * Run the thresholder to make the thresholded image. If pix is not nullptr, + * the source is thresholded to pix instead of the internal IMAGE. + */ + TESS_LOCAL virtual bool Threshold(Pix** pix); + + /** + * Find lines from the image making the BLOCK_LIST. + * @return 0 on success. + */ + TESS_LOCAL int FindLines(); + + /** Delete the pageres and block list ready for a new page. */ + void ClearResults(); + + /** + * Return an LTR Result Iterator -- used only for training, as we really want + * to ignore all BiDi smarts at that point. + * delete once you're done with it. 
+ */ + TESS_LOCAL LTRResultIterator* GetLTRIterator(); + + /** + * Return the length of the output text string, as UTF8, assuming + * one newline per line and one per block, with a terminator, + * and assuming a single character reject marker for each rejected character. + * Also return the number of recognized blobs in blob_count. + */ + TESS_LOCAL int TextLength(int* blob_count); + + //// paragraphs.cpp //////////////////////////////////////////////////// + TESS_LOCAL void DetectParagraphs(bool after_text_recognition); + + #ifndef DISABLED_LEGACY_ENGINE + + /** @defgroup ocropusAddOns ocropus add-ons */ + /* @{ */ + + /** + * Adapt to recognize the current image as the given character. + * The image must be preloaded and be just an image of a single character. + */ + TESS_LOCAL void AdaptToCharacter(const char *unichar_repr, + int length, + float baseline, + float xheight, + float descender, + float ascender); + + /** Recognize text doing one pass only, using settings for a given pass. */ + TESS_LOCAL PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list); + + TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, + PAGE_RES* pass1_result); + + /** + * Extract the OCR results, costs (penalty points for uncertainty), + * and the bounding boxes of the characters. + */ + TESS_LOCAL static int TesseractExtractResult(char** text, + int** lengths, + float** costs, + int** x0, + int** y0, + int** x1, + int** y1, + PAGE_RES* page_res); + + TESS_LOCAL const PAGE_RES* GetPageRes() const { return page_res_; } + /* @} */ +#endif // ndef DISABLED_LEGACY_ENGINE + + protected: + Tesseract* tesseract_; ///< The underlying data object. + Tesseract* osd_tesseract_; ///< For orientation & script detection. + EquationDetect* equ_detect_; ///* paragraph_models_; + BLOCK_LIST* block_list_; ///< The page layout. + PAGE_RES* page_res_; ///< The page-level data. + STRING* input_file_; ///< Name used by training code. + STRING* output_file_; ///< Name used by debug code. 
+ STRING* datapath_; ///< Current location of tessdata. + STRING* language_; ///< Last initialized language. + OcrEngineMode last_oem_requested_; ///< Last ocr language mode requested. + bool recognition_done_; ///< page_res_ contains recognition data. + TruthCallback *truth_cb_; /// fxn for setting truth_* in WERD_RES + + /** + * @defgroup ThresholderParams Thresholder Parameters + * Parameters saved from the Thresholder. Needed to rebuild coordinates. + */ + /* @{ */ + int rect_left_; + int rect_top_; + int rect_width_; + int rect_height_; + int image_width_; + int image_height_; + /* @} */ + + private: + // A list of image filenames gets special consideration + bool ProcessPagesFileList(FILE *fp, + STRING *buf, + const char* retry_config, int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number); + // TIFF supports multipage so gets special consideration. + bool ProcessPagesMultipageTiff(const unsigned char *data, + size_t size, + const char* filename, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer, + int tessedit_page_number); + // There's currently no way to pass a document title from the + // Tesseract command line, and we have multiple places that choose + // to set the title to an empty string. Using a single named + // variable will hopefully reduce confusion if the situation changes + // in the future. + const char *unknown_title_ = ""; +}; // class TessBaseAPI. + +/** Escape a char string - remove &<>"' with HTML codes. */ +STRING HOcrEscape(const char* text); +} // namespace tesseract. + +#endif // TESSERACT_API_BASEAPI_H_ diff -Nru k2pdfopt-2.42+ds/tesseract_mod/ccutil.cpp k2pdfopt-2.51+ds/tesseract_mod/ccutil.cpp --- k2pdfopt-2.42+ds/tesseract_mod/ccutil.cpp 2017-02-25 04:39:23.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/ccutil.cpp 2018-11-18 18:27:53.000000000 +0000 @@ -1,62 +1,63 @@ -#include "config_auto.h" -// Copyright 2008 Google Inc. All Rights Reserved. 
-// Author: scharron@google.com (Samuel Charron) -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ccutil.h" - -namespace tesseract { -CCUtil::CCUtil() : - params_(), - STRING_INIT_MEMBER(m_data_sub_dir, - "tessdata/", "Directory for data files", ¶ms_), -/* - STRING_INIT_MEMBER(tessedit_module_name, WINDLLNAME, - "Module colocated with tessdata dir", ¶ms_), -*/ - INT_INIT_MEMBER(ambigs_debug_level, 0, "Debug level for unichar ambiguities", - ¶ms_), - BOOL_MEMBER(use_definite_ambigs_for_classifier, 0, "Use definite" - " ambiguities when running character classifier", ¶ms_), - BOOL_MEMBER(use_ambigs_for_adaption, 0, "Use ambigs for deciding" - " whether to adapt to a character", ¶ms_) { -} - -CCUtil::~CCUtil() { -} - - -CCUtilMutex::CCUtilMutex() { -#ifdef _WIN32 - mutex_ = CreateMutex(0, FALSE, 0); -#else - pthread_mutex_init(&mutex_, NULL); -#endif -} - -void CCUtilMutex::Lock() { -#ifdef _WIN32 - WaitForSingleObject(mutex_, INFINITE); -#else - pthread_mutex_lock(&mutex_); -#endif -} - -void CCUtilMutex::Unlock() { -#ifdef _WIN32 - ReleaseMutex(mutex_); -#else - pthread_mutex_unlock(&mutex_); -#endif -} - -CCUtilMutex tprintfMutex; // should remain global -} // namespace tesseract +// Copyright 2008 Google Inc. All Rights Reserved. +// Author: scharron@google.com (Samuel Charron) +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ccutil.h" + +namespace tesseract { +CCUtil::CCUtil() : + params_(), +/* willus mod */ +/* +#ifdef _WIN32 + STRING_INIT_MEMBER(tessedit_module_name, WINDLLNAME, + "Module colocated with tessdata dir", ¶ms_), +#endif +*/ + INT_INIT_MEMBER(ambigs_debug_level, 0, "Debug level for unichar ambiguities", + ¶ms_), + BOOL_MEMBER(use_definite_ambigs_for_classifier, 0, "Use definite" + " ambiguities when running character classifier", ¶ms_), + BOOL_MEMBER(use_ambigs_for_adaption, 0, "Use ambigs for deciding" + " whether to adapt to a character", ¶ms_) { +} + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of weak vtables in every compilation unit. 
+CCUtil::~CCUtil() = default; + +CCUtilMutex::CCUtilMutex() { +#ifdef _WIN32 + mutex_ = CreateMutex(0, FALSE, 0); +#else + pthread_mutex_init(&mutex_, nullptr); +#endif +} + +void CCUtilMutex::Lock() { +#ifdef _WIN32 + WaitForSingleObject(mutex_, INFINITE); +#else + pthread_mutex_lock(&mutex_); +#endif +} + +void CCUtilMutex::Unlock() { +#ifdef _WIN32 + ReleaseMutex(mutex_); +#else + pthread_mutex_unlock(&mutex_); +#endif +} + +CCUtilMutex tprintfMutex; // should remain global +} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/ccutil.h k2pdfopt-2.51+ds/tesseract_mod/ccutil.h --- k2pdfopt-2.42+ds/tesseract_mod/ccutil.h 2017-02-25 04:39:37.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/ccutil.h 2018-11-18 18:28:07.000000000 +0000 @@ -16,13 +16,12 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_CCUTIL_H__ -#define TESSERACT_CCUTIL_CCUTIL_H__ +#ifndef TESSERACT_CCUTIL_CCUTIL_H_ +#define TESSERACT_CCUTIL_CCUTIL_H_ #include "ambigs.h" #include "errcode.h" #include "strngs.h" -#include "tessdatamanager.h" #include "params.h" #include "unicharset.h" @@ -66,7 +65,6 @@ STRING imagebasename; // name of image STRING lang; STRING language_data_path_prefix; - TessdataManager tessdata_manager; UNICHARSET unicharset; UnicharAmbigs unichar_ambigs; STRING imagefile; // image file name @@ -79,10 +77,12 @@ // Member parameters. // These have to be declared and initialized after params_ member, since // params_ should be initialized before parameters are added to it. 
- STRING_VAR_H(m_data_sub_dir, "tessdata/", "Directory for data files"); +/* willus mod */ /* + #ifdef _WIN32 STRING_VAR_H(tessedit_module_name, WINDLLNAME, "Module colocated with tessdata dir"); + #endif */ INT_VAR_H(ambigs_debug_level, 0, "Debug level for unichar ambiguities"); BOOL_VAR_H(use_definite_ambigs_for_classifier, 0, @@ -94,4 +94,4 @@ extern CCUtilMutex tprintfMutex; // should remain global } // namespace tesseract -#endif // TESSERACT_CCUTIL_CCUTIL_H__ +#endif // TESSERACT_CCUTIL_CCUTIL_H_ diff -Nru k2pdfopt-2.42+ds/tesseract_mod/config_auto.h k2pdfopt-2.51+ds/tesseract_mod/config_auto.h --- k2pdfopt-2.42+ds/tesseract_mod/config_auto.h 2016-03-18 11:33:23.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/config_auto.h 2018-11-18 18:46:42.000000000 +0000 @@ -1,205 +0,0 @@ -/* config/config.h.in. Generated from configure.ac by autoheader. */ - - -#ifndef CONFIG_AUTO_H -#define CONFIG_AUTO_H -/* config_auto.h: begin */ - - -/* Define if building universal (internal helper macro) */ -#undef AC_APPLE_UNIVERSAL_BUILD - -/* Define to be the git revision */ -#undef GIT_REV - -/* Disable graphics */ -#undef GRAPHICS_DISABLED -#define GRAPHICS_DISABLED - -/* Define to 1 if you have the header file. */ -#undef HAVE_CAIRO_CAIRO_VERSION_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_CL_CL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_DLFCN_H - -/* Define if you have the OpenCL framework */ -#undef HAVE_FRAMEWORK_OPENCL - -/* Define to 1 if you have the `getline' function. */ -#undef HAVE_GETLINE - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the `lept' library (-llept). */ -#undef HAVE_LIBLEPT -#define HAVE_LIBLEPT 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_LIMITS_H - -/* Define to 1 if the system has the type `long long int'. */ -#undef HAVE_LONG_LONG_INT - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_MALLOC_H - -/* Define to 1 if the system has the type `mbstate_t'. */ -#undef HAVE_MBSTATE_T - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if the system has the type `off_t'. */ -#define HAVE_OFF_T 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_OPENCL_CL_H - -/* Define to 1 if you have the header file. - */ -#undef HAVE_PANGO_1_0_PANGO_PANGO_FEATURES_H - -/* Define to 1 if you have the `snprintf' function. */ -#undef HAVE_SNPRINTF - -/* Define to 1 if stdbool.h conforms to C99. */ -#undef HAVE_STDBOOL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H -#define HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_IPC_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SHM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have that is POSIX.1 compatible. */ -#undef HAVE_SYS_WAIT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_TIFFIO_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNICODE_UCHAR_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H -#define HAVE_UNISTD_H 1 - -/* Define to 1 if the system has the type `wchar_t'. */ -#undef HAVE_WCHAR_T - -/* Define to 1 if the system has the type `_Bool'. */ -#undef HAVE__BOOL - -/* Define to the sub-directory in which libtool stores uninstalled libraries. 
- */ -#undef LT_OBJDIR - -/* This is a MinGW system */ -#undef MINGW - -/* Defined when compiled with OpenMP support */ -#undef OPENMP - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Official date of release */ -#undef PACKAGE_DATE - -/* Name of package */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Version number */ -#undef PACKAGE_VERSION - -/* Official year for this release */ -#undef PACKAGE_YEAR - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS -#define STDC_HEADERS - -/* Define to 1 if you can safely include both and . */ -#undef TIME_WITH_SYS_TIME - -/* Version number of package */ -#undef VERSION -#define VERSION "3.04.01" - -#ifndef USE_STD_NAMESPACE -#define USE_STD_NAMESPACE -#endif - -/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most - significant byte first (like Motorola and SPARC, unlike Intel). */ -#if defined AC_APPLE_UNIVERSAL_BUILD -# if defined __BIG_ENDIAN__ -# define WORDS_BIGENDIAN 1 -# endif -#else -# ifndef WORDS_BIGENDIAN -# undef WORDS_BIGENDIAN -# endif -#endif - -/* Enable large inode numbers on Mac OS X 10.5. */ -#ifndef _DARWIN_USE_64_BIT_INODE -# define _DARWIN_USE_64_BIT_INODE 1 -#endif - -/* Number of bits in a file offset, on hosts where this is settable. */ -#undef _FILE_OFFSET_BITS - -/* Define for large files, on AIX-style hosts. 
*/ -#undef _LARGE_FILES - - - -/* Miscellaneous defines */ -#define AUTOCONF 1 - -/* Not used yet -#ifndef NO_GETTEXT -#define USING_GETTEXT -#endif -*/ - -/* config_auto.h: end */ -#endif - diff -Nru k2pdfopt-2.42+ds/tesseract_mod/dawg.cpp k2pdfopt-2.51+ds/tesseract_mod/dawg.cpp --- k2pdfopt-2.42+ds/tesseract_mod/dawg.cpp 2017-02-25 04:41:30.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/dawg.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,460 +0,0 @@ -#include "config_auto.h" -/* -*-C-*- - ******************************************************************************** - * - * File: dawg.c (Formerly dawg.c) - * Description: Use a Directed Accyclic Word Graph - * Author: Mark Seaman, OCR Technology - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - *********************************************************************************/ -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ - -#ifdef _MSC_VER -#pragma warning(disable:4244) // Conversion warnings -#pragma warning(disable:4800) // int/bool warnings -#endif -#include "dawg.h" - -#include "cutil.h" -#include "dict.h" -#include "emalloc.h" -#include "freelist.h" -#include "helpers.h" -#include "strngs.h" -#include "tesscallback.h" -#include "tprintf.h" - -/*---------------------------------------------------------------------- - F u n c t i o n s f o r D a w g -----------------------------------------------------------------------*/ -namespace tesseract { - -bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, - bool requires_complete) const { - if (word.length() == 0) return !requires_complete; - NODE_REF node = 0; - int end_index = word.length() - 1; - for (int i = 0; i < end_index; i++) { - EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); - if (edge == NO_EDGE) { - return false; - } - if ((node = next_node(edge)) == 0) { - // This only happens if all words following this edge terminate -- - // there are no larger words. See Trie::add_word_to_dawg() - return false; - } - } - // Now check the last character. 
- return edge_char_of(node, word.unichar_id(end_index), requires_complete) != - NO_EDGE; -} - -bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { - return prefix_in_dawg(word, true); -} - -int Dawg::check_for_words(const char *filename, - const UNICHARSET &unicharset, - bool enable_wildcard) const { - if (filename == NULL) return 0; - - FILE *word_file; - char string [CHARS_PER_LINE]; - int misses = 0; - UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); - - word_file = open_file (filename, "r"); - - while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { - chomp_string(string); // remove newline - WERD_CHOICE word(string, unicharset); - if (word.length() > 0 && - !word.contains_unichar_id(INVALID_UNICHAR_ID)) { - if (!match_words(&word, 0, 0, - enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { - tprintf("Missing word: %s\n", string); - ++misses; - } - } else { - tprintf("Failed to create a valid word from %s\n", string); - } - } - fclose (word_file); - // Make sure the user sees this with fprintf instead of tprintf. 
- if (debug_level_) tprintf("Number of lost words=%d\n", misses); - return misses; -} - -void Dawg::iterate_words(const UNICHARSET &unicharset, - TessCallback1 *cb) const { - WERD_CHOICE word(&unicharset); - iterate_words_rec(word, 0, cb); -} - -void CallWithUTF8(TessCallback1 *cb, const WERD_CHOICE *wc) { - STRING s; - wc->string_and_lengths(&s, NULL); - cb->Run(s.string()); -} - -void Dawg::iterate_words(const UNICHARSET &unicharset, - TessCallback1 *cb) const { - TessCallback1 *shim = - NewPermanentTessCallback(CallWithUTF8, cb); - WERD_CHOICE word(&unicharset); - iterate_words_rec(word, 0, shim); - delete shim; -} - -void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, - NODE_REF to_explore, - TessCallback1 *cb) const { - NodeChildVector children; - this->unichar_ids_of(to_explore, &children, false); - for (int i = 0; i < children.size(); i++) { - WERD_CHOICE next_word(word_so_far); - next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); - if (this->end_of_word(children[i].edge_ref)) { - cb->Run(&next_word); - } - NODE_REF next = next_node(children[i].edge_ref); - if (next != 0) { - iterate_words_rec(next_word, next, cb); - } - } -} - -bool Dawg::match_words(WERD_CHOICE *word, inT32 index, - NODE_REF node, UNICHAR_ID wildcard) const { - EDGE_REF edge; - inT32 word_end; - - if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { - bool any_matched = false; - NodeChildVector vec; - this->unichar_ids_of(node, &vec, false); - for (int i = 0; i < vec.size(); ++i) { - word->set_unichar_id(vec[i].unichar_id, index); - if (match_words(word, index, node, wildcard)) - any_matched = true; - } - word->set_unichar_id(wildcard, index); - return any_matched; - } else { - word_end = index == word->length() - 1; - edge = edge_char_of(node, word->unichar_id(index), word_end); - if (edge != NO_EDGE) { // normal edge in DAWG - node = next_node(edge); - if (word_end) { - if (debug_level_ > 1) word->print("match_words() found: "); - return 
true; - } else if (node != 0) { - return match_words(word, index+1, node, wildcard); - } - } - } - return false; -} - -void Dawg::init(DawgType type, const STRING &lang, - PermuterType perm, int unicharset_size, int debug_level) { - type_ = type; - lang_ = lang; - perm_ = perm; - ASSERT_HOST(unicharset_size > 0); - unicharset_size_ = unicharset_size; - // Set bit masks. We will use the value unicharset_size_ as a null char, so - // the actual number of unichars is unicharset_size_ + 1. - flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); - next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; - letter_mask_ = ~(~0ull << flag_start_bit_); - next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); - flags_mask_ = ~(letter_mask_ | next_node_mask_); - - debug_level_ = debug_level; -} - - -/*---------------------------------------------------------------------- - F u n c t i o n s f o r S q u i s h e d D a w g -----------------------------------------------------------------------*/ - -SquishedDawg::~SquishedDawg() { memfree(edges_); } - -EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, - UNICHAR_ID unichar_id, - bool word_end) const { - EDGE_REF edge = node; - if (node == 0) { // binary search - EDGE_REF start = 0; - EDGE_REF end = num_forward_edges_in_node0 - 1; - int compare; - while (start <= end) { - edge = (start + end) >> 1; // (start + end) / 2 - compare = given_greater_than_edge_rec(NO_EDGE, word_end, - unichar_id, edges_[edge]); - if (compare == 0) { // given == vec[k] - return edge; - } else if (compare == 1) { // given > vec[k] - start = edge + 1; - } else { // given < vec[k] - end = edge - 1; - } - } - } else { // linear search - if (edge != NO_EDGE && edge_occupied(edge)) { - do { - if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && - (!word_end || end_of_word_from_edge_rec(edges_[edge]))) - return (edge); - } while (!last_edge(edge++)); - } - } - return (NO_EDGE); // not found -} - -inT32 
SquishedDawg::num_forward_edges(NODE_REF node) const { - EDGE_REF edge = node; - inT32 num = 0; - - if (forward_edge (edge)) { - do { - num++; - } while (!last_edge(edge++)); - } - - return (num); -} - -void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { - if (node == NO_EDGE) return; // nothing to print - - EDGE_REF edge = node; - const char *forward_string = "FORWARD"; - const char *backward_string = " "; - - const char *last_string = "LAST"; - const char *not_last_string = " "; - - const char *eow_string = "EOW"; - const char *not_eow_string = " "; - - const char *direction; - const char *is_last; - const char *eow; - - UNICHAR_ID unichar_id; - - if (edge_occupied(edge)) { - do { - direction = - forward_edge(edge) ? forward_string : backward_string; - is_last = last_edge(edge) ? last_string : not_last_string; - eow = end_of_word(edge) ? eow_string : not_eow_string; - - unichar_id = edge_letter(edge); - tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", - edge, next_node(edge), unichar_id, - direction, is_last, eow); - - if (edge - node > max_num_edges) return; - } while (!last_edge(edge++)); - - if (edge < num_edges_ && - edge_occupied(edge) && backward_edge(edge)) { - do { - direction = - forward_edge(edge) ? forward_string : backward_string; - is_last = last_edge(edge) ? last_string : not_last_string; - eow = end_of_word(edge) ? 
eow_string : not_eow_string; - - unichar_id = edge_letter(edge); - tprintf(REFFORMAT " : next = " REFFORMAT - ", unichar_id = %d, %s %s %s\n", - edge, next_node(edge), unichar_id, - direction, is_last, eow); - - if (edge - node > MAX_NODE_EDGES_DISPLAY) return; - } while (!last_edge(edge++)); - } - } - else { - tprintf(REFFORMAT " : no edges in this node\n", node); - } - tprintf("\n"); -} - -void SquishedDawg::print_edge(EDGE_REF edge) const { - if (edge == NO_EDGE) { - tprintf("NO_EDGE\n"); - } else { - tprintf(REFFORMAT " : next = " REFFORMAT - ", unichar_id = '%d', %s %s %s\n", edge, - next_node(edge), edge_letter(edge), - (forward_edge(edge) ? "FORWARD" : " "), - (last_edge(edge) ? "LAST" : " "), - (end_of_word(edge) ? "EOW" : "")); - } -} - -void SquishedDawg::read_squished_dawg(FILE *file, - DawgType type, - const STRING &lang, - PermuterType perm, - int debug_level) { - if (debug_level) tprintf("Reading squished dawg\n"); - - // Read the magic number and if it does not match kDawgMagicNumber - // set swap to true to indicate that we need to switch endianness. - inT16 magic; - fread(&magic, sizeof(inT16), 1, file); - bool swap = (magic != kDawgMagicNumber); - - int unicharset_size; - fread(&unicharset_size, sizeof(inT32), 1, file); - fread(&num_edges_, sizeof(inT32), 1, file); - - if (swap) { - ReverseN(&unicharset_size, sizeof(unicharset_size)); - ReverseN(&num_edges_, sizeof(num_edges_)); - } - ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty - Dawg::init(type, lang, perm, unicharset_size, debug_level); - - edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); - /* willus.com mod */ - /* - ** Very weird issue running XP version in virtual box XP emulator--needed to - ** limit the blocksize on fread() to < 2000000 (not sure of exact limit). - ** Happens during reading of eng.trained_data file. 
- */ - /* fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file); */ - { - int blocksize,index,nleft; - - blocksize=65536; - for (index=0,nleft=num_edges_;nleft > 0;) - { - int bsize; - bsize = nleft > blocksize ? blocksize : nleft; - fread(&edges_[index], sizeof(EDGE_RECORD), bsize, file); - index += bsize; - nleft -= bsize; - } - } - /* End willus.com mod */ - - EDGE_REF edge; - if (swap) { - for (edge = 0; edge < num_edges_; ++edge) { - ReverseN(&edges_[edge], sizeof(edges_[edge])); - } - } - if (debug_level > 2) { - tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", - type_, lang_.string(), perm_, unicharset_size_, num_edges_); - for (edge = 0; edge < num_edges_; ++edge) - print_edge(edge); - } -} - -NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { - EDGE_REF edge; - NODE_MAP node_map; - inT32 node_counter; - inT32 num_edges; - - node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_); - - for (edge = 0; edge < num_edges_; edge++) // init all slots - node_map [edge] = -1; - - node_counter = num_forward_edges(0); - - *num_nodes = 0; - for (edge = 0; edge < num_edges_; edge++) { // search all slots - - if (forward_edge(edge)) { - (*num_nodes)++; // count nodes links - node_map[edge] = (edge ? node_counter : 0); - num_edges = num_forward_edges(edge); - if (edge != 0) node_counter += num_edges; - edge += num_edges; - if (edge >= num_edges_) break; - if (backward_edge(edge)) while (!last_edge(edge++)); - edge--; - } - } - return (node_map); -} - -void SquishedDawg::write_squished_dawg(FILE *file) { - EDGE_REF edge; - inT32 num_edges; - inT32 node_count = 0; - NODE_MAP node_map; - EDGE_REF old_index; - EDGE_RECORD temp_record; - - if (debug_level_) tprintf("write_squished_dawg\n"); - - node_map = build_node_map(&node_count); - - // Write the magic number to help detecting a change in endianness. 
- inT16 magic = kDawgMagicNumber; - fwrite(&magic, sizeof(inT16), 1, file); - fwrite(&unicharset_size_, sizeof(inT32), 1, file); - - // Count the number of edges in this Dawg. - num_edges = 0; - for (edge=0; edge < num_edges_; edge++) - if (forward_edge(edge)) - num_edges++; - - fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file - - if (debug_level_) { - tprintf("%d nodes in DAWG\n", node_count); - tprintf("%d edges in DAWG\n", num_edges); - } - - for (edge = 0; edge < num_edges_; edge++) { - if (forward_edge(edge)) { // write forward edges - do { - old_index = next_node_from_edge_rec(edges_[edge]); - set_next_node(edge, node_map[old_index]); - temp_record = edges_[edge]; - fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file); - set_next_node(edge, old_index); - } while (!last_edge(edge++)); - - if (edge >= num_edges_) break; - if (backward_edge(edge)) // skip back links - while (!last_edge(edge++)); - - edge--; - } - } - free(node_map); -} - -} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/genericvector.h k2pdfopt-2.51+ds/tesseract_mod/genericvector.h --- k2pdfopt-2.42+ds/tesseract_mod/genericvector.h 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/genericvector.h 2018-12-24 15:55:24.000000000 +0000 @@ -0,0 +1,1149 @@ +/////////////////////////////////////////////////////////////////////// +// File: genericvector.h +// Description: Generic vector class +// Author: Daria Antonova +// Created: Mon Jun 23 11:26:43 PDT 2008 +// +// (C) Copyright 2007, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// +// +#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_ +#define TESSERACT_CCUTIL_GENERICVECTOR_H_ + +#include +#include +#include +#include + +#include "tesscallback.h" +#include "helpers.h" +#include "serialis.h" +#include "strngs.h" + +// Use PointerVector below in preference to GenericVector, as that +// provides automatic deletion of pointers, [De]Serialize that works, and +// sort that works. +template +class GenericVector { + public: + GenericVector() { + init(kDefaultVectorSize); + } + GenericVector(int size, const T& init_val) { + init(size); + init_to_size(size, init_val); + } + + // Copy + GenericVector(const GenericVector& other) { + this->init(other.size()); + this->operator+=(other); + } + GenericVector &operator+=(const GenericVector& other); + GenericVector &operator=(const GenericVector& other); + + ~GenericVector(); + + // Reserve some memory. + void reserve(int size); + // Double the size of the internal array. + void double_the_size(); + + // Resizes to size and sets all values to t. + void init_to_size(int size, const T& t); + // Resizes to size without any initialization. + void resize_no_init(int size) { + reserve(size); + size_used_ = size; + } + + // Return the size used. + int size() const { + return size_used_; + } + // Workaround to avoid g++ -Wsign-compare warnings. + size_t unsigned_size() const { + static_assert(sizeof(size_used_) <= sizeof(size_t), + "Wow! sizeof(size_t) < sizeof(int32_t)!!"); + assert(0 <= size_used_); + return static_cast(size_used_); + } + int size_reserved() const { + return size_reserved_; + } + + int length() const { + return size_used_; + } + + // Return true if empty. + bool empty() const { + return size_used_ == 0; + } + + // Return the object from an index. 
+ T &get(int index) const; + T &back() const; + T &operator[](int index) const; + // Returns the last object and removes it. + T pop_back(); + + // Return the index of the T object. + // This method NEEDS a compare_callback to be passed to + // set_compare_callback. + int get_index(const T& object) const; + + // Return true if T is in the array + bool contains(const T& object) const; + + // Return true if the index is valid + T contains_index(int index) const; + + // Push an element in the end of the array + int push_back(T object); + void operator+=(const T& t); + + // Push an element in the end of the array if the same + // element is not already contained in the array. + int push_back_new(const T& object); + + // Push an element in the front of the array + // Note: This function is O(n) + int push_front(const T& object); + + // Set the value at the given index + void set(const T& t, int index); + + // Insert t at the given index, push other elements to the right. + void insert(const T& t, int index); + + // Removes an element at the given index and + // shifts the remaining elements to the left. + void remove(int index); + + // Truncates the array to the given size by removing the end. + // If the current size is less, the array is not expanded. + void truncate(int size) { + if (size < size_used_) + size_used_ = size; + } + + // Add a callback to be called to delete the elements when the array took + // their ownership. + void set_clear_callback(TessCallback1* cb); + + // Add a callback to be called to compare the elements when needed (contains, + // get_id, ...) + void set_compare_callback(TessResultCallback2* cb); + + // Clear the array, calling the clear callback function if any. + // All the owned callbacks are also deleted. + // If you don't want the callbacks to be deleted, before calling clear, set + // the callback to nullptr. 
+ void clear(); + + // Delete objects pointed to by data_[i] + void delete_data_pointers(); + + // This method clears the current object, then, does a shallow copy of + // its argument, and finally invalidates its argument. + // Callbacks are moved to the current object; + void move(GenericVector* from); + + // Read/Write the array to a file. This does _NOT_ read/write the callbacks. + // The callback given must be permanent since they will be called more than + // once. The given callback will be deleted at the end. + // If the callbacks are nullptr, then the data is simply read/written using + // fread (and swapping)/fwrite. + // Returns false on error or if the callback returns false. + // DEPRECATED. Use [De]Serialize[Classes] instead. + bool write(FILE* f, TessResultCallback2* cb) const; + bool read(tesseract::TFile* f, + TessResultCallback2* cb); + // Writes a vector of simple types to the given file. Assumes that bitwise + // read/write of T will work. Returns false in case of error. + // TODO(rays) Change all callers to use TFile and remove deprecated methods. + bool Serialize(FILE* fp) const; + bool Serialize(tesseract::TFile* fp) const; + // Reads a vector of simple types from the given file. Assumes that bitwise + // read/write will work with ReverseN according to sizeof(T). + // Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + // TFile is assumed to know about swapping. + bool DeSerialize(bool swap, FILE* fp); + bool DeSerialize(tesseract::TFile* fp); + // Skips the deserialization of the vector. + static bool SkipDeSerialize(tesseract::TFile* fp); + // Writes a vector of classes to the given file. Assumes the existence of + // bool T::Serialize(FILE* fp) const that returns false in case of error. + // Returns false in case of error. + bool SerializeClasses(FILE* fp) const; + bool SerializeClasses(tesseract::TFile* fp) const; + // Reads a vector of classes from the given file. 
Assumes the existence of + // bool T::Deserialize(bool swap, FILE* fp) that returns false in case of + // error. Also needs T::T() and T::T(constT&), as init_to_size is used in + // this function. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + bool DeSerializeClasses(bool swap, FILE* fp); + bool DeSerializeClasses(tesseract::TFile* fp); + // Calls SkipDeSerialize on the elements of the vector. + static bool SkipDeSerializeClasses(tesseract::TFile* fp); + + // Allocates a new array of double the current_size, copies over the + // information from data to the new location, deletes data and returns + // the pointed to the new larger array. + // This function uses memcpy to copy the data, instead of invoking + // operator=() for each element like double_the_size() does. + static T *double_the_size_memcpy(int current_size, T *data) { + T *data_new = new T[current_size * 2]; + memcpy(data_new, data, sizeof(T) * current_size); + delete[] data; + return data_new; + } + + // Reverses the elements of the vector. + void reverse() { + for (int i = 0; i < size_used_ / 2; ++i) + Swap(&data_[i], &data_[size_used_ - 1 - i]); + } + + // Sorts the members of this vector using the less than comparator (cmp_lt), + // which compares the values. Useful for GenericVectors to primitive types. + // Will not work so great for pointers (unless you just want to sort some + // pointers). You need to provide a specialization to sort_cmp to use + // your type. + void sort(); + + // Sort the array into the order defined by the qsort function comparator. + // The comparator function is as defined by qsort, ie. it receives pointers + // to two Ts and returns negative if the first element is to appear earlier + // in the result and positive if it is to appear later, with 0 for equal. 
+ void sort(int (*comparator)(const void*, const void*)) { + qsort(data_, size_used_, sizeof(*data_), comparator); + } + + // Searches the array (assuming sorted in ascending order, using sort()) for + // an element equal to target and returns true if it is present. + // Use binary_search to get the index of target, or its nearest candidate. + bool bool_binary_search(const T& target) const { + int index = binary_search(target); + if (index >= size_used_) + return false; + return data_[index] == target; + } + // Searches the array (assuming sorted in ascending order, using sort()) for + // an element equal to target and returns the index of the best candidate. + // The return value is conceptually the largest index i such that + // data_[i] <= target or 0 if target < the whole vector. + // NOTE that this function uses operator> so really the return value is + // the largest index i such that data_[i] > target is false. + int binary_search(const T& target) const { + int bottom = 0; + int top = size_used_; + while (top - bottom > 1) { + int middle = (bottom + top) / 2; + if (data_[middle] > target) + top = middle; + else + bottom = middle; + } + return bottom; + } + + // Compact the vector by deleting elements using operator!= on basic types. + // The vector must be sorted. + void compact_sorted() { + if (size_used_ == 0) + return; + + // First element is in no matter what, hence the i = 1. + int last_write = 0; + for (int i = 1; i < size_used_; ++i) { + // Finds next unique item and writes it. + if (data_[last_write] != data_[i]) + data_[++last_write] = data_[i]; + } + // last_write is the index of a valid data cell, so add 1. + size_used_ = last_write + 1; + } + + // Compact the vector by deleting elements for which delete_cb returns + // true. delete_cb is a permanent callback and will be deleted. + void compact(TessResultCallback1* delete_cb) { + int new_size = 0; + int old_index = 0; + // Until the callback returns true, the elements stay the same. 
+ while (old_index < size_used_ && !delete_cb->Run(old_index++)) + ++new_size; + // Now just copy anything else that gets false from delete_cb. + for (; old_index < size_used_; ++old_index) { + if (!delete_cb->Run(old_index)) { + data_[new_size++] = data_[old_index]; + } + } + size_used_ = new_size; + delete delete_cb; + } + + T dot_product(const GenericVector& other) const { + T result = static_cast(0); + for (int i = std::min(size_used_, other.size_used_) - 1; i >= 0; --i) + result += data_[i] * other.data_[i]; + return result; + } + + // Returns the index of what would be the target_index_th item in the array + // if the members were sorted, without actually sorting. Members are + // shuffled around, but it takes O(n) time. + // NOTE: uses operator< and operator== on the members. + int choose_nth_item(int target_index) { + // Make sure target_index is legal. + if (target_index < 0) + target_index = 0; // ensure legal + else if (target_index >= size_used_) + target_index = size_used_ - 1; + unsigned int seed = 1; + return choose_nth_item(target_index, 0, size_used_, &seed); + } + + // Swaps the elements with the given indices. + void swap(int index1, int index2) { + if (index1 != index2) { + T tmp = data_[index1]; + data_[index1] = data_[index2]; + data_[index2] = tmp; + } + } + // Returns true if all elements of *this are within the given range. + // Only uses operator< + bool WithinBounds(const T& rangemin, const T& rangemax) const { + for (int i = 0; i < size_used_; ++i) { + if (data_[i] < rangemin || rangemax < data_[i]) + return false; + } + return true; + } + + protected: + // Internal recursive version of choose_nth_item. + int choose_nth_item(int target_index, int start, int end, unsigned int* seed); + + // Init the object, allocating size memory. + void init(int size); + + // We are assuming that the object generally placed in the + // vector are small enough that for efficiency it makes sense + // to start with a larger initial size. 
+ static const int kDefaultVectorSize = 4; + int32_t size_used_; + int32_t size_reserved_; + T* data_; + TessCallback1* clear_cb_; + // Mutable because Run method is not const + mutable TessResultCallback2* compare_cb_; +}; + +namespace tesseract { + +// Function to read a GenericVector from a whole file. +// Returns false on failure. +typedef bool (*FileReader)(const STRING& filename, GenericVector* data); +// Function to write a GenericVector to a whole file. +// Returns false on failure. +typedef bool (*FileWriter)(const GenericVector& data, + const STRING& filename); +// The default FileReader loads the whole file into the vector of char, +// returning false on error. +inline bool LoadDataFromFile(const char* filename, GenericVector* data) { + bool result = false; + FILE* fp = fopen(filename, "rb"); + if (fp != nullptr) { + fseek(fp, 0, SEEK_END); + long size = ftell(fp); + fseek(fp, 0, SEEK_SET); + // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here. + if (size > 0 && size < LONG_MAX) { + // reserve an extra byte in case caller wants to append a '\0' character + data->reserve(size + 1); + data->resize_no_init(size); + /* willus mod Dec 2018--weird issue with Win XP and MinGW gcc 7.3.0 */ + /* Can't read entire file at once -- need to break up into smaller blocksize reads */ + { + int frs,n; + int blocksize; + blocksize=1024*1024; + for (n=0;1;) + { + int bs; + bs= size-n > blocksize ? blocksize : size-n; + frs=(int)fread(&(*data)[n],1,bs,fp); + n+=frs; + if (frs=size) + break; + } + result = static_cast((long)n==size); + } + /* + result = static_cast(fread(&(*data)[0], 1, size, fp)) == size; + */ + } + fclose(fp); + } + return result; +} + +inline bool LoadDataFromFile(const STRING& filename, + GenericVector* data) { + return LoadDataFromFile(filename.string(), data); +} + +// The default FileWriter writes the vector of char to the filename file, +// returning false on error. 
+inline bool SaveDataToFile(const GenericVector& data, + const STRING& filename) { + FILE* fp = fopen(filename.string(), "wb"); + if (fp == nullptr) return false; + bool result = + static_cast(fwrite(&data[0], 1, data.size(), fp)) == data.size(); + fclose(fp); + return result; +} +// Reads a file as a vector of STRING. +inline bool LoadFileLinesToStrings(const STRING& filename, + GenericVector* lines) { + GenericVector data; + if (!LoadDataFromFile(filename.string(), &data)) { + return false; + } + STRING lines_str(&data[0], data.size()); + lines_str.split('\n', lines); + return true; +} + +template +bool cmp_eq(T const & t1, T const & t2) { + return t1 == t2; +} + +// Used by sort() +// return < 0 if t1 < t2 +// return 0 if t1 == t2 +// return > 0 if t1 > t2 +template +int sort_cmp(const void* t1, const void* t2) { + const T* a = static_cast (t1); + const T* b = static_cast (t2); + if (*a < *b) { + return -1; + } else if (*b < *a) { + return 1; + } else { + return 0; + } +} + +// Used by PointerVector::sort() +// return < 0 if t1 < t2 +// return 0 if t1 == t2 +// return > 0 if t1 > t2 +template +int sort_ptr_cmp(const void* t1, const void* t2) { + const T* a = *static_cast(t1); + const T* b = *static_cast(t2); + if (*a < *b) { + return -1; + } else if (*b < *a) { + return 1; + } else { + return 0; + } +} + +// Subclass for a vector of pointers. Use in preference to GenericVector +// as it provides automatic deletion and correct serialization, with the +// corollary that all copy operations are deep copies of the pointed-to objects. +template +class PointerVector : public GenericVector { + public: + PointerVector() : GenericVector() { } + explicit PointerVector(int size) : GenericVector(size) { } + ~PointerVector() { + // Clear must be called here, even though it is called again by the base, + // as the base will call the wrong clear. + clear(); + } + // Copy must be deep, as the pointers will be automatically deleted on + // destruction. 
+ PointerVector(const PointerVector& other) : GenericVector(other) { + this->init(other.size()); + this->operator+=(other); + } + PointerVector& operator+=(const PointerVector& other) { + this->reserve(this->size_used_ + other.size_used_); + for (int i = 0; i < other.size(); ++i) { + this->push_back(new T(*other.data_[i])); + } + return *this; + } + + PointerVector& operator=(const PointerVector& other) { + if (&other != this) { + this->truncate(0); + this->operator+=(other); + } + return *this; + } + + // Removes an element at the given index and + // shifts the remaining elements to the left. + void remove(int index) { + delete GenericVector::data_[index]; + GenericVector::remove(index); + } + + // Truncates the array to the given size by removing the end. + // If the current size is less, the array is not expanded. + void truncate(int size) { + for (int i = size; i < GenericVector::size_used_; ++i) + delete GenericVector::data_[i]; + GenericVector::truncate(size); + } + + // Compact the vector by deleting elements for which delete_cb returns + // true. delete_cb is a permanent callback and will be deleted. + void compact(TessResultCallback1* delete_cb) { + int new_size = 0; + int old_index = 0; + // Until the callback returns true, the elements stay the same. + while (old_index < GenericVector::size_used_ && + !delete_cb->Run(GenericVector::data_[old_index++])) + ++new_size; + // Now just copy anything else that gets false from delete_cb. + for (; old_index < GenericVector::size_used_; ++old_index) { + if (!delete_cb->Run(GenericVector::data_[old_index])) { + GenericVector::data_[new_size++] = + GenericVector::data_[old_index]; + } else { + delete GenericVector::data_[old_index]; + } + } + GenericVector::size_used_ = new_size; + delete delete_cb; + } + + // Clear the array, calling the clear callback function if any. + // All the owned callbacks are also deleted. 
+ // If you don't want the callbacks to be deleted, before calling clear, set + // the callback to nullptr. + void clear() { + GenericVector::delete_data_pointers(); + GenericVector::clear(); + } + + // Writes a vector of (pointers to) classes to the given file. Assumes the + // existence of bool T::Serialize(FILE*) const that returns false in case of + // error. There is no Serialize for simple types, as you would have a + // normal GenericVector of those. + // Returns false in case of error. + bool Serialize(FILE* fp) const { + int32_t used = GenericVector::size_used_; + if (fwrite(&used, sizeof(used), 1, fp) != 1) return false; + for (int i = 0; i < used; ++i) { + int8_t non_null = GenericVector::data_[i] != nullptr; + if (fwrite(&non_null, sizeof(non_null), 1, fp) != 1) return false; + if (non_null && !GenericVector::data_[i]->Serialize(fp)) return false; + } + return true; + } + bool Serialize(TFile* fp) const { + int32_t used = GenericVector::size_used_; + if (fp->FWrite(&used, sizeof(used), 1) != 1) return false; + for (int i = 0; i < used; ++i) { + int8_t non_null = GenericVector::data_[i] != nullptr; + if (fp->FWrite(&non_null, sizeof(non_null), 1) != 1) return false; + if (non_null && !GenericVector::data_[i]->Serialize(fp)) return false; + } + return true; + } + // Reads a vector of (pointers to) classes to the given file. Assumes the + // existence of bool T::DeSerialize(bool, Tfile*) const that returns false in + // case of error. There is no Serialize for simple types, as you would have a + // normal GenericVector of those. + // If swap is true, assumes a big/little-endian swap is needed. + // Also needs T::T(), as new T is used in this function. + // Returns false in case of error. + bool DeSerialize(bool swap, FILE* fp) { + uint32_t reserved; + if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false; + if (swap) Reverse32(&reserved); + // Arbitrarily limit the number of elements to protect against bad data. 
+ assert(reserved <= UINT16_MAX); + if (reserved > UINT16_MAX) { + return false; + } + GenericVector::reserve(reserved); + truncate(0); + for (uint32_t i = 0; i < reserved; ++i) { + int8_t non_null; + if (fread(&non_null, sizeof(non_null), 1, fp) != 1) return false; + T* item = nullptr; + if (non_null) { + item = new T; + if (!item->DeSerialize(swap, fp)) { + delete item; + return false; + } + this->push_back(item); + } else { + // Null elements should keep their place in the vector. + this->push_back(nullptr); + } + } + return true; + } + bool DeSerialize(TFile* fp) { + int32_t reserved; + if (!DeSerializeSize(fp, &reserved)) return false; + GenericVector::reserve(reserved); + truncate(0); + for (int i = 0; i < reserved; ++i) { + if (!DeSerializeElement(fp)) return false; + } + return true; + } + // Enables deserialization of a selection of elements. Note that in order to + // retain the integrity of the stream, the caller must call some combination + // of DeSerializeElement and DeSerializeSkip of the exact number returned in + // *size, assuming a true return. + static bool DeSerializeSize(TFile* fp, int32_t* size) { + return fp->FReadEndian(size, sizeof(*size), 1) == 1; + } + // Reads and appends to the vector the next element of the serialization. + bool DeSerializeElement(TFile* fp) { + int8_t non_null; + if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false; + T* item = nullptr; + if (non_null) { + item = new T; + if (!item->DeSerialize(fp)) { + delete item; + return false; + } + this->push_back(item); + } else { + // Null elements should keep their place in the vector. + this->push_back(nullptr); + } + return true; + } + // Skips the next element of the serialization. 
+ static bool DeSerializeSkip(TFile* fp) { + int8_t non_null; + if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false; + if (non_null) { + if (!T::SkipDeSerialize(fp)) return false; + } + return true; + } + + // Sorts the items pointed to by the members of this vector using + // t::operator<(). + void sort() { this->GenericVector::sort(&sort_ptr_cmp); } +}; + +} // namespace tesseract + +// A useful vector that uses operator== to do comparisons. +template +class GenericVectorEqEq : public GenericVector { + public: + GenericVectorEqEq() { + GenericVector::set_compare_callback( + NewPermanentTessCallback(tesseract::cmp_eq)); + } + GenericVectorEqEq(int size) : GenericVector(size) { + GenericVector::set_compare_callback( + NewPermanentTessCallback(tesseract::cmp_eq)); + } +}; + +template +void GenericVector::init(int size) { + size_used_ = 0; + if (size <= 0) { + data_ = nullptr; + size_reserved_ = 0; + } else { + if (size < kDefaultVectorSize) size = kDefaultVectorSize; + data_ = new T[size]; + size_reserved_ = size; + } + clear_cb_ = nullptr; + compare_cb_ = nullptr; +} + +template +GenericVector::~GenericVector() { + clear(); +} + +// Reserve some memory. If the internal array contains elements, they are +// copied. +template +void GenericVector::reserve(int size) { + if (size_reserved_ >= size || size <= 0) + return; + if (size < kDefaultVectorSize) size = kDefaultVectorSize; + T* new_array = new T[size]; + for (int i = 0; i < size_used_; ++i) + new_array[i] = data_[i]; + delete[] data_; + data_ = new_array; + size_reserved_ = size; +} + +template +void GenericVector::double_the_size() { + if (size_reserved_ == 0) { + reserve(kDefaultVectorSize); + } + else { + reserve(2 * size_reserved_); + } +} + +// Resizes to size and sets all values to t. +template +void GenericVector::init_to_size(int size, const T& t) { + reserve(size); + size_used_ = size; + for (int i = 0; i < size; ++i) + data_[i] = t; +} + + +// Return the object from an index. 
+template +T &GenericVector::get(int index) const { + assert(index >= 0 && index < size_used_); + return data_[index]; +} + +template +T &GenericVector::operator[](int index) const { + assert(index >= 0 && index < size_used_); + return data_[index]; +} + +template +T &GenericVector::back() const { + assert(size_used_ > 0); + return data_[size_used_ - 1]; +} +// Returns the last object and removes it. +template +T GenericVector::pop_back() { + assert(size_used_ > 0); + return data_[--size_used_]; +} + +// Return the object from an index. +template +void GenericVector::set(const T& t, int index) { + assert(index >= 0 && index < size_used_); + data_[index] = t; +} + +// Shifts the rest of the elements to the right to make +// space for the new elements and inserts the given element +// at the specified index. +template +void GenericVector::insert(const T& t, int index) { + assert(index >= 0 && index <= size_used_); + if (size_reserved_ == size_used_) + double_the_size(); + for (int i = size_used_; i > index; --i) { + data_[i] = data_[i-1]; + } + data_[index] = t; + size_used_++; +} + +// Removes an element at the given index and +// shifts the remaining elements to the left. +template +void GenericVector::remove(int index) { + assert(index >= 0 && index < size_used_); + for (int i = index; i < size_used_ - 1; ++i) { + data_[i] = data_[i+1]; + } + size_used_--; +} + +// Return true if the index is valindex +template +T GenericVector::contains_index(int index) const { + return index >= 0 && index < size_used_; +} + +// Return the index of the T object. 
+template +int GenericVector::get_index(const T& object) const { + for (int i = 0; i < size_used_; ++i) { + assert(compare_cb_ != nullptr); + if (compare_cb_->Run(object, data_[i])) + return i; + } + return -1; +} + +// Return true if T is in the array +template +bool GenericVector::contains(const T& object) const { + return get_index(object) != -1; +} + +// Add an element in the array +template +int GenericVector::push_back(T object) { + int index = 0; + if (size_used_ == size_reserved_) + double_the_size(); + index = size_used_++; + data_[index] = object; + return index; +} + +template +int GenericVector::push_back_new(const T& object) { + int index = get_index(object); + if (index >= 0) + return index; + return push_back(object); +} + +// Add an element in the array (front) +template +int GenericVector::push_front(const T& object) { + if (size_used_ == size_reserved_) + double_the_size(); + for (int i = size_used_; i > 0; --i) + data_[i] = data_[i-1]; + data_[0] = object; + ++size_used_; + return 0; +} + +template +void GenericVector::operator+=(const T& t) { + push_back(t); +} + +template +GenericVector &GenericVector::operator+=(const GenericVector& other) { + this->reserve(size_used_ + other.size_used_); + for (int i = 0; i < other.size(); ++i) { + this->operator+=(other.data_[i]); + } + return *this; +} + +template +GenericVector &GenericVector::operator=(const GenericVector& other) { + if (&other != this) { + this->truncate(0); + this->operator+=(other); + } + return *this; +} + +// Add a callback to be called to delete the elements when the array took +// their ownership. +template +void GenericVector::set_clear_callback(TessCallback1* cb) { + clear_cb_ = cb; +} + +// Add a callback to be called to delete the elements when the array took +// their ownership. +template +void GenericVector::set_compare_callback( + TessResultCallback2* cb) { + compare_cb_ = cb; +} + +// Clear the array, calling the callback function if any. 
+template +void GenericVector::clear() { + if (size_reserved_ > 0 && clear_cb_ != nullptr) { + for (int i = 0; i < size_used_; ++i) + clear_cb_->Run(data_[i]); + } + delete[] data_; + data_ = nullptr; + size_used_ = 0; + size_reserved_ = 0; + delete clear_cb_; + clear_cb_ = nullptr; + delete compare_cb_; + compare_cb_ = nullptr; +} + +template +void GenericVector::delete_data_pointers() { + for (int i = 0; i < size_used_; ++i) { + delete data_[i]; + } +} + + +template +bool GenericVector::write( + FILE* f, TessResultCallback2* cb) const { + if (fwrite(&size_reserved_, sizeof(size_reserved_), 1, f) != 1) return false; + if (fwrite(&size_used_, sizeof(size_used_), 1, f) != 1) return false; + if (cb != nullptr) { + for (int i = 0; i < size_used_; ++i) { + if (!cb->Run(f, data_[i])) { + delete cb; + return false; + } + } + delete cb; + } else { + if (fwrite(data_, sizeof(T), size_used_, f) != unsigned_size()) + return false; + } + return true; +} + +template +bool GenericVector::read( + tesseract::TFile* f, TessResultCallback2* cb) { + int32_t reserved; + if (f->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + reserve(reserved); + if (f->FReadEndian(&size_used_, sizeof(size_used_), 1) != 1) return false; + if (cb != nullptr) { + for (int i = 0; i < size_used_; ++i) { + if (!cb->Run(f, data_ + i)) { + delete cb; + return false; + } + } + delete cb; + } else { + if (f->FReadEndian(data_, sizeof(T), size_used_) != size_used_) + return false; + } + return true; +} + +// Writes a vector of simple types to the given file. Assumes that bitwise +// read/write of T will work. Returns false in case of error. 
+template +bool GenericVector::Serialize(FILE* fp) const { + if (fwrite(&size_used_, sizeof(size_used_), 1, fp) != 1) return false; + if (fwrite(data_, sizeof(*data_), size_used_, fp) != unsigned_size()) + return false; + return true; +} +template +bool GenericVector::Serialize(tesseract::TFile* fp) const { + if (fp->FWrite(&size_used_, sizeof(size_used_), 1) != 1) return false; + if (fp->FWrite(data_, sizeof(*data_), size_used_) != size_used_) return false; + return true; +} + +// Reads a vector of simple types from the given file. Assumes that bitwise +// read/write will work with ReverseN according to sizeof(T). +// Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +template +bool GenericVector::DeSerialize(bool swap, FILE* fp) { + uint32_t reserved; + if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false; + if (swap) Reverse32(&reserved); + // Arbitrarily limit the number of elements to protect against bad data. + assert(reserved <= UINT16_MAX); + if (reserved > UINT16_MAX) return false; + reserve(reserved); + size_used_ = reserved; + if (fread(data_, sizeof(T), size_used_, fp) != unsigned_size()) return false; + if (swap) { + for (int i = 0; i < size_used_; ++i) + ReverseN(&data_[i], sizeof(data_[i])); + } + return true; +} +template +bool GenericVector::DeSerialize(tesseract::TFile* fp) { + uint32_t reserved; + if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + // Arbitrarily limit the number of elements to protect against bad data. 
+ const uint32_t limit = 50000000; + assert(reserved <= limit); + if (reserved > limit) return false; + reserve(reserved); + size_used_ = reserved; + return fp->FReadEndian(data_, sizeof(T), size_used_) == size_used_; +} +template +bool GenericVector::SkipDeSerialize(tesseract::TFile* fp) { + uint32_t reserved; + if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + return fp->FRead(nullptr, sizeof(T), reserved) == reserved; +} + +// Writes a vector of classes to the given file. Assumes the existence of +// bool T::Serialize(FILE* fp) const that returns false in case of error. +// Returns false in case of error. +template +bool GenericVector::SerializeClasses(FILE* fp) const { + if (fwrite(&size_used_, sizeof(size_used_), 1, fp) != 1) return false; + for (int i = 0; i < size_used_; ++i) { + if (!data_[i].Serialize(fp)) return false; + } + return true; +} +template +bool GenericVector::SerializeClasses(tesseract::TFile* fp) const { + if (fp->FWrite(&size_used_, sizeof(size_used_), 1) != 1) return false; + for (int i = 0; i < size_used_; ++i) { + if (!data_[i].Serialize(fp)) return false; + } + return true; +} + +// Reads a vector of classes from the given file. Assumes the existence of +// bool T::Deserialize(bool swap, FILE* fp) that returns false in case of +// error. Also needs T::T() and T::T(constT&), as init_to_size is used in +// this function. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. 
+template +bool GenericVector::DeSerializeClasses(bool swap, FILE* fp) { + int32_t reserved; + if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false; + if (swap) Reverse32(&reserved); + T empty; + init_to_size(reserved, empty); + for (int i = 0; i < reserved; ++i) { + if (!data_[i].DeSerialize(swap, fp)) return false; + } + return true; +} +template +bool GenericVector::DeSerializeClasses(tesseract::TFile* fp) { + int32_t reserved; + if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + T empty; + init_to_size(reserved, empty); + for (int i = 0; i < reserved; ++i) { + if (!data_[i].DeSerialize(fp)) return false; + } + return true; +} +template +bool GenericVector::SkipDeSerializeClasses(tesseract::TFile* fp) { + int32_t reserved; + if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + for (int i = 0; i < reserved; ++i) { + if (!T::SkipDeSerialize(fp)) return false; + } + return true; +} + +// This method clear the current object, then, does a shallow copy of +// its argument, and finally invalidates its argument. +template +void GenericVector::move(GenericVector* from) { + this->clear(); + this->data_ = from->data_; + this->size_reserved_ = from->size_reserved_; + this->size_used_ = from->size_used_; + this->compare_cb_ = from->compare_cb_; + this->clear_cb_ = from->clear_cb_; + from->data_ = nullptr; + from->clear_cb_ = nullptr; + from->compare_cb_ = nullptr; + from->size_used_ = 0; + from->size_reserved_ = 0; +} + +template +void GenericVector::sort() { + sort(&tesseract::sort_cmp); +} + +// Internal recursive version of choose_nth_item. 
+// The algorithm used comes from "Algorithms" by Sedgewick: +// http://books.google.com/books/about/Algorithms.html?id=idUdqdDXqnAC +// The principle is to choose a random pivot, and move everything less than +// the pivot to its left, and everything greater than the pivot to the end +// of the array, then recurse on the part that contains the desired index, or +// just return the answer if it is in the equal section in the middle. +// The random pivot guarantees average linear time for the same reason that +// n times vector::push_back takes linear time on average. +// target_index, start and and end are all indices into the full array. +// Seed is a seed for rand_r for thread safety purposes. Its value is +// unimportant as the random numbers do not affect the result except +// between equal answers. +template +int GenericVector::choose_nth_item(int target_index, int start, int end, + unsigned int* seed) { + // Number of elements to process. + int num_elements = end - start; + // Trivial cases. + if (num_elements <= 1) + return start; + if (num_elements == 2) { + if (data_[start] < data_[start + 1]) { + return target_index > start ? start + 1 : start; + } else { + return target_index > start ? start : start + 1; + } + } + // Place the pivot at start. + #ifndef rand_r // _MSC_VER, ANDROID + srand(*seed); + #define rand_r(seed) rand() + #endif // _MSC_VER + int pivot = rand_r(seed) % num_elements + start; + swap(pivot, start); + // The invariant condition here is that items [start, next_lesser) are less + // than the pivot (which is at index next_lesser) and items + // [prev_greater, end) are greater than the pivot, with items + // [next_lesser, prev_greater) being equal to the pivot. 
+ int next_lesser = start; + int prev_greater = end; + for (int next_sample = start + 1; next_sample < prev_greater;) { + if (data_[next_sample] < data_[next_lesser]) { + swap(next_lesser++, next_sample++); + } else if (data_[next_sample] == data_[next_lesser]) { + ++next_sample; + } else { + swap(--prev_greater, next_sample); + } + } + // Now the invariant is set up, we recurse on just the section that contains + // the desired index. + if (target_index < next_lesser) + return choose_nth_item(target_index, start, next_lesser, seed); + else if (target_index < prev_greater) + return next_lesser; // In equal bracket. + else + return choose_nth_item(target_index, prev_greater, end, seed); +} + + +#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_ diff -Nru k2pdfopt-2.42+ds/tesseract_mod/imagedata.cpp k2pdfopt-2.51+ds/tesseract_mod/imagedata.cpp --- k2pdfopt-2.42+ds/tesseract_mod/imagedata.cpp 2017-02-25 15:40:02.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/imagedata.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,707 +0,0 @@ -#include "config_auto.h" -/////////////////////////////////////////////////////////////////////// -// File: imagedata.h -// Description: Class to hold information about a single multi-page tiff -// training file and its corresponding boxes or text file. -// Author: Ray Smith -// Created: Tue May 28 08:56:06 PST 2013 -// -// (C) Copyright 2013, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-/////////////////////////////////////////////////////////////////////// - -// Include automatically generated configuration file if running autoconf. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif - -#include "imagedata.h" - -#include "allheaders.h" -#include "boxread.h" -#include "callcpp.h" -#include "helpers.h" -#include "tprintf.h" - -/* willus mod -- force include of unistd.h */ -/* -#if defined(__MINGW32__) -# include -#elif __cplusplus > 199711L // in C++11 -# include -#endif -*/ -# include - -// Number of documents to read ahead while training. Doesn't need to be very -// large. -const int kMaxReadAhead = 8; - -namespace tesseract { - -WordFeature::WordFeature() : x_(0), y_(0), dir_(0) { -} - -WordFeature::WordFeature(const FCOORD& fcoord, uinT8 dir) - : x_(IntCastRounded(fcoord.x())), - y_(ClipToRange(IntCastRounded(fcoord.y()), 0, MAX_UINT8)), - dir_(dir) { -} - -// Computes the maximum x and y value in the features. -void WordFeature::ComputeSize(const GenericVector& features, - int* max_x, int* max_y) { - *max_x = 0; - *max_y = 0; - for (int f = 0; f < features.size(); ++f) { - if (features[f].x_ > *max_x) *max_x = features[f].x_; - if (features[f].y_ > *max_y) *max_y = features[f].y_; - } -} - -// Draws the features in the given window. -void WordFeature::Draw(const GenericVector& features, - ScrollView* window) { -#ifndef GRAPHICS_DISABLED - for (int f = 0; f < features.size(); ++f) { - FCOORD pos(features[f].x_, features[f].y_); - FCOORD dir; - dir.from_direction(features[f].dir_); - dir *= 8.0f; - window->SetCursor(IntCastRounded(pos.x() - dir.x()), - IntCastRounded(pos.y() - dir.y())); - window->DrawTo(IntCastRounded(pos.x() + dir.x()), - IntCastRounded(pos.y() + dir.y())); - } -#endif -} - -// Writes to the given file. Returns false in case of error. 
-bool WordFeature::Serialize(FILE* fp) const { - if (fwrite(&x_, sizeof(x_), 1, fp) != 1) return false; - if (fwrite(&y_, sizeof(y_), 1, fp) != 1) return false; - if (fwrite(&dir_, sizeof(dir_), 1, fp) != 1) return false; - return true; -} -// Reads from the given file. Returns false in case of error. -// If swap is true, assumes a big/little-endian swap is needed. -bool WordFeature::DeSerialize(bool swap, FILE* fp) { - if (fread(&x_, sizeof(x_), 1, fp) != 1) return false; - if (swap) ReverseN(&x_, sizeof(x_)); - if (fread(&y_, sizeof(y_), 1, fp) != 1) return false; - if (fread(&dir_, sizeof(dir_), 1, fp) != 1) return false; - return true; -} - -void FloatWordFeature::FromWordFeatures( - const GenericVector& word_features, - GenericVector* float_features) { - for (int i = 0; i < word_features.size(); ++i) { - FloatWordFeature f; - f.x = word_features[i].x(); - f.y = word_features[i].y(); - f.dir = word_features[i].dir(); - f.x_bucket = 0; // Will set it later. - float_features->push_back(f); - } -} - -// Sort function to sort first by x-bucket, then by y. -/* static */ -int FloatWordFeature::SortByXBucket(const void* v1, const void* v2) { - const FloatWordFeature* f1 = reinterpret_cast(v1); - const FloatWordFeature* f2 = reinterpret_cast(v2); - int x_diff = f1->x_bucket - f2->x_bucket; - if (x_diff == 0) return f1->y - f2->y; - return x_diff; -} - -ImageData::ImageData() : page_number_(-1), vertical_text_(false) { -} -// Takes ownership of the pix and destroys it. -ImageData::ImageData(bool vertical, Pix* pix) - : page_number_(0), vertical_text_(vertical) { - SetPix(pix); -} -ImageData::~ImageData() { -} - -// Builds and returns an ImageData from the basic data. Note that imagedata, -// truth_text, and box_text are all the actual file data, NOT filenames. 
-ImageData* ImageData::Build(const char* name, int page_number, const char* lang, - const char* imagedata, int imagedatasize, - const char* truth_text, const char* box_text) { - ImageData* image_data = new ImageData(); - image_data->imagefilename_ = name; - image_data->page_number_ = page_number; - image_data->language_ = lang; - // Save the imagedata. - image_data->image_data_.init_to_size(imagedatasize, 0); - memcpy(&image_data->image_data_[0], imagedata, imagedatasize); - if (!image_data->AddBoxes(box_text)) { - if (truth_text == NULL || truth_text[0] == '\0') { - tprintf("Error: No text corresponding to page %d from image %s!\n", - page_number, name); - delete image_data; - return NULL; - } - image_data->transcription_ = truth_text; - // If we have no boxes, the transcription is in the 0th box_texts_. - image_data->box_texts_.push_back(truth_text); - // We will create a box for the whole image on PreScale, to save unpacking - // the image now. - } else if (truth_text != NULL && truth_text[0] != '\0' && - image_data->transcription_ != truth_text) { - // Save the truth text as it is present and disagrees with the box text. - image_data->transcription_ = truth_text; - } - return image_data; -} - -// Writes to the given file. Returns false in case of error. -bool ImageData::Serialize(TFile* fp) const { - if (!imagefilename_.Serialize(fp)) return false; - if (fp->FWrite(&page_number_, sizeof(page_number_), 1) != 1) return false; - if (!image_data_.Serialize(fp)) return false; - if (!transcription_.Serialize(fp)) return false; - // WARNING: Will not work across different endian machines. - if (!boxes_.Serialize(fp)) return false; - if (!box_texts_.SerializeClasses(fp)) return false; - inT8 vertical = vertical_text_; - if (fp->FWrite(&vertical, sizeof(vertical), 1) != 1) return false; - return true; -} - -// Reads from the given file. Returns false in case of error. -// If swap is true, assumes a big/little-endian swap is needed. 
-bool ImageData::DeSerialize(bool swap, TFile* fp) { - if (!imagefilename_.DeSerialize(swap, fp)) return false; - if (fp->FRead(&page_number_, sizeof(page_number_), 1) != 1) return false; - if (swap) ReverseN(&page_number_, sizeof(page_number_)); - if (!image_data_.DeSerialize(swap, fp)) return false; - if (!transcription_.DeSerialize(swap, fp)) return false; - // WARNING: Will not work across different endian machines. - if (!boxes_.DeSerialize(swap, fp)) return false; - if (!box_texts_.DeSerializeClasses(swap, fp)) return false; - inT8 vertical = 0; - if (fp->FRead(&vertical, sizeof(vertical), 1) != 1) return false; - vertical_text_ = vertical != 0; - return true; -} - -// As DeSerialize, but only seeks past the data - hence a static method. -bool ImageData::SkipDeSerialize(bool swap, TFile* fp) { - if (!STRING::SkipDeSerialize(swap, fp)) return false; - inT32 page_number; - if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false; - if (!GenericVector::SkipDeSerialize(swap, fp)) return false; - if (!STRING::SkipDeSerialize(swap, fp)) return false; - if (!GenericVector::SkipDeSerialize(swap, fp)) return false; - if (!GenericVector::SkipDeSerializeClasses(swap, fp)) return false; - inT8 vertical = 0; - return fp->FRead(&vertical, sizeof(vertical), 1) == 1; -} - -// Saves the given Pix as a PNG-encoded string and destroys it. -void ImageData::SetPix(Pix* pix) { - SetPixInternal(pix, &image_data_); -} - -// Returns the Pix image for *this. Must be pixDestroyed after use. -Pix* ImageData::GetPix() const { - return GetPixInternal(image_data_); -} - -// Gets anything and everything with a non-NULL pointer, prescaled to a -// given target_height (if 0, then the original image height), and aligned. -// Also returns (if not NULL) the width and height of the scaled image. 
-// The return value is the scaled Pix, which must be pixDestroyed after use, -// and scale_factor (if not NULL) is set to the scale factor that was applied -// to the image to achieve the target_height. -Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor, - int* scaled_width, int* scaled_height, - GenericVector* boxes) const { - int input_width = 0; - int input_height = 0; - Pix* src_pix = GetPix(); - ASSERT_HOST(src_pix != NULL); - input_width = pixGetWidth(src_pix); - input_height = pixGetHeight(src_pix); - if (target_height == 0) { - target_height = MIN(input_height, max_height); - } - float im_factor = static_cast(target_height) / input_height; - if (scaled_width != NULL) - *scaled_width = IntCastRounded(im_factor * input_width); - if (scaled_height != NULL) - *scaled_height = target_height; - // Get the scaled image. - Pix* pix = pixScale(src_pix, im_factor, im_factor); - if (pix == NULL) { - tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", - input_width, input_height, im_factor); - } - if (scaled_width != NULL) *scaled_width = pixGetWidth(pix); - if (scaled_height != NULL) *scaled_height = pixGetHeight(pix); - pixDestroy(&src_pix); - if (boxes != NULL) { - // Get the boxes. - boxes->truncate(0); - for (int b = 0; b < boxes_.size(); ++b) { - TBOX box = boxes_[b]; - box.scale(im_factor); - boxes->push_back(box); - } - if (boxes->empty()) { - // Make a single box for the whole image. - TBOX box(0, 0, im_factor * input_width, target_height); - boxes->push_back(box); - } - } - if (scale_factor != NULL) *scale_factor = im_factor; - return pix; -} - -int ImageData::MemoryUsed() const { - return image_data_.size(); -} - -// Draws the data in a new window. -void ImageData::Display() const { -#ifndef GRAPHICS_DISABLED - const int kTextSize = 64; - // Draw the image. 
- Pix* pix = GetPix(); - if (pix == NULL) return; - int width = pixGetWidth(pix); - int height = pixGetHeight(pix); - ScrollView* win = new ScrollView("Imagedata", 100, 100, - 2 * (width + 2 * kTextSize), - 2 * (height + 4 * kTextSize), - width + 10, height + 3 * kTextSize, true); - win->Image(pix, 0, height - 1); - pixDestroy(&pix); - // Draw the boxes. - win->Pen(ScrollView::RED); - win->Brush(ScrollView::NONE); - int text_size = kTextSize; - if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) - text_size = boxes_[0].height() * 2; - win->TextAttributes("Arial", text_size, false, false, false); - if (!boxes_.empty()) { - for (int b = 0; b < boxes_.size(); ++b) { - boxes_[b].plot(win); - win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string()); - } - } else { - // The full transcription. - win->Pen(ScrollView::CYAN); - win->Text(0, height + kTextSize * 2, transcription_.string()); - } - win->Update(); - window_wait(win); -#endif -} - -// Adds the supplied boxes and transcriptions that correspond to the correct -// page number. -void ImageData::AddBoxes(const GenericVector& boxes, - const GenericVector& texts, - const GenericVector& box_pages) { - // Copy the boxes and make the transcription. - for (int i = 0; i < box_pages.size(); ++i) { - if (page_number_ >= 0 && box_pages[i] != page_number_) continue; - transcription_ += texts[i]; - boxes_.push_back(boxes[i]); - box_texts_.push_back(texts[i]); - } -} - -// Saves the given Pix as a PNG-encoded string and destroys it. -void ImageData::SetPixInternal(Pix* pix, GenericVector* image_data) { - l_uint8* data; - size_t size; - pixWriteMem(&data, &size, pix, IFF_PNG); - pixDestroy(&pix); - image_data->init_to_size(size, 0); - memcpy(&(*image_data)[0], data, size); - free(data); -} - -// Returns the Pix image for the image_data. Must be pixDestroyed after use. 
-Pix* ImageData::GetPixInternal(const GenericVector& image_data) { - Pix* pix = NULL; - if (!image_data.empty()) { - // Convert the array to an image. - const unsigned char* u_data = - reinterpret_cast(&image_data[0]); - pix = pixReadMem(u_data, image_data.size()); - } - return pix; -} - -// Parses the text string as a box file and adds any discovered boxes that -// match the page number. Returns false on error. -bool ImageData::AddBoxes(const char* box_text) { - if (box_text != NULL && box_text[0] != '\0') { - GenericVector boxes; - GenericVector texts; - GenericVector box_pages; - if (ReadMemBoxes(page_number_, false, box_text, &boxes, - &texts, NULL, &box_pages)) { - AddBoxes(boxes, texts, box_pages); - return true; - } else { - tprintf("Error: No boxes for page %d from image %s!\n", - page_number_, imagefilename_.string()); - } - } - return false; -} - -// Thread function to call ReCachePages. -void* ReCachePagesFunc(void* data) { - DocumentData* document_data = reinterpret_cast(data); - document_data->ReCachePages(); - return NULL; -} - -DocumentData::DocumentData(const STRING& name) - : document_name_(name), - pages_offset_(-1), - total_pages_(-1), - memory_used_(0), - max_memory_(0), - reader_(NULL) {} - -DocumentData::~DocumentData() { - SVAutoLock lock_p(&pages_mutex_); - SVAutoLock lock_g(&general_mutex_); -} - -// Reads all the pages in the given lstmf filename to the cache. The reader -// is used to read the file. -bool DocumentData::LoadDocument(const char* filename, const char* lang, - int start_page, inT64 max_memory, - FileReader reader) { - SetDocument(filename, lang, max_memory, reader); - pages_offset_ = start_page; - return ReCachePages(); -} - -// Sets up the document, without actually loading it. 
-void DocumentData::SetDocument(const char* filename, const char* lang, - inT64 max_memory, FileReader reader) { - SVAutoLock lock_p(&pages_mutex_); - SVAutoLock lock(&general_mutex_); - document_name_ = filename; - lang_ = lang; - pages_offset_ = -1; - max_memory_ = max_memory; - reader_ = reader; -} - -// Writes all the pages to the given filename. Returns false on error. -bool DocumentData::SaveDocument(const char* filename, FileWriter writer) { - SVAutoLock lock(&pages_mutex_); - TFile fp; - fp.OpenWrite(NULL); - if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) { - tprintf("Serialize failed: %s\n", filename); - return false; - } - return true; -} -bool DocumentData::SaveToBuffer(GenericVector* buffer) { - SVAutoLock lock(&pages_mutex_); - TFile fp; - fp.OpenWrite(buffer); - return pages_.Serialize(&fp); -} - -// Adds the given page data to this document, counting up memory. -void DocumentData::AddPageToDocument(ImageData* page) { - SVAutoLock lock(&pages_mutex_); - pages_.push_back(page); - set_memory_used(memory_used() + page->MemoryUsed()); -} - -// If the given index is not currently loaded, loads it using a separate -// thread. -void DocumentData::LoadPageInBackground(int index) { - ImageData* page = NULL; - if (IsPageAvailable(index, &page)) return; - SVAutoLock lock(&pages_mutex_); - if (pages_offset_ == index) return; - pages_offset_ = index; - pages_.clear(); - SVSync::StartThread(ReCachePagesFunc, this); -} - -// Returns a pointer to the page with the given index, modulo the total -// number of pages. Blocks until the background load is completed. -const ImageData* DocumentData::GetPage(int index) { - ImageData* page = NULL; - while (!IsPageAvailable(index, &page)) { - // If there is no background load scheduled, schedule one now. 
- pages_mutex_.Lock(); - bool needs_loading = pages_offset_ != index; - pages_mutex_.Unlock(); - if (needs_loading) LoadPageInBackground(index); - // We can't directly load the page, or the background load will delete it - // while the caller is using it, so give it a chance to work. -/* willus mod: start */ - sleep(1); -/* -#if __cplusplus > 199711L - std::this_thread::sleep_for(std::chrono::seconds(1)); -#elif _WIN32 - Sleep(1000); -#else - sleep(1); -#endif -*/ -/* willus mod: end */ - } - return page; -} - -// Returns true if the requested page is available, and provides a pointer, -// which may be NULL if the document is empty. May block, even though it -// doesn't guarantee to return true. -bool DocumentData::IsPageAvailable(int index, ImageData** page) { - SVAutoLock lock(&pages_mutex_); - int num_pages = NumPages(); - if (num_pages == 0 || index < 0) { - *page = NULL; // Empty Document. - return true; - } - if (num_pages > 0) { - index = Modulo(index, num_pages); - if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) { - *page = pages_[index - pages_offset_]; // Page is available already. - return true; - } - } - return false; -} - -// Removes all pages from memory and frees the memory, but does not forget -// the document metadata. -inT64 DocumentData::UnCache() { - SVAutoLock lock(&pages_mutex_); - inT64 memory_saved = memory_used(); - pages_.clear(); - pages_offset_ = -1; - set_total_pages(-1); - set_memory_used(0); - tprintf("Unloaded document %s, saving %d memory\n", document_name_.string(), - memory_saved); - return memory_saved; -} - -// Locks the pages_mutex_ and Loads as many pages can fit in max_memory_ -// starting at index pages_offset_. -bool DocumentData::ReCachePages() { - SVAutoLock lock(&pages_mutex_); - // Read the file. 
- set_total_pages(0); - set_memory_used(0); - int loaded_pages = 0; - pages_.truncate(0); - TFile fp; - if (!fp.Open(document_name_, reader_) || - !PointerVector::DeSerializeSize(false, &fp, &loaded_pages) || - loaded_pages <= 0) { - tprintf("Deserialize header failed: %s\n", document_name_.string()); - return false; - } - pages_offset_ %= loaded_pages; - // Skip pages before the first one we want, and load the rest until max - // memory and skip the rest after that. - int page; - for (page = 0; page < loaded_pages; ++page) { - if (page < pages_offset_ || - (max_memory_ > 0 && memory_used() > max_memory_)) { - if (!PointerVector::DeSerializeSkip(false, &fp)) break; - } else { - if (!pages_.DeSerializeElement(false, &fp)) break; - ImageData* image_data = pages_.back(); - if (image_data->imagefilename().length() == 0) { - image_data->set_imagefilename(document_name_); - image_data->set_page_number(page); - } - image_data->set_language(lang_); - set_memory_used(memory_used() + image_data->MemoryUsed()); - } - } - if (page < loaded_pages) { - tprintf("Deserialize failed: %s read %d/%d pages\n", - document_name_.string(), page, loaded_pages); - pages_.truncate(0); - } else { - tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(), - loaded_pages, pages_offset_, pages_offset_ + pages_.size(), - document_name_.string()); - } - set_total_pages(loaded_pages); - return !pages_.empty(); -} - -// A collection of DocumentData that knows roughly how much memory it is using. -DocumentCache::DocumentCache(inT64 max_memory) - : num_pages_per_doc_(0), max_memory_(max_memory) {} -DocumentCache::~DocumentCache() {} - -// Adds all the documents in the list of filenames, counting memory. -// The reader is used to read the files. 
-bool DocumentCache::LoadDocuments(const GenericVector& filenames, - const char* lang, - CachingStrategy cache_strategy, - FileReader reader) { - cache_strategy_ = cache_strategy; - inT64 fair_share_memory = 0; - // In the round-robin case, each DocumentData handles restricting its content - // to its fair share of memory. In the sequential case, DocumentCache - // determines which DocumentDatas are held entirely in memory. - if (cache_strategy_ == CS_ROUND_ROBIN) - fair_share_memory = max_memory_ / filenames.size(); - for (int arg = 0; arg < filenames.size(); ++arg) { - STRING filename = filenames[arg]; - DocumentData* document = new DocumentData(filename); - document->SetDocument(filename.string(), lang, fair_share_memory, reader); - AddToCache(document); - } - if (!documents_.empty()) { - // Try to get the first page now to verify the list of filenames. - if (GetPageBySerial(0) != NULL) return true; - tprintf("Load of page 0 failed!\n"); - } - return false; -} - -// Adds document to the cache. -bool DocumentCache::AddToCache(DocumentData* data) { - inT64 new_memory = data->memory_used(); - documents_.push_back(data); - return true; -} - -// Finds and returns a document by name. -DocumentData* DocumentCache::FindDocument(const STRING& document_name) const { - for (int i = 0; i < documents_.size(); ++i) { - if (documents_[i]->document_name() == document_name) - return documents_[i]; - } - return NULL; -} - -// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache -// strategy, could take a long time. -int DocumentCache::TotalPages() { - if (cache_strategy_ == CS_SEQUENTIAL) { - // In sequential mode, we assume each doc has the same number of pages - // whether it is true or not. - if (num_pages_per_doc_ == 0) GetPageSequential(0); - return num_pages_per_doc_ * documents_.size(); - } - int total_pages = 0; - int num_docs = documents_.size(); - for (int d = 0; d < num_docs; ++d) { - // We have to load a page to make NumPages() valid. 
- documents_[d]->GetPage(0); - total_pages += documents_[d]->NumPages(); - } - return total_pages; -} - -// Returns a page by serial number, selecting them in a round-robin fashion -// from all the documents. Highly disk-intensive, but doesn't need samples -// to be shuffled between files to begin with. -const ImageData* DocumentCache::GetPageRoundRobin(int serial) { - int num_docs = documents_.size(); - int doc_index = serial % num_docs; - const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs); - for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) { - doc_index = (serial + offset) % num_docs; - int page = (serial + offset) / num_docs; - documents_[doc_index]->LoadPageInBackground(page); - } - return doc; -} - -// Returns a page by serial number, selecting them in sequence from each file. -// Requires the samples to be shuffled between the files to give a random or -// uniform distribution of data. Less disk-intensive than GetPageRoundRobin. -const ImageData* DocumentCache::GetPageSequential(int serial) { - int num_docs = documents_.size(); - ASSERT_HOST(num_docs > 0); - if (num_pages_per_doc_ == 0) { - // Use the pages in the first doc as the number of pages in each doc. - documents_[0]->GetPage(0); - num_pages_per_doc_ = documents_[0]->NumPages(); - if (num_pages_per_doc_ == 0) { - tprintf("First document cannot be empty!!\n"); - ASSERT_HOST(num_pages_per_doc_ > 0); - } - // Get rid of zero now if we don't need it. - if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache(); - } - int doc_index = serial / num_pages_per_doc_ % num_docs; - const ImageData* doc = - documents_[doc_index]->GetPage(serial % num_pages_per_doc_); - // Count up total memory. Background loading makes it more complicated to - // keep a running count. - inT64 total_memory = 0; - for (int d = 0; d < num_docs; ++d) { - total_memory += documents_[d]->memory_used(); - } - if (total_memory >= max_memory_) { - // Find something to un-cache. 
- // If there are more than 3 in front, then serial is from the back reader - // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then - // we create a hole between them and then un-caching the backmost occupied - // will work for both. - int num_in_front = CountNeighbourDocs(doc_index, 1); - for (int offset = num_in_front - 2; - offset > 1 && total_memory >= max_memory_; --offset) { - int next_index = (doc_index + offset) % num_docs; - total_memory -= documents_[next_index]->UnCache(); - } - // If that didn't work, the best solution is to un-cache from the back. If - // we take away the document that a 2nd reader is using, it will put it - // back and make a hole between. - int num_behind = CountNeighbourDocs(doc_index, -1); - for (int offset = num_behind; offset < 0 && total_memory >= max_memory_; - ++offset) { - int next_index = (doc_index + offset + num_docs) % num_docs; - total_memory -= documents_[next_index]->UnCache(); - } - } - int next_index = (doc_index + 1) % num_docs; - if (!documents_[next_index]->IsCached() && total_memory < max_memory_) { - documents_[next_index]->LoadPageInBackground(0); - } - return doc; -} - -// Helper counts the number of adjacent cached neighbours of index looking in -// direction dir, ie index+dir, index+2*dir etc. -int DocumentCache::CountNeighbourDocs(int index, int dir) { - int num_docs = documents_.size(); - for (int offset = dir; abs(offset) < num_docs; offset += dir) { - int offset_index = (index + offset + num_docs) % num_docs; - if (!documents_[offset_index]->IsCached()) return offset - dir; - } - return num_docs; -} - -} // namespace tesseract. 
diff -Nru k2pdfopt-2.42+ds/tesseract_mod/input.cpp k2pdfopt-2.51+ds/tesseract_mod/input.cpp --- k2pdfopt-2.42+ds/tesseract_mod/input.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/input.cpp 2018-12-22 23:34:23.000000000 +0000 @@ -0,0 +1,150 @@ +/////////////////////////////////////////////////////////////////////// +// File: input.cpp +// Description: Input layer class for neural network implementations. +// Author: Ray Smith +// Created: Thu Mar 13 09:10:34 PDT 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "input.h" + +#include "allheaders.h" +#include "imagedata.h" +#include "pageres.h" +#include "scrollview.h" + +namespace tesseract { + +// Max height for variable height inputs before scaling anyway. +const int kMaxInputHeight = 48; + +Input::Input(const STRING& name, int ni, int no) + : Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {} +Input::Input(const STRING& name, const StaticShape& shape) + : Network(NT_INPUT, name, shape.height(), shape.depth()), + shape_(shape), + cached_x_scale_(1) { + if (shape.height() == 1) ni_ = shape.depth(); +} + +// Writes to the given file. Returns false in case of error. +bool Input::Serialize(TFile* fp) const { + return Network::Serialize(fp) && shape_.Serialize(fp); +} + +// Reads from the given file. Returns false in case of error. 
+bool Input::DeSerialize(TFile* fp) { + return shape_.DeSerialize(fp); +} + +// Returns an integer reduction factor that the network applies to the +// time sequence. Assumes that any 2-d is already eliminated. Used for +// scaling bounding boxes of truth data. +int Input::XScaleFactor() const { + return 1; +} + +// Provides the (minimum) x scale factor to the network (of interest only to +// input units) so they can determine how to scale bounding boxes. +void Input::CacheXScaleFactor(int factor) { + cached_x_scale_ = factor; +} + +// Runs forward propagation of activations on the input line. +// See Network for a detailed discussion of the arguments. +void Input::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + *output = input; +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Input::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + tprintf("Input::Backward should not be called!!\n"); + return false; +} + +// Creates and returns a Pix of appropriate size for the network from the +// image_data. If non-null, *image_scale returns the image scale factor used. +// Returns nullptr on error. +/* static */ +Pix* Input::PrepareLSTMInputs(const ImageData& image_data, + const Network* network, int min_width, + TRand* randomizer, float* image_scale) { + // Note that NumInputs() is defined as input image height. + int target_height = network->NumInputs(); + int width, height; + Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale, + &width, &height, nullptr); + if (pix == nullptr) { + tprintf("Bad pix from ImageData!\n"); + return nullptr; + } + if (width <= min_width || height < min_width) { + /* willus mod -- no warning */ + /* + tprintf("Image too small to scale!! 
(%dx%d vs min width of %d)\n", width, + height, min_width); + */ + pixDestroy(&pix); + return nullptr; + } + return pix; +} + +// Converts the given pix to a NetworkIO of height and depth appropriate to the +// given StaticShape: +// If depth == 3, convert to 24 bit color, otherwise normalized grey. +// Scale to target height, if the shape's height is > 1, or its depth if the +// height == 1. If height == 0 then no scaling. +// NOTE: It isn't safe for multiple threads to call this on the same pix. +/* static */ +void Input::PreparePixInput(const StaticShape& shape, const Pix* pix, + TRand* randomizer, NetworkIO* input) { + bool color = shape.depth() == 3; + Pix* var_pix = const_cast(pix); + int depth = pixGetDepth(var_pix); + Pix* normed_pix = nullptr; + // On input to BaseAPI, an image is forced to be 1, 8 or 24 bit, without + // colormap, so we just have to deal with depth conversion here. + if (color) { + // Force RGB. + if (depth == 32) + normed_pix = pixClone(var_pix); + else + normed_pix = pixConvertTo32(var_pix); + } else { + // Convert non-8-bit images to 8 bit. + if (depth == 8) + normed_pix = pixClone(var_pix); + else + normed_pix = pixConvertTo8(var_pix, false); + } + int height = pixGetHeight(normed_pix); + int target_height = shape.height(); + if (target_height == 1) target_height = shape.depth(); + if (target_height != 0 && target_height != height) { + // Get the scaled image. + float im_factor = static_cast(target_height) / height; + Pix* scaled_pix = pixScale(normed_pix, im_factor, im_factor); + pixDestroy(&normed_pix); + normed_pix = scaled_pix; + } + input->FromPix(shape, normed_pix, randomizer); + pixDestroy(&normed_pix); +} + +} // namespace tesseract. 
diff -Nru k2pdfopt-2.42+ds/tesseract_mod/lstmrecognizer.cpp k2pdfopt-2.51+ds/tesseract_mod/lstmrecognizer.cpp --- k2pdfopt-2.42+ds/tesseract_mod/lstmrecognizer.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/lstmrecognizer.cpp 2018-12-22 23:35:11.000000000 +0000 @@ -0,0 +1,520 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmrecognizer.cpp +// Description: Top-level line recognizer class for LSTM-based networks. +// Author: Ray Smith +// Created: Thu May 02 10:59:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "lstmrecognizer.h" + +#include "allheaders.h" +#include "callcpp.h" +#include "dict.h" +#include "genericheap.h" +#include "helpers.h" +#include "imagedata.h" +#include "input.h" +#include "lstm.h" +#include "normalis.h" +#include "pageres.h" +#include "ratngs.h" +#include "recodebeam.h" +#include "scrollview.h" +#include "statistc.h" +#include "tprintf.h" + +namespace tesseract { + +// Default ratio between dict and non-dict words. +const double kDictRatio = 2.25; +// Default certainty offset to give the dictionary a chance. 
+const double kCertOffset = -0.085; + +LSTMRecognizer::LSTMRecognizer() + : network_(nullptr), + training_flags_(0), + training_iteration_(0), + sample_iteration_(0), + null_char_(UNICHAR_BROKEN), + learning_rate_(0.0f), + momentum_(0.0f), + adam_beta_(0.0f), + dict_(nullptr), + search_(nullptr), + debug_win_(nullptr) {} + +LSTMRecognizer::~LSTMRecognizer() { + delete network_; + delete dict_; + delete search_; +} + +// Loads a model from mgr, including the dictionary only if lang is not null. +bool LSTMRecognizer::Load(const char* lang, TessdataManager* mgr) { + TFile fp; + if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false; + if (!DeSerialize(mgr, &fp)) return false; + if (lang == nullptr) return true; + // Allow it to run without a dictionary. + LoadDictionary(lang, mgr); + return true; +} + +// Writes to the given file. Returns false in case of error. +bool LSTMRecognizer::Serialize(const TessdataManager* mgr, TFile* fp) const { + bool include_charsets = mgr == nullptr || + !mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) || + !mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET); + if (!network_->Serialize(fp)) return false; + if (include_charsets && !GetUnicharset().save_to_file(fp)) return false; + if (!network_str_.Serialize(fp)) return false; + if (!fp->Serialize(&training_flags_)) return false; + if (!fp->Serialize(&training_iteration_)) return false; + if (!fp->Serialize(&sample_iteration_)) return false; + if (!fp->Serialize(&null_char_)) return false; + if (!fp->Serialize(&adam_beta_)) return false; + if (!fp->Serialize(&learning_rate_)) return false; + if (!fp->Serialize(&momentum_)) return false; + if (include_charsets && IsRecoding() && !recoder_.Serialize(fp)) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. 
+bool LSTMRecognizer::DeSerialize(const TessdataManager* mgr, TFile* fp) { + delete network_; + network_ = Network::CreateFromFile(fp); + if (network_ == nullptr) return false; + bool include_charsets = mgr == nullptr || + !mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) || + !mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET); + if (include_charsets && !ccutil_.unicharset.load_from_file(fp, false)) + return false; + if (!network_str_.DeSerialize(fp)) return false; + if (!fp->DeSerialize(&training_flags_)) return false; + if (!fp->DeSerialize(&training_iteration_)) return false; + if (!fp->DeSerialize(&sample_iteration_)) return false; + if (!fp->DeSerialize(&null_char_)) return false; + if (!fp->DeSerialize(&adam_beta_)) return false; + if (!fp->DeSerialize(&learning_rate_)) return false; + if (!fp->DeSerialize(&momentum_)) return false; + if (include_charsets && !LoadRecoder(fp)) return false; + if (!include_charsets && !LoadCharsets(mgr)) return false; + network_->SetRandomizer(&randomizer_); + network_->CacheXScaleFactor(network_->XScaleFactor()); + return true; +} + +// Loads the charsets from mgr. +bool LSTMRecognizer::LoadCharsets(const TessdataManager* mgr) { + TFile fp; + if (!mgr->GetComponent(TESSDATA_LSTM_UNICHARSET, &fp)) return false; + if (!ccutil_.unicharset.load_from_file(&fp, false)) return false; + if (!mgr->GetComponent(TESSDATA_LSTM_RECODER, &fp)) return false; + if (!LoadRecoder(&fp)) return false; + return true; +} + +// Loads the Recoder. +bool LSTMRecognizer::LoadRecoder(TFile* fp) { + if (IsRecoding()) { + if (!recoder_.DeSerialize(fp)) return false; + RecodedCharID code; + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + if (code(0) != UNICHAR_SPACE) { + tprintf("Space was garbled in recoding!!\n"); + return false; + } + } else { + recoder_.SetupPassThrough(GetUnicharset()); + training_flags_ |= TF_COMPRESS_UNICHARSET; + } + return true; +} + +// Loads the dictionary if possible from the traineddata file. 
+// Prints a warning message, and returns false but otherwise fails silently +// and continues to work without it if loading fails. +// Note that dictionary load is independent from DeSerialize, but dependent +// on the unicharset matching. This enables training to deserialize a model +// from checkpoint or restore without having to go back and reload the +// dictionary. +bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) { + delete dict_; + dict_ = new Dict(&ccutil_); + dict_->SetupForLoad(Dict::GlobalDawgCache()); + dict_->LoadLSTM(lang, mgr); + if (dict_->FinishLoad()) return true; // Success. + tprintf("Failed to load any lstm-specific dictionaries for lang %s!!\n", + lang); + delete dict_; + dict_ = nullptr; + return false; +} + +// Recognizes the line image, contained within image_data, returning the +// ratings matrix and matching box_word for each WERD_RES in the output. +void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, + bool debug, double worst_dict_cert, + const TBOX& line_box, + PointerVector* words, + int lstm_choice_mode) { + NetworkIO outputs; + float scale_factor; + NetworkIO inputs; + if (!RecognizeLine(image_data, invert, debug, false, false, &scale_factor, + &inputs, &outputs)) + return; + if (search_ == nullptr) { + search_ = + new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); + } + search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, + &GetUnicharset(), lstm_choice_mode); + search_->ExtractBestPathAsWords(line_box, scale_factor, debug, + &GetUnicharset(), words, lstm_choice_mode); +} + +// Helper computes min and mean best results in the output. 
+void LSTMRecognizer::OutputStats(const NetworkIO& outputs, float* min_output, + float* mean_output, float* sd) { + const int kOutputScale = INT8_MAX; + STATS stats(0, kOutputScale + 1); + for (int t = 0; t < outputs.Width(); ++t) { + int best_label = outputs.BestLabel(t, nullptr); + if (best_label != null_char_) { + float best_output = outputs.f(t)[best_label]; + stats.add(static_cast(kOutputScale * best_output), 1); + } + } + // If the output is all nulls it could be that the photometric interpretation + // is wrong, so make it look bad, so the other way can win, even if not great. + if (stats.get_total() == 0) { + *min_output = 0.0f; + *mean_output = 0.0f; + *sd = 1.0f; + } else { + *min_output = static_cast(stats.min_bucket()) / kOutputScale; + *mean_output = stats.mean() / kOutputScale; + *sd = stats.sd() / kOutputScale; + } +} + +// Recognizes the image_data, returning the labels, +// scores, and corresponding pairs of start, end x-coords in coords. +bool LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, + bool debug, bool re_invert, bool upside_down, + float* scale_factor, NetworkIO* inputs, + NetworkIO* outputs) { + // Maximum width of image to train on. + const int kMaxImageWidth = 2560; + // This ensures consistent recognition results. + SetRandomSeed(); + int min_width = network_->XScaleFactor(); + Pix* pix = Input::PrepareLSTMInputs(image_data, network_, min_width, + &randomizer_, scale_factor); + if (pix == nullptr) { + /* willus mod -- no warning */ + /* + tprintf("Line cannot be recognized!!\n"); + */ + return false; + } + if (network_->IsTraining() && pixGetWidth(pix) > kMaxImageWidth) { + tprintf("Image too large to learn!! Size = %dx%d\n", pixGetWidth(pix), + pixGetHeight(pix)); + pixDestroy(&pix); + return false; + } + if (upside_down) pixRotate180(pix, pix); + // Reduction factor from image to coords. 
+ *scale_factor = min_width / *scale_factor; + inputs->set_int_mode(IsIntMode()); + SetRandomSeed(); + Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs); + network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs); + // Check for auto inversion. + float pos_min, pos_mean, pos_sd; + OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd); + if (invert && pos_min < 0.5) { + // Run again inverted and see if it is any better. + NetworkIO inv_inputs, inv_outputs; + inv_inputs.set_int_mode(IsIntMode()); + SetRandomSeed(); + pixInvert(pix, pix); + Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, + &inv_inputs); + network_->Forward(debug, inv_inputs, nullptr, &scratch_space_, &inv_outputs); + float inv_min, inv_mean, inv_sd; + OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd); + if (inv_min > pos_min && inv_mean > pos_mean && inv_sd < pos_sd) { + // Inverted did better. Use inverted data. + if (debug) { + tprintf("Inverting image: old min=%g, mean=%g, sd=%g, inv %g,%g,%g\n", + pos_min, pos_mean, pos_sd, inv_min, inv_mean, inv_sd); + } + *outputs = inv_outputs; + *inputs = inv_inputs; + } else if (re_invert) { + // Inverting was not an improvement, so undo and run again, so the + // outputs match the best forward result. + SetRandomSeed(); + network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs); + } + } + pixDestroy(&pix); + if (debug) { + GenericVector labels, coords; + LabelsFromOutputs(*outputs, &labels, &coords); + DisplayForward(*inputs, labels, coords, "LSTMForward", &debug_win_); + DebugActivationPath(*outputs, labels, coords); + } + return true; +} + +// Converts an array of labels to utf-8, whether or not the labels are +// augmented with character boundaries. 
+STRING LSTMRecognizer::DecodeLabels(const GenericVector& labels) { + STRING result; + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + if (labels[start] == null_char_) { + end = start + 1; + } else { + result += DecodeLabel(labels, start, &end, nullptr); + } + } + return result; +} + +// Displays the forward results in a window with the characters and +// boundaries as determined by the labels and label_coords. +void LSTMRecognizer::DisplayForward(const NetworkIO& inputs, + const GenericVector& labels, + const GenericVector& label_coords, + const char* window_name, + ScrollView** window) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + Pix* input_pix = inputs.ToPix(); + Network::ClearWindow(false, window_name, pixGetWidth(input_pix), + pixGetHeight(input_pix), window); + int line_height = Network::DisplayImage(input_pix, *window); + DisplayLSTMOutput(labels, label_coords, line_height, *window); +#endif // GRAPHICS_DISABLED +} + +// Displays the labels and cuts at the corresponding xcoords. +// Size of labels should match xcoords. 
+void LSTMRecognizer::DisplayLSTMOutput(const GenericVector& labels, + const GenericVector& xcoords, + int height, ScrollView* window) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + int x_scale = network_->XScaleFactor(); + window->TextAttributes("Arial", height / 4, false, false, false); + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + int xpos = xcoords[start] * x_scale; + if (labels[start] == null_char_) { + end = start + 1; + window->Pen(ScrollView::RED); + } else { + window->Pen(ScrollView::GREEN); + const char* str = DecodeLabel(labels, start, &end, nullptr); + if (*str == '\\') str = "\\\\"; + xpos = xcoords[(start + end) / 2] * x_scale; + window->Text(xpos, height, str); + } + window->Line(xpos, 0, xpos, height * 3 / 2); + } + window->Update(); +#endif // GRAPHICS_DISABLED +} + +// Prints debug output detailing the activation path that is implied by the +// label_coords. +void LSTMRecognizer::DebugActivationPath(const NetworkIO& outputs, + const GenericVector& labels, + const GenericVector& xcoords) { + if (xcoords[0] > 0) + DebugActivationRange(outputs, "", null_char_, 0, xcoords[0]); + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + if (labels[start] == null_char_) { + end = start + 1; + DebugActivationRange(outputs, "", null_char_, xcoords[start], + xcoords[end]); + continue; + } else { + int decoded; + const char* label = DecodeLabel(labels, start, &end, &decoded); + DebugActivationRange(outputs, label, labels[start], xcoords[start], + xcoords[start + 1]); + for (int i = start + 1; i < end; ++i) { + DebugActivationRange(outputs, DecodeSingleLabel(labels[i]), labels[i], + xcoords[i], xcoords[i + 1]); + } + } + } +} + +// Prints debug output detailing activations and 2nd choice over a range +// of positions. 
+void LSTMRecognizer::DebugActivationRange(const NetworkIO& outputs, + const char* label, int best_choice, + int x_start, int x_end) { + tprintf("%s=%d On [%d, %d), scores=", label, best_choice, x_start, x_end); + double max_score = 0.0; + double mean_score = 0.0; + const int width = x_end - x_start; + for (int x = x_start; x < x_end; ++x) { + const float* line = outputs.f(x); + const double score = line[best_choice] * 100.0; + if (score > max_score) max_score = score; + mean_score += score / width; + int best_c = 0; + double best_score = 0.0; + for (int c = 0; c < outputs.NumFeatures(); ++c) { + if (c != best_choice && line[c] > best_score) { + best_c = c; + best_score = line[c]; + } + } + tprintf(" %.3g(%s=%d=%.3g)", score, DecodeSingleLabel(best_c), best_c, + best_score * 100.0); + } + tprintf(", Mean=%g, max=%g\n", mean_score, max_score); +} + +// Helper returns true if the null_char is the winner at t, and it beats the +// null_threshold, or the next choice is space, in which case we will use the +// null anyway. +#if 0 // TODO: unused, remove if still unused after 2020. +static bool NullIsBest(const NetworkIO& output, float null_thr, + int null_char, int t) { + if (output.f(t)[null_char] >= null_thr) return true; + if (output.BestLabel(t, null_char, null_char, nullptr) != UNICHAR_SPACE) + return false; + return output.f(t)[null_char] > output.f(t)[UNICHAR_SPACE]; +} +#endif + +// Converts the network output to a sequence of labels. Outputs labels, scores +// and start xcoords of each char, and each null_char_, with an additional +// final xcoord for the end of the output. +// The conversion method is determined by internal state. 
+void LSTMRecognizer::LabelsFromOutputs(const NetworkIO& outputs, + GenericVector* labels, + GenericVector* xcoords) { + if (SimpleTextOutput()) { + LabelsViaSimpleText(outputs, labels, xcoords); + } else { + LabelsViaReEncode(outputs, labels, xcoords); + } +} + +// As LabelsViaCTC except that this function constructs the best path that +// contains only legal sequences of subcodes for CJK. +void LSTMRecognizer::LabelsViaReEncode(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords) { + if (search_ == nullptr) { + search_ = + new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); + } + search_->Decode(output, 1.0, 0.0, RecodeBeamSearch::kMinCertainty, nullptr); + search_->ExtractBestPathAsLabels(labels, xcoords); +} + +// Converts the network output to a sequence of labels, with scores, using +// the simple character model (each position is a char, and the null_char_ is +// mainly intended for tail padding.) +void LSTMRecognizer::LabelsViaSimpleText(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords) { + labels->truncate(0); + xcoords->truncate(0); + const int width = output.Width(); + for (int t = 0; t < width; ++t) { + float score = 0.0f; + const int label = output.BestLabel(t, &score); + if (label != null_char_) { + labels->push_back(label); + xcoords->push_back(t); + } + } + xcoords->push_back(width); +} + +// Returns a string corresponding to the label starting at start. Sets *end +// to the next start and if non-null, *decoded to the unichar id. +const char* LSTMRecognizer::DecodeLabel(const GenericVector& labels, + int start, int* end, int* decoded) { + *end = start + 1; + if (IsRecoding()) { + // Decode labels via recoder_. 
+ RecodedCharID code; + if (labels[start] == null_char_) { + if (decoded != nullptr) { + code.Set(0, null_char_); + *decoded = recoder_.DecodeUnichar(code); + } + return ""; + } + int index = start; + while (index < labels.size() && + code.length() < RecodedCharID::kMaxCodeLen) { + code.Set(code.length(), labels[index++]); + while (index < labels.size() && labels[index] == null_char_) ++index; + int uni_id = recoder_.DecodeUnichar(code); + // If the next label isn't a valid first code, then we need to continue + // extending even if we have a valid uni_id from this prefix. + if (uni_id != INVALID_UNICHAR_ID && + (index == labels.size() || + code.length() == RecodedCharID::kMaxCodeLen || + recoder_.IsValidFirstCode(labels[index]))) { + *end = index; + if (decoded != nullptr) *decoded = uni_id; + if (uni_id == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(uni_id); + } + } + return ""; + } else { + if (decoded != nullptr) *decoded = labels[start]; + if (labels[start] == null_char_) return ""; + if (labels[start] == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(labels[start]); + } +} + +// Returns a string corresponding to a given single label id, falling back to +// a default of ".." for part of a multi-label unichar-id. +const char* LSTMRecognizer::DecodeSingleLabel(int label) { + if (label == null_char_) return ""; + if (IsRecoding()) { + // Decode label via recoder_. + RecodedCharID code; + code.Set(0, label); + label = recoder_.DecodeUnichar(code); + if (label == INVALID_UNICHAR_ID) return ".."; // Part of a bigger code. + } + if (label == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(label); +} + +} // namespace tesseract. 
diff -Nru k2pdfopt-2.42+ds/tesseract_mod/mainblk.cpp k2pdfopt-2.51+ds/tesseract_mod/mainblk.cpp --- k2pdfopt-2.42+ds/tesseract_mod/mainblk.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/mainblk.cpp 2018-12-24 06:28:50.000000000 +0000 @@ -0,0 +1,113 @@ +/********************************************************************** + * File: mainblk.cpp (Formerly main.c) + * Description: Function to call from main() to setup. + * Author: Ray Smith + * Created: Tue Oct 22 11:09:40 BST 1991 + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include +#if defined(_WIN32) +#include // for _access +#endif + +#include "fileerr.h" +#include "ccutil.h" + +const ERRCODE NO_PATH = +"Warning:explicit path for executable will not be used for configs"; +static const ERRCODE USAGE = "Usage"; + +namespace tesseract { +/********************************************************************** + * main_setup + * + * Main for mithras demo program. Read the arguments and set up globals. + **********************************************************************/ + +/** + * @brief CCUtil::main_setup - set location of tessdata and name of image + * + * @param argv0 - paths to the directory with language files and config files. 
+ * An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is + * used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. If previous + * is not successful - use current directory. + * @param basename - name of image + */ +void CCUtil::main_setup(const char *argv0, const char *basename) { + imagebasename = basename; /**< name of image */ + + char *tessdata_prefix = getenv("TESSDATA_PREFIX"); + + if (argv0 != nullptr && *argv0 != '\0') { + /* Use tessdata prefix from the command line. */ + datadir = argv0; + } else if (tessdata_prefix) { + /* Use tessdata prefix from the environment. */ + datadir = tessdata_prefix; +#if defined(_WIN32) + } else if (datadir == nullptr || _access(datadir.string(), 0) != 0) { + /* Look for tessdata in directory of executable. */ + /* + char drive[_MAX_DRIVE]; + char dir[_MAX_DIR]; + */ + char path[_MAX_PATH]; + int i; + /* DWORD length = */ GetModuleFileName(nullptr, path, sizeof(path)); + /* willus mod--avoid _splitpath_s -- not in XP */ + for (i=strlen(path)-1;i>=0 && path[i]!='/' && path[i]!='\\';i--); + if (i>=0) + { + path[i]='\0'; + datadir=path; + datadir += "/tessdata"; + } + /* + if (length > 0 && length < sizeof(path)) { + errno_t result = _splitpath_s(path, drive, sizeof(drive), + dir, sizeof(dir), nullptr, 0, nullptr, 0); + if (result == ERANGE) { + tprintf("Error: Path too long: %s\n", path); + } + + datadir = drive; + datadir += dir; + datadir += "/tessdata"; + } + */ +#endif /* _WIN32 */ +#if defined(TESSDATA_PREFIX) + } else { +/* Use tessdata prefix which was compiled in. 
*/ +#define _STR(a) #a +#define _XSTR(a) _STR(a) + datadir = _XSTR(TESSDATA_PREFIX) "/tessdata"; +#undef _XSTR +#undef _STR +#endif + } + + // datadir may still be empty: + if (datadir.length() == 0) { + datadir = "./"; + } + + // check for missing directory separator + const char *lastchar = datadir.string(); + lastchar += datadir.length() - 1; + if ((strcmp(lastchar, "/") != 0) && (strcmp(lastchar, "\\") != 0)) + datadir += "/"; +} +} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/openclwrapper.h k2pdfopt-2.51+ds/tesseract_mod/openclwrapper.h --- k2pdfopt-2.42+ds/tesseract_mod/openclwrapper.h 2017-02-25 04:35:07.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/openclwrapper.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,325 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include -#include "allheaders.h" -#ifdef USE_OPENCL -#include "pix.h" -#include "tiff.h" -#include "tiffio.h" -#endif -#include "tprintf.h" - -// including CL/cl.h doesn't occur until USE_OPENCL defined below - -// platform preprocessor commands -#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \ - defined(__CYGWIN__) || defined(__MINGW32__) -#define ON_WINDOWS 1 -#define ON_LINUX 0 -#define ON_APPLE 0 -#define ON_OTHER 0 -#define IF_WINDOWS(X) X -#define IF_LINUX(X) -#define IF_APPLE(X) -#define IF_OTHER(X) -#define NOT_WINDOWS(X) -#elif defined( __linux__ ) -#define ON_WINDOWS 0 -#define ON_LINUX 1 -#define ON_APPLE 0 -#define ON_OTHER 0 -#define IF_WINDOWS(X) -#define IF_LINUX(X) X -#define IF_APPLE(X) -#define IF_OTHER(X) -#define NOT_WINDOWS(X) X -#elif defined( __APPLE__ ) -#define ON_WINDOWS 0 -#define ON_LINUX 0 -#define ON_APPLE 1 -#define ON_OTHER 0 -#define IF_WINDOWS(X) -#define IF_LINUX(X) -#define IF_APPLE(X) X -#define IF_OTHER(X) -#define NOT_WINDOWS(X) X -#else -#define ON_WINDOWS 0 -#define ON_LINUX 0 -#define ON_APPLE 0 -#define ON_OTHER 1 -#define IF_WINDOWS(X) -#define IF_LINUX(X) -#define IF_APPLE(X) -#define IF_OTHER(X) X -#define NOT_WINDOWS(X) X -#endif - -#if ON_LINUX -#include -#endif - -/************************************************************************************ - * enable/disable reporting of performance - * PERF_REPORT_LEVEL - * 0 - no reporting - * 1 - no reporting - * 2 - report total function call time for functions we're tracking - * 3 - optionally report breakdown of function calls (kernel launch, kernel time, data copies) - ************************************************************************************/ -#define PERF_COUNT_VERBOSE 1 -#define PERF_COUNT_REPORT_STR "[%36s], %24s, %11.6f\n" - - -#if ON_WINDOWS - -#if PERF_COUNT_VERBOSE >= 2 -#define PERF_COUNT_START(FUNCT_NAME) \ - char *funct_name = FUNCT_NAME; \ - double elapsed_time_sec; \ - LARGE_INTEGER freq, time_funct_start, time_funct_end, 
time_sub_start, time_sub_end; \ - QueryPerformanceFrequency(&freq); \ - QueryPerformanceCounter(&time_funct_start); \ - time_sub_start = time_funct_start; \ - time_sub_end = time_funct_start; - -#define PERF_COUNT_END \ - QueryPerformanceCounter(&time_funct_end); \ - elapsed_time_sec = (time_funct_end.QuadPart - time_funct_start.QuadPart) / \ - (double)(freq.QuadPart); \ - printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); -#else -#define PERF_COUNT_START(FUNCT_NAME) -#define PERF_COUNT_END -#endif - -#if PERF_COUNT_VERBOSE >= 3 -#define PERF_COUNT_SUB(SUB) \ - QueryPerformanceCounter(&time_sub_end); \ - elapsed_time_sec = (time_sub_end.QuadPart - time_sub_start.QuadPart) / \ - (double)(freq.QuadPart); \ - printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ - time_sub_start = time_sub_end; -#else -#define PERF_COUNT_SUB(SUB) -#endif - - -// not on windows -#else - -#if PERF_COUNT_VERBOSE >= 2 -#define PERF_COUNT_START(FUNCT_NAME) \ - char *funct_name = FUNCT_NAME; \ - double elapsed_time_sec; \ - timespec time_funct_start, time_funct_end, time_sub_start, time_sub_end; \ - clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); \ - time_sub_start = time_funct_start; \ - time_sub_end = time_funct_start; - -#define PERF_COUNT_END \ - clock_gettime(CLOCK_MONOTONIC, &time_funct_end); \ - elapsed_time_sec = \ - (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + \ - (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; \ - printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); -#else -#define PERF_COUNT_START(FUNCT_NAME) -#define PERF_COUNT_END -#endif - -#if PERF_COUNT_VERBOSE >= 3 -#define PERF_COUNT_SUB(SUB) \ - clock_gettime(CLOCK_MONOTONIC, &time_sub_end); \ - elapsed_time_sec = \ - (time_sub_end.tv_sec - time_sub_start.tv_sec) * 1.0 + \ - (time_sub_end.tv_nsec - time_sub_start.tv_nsec) / 1000000000.0; \ - printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ - time_sub_start = 
time_sub_end; -#else -#define PERF_COUNT_SUB(SUB) -#endif - -#endif -/************************************************************************** - * enable/disable use of OpenCL - **************************************************************************/ - -#ifdef USE_OPENCL -#include "opencl_device_selection.h" - -#ifndef strcasecmp -#define strcasecmp strcmp -#endif - -#define MAX_KERNEL_STRING_LEN 64 -#define MAX_CLFILE_NUM 50 -#define MAX_CLKERNEL_NUM 200 -#define MAX_KERNEL_NAME_LEN 64 -#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E -#define GROUPSIZE_X 16 -#define GROUPSIZE_Y 16 -#define GROUPSIZE_HMORX 256 -#define GROUPSIZE_HMORY 1 - -typedef struct _KernelEnv -{ - cl_context mpkContext; - cl_command_queue mpkCmdQueue; - cl_program mpkProgram; - cl_kernel mpkKernel; - char mckKernelName[150]; -} KernelEnv; - -typedef struct _OpenCLEnv -{ - cl_platform_id mpOclPlatformID; - cl_context mpOclContext; - cl_device_id mpOclDevsID; - cl_command_queue mpOclCmdQueue; -} OpenCLEnv; -typedef int ( *cl_kernel_function )( void **userdata, KernelEnv *kenv ); - -#define CHECK_OPENCL(status,name) \ -if( status != CL_SUCCESS ) \ -{ \ - printf ("OpenCL error code is %d at when %s .\n", status, name); \ -} - - -typedef struct _GPUEnv -{ - //share vb in all modules in hb library - cl_platform_id mpPlatformID; - cl_device_type mDevType; - cl_context mpContext; - cl_device_id *mpArryDevsID; - cl_device_id mpDevID; - cl_command_queue mpCmdQueue; - cl_kernel mpArryKernels[MAX_CLFILE_NUM]; - cl_program mpArryPrograms[MAX_CLFILE_NUM]; //one program object maps one kernel source file - char mArryKnelSrcFile[MAX_CLFILE_NUM][256], //the max len of kernel file name is 256 - mArrykernelNames[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN + 1]; - cl_kernel_function mpArryKnelFuncs[MAX_CLKERNEL_NUM]; - int mnKernelCount, mnFileCount, // only one kernel file - mnIsUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper - int mnKhrFp64Flag; - int mnAmdFp64Flag; - -} GPUEnv; - - 
-class OpenclDevice -{ - -public: - static GPUEnv gpuEnv; - static int isInited; - OpenclDevice(); - ~OpenclDevice(); - static int InitEnv(); // load dll, call InitOpenclRunEnv(0) - static int InitOpenclRunEnv( int argc ); // RegistOpenclKernel, double flags, compile kernels - static int InitOpenclRunEnv_DeviceSelection( int argc ); // RegistOpenclKernel, double flags, compile kernels - static int RegistOpenclKernel(); - static int ReleaseOpenclRunEnv(); - static int ReleaseOpenclEnv( GPUEnv *gpuInfo ); - static int CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ); - static int CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ); - static int GeneratBinFromKernelSource( cl_program program, const char * clFileName ); - static int WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ); - static int BinaryGenerated( const char * clFileName, FILE ** fhandle ); - //static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); - static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line); - static Pix* pixReadTiffCl( const char *filename, l_int32 n ); - static PIX * pixReadStreamTiffCl ( FILE *fp, l_int32 n ); - static PIX * pixReadMemTiffCl(const l_uint8 *data, size_t size, l_int32 n); - static PIX* pixReadFromTiffStreamCl(TIFF *tif); - static int composeRGBPixelCl(int *tiffdata,int *line,int h,int w); - static l_int32 getTiffStreamResolutionCl(TIFF *tif,l_int32 *pxres,l_int32 *pyres); - static TIFF* fopenTiffCl(FILE *fp,const char *modestring); - -/* OpenCL implementations of Morphological operations*/ - - //Initialiation of OCL buffers used in Morph operations - static int initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs); - static void releaseMorphCLBuffers(); - - // OpenCL implementation of Morphology Dilate - static PIX* pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL 
implementation of Morphology Erode - static PIX* pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Close - static PIX* pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Open - static PIX* pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Open - static PIX* pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy); - - // OpenCL implementation of Morphology (Hollow = Closed - Open) - static PIX* pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, bool reqDataCopy); - - static void pixGetLinesCL(PIX *pixd, PIX *pixs, PIX **pix_vline, - PIX **pix_hline, PIX **pixClosed, - bool getpixClosed, l_int32 close_hsize, - l_int32 close_vsize, l_int32 open_hsize, - l_int32 open_vsize, l_int32 line_hsize, - l_int32 line_vsize); - - //int InitOpenclAttr( OpenCLEnv * env ); - //int ReleaseKernel( KernelEnv * env ); - static int SetKernelEnv( KernelEnv *envInfo ); - //int CreateKernel( char * kernelname, KernelEnv * env ); - //int RunKernel( const char *kernelName, void **userdata ); - //int ConvertToString( const char *filename, char **source ); - //int CheckKernelName( KernelEnv *envInfo, const char *kernelName ); - //int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ); - //int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); - //int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); - // static cl_device_id performDeviceSelection( ); - //static bool thresholdRectToPixMicroBench( TessScoreEvaluationInputData input, ds_device_type type); - - static int LoadOpencl(); -#ifdef WIN32 - //static int OpenclInite(); - static void FreeOpenclDll(); -#endif - - inline static 
int AddKernelConfig( int kCount, const char *kName ); - - /* for binarization */ - static int HistogramRectOCL(unsigned char *imagedata, int bytes_per_pixel, - int bytes_per_line, int left, int top, - int width, int height, int kHistogramSize, - int *histogramAllChannels); - - static int ThresholdRectToPixOCL(unsigned char *imagedata, - int bytes_per_pixel, int bytes_per_line, - int *thresholds, int *hi_values, Pix **pix, - int rect_height, int rect_width, - int rect_top, int rect_left); - - static Pix *pixConvertRGBToGrayOCL(Pix *pix, float weightRed = 0.3, - float weightGreen = 0.5, - float weightBlue = 0.2); - - static ds_device getDeviceSelection(); - static ds_device selectedDevice; - static bool deviceIsSelected; - static bool selectedDeviceIsOpenCL(); - static bool selectedDeviceIsNativeCPU(); - -}; - - -#endif diff -Nru k2pdfopt-2.42+ds/tesseract_mod/params.cpp k2pdfopt-2.51+ds/tesseract_mod/params.cpp --- k2pdfopt-2.42+ds/tesseract_mod/params.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/params.cpp 2018-11-21 22:16:47.000000000 +0000 @@ -0,0 +1,217 @@ +/********************************************************************** + * File: params.cpp + * Description: Initialization and setting of Tesseract parameters. + * Author: Ray Smith + * Created: Fri Feb 22 16:22:34 GMT 1991 + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ * + **********************************************************************/ + +#include +#include +#include + +#include "genericvector.h" +#include "tprintf.h" +#include "params.h" + +#define PLUS '+' //flag states +#define MINUS '-' +#define EQUAL '=' + +tesseract::ParamsVectors *GlobalParams() { + static tesseract::ParamsVectors global_params = tesseract::ParamsVectors(); + return &global_params; +} + +namespace tesseract { + +bool ParamUtils::ReadParamsFile(const char *file, + SetParamConstraint constraint, + ParamsVectors *member_params) { + int16_t nameoffset; // offset for real name + + if (*file == PLUS) { + nameoffset = 1; + } else if (*file == MINUS) { + nameoffset = 1; + } else { + nameoffset = 0; + } + + TFile fp; + if (!fp.Open(file + nameoffset, nullptr)) { + tprintf("read_params_file: Can't open %s\n", file + nameoffset); + return true; + } + return ReadParamsFromFp(constraint, &fp, member_params); +} + +bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, + ParamsVectors *member_params) { + char line[MAX_PATH]; // input line + bool anyerr = false; // true if any error + bool foundit; // found parameter + char *valptr; // value field + + while (fp->FGets(line, MAX_PATH) != nullptr) { + if (line[0] != '\r' && line[0] != '\n' && line[0] != '#') { + chomp_string(line); // remove newline + for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t'; + valptr++); + if (*valptr) { // found blank + *valptr = '\0'; // make name a string + do + valptr++; // find end of blanks + while (*valptr == ' ' || *valptr == '\t'); + } + foundit = SetParam(line, valptr, constraint, member_params); + + if (!foundit) { + anyerr = true; // had an error + /* willus mod */ + tprintf("Tesseract warning: Parameter %s not found in file %s.\n",line,fp->tfile_filename); + } + } + } + return anyerr; +} + +bool ParamUtils::SetParam(const char *name, const char* value, + SetParamConstraint constraint, + ParamsVectors *member_params) { + // Look for the 
parameter among string parameters. + StringParam *sp = FindParam(name, GlobalParams()->string_params, + member_params->string_params); + if (sp != nullptr && sp->constraint_ok(constraint)) sp->set_value(value); + if (*value == '\0') return (sp != nullptr); + + // Look for the parameter among int parameters. + int intval; + IntParam *ip = FindParam(name, GlobalParams()->int_params, + member_params->int_params); + if (ip && ip->constraint_ok(constraint) && sscanf(value, "%d", &intval) == 1) + ip->set_value(intval); + + // Look for the parameter among bool parameters. + BoolParam *bp = FindParam(name, GlobalParams()->bool_params, + member_params->bool_params); + if (bp != nullptr && bp->constraint_ok(constraint)) { + if (*value == 'T' || *value == 't' || + *value == 'Y' || *value == 'y' || *value == '1') { + bp->set_value(true); + } else if (*value == 'F' || *value == 'f' || + *value == 'N' || *value == 'n' || *value == '0') { + bp->set_value(false); + } + } + + // Look for the parameter among double parameters. + double doubleval; + DoubleParam *dp = FindParam(name, GlobalParams()->double_params, + member_params->double_params); + if (dp != nullptr && dp->constraint_ok(constraint)) { +#ifdef EMBEDDED + doubleval = strtofloat(value); +#else + if (sscanf(value, "%lf", &doubleval) == 1) +#endif + dp->set_value(doubleval); + } + return (sp || ip || bp || dp); +} + +bool ParamUtils::GetParamAsString(const char *name, + const ParamsVectors* member_params, + STRING *value) { + // Look for the parameter among string parameters. + StringParam *sp = FindParam(name, GlobalParams()->string_params, + member_params->string_params); + if (sp) { + *value = sp->string(); + return true; + } + // Look for the parameter among int parameters. 
+ IntParam *ip = FindParam(name, GlobalParams()->int_params, + member_params->int_params); + if (ip) { + char buf[128]; + snprintf(buf, sizeof(buf), "%d", int32_t(*ip)); + *value = buf; + return true; + } + // Look for the parameter among bool parameters. + BoolParam *bp = FindParam(name, GlobalParams()->bool_params, + member_params->bool_params); + if (bp != nullptr) { + *value = BOOL8(*bp) ? "1": "0"; + return true; + } + // Look for the parameter among double parameters. + DoubleParam *dp = FindParam(name, GlobalParams()->double_params, + member_params->double_params); + if (dp != nullptr) { + char buf[128]; + snprintf(buf, sizeof(buf), "%g", double(*dp)); + *value = buf; + return true; + } + return false; +} + +void ParamUtils::PrintParams(FILE *fp, const ParamsVectors *member_params) { + int v, i; + int num_iterations = (member_params == nullptr) ? 1 : 2; + for (v = 0; v < num_iterations; ++v) { + const ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params; + for (i = 0; i < vec->int_params.size(); ++i) { + fprintf(fp, "%s\t%d\t%s\n", vec->int_params[i]->name_str(), + (int32_t)(*vec->int_params[i]), vec->int_params[i]->info_str()); + } + for (i = 0; i < vec->bool_params.size(); ++i) { + fprintf(fp, "%s\t%d\t%s\n", vec->bool_params[i]->name_str(), + (BOOL8)(*vec->bool_params[i]), vec->bool_params[i]->info_str()); + } + for (int i = 0; i < vec->string_params.size(); ++i) { + fprintf(fp, "%s\t%s\t%s\n", vec->string_params[i]->name_str(), + vec->string_params[i]->string(), vec->string_params[i]->info_str()); + } + for (int i = 0; i < vec->double_params.size(); ++i) { + fprintf(fp, "%s\t%g\t%s\n", vec->double_params[i]->name_str(), + (double)(*vec->double_params[i]), vec->double_params[i]->info_str()); + } + } +} + +// Resets all parameters back to default values; +void ParamUtils::ResetToDefaults(ParamsVectors* member_params) { + int v, i; + int num_iterations = (member_params == nullptr) ? 
1 : 2; + for (v = 0; v < num_iterations; ++v) { + ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params; + for (i = 0; i < vec->int_params.size(); ++i) { + vec->int_params[i]->ResetToDefault(); + } + for (i = 0; i < vec->bool_params.size(); ++i) { + vec->bool_params[i]->ResetToDefault(); + } + for (int i = 0; i < vec->string_params.size(); ++i) { + vec->string_params[i]->ResetToDefault(); + } + for (int i = 0; i < vec->double_params.size(); ++i) { + vec->double_params[i]->ResetToDefault(); + } + } +} + +} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/readme.txt k2pdfopt-2.51+ds/tesseract_mod/readme.txt --- k2pdfopt-2.42+ds/tesseract_mod/readme.txt 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/readme.txt 2019-01-04 21:42:24.000000000 +0000 @@ -0,0 +1,7 @@ +I compiled Tesseract v4.0.0 with gcc using these additional options: +-march=nehalem -DGRAPHICS_DISABLED -Wno-sign-compare + +I compiled these three files with -march=sandybridge +simddetect.cpp +dotproductavx.cpp +intsimdmatrixavx2.cpp diff -Nru k2pdfopt-2.42+ds/tesseract_mod/serialis.cpp k2pdfopt-2.51+ds/tesseract_mod/serialis.cpp --- k2pdfopt-2.42+ds/tesseract_mod/serialis.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/serialis.cpp 2018-11-21 22:13:03.000000000 +0000 @@ -0,0 +1,335 @@ +/********************************************************************** + * File: serialis.cpp (Formerly serialmac.h) + * Description: Inline routines and macros for serialisation functions + * Author: Phil Cheatle + * Created: Tue Oct 08 08:33:12 BST 1991 + * + * (C) Copyright 1990, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "serialis.h" +#include +#include "errcode.h" +#include "genericvector.h" + +namespace tesseract { + +bool DeSerialize(FILE* fp, char* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, float* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, int8_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, int16_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, int32_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, uint8_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, uint16_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool DeSerialize(FILE* fp, uint32_t* data, size_t n) { + return fread(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const char* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const float* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const int8_t* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const int16_t* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const int32_t* data, size_t n) { + return 
fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const uint8_t* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const uint16_t* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +bool Serialize(FILE* fp, const uint32_t* data, size_t n) { + return fwrite(data, sizeof(*data), n, fp) == n; +} + +TFile::TFile() + : offset_(0), + data_(nullptr), + data_is_owned_(false), + is_writing_(false), + swap_(false) {} + +TFile::~TFile() { + if (data_is_owned_) + delete data_; +} + +bool TFile::DeSerialize(char* buffer, size_t count) { + return FRead(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(double* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(float* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(int8_t* buffer, size_t count) { + return FRead(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(int16_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(int32_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(int64_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(uint8_t* buffer, size_t count) { + return FRead(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(uint16_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(uint32_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::DeSerialize(uint64_t* buffer, size_t count) { + return FReadEndian(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const char* buffer, size_t count) { + 
return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const double* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const float* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const int8_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const int16_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const int32_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const int64_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const uint8_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const uint16_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const uint32_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Serialize(const uint64_t* buffer, size_t count) { + return FWrite(buffer, sizeof(*buffer), count) == count; +} + +bool TFile::Skip(size_t count) { + offset_ += count; + return true; +} + +bool TFile::Open(const STRING& filename, FileReader reader) { + if (!data_is_owned_) { + data_ = new GenericVector; + data_is_owned_ = true; + } + offset_ = 0; + is_writing_ = false; + swap_ = false; + /* willus mod */ + strncpy(tfile_filename,filename.string(),511); + tfile_filename[511]='\0'; + if (reader == nullptr) + return LoadDataFromFile(filename, data_); + else + return (*reader)(filename, data_); +} + +bool TFile::Open(const char* data, int size) { + offset_ = 0; + if (!data_is_owned_) { + data_ = new GenericVector; + data_is_owned_ = true; + } + is_writing_ = false; + swap_ = false; + 
data_->resize_no_init(size); + memcpy(&(*data_)[0], data, size); + return true; +} + +bool TFile::Open(FILE* fp, int64_t end_offset) { + offset_ = 0; + long current_pos = ftell(fp); + if (current_pos < 0) { + // ftell failed. + return false; + } + if (end_offset < 0) { + if (fseek(fp, 0, SEEK_END)) + return false; + end_offset = ftell(fp); + if (fseek(fp, current_pos, SEEK_SET)) + return false; + } + int size = end_offset - current_pos; + is_writing_ = false; + swap_ = false; + if (!data_is_owned_) { + data_ = new GenericVector; + data_is_owned_ = true; + } + data_->resize_no_init(size); + return static_cast(fread(&(*data_)[0], 1, size, fp)) == size; +} + +char* TFile::FGets(char* buffer, int buffer_size) { + ASSERT_HOST(!is_writing_); + int size = 0; + while (size + 1 < buffer_size && offset_ < data_->size()) { + buffer[size++] = (*data_)[offset_++]; + if ((*data_)[offset_ - 1] == '\n') break; + } + if (size < buffer_size) buffer[size] = '\0'; + return size > 0 ? buffer : nullptr; +} + +int TFile::FReadEndian(void* buffer, size_t size, int count) { + int num_read = FRead(buffer, size, count); + if (swap_) { + char* char_buffer = static_cast(buffer); + for (int i = 0; i < num_read; ++i, char_buffer += size) { + ReverseN(char_buffer, size); + } + } + return num_read; +} + +int TFile::FRead(void* buffer, size_t size, int count) { + ASSERT_HOST(!is_writing_); + ASSERT_HOST(size > 0); + ASSERT_HOST(count >= 0); + size_t required_size; + if (SIZE_MAX / size <= count) { + // Avoid integer overflow. 
+ required_size = data_->size() - offset_; + } else { + required_size = size * count; + if (data_->size() - offset_ < required_size) { + required_size = data_->size() - offset_; + } + } + if (required_size > 0 && buffer != nullptr) + memcpy(buffer, &(*data_)[offset_], required_size); + offset_ += required_size; + return required_size / size; +} + +void TFile::Rewind() { + ASSERT_HOST(!is_writing_); + offset_ = 0; +} + +void TFile::OpenWrite(GenericVector* data) { + offset_ = 0; + if (data != nullptr) { + if (data_is_owned_) delete data_; + data_ = data; + data_is_owned_ = false; + } else if (!data_is_owned_) { + data_ = new GenericVector; + data_is_owned_ = true; + } + is_writing_ = true; + swap_ = false; + data_->truncate(0); +} + +bool TFile::CloseWrite(const STRING& filename, FileWriter writer) { + ASSERT_HOST(is_writing_); + if (writer == nullptr) + return SaveDataToFile(*data_, filename); + else + return (*writer)(*data_, filename); +} + +int TFile::FWrite(const void* buffer, size_t size, int count) { + ASSERT_HOST(is_writing_); + ASSERT_HOST(size > 0); + ASSERT_HOST(count >= 0); + ASSERT_HOST(SIZE_MAX / size > count); + size_t total = size * count; + const char* buf = static_cast(buffer); + // This isn't very efficient, but memory is so fast compared to disk + // that it is relatively unimportant, and very simple. + for (size_t i = 0; i < total; ++i) + data_->push_back(buf[i]); + return count; +} + +} // namespace tesseract. 
diff -Nru k2pdfopt-2.42+ds/tesseract_mod/serialis.h k2pdfopt-2.51+ds/tesseract_mod/serialis.h --- k2pdfopt-2.42+ds/tesseract_mod/serialis.h 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/serialis.h 2018-11-21 21:41:24.000000000 +0000 @@ -0,0 +1,164 @@ +/********************************************************************** + * File: serialis.h (Formerly serialmac.h) + * Description: Inline routines and macros for serialisation functions + * Author: Phil Cheatle + * Created: Tue Oct 08 08:33:12 BST 1991 + * + * (C) Copyright 1990, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifndef SERIALIS_H +#define SERIALIS_H + +#include +#include +#include +#include "host.h" + +template class GenericVector; +class STRING; + +/*********************************************************************** + QUOTE_IT MACRO DEFINITION + =========================== +Replace with "". may be an arbitrary number of tokens +***********************************************************************/ + +#define QUOTE_IT(parm) #parm + +namespace tesseract { + +// Return number of elements of an array. +template +constexpr size_t countof(T const (&)[N]) noexcept { + return N; +} + +// Function to read a GenericVector from a whole file. +// Returns false on failure. 
+typedef bool (*FileReader)(const STRING& filename, GenericVector* data); +// Function to write a GenericVector to a whole file. +// Returns false on failure. +typedef bool (*FileWriter)(const GenericVector& data, + const STRING& filename); + +// Deserialize data from file. +bool DeSerialize(FILE* fp, char* data, size_t n = 1); +bool DeSerialize(FILE* fp, float* data, size_t n = 1); +bool DeSerialize(FILE* fp, int8_t* data, size_t n = 1); +bool DeSerialize(FILE* fp, int16_t* data, size_t n = 1); +bool DeSerialize(FILE* fp, int32_t* data, size_t n = 1); +bool DeSerialize(FILE* fp, uint8_t* data, size_t n = 1); +bool DeSerialize(FILE* fp, uint16_t* data, size_t n = 1); +bool DeSerialize(FILE* fp, uint32_t* data, size_t n = 1); + +// Serialize data to file. +bool Serialize(FILE* fp, const char* data, size_t n = 1); +bool Serialize(FILE* fp, const float* data, size_t n = 1); +bool Serialize(FILE* fp, const int8_t* data, size_t n = 1); +bool Serialize(FILE* fp, const int16_t* data, size_t n = 1); +bool Serialize(FILE* fp, const int32_t* data, size_t n = 1); +bool Serialize(FILE* fp, const uint8_t* data, size_t n = 1); +bool Serialize(FILE* fp, const uint16_t* data, size_t n = 1); +bool Serialize(FILE* fp, const uint32_t* data, size_t n = 1); + +// Simple file class. +// Allows for portable file input from memory and from foreign file systems. +class TFile { + public: + TFile(); + ~TFile(); + /* willus mod */ + char tfile_filename[512]; + + // All the Open methods load the whole file into memory for reading. + // Opens a file with a supplied reader, or nullptr to use the default. + // Note that mixed read/write is not supported. + bool Open(const STRING& filename, FileReader reader); + // From an existing memory buffer. + bool Open(const char* data, int size); + // From an open file and an end offset. + bool Open(FILE* fp, int64_t end_offset); + // Sets the value of the swap flag, so that FReadEndian does the right thing. 
+ void set_swap(bool value) { swap_ = value; } + + // Deserialize data. + bool DeSerialize(char* data, size_t count = 1); + bool DeSerialize(double* data, size_t count = 1); + bool DeSerialize(float* data, size_t count = 1); + bool DeSerialize(int8_t* data, size_t count = 1); + bool DeSerialize(int16_t* data, size_t count = 1); + bool DeSerialize(int32_t* data, size_t count = 1); + bool DeSerialize(int64_t* data, size_t count = 1); + bool DeSerialize(uint8_t* data, size_t count = 1); + bool DeSerialize(uint16_t* data, size_t count = 1); + bool DeSerialize(uint32_t* data, size_t count = 1); + bool DeSerialize(uint64_t* data, size_t count = 1); + + // Serialize data. + bool Serialize(const char* data, size_t count = 1); + bool Serialize(const double* data, size_t count = 1); + bool Serialize(const float* data, size_t count = 1); + bool Serialize(const int8_t* data, size_t count = 1); + bool Serialize(const int16_t* data, size_t count = 1); + bool Serialize(const int32_t* data, size_t count = 1); + bool Serialize(const int64_t* data, size_t count = 1); + bool Serialize(const uint8_t* data, size_t count = 1); + bool Serialize(const uint16_t* data, size_t count = 1); + bool Serialize(const uint32_t* data, size_t count = 1); + bool Serialize(const uint64_t* data, size_t count = 1); + + // Skip data. + bool Skip(size_t count); + + // Reads a line like fgets. Returns nullptr on EOF, otherwise buffer. + // Reads at most buffer_size bytes, including '\0' terminator, even if + // the line is longer. Does nothing if buffer_size <= 0. + // To use fscanf use FGets and sscanf. + char* FGets(char* buffer, int buffer_size); + // Replicates fread, followed by a swap of the bytes if needed, returning the + // number of items read. If swap_ is true then the count items will each have + // size bytes reversed. + int FReadEndian(void* buffer, size_t size, int count); + // Replicates fread, returning the number of items read. 
+ int FRead(void* buffer, size_t size, int count); + // Resets the TFile as if it has been Opened, but nothing read. + // Only allowed while reading! + void Rewind(); + + // Open for writing. Either supply a non-nullptr data with OpenWrite before + // calling FWrite, (no close required), or supply a nullptr data to OpenWrite + // and call CloseWrite to write to a file after the FWrites. + void OpenWrite(GenericVector* data); + bool CloseWrite(const STRING& filename, FileWriter writer); + + // Replicates fwrite, returning the number of items written. + // To use fprintf, use snprintf and FWrite. + int FWrite(const void* buffer, size_t size, int count); + + private: + // The number of bytes used so far. + int offset_; + // The buffered data from the file. + GenericVector* data_; + // True if the data_ pointer is owned by *this. + bool data_is_owned_; + // True if the TFile is open for writing. + bool is_writing_; + // True if bytes need to be swapped in FReadEndian. + bool swap_; +}; + +} // namespace tesseract. + +#endif diff -Nru k2pdfopt-2.42+ds/tesseract_mod/simddetect.cpp k2pdfopt-2.51+ds/tesseract_mod/simddetect.cpp --- k2pdfopt-2.42+ds/tesseract_mod/simddetect.cpp 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/simddetect.cpp 2019-01-01 03:57:51.000000000 +0000 @@ -0,0 +1,109 @@ +/////////////////////////////////////////////////////////////////////// +// File: simddetect.cpp +// Description: Architecture detector. +// Author: Stefan Weil (based on code from Ray Smith) +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "simddetect.h" + +#undef X86_BUILD +#if defined(__x86_64__) || defined(__i386__) || defined(_WIN32) +#if !defined(ANDROID_BUILD) +#define X86_BUILD 1 +#endif // !ANDROID_BUILD +#endif // x86 target + +#if defined(X86_BUILD) +#if defined(__GNUC__) +#include +#elif defined(_WIN32) +#include +#endif +#endif + +/* WILLUS */ +/* +#include +*/ + +SIMDDetect SIMDDetect::detector; + +// If true, then AVX has been detected. +bool SIMDDetect::avx_available_; +bool SIMDDetect::avx2_available_; +bool SIMDDetect::avx512F_available_; +bool SIMDDetect::avx512BW_available_; +// If true, then SSe4.1 has been detected. +bool SIMDDetect::sse_available_; + +// Constructor. +// Tests the architecture in a system-dependent way to detect AVX, SSE and +// any other available SIMD equipment. +// __GNUC__ is also defined by compilers that include GNU extensions such as +// clang. +SIMDDetect::SIMDDetect() { +#if defined(X86_BUILD) +#if defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + + /* willus mod: Default to false for all */ + sse_available_ = false; + avx_available_ = false; + avx2_available_ = false; + avx512F_available_ = false; + avx512BW_available_ = false; + + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) { + // Note that these tests all use hex because the older compilers don't have + // the newer flags. 
+#ifdef __SSE4_1__ + sse_available_ = (ecx & 0x00080000) != 0; +#endif +#ifdef __AVX__ + avx_available_ = (ecx & 0x10000000) != 0; +#endif + if (avx_available_) { + // There is supposed to be a __get_cpuid_count function, but this is all + // there is in my cpuid.h. It is a macro for an asm statement and cannot + // be used inside an if. + __cpuid_count(7, 0, eax, ebx, ecx, edx); +#ifdef __AVX2__ + avx2_available_ = (ebx & 0x00000020) != 0; + avx512F_available_ = (ebx & 0x00010000) != 0; + avx512BW_available_ = (ebx & 0x40000000) != 0; +#endif + } + } +#elif defined(_WIN32) + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + if (cpuInfo[0] >= 1) { + __cpuid(cpuInfo, 1); +#ifdef __SSE4_1__ + sse_available_ = (cpuInfo[2] & 0x00080000) != 0; +#endif +#ifdef __AVX__ + avx_available_ = (cpuInfo[2] & 0x10000000) != 0; +#endif + } +#else +#error "I don't know how to test for SIMD with this compiler" +#endif +/* +printf("sse_available_ = %d\n",(int)sse_available_); +printf("avx_available_ = %d\n",(int)avx_available_); +printf("avx2_available_ = %d\n",(int)avx2_available_); +*/ +#endif // X86_BUILD +} diff -Nru k2pdfopt-2.42+ds/tesseract_mod/tesscapi.cpp k2pdfopt-2.51+ds/tesseract_mod/tesscapi.cpp --- k2pdfopt-2.42+ds/tesseract_mod/tesscapi.cpp 2016-12-31 01:16:18.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/tesscapi.cpp 2018-11-22 21:22:00.000000000 +0000 @@ -1,4 +1,3 @@ -#include "config_auto.h" /* ** tesscapi.cpp willus.com attempt at C wrapper for tesseract. 
** (Butchered from tesseractmain.cpp) @@ -21,7 +20,6 @@ ** */ -#include "config_auto.h" /* #include "mfcpch.h" */ @@ -47,6 +45,8 @@ #include "strngs.h" #include "params.h" #include "blobs.h" +#include "simddetect.h" +#include "tesseractclass.h" /* #include "notdll.h" */ @@ -59,8 +59,8 @@ /* ** ocr_type=0: OEM_DEFAULT ** ocr_type=1: OEM_TESSERACT_ONLY -** ocr_type=2: OEM_CUBE_ONLY -** ocr_type=3: OEM_TESSERACT_CUBE_COMBINED +** ocr_type=2: OEM_LSTM_ONLY +** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED */ void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out, char *initstr,int maxlen,int *status) @@ -68,7 +68,12 @@ { char original_locale[256]; tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI; - +/* +printf("@tess_capi_init\n"); +printf(" datapath='%s'\n",datapath); +printf(" language='%s'\n",language); +printf(" ocr_type=%d\n",ocr_type); +*/ #ifdef USE_NLS setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); @@ -76,9 +81,22 @@ #endif /* willus mod, 11-24-16 */ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */ +/* +printf("locale='%s'\n",setlocale(LC_ALL,NULL)); +printf("ctype='%s'\n",setlocale(LC_CTYPE,NULL)); +printf("numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); +*/ strncpy(original_locale,setlocale(LC_ALL,NULL),255); original_locale[255]='\0'; +/* +printf("original_locale='%s'\n",original_locale); +*/ setlocale(LC_ALL,"C"); +/* +printf("new locale='%s'\n",setlocale(LC_ALL,NULL)); +printf("new ctype='%s'\n",setlocale(LC_CTYPE,NULL)); +printf("new numeric='%s'\n",setlocale(LC_NUMERIC,NULL)); +*/ // fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version()); // Make the order of args a bit more forgiving than it used to be. const char* lang = "eng"; @@ -108,13 +126,20 @@ exit(1); } */ - +/* +printf("SSE = %s\n",SIMDDetect::IsSSEAvailable() ? "AVAILABLE" : "NOT AVAILABLE"); +printf("AVX = %s\n",SIMDDetect::IsAVXAvailable() ? 
"AVAILABLE" : "NOT AVAILABLE"); +*/ +/* +v4.00 loads either TESSERACT enginer, LSTM engine, or both. No CUBE. +*/ + ocr_type=0; /* Ignore specified and use default */ api->SetOutputName(NULL); (*status)=api->Init(datapath,lang, ocr_type==0 ? tesseract::OEM_DEFAULT : (ocr_type==1 ? tesseract::OEM_TESSERACT_ONLY : - (ocr_type==2 ? tesseract::OEM_CUBE_ONLY : - (tesseract::OEM_TESSERACT_CUBE_COMBINED)))); + (ocr_type==2 ? tesseract::OEM_LSTM_ONLY : + (tesseract::OEM_TESSERACT_LSTM_COMBINED)))); if ((*status)!=0) { /* willus mod, 11-24-16 */ @@ -146,16 +171,51 @@ ** Initialization message */ { - char istr[256]; + char istr[1024]; + int sse,avx; + +// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode); + sprintf(istr,"%s",api->Version()); + sse=SIMDDetect::IsSSEAvailable(); + avx=SIMDDetect::IsAVXAvailable(); + if (sse || avx) + sprintf(&istr[strlen(istr)]," [%s]",sse&&avx?"SSE+AVX":(sse?"SSE":"AVX")); + sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath); + strcat(istr,"\n Tesseract languages: "); + GenericVector languages; + api->GetLoadedLanguagesAsVector(&languages); +/* +printf("OEM=%d\n",api->oem()); +printf("Langs='%s'\n",api->GetInitLanguagesAsString()); +printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang()); +printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang()); +printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs()); +printf("languages.size()=%d\n",(int)languages.size()); +*/ + + for (int i=0;i<=api->tesseract()->num_sub_langs();i++) + { + tesseract::Tesseract *lang1; + int eng; + lang1 = i==0 ? api->tesseract() : api->tesseract()->get_sub_lang(i-1); + eng=(int)lang1->tessedit_ocr_engine_mode; + sprintf(&istr[strlen(istr)],"%s%s [%s]",i==0?"":", ",lang1->lang.string(), + eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess")); + } +/* +printf("%d. 
'%s'\n",i+1,languages[i].string()); +printf(" sublang[%d].oem_engine = %d\n",i+1,(int)api->tesseract()->get_sub_lang(i)->tessedit_ocr_engine_mode); +*/ - sprintf(istr,"Tesseract Open Source OCR Engine v%s ",tesseract::TessBaseAPI::Version()); + /* if (ocr_type==0 || ocr_type==3) - sprintf(&istr[strlen(istr)],"[CUBE+] (lang="); + sprintf(&istr[strlen(istr)],"[LSTM+] (lang="); else if (ocr_type==2) - sprintf(&istr[strlen(istr)],"[CUBE] (lang="); + sprintf(&istr[strlen(istr)],"[LSTM] (lang="); strncpy(&istr[strlen(istr)],language,253-strlen(istr)); istr[253]='\0'; strcat(istr,")"); + */ if (out!=NULL) fprintf(out,"%s\n",istr); if (initstr!=NULL) @@ -166,10 +226,10 @@ } - /* Turn off CUBE debugging output */ - api->SetVariable("cube_debug_level","0"); + /* Turn off LSTM debugging output */ + api->SetVariable("lstm_debug_level","0"); #if (WILLUSDEBUG & 1) - api->SetVariable("cube_debug_level","9"); + api->SetVariable("lstm_debug_level","9"); api->SetVariable("paragraph_debug_level","9"); api->SetVariable("tessdata_manager_debug_level","9"); api->SetVariable("tosp_debug_level","9"); @@ -182,12 +242,18 @@ } -int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,FILE *out) +int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out) { tesseract::TessBaseAPI *api; + static int old_segmode=-1; api=(tesseract::TessBaseAPI *)vapi; + if (old_segmode != segmode) + { + old_segmode=segmode; + api->SetPageSegMode((tesseract::PageSegMode)segmode); + } if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) { /* pixDestroy(&pix); */ @@ -201,6 +267,35 @@ api->Clear(); return(0); } + + +int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode, + int **left,int **top,int **right,int **bottom, + int **ybase,char **text,int *nw, + FILE *out) + + { + tesseract::TessBaseAPI *api; + static int old_segmode=-1; + + api=(tesseract::TessBaseAPI *)vapi; + if (old_segmode != segmode) + { + old_segmode=segmode; + 
api->SetPageSegMode((tesseract::PageSegMode)segmode); + } + if (!api->ProcessPage(pix,0,NULL,NULL,0,NULL)) + { + if (out!=NULL) + fprintf(out,"tesscapi: Error during bitmap processing.\n"); + api->Clear(); + (*nw)=0; + return(-1); + } + (*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text); + api->Clear(); + return(0); + } void tess_capi_end(void *vapi) diff -Nru k2pdfopt-2.42+ds/tesseract_mod/tessdatamanager.cpp k2pdfopt-2.51+ds/tesseract_mod/tessdatamanager.cpp --- k2pdfopt-2.42+ds/tesseract_mod/tessdatamanager.cpp 2017-02-25 04:38:45.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/tessdatamanager.cpp 2018-11-18 18:47:10.000000000 +0000 @@ -1,283 +1,277 @@ -#include "config_auto.h" -/////////////////////////////////////////////////////////////////////// -// File: tessdatamanager.cpp -// Description: Functions to handle loading/combining tesseract data files. -// Author: Daria Antonova -// Created: Wed Jun 03 11:26:43 PST 2009 -// -// (C) Copyright 2009, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -/////////////////////////////////////////////////////////////////////// - -#ifdef _MSC_VER -#pragma warning(disable:4244) // Conversion warnings -#endif - -#include "tessdatamanager.h" - -#include - -#include "helpers.h" -#include "serialis.h" -#include "strngs.h" -#include "tprintf.h" -#include "params.h" - -namespace tesseract { - -bool TessdataManager::Init(const char *data_file_name, int debug_level) { - int i; - debug_level_ = debug_level; - data_file_name_ = data_file_name; - data_file_ = fopen(data_file_name, "rb"); - if (data_file_ == NULL) { - if (debug_level_) { - tprintf("Error opening data file %s\n", data_file_name); - tprintf("Please make sure the TESSDATA_PREFIX environment variable is set " - "to the parent directory of your \"tessdata\" directory.\n"); - } - return false; - } - fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); - swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); - if (swap_) { - ReverseN(&actual_tessdata_num_entries_, - sizeof(actual_tessdata_num_entries_)); - } - if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { - // For forward compatibility, truncate to the number we can handle. 
- actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; - } - fread(offset_table_, sizeof(inT64), - actual_tessdata_num_entries_, data_file_); - if (swap_) { - for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { - ReverseN(&offset_table_[i], sizeof(offset_table_[i])); - } - } - if (debug_level_) { - tprintf("TessdataManager loaded %d types of tesseract data files.\n", - actual_tessdata_num_entries_); - for (i = 0; i < actual_tessdata_num_entries_; ++i) { - tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); - } - } - return true; -} - -void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, - bool newline_end, inT64 num_bytes_to_copy) { - if (num_bytes_to_copy == 0) return; - int buffer_size = 1024; - if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { - buffer_size = num_bytes_to_copy; - } - inT64 num_bytes_copied = 0; - char *chunk = new char[buffer_size]; - int bytes_read; - char last_char = 0x0; - while ((bytes_read = fread(chunk, sizeof(char), - buffer_size, input_file))) { - fwrite(chunk, sizeof(char), bytes_read, output_file); - last_char = chunk[bytes_read-1]; - if (num_bytes_to_copy > 0) { - num_bytes_copied += bytes_read; - if (num_bytes_copied == num_bytes_to_copy) break; - if (num_bytes_copied + buffer_size > num_bytes_to_copy) { - buffer_size = num_bytes_to_copy - num_bytes_copied; - } - } - } - if (newline_end) ASSERT_HOST(last_char == '\n'); - delete[] chunk; -} - -bool TessdataManager::WriteMetadata(inT64 *offset_table, - const char * language_data_path_prefix, - FILE *output_file) { - inT32 num_entries = TESSDATA_NUM_ENTRIES; - bool result = true; - if (fseek(output_file, 0, SEEK_SET) != 0 || - fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 || - fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, - output_file) != TESSDATA_NUM_ENTRIES) { - fclose(output_file); - result = false; - tprintf("WriteMetadata failed in TessdataManager!\n"); - } else if (fclose(output_file)) { - result = false; - 
tprintf("WriteMetadata failed to close file!\n"); - } else { - tprintf("TessdataManager combined tesseract data files.\n"); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - tprintf("Offset for type %2d (%s%-22s) is %lld\n", i, - language_data_path_prefix, kTessdataFileSuffixes[i], - offset_table[i]); - } - } - return result; -} - -bool TessdataManager::CombineDataFiles( - const char *language_data_path_prefix, - const char *output_filename) { - int i; - inT64 offset_table[TESSDATA_NUM_ENTRIES]; - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; - FILE *output_file = fopen(output_filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s for writing\n", output_filename); - return false; - } - // Leave some space for recording the offset_table. - if (fseek(output_file, - sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { - tprintf("Error seeking %s\n", output_filename); - fclose(output_file); - return false; - } - - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - FILE *file_ptr[TESSDATA_NUM_ENTRIES]; - - // Load individual tessdata components from files. - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - ASSERT_HOST(TessdataTypeFromFileSuffix( - kTessdataFileSuffixes[i], &type, &text_file)); - STRING filename = language_data_path_prefix; - filename += kTessdataFileSuffixes[i]; - file_ptr[i] = fopen(filename.string(), "rb"); - if (file_ptr[i] != NULL) { - offset_table[type] = ftell(output_file); - CopyFile(file_ptr[i], output_file, text_file, -1); - fclose(file_ptr[i]); - } - } - - // Make sure that the required components are present. 
- if (file_ptr[TESSDATA_UNICHARSET] == NULL) { - tprintf("Error opening %sunicharset file\n", language_data_path_prefix); - fclose(output_file); - return false; - } - if (file_ptr[TESSDATA_INTTEMP] != NULL && - (file_ptr[TESSDATA_PFFMTABLE] == NULL || - file_ptr[TESSDATA_NORMPROTO] == NULL)) { - tprintf("Error opening %spffmtable and/or %snormproto files" - " while %sinttemp file was present\n", language_data_path_prefix, - language_data_path_prefix, language_data_path_prefix); - fclose(output_file); - return false; - } - - return WriteMetadata(offset_table, language_data_path_prefix, output_file); -} - -bool TessdataManager::OverwriteComponents( - const char *new_traineddata_filename, - char **component_filenames, - int num_new_components) { - int i; - inT64 offset_table[TESSDATA_NUM_ENTRIES]; - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - FILE *file_ptr[TESSDATA_NUM_ENTRIES]; - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - offset_table[i] = -1; - file_ptr[i] = NULL; - } - FILE *output_file = fopen(new_traineddata_filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s for writing\n", new_traineddata_filename); - return false; - } - - // Leave some space for recording the offset_table. - if (fseek(output_file, - sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { - fclose(output_file); - tprintf("Error seeking %s\n", new_traineddata_filename); - return false; - } - - // Open the files with the new components. - for (i = 0; i < num_new_components; ++i) { - if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file)) - file_ptr[type] = fopen(component_filenames[i], "rb"); - } - - // Write updated data to the output traineddata file. - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - if (file_ptr[i] != NULL) { - // Get the data from the opened component file. 
- offset_table[i] = ftell(output_file); - CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); - fclose(file_ptr[i]); - } else { - // Get this data component from the loaded data file. - if (SeekToStart(static_cast(i))) { - offset_table[i] = ftell(output_file); - CopyFile(data_file_, output_file, kTessdataFileIsText[i], - GetEndOffset(static_cast(i)) - - ftell(data_file_) + 1); - } - } - } - const char *language_data_path_prefix = strchr(new_traineddata_filename, '.'); - return WriteMetadata(offset_table, language_data_path_prefix, output_file); -} - -bool TessdataManager::TessdataTypeFromFileSuffix( - const char *suffix, TessdataType *type, bool *text_file) { - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { - *type = static_cast(i); - *text_file = kTessdataFileIsText[i]; - return true; - } - } - tprintf("TessdataManager can't determine which tessdata" - " component is represented by %s\n", suffix); - return false; -} - -bool TessdataManager::TessdataTypeFromFileName( - const char *filename, TessdataType *type, bool *text_file) { - // Get the file suffix (extension) - const char *suffix = strrchr(filename, '.'); - if (suffix == NULL || *(++suffix) == '\0') return false; - return TessdataTypeFromFileSuffix(suffix, type, text_file); -} - -bool TessdataManager::ExtractToFile(const char *filename) { - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( - filename, &type, &text_file)); - if (!SeekToStart(type)) return false; - - FILE *output_file = fopen(filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s\n", filename); - exit(1); - } - inT64 begin_offset = ftell(GetDataFilePtr()); - inT64 end_offset = GetEndOffset(type); - tesseract::TessdataManager::CopyFile( - GetDataFilePtr(), output_file, text_file, - end_offset - begin_offset + 1); - fclose(output_file); - return true; -} - -} // namespace 
tesseract +/////////////////////////////////////////////////////////////////////// +// File: tessdatamanager.cpp +// Description: Functions to handle loading/combining tesseract data files. +// Author: Daria Antonova +// Created: Wed Jun 03 11:26:43 PST 2009 +// +// (C) Copyright 2009, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "tessdatamanager.h" + +#include + +#include "errcode.h" +#include "helpers.h" +#include "serialis.h" +#include "strngs.h" +#include "tprintf.h" +#include "params.h" +/* willus mod */ +#include "tess_version.h" + +namespace tesseract { + +TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) { + SetVersionString(PACKAGE_VERSION); +} + +TessdataManager::TessdataManager(FileReader reader) + : reader_(reader), + is_loaded_(false), + swap_(false) { + SetVersionString(PACKAGE_VERSION); +} + +// Lazily loads from the the given filename. Won't actually read the file +// until it needs it. 
+void TessdataManager::LoadFileLater(const char *data_file_name) { + Clear(); + data_file_name_ = data_file_name; +} + +bool TessdataManager::Init(const char *data_file_name) { + GenericVector data; + if (reader_ == nullptr) { + if (!LoadDataFromFile(data_file_name, &data)) return false; + } else { + if (!(*reader_)(data_file_name, &data)) return false; + } + return LoadMemBuffer(data_file_name, &data[0], data.size()); +} + +// Loads from the given memory buffer as if a file. +bool TessdataManager::LoadMemBuffer(const char *name, const char *data, + int size) { + Clear(); + data_file_name_ = name; + TFile fp; + fp.Open(data, size); + uint32_t num_entries; + if (!fp.DeSerialize(&num_entries)) return false; + swap_ = num_entries > kMaxNumTessdataEntries; + fp.set_swap(swap_); + if (swap_) ReverseN(&num_entries, sizeof(num_entries)); + if (num_entries > kMaxNumTessdataEntries) return false; + GenericVector offset_table; + offset_table.resize_no_init(num_entries); + if (!fp.DeSerialize(&offset_table[0], num_entries)) return false; + for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { + if (offset_table[i] >= 0) { + int64_t entry_size = size - offset_table[i]; + int j = i + 1; + while (j < num_entries && offset_table[j] == -1) ++j; + if (j < num_entries) entry_size = offset_table[j] - offset_table[i]; + entries_[i].resize_no_init(entry_size); + if (!fp.DeSerialize(&entries_[i][0], entry_size)) return false; + } + } + if (entries_[TESSDATA_VERSION].empty()) { + SetVersionString("Pre-4.0.0"); + } + is_loaded_ = true; + return true; +} + +// Overwrites a single entry of the given type. +void TessdataManager::OverwriteEntry(TessdataType type, const char *data, + int size) { + is_loaded_ = true; + entries_[type].resize_no_init(size); + memcpy(&entries_[type][0], data, size); +} + +// Saves to the given filename. 
+bool TessdataManager::SaveFile(const STRING &filename, + FileWriter writer) const { + ASSERT_HOST(is_loaded_); + GenericVector data; + Serialize(&data); + if (writer == nullptr) + return SaveDataToFile(data, filename); + else + return (*writer)(data, filename); +} + +// Serializes to the given vector. +void TessdataManager::Serialize(GenericVector *data) const { + ASSERT_HOST(is_loaded_); + // Compute the offset_table and total size. + int64_t offset_table[TESSDATA_NUM_ENTRIES]; + int64_t offset = sizeof(int32_t) + sizeof(offset_table); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (entries_[i].empty()) { + offset_table[i] = -1; + } else { + offset_table[i] = offset; + offset += entries_[i].size(); + } + } + data->init_to_size(offset, 0); + int32_t num_entries = TESSDATA_NUM_ENTRIES; + TFile fp; + fp.OpenWrite(data); + fp.Serialize(&num_entries); + fp.Serialize(&offset_table[0], countof(offset_table)); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (!entries_[i].empty()) { + fp.Serialize(&entries_[i][0], entries_[i].size()); + } + } +} + +// Resets to the initial state, keeping the reader. +void TessdataManager::Clear() { + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + entries_[i].clear(); + } + is_loaded_ = false; +} + +// Prints a directory of contents. +void TessdataManager::Directory() const { + tprintf("Version string:%s\n", VersionString().c_str()); + int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (!entries_[i].empty()) { + tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i], + entries_[i].size(), offset); + offset += entries_[i].size(); + } + } +} + +// Opens the given TFile pointer to the given component type. +// Returns false in case of failure. 
+bool TessdataManager::GetComponent(TessdataType type, TFile *fp) { + if (!is_loaded_ && !Init(data_file_name_.string())) return false; + const TessdataManager *const_this = this; + return const_this->GetComponent(type, fp); +} + +// As non-const version except it can't load the component if not already +// loaded. +bool TessdataManager::GetComponent(TessdataType type, TFile *fp) const { + ASSERT_HOST(is_loaded_); + if (entries_[type].empty()) return false; + fp->Open(&entries_[type][0], entries_[type].size()); + fp->set_swap(swap_); + return true; +} + +// Returns the current version string. +std::string TessdataManager::VersionString() const { + return std::string(&entries_[TESSDATA_VERSION][0], + entries_[TESSDATA_VERSION].size()); +} + +// Sets the version string to the given v_str. +void TessdataManager::SetVersionString(const std::string &v_str) { + entries_[TESSDATA_VERSION].resize_no_init(v_str.size()); + memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size()); +} + +bool TessdataManager::CombineDataFiles( + const char *language_data_path_prefix, + const char *output_filename) { + // Load individual tessdata components from files. + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + TessdataType type; + ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type)); + STRING filename = language_data_path_prefix; + filename += kTessdataFileSuffixes[i]; + FILE *fp = fopen(filename.string(), "rb"); + if (fp != nullptr) { + fclose(fp); + if (!LoadDataFromFile(filename, &entries_[type])) { + tprintf("Load of file %s failed!\n", filename.string()); + return false; + } + } + } + is_loaded_ = true; + + // Make sure that the required components are present. + if (!IsBaseAvailable() && !IsLSTMAvailable()) { + tprintf( + "Error: traineddata file must contain at least (a unicharset file" + "and inttemp) OR an lstm file.\n"); + return false; + } + // Write updated data to the output traineddata file. 
+ return SaveFile(output_filename, nullptr); +} + +bool TessdataManager::OverwriteComponents( + const char *new_traineddata_filename, + char **component_filenames, + int num_new_components) { + // Open the files with the new components. + for (int i = 0; i < num_new_components; ++i) { + TessdataType type; + if (TessdataTypeFromFileName(component_filenames[i], &type)) { + if (!LoadDataFromFile(component_filenames[i], &entries_[type])) { + tprintf("Failed to read component file:%s\n", component_filenames[i]); + return false; + } + } + } + + // Write updated data to the output traineddata file. + return SaveFile(new_traineddata_filename, nullptr); +} + +bool TessdataManager::ExtractToFile(const char *filename) { + TessdataType type = TESSDATA_NUM_ENTRIES; + ASSERT_HOST( + tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type)); + if (entries_[type].empty()) return false; + return SaveDataToFile(entries_[type], filename); +} + +bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, + TessdataType *type) { + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { + *type = static_cast(i); + return true; + } + } + tprintf("TessdataManager can't determine which tessdata" + " component is represented by %s\n", suffix); + return false; +} + +bool TessdataManager::TessdataTypeFromFileName(const char *filename, + TessdataType *type) { + // Get the file suffix (extension) + const char *suffix = strrchr(filename, '.'); + if (suffix == nullptr || *(++suffix) == '\0') return false; + return TessdataTypeFromFileSuffix(suffix, type); +} + +} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/tessedit.cpp k2pdfopt-2.51+ds/tesseract_mod/tessedit.cpp --- k2pdfopt-2.42+ds/tesseract_mod/tessedit.cpp 2017-02-25 04:44:46.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/tessedit.cpp 2018-12-24 15:58:05.000000000 +0000 @@ -1,533 +1,491 @@ -#include "config_auto.h" 
-/********************************************************************** - * File: tessedit.cpp (Formerly tessedit.c) - * Description: (Previously) Main program for merge of tess and editor. - * Now just code to load the language model and various - * engine-specific data files. - * Author: Ray Smith - * Created: Tue Jan 07 15:21:46 GMT 1992 - * - * (C) Copyright 1992, Hewlett-Packard Ltd. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// Include automatically generated configuration file if running autoconf. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif - -#include "stderr.h" -#include "basedir.h" -#include "tessvars.h" -#include "control.h" -#include "reject.h" -#include "pageres.h" -#include "nwmain.h" -#include "pgedit.h" -#include "tprintf.h" -#include "tessedit.h" -#include "stopper.h" -#include "intmatcher.h" -#include "chop.h" -#include "efio.h" -#include "danerror.h" -#include "globals.h" -#include "tesseractclass.h" -#include "params.h" - -#define VARDIR "configs/" /*variables files */ - // config under api -#define API_CONFIG "configs/api_config" - -ETEXT_DESC *global_monitor = NULL; // progress monitor - -namespace tesseract { - -// Read a "config" file containing a set of variable, value pairs. -// Searches the standard places: tessdata/configs, tessdata/tessconfigs -// and also accepts a relative or absolute path name. 
-void Tesseract::read_config_file(const char *filename, - SetParamConstraint constraint) { - STRING path = datadir; - path += "configs/"; - path += filename; - FILE* fp; - if ((fp = fopen(path.string(), "rb")) != NULL) { - fclose(fp); - } else { - path = datadir; - path += "tessconfigs/"; - path += filename; - if ((fp = fopen(path.string(), "rb")) != NULL) { - fclose(fp); - } else { - path = filename; - } - } - ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); -} - -// Returns false if a unicharset file for the specified language was not found -// or was invalid. -// This function initializes TessdataManager. After TessdataManager is -// no longer needed, TessdataManager::End() should be called. -// -// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless -// it is OEM_DEFAULT, in which case the value of the variable will be obtained -// from the language-specific config file (stored in [lang].traineddata), from -// the config files specified on the command line or left as the default -// OEM_TESSERACT_ONLY if none of the configs specify this variable. -bool Tesseract::init_tesseract_lang_data( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { - // Set the basename, compute the data directory. - main_setup(arg0, textbase); - - // Set the language data path prefix - lang = language != NULL ? language : "eng"; - language_data_path_prefix = datadir; - language_data_path_prefix += lang; - language_data_path_prefix += "."; - - /* willus.com mod */ - int gotcube; - gotcube=0; - if (oem==OEM_DEFAULT) - { - int i; - static char *exts[] = {"size","params","nn","word-freq",""}; - for (i=0;exts[i][0]!='\0';i++) - { - STRING cubetest; - FILE *f; - cubetest = language_data_path_prefix + "cube." 
+ exts[i]; - f=fopen(cubetest.string(),"r"); - gotcube= (f!=NULL); - if (f!=NULL) - fclose(f); - if (gotcube) - break; - } - } - /* end willus.com mod */ - - - // Initialize TessdataManager. - STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; - if (!tessdata_manager.Init(tessdata_path.string(), - tessdata_manager_debug_level)) { - return false; - } - - // If a language specific config file (lang.config) exists, load it in. - if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { - ParamUtils::ReadParamsFromFp( - tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), - SET_PARAM_CONSTRAINT_NONE, this->params()); - if (tessdata_manager_debug_level) { - tprintf("Loaded language config file\n"); - } - } - - SetParamConstraint set_params_constraint = set_only_non_debug_params ? - SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; - // Load tesseract variables from config files. This is done after loading - // language-specific variables from [lang].traineddata file, so that custom - // config files can override values in [lang].traineddata file. - for (int i = 0; i < configs_size; ++i) { - read_config_file(configs[i], set_params_constraint); - } - - // Set params specified in vars_vec (done after setting params from config - // files, so that params in vars_vec can override those from files). 
- if (vars_vec != NULL && vars_values != NULL) { - for (int i = 0; i < vars_vec->size(); ++i) { - if (!ParamUtils::SetParam((*vars_vec)[i].string(), - (*vars_values)[i].string(), - set_params_constraint, this->params())) { - tprintf("Error setting param %s\n", (*vars_vec)[i].string()); - exit(1); - } - } - } - - if (((STRING &)tessedit_write_params_to_file).length() > 0) { - FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); - if (params_file != NULL) { - ParamUtils::PrintParams(params_file, this->params()); - fclose(params_file); - if (tessdata_manager_debug_level > 0) { - tprintf("Wrote parameters to %s\n", - tessedit_write_params_to_file.string()); - } - } else { - tprintf("Failed to open %s for writing params.\n", - tessedit_write_params_to_file.string()); - } - } - - // Determine which ocr engine(s) should be loaded and used for recognition. - if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); - /* willus.com mod */ - else if (gotcube) tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_CUBE_COMBINED); - /* end willus.com mod */ - if (tessdata_manager_debug_level) { - tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", - static_cast(tessedit_ocr_engine_mode)); - } - - // If we are only loading the config file (and so not planning on doing any - // recognition) then there's nothing else do here. 
- if (tessedit_init_config_only) { - if (tessdata_manager_debug_level) { - tprintf("Returning after loading config file\n"); - } - return true; - } - - // Load the unicharset - if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || - !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { - return false; - } - if (unicharset.size() > MAX_NUM_CLASSES) { - tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); - return false; - } - if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); - right_to_left_ = unicharset.major_right_to_left(); - - // Setup initial unichar ambigs table and read universal ambigs. - UNICHARSET encoder_unicharset; - encoder_unicharset.CopyFrom(unicharset); - unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); - unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); - - if (!tessedit_ambigs_training && - tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { - TFile ambigs_file; - ambigs_file.Open(tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1); - unichar_ambigs.LoadUnicharAmbigs( - encoder_unicharset, - &ambigs_file, - ambigs_debug_level, use_ambigs_for_adaption, &unicharset); - if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); - } - - // The various OcrEngineMode settings (see publictypes.h) determine which - // engine-specific data files need to be loaded. Currently everything needs - // the base tesseract data, which supplies other useful information, but - // alternative engines, such as cube and LSTM are optional. 
-#ifndef NO_CUBE_BUILD - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { - /* willus mod */ - // ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); - if (!init_cube_objects(false,&tessdata_manager)) - return false; - /* end willus mod */ - if (tessdata_manager_debug_level) - tprintf("Loaded Cube w/out combiner\n"); - } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { - /* willus mod */ - // ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); - if (!init_cube_objects(true,&tessdata_manager)) - return false; - /* end willus mod */ - if (tessdata_manager_debug_level) - tprintf("Loaded Cube with combiner\n"); - } -#endif - // Init ParamsModel. - // Load pass1 and pass2 weights (for now these two sets are the same, but in - // the future separate sets of weights can be generated). - for (int p = ParamsModel::PTRAIN_PASS1; - p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { - language_model_->getParamsModel().SetPass( - static_cast(p)); - if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) { - if (!language_model_->getParamsModel().LoadFromFp( - lang.string(), tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) { - return false; - } - } - } - if (tessdata_manager_debug_level) language_model_->getParamsModel().Print(); - - return true; -} - -// Helper returns true if the given string is in the vector of strings. -static bool IsStrInList(const STRING& str, - const GenericVector& str_list) { - for (int i = 0; i < str_list.size(); ++i) { - if (str_list[i] == str) - return true; - } - return false; -} - -// Parse a string of the form [~][+[~]]*. -// Langs with no prefix get appended to to_load, provided they -// are not in there already. -// Langs with ~ prefix get appended to not_to_load, provided they are not in -// there already. 
-void Tesseract::ParseLanguageString(const char* lang_str, - GenericVector* to_load, - GenericVector* not_to_load) { - STRING remains(lang_str); - while (remains.length() > 0) { - // Find the start of the lang code and which vector to add to. - const char* start = remains.string(); - while (*start == '+') - ++start; - GenericVector* target = to_load; - if (*start == '~') { - target = not_to_load; - ++start; - } - // Find the index of the end of the lang code in string start. - int end = strlen(start); - const char* plus = strchr(start, '+'); - if (plus != NULL && plus - start < end) - end = plus - start; - STRING lang_code(start); - lang_code.truncate_at(end); - STRING next(start + end); - remains = next; - // Check whether lang_code is already in the target vector and add. - if (!IsStrInList(lang_code, *target)) { - if (tessdata_manager_debug_level) - tprintf("Adding language '%s' to list\n", lang_code.string()); - target->push_back(lang_code); - } - } -} - -// Initialize for potentially a set of languages defined by the language -// string and recursively any additional languages required by any language -// traineddata file (via tessedit_load_sublangs in its config) that is loaded. -// See init_tesseract_internal for args. -int Tesseract::init_tesseract( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { - GenericVector langs_to_load; - GenericVector langs_not_to_load; - ParseLanguageString(language, &langs_to_load, &langs_not_to_load); - - sub_langs_.delete_data_pointers(); - sub_langs_.clear(); - // Find the first loadable lang and load into this. - // Add any languages that this language requires - bool loaded_primary = false; - // Load the rest into sub_langs_. 
- for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { - if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { - const char *lang_str = langs_to_load[lang_index].string(); - Tesseract *tess_to_init; - if (!loaded_primary) { - tess_to_init = this; - } else { - tess_to_init = new Tesseract; - } - - int result = tess_to_init->init_tesseract_internal( - arg0, textbase, lang_str, oem, configs, configs_size, - vars_vec, vars_values, set_only_non_debug_params); - - if (!loaded_primary) { - if (result < 0) { - /* willus mod */ - return -1; - /* end willus mod */ - tprintf("Failed loading language '%s'\n", lang_str); - } else { - if (tessdata_manager_debug_level) - tprintf("Loaded language '%s' as main language\n", lang_str); - ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), - &langs_to_load, &langs_not_to_load); - loaded_primary = true; - } - } else { - if (result < 0) { - /* willus mod */ - delete tess_to_init; - return -1; - /* end willus mod */ - tprintf("Failed loading language '%s'\n", lang_str); - delete tess_to_init; - } else { - if (tessdata_manager_debug_level) - tprintf("Loaded language '%s' as secondary language\n", lang_str); - sub_langs_.push_back(tess_to_init); - // Add any languages that this language requires - ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), - &langs_to_load, &langs_not_to_load); - } - } - } - } - if (!loaded_primary) { - tprintf("Tesseract couldn't load any languages!\n"); - return -1; // Couldn't load any language! - } - if (!sub_langs_.empty()) { - // In multilingual mode word ratings have to be directly comparable, - // so use the same language model weights for all languages: - // use the primary language's params model if - // tessedit_use_primary_params_model is set, - // otherwise use default language model weights. 
- if (tessedit_use_primary_params_model) { - for (int s = 0; s < sub_langs_.size(); ++s) { - sub_langs_[s]->language_model_->getParamsModel().Copy( - this->language_model_->getParamsModel()); - } - tprintf("Using params model of the primary language\n"); - if (tessdata_manager_debug_level) { - this->language_model_->getParamsModel().Print(); - } - } else { - this->language_model_->getParamsModel().Clear(); - for (int s = 0; s < sub_langs_.size(); ++s) { - sub_langs_[s]->language_model_->getParamsModel().Clear(); - } - if (tessdata_manager_debug_level) - tprintf("Using default language params\n"); - } - } - - SetupUniversalFontIds(); - return 0; -} - -// Common initialization for a single language. -// arg0 is the datapath for the tessdata directory, which could be the -// path of the tessdata directory with no trailing /, or (if tessdata -// lives in the same directory as the executable, the path of the executable, -// hence the name arg0. -// textbase is an optional output file basename (used only for training) -// language is the language code to load. -// oem controls which engine(s) will operate on the image -// configs (argv) is an array of config filenames to load variables from. -// May be NULL. -// configs_size (argc) is the number of elements in configs. -// vars_vec is an optional vector of variables to set. -// vars_values is an optional corresponding vector of values for the variables -// in vars_vec. -// If set_only_init_params is true, then only the initialization variables -// will be set. 
-int Tesseract::init_tesseract_internal( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { - if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, - configs_size, vars_vec, vars_values, - set_only_non_debug_params)) { - return -1; - } - if (tessedit_init_config_only) { - tessdata_manager.End(); - return 0; - } - // If only Cube will be used, skip loading Tesseract classifier's - // pre-trained templates. - bool init_tesseract_classifier = - (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || - tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); - // If only Cube will be used and if it has its own Unicharset, - // skip initializing permuter and loading Tesseract Dawgs. - bool init_dict = - !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && - tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); - program_editup(textbase, init_tesseract_classifier, init_dict); - tessdata_manager.End(); - return 0; //Normal exit -} - -// Helper builds the all_fonts table by adding new fonts from new_fonts. -static void CollectFonts(const UnicityTable& new_fonts, - UnicityTable* all_fonts) { - for (int i = 0; i < new_fonts.size(); ++i) { - // UnicityTable uniques as we go. - all_fonts->push_back(new_fonts.get(i)); - } -} - -// Helper assigns an id to lang_fonts using the index in all_fonts table. -static void AssignIds(const UnicityTable& all_fonts, - UnicityTable* lang_fonts) { - for (int i = 0; i < lang_fonts->size(); ++i) { - int index = all_fonts.get_id(lang_fonts->get(i)); - lang_fonts->get_mutable(i)->universal_id = index; - } -} - -// Set the universal_id member of each font to be unique among all -// instances of the same font loaded. 
-void Tesseract::SetupUniversalFontIds() { - // Note that we can get away with bitwise copying FontInfo in - // all_fonts, as it is a temporary structure and we avoid setting the - // delete callback. - UnicityTable all_fonts; - all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); - - // Create the universal ID table. - CollectFonts(get_fontinfo_table(), &all_fonts); - for (int i = 0; i < sub_langs_.size(); ++i) { - CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); - } - // Assign ids from the table to each font table. - AssignIds(all_fonts, &get_fontinfo_table()); - for (int i = 0; i < sub_langs_.size(); ++i) { - AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); - } - font_table_size_ = all_fonts.size(); -} - -// init the LM component -int Tesseract::init_tesseract_lm(const char *arg0, - const char *textbase, - const char *language) { - if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, - NULL, 0, NULL, NULL, false)) - return -1; - getDict().SetupForLoad(Dict::GlobalDawgCache()); - getDict().Load(tessdata_manager.GetDataFileName().string(), lang); - getDict().FinishLoad(); - tessdata_manager.End(); - return 0; -} - -void Tesseract::end_tesseract() { - end_recog(); -} - -/* Define command type identifiers */ - -enum CMD_EVENTS -{ - ACTION_1_CMD_EVENT, - RECOG_WERDS, - RECOG_PSEUDO, - ACTION_2_CMD_EVENT -}; -} // namespace tesseract +/********************************************************************** + * File: tessedit.cpp (Formerly tessedit.c) + * Description: (Previously) Main program for merge of tess and editor. + * Now just code to load the language model and various + * engine-specific data files. + * Author: Ray Smith + * Created: Tue Jan 07 15:21:46 GMT 1992 + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "basedir.h" +#include "tessvars.h" +#include "control.h" +#include "reject.h" +#include "pageres.h" +#include "pgedit.h" +#include "tprintf.h" +#include "tessedit.h" +#include "stopper.h" +#ifndef DISABLED_LEGACY_ENGINE +#include "intmatcher.h" +#include "chop.h" +#endif +#include "globals.h" +#ifndef ANDROID_BUILD +#include "lstmrecognizer.h" +#endif +#include "tesseractclass.h" +#include "params.h" +#ifdef DISABLED_LEGACY_ENGINE +#include "matchdefs.h" +#endif + + // config under api +#define API_CONFIG "configs/api_config" + +ETEXT_DESC *global_monitor = nullptr; // progress monitor + +namespace tesseract { + +// Read a "config" file containing a set of variable, value pairs. +// Searches the standard places: tessdata/configs, tessdata/tessconfigs +// and also accepts a relative or absolute path name. 
+void Tesseract::read_config_file(const char *filename, + SetParamConstraint constraint) { + STRING path = datadir; + path += "configs/"; + path += filename; + FILE* fp; + if ((fp = fopen(path.string(), "rb")) != nullptr) { + fclose(fp); + } else { + path = datadir; + path += "tessconfigs/"; + path += filename; + if ((fp = fopen(path.string(), "rb")) != nullptr) { + fclose(fp); + } else { + path = filename; + } + } + ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); +} + +// Returns false if a unicharset file for the specified language was not found +// or was invalid. +// This function initializes TessdataManager. After TessdataManager is +// no longer needed, TessdataManager::End() should be called. +// +// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless +// it is OEM_DEFAULT, in which case the value of the variable will be obtained +// from the language-specific config file (stored in [lang].traineddata), from +// the config files specified on the command line or left as the default +// OEM_TESSERACT_ONLY if none of the configs specify this variable. +bool Tesseract::init_tesseract_lang_data( + const char *arg0, const char *textbase, const char *language, + OcrEngineMode oem, char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, bool set_only_non_debug_params, + TessdataManager *mgr) { + // Set the basename, compute the data directory. + main_setup(arg0, textbase); + + // Set the language data path prefix + lang = language != nullptr ? language : "eng"; + language_data_path_prefix = datadir; + language_data_path_prefix += lang; + language_data_path_prefix += "."; + + // Initialize TessdataManager. 
+ STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; + if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) { + tprintf("Error opening data file %s\n", tessdata_path.string()); + tprintf("Please make sure the TESSDATA_PREFIX environment variable is set" + " to your \"tessdata\" directory.\n"); + return false; + } + /* willus mod */ + TFile fp; + strncpy(fp.tfile_filename,tessdata_path.string(),511); + fp.tfile_filename[511]='\0'; +#ifndef DISABLED_LEGACY_ENGINE + if (oem == OEM_DEFAULT) { + // Set the engine mode from availability, which can then be overridden by + // the config file when we read it below. + if (!mgr->IsLSTMAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } else if (!mgr->IsBaseAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY); + } else { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED); + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + // If a language specific config file (lang.config) exists, load it in. + if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { + ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, + this->params()); + } + + SetParamConstraint set_params_constraint = set_only_non_debug_params ? + SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; + // Load tesseract variables from config files. This is done after loading + // language-specific variables from [lang].traineddata file, so that custom + // config files can override values in [lang].traineddata file. + for (int i = 0; i < configs_size; ++i) { + read_config_file(configs[i], set_params_constraint); + } + + // Set params specified in vars_vec (done after setting params from config + // files, so that params in vars_vec can override those from files). 
+ if (vars_vec != nullptr && vars_values != nullptr) { + for (int i = 0; i < vars_vec->size(); ++i) { + if (!ParamUtils::SetParam((*vars_vec)[i].string(), + (*vars_values)[i].string(), + set_params_constraint, this->params())) { + tprintf("Error setting param %s\n", (*vars_vec)[i].string()); + exit(1); + } + } + } + + if (((STRING &)tessedit_write_params_to_file).length() > 0) { + FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); + if (params_file != nullptr) { + ParamUtils::PrintParams(params_file, this->params()); + fclose(params_file); + } else { + tprintf("Failed to open %s for writing params.\n", + tessedit_write_params_to_file.string()); + } + } + + // Determine which ocr engine(s) should be loaded and used for recognition. + if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); + + // If we are only loading the config file (and so not planning on doing any + // recognition) then there's nothing else do here. + if (tessedit_init_config_only) { + return true; + } + +// The various OcrEngineMode settings (see publictypes.h) determine which +// engine-specific data files need to be loaded. +// If LSTM_ONLY is requested, the base Tesseract files are *Not* required. +#ifndef ANDROID_BUILD +#ifdef DISABLED_LEGACY_ENGINE + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { +#else + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { +#endif // ndef DISABLED_LEGACY_ENGINE + if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { + lstm_recognizer_ = new LSTMRecognizer; + ASSERT_HOST( + lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr)); + } else { + tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n"); + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } + } +#endif // ndef ANDROID_BUILD + + // Load the unicharset + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + // Avoid requiring a unicharset when we aren't running base tesseract. 
+#ifndef ANDROID_BUILD + unicharset.CopyFrom(lstm_recognizer_->GetUnicharset()); +#endif // ndef ANDROID_BUILD + } +#ifndef DISABLED_LEGACY_ENGINE + else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || + !unicharset.load_from_file(&fp, false)) { + return false; + } +#endif // ndef DISABLED_LEGACY_ENGINE + if (unicharset.size() > MAX_NUM_CLASSES) { + tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); + return false; + } + right_to_left_ = unicharset.major_right_to_left(); + + // Setup initial unichar ambigs table and read universal ambigs. + UNICHARSET encoder_unicharset; + encoder_unicharset.CopyFrom(unicharset); + unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); + unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); + + if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) { + unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, + ambigs_debug_level, + use_ambigs_for_adaption, &unicharset); + } +#ifndef DISABLED_LEGACY_ENGINE + // Init ParamsModel. + // Load pass1 and pass2 weights (for now these two sets are the same, but in + // the future separate sets of weights can be generated). + for (int p = ParamsModel::PTRAIN_PASS1; + p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { + language_model_->getParamsModel().SetPass( + static_cast(p)); + if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { + if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) { + return false; + } + } + } +#endif // ndef DISABLED_LEGACY_ENGINE + + return true; +} + +// Helper returns true if the given string is in the vector of strings. +static bool IsStrInList(const STRING& str, + const GenericVector& str_list) { + for (int i = 0; i < str_list.size(); ++i) { + if (str_list[i] == str) + return true; + } + return false; +} + +// Parse a string of the form [~][+[~]]*. +// Langs with no prefix get appended to to_load, provided they +// are not in there already. 
+// Langs with ~ prefix get appended to not_to_load, provided they are not in +// there already. +void Tesseract::ParseLanguageString(const char* lang_str, + GenericVector* to_load, + GenericVector* not_to_load) { + STRING remains(lang_str); + while (remains.length() > 0) { + // Find the start of the lang code and which vector to add to. + const char* start = remains.string(); + while (*start == '+') + ++start; + GenericVector* target = to_load; + if (*start == '~') { + target = not_to_load; + ++start; + } + // Find the index of the end of the lang code in string start. + int end = strlen(start); + const char* plus = strchr(start, '+'); + if (plus != nullptr && plus - start < end) + end = plus - start; + STRING lang_code(start); + lang_code.truncate_at(end); + STRING next(start + end); + remains = next; + // Check whether lang_code is already in the target vector and add. + if (!IsStrInList(lang_code, *target)) { + target->push_back(lang_code); + } + } +} + +// Initialize for potentially a set of languages defined by the language +// string and recursively any additional languages required by any language +// traineddata file (via tessedit_load_sublangs in its config) that is loaded. +// See init_tesseract_internal for args. +int Tesseract::init_tesseract(const char *arg0, const char *textbase, + const char *language, OcrEngineMode oem, + char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params, + TessdataManager *mgr) { + GenericVector langs_to_load; + GenericVector langs_not_to_load; + ParseLanguageString(language, &langs_to_load, &langs_not_to_load); + + sub_langs_.delete_data_pointers(); + sub_langs_.clear(); + // Find the first loadable lang and load into this. + // Add any languages that this language requires + bool loaded_primary = false; + // Load the rest into sub_langs_. 
+ for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { + if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { + const char *lang_str = langs_to_load[lang_index].string(); + Tesseract *tess_to_init; + if (!loaded_primary) { + tess_to_init = this; + } else { + tess_to_init = new Tesseract; + } + + int result = tess_to_init->init_tesseract_internal( + arg0, textbase, lang_str, oem, configs, configs_size, vars_vec, + vars_values, set_only_non_debug_params, mgr); + // Forget that language, but keep any reader we were given. + mgr->Clear(); + + if (!loaded_primary) { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + } else { + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + loaded_primary = true; + } + } else { + if (result < 0) { + tprintf("Failed loading language '%s'\n", lang_str); + delete tess_to_init; + } else { + sub_langs_.push_back(tess_to_init); + // Add any languages that this language requires + ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), + &langs_to_load, &langs_not_to_load); + } + } + } + } + if (!loaded_primary) { + tprintf("Tesseract couldn't load any languages!\n"); + return -1; // Couldn't load any language! + } +#ifndef DISABLED_LEGACY_ENGINE + if (!sub_langs_.empty()) { + // In multilingual mode word ratings have to be directly comparable, + // so use the same language model weights for all languages: + // use the primary language's params model if + // tessedit_use_primary_params_model is set, + // otherwise use default language model weights. 
+ if (tessedit_use_primary_params_model) { + for (int s = 0; s < sub_langs_.size(); ++s) { + sub_langs_[s]->language_model_->getParamsModel().Copy( + this->language_model_->getParamsModel()); + } + tprintf("Using params model of the primary language\n"); + } else { + this->language_model_->getParamsModel().Clear(); + for (int s = 0; s < sub_langs_.size(); ++s) { + sub_langs_[s]->language_model_->getParamsModel().Clear(); + } + } + } + + SetupUniversalFontIds(); +#endif // ndef DISABLED_LEGACY_ENGINE + return 0; +} + +// Common initialization for a single language. +// arg0 is the datapath for the tessdata directory, which could be the +// path of the tessdata directory with no trailing /, or (if tessdata +// lives in the same directory as the executable, the path of the executable, +// hence the name arg0. +// textbase is an optional output file basename (used only for training) +// language is the language code to load. +// oem controls which engine(s) will operate on the image +// configs (argv) is an array of config filenames to load variables from. +// May be nullptr. +// configs_size (argc) is the number of elements in configs. +// vars_vec is an optional vector of variables to set. +// vars_values is an optional corresponding vector of values for the variables +// in vars_vec. +// If set_only_init_params is true, then only the initialization variables +// will be set. 
+int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase, + const char *language, OcrEngineMode oem, + char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params, + TessdataManager *mgr) { + if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, + configs_size, vars_vec, vars_values, + set_only_non_debug_params, mgr)) { + return -1; + } + if (tessedit_init_config_only) { + return 0; + } + // If only LSTM will be used, skip loading Tesseract classifier's + // pre-trained templates and dictionary. + bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY; + program_editup(textbase, init_tesseract ? mgr : nullptr, + init_tesseract ? mgr : nullptr); + return 0; //Normal exit +} + +#ifndef DISABLED_LEGACY_ENGINE + +// Helper builds the all_fonts table by adding new fonts from new_fonts. +static void CollectFonts(const UnicityTable& new_fonts, + UnicityTable* all_fonts) { + for (int i = 0; i < new_fonts.size(); ++i) { + // UnicityTable uniques as we go. + all_fonts->push_back(new_fonts.get(i)); + } +} + +// Helper assigns an id to lang_fonts using the index in all_fonts table. +static void AssignIds(const UnicityTable& all_fonts, + UnicityTable* lang_fonts) { + for (int i = 0; i < lang_fonts->size(); ++i) { + int index = all_fonts.get_id(lang_fonts->get(i)); + lang_fonts->get_mutable(i)->universal_id = index; + } +} + +// Set the universal_id member of each font to be unique among all +// instances of the same font loaded. +void Tesseract::SetupUniversalFontIds() { + // Note that we can get away with bitwise copying FontInfo in + // all_fonts, as it is a temporary structure and we avoid setting the + // delete callback. + UnicityTable all_fonts; + all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); + + // Create the universal ID table. 
+ CollectFonts(get_fontinfo_table(), &all_fonts); + for (int i = 0; i < sub_langs_.size(); ++i) { + CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); + } + // Assign ids from the table to each font table. + AssignIds(all_fonts, &get_fontinfo_table()); + for (int i = 0; i < sub_langs_.size(); ++i) { + AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); + } + font_table_size_ = all_fonts.size(); +} + +// init the LM component +int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, + const char *language, TessdataManager *mgr) { + if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, + nullptr, 0, nullptr, nullptr, false, mgr)) + return -1; + getDict().SetupForLoad(Dict::GlobalDawgCache()); + getDict().Load(lang, mgr); + getDict().FinishLoad(); + return 0; +} + +#endif // ndef DISABLED_LEGACY_ENGINE + +void Tesseract::end_tesseract() { + end_recog(); +} + +/* Define command type identifiers */ + +enum CMD_EVENTS +{ + ACTION_1_CMD_EVENT, + RECOG_WERDS, + RECOG_PSEUDO, + ACTION_2_CMD_EVENT +}; +} // namespace tesseract diff -Nru k2pdfopt-2.42+ds/tesseract_mod/tess_lang_mod_edge.h k2pdfopt-2.51+ds/tesseract_mod/tess_lang_mod_edge.h --- k2pdfopt-2.42+ds/tesseract_mod/tess_lang_mod_edge.h 2017-02-25 05:09:40.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/tess_lang_mod_edge.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,241 +0,0 @@ -/********************************************************************** - * File: tess_lang_mod_edge.h - * Description: Declaration of the Tesseract Language Model Edge Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The TessLangModEdge models an edge in the Tesseract language models -// It inherits from the LangModEdge class - -#ifndef TESS_LANG_MOD_EDGE_H -#define TESS_LANG_MOD_EDGE_H - -#include "dawg.h" -#include "char_set.h" - -#include "lang_mod_edge.h" -#include "cube_reco_context.h" -#include "cube_utils.h" - -// Macros needed to identify punctuation in the langmodel state -#ifdef _HMSW32_H -#define LEAD_PUNC_EDGE_REF_MASK (inT64) 0x0000000100000000i64 -#define TRAIL_PUNC_EDGE_REF_MASK (inT64) 0x0000000200000000i64 -#define TRAIL_PUNC_REPEAT_MASK (inT64) 0xffff000000000000i64 -#else -#define LEAD_PUNC_EDGE_REF_MASK (inT64) 0x0000000100000000ll -#define TRAIL_PUNC_EDGE_REF_MASK (inT64) 0x0000000200000000ll -#define TRAIL_PUNC_REPEAT_MASK (inT64) 0xffff000000000000ll -#endif - -// Number state machine macros -#define NUMBER_STATE_SHIFT 0 -#define NUMBER_STATE_MASK 0x0000000fl -#define NUMBER_LITERAL_SHIFT 4 -#define NUMBER_LITERAL_MASK 0x000000f0l -#define NUMBER_REPEAT_SHIFT 8 -#define NUMBER_REPEAT_MASK 0x00000f00l -#define NUM_TRM -99 -#define TRAIL_PUNC_REPEAT_SHIFT 48 - -#define IsLeadingPuncEdge(edge_mask) \ - ((edge_mask & LEAD_PUNC_EDGE_REF_MASK) != 0) -#define IsTrailingPuncEdge(edge_mask) \ - ((edge_mask & TRAIL_PUNC_EDGE_REF_MASK) != 0) -#define TrailingPuncCount(edge_mask) \ - ((edge_mask & TRAIL_PUNC_REPEAT_MASK) >> TRAIL_PUNC_REPEAT_SHIFT) -#define TrailingPuncEdgeMask(Cnt) \ - (TRAIL_PUNC_EDGE_REF_MASK | ((Cnt) << TRAIL_PUNC_REPEAT_SHIFT)) - -// State machine 
IDs -#define DAWG_OOD 0 -#define DAWG_NUMBER 1 - -namespace tesseract { -class TessLangModEdge : public LangModEdge { - public: - // Different ways of constructing a TessLangModEdge - TessLangModEdge(CubeRecoContext *cntxt, const Dawg *edge_array, - EDGE_REF edge, int class_id); - TessLangModEdge(CubeRecoContext *cntxt, const Dawg *edge_array, - EDGE_REF start_edge_idx, EDGE_REF end_edge_idx, - int class_id); - TessLangModEdge(CubeRecoContext *cntxt, int class_id); - ~TessLangModEdge() {} - - // Accessors - inline bool IsRoot() const { - return root_; - } - inline void SetRoot(bool flag) { root_ = flag; } - - inline bool IsOOD() const { - return (dawg_ == (Dawg *)DAWG_OOD); - } - - inline bool IsNumber() const { - return (dawg_ == (Dawg *)DAWG_NUMBER); - } - - inline bool IsEOW() const { - return (IsTerminal() || (dawg_->end_of_word(end_edge_) != 0)); - } - - inline const Dawg *GetDawg() const { return dawg_; } - inline EDGE_REF StartEdge() const { return start_edge_; } - inline EDGE_REF EndEdge() const { return end_edge_; } - inline EDGE_REF EdgeMask() const { return edge_mask_; } - inline const char_32 * EdgeString() const { return str_; } - inline int ClassID () const { return class_id_; } - inline int PathCost() const { return path_cost_; } - inline void SetEdgeMask(EDGE_REF edge_mask) { edge_mask_ = edge_mask; } - inline void SetDawg(Dawg *dawg) { dawg_ = dawg; } - inline void SetStartEdge(EDGE_REF edge_idx) { start_edge_ = edge_idx; } - inline void SetEndEdge(EDGE_REF edge_idx) { end_edge_ = edge_idx; } - - // is this a terminal node: - // we can terminate at any OOD char, trailing punc or - // when the dawg terminates - inline bool IsTerminal() const { - return (IsOOD() || IsNumber() || IsTrailingPuncEdge(start_edge_) || - dawg_->next_node(end_edge_) == 0); - } - - // How many signals does the LM provide for tuning. These are flags like: - // OOD or not, Number of not that are used by the training to compute - // extra costs for each word. 
- inline int SignalCnt() const { - return 2; - } - - // returns the weight assigned to a specified signal - inline double SignalWgt(int signal) const { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (params != NULL) { - switch (signal) { - case 0: - return params->OODWgt(); - break; - - case 1: - return params->NumWgt(); - break; - } - } - - return 0.0; - } - - // sets the weight assigned to a specified signal: Used in training - void SetSignalWgt(int signal, double wgt) { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (params != NULL) { - switch (signal) { - case 0: - params->SetOODWgt(wgt); - break; - - case 1: - params->SetNumWgt(wgt); - break; - } - } - } - - // returns the actual value of a specified signal - int Signal(int signal) { - switch (signal) { - case 0: - return IsOOD() ? MIN_PROB_COST : 0; - break; - - case 1: - return IsNumber() ? MIN_PROB_COST : 0; - break; - - default: - return 0; - } - } - - // returns the Hash value of the edge. 
Used by the SearchNode hash table - // to quickly lookup exisiting edges to converge during search - inline unsigned int Hash() const { - unsigned long int dg; - dg = (unsigned long int)(long long)dawg_; - return static_cast(((start_edge_ | end_edge_) ^ dg) ^ - ((unsigned int)edge_mask_) ^ - class_id_); - } -/* - inline unsigned int Hash() const { - return static_cast( - ((start_edge_ | end_edge_) ^ ((reinterpret_cast(dawg_)))) ^ - ((unsigned int)edge_mask_) ^ class_id_); -*/ - - // A verbal description of the edge: Used by visualizers - char *Description() const; - - // Is this edge identical to the specified edge - inline bool IsIdentical(LangModEdge *lang_mod_edge) const { - return (class_id_ == - reinterpret_cast(lang_mod_edge)->class_id_ && - str_ == reinterpret_cast(lang_mod_edge)->str_ && - dawg_ == reinterpret_cast(lang_mod_edge)->dawg_ && - start_edge_ == - reinterpret_cast(lang_mod_edge)->start_edge_ && - end_edge_ == - reinterpret_cast(lang_mod_edge)->end_edge_ && - edge_mask_ == - reinterpret_cast(lang_mod_edge)->edge_mask_); - } - - // Creates a set of fan-out edges for the specified edge - static int CreateChildren(CubeRecoContext *cntxt, - const Dawg *edges, - NODE_REF edge_reg, - LangModEdge **lm_edges); - - private: - bool root_; - CubeRecoContext *cntxt_; - const Dawg *dawg_; - EDGE_REF start_edge_; - EDGE_REF end_edge_; - EDGE_REF edge_mask_; - int path_cost_; - int class_id_; - const char_32 * str_; - // returns the cost of the lang_mod_edge - inline int Cost() const { - if (cntxt_ != NULL) { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (dawg_ == (Dawg *)DAWG_OOD) { - return static_cast(params->OODWgt() * MIN_PROB_COST); - } else if (dawg_ == (Dawg *)DAWG_NUMBER) { - return static_cast(params->NumWgt() * MIN_PROB_COST); - } - } - return 0; - } -}; -} // namespace tesseract - -#endif // TESS_LANG_MOD_EDGE_H diff -Nru k2pdfopt-2.42+ds/tesseract_mod/thresholder.cpp k2pdfopt-2.51+ds/tesseract_mod/thresholder.cpp --- 
k2pdfopt-2.42+ds/tesseract_mod/thresholder.cpp 2017-02-25 18:59:18.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/thresholder.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,330 +0,0 @@ -#include "config_auto.h" -/////////////////////////////////////////////////////////////////////// -// File: thresholder.cpp -// Description: Base API for thresolding images in tesseract. -// Author: Ray Smith -// Created: Mon May 12 11:28:15 PDT 2008 -// -// (C) Copyright 2008, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -/////////////////////////////////////////////////////////////////////// - -#include "allheaders.h" - -#include "thresholder.h" - -#include - -#include "otsuthr.h" - -#include "openclwrapper.h" - -namespace tesseract { - -ImageThresholder::ImageThresholder() - : pix_(NULL), - image_width_(0), image_height_(0), - pix_channels_(0), pix_wpl_(0), - scale_(1), yres_(300), estimated_res_(300) { - SetRectangle(0, 0, 0, 0); -} - -ImageThresholder::~ImageThresholder() { - Clear(); -} - -// Destroy the Pix if there is one, freeing memory. -void ImageThresholder::Clear() { - pixDestroy(&pix_); -} - -// Return true if no image has been set. -bool ImageThresholder::IsEmpty() const { - return pix_ == NULL; -} - -// SetImage makes a copy of all the image data, so it may be deleted -// immediately after this call. -// Greyscale of 8 and color of 24 or 32 bits per pixel may be given. 
-// Palette color images will not work properly and must be converted to -// 24 bit. -// Binary images of 1 bit per pixel may also be given but they must be -// byte packed with the MSB of the first byte being the first pixel, and a -// one pixel is WHITE. For binary images set bytes_per_pixel=0. -void ImageThresholder::SetImage(const unsigned char* imagedata, - int width, int height, - int bytes_per_pixel, int bytes_per_line) { - int bpp = bytes_per_pixel * 8; - if (bpp == 0) bpp = 1; - Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp); - l_uint32* data = pixGetData(pix); - int wpl = pixGetWpl(pix); - switch (bpp) { - case 1: - for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) { - for (int x = 0; x < width; ++x) { - if (imagedata[x / 8] & (0x80 >> (x % 8))) - CLEAR_DATA_BIT(data, x); - else - SET_DATA_BIT(data, x); - } - } - break; - - case 8: - // Greyscale just copies the bytes in the right order. - for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) { - for (int x = 0; x < width; ++x) - SET_DATA_BYTE(data, x, imagedata[x]); - } - break; - - case 24: - // Put the colors in the correct places in the line buffer. - for (int y = 0; y < height; ++y, imagedata += bytes_per_line) { - for (int x = 0; x < width; ++x, ++data) { - SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]); - SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]); - SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]); - } - } - break; - - case 32: - // Maintain byte order consistency across different endianness. 
- for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) { - for (int x = 0; x < width; ++x) { - data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) | - (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3]; - } - } - break; - - default: - tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp); - } - pixSetYRes(pix, 300); - SetImage(pix); - pixDestroy(&pix); -} - -// Store the coordinates of the rectangle to process for later use. -// Doesn't actually do any thresholding. -void ImageThresholder::SetRectangle(int left, int top, int width, int height) { - rect_left_ = left; - rect_top_ = top; - rect_width_ = width; - rect_height_ = height; -} - -// Get enough parameters to be able to rebuild bounding boxes in the -// original image (not just within the rectangle). -// Left and top are enough with top-down coordinates, but -// the height of the rectangle and the image are needed for bottom-up. -void ImageThresholder::GetImageSizes(int* left, int* top, - int* width, int* height, - int* imagewidth, int* imageheight) { - *left = rect_left_; - *top = rect_top_; - *width = rect_width_; - *height = rect_height_; - *imagewidth = image_width_; - *imageheight = image_height_; -} - -// Pix vs raw, which to use? Pix is the preferred input for efficiency, -// since raw buffers are copied. -// SetImage for Pix clones its input, so the source pix may be pixDestroyed -// immediately after, but may not go away until after the Thresholder has -// finished with it. -void ImageThresholder::SetImage(const Pix* pix) { - if (pix_ != NULL) - pixDestroy(&pix_); - Pix* src = const_cast(pix); - int depth; - pixGetDimensions(src, &image_width_, &image_height_, &depth); - // Convert the image as necessary so it is one of binary, plain RGB, or - // 8 bit with no colormap. Guarantee that we always end up with our own copy, - // not just a clone of the input. 
- if (pixGetColormap(src)) { - Pix* tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC); - depth = pixGetDepth(tmp); - if (depth > 1 && depth < 8) { - pix_ = pixConvertTo8(tmp, false); - pixDestroy(&tmp); - } else { - pix_ = tmp; - } - } else if (depth > 1 && depth < 8) { - pix_ = pixConvertTo8(src, false); - } else { - pix_ = pixCopy(NULL, src); - } - depth = pixGetDepth(pix_); - pix_channels_ = depth / 8; - pix_wpl_ = pixGetWpl(pix_); - scale_ = 1; - estimated_res_ = yres_ = pixGetYRes(pix_); - Init(); -} - -// Threshold the source image as efficiently as possible to the output Pix. -// Creates a Pix and sets pix to point to the resulting pointer. -// Caller must use pixDestroy to free the created Pix. -void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) { - if (pix_channels_ == 0) { - // We have a binary image, but it still has to be copied, as this API - // allows the caller to modify the output. - Pix* original = GetPixRect(); - /* willus mod: nullptr -> NULL */ - *pix = pixCopy(NULL, original); - pixDestroy(&original); - } else { - OtsuThresholdRectToPix(pix_, pix); - } -} - -// Gets a pix that contains an 8 bit threshold value at each pixel. The -// returned pix may be an integer reduction of the binary image such that -// the scale factor may be inferred from the ratio of the sizes, even down -// to the extreme of a 1x1 pixel thresholds image. -// Ideally the 8 bit threshold should be the exact threshold used to generate -// the binary image in ThresholdToPix, but this is not a hard constraint. -// Returns NULL if the input is binary. PixDestroy after use. 
-Pix* ImageThresholder::GetPixRectThresholds() { - if (IsBinary()) return NULL; - Pix* pix_grey = GetPixRectGrey(); - int width = pixGetWidth(pix_grey); - int height = pixGetHeight(pix_grey); - int* thresholds; - int* hi_values; - OtsuThreshold(pix_grey, 0, 0, width, height, &thresholds, &hi_values); - pixDestroy(&pix_grey); - Pix* pix_thresholds = pixCreate(width, height, 8); - int threshold = thresholds[0] > 0 ? thresholds[0] : 128; - pixSetAllArbitrary(pix_thresholds, threshold); - delete [] thresholds; - delete [] hi_values; - return pix_thresholds; -} - -// Common initialization shared between SetImage methods. -void ImageThresholder::Init() { - SetRectangle(0, 0, image_width_, image_height_); -} - -// Get a clone/copy of the source image rectangle. -// The returned Pix must be pixDestroyed. -// This function will be used in the future by the page layout analysis, and -// the layout analysis that uses it will only be available with Leptonica, -// so there is no raw equivalent. -Pix* ImageThresholder::GetPixRect() { - if (IsFullImage()) { - // Just clone the whole thing. - return pixClone(pix_); - } else { - // Crop to the given rectangle. - Box* box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_); - Pix* cropped = pixClipRectangle(pix_, box, NULL); - boxDestroy(&box); - return cropped; - } -} - -// Get a clone/copy of the source image rectangle, reduced to greyscale, -// and at the same resolution as the output binary. -// The returned Pix must be pixDestroyed. -// Provided to the classifier to extract features from the greyscale image. -Pix* ImageThresholder::GetPixRectGrey() { - Pix* pix = GetPixRect(); // May have to be reduced to grey. - int depth = pixGetDepth(pix); - if (depth != 8) { - Pix* result = depth < 8 ? pixConvertTo8(pix, false) - : pixConvertRGBToLuminance(pix); - pixDestroy(&pix); - return result; - } - return pix; -} - -// Otsu thresholds the rectangle, taking the rectangle from *this. 
-void ImageThresholder::OtsuThresholdRectToPix(Pix* src_pix, - Pix** out_pix) const { - PERF_COUNT_START("OtsuThresholdRectToPix") - int* thresholds; - int* hi_values; - - int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, - rect_height_, &thresholds, &hi_values); - // only use opencl if compiled w/ OpenCL and selected device is opencl -#ifdef USE_OPENCL - OpenclDevice od; - if ((num_channels == 4 || num_channels == 1) && - od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0 ) { - od.ThresholdRectToPixOCL((unsigned char*)pixGetData(src_pix), num_channels, - pixGetWpl(src_pix) * 4, thresholds, hi_values, - out_pix /*pix_OCL*/, rect_height_, rect_width_, - rect_top_, rect_left_); - } else { -#endif - ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix); -#ifdef USE_OPENCL - } -#endif - delete [] thresholds; - delete [] hi_values; - - PERF_COUNT_END -} - -/// Threshold the rectangle, taking everything except the src_pix -/// from the class, using thresholds/hi_values to the output pix. -/// NOTE that num_channels is the size of the thresholds and hi_values -// arrays and also the bytes per pixel in src_pix. 
-void ImageThresholder::ThresholdRectToPix(Pix* src_pix, - int num_channels, - const int* thresholds, - const int* hi_values, - Pix** pix) const { - PERF_COUNT_START("ThresholdRectToPix") - *pix = pixCreate(rect_width_, rect_height_, 1); - uinT32* pixdata = pixGetData(*pix); - int wpl = pixGetWpl(*pix); - int src_wpl = pixGetWpl(src_pix); - uinT32* srcdata = pixGetData(src_pix); - for (int y = 0; y < rect_height_; ++y) { - const uinT32* linedata = srcdata + (y + rect_top_) * src_wpl; - uinT32* pixline = pixdata + y * wpl; - for (int x = 0; x < rect_width_; ++x) { - bool white_result = true; - for (int ch = 0; ch < num_channels; ++ch) { - int pixel = GET_DATA_BYTE(const_cast( - reinterpret_cast(linedata)), - (x + rect_left_) * num_channels + ch); - if (hi_values[ch] >= 0 && - (pixel > thresholds[ch]) == (hi_values[ch] == 0)) { - white_result = false; - break; - } - } - if (white_result) - CLEAR_DATA_BIT(pixline, x); - else - SET_DATA_BIT(pixline, x); - } - } - - PERF_COUNT_END -} - -} // namespace tesseract. - diff -Nru k2pdfopt-2.42+ds/tesseract_mod/unicharset.h k2pdfopt-2.51+ds/tesseract_mod/unicharset.h --- k2pdfopt-2.42+ds/tesseract_mod/unicharset.h 1970-01-01 00:00:00.000000000 +0000 +++ k2pdfopt-2.51+ds/tesseract_mod/unicharset.h 2018-12-23 22:46:12.000000000 +0000 @@ -0,0 +1,1049 @@ +/////////////////////////////////////////////////////////////////////// +// File: unicharset.h +// Description: Unicode character/ligature set class. +// Author: Thomas Kielbus +// Created: Wed Jun 28 17:05:01 PDT 2006 +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCUTIL_UNICHARSET_H_ +#define TESSERACT_CCUTIL_UNICHARSET_H_ + +#include "errcode.h" +#include "genericvector.h" +#include "helpers.h" +#include "serialis.h" +#include "strngs.h" +#include "tesscallback.h" +#include "unichar.h" +#include "unicharmap.h" + +// Enum holding special values of unichar_id. Every unicharset has these. +// Warning! Keep in sync with kSpecialUnicharCodes. +enum SpecialUnicharCodes { + UNICHAR_SPACE, + UNICHAR_JOINED, + UNICHAR_BROKEN, + + SPECIAL_UNICHAR_CODES_COUNT +}; + +// Boolean flag for unichar_insert. It's a bit of a double negative to allow +// the default value to be false. +enum class OldUncleanUnichars { + kFalse, + kTrue, +}; + +class CHAR_FRAGMENT { + public: + // Minimum number of characters used for fragment representation. + static const int kMinLen = 6; + // Maximum number of characters used for fragment representation. + static const int kMaxLen = 3 + UNICHAR_LEN + 2; + // Maximum number of fragments per character. + static const int kMaxChunks = 5; + + // Setters and Getters. 
+ inline void set_all(const char *unichar, int pos, int total, bool natural) { + set_unichar(unichar); + set_pos(pos); + set_total(total); + set_natural(natural); + } + inline void set_unichar(const char *uch) { + strncpy(this->unichar, uch, UNICHAR_LEN); + this->unichar[UNICHAR_LEN] = '\0'; + } + inline void set_pos(int p) { this->pos = p; } + inline void set_total(int t) { this->total = t; } + inline const char* get_unichar() const { return this->unichar; } + inline int get_pos() const { return this->pos; } + inline int get_total() const { return this->total; } + + // Returns the string that represents a fragment + // with the given unichar, pos and total. + static STRING to_string(const char *unichar, int pos, int total, + bool natural); + // Returns the string that represents this fragment. + STRING to_string() const { + return to_string(unichar, pos, total, natural); + } + + // Checks whether a fragment has the same unichar, + // position and total as the given inputs. + inline bool equals(const char *other_unichar, + int other_pos, int other_total) const { + return (strcmp(this->unichar, other_unichar) == 0 && + this->pos == other_pos && this->total == other_total); + } + inline bool equals(const CHAR_FRAGMENT *other) const { + return this->equals(other->get_unichar(), + other->get_pos(), + other->get_total()); + } + + // Checks whether a given fragment is a continuation of this fragment. + // Assumes that the given fragment pointer is not nullptr. + inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { + return (strcmp(this->unichar, fragment->get_unichar()) == 0 && + this->total == fragment->get_total() && + this->pos == fragment->get_pos() + 1); + } + + // Returns true if this fragment is a beginning fragment. + inline bool is_beginning() const { return this->pos == 0; } + + // Returns true if this fragment is an ending fragment. 
+ inline bool is_ending() const { return this->pos == this->total-1; } + + // Returns true if the fragment was a separate component to begin with, + // ie did not need chopping to be isolated, but may have been separated + // out from a multi-outline blob. + inline bool is_natural() const { return natural; } + void set_natural(bool value) { natural = value; } + + // Parses the string to see whether it represents a character fragment + // (rather than a regular character). If so, allocates memory for a new + // CHAR_FRAGMENT instance and fills it in with the corresponding fragment + // information. Fragments are of the form: + // |m|1|2, meaning chunk 1 of 2 of character m, or + // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed + // to divide the parts, as they were already separate connected components. + // + // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT + // instance, otherwise (if the string does not represent a fragment or it + // looks like it does, but parsing it as a fragment fails) returns nullptr. + // + // Note: The caller is responsible for deallocating memory + // associated with the returned pointer. + static CHAR_FRAGMENT *parse_from_string(const char *str); + + private: + char unichar[UNICHAR_LEN + 1]; + // True if the fragment was a separate component to begin with, + // ie did not need chopping to be isolated, but may have been separated + // out from a multi-outline blob. + bool natural; + int16_t pos; // fragment position in the character + int16_t total; // total number of fragments in the character +}; + +// The UNICHARSET class is an utility class for Tesseract that holds the +// set of characters that are used by the engine. Each character is identified +// by a unique number, from 0 to (size - 1). 
+class UNICHARSET { + public: + // Custom list of characters and their ligature forms (UTF8) + // These map to unicode values in the private use area (PUC) and are supported + // by only few font families (eg. Wyld, Adobe Caslon Pro). + static TESS_API const char* kCustomLigatures[][2]; + + // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. + static TESS_API const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]; + + // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h) + enum Direction { + U_LEFT_TO_RIGHT = 0, + U_RIGHT_TO_LEFT = 1, + U_EUROPEAN_NUMBER = 2, + U_EUROPEAN_NUMBER_SEPARATOR = 3, + U_EUROPEAN_NUMBER_TERMINATOR = 4, + U_ARABIC_NUMBER = 5, + U_COMMON_NUMBER_SEPARATOR = 6, + U_BLOCK_SEPARATOR = 7, + U_SEGMENT_SEPARATOR = 8, + U_WHITE_SPACE_NEUTRAL = 9, + U_OTHER_NEUTRAL = 10, + U_LEFT_TO_RIGHT_EMBEDDING = 11, + U_LEFT_TO_RIGHT_OVERRIDE = 12, + U_RIGHT_TO_LEFT_ARABIC = 13, + U_RIGHT_TO_LEFT_EMBEDDING = 14, + U_RIGHT_TO_LEFT_OVERRIDE = 15, + U_POP_DIRECTIONAL_FORMAT = 16, + U_DIR_NON_SPACING_MARK = 17, + U_BOUNDARY_NEUTRAL = 18, + U_CHAR_DIRECTION_COUNT + }; + + // Create an empty UNICHARSET + UNICHARSET(); + + ~UNICHARSET(); + + // Return the UNICHAR_ID of a given unichar representation within the + // UNICHARSET. + UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; + + // Return the UNICHAR_ID of a given unichar representation within the + // UNICHARSET. Only the first length characters from unichar_repr are used. + UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const; + + // Return the minimum number of bytes that matches a legal UNICHAR_ID, + // while leaving the rest of the string encodable. Returns 0 if the + // beginning of the string is not encodable. + // WARNING: this function now encodes the whole string for precision. + // Use encode_string in preference to repeatedly calling step. 
+ int step(const char* str) const; + + // Returns true if the given UTF-8 string is encodable with this UNICHARSET. + // If not encodable, write the first byte offset which cannot be converted + // into the second (return) argument. + bool encodable_string(const char *str, int *first_bad_position) const; + + // Encodes the given UTF-8 string with this UNICHARSET. + // Any part of the string that cannot be encoded (because the utf8 can't + // be broken up into pieces that are in the unicharset) then: + // if give_up_on_failure, stops and returns a partial encoding, + // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding. + // Returns true if the encoding succeeds completely, false if there is at + // least one failure. + // If lengths is not nullptr, then it is filled with the corresponding + // byte length of each encoded UNICHAR_ID. + // If encoded_length is not nullptr then on return it contains the length of + // str that was encoded. (if give_up_on_failure the location of the first + // failure, otherwise strlen(str).) + // WARNING: Caller must guarantee that str has already been cleaned of codes + // that do not belong in the unicharset, or encoding may fail. + // Use CleanupString to perform the cleaning. + bool encode_string(const char* str, bool give_up_on_failure, + GenericVector* encoding, + GenericVector* lengths, + int* encoded_length) const; + + // Return the unichar representation corresponding to the given UNICHAR_ID + // within the UNICHARSET. + const char* id_to_unichar(UNICHAR_ID id) const; + + // Return the UTF8 representation corresponding to the given UNICHAR_ID after + // resolving any private encodings internal to Tesseract. This method is + // preferable to id_to_unichar for outputting text that will be visible to + // external applications. + const char* id_to_unichar_ext(UNICHAR_ID id) const; + + // Return a STRING that reformats the utf8 str into the str followed + // by its hex unicodes. 
+ static STRING debug_utf8_str(const char* str); + + // Removes/replaces content that belongs in rendered text, but not in the + // unicharset. + static std::string CleanupString(const char* utf8_str) { + return CleanupString(utf8_str, strlen(utf8_str)); + } + static std::string CleanupString(const char* utf8_str, size_t length); + + // Return a STRING containing debug information on the unichar, including + // the id_to_unichar, its hex unicodes and the properties. + STRING debug_str(UNICHAR_ID id) const; + STRING debug_str(const char * unichar_repr) const { + return debug_str(unichar_to_id(unichar_repr)); + } + + // Adds a unichar representation to the set. If old_style is true, then + // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL + // characters are ignored/skipped as if they don't exist and n-grams that + // can already be encoded are not added. + void unichar_insert(const char* const unichar_repr, + OldUncleanUnichars old_style); + void unichar_insert(const char* const unichar_repr) { + unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); + } + // Adds a unichar representation to the set. Avoids setting old_style to true, + // unless it is necessary to make the new unichar get added. + void unichar_insert_backwards_compatible(const char* const unichar_repr) { + std::string cleaned = CleanupString(unichar_repr); + if (cleaned != unichar_repr) { + unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); + } else { + int old_size = size(); + unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); + if (size() == old_size) { + unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); + } + } + } + + // Return true if the given unichar id exists within the set. + // Relies on the fact that unichar ids are contiguous in the unicharset. 
+ bool contains_unichar_id(UNICHAR_ID unichar_id) const { + return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used && + unichar_id >= 0; + } + + // Return true if the given unichar representation exists within the set. + bool contains_unichar(const char* const unichar_repr) const; + bool contains_unichar(const char* const unichar_repr, int length) const; + + // Return true if the given unichar representation corresponds to the given + // UNICHAR_ID within the set. + bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; + + // Delete CHAR_FRAGMENTs stored in properties of unichars array. + void delete_pointers_in_unichars() { + for (int i = 0; i < size_used; ++i) { + delete unichars[i].properties.fragment; + unichars[i].properties.fragment = nullptr; + } + } + + // Clear the UNICHARSET (all the previous data is lost). + void clear() { + if (script_table != nullptr) { + for (int i = 0; i < script_table_size_used; ++i) + delete[] script_table[i]; + delete[] script_table; + script_table = nullptr; + script_table_size_used = 0; + } + if (unichars != nullptr) { + delete_pointers_in_unichars(); + delete[] unichars; + unichars = nullptr; + } + script_table_size_reserved = 0; + size_reserved = 0; + size_used = 0; + ids.clear(); + top_bottom_set_ = false; + script_has_upper_lower_ = false; + script_has_xheight_ = false; + old_style_included_ = false; + null_sid_ = 0; + common_sid_ = 0; + latin_sid_ = 0; + cyrillic_sid_ = 0; + greek_sid_ = 0; + han_sid_ = 0; + hiragana_sid_ = 0; + katakana_sid_ = 0; + thai_sid_ = 0; + hangul_sid_ = 0; + default_sid_ = 0; + } + + // Return the size of the set (the number of different UNICHAR it holds). + int size() const { + return size_used; + } + + // Reserve enough memory space for the given number of UNICHARS + void reserve(int unichars_number); + + // Opens the file indicated by filename and saves unicharset to that file. + // Returns true if the operation is successful. 
+ bool save_to_file(const char * const filename) const { + FILE* file = fopen(filename, "w+b"); + if (file == nullptr) return false; + bool result = save_to_file(file); + fclose(file); + return result; + } + + // Saves the content of the UNICHARSET to the given file. + // Returns true if the operation is successful. + bool save_to_file(FILE *file) const { + STRING str; + return save_to_string(&str) && + tesseract::Serialize(file, &str[0], str.length()); + } + + bool save_to_file(tesseract::TFile *file) const { + STRING str; + return save_to_string(&str) && file->Serialize(&str[0], str.length()); + } + + // Saves the content of the UNICHARSET to the given STRING. + // Returns true if the operation is successful. + bool save_to_string(STRING *str) const; + + // Load a unicharset from a unicharset file that has been loaded into + // the given memory buffer. + // Returns true if the operation is successful. + bool load_from_inmemory_file(const char* const memory, int mem_size, + bool skip_fragments); + // Returns true if the operation is successful. + bool load_from_inmemory_file(const char* const memory, int mem_size) { + return load_from_inmemory_file(memory, mem_size, false); + } + + // Opens the file indicated by filename and loads the UNICHARSET + // from the given file. The previous data is lost. + // Returns true if the operation is successful. + bool load_from_file(const char* const filename, bool skip_fragments) { + FILE* file = fopen(filename, "rb"); + if (file == nullptr) return false; + bool result = load_from_file(file, skip_fragments); + fclose(file); + return result; + } + // returns true if the operation is successful. + bool load_from_file(const char* const filename) { + return load_from_file(filename, false); + } + + // Loads the UNICHARSET from the given file. The previous data is lost. + // Returns true if the operation is successful. 
+ bool load_from_file(FILE *file, bool skip_fragments); + bool load_from_file(FILE *file) { return load_from_file(file, false); } + bool load_from_file(tesseract::TFile *file, bool skip_fragments); + + + // Sets up internal data after loading the file, based on the char + // properties. Called from load_from_file, but also needs to be run + // during set_unicharset_properties. + void post_load_setup(); + + // Returns true if right_to_left scripts are significant in the unicharset, + // but without being so sensitive that "universal" unicharsets containing + // characters from many scripts, like orientation and script detection, + // look like they are right_to_left. + bool major_right_to_left() const; + + // Set a whitelist and/or blacklist of characters to recognize. + // An empty or nullptr whitelist enables everything (minus any blacklist). + // An empty or nullptr blacklist disables nothing. + // An empty or nullptr unblacklist has no effect. + // The blacklist overrides the whitelist. + // The unblacklist overrides the blacklist. + // Each list is a string of utf8 character strings. Boundaries between + // unicharset units are worked out automatically, and characters not in + // the unicharset are silently ignored. + void set_black_and_whitelist(const char* blacklist, const char* whitelist, + const char* unblacklist); + + // Set the isalpha property of the given unichar to the given value. + void set_isalpha(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.isalpha = value; + } + + // Set the islower property of the given unichar to the given value. + void set_islower(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.islower = value; + } + + // Set the isupper property of the given unichar to the given value. + void set_isupper(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.isupper = value; + } + + // Set the isdigit property of the given unichar to the given value. 
+ void set_isdigit(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.isdigit = value; + } + + // Set the ispunctuation property of the given unichar to the given value. + void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.ispunctuation = value; + } + + // Set the isngram property of the given unichar to the given value. + void set_isngram(UNICHAR_ID unichar_id, bool value) { + unichars[unichar_id].properties.isngram = value; + } + + // Set the script name of the given unichar to the given value. + // Value is copied and thus can be a temporary; + void set_script(UNICHAR_ID unichar_id, const char* value) { + unichars[unichar_id].properties.script_id = add_script(value); + } + + // Set other_case unichar id in the properties for the given unichar id. + void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { + unichars[unichar_id].properties.other_case = other_case; + } + + // Set the direction property of the given unichar to the given value. + void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { + unichars[unichar_id].properties.direction = value; + } + + // Set mirror unichar id in the properties for the given unichar id. + void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { + unichars[unichar_id].properties.mirror = mirror; + } + + // Record normalized version of unichar with the given unichar_id. + void set_normed(UNICHAR_ID unichar_id, const char* normed) { + unichars[unichar_id].properties.normed = normed; + unichars[unichar_id].properties.normed_ids.truncate(0); + } + // Sets the normed_ids vector from the normed string. normed_ids is not + // stored in the file, and needs to be set when the UNICHARSET is loaded. + void set_normed_ids(UNICHAR_ID unichar_id); + + // Return the isalpha property of the given unichar. 
+ bool get_isalpha(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.isalpha; + } + + // Return the islower property of the given unichar. + bool get_islower(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.islower; + } + + // Return the isupper property of the given unichar. + bool get_isupper(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.isupper; + } + + // Return the isdigit property of the given unichar. + bool get_isdigit(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.isdigit; + } + + // Return the ispunctuation property of the given unichar. + bool get_ispunctuation(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.ispunctuation; + } + + // Return the isngram property of the given unichar. + bool get_isngram(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return false; + /* willus mod to ASSERT_HOSTXX */ + ASSERT_HOSTXX(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.isngram; + } + + // Returns whether the unichar id represents a unicode value in the private + // use area. + bool get_isprivate(UNICHAR_ID unichar_id) const; + + // Returns true if the ids have useful min/max top/bottom values. 
+ bool top_bottom_useful() const { + return top_bottom_set_; + } + // Sets all ranges to empty, so they can be expanded to set the values. + void set_ranges_empty(); + // Sets all the properties for this unicharset given a src_unicharset with + // everything set. The unicharsets don't have to be the same, and graphemes + // are correctly accounted for. + void SetPropertiesFromOther(const UNICHARSET& src) { + PartialSetPropertiesFromOther(0, src); + } + // Sets properties from Other, starting only at the given index. + void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src); + // Expands the tops and bottoms and widths for this unicharset given a + // src_unicharset with ranges in it. The unicharsets don't have to be the + // same, and graphemes are correctly accounted for. + void ExpandRangesFromOther(const UNICHARSET& src); + // Makes this a copy of src. Clears this completely first, so the automattic + // ids will not be present in this if not in src. + void CopyFrom(const UNICHARSET& src); + // For each id in src, if it does not occur in this, add it, as in + // SetPropertiesFromOther, otherwise expand the ranges, as in + // ExpandRangesFromOther. + void AppendOtherUnicharset(const UNICHARSET& src); + // Returns true if the acceptable ranges of the tops of the characters do + // not overlap, making their x-height calculations distinct. + bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const; + // Returns the min and max bottom and top of the given unichar in + // baseline-normalized coordinates, ie, where the baseline is + // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight + // (See normalis.h for the definitions). 
+ void get_top_bottom(UNICHAR_ID unichar_id, + int* min_bottom, int* max_bottom, + int* min_top, int* max_top) const { + if (INVALID_UNICHAR_ID == unichar_id) { + *min_bottom = *min_top = 0; + *max_bottom = *max_top = 256; // kBlnCellHeight + return; + } + ASSERT_HOST(contains_unichar_id(unichar_id)); + *min_bottom = unichars[unichar_id].properties.min_bottom; + *max_bottom = unichars[unichar_id].properties.max_bottom; + *min_top = unichars[unichar_id].properties.min_top; + *max_top = unichars[unichar_id].properties.max_top; + } + void set_top_bottom(UNICHAR_ID unichar_id, + int min_bottom, int max_bottom, + int min_top, int max_top) { + unichars[unichar_id].properties.min_bottom = + ClipToRange(min_bottom, 0, UINT8_MAX); + unichars[unichar_id].properties.max_bottom = + ClipToRange(max_bottom, 0, UINT8_MAX); + unichars[unichar_id].properties.min_top = + ClipToRange(min_top, 0, UINT8_MAX); + unichars[unichar_id].properties.max_top = + ClipToRange(max_top, 0, UINT8_MAX); + } + // Returns the width stats (as mean, sd) of the given unichar relative to the + // median advance of all characters in the character set. + void get_width_stats(UNICHAR_ID unichar_id, + float* width, float* width_sd) const { + if (INVALID_UNICHAR_ID == unichar_id) { + *width = 0.0f; + *width_sd = 0.0f;; + return; + } + ASSERT_HOST(contains_unichar_id(unichar_id)); + *width = unichars[unichar_id].properties.width; + *width_sd = unichars[unichar_id].properties.width_sd; + } + void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { + unichars[unichar_id].properties.width = width; + unichars[unichar_id].properties.width_sd = width_sd; + } + // Returns the stats of the x-bearing (as mean, sd) of the given unichar + // relative to the median advance of all characters in the character set. 
+ void get_bearing_stats(UNICHAR_ID unichar_id, + float* bearing, float* bearing_sd) const { + if (INVALID_UNICHAR_ID == unichar_id) { + *bearing = *bearing_sd = 0.0f; + return; + } + ASSERT_HOST(contains_unichar_id(unichar_id)); + *bearing = unichars[unichar_id].properties.bearing; + *bearing_sd = unichars[unichar_id].properties.bearing_sd; + } + void set_bearing_stats(UNICHAR_ID unichar_id, + float bearing, float bearing_sd) { + unichars[unichar_id].properties.bearing = bearing; + unichars[unichar_id].properties.bearing_sd = bearing_sd; + } + // Returns the stats of the x-advance of the given unichar (as mean, sd) + // relative to the median advance of all characters in the character set. + void get_advance_stats(UNICHAR_ID unichar_id, + float* advance, float* advance_sd) const { + if (INVALID_UNICHAR_ID == unichar_id) { + *advance = *advance_sd = 0; + return; + } + ASSERT_HOST(contains_unichar_id(unichar_id)); + *advance = unichars[unichar_id].properties.advance; + *advance_sd = unichars[unichar_id].properties.advance_sd; + } + void set_advance_stats(UNICHAR_ID unichar_id, + float advance, float advance_sd) { + unichars[unichar_id].properties.advance = advance; + unichars[unichar_id].properties.advance_sd = advance_sd; + } + // Returns true if the font metrics properties are empty. + bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { + return unichars[unichar_id].properties.AnyRangeEmpty(); + } + + // Returns true if the script of the given id is space delimited. + // Returns false for Han and Thai scripts. + bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return true; + int script_id = get_script(unichar_id); + return script_id != han_sid_ && script_id != thai_sid_ && + script_id != hangul_sid_ && script_id != hiragana_sid_ && + script_id != katakana_sid_; + } + + // Return the script name of the given unichar. 
+ // The returned pointer will always be the same for the same script, it's + // managed by unicharset and thus MUST NOT be deleted + int get_script(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return null_sid_; + ASSERT_HOST(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.script_id; + } + + // Return the character properties, eg. alpha/upper/lower/digit/punct, + // as a bit field of unsigned int. + unsigned int get_properties(UNICHAR_ID unichar_id) const; + + // Return the character property as a single char. If a character has + // multiple attributes, the main property is defined by the following order: + // upper_case : 'A' + // lower_case : 'a' + // alpha : 'x' + // digit : '0' + // punctuation: 'p' + char get_chartype(UNICHAR_ID unichar_id) const; + + // Get other_case unichar id in the properties for the given unichar id. + UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; + ASSERT_HOST(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.other_case; + } + + // Returns the direction property of the given unichar. + Direction get_direction(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL; + ASSERT_HOST(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.direction; + } + + // Get mirror unichar id in the properties for the given unichar id. + UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; + ASSERT_HOST(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.mirror; + } + + // Returns UNICHAR_ID of the corresponding lower-case unichar. 
+ UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; + ASSERT_HOST(contains_unichar_id(unichar_id)); + if (unichars[unichar_id].properties.islower) return unichar_id; + return unichars[unichar_id].properties.other_case; + } + + // Returns UNICHAR_ID of the corresponding upper-case unichar. + UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; + ASSERT_HOST(contains_unichar_id(unichar_id)); + if (unichars[unichar_id].properties.isupper) return unichar_id; + return unichars[unichar_id].properties.other_case; + } + + // Returns true if this UNICHARSET has the special codes in + // SpecialUnicharCodes available. If false then there are normal unichars + // at these codes and they should not be used. + bool has_special_codes() const { + return get_fragment(UNICHAR_BROKEN) != nullptr && + strcmp(id_to_unichar(UNICHAR_BROKEN), + kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; + } + + // Returns true if there are any repeated unicodes in the normalized + // text of any unichar-id in the unicharset. + bool AnyRepeatedUnicodes() const; + + // Return a pointer to the CHAR_FRAGMENT class if the given + // unichar id represents a character fragment. + const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return nullptr; + ASSERT_HOST(contains_unichar_id(unichar_id)); + return unichars[unichar_id].properties.fragment; + } + + // Return the isalpha property of the given unichar representation. + bool get_isalpha(const char* const unichar_repr) const { + return get_isalpha(unichar_to_id(unichar_repr)); + } + + // Return the islower property of the given unichar representation. + bool get_islower(const char* const unichar_repr) const { + return get_islower(unichar_to_id(unichar_repr)); + } + + // Return the isupper property of the given unichar representation. 
+ bool get_isupper(const char* const unichar_repr) const { + return get_isupper(unichar_to_id(unichar_repr)); + } + + // Return the isdigit property of the given unichar representation. + bool get_isdigit(const char* const unichar_repr) const { + return get_isdigit(unichar_to_id(unichar_repr)); + } + + // Return the ispunctuation property of the given unichar representation. + bool get_ispunctuation(const char* const unichar_repr) const { + return get_ispunctuation(unichar_to_id(unichar_repr)); + } + + // Return the character properties, eg. alpha/upper/lower/digit/punct, + // of the given unichar representation + unsigned int get_properties(const char* const unichar_repr) const { + return get_properties(unichar_to_id(unichar_repr)); + } + + char get_chartype(const char* const unichar_repr) const { + return get_chartype(unichar_to_id(unichar_repr)); + } + + // Return the script name of the given unichar representation. + // The returned pointer will always be the same for the same script, it's + // managed by unicharset and thus MUST NOT be deleted + int get_script(const char* const unichar_repr) const { + return get_script(unichar_to_id(unichar_repr)); + } + + // Return a pointer to the CHAR_FRAGMENT class struct if the given + // unichar representation represents a character fragment. + const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { + if (unichar_repr == nullptr || unichar_repr[0] == '\0' || + !ids.contains(unichar_repr, false)) { + return nullptr; + } + return get_fragment(unichar_to_id(unichar_repr)); + } + + // Return the isalpha property of the given unichar representation. + // Only the first length characters from unichar_repr are used. + bool get_isalpha(const char* const unichar_repr, + int length) const { + return get_isalpha(unichar_to_id(unichar_repr, length)); + } + + // Return the islower property of the given unichar representation. + // Only the first length characters from unichar_repr are used. 
+ bool get_islower(const char* const unichar_repr, + int length) const { + return get_islower(unichar_to_id(unichar_repr, length)); + } + + // Return the isupper property of the given unichar representation. + // Only the first length characters from unichar_repr are used. + bool get_isupper(const char* const unichar_repr, + int length) const { + return get_isupper(unichar_to_id(unichar_repr, length)); + } + + // Return the isdigit property of the given unichar representation. + // Only the first length characters from unichar_repr are used. + bool get_isdigit(const char* const unichar_repr, + int length) const { + return get_isdigit(unichar_to_id(unichar_repr, length)); + } + + // Return the ispunctuation property of the given unichar representation. + // Only the first length characters from unichar_repr are used. + bool get_ispunctuation(const char* const unichar_repr, + int length) const { + return get_ispunctuation(unichar_to_id(unichar_repr, length)); + } + + // Returns normalized version of unichar with the given unichar_id. + const char *get_normed_unichar(UNICHAR_ID unichar_id) const { + if (unichar_id == UNICHAR_SPACE) return " "; + return unichars[unichar_id].properties.normed.string(); + } + // Returns a vector of UNICHAR_IDs that represent the ids of the normalized + // version of the given id. There may be more than one UNICHAR_ID in the + // vector if unichar_id represents a ligature. + const GenericVector& normed_ids(UNICHAR_ID unichar_id) const { + return unichars[unichar_id].properties.normed_ids; + } + + // Return the script name of the given unichar representation. + // Only the first length characters from unichar_repr are used. 
+ // The returned pointer will always be the same for the same script, it's + // managed by unicharset and thus MUST NOT be deleted + int get_script(const char* const unichar_repr, + int length) const { + return get_script(unichar_to_id(unichar_repr, length)); + } + + // Return the (current) number of scripts in the script table + int get_script_table_size() const { + return script_table_size_used; + } + + // Return the script string from its id + const char* get_script_from_script_id(int id) const { + if (id >= script_table_size_used || id < 0) + return null_script; + return script_table[id]; + } + + // Returns the id from the name of the script, or 0 if script is not found. + // Note that this is an expensive operation since it involves iteratively + // comparing strings in the script table. To avoid dependency on STL, we + // won't use a hash. Instead, the calling function can use this to lookup + // and save the ID for relevant scripts for fast comparisons later. + int get_script_id_from_name(const char* script_name) const; + + // Return true if the given script is the null script + bool is_null_script(const char* script) const { + return script == null_script; + } + + // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, + // then the returned pointer will be the same. + // The script parameter is copied and thus can be a temporary. + int add_script(const char* script); + + // Return the enabled property of the given unichar. 
+ bool get_enabled(UNICHAR_ID unichar_id) const { + return unichars[unichar_id].properties.enabled; + } + + + int null_sid() const { return null_sid_; } + int common_sid() const { return common_sid_; } + int latin_sid() const { return latin_sid_; } + int cyrillic_sid() const { return cyrillic_sid_; } + int greek_sid() const { return greek_sid_; } + int han_sid() const { return han_sid_; } + int hiragana_sid() const { return hiragana_sid_; } + int katakana_sid() const { return katakana_sid_; } + int thai_sid() const { return thai_sid_; } + int hangul_sid() const { return hangul_sid_; } + int default_sid() const { return default_sid_; } + + // Returns true if the unicharset has the concept of upper/lower case. + bool script_has_upper_lower() const { + return script_has_upper_lower_; + } + // Returns true if the unicharset has the concept of x-height. + // script_has_xheight can be true even if script_has_upper_lower is not, + // when the script has a sufficiently predominant top line with ascenders, + // such as Devanagari and Thai. + bool script_has_xheight() const { + return script_has_xheight_; + } + + private: + + struct UNICHAR_PROPERTIES { + UNICHAR_PROPERTIES(); + // Initializes all properties to sensible default values. + void Init(); + // Sets all ranges wide open. Initialization default in case there are + // no useful values available. + void SetRangesOpen(); + // Sets all ranges to empty. Used before expanding with font-based data. + void SetRangesEmpty(); + // Returns true if any of the top/bottom/width/bearing/advance ranges/stats + // is empty. + bool AnyRangeEmpty() const; + // Expands the ranges with the ranges from the src properties. + void ExpandRangesFrom(const UNICHAR_PROPERTIES& src); + // Copies the properties from src into this. 
+ void CopyFrom(const UNICHAR_PROPERTIES& src); + + bool isalpha; + bool islower; + bool isupper; + bool isdigit; + bool ispunctuation; + bool isngram; + bool enabled; + // Possible limits of the top and bottom of the bounding box in + // baseline-normalized coordinates, ie, where the baseline is + // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight + // (See normalis.h for the definitions). + uint8_t min_bottom; + uint8_t max_bottom; + uint8_t min_top; + uint8_t max_top; + // Statstics of the widths of bounding box, relative to the median advance. + float width; + float width_sd; + // Stats of the x-bearing and advance, also relative to the median advance. + float bearing; + float bearing_sd; + float advance; + float advance_sd; + int script_id; + UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar + Direction direction; // direction of this unichar + // Mirror property is useful for reverse DAWG lookup for words in + // right-to-left languages (e.g. "(word)" would be in + // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. + // However, what we want in our DAWG is + // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not + // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. + UNICHAR_ID mirror; + // A string of unichar_ids that represent the corresponding normed string. + // For awkward characters like em-dash, this gives hyphen. + // For ligatures, this gives the string of normal unichars. + GenericVector normed_ids; + STRING normed; // normalized version of this unichar + // Contains meta information about the fragment if a unichar represents + // a fragment of a character, otherwise should be set to nullptr. + // It is assumed that character fragments are added to the unicharset + // after the corresponding 'base' characters. 
+ CHAR_FRAGMENT *fragment; + }; + + struct UNICHAR_SLOT { + char representation[UNICHAR_LEN + 1]; + UNICHAR_PROPERTIES properties; + }; + + // Internal recursive version of encode_string above. + // str is the start of the whole string. + // str_index is the current position in str. + // str_length is the length of str. + // encoding is a working encoding of str. + // lengths is a working set of lengths of each element of encoding. + // best_total_length is the longest length of str that has been successfully + // encoded so far. + // On return: + // best_encoding contains the encoding that used the longest part of str. + // best_lengths (may be null) contains the lengths of best_encoding. + void encode_string(const char* str, int str_index, int str_length, + GenericVector* encoding, + GenericVector* lengths, + int* best_total_length, + GenericVector* best_encoding, + GenericVector* best_lengths) const; + + // Gets the properties for a grapheme string, combining properties for + // multiple characters in a meaningful way where possible. + // Returns false if no valid match was found in the unicharset. + // NOTE that script_id, mirror, and other_case refer to this unicharset on + // return and will need redirecting if the target unicharset is different. + bool GetStrProperties(const char* utf8_str, + UNICHAR_PROPERTIES* props) const; + + // Load ourselves from a "file" where our only interface to the file is + // an implementation of fgets(). This is the parsing primitive accessed by + // the public routines load_from_file() and load_from_inmemory_file(). + bool load_via_fgets(TessResultCallback2 *fgets_cb, + bool skip_fragments); + + // List of mappings to make when ingesting strings from the outside. + // The substitutions clean up text that should exists for rendering of + // synthetic data, but not in the recognition set. 
+ static const char* kCleanupMaps[][2]; + static TESS_API const char* null_script; + + UNICHAR_SLOT* unichars; + UNICHARMAP ids; + int size_used; + int size_reserved; + char** script_table; + int script_table_size_used; + int script_table_size_reserved; + // True if the unichars have their tops/bottoms set. + bool top_bottom_set_; + // True if the unicharset has significant upper/lower case chars. + bool script_has_upper_lower_; + // True if the unicharset has a significant mean-line with significant + // ascenders above that. + bool script_has_xheight_; + // True if the set contains chars that would be changed by the cleanup. + bool old_style_included_; + + // A few convenient script name-to-id mapping without using hash. + // These are initialized when unicharset file is loaded. Anything + // missing from this list can be looked up using get_script_id_from_name. + int null_sid_; + int common_sid_; + int latin_sid_; + int cyrillic_sid_; + int greek_sid_; + int han_sid_; + int hiragana_sid_; + int katakana_sid_; + int thai_sid_; + int hangul_sid_; + // The most frequently occurring script in the charset. + int default_sid_; +}; + +#endif // TESSERACT_CCUTIL_UNICHARSET_H_ diff -Nru k2pdfopt-2.42+ds/willuslib/bmp.c k2pdfopt-2.51+ds/willuslib/bmp.c --- k2pdfopt-2.42+ds/willuslib/bmp.c 2017-05-20 14:24:37.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/bmp.c 2018-12-31 23:21:16.000000000 +0000 @@ -5,7 +5,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -149,6 +149,8 @@ double theta_radians); static int pixval_dither(int pv,int n,int maxsrc,int maxdst,int x0,int y0); static int dither_rec(int bits,int x0,int y0); +static int pcl_get_resolution(char *pclbuf,int n,int *w,int *h); +static int pcl_next_raster_row(char *pclbuf,int n,int *index); double bmp_get_dpi(void) @@ -2441,8 +2443,8 @@ copy=&_copy; bmp_init(copy); bmp_copy(copy,bmp); - bmp->width *= scalefactor; - bmp->height *= scalefactor; + bmp->width = bmp->width*scalefactor+0.5; + bmp->height = bmp->height*scalefactor+0.5; bmp_resample(bmp,copy,0.,0.,(double)copy->width,(double)copy->height, bmp->width,bmp->height); bmp_free(copy); @@ -3074,6 +3076,44 @@ } + +/* +** Crop all edge pixels of bitmap that match the color of the upper +** left corner pixel +*/ +/* +void bmp_autocrop(WILLUSBITMAP *bmp) + + { + unsigned char *p0,*p; + int i,j,pbw,imin,imax,jmin,jmax; + + p0=bmp_rowptr_from_top(bmp,0); + pbw=bmp->bpp>>3; + imin=bmp->height+1; + imax=-1; + jmax=-1; + jmin=bmp->width+1; + for (i=0;iheight;i++) + { + p=bmp_rowptr_from_top(bmp,i); + for (j=0;jwidth;j++,p+=pbw) + if (memcmp(p,p0,pbw)) + { + if (iimax) + imax=i; + if (jjmax) + jmax=j; + } + } + if (imax>=0) + bmp_crop(bmp,imin,jmin,(jmax-jmin+1),(imax-imin+1)); + } +*/ /* @@ -4796,3 +4836,113 @@ for (i=height;i>0;i--,psrc+=sbw,pdest+=dbw) memcpy(pdest,psrc,dbw); } + + +/* +** Read BMP from simple PCL-formatted data +** Return 0 for okay. +** Negative for error. +** +** Presently works only for the simplest black/white rasters--no encoding. +** This is enough to correctly convert HP 8722D VNA raster downloads. 
+** +*/ +int bmp_read_pcl(WILLUSBITMAP *bmp,char *pclbuf,int n) + + { + int i,r,status; + + + /* Set up bitmap params */ + status=pcl_get_resolution(pclbuf,n,&bmp->width,&bmp->height); + if (status<0) + return(status); + bmp->bpp=8; + bmp->type=WILLUSBITMAP_TYPE_WIN32; + for (i=0;i<256;i++) + bmp->red[i]=bmp->blue[i]=bmp->green[i]=i; + /* Allocate */ + if (!bmp_alloc(bmp)) + return(-4); + /* Fill */ + i=0; + for (r=0;rheight;r++) + { + unsigned char *p; + int status,j,k; + + p=bmp_rowptr_from_top(bmp,r); + status=pcl_next_raster_row(pclbuf,n,&i); + if (status<=0) + break; + if (status*8!=bmp->width) + return(-1); + for (i++,k=0;k>=1,p++) + if (pclbuf[i]&j) + (*p)=0; + else + (*p)=255; + i--; + } + return(0); + } + + +static int pcl_get_resolution(char *pclbuf,int n,int *w,int *h) + + { + int i,nr,nc; + + i=nc=nr=0; + while (1) + { + int status; + + status=pcl_next_raster_row(pclbuf,n,&i); + if (status<=0) + break; + if (nc==0) + nc=status; + else if (nc!=status) + return(-2); + nr++; + } + if (nr<=0 || nc<=0) + return(-3); + (*w)=nc*8; + (*h)=nr; + return(0); + } + + +static int pcl_next_raster_row(char *pclbuf,int n,int *index) + + { + int i; + static char *mstr="\x1b\x2a\x62"; + char numbuf[32]; + + while (1) + { + if (memcmp(mstr,&pclbuf[(*index)],3)) + { + (*index)=(*index)+1; + if ((*index)>n-3) + break; + continue; + } + (*index)=(*index)+3; + for (i=0;(*index)='0' && pclbuf[(*index)]<='9' && i<30;) + { + numbuf[i++]=pclbuf[(*index)]; + (*index)=(*index)+1; + } + numbuf[i]='\0'; + if ((*index)>=n || i<=0) + break; + if (is_an_integer(numbuf)) + return(atoi(numbuf)); + } + return(-1); + } diff -Nru k2pdfopt-2.42+ds/willuslib/bmpmupdf.c k2pdfopt-2.51+ds/willuslib/bmpmupdf.c --- k2pdfopt-2.42+ds/willuslib/bmpmupdf.c 2016-10-21 19:36:30.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/bmpmupdf.c 2018-11-21 04:13:11.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -20,6 +20,7 @@ ** */ #include +#include #include "willus.h" #ifdef HAVE_MUPDF_LIB @@ -93,9 +94,10 @@ fz_drop_context(ctx); return(-3); } - fz_try(ctx) { list=fz_new_display_list(ctx,NULL); + bounds=fz_bound_page(ctx,page); + fz_try(ctx) { list=fz_new_display_list(ctx,bounds); dev=fz_new_list_device(ctx,list); - fz_run_page(ctx,page,dev,&fz_identity,NULL); + fz_run_page(ctx,page,dev,fz_identity,NULL); } fz_catch(ctx) { @@ -113,14 +115,12 @@ dpp=dpi/72.; pix=NULL; fz_var(pix); - fz_bound_page(ctx,page,&bounds); ctm=fz_identity; identity=fz_identity; - fz_scale(&ctm,dpp,dpp); + ctm=fz_scale(dpp,dpp); // ctm=fz_concat(ctm,fz_rotate(rotation)); - bounds2=bounds; - fz_transform_rect(&bounds2,&ctm); - fz_round_rect(&bbox,&bounds2); + bounds2=fz_transform_rect(bounds,ctm); + bbox=fz_round_rect(bounds2); // ctm=fz_translate(0,-page->mediabox.y1); // ctm=fz_concat(ctm,fz_scale(dpp,-dpp)); // ctm=fz_concat(ctm,fz_rotate(page->rotate)); @@ -129,13 +129,13 @@ // pix=fz_new_pixmap_with_rect(colorspace,bbox); fz_try(ctx) { - pix=fz_new_pixmap_with_bbox(ctx,colorspace,&bbox,1); + pix=fz_new_pixmap_with_bbox(ctx,colorspace,bbox,NULL,1); fz_clear_pixmap_with_value(ctx,pix,255); - dev=fz_new_draw_device(ctx,&identity,pix); + dev=fz_new_draw_device(ctx,identity,pix); if (list) - fz_run_display_list(ctx,list,dev,&ctm,&bounds2,NULL); + fz_run_display_list(ctx,list,dev,ctm,bounds2,NULL); else - fz_run_page(ctx,page,dev,&ctm,NULL); + fz_run_page(ctx,page,dev,ctm,NULL); fz_close_device(ctx,dev); fz_drop_device(ctx,dev); dev=NULL; @@ -217,9 +217,10 @@ fz_drop_context(ctx); return(-3); } - fz_try(ctx) { list=fz_new_display_list(ctx,NULL); + bounds=fz_bound_page(ctx,page); + fz_try(ctx) { list=fz_new_display_list(ctx,bounds); dev=fz_new_list_device(ctx,list); - 
fz_run_page(ctx,page,dev,&fz_identity,NULL); + fz_run_page(ctx,page,dev,fz_identity,NULL); } fz_catch(ctx) { @@ -234,7 +235,6 @@ fz_close_device(ctx,dev); fz_drop_device(ctx,dev); dev=NULL; - fz_bound_page(ctx,page,&bounds); if (width_in!=NULL) (*width_in)=fabs(bounds.x1-bounds.x0)/72.; if (height_in!=NULL) diff -Nru k2pdfopt-2.42+ds/willuslib/filelist.c k2pdfopt-2.51+ds/willuslib/filelist.c --- k2pdfopt-2.42+ds/willuslib/filelist.c 2015-06-13 14:35:00.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/filelist.c 2018-01-24 22:38:06.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2015 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -42,7 +42,7 @@ int *index,int *count); static void filelist_conditionally_add_file(FILELIST *fl,wfile *wf, char *include_only[],char *exclude[], - int *index,int *count); + int *index,int *count,int is_symlink); static int parse_exline(char *buf,double *size,int *month,int *day,int *year, int *hour,int *minute,int *second,int *attr, char *filename,int dirstoo); @@ -1131,6 +1131,9 @@ printf("fdf: index=%d, dirname='%s', io[0]='%s', rec=%d, dt=%d\n", index,dirname,include_only[0],recursive,dirstoo); */ +/* +printf("fdf(%s)\n",dirname); +*/ is_archive = (recursive>1 && wfile_is_archive(dirname)); if (is_archive) { @@ -1139,22 +1142,25 @@ wfile_unique_part(dir,fl->dir); return(filelist_recursive_archive_add(fl,index,dir,dirname,include_only,exclude,recursive,dirstoo)); } - if (recursive || include_only[0]=='\0' || include_only[1]!='\0') + if (recursive || include_only[0][0]=='\0' || include_only[1][0]!='\0') wfile_fullname(wildspec,dirname,"*"); else wfile_fullname(wildspec,dirname,include_only[0]); i=index; count=0; +/* +printf(" Reg files...\n"); +*/ for (s=wfile_findfirst(wildspec,&wf);s;s=wfile_findnext(&wf)) { - int 
fstatus,is_archive,symlink; - + int is_archive,symlink; if (!strcmp(wf.basename,".") || !strcmp(wf.basename,"..")) - continue; - fstatus=wfile_status(wf.fullname); + continue + /* WARNING: wfile_is_symlink() can be a slow call on a network drive! */; + /* fstatus=wfile_status(wf.fullname); */ /* 1-24-18--remove this call to go faster */ is_archive=wfile_is_archive(wf.fullname); symlink=(wf.attr&WFILE_SYMLINK); - if (fstatus==2 && !symlink && (dirstoo!=1 || recursive)) + if ((wf.attr&WFILE_DIR) && !symlink && (dirstoo!=1 || recursive)) continue; /* If archive file and we want to look into archives, then skip it. */ if (is_archive && recursive>1) @@ -1164,23 +1170,36 @@ continue; */ /* Regular file includes sym link to regular file or broken symlink */ - if (fstatus!=2 && !wfile_is_regular_file(wf.fullname) + if (!(wf.attr&WFILE_DIR) && !wfile_is_regular_file(wf.fullname)) +/* Removed 1-24-18 */ +/* && (fstatus!=0 || !symlink)) +*/ continue; - filelist_conditionally_add_file(fl,&wf,include_only,exclude,&i,&count); + filelist_conditionally_add_file(fl,&wf,include_only,exclude,&i,&count, + wf.attr&WFILE_SYMLINK); } wfile_findclose(&wf); if (!recursive) return(count); +/* +printf(" Dir files...\n"); +*/ for (s=wfile_findfirst(wildspec,&wf);s;s=wfile_findnext(&wf)) { int n,archive; - +/* +printf(" %s\n",wf.fullname); +*/ if (!strcmp(wf.basename,".") || !strcmp(wf.basename,"..")) continue; archive = (recursive>1 && wfile_is_archive(wf.fullname)); + /* 1-24-18 -- don't call wfile_status */ + if (!(wf.attr&WFILE_DIR) && !archive) +/* if (wfile_status(wf.fullname)!=2 && !archive) +*/ continue; /* Do not recurse symbolic links to dirs */ if (wf.attr & WFILE_SYMLINK) @@ -1205,11 +1224,11 @@ || (dirstoo==3 && dir_truly_empty(wf.fullname)))) { filelist_conditionally_add_file(fl,&wf,include_only,exclude, - &i,&count); + &i,&count,wf.attr&WFILE_SYMLINK); continue; } if (n>0 && dirstoo==1) - filelist_conditionally_add_file(fl,&wf,NULL,NULL,&i,&count); + 
filelist_conditionally_add_file(fl,&wf,NULL,NULL,&i,&count,wf.attr&WFILE_SYMLINK); i+=n; count+=n; } @@ -1407,7 +1426,7 @@ static void filelist_conditionally_add_file(FILELIST *fl,wfile *wf, char *include_only[],char *exclude[], - int *index,int *count) + int *index,int *count,int is_symlink) { char unique[MAXFILENAMELEN]; @@ -1435,7 +1454,12 @@ entry.size = 0; } #endif - if (wfile_is_symlink(wf->fullname)) + /* WARNING: wfile_is_symlink() can be a slow call on a network drive! */ + /* + if (is_symlink<0) + is_symlink=wfile_is_symlink(wf->fullname); + */ + if (is_symlink) entry.attr |= WFILE_SYMLINK; filelist_add_entry(fl,&entry); (*index) = (*index) + 1; diff -Nru k2pdfopt-2.42+ds/willuslib/fontrender.c k2pdfopt-2.51+ds/willuslib/fontrender.c --- k2pdfopt-2.42+ds/willuslib/fontrender.c 2013-08-12 19:48:14.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/fontrender.c 2019-01-01 20:52:31.000000000 +0000 @@ -5,7 +5,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2013 http://willus.com +** Copyright (C) 2019 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -201,7 +201,9 @@ char *string,int rot,FILE *out) { +#ifdef HAVE_PNG_LIB fontrender_render_ex(bmp,x,y_from_bottom,string,rot,0,0,0,out); +#endif } diff -Nru k2pdfopt-2.42+ds/willuslib/linux.c k2pdfopt-2.51+ds/willuslib/linux.c --- k2pdfopt-2.42+ds/willuslib/linux.c 2012-10-19 15:37:08.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/linux.c 2018-12-07 04:04:19.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2012 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -31,7 +31,10 @@ #include #include #include +/* 12-6-18: Apparently sys/termios.h no longer needed to compile. */ +/* #include +*/ #include #include #include diff -Nru k2pdfopt-2.42+ds/willuslib/mem.c k2pdfopt-2.51+ds/willuslib/mem.c --- k2pdfopt-2.42+ds/willuslib/mem.c 2014-12-26 21:11:26.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/mem.c 2018-10-12 15:48:26.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2014 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -361,6 +361,7 @@ size_t memsize; void *newptr; #endif + #ifndef NOMEMDEBUG #ifdef DEBUG int ra=0; diff -Nru k2pdfopt-2.42+ds/willuslib/ocrgocr.c k2pdfopt-2.51+ds/willuslib/ocrgocr.c --- k2pdfopt-2.42+ds/willuslib/ocrgocr.c 2016-10-29 18:12:25.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/ocrgocr.c 2018-11-24 16:44:59.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -38,10 +38,32 @@ int std_proc) { + OCRWORDS *ocrwords,_ocrwords; + + ocrwords=&_ocrwords; + ocrwords_init(ocrwords); + gocr_ocrwords_from_bmp8(ocrwords,bmp8,x1,y1,x2,y2,allow_spaces,std_proc); + if (ocrwords->n>0) + { + strncpy(text,ocrwords->word[0].text,maxlen-1); + text[maxlen-1]='\0'; + } + else + text[0]='\0'; + ocrwords_free(ocrwords); + } + + +void gocr_ocrwords_from_bmp8(OCRWORDS *ocrwords,WILLUSBITMAP *bmp8, + int x1,int y1,int x2,int y2,int allow_spaces, + int std_proc) + + { job_t *job,_job; int i,w,h,dw,dh,bw; unsigned char *src,*dst; - char *buf; + char *buf,*buf2; + static char *funcname="gocr_ocrwords_from_bmp8"; if (x1>x2) { @@ -78,15 +100,25 @@ memcpy(dst,src,w); pgm2asc(job); buf=getTextLine(&(job->res.linelist),0); - if (buf) - { - strncpy(text,buf,maxlen-1); - text[maxlen-1]='\0'; - if (std_proc) - ocr_text_proc(text,allow_spaces); - } - else - text[0]='\0'; + ocrwords_clear(ocrwords); + { + OCRWORD word; + ocrword_init(&word); + word.c=bw; + word.r=y2; + word.maxheight=y2-y1; + word.w=x2-x1+1; + word.h=y2-y1+1; + word.lcheight=word.h; + word.rot=0; + willus_mem_alloc_warn((void **)&buf2,2*(strlen(buf)+1),funcname,10); + strcpy(buf2,buf); + if (std_proc) + ocr_text_proc(buf2,allow_spaces); + word.text=buf2; + ocrwords_add_word(ocrwords,&word); + willus_mem_free((double **)&buf2,funcname); + } // willus_mem_free((double **)&job->src.p.p,funcname); job_free_image(job); } diff -Nru k2pdfopt-2.42+ds/willuslib/ocrtess.c k2pdfopt-2.51+ds/willuslib/ocrtess.c --- k2pdfopt-2.42+ds/willuslib/ocrtess.c 2017-02-25 06:56:49.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/ocrtess.c 2019-01-01 02:59:58.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -29,7 +29,10 @@ #include "willus.h" -static void lang_default(char *langdef); +static int ocrtess_lstm_file(char *filename); +static int ocrtess_tess_file(char *filename); +static int file_contains_keyword(char *filename,char *keyword); +static int buf_contains_keyword(char *buf,char *keyword,int n); /* static int has_cube_data(char *lang); */ @@ -38,47 +41,63 @@ /* ** Returns 0 for success, NZ for failure. */ -void *ocrtess_init(char *datadir,char *lang,FILE *out,char *initstr,int maxlen,int *status) +void *ocrtess_init(char *datadir,char *tesspath,int maxtesspathlen, + char *lang,FILE *out,char *initstr,int maxlen,int *status) { char langdef[16]; void *api; + char tesspath0[MAXFILENAMELEN]; + ocrtess_datapath(tesspath0,datadir,MAXFILENAMELEN-1); + if (tesspath!=NULL) + { + strncpy(tesspath,tesspath0,maxtesspathlen-1); + tesspath[maxtesspathlen-1]='\0'; + } if (lang==NULL || lang[0]=='\0') - lang_default(langdef); + ocrtess_lang_default(tesspath0,NULL,0,langdef,16,NULL,0,0); else { strncpy(langdef,lang,15); langdef[15]='\0'; } - /* Try CUBE/COMBINED first */ - api=tess_capi_init(datadir,langdef,0,out,initstr,maxlen,status); - /* Next try just CUBE if that didn't work */ - if (api==NULL) - api=tess_capi_init(datadir,langdef,2,out,initstr,maxlen,status); - /* Final try: regular */ - if (api==NULL) - api=tess_capi_init(datadir,langdef,1,out,initstr,maxlen,status); + /* Tess v4.00 needs only one attempt with ocrtype=0 */ + api=tess_capi_init(tesspath0,langdef,0,out,initstr,maxlen,status); return(api); } -static void lang_default(char *langdef) +void ocrtess_lang_default(char *datadir,char *tesspath,int maxtesspathlen, + char *langdef,int maxlen,char *tessdebug,int maxdebug,int use_ansi) { - char *p; - char tessdir[512]; - char wildcard[512]; + 
char wildcard[MAXFILENAMELEN+32]; int j; FILELIST *fl,_fl; + char tesspath0[MAXFILENAMELEN]; + static char *header= + "File name Size Date Type*\n" + "---------------------------------------------------------------------\n"; + /* "12345678901234567890123456789012 XXX.XX MB XX-XXX-XXXX [LSTM+TESS] */ + static char *fmt=" %6.2f MB %2d-%s-%04d %s"; + static char *month_name[12]={"JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"}; - strcpy(langdef,"eng"); - p=getenv("TESSDATA_PREFIX"); - if (p==NULL) - return; - wfile_fullname(tessdir,p,"tessdata"); - wfile_reslash(tessdir); - wfile_fullname(wildcard,tessdir,"*.traineddata"); + if (datadir==NULL) + ocrtess_datapath(tesspath0,datadir,MAXFILENAMELEN-1); + else + { + strncpy(tesspath0,datadir,MAXFILENAMELEN-1); + tesspath0[MAXFILENAMELEN-1]='\0'; + } + if (tesspath!=NULL) + { + strncpy(tesspath,tesspath0,maxtesspathlen-1); + tesspath[maxtesspathlen-1]='\0'; + } + strncpy(langdef,"eng",maxlen-1); + langdef[maxlen-1]='\0'; + wfile_fullname(wildcard,tesspath0,"*.traineddata"); fl=&_fl; filelist_init(fl); filelist_fill_from_disk_1(fl,wildcard,0,0); @@ -91,16 +110,165 @@ wfile_basespec(basename,fl->entry[j].name); if (in_string(basename,"-")>0) continue; - strncpy(langdef,basename,15); - langdef[15]='\0'; + strcpy(wildcard,fl->entry[j].name); + strncpy(langdef,basename,maxlen-1); + langdef[maxlen-1]='\0'; i=in_string(langdef,"."); if (i>0) langdef[i]='\0'; break; } + if (tessdebug!=NULL) + { + filelist_sort_by_name(fl); + if (strlen(header)+strlen(tessdebug) < maxdebug) + sprintf(&tessdebug[strlen(tessdebug)],"%s",header); + for (j=0;jn;j++) + { + char fullname[256]; + char name[256]; + int len1,tess,lstm,len; + + wfile_fullname(fullname,fl->dir,fl->entry[j].name); + strcpy(name,fl->entry[j].name); + lstm=ocrtess_lstm_file(fullname); + tess=ocrtess_tess_file(fullname); + len1 = strlen(name)>32 ? 
strlen(name) : 32; + if (use_ansi) + len1+=20; + if (strlen(tessdebug)+len1+46 < maxdebug) + { + if (use_ansi) + { + if (tess || lstm) + strcat(tessdebug,ANSI_YELLOW); + } + strcat(tessdebug,name); + len=strlen(name); + if (!strcmp(fl->entry[j].name,wildcard)) + { + if (use_ansi) + strcat(tessdebug,ANSI_WHITE); + strcat(tessdebug," [Def]"); + len+=6; + } + else + if (use_ansi) + strcat(tessdebug,ANSI_NORMAL); + for (;len<32;len++) + strcat(tessdebug," "); + sprintf(&tessdebug[strlen(tessdebug)],fmt,fl->entry[j].size/1024./1024., + fl->entry[j].date.tm_mday, + month_name[fl->entry[j].date.tm_mon], + fl->entry[j].date.tm_year+1900, + tess&&lstm?"[LSTM+TESS]":(tess?"[TESS]":(lstm?"[LSTM]":"(not valid)"))); + if (use_ansi) + strcat(tessdebug,ANSI_NORMAL); + strcat(tessdebug,"\n"); + } + } + } filelist_free(fl); } + +/* +** Determine OCR path. Check both TESSDATA_PREFIX\tessdata AND TESSDATA_PREFIX +*/ +void ocrtess_datapath(char *datapath,char *suggested,int maxlen) + + { + char path1[MAXFILENAMELEN-12]; + char path[MAXFILENAMELEN]; + char *p; + + if (suggested!=NULL) + { + strncpy(datapath,suggested,maxlen-1); + datapath[maxlen-1]='\0'; + return; + } + if ((p=getenv("TESSDATA_PREFIX"))==NULL) + { + datapath[0]='\0'; + return; + } + strncpy(path1,p,MAXFILENAMELEN-13); + path1[MAXFILENAMELEN-13]='\0'; + wfile_fullname(path,path1,"tessdata"); + if (wfile_status(path)==2) + { + strncpy(datapath,path,maxlen-1); + datapath[maxlen-1]='\0'; + return; + } + strncpy(datapath,p,maxlen-1); + datapath[maxlen-1]='\0'; + } + + +static int ocrtess_lstm_file(char *filename) + + { + return(file_contains_keyword(filename,"XYTransLSTM")); + } + + +static int ocrtess_tess_file(char *filename) + + { + return(file_contains_keyword(filename,"NULL 0 NULL 0")); + } + + +static int file_contains_keyword(char *filename,char *keyword) + + { + int pos,bufsize,len; + FILE *f; + char *buf; + static char *funcname="file_contains_keyword"; + + f=fopen(filename,"rb"); + if (f==NULL) + return(0); + 
bufsize=1024*1024; + len=strlen(keyword); + willus_mem_alloc_warn((void **)&buf,bufsize,funcname,10); + for (pos=0;1;pos+=bufsize-len) + { + int n; + + fseek(f,pos,0); + n=fread(buf,1,bufsize,f); + if (buf_contains_keyword(buf,keyword,n)) + { + willus_mem_free((double **)&buf,funcname); + fclose(f); + return(1); + } + if (nn=ocrtesswords->na=0; + ocrtesswords->word=NULL; + } + + +void ocrtesswords_free(OCRTESSWORDS *ocrtesswords) + + { + static char *funcname="ocrtesswords_free"; + int i; + + for (i=ocrtesswords->n;i>=0;i--) + willus_mem_free((double **)&ocrtesswords->word[i].utf8,funcname); + willus_mem_free((double **)&ocrtesswords->word,funcname); + ocrtesswords_init(ocrtesswords); + } + + +void ocrtesswords_add_ocrtessword(OCRTESSWORDS *ocrtesswords,int left,int top, + int right,int bottom,int baseline,char *text) + + { + static char *funcname="ocrtesswords_add_ocrtessword"; + OCRTESSWORD *word; + + if (ocrtesswords->n >= ocrtesswords->na) + { + int newsize; + + newsize=ocrtesswords->na<128?256:ocrtesswords->na*2; + if (ocrtesswords->na==0) + willus_mem_alloc_warn((void **)&ocrtesswords->word,newsize*sizeof(OCRTESSWORD), + funcname,10); + else + willus_mem_realloc_robust_warn((void **)&ocrtesswords->word,newsize*sizeof(OCRTESSWORD), + ocrtesswords->na*sizeof(OCRTESSWORD),funcname,10); + ocrtesswords->na=newsize; + } + word=&ocrtesswords->word[ocrtesswords->n]; + word->top=top; + word->left=left; + word->right=right; + word->bottom=bottom; + word->baseline=baseline; + willus_mem_alloc_warn((void **)&word->utf8,strlen(text)+1,funcname,10); + strcpy(word->utf8,text); + ocrtesswords->n++; + } +*/ + + +/* ** bmp8 must be grayscale ** (x1,y1) and (x2,y2) from top left of bitmap ** Output: ** "text" gets UTF-8 formatted string of OCR'd text. +** +** segmode: +** -1 = Default (6) +** 0 = Orientation and script detection (OSD) only. +** 1 = Automatic page segmentation with OSD. 
+** 2 = Automatic page segmentation, but no OSD, or OCR +** 3 = Fully automatic page segmentation, but no OSD. +** 4 = Assume a single column of text of variable sizes. +** 5 = Assume a single uniform block of vertically aligned text. +** 6 = Assume a single uniform block of text. (Default) +** 7 = Treat the image as a single text line. +** 8 = Treat the image as a single word. +** 9 = Treat the image as a single word in a circle. +** 10 = Treat the image as a single character. +** */ -void ocrtess_single_word_from_bmp8(void *api,char *text,int maxlen,WILLUSBITMAP *bmp8, - int x1,int y1,int x2,int y2, - int ocr_type,int allow_spaces, - int std_proc,FILE *out) +void ocrtess_ocrwords_from_bmp8(void *api,OCRWORDS *ocrwords,WILLUSBITMAP *bmp8, + int x1,int y1,int x2,int y2,int dpi, + int segmode,double downsample,FILE *out) + + { + PIX *pix; + WILLUSBITMAP *bmp,_bmp; + int nw,i,it,w,h,dw,dh,bw; + unsigned char *src,*dst; + int *top,*left,*bottom,*right,*ybase; + char *text; + + if (x1>x2) + { + w=x1; + x1=x2; + x2=w; + } + w=x2-x1+1; + + /* Add a border */ + bw=w/40; + if (bw<6) + bw=6; + dw=w+bw*2; + dw=(dw+3)&(~3); + if (y1>y2) + { + h=y1; + y1=y2; + y2=h; + } + h=y2-y1+1; + dh=h+bw*2; + + /* Store in new bitmap */ + bmp=&_bmp; + bmp_init(bmp); + bmp->width=dw; + bmp->height=dh; + bmp->bpp=8; + bmp_alloc(bmp); + for (i=0;i<256;i++) + bmp->red[i]=bmp->blue[i]=bmp->green[i]=i; + dst=bmp_rowptr_from_top(bmp,0); + memset(dst,255,dw*dh); + src=bmp_rowptr_from_top(bmp8,y1)+x1; + dst=bmp_rowptr_from_top(bmp,bw)+bw; + for (i=y1;i<=y2;i++,dst+=dw,src+=bmp8->width) + memcpy(dst,src,w); + bmp_set_dpi((double)dpi); + + /* Apply downsample */ + if (downsample > 0. 
&& downsample < 0.9) + { + int wnew; + + /* Make sure new width is even multiple of 4 */ + wnew=downsample*bmp->width+0.5; + wnew=(wnew+3)&(~3); + downsample = (double)wnew/bmp->width; + bmp_resize(bmp,downsample); + dpi=dpi*downsample+0.5; + bmp_set_dpi((double)dpi); + } + else + downsample=1.0; +/* +{ +static int counter=0; +char filename[256]; +sprintf(filename,"ocrtext%04d.png",++counter); +bmp_write(bmp,filename,stdout,100); +} +*/ + pix=pixCreate(bmp->width,bmp->height,8); + src=bmp_rowptr_from_top(bmp,0); + dst=(unsigned char *)pixGetData(pix); + memcpy(dst,src,bmp->width*bmp->height); + bmp_free(bmp); + endian_flip((char *)dst,pixGetWpl(pix)*pixGetHeight(pix)); + pix->xres = pix->yres = dpi; + tess_capi_get_ocr_multiword(api,pix,segmode<0 || segmode>10 ? 6 : segmode, + &left,&top,&right,&bottom,&ybase,&text,&nw,out); + pixDestroy(&pix); + ocrwords_clear(ocrwords); + for (it=i=0;ixres = pix->yres = 100; /* Just make up resolution of 100 ppi */ - status=tess_capi_get_ocr(api,pix,text,maxlen,out); + pix->xres = pix->yres = dpi; +/* +{ +static int counter=0; +WILLUSBITMAP *bmp,_bmp; +char filename[256]; +sprintf(filename,"ocrt_%04d.png",++counter); +bmp=&_bmp; +bmp_init(bmp); +bmp_copy(bmp,bmp8); +bmp_promote_to_24(bmp); +bmp_set_dpi((double)dpi); +bmp_write(bmp,filename,stdout,100); +bmp_free(bmp); +} +*/ + status=tess_capi_get_ocr(api,pix,text,maxlen,segmode<0 || segmode>10 ? 6 : segmode,out); pixDestroy(&pix); if (status<0) text[0]='\0'; + /* clean_line(text); + */ if (std_proc) ocr_text_proc(text,allow_spaces); } diff -Nru k2pdfopt-2.42+ds/willuslib/pdfwrite.c k2pdfopt-2.51+ds/willuslib/pdfwrite.c --- k2pdfopt-2.42+ds/willuslib/pdfwrite.c 2016-11-24 17:52:06.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/pdfwrite.c 2018-07-31 15:37:36.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -20,15 +20,6 @@ ** */ -/* -** IMPORTANT!! -** -** NEEDS SPECIAL VERSION OF ZLIB WITH CUSTOM MODES--SEE gzflags BELOW! -** SEE gzwrite.c and gzlib.c in zlib_mod FOLDER. -** (SEARCH FOR "WILLUS MOD" IN FILES.) -** -*/ - #include #include #include @@ -579,10 +570,6 @@ /* ** Use quality=-1 for PNG ** -** NEEDS SPECIAL VERSION OF ZLIB WITH CUSTOM MOD--SEE gzflags BELOW! -** SEE gzwrite.c and gzlib.c in zlib_mod FOLDER. -** (SEARCH FOR "WILLUS MOD" IN FILES.) -** ** If quality < 0, the deflate (PNG-style) method is used. ** ** halfsize==0 for 8-bits per color plane diff -Nru k2pdfopt-2.42+ds/willuslib/string.c k2pdfopt-2.51+ds/willuslib/string.c --- k2pdfopt-2.42+ds/willuslib/string.c 2016-03-19 16:19:33.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/string.c 2018-09-18 17:19:40.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -938,6 +938,10 @@ ** YYYY-MM-DD ** MM-DD-YYYY|YY [Default] ** +** Treats the following as white space: / - \ . : , +** +** 18 Sept 2018: Treat comma a white space. +** ** CAUTION! This function zeros the time fields! */ int structtm_from_date(struct tm *date,char *datestr) @@ -958,7 +962,7 @@ date->tm_mday=1; for (i=0;buf[i]!='\0';i++) if (buf[i]=='/' || buf[i]=='-' || buf[i]=='\\' - || buf[i]=='.' || buf[i]==':') + || buf[i]=='.' 
|| buf[i]==':' || buf[i]==',') buf[i]=' '; n=sscanf(buf,"%s %s %s",tok[0],tok[1],tok[2]); yr = -1; diff -Nru k2pdfopt-2.42+ds/willuslib/wfile.c k2pdfopt-2.51+ds/willuslib/wfile.c --- k2pdfopt-2.42+ds/willuslib/wfile.c 2016-03-22 02:30:20.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wfile.c 2018-07-27 17:47:22.000000000 +0000 @@ -5,7 +5,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -524,6 +524,7 @@ strcpy(fn2,filename); /* If a folder, use a different method since CreateFile doesn't work */ /* the way I have it set up. */ +#ifndef NO_FILELIST if (wfile_status(fn2)==2) { FILELIST *fl,_fl; @@ -538,6 +539,7 @@ } filelist_free(fl); } +#endif /* Weird bug: If I don't use the full path, then sometimes */ /* this returns an incorrect result. */ /* Started using CreateFile instead of OpenFile because OpenFile */ @@ -3102,6 +3104,7 @@ +#ifndef NO_FILELIST FILE *wfile_open_most_recent(char *wildspec,char *mode,int recursive) { @@ -3118,6 +3121,7 @@ wfile_fullname(wildspec,fl->dir,fl->entry[fl->n-1].name); return(wfile_fopen_utf8(wildspec,mode)); } +#endif /* @@ -3127,6 +3131,7 @@ ** that file name. ** */ +#ifndef NO_FILELIST int wfile_extract_in_place(char *filename) { @@ -3176,6 +3181,7 @@ wfile_fullname(filename,relpath,fullname); return(0); } +#endif /* @@ -3194,6 +3200,7 @@ ** RETURNS 0 FOR SUCCESS ** */ +#ifndef NO_FILELIST int wfile_find_file(char *fullname,char *basename,char *folderlist[],char *drives, int checkpath,int cwd,int exedir,char *envdir) @@ -3276,6 +3283,7 @@ } return(-99); } +#endif /* NO_FILELIST */ /* @@ -3286,6 +3294,7 @@ ** ** Ret 0 for success, and fullname[] gets the full path name. 
*/ +#ifndef NO_FILELIST int wfile_smartfind(char *fullname,char *basename,char *folder,int recursive) { @@ -3349,6 +3358,7 @@ filelist_free(fl); return(-3); } +#endif static int wfile_correct_exe(char *basename,char *correctname,char *fullname) diff -Nru k2pdfopt-2.42+ds/willuslib/wgs.c k2pdfopt-2.51+ds/willuslib/wgs.c --- k2pdfopt-2.42+ds/willuslib/wgs.c 2016-03-13 20:34:50.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wgs.c 2018-12-30 17:31:32.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -48,6 +48,8 @@ static int willusgs_inited=0; static char willusgs_name[512]; static int willusgs_isdll=0; +static int willusgs_device_width_pts=-1; +static int willusgs_device_height_pts=-1; /* ** Pointers which will get pointed to the DLL functions @@ -86,13 +88,22 @@ willusgs_close(); } */ +/* Use -1 to ignore or use default */ +void willusgs_set_device_width_and_height_pts(int w,int h) + + { + willusgs_device_width_pts=w; + willusgs_device_height_pts=h; + } +#define NARGSMAX 18 + int willusgs_read_pdf_or_ps_bmp(WILLUSBITMAP *bmp,char *filename,int pageno,double dpi,FILE *out) { - char argdata[16][48]; - char *argv[16]; + char argdata[NARGSMAX][48]; + char *argv[NARGSMAX]; int i,status; char tempfile[256]; char argtemp[280]; @@ -100,7 +111,7 @@ willusgs_init(out); wfile_abstmpnam(tempfile); - for (i=0;i<16;i++) + for (i=0;i bitmap should be %d x %d\n",(int)(willusgs_device_width_pts*dpi/72.+.5), + (int)(willusgs_device_height_pts*dpi/72.+.5)); +*/ + if (willusgs_device_width_pts>0) + sprintf(argv[i++],"-dDEVICEWIDTHPOINTS=%d",willusgs_device_width_pts); + if (willusgs_device_height_pts>0) + sprintf(argv[i++],"-dDEVICEHEIGHTPOINTS=%d",willusgs_device_height_pts); strcpy(argv[i++],"-dGraphicsAlphaBits=4"); 
strcpy(argv[i++],"-dTextAlphaBits=4"); sprintf(argv[i++],"-r%g",dpi); @@ -127,6 +147,9 @@ nprintf(out,"BMP read failed. GS output in file %s (not deleted).\n",tempfile); return(status-100); } +/* +printf("bitmap is %d x %d x %d\n",bmp->width,bmp->height,bmp->bpp); +*/ remove(tempfile); return(0); } @@ -156,6 +179,10 @@ strcpy(argv[i++],"-dBATCH"); strcpy(argv[i++],"-dNOPAUSE"); strcpy(argv[i++],"-sDEVICE=pdfwrite"); + if (willusgs_device_width_pts>0) + sprintf(argv[i++],"-dDEVICEWIDTHPOINTS=%d",willusgs_device_width_pts); + if (willusgs_device_height_pts>0) + sprintf(argv[i++],"-dDEVICEHEIGHTPOINTS=%d",willusgs_device_height_pts); strcpy(argv[i++],"-dPDFSETTINGS=/prepress"); /* if (firstpage>0) @@ -287,6 +314,7 @@ } #endif willusgs_inited=1; + willusgs_device_width_pts=willusgs_device_height_pts=-1; return(0); } diff -Nru k2pdfopt-2.42+ds/willuslib/wgui.c k2pdfopt-2.51+ds/willuslib/wgui.c --- k2pdfopt-2.42+ds/willuslib/wgui.c 2017-01-07 18:58:45.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wgui.c 2018-11-22 01:14:46.000000000 +0000 @@ -4,7 +4,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -208,7 +208,8 @@ char pwd[512]; int procnum; - if (strnicmp(filename,"http://",7) && wfile_status(filename)==0) + if (strnicmp(filename,"http://",7) && strnicmp(filename,"https://",8) + && wfile_status(filename)==0) { char *message; int len; diff -Nru k2pdfopt-2.42+ds/willuslib/willus.h k2pdfopt-2.51+ds/willuslib/willus.h --- k2pdfopt-2.42+ds/willuslib/willus.h 2017-05-20 22:12:16.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/willus.h 2019-01-04 18:28:40.000000000 +0000 @@ -79,6 +79,12 @@ ** #define unix 1 ** */ +#ifndef WILLUSLIB +/* +** __x87_inline_math__: Can use x87 math intrinsics +** __x87_inline_pow_only__: Only in-line the pow() function (GCC 4.x) +*/ +#endif // WILLUSLIB typedef double real; @@ -187,6 +193,34 @@ ** As of 2013 and gcc 4.7.x, x87_line_math is turned off entirely. ** My x87 in-line routines are no faster than gcc on modern Intel CPUs. 
*/ +#ifndef WILLUSLIB +#if (defined(WILLUS_X863264)) +#if (!defined(__TINYC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))) +#define __has_sincos_builtin__ +#if (defined(__gnu_linux__) || (!defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))) +#define __needs_sincos_declaration__ +#endif +#endif +#if (!defined(__x87_inline_math__) && defined(__FAST_MATH__) && !defined(__TINYC__)) +#if (defined(__gnu_linux__) || !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 7)) +#define __x87_inline_math__ +#endif +#endif +#endif + +/* +** This was the WILLUSLIB sincos as of k2pdfopt v1.62 +*/ +// #ifdef WILLUSLIB +// /* Check whether sincos built-in */ +// #if (defined(WILLUS_X863264) && !defined(__TINYC__) && __GNUC__ >= 4 && __GNUC_MINOR__ > 4) +// void sincos(double th,double *s,double *c); +// #else +// #define sincos(th,x,y) { (*(x))=sin(th); (*(y))=cos(th); } +// #endif +// #endif // WILLUSLIB + +#endif // WILLUSLIB #if (defined(__linux) || defined(linux) || defined(__linux__)) #define LINUX @@ -491,7 +525,9 @@ double x2,double y2,int newwidth,int newheight); int bmp_resample_fixed_point(WILLUSBITMAP *dest,WILLUSBITMAP *src,double fx1,double fy1, double fx2,double fy2,int newwidth,int newheight); +/* void bmp_crop_edge(WILLUSBITMAP *bmp); +*/ void bmp_invert(WILLUSBITMAP *bmp); void bmp_overlay(WILLUSBITMAP *dest,WILLUSBITMAP *src,int x0,int y0_from_top, int *dbgc,int *dfgc,int *sbgc,int *sfgc); @@ -520,6 +556,7 @@ void bmp_apply_whitethresh(WILLUSBITMAP *bmp,int whitethresh); void bmp_dither_to_bpc(WILLUSBITMAP *bmp,int newbpc); void bmp_extract(WILLUSBITMAP *dst,WILLUSBITMAP *src,int x0,int y0_from_top,int width,int height); +int bmp_read_pcl(WILLUSBITMAP *bmp,char *pclbuf,int n); /* fontrender.c */ void fontrender_set_or(int status); @@ -915,6 +952,7 @@ int win_setdir(char *directory); wmetafile *win_emf_clipboard(void); int win_text_file_to_clipboard(char *filename,FILE *out); +int 
win_text_file_to_clipboard_ex(char *filename,FILE *out,int nocrs); int win_buf_to_clipboard(char *lbuf,FILE *out); char *win_clipboard_to_buf(FILE *out); int win_clipboard_has_bitmap(void); @@ -1016,6 +1054,12 @@ int maxwidth_pixels,int rgbcolor,void *myproc, void **window,int *button_colors,void *aboutbox, int modal); +int winmbox_login_box(void *parent,char *title, + char *message, + char *username,int umaxlen, + char *password,int pmaxlen, + int fontsize_pixels,int maxwidth_pixels, + int rgbcolor,int *button_colors); int winmbox_def_proc(void *hwnd,int iMsg,int wParam,void *lParam); void winmbox_destroy(void); void winmbox_message_box_display_message(char *message,int *ypos); @@ -1025,6 +1069,15 @@ void winmbox_button_draw(void *hdc0,void *rect0,int state,int basecolorrgb, void *hfont0,char *text,int textcolorrgb); void winmbox_set_font(char *fontname); +void winmbox_terminate(void); +void winmbox_wait(void *mainwin,char *message,int cancel_option); +int winmbox_wait_cancel(void); +void winmbox_wait_proc_messages(void); +void winmbox_wait_end(void); +int winmbox_wait_busy(void); +void winmbox_wait_busy_pointer(void); +void winmbox_wait_restore_pointer(void); +void winmbox_wait_normal_pointer(void); #endif /* winbmp.c */ @@ -1084,6 +1137,8 @@ int wsys_set_decimal_period(int setit); int wsys_set_envvar(char *varname,char *value,int system); int wsys_get_envvar_ex(char *varname,char *value,int maxlen); +int wsys_file_lock(char *filename); +int wsys_file_unlock(char *filename,int fd); /* token.c */ @@ -1374,15 +1429,43 @@ void gocr_single_word_from_bmp8(char *text,int maxlen,WILLUSBITMAP *bmp8, int x1,int y1,int x2,int y2,int allow_spaces, int std_proc); +void gocr_ocrwords_from_bmp8(OCRWORDS *ocrwords,WILLUSBITMAP *bmp8, + int x1,int y1,int x2,int y2,int allow_spaces, + int std_proc); #endif -#ifdef HAVE_TESSERACT_LIB /* ocrtess.c */ -void *ocrtess_init(char *datadir,char *lang,FILE *out,char *initstr,int maxlen,int *status); +/* +typedef struct + { + int 
top,left,bottom,right,baseline; + char *utf8; + } OCRTESSWORD; +typedef struct + { + OCRTESSWORD *word; + int na,n; + } OCRTESSWORDS; +*/ +#ifdef HAVE_TESSERACT_LIB +void *ocrtess_init(char *datadir,char *tesspath,int maxtesspathlen, + char *lang,FILE *out,char *initstr,int maxlen,int *status); +void ocrtess_lang_default(char *datadir,char *tesspath,int maxtesspathlen, + char *langdef,int maxlen,char *debugstr,int maxdebug,int use_ansi); +void ocrtess_datapath(char *datapath,char *suggested,int maxlen); void ocrtess_end(void *api); -void ocrtess_single_word_from_bmp8(void *api,char *text,int maxlen,WILLUSBITMAP *bmp8, - int x1,int y1,int x2,int y2, - int ocr_type,int allow_spaces, +/* +void ocrtesswords_init(OCRTESSWORDS *ocrtesswords); +void ocrtesswords_free(OCRTESSWORDS *ocrtesswords); +void ocrtesswords_add_ocrtessword(OCRTESSWORDS *ocrtesswords,int left,int top, + int right,int bottom,int baseline,char *text); +*/ +void ocrtess_ocrwords_from_bmp8(void *api,OCRWORDS *words,WILLUSBITMAP *bmp8, + int x1,int y1,int x2,int y2,int dpi, + int segmode,double downsample,FILE *out); +void ocrtess_from_bmp8(void *api,char *text,int maxlen,WILLUSBITMAP *bmp8, + int x1,int y1,int x2,int y2,int dpi, + int segmode,int allow_spaces, int std_proc,FILE *out); #endif @@ -1796,8 +1879,10 @@ /* Generic (cross-platform) message box functions */ /* wleptonica.c */ +#ifdef HAVE_LEPTONICA_LIB void wlept_bmp_dewarp(WILLUSBITMAP *src,WILLUSBITMAP *bmp1,WILLUSBITMAP *bmp2,int wthresh,int fit_order, char *debugfile); +#endif #ifdef PI diff -Nru k2pdfopt-2.42+ds/willuslib/willusversion.c k2pdfopt-2.51+ds/willuslib/willusversion.c --- k2pdfopt-2.42+ds/willuslib/willusversion.c 2016-09-11 20:12:01.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/willusversion.c 2018-11-22 01:12:10.000000000 +0000 @@ -1,10 +1,10 @@ -static char *version = "Ver 5.26 (Sep 11, 2016)"; +static char *version = "Ver 5.37 (Nov 21, 2018)"; /* ** willusversion.c Report version of WILLUSLIB library ** ** Part of 
willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as diff -Nru k2pdfopt-2.42+ds/willuslib/win.c k2pdfopt-2.51+ds/willuslib/win.c --- k2pdfopt-2.42+ds/willuslib/win.c 2016-02-28 04:34:44.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/win.c 2018-08-31 14:57:27.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -52,6 +52,7 @@ static int win_registry_search1(char *value,int maxlen,HKEY key_class,char *keyname,char *searchvalue,int recursive); static BOOL CALLBACK find_win_by_procid(HWND hwnd,LPARAM lp); static int win_adjust_privilege(void); +static void cr_filter(char *s); static int windate_warn=1; @@ -548,12 +549,17 @@ } +int win_text_file_to_clipboard(char *filename,FILE *out) + + { + return(win_text_file_to_clipboard_ex(filename,out,0)); + } /* ** Followed example at this link: ** http://msdn.microsoft.com/library/default.asp?url=/library/en-us/winui/WinUI/WindowsUserInterface/DataExchange/Clipboard/UsingtheClipboard.asp#_win32_Copying_Information_to_the_Clipboard ** */ -int win_text_file_to_clipboard(char *filename,FILE *out) +int win_text_file_to_clipboard_ex(char *filename,FILE *out,int nocrs) { FILE *f; @@ -587,6 +593,8 @@ } fclose(f); p[size]='\0'; + if (nocrs) + cr_filter(p); GlobalUnlock(buf); if (!OpenClipboard(GetDesktopWindow())) { @@ -610,6 +618,26 @@ } +static void cr_filter(char *s) + + { + int i,j; + + for (i=j=0;s[i]!='\0';i++) + { + if (s[i]=='\r') + continue; + if (s[i]=='\t') + s[j]=' '; + else + if (i!=j) + s[j]=s[i]; + j++; + } + s[j]='\0'; + } + + /* ** Followed example 
at this link: ** http://msdn.microsoft.com/library/default.asp?url=/library/en-us/winui/WinUI/WindowsUserInterface/DataExchange/Clipboard/UsingtheClipboard.asp#_win32_Copying_Information_to_the_Clipboard @@ -1521,6 +1549,7 @@ } +#ifndef NO_FILELIST int win_most_recent_in_path(char *exactname,char *wildcard) { @@ -1558,6 +1587,7 @@ } return(exactname[0]!='\0'); } +#endif int win_which(char *exactname,char *exename) diff -Nru k2pdfopt-2.42+ds/willuslib/winmbox.c k2pdfopt-2.51+ds/willuslib/winmbox.c --- k2pdfopt-2.42+ds/willuslib/winmbox.c 2016-02-23 03:55:15.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/winmbox.c 2018-11-22 01:06:35.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -37,14 +37,19 @@ HWND hwnd; HWND parent; HWND edit_hwnd; + HWND pass_hwnd; HWND b1_hwnd; HWND b2_hwnd; HWND b3_hwnd; char b1[256]; char b2[256]; char b3[256]; + char inbuflabel[64]; char *inbuf; int maxlen; + char passlabel[64]; + char *passbuf; + int passmaxlen; char msg[1024]; int maxwidth; int width; @@ -60,7 +65,9 @@ RECT aboutbox; RECT messagebox; WNDPROC eclassproc; + void *myproc; int modal; + char *title; } WINMBOX; static unsigned int n_magplus = 2903; @@ -392,6 +399,7 @@ static WINMBOX _wmb,*wmb; static void winmbox_init(void); +static int winmbox_message_box_create(void); static void winmbox_message_box_calc_size(WINMBOX *wmb); static void winmbox_message_box_add_children(void); static LRESULT CALLBACK winmbox_edit_proc(HWND hWnd,UINT message,WPARAM wParam,LPARAM lParam); @@ -470,6 +478,7 @@ 0)); } + /* ** Present a Windows Dialog Box w/up to three buttons and a text entry box if desired. 
** @@ -507,12 +516,7 @@ int modal) { - WNDCLASSEX wndclass; - int x0,y0,brgcolor; - RECT rect; - HWND pwin; - MSG msg; - static char *classname="message_box"; + int status; /* ansi_dprintf(NULL,"@winmbox_message_box_ex(%s,...) wmb_inuse=%d\n",message,wmb_inuse); @@ -531,8 +535,6 @@ r=(RECT *)aboutbox; wmb->aboutbox = (*r); } - strcpy(wmb->class,classname); - pwin = parent==NULL ? GetDesktopWindow() : (HWND)parent; if (wmb->hwnd!=NULL) return(-1); wmb->hinstance=(HINSTANCE)GetModuleHandle(0); @@ -540,25 +542,18 @@ wmb->maxwidth = maxwidth_pixels; wmb->parent = parent; wmb->modal = modal; + wmb->title = title; + { + int brgcolor; brgcolor=((rgbcolor&0xff0000)>>16)|(rgbcolor&0xff00)|((rgbcolor&0xff)<<16); wmb->brush=CreateSolidBrush(brgcolor); - wndclass.cbSize = sizeof (wndclass) ; - wndclass.style = CS_HREDRAW | CS_VREDRAW ; - wndclass.lpfnWndProc = myproc==NULL ? (WNDPROC)winmbox_def_proc : (WNDPROC)myproc; - wndclass.cbClsExtra = 0; - wndclass.cbWndExtra = 0; - wndclass.hInstance = wmb->hinstance; - wndclass.hIcon = NULL; - wndclass.hCursor = NULL; - wndclass.hbrBackground = wmb->brush; - wndclass.lpszMenuName = NULL; - wndclass.lpszClassName = classname; - wndclass.hIconSm = NULL; - - RegisterClassEx(&wndclass) ; - + } wmb->inbuf=inbuf; wmb->maxlen=maxlen; + wmb->inbuflabel[0]='\0'; + wmb->passbuf=NULL; + wmb->passmaxlen=0; + wmb->passlabel[0]='\0'; strncpy(wmb->b1,button1==NULL ? "" : button1,255); wmb->b1[255]='\0'; strncpy(wmb->b2,button2==NULL ? "" : button2,255); @@ -578,15 +573,141 @@ if (button3!=NULL) wmb->buttoncolor[2]=button_colors[2]; } + wmb->myproc=myproc; + status = winmbox_message_box_create(); + if (window!=NULL) + (*window)=(void *)wmb->hwnd; + return(status); + } + + +/* +** Present a Windows Dialog Box w/up to three buttons and a text entry box if desired. +** +** Inputs: +** parent = parent window (HWND). Can be NULL in which case the Desktop window is used. 
+** title = window title +** username = receives user name, max len umaxlen +** password = receives password, max len pmaxlen +** fontsize_pixels = font size to be used in dialog box. +** maxwidth_pixels = max width of dialog box (min width is 400) +** rgbcolor = background color of dialog box. +** +** Return values: +** inbuf[] gets text entry +** 0 = could not open dialog window +** -1 = Escape key press +** 1 = Button 1 pressed +** 2 = Button 2 pressed +** 3 = Button 3 pressed +** 4 = pressed (if no default button) +** +*/ +int winmbox_login_box(void *parent,char *title, + char *message, + char *username,int umaxlen, + char *password,int pmaxlen, + int fontsize_pixels,int maxwidth_pixels, + int rgbcolor,int *button_colors) + + { +/* +ansi_dprintf(NULL,"@winmbox_message_box_ex(%s,...) wmb_inuse=%d\n",message,wmb_inuse); +*/ + winmbox_init(); + if (wmb_inuse) + return(0); + wmb_inuse=1; + wmb=&_wmb; + wmb->messagebox.left = wmb->messagebox.right=0; + wmb->aboutbox.left=wmb->aboutbox.right=0; + if (wmb->hwnd!=NULL) + return(-1); + wmb->hinstance=(HINSTANCE)GetModuleHandle(0); + wmb->mfsize = fontsize_pixels; + wmb->maxwidth = maxwidth_pixels; + wmb->parent = parent; + wmb->modal = 1; + wmb->title = title; + { + int brgcolor; + brgcolor=((rgbcolor&0xff0000)>>16)|(rgbcolor&0xff00)|((rgbcolor&0xff)<<16); + wmb->brush=CreateSolidBrush(brgcolor); + } + wmb->inbuf=username; + wmb->maxlen=umaxlen; + strcpy(wmb->inbuflabel,"User name: "); + wmb->passbuf=password; + wmb->passbuf[0]='\0'; + wmb->passmaxlen=pmaxlen; + strcpy(wmb->passlabel,"Password: "); + strcpy(wmb->b1,"*&Login"); + strcpy(wmb->b2,"&Cancel"); + wmb->b3[0]='\0'; + strncpy(wmb->msg,message==NULL ? 
"" : message,1023); + wmb->msg[1023]='\0'; + if (button_colors==NULL) + wmb->buttoncolor[0]=wmb->buttoncolor[1]=wmb->buttoncolor[2]=0xf0f0f0; + else + { + wmb->buttoncolor[0]=button_colors[0]; + wmb->buttoncolor[1]=button_colors[1]; + wmb->buttoncolor[2]=0xf0f0f0; + } + wmb->myproc=NULL; + return(winmbox_message_box_create()); + } + + +/* +** +** Global variable wmb must be completely populated before calling +** this function. +** +** Return values: +** inbuf[] gets text entry +** 0 = could not open dialog window +** -1 = Escape key press +** 1 = Button 1 pressed +** 2 = Button 2 pressed +** 3 = Button 3 pressed +** 4 = pressed (if no default button) +** +*/ +static int winmbox_message_box_create(void) + + { + WNDCLASSEX wndclass; + static char *classname="message_box"; + + strcpy(wmb->class,classname); + wndclass.cbSize = sizeof (wndclass) ; + wndclass.style = CS_HREDRAW | CS_VREDRAW ; + wndclass.lpfnWndProc = wmb->myproc==NULL ? (WNDPROC)winmbox_def_proc : (WNDPROC)wmb->myproc; + wndclass.cbClsExtra = 0; + wndclass.cbWndExtra = 0; + wndclass.hInstance = wmb->hinstance; + wndclass.hIcon = NULL; + wndclass.hCursor = NULL; + wndclass.hbrBackground = wmb->brush; + wndclass.lpszMenuName = NULL; + wndclass.lpszClassName = classname; + wndclass.hIconSm = NULL; + RegisterClassEx(&wndclass) ; + { + HWND pwin; + RECT rect; + int x0,y0; + + pwin = wmb->parent==NULL ? 
GetDesktopWindow() : (HWND)wmb->parent; GetWindowRect(pwin,&rect); winmbox_message_box_calc_size(wmb); x0 = rect.left + ((rect.right-rect.left)-wmb->width)/2; y0 = rect.top + ((rect.bottom-rect.top)-wmb->height)/2; - wmb->hwnd=CreateWindowEx(WS_EX_TOPMOST,classname,title,WS_OVERLAPPED, + wmb->hwnd=CreateWindowEx(WS_EX_TOPMOST,classname,wmb->title,WS_OVERLAPPED, x0,y0,wmb->width,wmb->height, NULL,NULL,0,NULL); - if (window!=NULL) - (*window)=(void *)wmb->hwnd; + } if (wmb->hwnd==NULL) { wmb_inuse=0; @@ -604,10 +725,12 @@ SetFocus(wmb->b2_hwnd); else if (wmb->b1_hwnd!=NULL) SetFocus(wmb->b1_hwnd); - if (myproc!=NULL) + if (wmb->myproc!=NULL) return(1); - if (modal && parent!=NULL) - EnableWindow(parent,0); + if (wmb->modal && wmb->parent!=NULL) + EnableWindow(wmb->parent,0); + { + MSG msg; while (wmb->hwnd!=NULL && GetMessage(&msg,NULL,0,0)) { if (!IsDialogMessage(wmb->hwnd,&msg)) @@ -616,15 +739,23 @@ DispatchMessage(&msg); } } + } return(wmb->status); } +void winmbox_terminate(void) + + { + SendMessage((HWND)wmb->hwnd,WM_CLOSE,0,0); + } + + static void winmbox_message_box_calc_size(WINMBOX *wmb) { SIZE b1,b2,b3,m; - int w; + int button_width; wmb->bfsize=wmb->mfsize; if (wmb->b1[0]!='\0' || wmb->b2[0]!='\0' || wmb->b3[0]!='\0') @@ -635,27 +766,27 @@ winmbox_text_extents(wmb->bf,wmb->b1,&b1,-1); winmbox_text_extents(wmb->bf,wmb->b2,&b2,-1); winmbox_text_extents(wmb->bf,wmb->b3,&b3,-1); - w=b1.cx+b2.cx+b3.cx+wmb->bfsize*5; - if (w<=wmb->maxwidth) + button_width=b1.cx+b2.cx+b3.cx+wmb->bfsize*5; + if (button_width<=wmb->maxwidth) break; - wmb->bfsize *= (wmb->maxwidth*.98)/w; + wmb->bfsize *= (wmb->maxwidth*.98)/button_width; DeleteObject(wmb->bf); } wmb->height=wmb->bfsize*2.; - if (w < 400) - w=400; + if (button_width < 400) + button_width=400; } else { wmb->height=0.; - w=wmb->maxwidth; + button_width=0; } wmb->mf=winmbox_get_font(wmb->mfsize); - winmbox_text_extents(wmb->mf,wmb->msg,&m,w); - /* wmb->width=w; */ - wmb->width=m.cx+10 > wmb->maxwidth ? 
wmb->maxwidth: m.cx+10; - if (wmb->widthwidth=w; + winmbox_text_extents(wmb->mf,wmb->msg,&m,wmb->maxwidth); + wmb->width=m.cx+50 > wmb->maxwidth ? wmb->maxwidth: m.cx+50; + /* Make sure the buttons fit */ + if (wmb->widthwidth=button_width; wmb->height += wmb->mfsize*2.+m.cy; if (wmb->aboutbox.right-wmb->aboutbox.left>0) { @@ -664,8 +795,12 @@ wmb->height += (wmb->aboutbox.bottom-wmb->aboutbox.top)*0.9; } else + { if (wmb->inbuf!=NULL) wmb->height += wmb->mfsize*5.; + if (wmb->passbuf!=NULL) + wmb->height += wmb->mfsize*2.; + } if (wmb->height < 200) wmb->height=200; } @@ -681,7 +816,7 @@ if (!wmb_inuse || wmb==NULL) return; - vcenter = (wmb->aboutbox.right-wmb->aboutbox.left==0 && wmb->inbuf==NULL + vcenter = (wmb->aboutbox.right-wmb->aboutbox.left==0 && wmb->inbuf==NULL && wmb->passbuf==NULL && wmb->b1[0]=='\0' && wmb->b2[0]=='\0' && wmb->b3[0]=='\0'); winmbox_text_extents(wmb->mf,message,&m,wmb->width); x1=(wmb->width-6-m.cx)/2; @@ -701,10 +836,18 @@ y1 -= (wmb->aboutbox.bottom-wmb->aboutbox.top)*0.9; n++; } - else if (wmb->inbuf!=NULL) + else { - y1-=(wmb->mfsize*1.2); - n++; + if (wmb->inbuf!=NULL) + { + y1-=(wmb->mfsize*1.2); + n++; + } + if (wmb->passbuf!=NULL) + { + y1-=(wmb->mfsize*1.2); + n++; + } } y1 = y1/n-wmb->mfsize/2.; } @@ -769,30 +912,81 @@ { dx=(int)((wmb->aboutbox.right-wmb->aboutbox.left)*0.9); dy=(int)((wmb->aboutbox.bottom-wmb->aboutbox.top)*0.88); + x1=(wmb->width - dx)/2; flags=WS_CHILD|WS_VISIBLE|WS_BORDER|WS_HSCROLL|WS_VSCROLL |ES_MULTILINE|ES_READONLY|WS_TABSTOP; } else { - dx=(int)(wmb->width*.9); - dy=(int)(wmb->mfsize*1.25); + SIZE lsize; + int dx1; + winmbox_text_extents(wmb->bf,wmb->inbuflabel,&lsize,-1); + dy=(int)(wmb->mfsize*1.2); + dx1=wmb->inbuflabel[0]=='\0'?0:0; + dx=(int)(wmb->width*.9-lsize.cx-dx1); + x1=0.05*wmb->width+lsize.cx+dx1; flags=WS_CHILD|WS_VISIBLE|WS_BORDER|WS_TABSTOP; } - x1=(wmb->width - dx)/2; y1 += wmb->mfsize; wmb->edit_hwnd=CreateWindow(wmbeditclass,"",flags, x1,y1,dx,dy, 
wmb->hwnd,(HMENU)10,wmb->hinstance,NULL); SendMessage(wmb->edit_hwnd,WM_SETFONT,(WPARAM)wmb->mf,1); - if (wmb->aboutbox.right - wmb->aboutbox.left > 0) - y1 += dy; - else - y1 += wmb->mfsize; if (wmb->inbuf!=NULL) { SendMessage(wmb->edit_hwnd,WM_SETTEXT,(WPARAM)0,(LPARAM)wmb->inbuf); if (wmb->aboutbox.right-wmb->aboutbox.left==0) SendMessage(wmb->edit_hwnd,EM_SETSEL,0,-1); + if (wmb->inbuflabel[0]!='\0') + { + HDC hdc; + SIZE size; + + hdc=GetDC(wmb->hwnd); + x1=0.05*wmb->width; + SetTextColor(hdc,0x000000); + SetBkMode(hdc,TRANSPARENT); + SetTextAlign(hdc,TA_TOP|TA_LEFT); + winmbox_display_text_1(wmb->mf,wmb->inbuflabel,&size,hdc, + x1,(int)(y1+wmb->mfsize*.2)); + ReleaseDC(wmb->hwnd,hdc); + } + } + if (wmb->aboutbox.right - wmb->aboutbox.left > 0) + y1 += dy; + else + y1 += wmb->mfsize; + if (wmb->passbuf!=NULL) + { + SIZE lsize; + int dx1; + + winmbox_text_extents(wmb->bf,wmb->passlabel,&lsize,-1); + dy=(int)(wmb->mfsize*1.2); + dx1=wmb->passlabel[0]=='\0'?0:0; + dx=(int)(wmb->width*.9-lsize.cx-dx1); + x1=0.05*wmb->width+lsize.cx+dx1; + y1 += wmb->mfsize*0.5; + flags=WS_CHILD|WS_VISIBLE|WS_BORDER|WS_TABSTOP|ES_PASSWORD; + wmb->pass_hwnd=CreateWindow(wmbeditclass,"",flags, + x1,y1,dx,dy, + wmb->hwnd,(HMENU)10,wmb->hinstance,NULL); + SendMessage(wmb->pass_hwnd,WM_SETFONT,(WPARAM)wmb->mf,1); + if (wmb->passlabel[0]!='\0') + { + HDC hdc; + SIZE size; + + hdc=GetDC(wmb->hwnd); + x1=0.05*wmb->width; + SetTextColor(hdc,0x000000); + SetBkMode(hdc,TRANSPARENT); + SetTextAlign(hdc,TA_TOP|TA_LEFT); + winmbox_display_text_1(wmb->mf,wmb->passlabel,&size,hdc, + x1,(int)(y1+wmb->mfsize*.2)); + ReleaseDC(wmb->hwnd,hdc); + } + y1 += wmb->mfsize; } } else @@ -1736,6 +1930,8 @@ { if (wmb->inbuf!=NULL && wmb->aboutbox.right-wmb->aboutbox.left==0) SendMessage(wmb->edit_hwnd,WM_GETTEXT,(WPARAM)wmb->maxlen-1,(LPARAM)wmb->inbuf); + if (wmb->passbuf!=NULL) + SendMessage(wmb->pass_hwnd,WM_GETTEXT,(WPARAM)wmb->passmaxlen-1,(LPARAM)wmb->passbuf); wmb->status=LOWORD(wParam); if 
(wmb->status==2) wmb->status=-1; @@ -1945,4 +2141,147 @@ return(hf); } + +static int winmbox_wait_last_pointer=0; +static int winmbox_wait_pointer_status=0; +static int winmbox_wait_allow_cancel=0; +static int winmbox_wait_cancelled=0; +static int winmbox_wait_busy_status=0; +static HWND winmbox_wait_hwnd=NULL; +static int winmbox_wait_bgcolor=0xffe0a0; +static int winmbox_wait_callback(HWND hwnd,UINT iMsg,WPARAM wParam,LPARAM lParam); + +void winmbox_wait(void *mainwin,char *message,int cancel_option) + + { + static int bcolors[3]={0xa00000,0xf0f0f0,0xf0f0f0}; + + if (winmbox_wait_hwnd!=NULL) + winmbox_wait_end(); + if (cancel_option) + { + winmbox_message_box(mainwin,"Please wait...",message, + "*&CANCEL",NULL,NULL,NULL,0,30,600,winmbox_wait_bgcolor, + (void *)winmbox_wait_callback,(void **)&winmbox_wait_hwnd,bcolors); + winmbox_wait_normal_pointer(); + winmbox_wait_allow_cancel=1; + } + else + { + winmbox_message_box(mainwin,"Please wait...",message, + NULL,NULL,NULL,NULL,0,30,600,winmbox_wait_bgcolor, + (void *)winmbox_wait_callback,(void **)&winmbox_wait_hwnd,NULL); + winmbox_wait_busy_pointer(); + winmbox_wait_allow_cancel=0; + } +/* +{ +FILE *f; +f=fopen("debug.txt","a"); +fprintf(f,"winmbox_wait_hwnd=%p\n",winmbox_wait_hwnd); +fclose(f); +} +*/ + winmbox_wait_cancelled=0; + winmbox_wait_busy_status=1; + winmbox_wait_bgcolor=0xffe0a0; + } + + +int winmbox_wait_cancel(void) + + { + winmbox_wait_proc_messages(); + return(winmbox_wait_cancelled); + } + + +void winmbox_wait_proc_messages(void) + + { + MSG msg; + int i; + + if (winmbox_wait_hwnd==NULL) + return; + for (i=0;i<50;i++) + { + if (!PeekMessage(&msg,winmbox_wait_hwnd,0,0,PM_REMOVE)) + break; + if (!IsDialogMessage(winmbox_wait_hwnd,&msg)) + { + TranslateMessage(&msg); + DispatchMessage(&msg); + } + } + } + + +static int winmbox_wait_callback(HWND hwnd,UINT iMsg,WPARAM wParam,LPARAM lParam) + + { + if (iMsg==WM_COMMAND) + { + int child_id; + + child_id = LOWORD(wParam); + /* Allow ENTER, ESC, or button 
press to Cancel */ + if (winmbox_wait_allow_cancel && (child_id>=1 && child_id<=3)) + { + winmbox_wait_cancelled=1; + winmbox_wait_end(); + } + } + return(winmbox_def_proc(hwnd,iMsg,wParam,(void *)lParam)); + } + + +void winmbox_wait_end(void) + + { + if (winmbox_wait_hwnd!=NULL) + { + winmbox_destroy(); + winmbox_wait_hwnd=NULL; + } + winmbox_wait_busy_status=0; + winmbox_wait_normal_pointer(); + } + + +int winmbox_wait_busy(void) + + { + return(winmbox_wait_busy_status); + } + + +void winmbox_wait_busy_pointer(void) + + { + winmbox_wait_last_pointer=winmbox_wait_pointer_status; + SetCursor(LoadCursor(NULL,IDC_WAIT)); + winmbox_wait_pointer_status=1; + } + + +void winmbox_wait_restore_pointer(void) + + { + if (winmbox_wait_last_pointer) + winmbox_wait_busy_pointer(); + else + winmbox_wait_normal_pointer(); + } + + +void winmbox_wait_normal_pointer(void) + + { + winmbox_wait_last_pointer=winmbox_wait_pointer_status; + SetCursor(LoadCursor(NULL,IDC_ARROW)); + winmbox_wait_pointer_status=0; + } + + #endif /* HAVE_WIN32_API */ diff -Nru k2pdfopt-2.42+ds/willuslib/wleptonica.c k2pdfopt-2.51+ds/willuslib/wleptonica.c --- k2pdfopt-2.42+ds/willuslib/wleptonica.c 2017-05-20 22:12:06.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wleptonica.c 2018-12-07 04:07:14.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. 
** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -21,6 +21,8 @@ */ #include #include "willus.h" + +#ifdef HAVE_LEPTONICA_LIB #include static void wlept_pix_from_bmp(PIX **pixptr,WILLUSBITMAP *bmp); @@ -155,3 +157,4 @@ dewarpaDestroy(&dewa); /* Includes dewarpDestroy of dew1 */ pixDestroy(&pix); } +#endif /* HAVE_LEPTONICA_LIB */ diff -Nru k2pdfopt-2.42+ds/willuslib/wmupdf.c k2pdfopt-2.51+ds/willuslib/wmupdf.c --- k2pdfopt-2.42+ds/willuslib/wmupdf.c 2017-02-25 08:26:29.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wmupdf.c 2018-12-22 17:22:56.000000000 +0000 @@ -4,7 +4,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -21,6 +21,7 @@ ** */ #include +#include #include "willus.h" #ifdef HAVE_Z_LIB @@ -32,7 +33,7 @@ void pdf_install_load_system_font_funcs(fz_context *ctx); static void info_update(fz_context *ctx,pdf_document *xref,char *producer,char *author,char *title); -static void dict_put_string(fz_context *ctx,pdf_document *doc,pdf_obj *dict,char *key,char *string); +static void dict_put_string(fz_context *ctx,pdf_obj *dict,char *key,char *string); static void wmupdf_object_bbox(fz_context *ctx,pdf_obj *srcpage,double *bbox_array,double *defbbox); static int wmupdf_pdfdoc_newpages(pdf_document *xref,fz_context *ctx,WPDFPAGEINFO *pageinfo, int use_forms,WPDFOUTLINE *wpdfoutline,FILE *out); @@ -71,7 +72,7 @@ static void pdf_create_outline(fz_context *ctx,pdf_document *doc,pdf_obj *outline_root,pdf_obj *orref,WPDFOUTLINE *outline); static void pdf_create_outline_1(fz_context *ctx,pdf_document *doc,pdf_obj *parent,pdf_obj *parentref,pdf_obj *dict,pdf_obj 
*dictref,int drefnum,WPDFOUTLINE *outline); static pdf_obj *anchor_reference(fz_context *ctx,pdf_document *doc,int pageno); -static pdf_obj *pdf_new_string_utf8(fz_context *ctx,pdf_document *doc,char *string); +static pdf_obj *pdf_new_string_utf8(fz_context *ctx,char *string); int wmupdf_numpages(char *filename) @@ -276,12 +277,12 @@ else newinfo=0; if (producer!=NULL && producer[0]!='\0') - dict_put_string(ctx,xref,info,"Producer",producer); + dict_put_string(ctx,info,"Producer",producer); if (author!=NULL && author[0]!='\0') - dict_put_string(ctx,xref,info,"Author",author); + dict_put_string(ctx,info,"Author",author); if (title!=NULL && title[0]!='\0') - dict_put_string(ctx,xref,info,"Title",title); - dict_put_string(ctx,xref,info,"ModDate",moddate); + dict_put_string(ctx,info,"Title",title); + dict_put_string(ctx,info,"ModDate",moddate); if (newinfo) { pdf_dict_puts(ctx,pdf_trailer(ctx,xref),"Info",info); @@ -290,12 +291,12 @@ } -static void dict_put_string(fz_context *ctx,pdf_document *doc,pdf_obj *dict,char *key,char *string) +static void dict_put_string(fz_context *ctx,pdf_obj *dict,char *key,char *string) { pdf_obj *value; - value=pdf_new_string(ctx,doc,string,strlen(string)); + value=pdf_new_string(ctx,string,strlen(string)); pdf_dict_puts(ctx,dict,key,value); pdf_drop_obj(ctx,value); } @@ -729,7 +730,7 @@ /* Update page count and kids array */ numpages = pdf_array_len(ctx,kids); - countobj = pdf_new_int(ctx,xref, numpages); + countobj = pdf_new_int(ctx,numpages); pdf_dict_puts(ctx,pages, "Count", countobj); pdf_drop_obj(ctx,countobj); pdf_dict_puts(ctx,pages, "Kids", kids); @@ -923,19 +924,19 @@ ** Once we turn the object into an XObject type (and not a Page type) ** it can no longer be looked up using pdf_lookup_page_obj() as of MuPDF v1.3 */ - pdf_dict_puts(ctx,srcpageobj,"Type",pdf_new_name(ctx,xref,"XObject")); - pdf_dict_puts(ctx,srcpageobj,"Subtype",pdf_new_name(ctx,xref,"Form")); - pdf_dict_puts(ctx,srcpageobj,"FormType",pdf_new_int(ctx,xref,1)); + 
pdf_dict_puts(ctx,srcpageobj,"Type",pdf_new_name(ctx,"XObject")); + pdf_dict_puts(ctx,srcpageobj,"Subtype",pdf_new_name(ctx,"Form")); + pdf_dict_puts(ctx,srcpageobj,"FormType",pdf_new_int(ctx,1)); if (compressed) - pdf_dict_puts(ctx,srcpageobj,"Filter",pdf_new_name(ctx,xref,"FlateDecode")); - pdf_dict_puts(ctx,srcpageobj,"Length",pdf_new_int(ctx,xref,streamlen)); + pdf_dict_puts(ctx,srcpageobj,"Filter",pdf_new_name(ctx,"FlateDecode")); + pdf_dict_puts(ctx,srcpageobj,"Length",pdf_new_int(ctx,streamlen)); array=pdf_new_array(ctx,xref,4); for (i=0;i<4;i++) - pdf_array_push(ctx,array,pdf_new_real(ctx,xref,bbox_array[i])); + pdf_array_push(ctx,array,pdf_new_real(ctx,bbox_array[i])); pdf_dict_puts(ctx,srcpageobj,"BBox",array); array=pdf_new_array(ctx,xref,6); for (i=0;i<6;i++) - pdf_array_push(ctx,array,pdf_new_real(ctx,xref,matrix[i])); + pdf_array_push(ctx,array,pdf_new_real(ctx,matrix[i])); pdf_dict_puts(ctx,srcpageobj,"Matrix",array); } @@ -1051,11 +1052,11 @@ whitespace[0]=' '; whitespace[1]='\0'; - fz_write_buffer(ctx,dstbuf,whitespace,1); + fz_append_data(ctx,dstbuf,whitespace,1); } /* mupdf 1.10a--replace write with append */ /* - fz_write_buffer(ctx,dstbuf,srcbuf->data,fz_buffer_storage(ctx,srcbuf,NULL)); + fz_append_data(ctx,dstbuf,srcbuf->data,fz_buffer_storage(ctx,srcbuf,NULL)); */ fz_append_buffer(ctx,dstbuf,srcbuf); dstlen=fz_buffer_storage(ctx,dstbuf,NULL); @@ -1086,12 +1087,12 @@ pdf_obj *mbox; pageobj=pdf_new_dict(ctx,doc,2); - pdf_dict_puts(ctx,pageobj,"Type",pdf_new_name(ctx,doc,"Page")); + pdf_dict_puts(ctx,pageobj,"Type",pdf_new_name(ctx,"Page")); mbox=pdf_new_array(ctx,doc,4); - pdf_array_push(ctx,mbox,pdf_new_real(ctx,doc,0.)); - pdf_array_push(ctx,mbox,pdf_new_real(ctx,doc,0.)); - pdf_array_push(ctx,mbox,pdf_new_real(ctx,doc,width_pts)); - pdf_array_push(ctx,mbox,pdf_new_real(ctx,doc,height_pts)); + pdf_array_push(ctx,mbox,pdf_new_real(ctx,0.)); + pdf_array_push(ctx,mbox,pdf_new_real(ctx,0.)); + 
pdf_array_push(ctx,mbox,pdf_new_real(ctx,width_pts)); + pdf_array_push(ctx,mbox,pdf_new_real(ctx,height_pts)); pdf_dict_puts(ctx,pageobj,"MediaBox",mbox); return(pageobj); } @@ -1114,7 +1115,7 @@ { pdf_obj *key = pdf_dict_get_key(ctx,olddests,i); pdf_obj *val = pdf_dict_get_val(ctx,olddests,i); - pdf_obj *key_str = pdf_new_string(ctx,xref,pdf_to_name(ctx,key),strlen(pdf_to_name(ctx,key))); + pdf_obj *key_str = pdf_new_string(ctx,pdf_to_name(ctx,key),strlen(pdf_to_name(ctx,key))); pdf_obj *dest = pdf_dict_gets(ctx,val,"D"); dest = pdf_array_get(ctx,dest ? dest : val, 0); @@ -1147,13 +1148,13 @@ ref = pdf_create_object(ctx,xref); obj = pdf_new_dict(ctx,xref,1); - len=pdf_new_int(ctx,xref,strlen(buf)); + len=pdf_new_int(ctx,strlen(buf)); pdf_dict_puts(ctx,obj,"Length",len); pdf_drop_obj(ctx,len); pdf_update_object(ctx,xref,ref,obj); pdf_drop_obj(ctx,obj); fzbuf=fz_new_buffer(ctx,strlen(buf)); - fz_write_buffer(ctx,fzbuf,(unsigned char *)buf,strlen(buf)); + fz_append_data(ctx,fzbuf,(unsigned char *)buf,strlen(buf)); wmupdf_update_stream(ctx,xref,ref,fzbuf); fz_drop_buffer(ctx,fzbuf); return(ref); @@ -1177,7 +1178,7 @@ obj = pdf_load_object(ctx,doc,num); if (obj!=NULL) { - pdf_dict_puts_drop(ctx,obj,"Length",pdf_new_int(ctx,doc,fz_buffer_storage(ctx,newbuf,NULL))); + pdf_dict_puts_drop(ctx,obj,"Length",pdf_new_int(ctx,fz_buffer_storage(ctx,newbuf,NULL))); /* if (!compressed) { @@ -1242,7 +1243,7 @@ if (okay_to_merge[i][0]!='\0') { /* Merge source dict into dest dict */ - wmupdf_dict_merge(ctx,pdf_to_name(ctx,key),dstval,value); + wmupdf_dict_merge(ctx,(char *)pdf_to_name(ctx,key),dstval,value); pdf_dict_put(ctx,dstdict,key,dstval); } else @@ -1253,7 +1254,7 @@ /* This works for ProcSet array, but maybe not for any array (e.g. 
rectangle) */ if (pdf_is_array(ctx,dstval) && pdf_is_array(ctx,value)) { - wmupdf_array_merge(ctx,pdf_to_name(ctx,key),dstval,value); + wmupdf_array_merge(ctx,(char *)pdf_to_name(ctx,key),dstval,value); return; } /* Last resort: overwrite with new value */ @@ -1393,7 +1394,6 @@ fz_document *doc=NULL; fz_display_list *list=NULL; fz_context *ctx; - fz_stext_sheet *textsheet=NULL; fz_page *page; fz_stext_page *text=NULL; fz_device *dev=NULL; @@ -1429,11 +1429,12 @@ fz_drop_context(ctx); return(-3); } + bounds=fz_bound_page(ctx,page); fz_try(ctx) { - list=fz_new_display_list(ctx,NULL); + list=fz_new_display_list(ctx,bounds); dev=fz_new_list_device(ctx,list); - fz_run_page(ctx,page,dev,&fz_identity,NULL); + fz_run_page(ctx,page,dev,fz_identity,NULL); } fz_always(ctx) { @@ -1450,18 +1451,17 @@ return(-4); } fz_var(text); - fz_bound_page(ctx,page,&bounds); + /* Mupdf v1.14: bounds.y1 > bounds.y0 */ wtc->width=fabs(bounds.x1-bounds.x0); wtc->height=fabs(bounds.y1-bounds.y0); - textsheet=fz_new_stext_sheet(ctx); fz_try(ctx) { /* options= FZ_STEXT_PRESERVE_LIGATURES | FZ_STEXT_PRESERVE_WHITESPACE; */ /* Do not preserve ligatures or white space */ if (list) - text=fz_new_stext_page_from_display_list(ctx,list,textsheet,0); + text=fz_new_stext_page_from_display_list(ctx,list,NULL); else - text=fz_new_stext_page_from_page(ctx,page,textsheet,0); + text=fz_new_stext_page_from_page(ctx,page,NULL); /* dev=fz_new_stext_device(ctx,textsheet,text,options); if (list) @@ -1482,7 +1482,6 @@ dev=NULL; */ fz_drop_stext_page(ctx,text); - fz_drop_stext_sheet(ctx,textsheet); fz_drop_display_list(ctx,list); fz_drop_page(ctx,page); fz_drop_document(ctx,doc); @@ -1507,180 +1506,158 @@ int boundingbox) { - int iblock,lig; + int lig; + fz_stext_block *block; lig=-1; - for (iblock=lig=0;iblocklen;iblock++) + for (lig=0,block=page->first_block;block;block=block->next) { - fz_stext_block *block; fz_stext_line *line; - char *s; - if (page->blocks[iblock].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != 
FZ_STEXT_BLOCK_TEXT) continue; - block=page->blocks[iblock].u.text; - for (line=block->lines;linelines+block->len;line++) + for (line=block->u.t.first_line;line;line=line->next) { - fz_stext_span *span; + fz_stext_char *ch; - for (span=line->first_span;span;span=span->next) + for (ch = line->first_char;ch;ch=ch->next) { - fz_stext_style *style=NULL; - int char_num; -/* -printf("Span:\n"); -printf(" len=%d, cap=%d\n",span->len,span->cap); -printf(" min=(%d,%d)\n",(int)span->min.x,(int)span->min.y); -printf(" max=(%d,%d)\n",(int)span->max.x,(int)span->max.y); -printf(" wmode=%d\n",span->wmode); -printf(" asmax=%g, dsmin=%g\n",span->ascender_max,span->descender_min); -printf(" bbox=(%g,%g) - (%g,%g)\n",span->bbox.x0,span->bbox.y0,span->bbox.x1,span->bbox.y1); -printf(" baseoff=%g\n",span->base_offset); -printf(" spacing=%g\n",span->spacing); -printf(" column=%d\n",span->column); -printf(" colwidth=%g\n",span->column_width); -printf(" align=%d\n",span->align); -printf(" indent=%g\n",span->indent); -*/ - for (char_num=0;char_numlen;char_num++) + fz_quad quad; + double dx,dy; + WTEXTCHAR textchar; +/* +printf("Char '%c' (%02Xh):\n",ch->c,ch->c); +printf(" size=%g\n",ch->size); +printf(" origin=(%g,%g)\n",ch->origin.x,ch->origin.y); +printf(" quad.ll=(%g,%g)\n",ch->quad.ll.x,ch->quad.ll.y); +printf(" quad.ul=(%g,%g)\n",ch->quad.ul.x,ch->quad.ul.y); +printf(" quad.lr=(%g,%g)\n",ch->quad.lr.x,ch->quad.lr.y); +printf(" quad.ur=(%g,%g)\n",ch->quad.ur.x,ch->quad.ur.y); +*/ + quad=ch->quad; + if (lig>0) + lig++; + /* Ligature char? */ + if (quad.ll.x==quad.lr.x && ch->c!=' ') + lig=1; + /* Skip space after ligature */ + if (lig==3 && ch->c==' ') { - fz_stext_char *ch; - fz_rect rect; - double dx,dy; - WTEXTCHAR textchar; - - ch=&span->text[char_num]; - if (ch->style != style) - { - char *fname; - /* style change if style!=NULL */ - style=ch->style; - fname=(char *)fz_font_name(ctx,style->font); - s=strchr(fname,'+'); - s= s ? 
s+1 : fname; - } - fz_stext_char_bbox(ctx,&rect,span,char_num); - if (lig>0) - lig++; - /* Ligature char? */ - if (rect.x0==rect.x1 && ch->c!=' ') - lig=1; - /* Skip space after ligature */ - if (lig==3 && ch->c==' ') - { - lig = -1; - continue; - } + lig = -1; + continue; + } #if 0 - /* - ** Deal correctly with ligatures - */ - /* Indicator of second char in ligature, e.g. 'i' in 'fi' */ - if (ch->p.x==0. && ch->p.y==0.) - { - if (char_num>0) - { - fz_stext_char *ch2; - fz_rect rect2; - ch2=&span->text[char_num-1]; - fz_stext_char_bbox(ctx,&rect2,span,char_num-1); - ch->p.y=ch2->p.y; - rect.y0+=ch2->p.y; - rect.y1+=ch2->p.y; - rect.x0=rect.x1=rect2.x0; - } - lig=1; - } - /* Indicator of first char in ligature, e.g. 'f' in 'fi' */ - else if (rect.x0==0. && rect.y0<0.) + /* + ** Deal correctly with ligatures + */ + /* Indicator of second char in ligature, e.g. 'i' in 'fi' */ + if (ch->p.x==0. && ch->p.y==0.) + { + if (char_num>0) { - rect.x0 = rect.x1; - rect.y0 += ch->p.y; + fz_stext_char *ch2; + fz_rect rect2; + ch2=&ch->text[char_num-1]; + rect2=ch2->bbox; + ch->p.y=ch2->p.y; + rect.y0+=ch2->p.y; + rect.y1+=ch2->p.y; + rect.x0=rect.x1=rect2.x0; } + lig=1; + } + /* Indicator of first char in ligature, e.g. 'f' in 'fi' */ + else if (rect.x0==0. && rect.y0<0.) + { + rect.x0 = rect.x1; + rect.y0 += ch->p.y; + } #endif - textchar.x1=rect.x0; - textchar.y1=rect.y0; - textchar.x2=rect.x1; - textchar.y2=rect.y1; - textchar.xp=ch->p.x; - textchar.yp=ch->p.y; - textchar.ucs=ch->c; - /* - ** Strange behavior in one particular PDF (modul1.pdf) file lead to this... - ** MuPDF bugzilla #695362: - ** "Incorrect structured-text character bounding boxes and character values" - ** Filed 13 July 2014 - */ - dx=textchar.x2-textchar.x1; - if (fabs(dx)>3000.) - { - if (fabs(textchar.x1-textchar.xp) < fabs(textchar.x2-textchar.xp)) - textchar.x2 = textchar.x1 + dx/1000.; - else - textchar.x1 = textchar.x2 - dx/1000.; - } - dy=textchar.y2-textchar.y1; - if (fabs(dy)>3000.) 
- { - if (fabs(textchar.y1-textchar.yp) < fabs(textchar.y2-textchar.yp)) - textchar.y2 = textchar.y1 + dy/1000.; - else - textchar.y1 = textchar.y2 - dy/1000.; - } + textchar.x1=quad.ll.x < quad.ul.x ? quad.ll.x : quad.ul.x; + textchar.y1=quad.ul.y > quad.ur.y ? quad.ul.y : quad.ur.y; + textchar.x2=quad.lr.x > quad.ur.x ? quad.lr.x : quad.ur.x; + textchar.y2=quad.ll.y < quad.lr.y ? quad.ll.y : quad.lr.y; + textchar.xp=ch->origin.x; + textchar.yp=ch->origin.y; + textchar.ucs=ch->c; + + /* + ** Strange behavior in one particular PDF (modul1.pdf) file lead to this... + ** MuPDF bugzilla #695362: + ** "Incorrect structured-text character bounding boxes and character values" + ** Filed 13 July 2014 + ** + ** (Not sure if we still need this in MuPDF v1.14) + */ + dx=textchar.x2-textchar.x1; + if (fabs(dx)>3000.) + { + if (fabs(textchar.x1-textchar.xp) < fabs(textchar.x2-textchar.xp)) + textchar.x2 = textchar.x1 + dx/1000.; + else + textchar.x1 = textchar.x2 - dx/1000.; + } + dy=textchar.y2-textchar.y1; + if (fabs(dy)>3000.) 
+ { + if (fabs(textchar.y1-textchar.yp) < fabs(textchar.y2-textchar.yp)) + textchar.y2 = textchar.y1 + dy/1000.; + else + textchar.y1 = textchar.y2 - dy/1000.; + } /* printf("Char %4d: (%7.1f,%7.1f) - (%7.1f,%7.1f) (%7.1f,%7.1f)\n", ch->c,textchar.x1,textchar.y1,textchar.x2,textchar.y2,textchar.xp,textchar.yp); */ #if 0 - /* If just had ligature, adjust x-values */ - if (lig==2) + /* If just had ligature, adjust x-values */ + if (lig==2) + { + if (wtc->n>1) { - if (wtc->n>1) - { - double xmid; - WTEXTCHAR *tc1,*tc2; - tc1=&wtc->wtextchar[wtc->n-2]; - tc2=&wtc->wtextchar[wtc->n-1]; - xmid = (tc1->x1 + textchar.x1)/2.; - tc1->x2 = tc2->x1 = tc2->xp = xmid; - tc2->x2 = textchar.x1; - } - else if (wtc->n>0) - { - WTEXTCHAR *tc1; - tc1=&wtc->wtextchar[wtc->n-1]; - tc1->x2=textchar.x1; - } - lig=0; + double xmid; + WTEXTCHAR *tc1,*tc2; + tc1=&wtc->wtextchar[wtc->n-2]; + tc2=&wtc->wtextchar[wtc->n-1]; + xmid = (tc1->x1 + textchar.x1)/2.; + tc1->x2 = tc2->x1 = tc2->xp = xmid; + tc2->x2 = textchar.x1; } - else if (lig==1) - lig++; -#endif - if (boundingbox==0 || wtc->n<=0) + else if (wtc->n>0) { - wtextchars_add_wtextchar(wtc,&textchar); - /* Split difference in char widths for ligature */ - if (lig==2 && wtc->n>1) - { - wtc->wtextchar[wtc->n-1].xp = - wtc->wtextchar[wtc->n-2].x2 = wtc->wtextchar[wtc->n-1].x1 - = (wtc->wtextchar[wtc->n-2].x1+wtc->wtextchar[wtc->n-1].x2)/2.; - } + WTEXTCHAR *tc1; + tc1=&wtc->wtextchar[wtc->n-1]; + tc1->x2=textchar.x1; } - else + lig=0; + } + else if (lig==1) + lig++; +#endif + if (boundingbox==0 || wtc->n<=0) + { + wtextchars_add_wtextchar(wtc,&textchar); + /* Split difference in char widths for ligature */ + if (lig==2 && wtc->n>1) { - WTEXTCHAR *tc0; - tc0 = &wtc->wtextchar[0]; - if (textchar.x1 < tc0->x1) - tc0->x1 = textchar.x1; - if (textchar.x2 > tc0->x2) - tc0->x2 = textchar.x2; - if (textchar.y1 < tc0->y1) - tc0->y1 = textchar.y1; - if (textchar.y2 > tc0->y2) - tc0->y2 = textchar.y2; + wtc->wtextchar[wtc->n-1].xp = + 
wtc->wtextchar[wtc->n-2].x2 = wtc->wtextchar[wtc->n-1].x1 + = (wtc->wtextchar[wtc->n-2].x1+wtc->wtextchar[wtc->n-1].x2)/2.; } } + else + { + WTEXTCHAR *tc0; + tc0 = &wtc->wtextchar[0]; + if (textchar.x1 < tc0->x1) + tc0->x1 = textchar.x1; + if (textchar.x2 > tc0->x2) + tc0->x2 = textchar.x2; + if (textchar.y1 < tc0->y1) + tc0->y1 = textchar.y1; + if (textchar.y2 > tc0->y2) + tc0->y2 = textchar.y2; + } } } } @@ -1803,7 +1780,7 @@ pdf_obj *title,*nextdict,*nextdictref,*aref; int nextdictrefnum; - title=pdf_new_string_utf8(ctx,doc,outline->title); + title=pdf_new_string_utf8(ctx,outline->title); pdf_dict_puts(ctx,dict,"Title",title); pdf_drop_obj(ctx,title); aref=anchor_reference(ctx,doc,outline->dstpage); @@ -1847,7 +1824,7 @@ { pdf_obj *countobj; - countobj=pdf_new_int(ctx,doc,count); + countobj=pdf_new_int(ctx,count); pdf_dict_puts(ctx,parent,"Count",countobj); pdf_drop_obj(ctx,countobj); } @@ -1872,12 +1849,12 @@ anchorref = pdf_new_indirect(ctx,doc,arefnum,0); array = pdf_new_array(ctx,doc,2); pdf_array_push(ctx,array,pageref); - name = pdf_new_name(ctx,doc,"Fit"); + name = pdf_new_name(ctx,"Fit"); pdf_array_push(ctx,array,name); pdf_drop_obj(ctx,name); pdf_dict_puts(ctx,anchor,"D",array); pdf_drop_obj(ctx,array); - name = pdf_new_name(ctx,doc,"GoTo"); + name = pdf_new_name(ctx,"GoTo"); pdf_dict_puts(ctx,anchor,"S",name); pdf_drop_obj(ctx,name); pdf_update_object(ctx,doc,arefnum,anchor); @@ -1886,7 +1863,7 @@ } -static pdf_obj *pdf_new_string_utf8(fz_context *ctx,pdf_document *doc,char *string) +static pdf_obj *pdf_new_string_utf8(fz_context *ctx,char *string) { int *utf16; @@ -1909,7 +1886,7 @@ } utfbuf[j]='\0'; willus_mem_free((double **)&utf16,funcname); - pdfobj=pdf_new_string(ctx,doc,utfbuf,j); + pdfobj=pdf_new_string(ctx,utfbuf,j); willus_mem_free((double **)&utfbuf,funcname); return(pdfobj); } diff -Nru k2pdfopt-2.42+ds/willuslib/wmupdfinfo.c k2pdfopt-2.51+ds/willuslib/wmupdfinfo.c --- k2pdfopt-2.42+ds/willuslib/wmupdfinfo.c 2017-05-20 21:09:15.000000000 
+0000 +++ k2pdfopt-2.51+ds/willuslib/wmupdfinfo.c 2018-11-21 17:05:21.000000000 +0000 @@ -4,7 +4,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2017 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -208,26 +208,26 @@ pdf_document *doc = glo->doc; /* - fz_printf(ctx, out, "\nPDF-%d.%d\n", doc->version / 10, doc->version % 10); + fz_write_printf(ctx, out, "\nPDF-%d.%d\n", doc->version / 10, doc->version % 10); - obj = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info); + obj = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)); if (obj) { - fz_printf(ctx, out, "Info object (%d %d R):\n", pdf_to_num(ctx, obj), pdf_to_gen(ctx, obj)); + fz_write_printf(ctx, out, "Info object (%d %d R):\n", pdf_to_num(ctx, obj), pdf_to_gen(ctx, obj)); pdf_print_obj(ctx, out, pdf_resolve_indirect(ctx, obj), 1); } - obj = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Encrypt); + obj = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt)); if (obj) { - fz_printf(ctx, out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx, obj), pdf_to_gen(ctx, obj)); + fz_write_printf(ctx, out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx, obj), pdf_to_gen(ctx, obj)); pdf_print_obj(ctx, out, pdf_resolve_indirect(ctx, obj), 1); } - fz_printf(ctx, out, "\nPages: %d\n\n", glo->pagecount); + fz_write_printf(ctx, out, "\nPages: %d\n\n", glo->pagecount); } */ - fz_printf(ctx,out,"PDF VERSION: %d.%d\n",doc->version/10,doc->version%10); + fz_write_printf(ctx,out,"PDF VERSION: %d.%d\n",doc->version/10,doc->version%10); obj = pdf_dict_gets(ctx,pdf_trailer(ctx,doc), "Info"); if (obj) @@ -241,7 +241,7 @@ buf=malloc(n+2); if (buf==NULL) { - fz_printf(ctx,out,"Info object (%d %d R):\n",pdf_to_num(ctx,obj),pdf_to_gen(ctx,obj)); + fz_write_printf(ctx,out,"Info object (%d %d 
R):\n",pdf_to_num(ctx,obj),pdf_to_gen(ctx,obj)); pdf_print_obj(ctx,out,robj,1); } else @@ -263,18 +263,18 @@ sprintf(buf1,"PAGE SIZE: %.2f x %.2f in\n", (glo->dim[0].u.dim.bbox->x1-glo->dim[0].u.dim.bbox->x0)/72., (glo->dim[0].u.dim.bbox->y1-glo->dim[0].u.dim.bbox->y0)/72.); - fz_printf(ctx,out,"%s",buf1); + fz_write_printf(ctx,out,"%s",buf1); } else { if (glo->dims>1) - fz_printf(ctx,out,"PAGE SIZE: (varies)\n"); + fz_write_printf(ctx,out,"PAGE SIZE: (varies)\n"); } - fz_printf(ctx,out, "PAGES: %d\n\n", glo->pagecount); + fz_write_printf(ctx,out, "PAGES: %d\n\n", glo->pagecount); obj = pdf_dict_gets(ctx,pdf_trailer(ctx,doc), "Encrypt"); if (obj) { - fz_printf(ctx,out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx,obj), pdf_to_gen(ctx,obj)); + fz_write_printf(ctx,out, "\nEncryption object (%d %d R):\n", pdf_to_num(ctx,obj), pdf_to_gen(ctx,obj)); pdf_print_obj(ctx,out, pdf_resolve_indirect(ctx,obj), 1); } } @@ -302,15 +302,15 @@ int j; for (j=i+lenfn+1;buf[j]!='\0' && buf[j]!=')';j++); buf[j]='\0'; - fz_printf(ctx,out,"%s",label3); + fz_write_printf(ctx,out,"%s",label3); if (in_string(fieldname,"date")>=0) { char newdate[128]; date_convert(newdate,&buf[i+lenfn+1]); - fz_printf(ctx,out,"%s\n",newdate); + fz_write_printf(ctx,out,"%s\n",newdate); } else - fz_printf(ctx,out,"%s\n",&buf[i+lenfn+1]); + fz_write_printf(ctx,out,"%s\n",&buf[i+lenfn+1]); break; } } @@ -334,7 +334,7 @@ fclose(f); str_format_int_grouped(sizecommas,sz); sprintf(buf,"FILE SIZE: %.1f kB (%s bytes)\n",sz/1024.,sizecommas); - fz_printf(ctx,out,"%s",buf); + fz_write_printf(ctx,out,"%s",buf); } @@ -373,13 +373,13 @@ pdf_obj *obj; int j; - obj = pdf_dict_get(ctx, pageobj, PDF_NAME_MediaBox); + obj = pdf_dict_get(ctx, pageobj, PDF_NAME(MediaBox)); if (!pdf_is_array(ctx, obj)) return; - pdf_to_rect(ctx, obj, &bbox); + bbox=pdf_to_rect(ctx, obj); - obj = pdf_dict_get(ctx, pageobj, PDF_NAME_UserUnit); + obj = pdf_dict_get(ctx, pageobj, PDF_NAME(UserUnit)); if (pdf_is_real(ctx, obj)) { float unit = 
pdf_to_real(ctx, obj); @@ -429,10 +429,10 @@ continue; } - subtype = pdf_dict_get(ctx, fontdict, PDF_NAME_Subtype); - basefont = pdf_dict_get(ctx, fontdict, PDF_NAME_BaseFont); + subtype = pdf_dict_get(ctx, fontdict, PDF_NAME(Subtype)); + basefont = pdf_dict_get(ctx, fontdict, PDF_NAME(BaseFont)); if (!basefont || pdf_is_null(ctx, basefont)) - name = pdf_dict_get(ctx, fontdict, PDF_NAME_Name); + name = pdf_dict_get(ctx, fontdict, PDF_NAME(Name)); for (k = 0; k < glo->fonts; k++) if (!pdf_objcmp(ctx, glo->font[k].u.font.obj, fontdict)) @@ -478,20 +478,20 @@ continue; } - type = pdf_dict_get(ctx, imagedict, PDF_NAME_Subtype); - if (!pdf_name_eq(ctx, type, PDF_NAME_Image)) + type = pdf_dict_get(ctx, imagedict, PDF_NAME(Subtype)); + if (!pdf_name_eq(ctx, type, PDF_NAME(Image))) continue; - filter = pdf_dict_get(ctx, imagedict, PDF_NAME_Filter); + filter = pdf_dict_get(ctx, imagedict, PDF_NAME(Filter)); altcs = NULL; - cs = pdf_dict_get(ctx, imagedict, PDF_NAME_ColorSpace); + cs = pdf_dict_get(ctx, imagedict, PDF_NAME(ColorSpace)); if (pdf_is_array(ctx, cs)) { pdf_obj *cses = cs; cs = pdf_array_get(ctx, cses, 0); - if (pdf_name_eq(ctx, cs, PDF_NAME_DeviceN) || pdf_name_eq(ctx, cs, PDF_NAME_Separation)) + if (pdf_name_eq(ctx, cs, PDF_NAME(DeviceN)) || pdf_name_eq(ctx, cs, PDF_NAME(Separation))) { altcs = pdf_array_get(ctx, cses, 2); if (pdf_is_array(ctx, altcs)) @@ -499,9 +499,9 @@ } } - width = pdf_dict_get(ctx, imagedict, PDF_NAME_Width); - height = pdf_dict_get(ctx, imagedict, PDF_NAME_Height); - bpc = pdf_dict_get(ctx, imagedict, PDF_NAME_BitsPerComponent); + width = pdf_dict_get(ctx, imagedict, PDF_NAME(Width)); + height = pdf_dict_get(ctx, imagedict, PDF_NAME(Height)); + bpc = pdf_dict_get(ctx, imagedict, PDF_NAME(BitsPerComponent)); for (k = 0; k < glo->images; k++) if (!pdf_objcmp(ctx, glo->image[k].u.image.obj, imagedict)) @@ -549,17 +549,17 @@ continue; } - type = pdf_dict_get(ctx, xobjdict, PDF_NAME_Subtype); - if (!pdf_name_eq(ctx, type, PDF_NAME_Form)) + 
type = pdf_dict_get(ctx, xobjdict, PDF_NAME(Subtype)); + if (!pdf_name_eq(ctx, type, PDF_NAME(Form))) continue; - subtype = pdf_dict_get(ctx, xobjdict, PDF_NAME_Subtype2); - if (!pdf_name_eq(ctx, subtype, PDF_NAME_PS)) + subtype = pdf_dict_get(ctx, xobjdict, PDF_NAME(Subtype2)); + if (!pdf_name_eq(ctx, subtype, PDF_NAME(PS))) continue; - group = pdf_dict_get(ctx, xobjdict, PDF_NAME_Group); - groupsubtype = pdf_dict_get(ctx, group, PDF_NAME_S); - reference = pdf_dict_get(ctx, xobjdict, PDF_NAME_Ref); + group = pdf_dict_get(ctx, xobjdict, PDF_NAME(Group)); + groupsubtype = pdf_dict_get(ctx, group, PDF_NAME(S)); + reference = pdf_dict_get(ctx, xobjdict, PDF_NAME(Ref)); for (k = 0; k < glo->forms; k++) if (!pdf_objcmp(ctx, glo->form[k].u.form.obj, xobjdict)) @@ -600,10 +600,10 @@ continue; } - type = pdf_dict_get(ctx, xobjdict, PDF_NAME_Subtype); - subtype = pdf_dict_get(ctx, xobjdict, PDF_NAME_Subtype2); - if (!pdf_name_eq(ctx, type, PDF_NAME_PS) && - (!pdf_name_eq(ctx, type, PDF_NAME_Form) || !pdf_name_eq(ctx, subtype, PDF_NAME_PS))) + type = pdf_dict_get(ctx, xobjdict, PDF_NAME(Subtype)); + subtype = pdf_dict_get(ctx, xobjdict, PDF_NAME(Subtype2)); + if (!pdf_name_eq(ctx, type, PDF_NAME(PS)) && + (!pdf_name_eq(ctx, type, PDF_NAME(Form)) || !pdf_name_eq(ctx, subtype, PDF_NAME(PS)))) continue; for (k = 0; k < glo->psobjs; k++) @@ -642,7 +642,7 @@ continue; } - type = pdf_dict_get(ctx, shade, PDF_NAME_ShadingType); + type = pdf_dict_get(ctx, shade, PDF_NAME(ShadingType)); if (!pdf_is_int(ctx, type) || pdf_to_int(ctx, type) < 1 || pdf_to_int(ctx, type) > 7) { fz_warn(ctx, "not a shading type (%d %d R)", pdf_to_num(ctx, shade), pdf_to_gen(ctx, shade)); @@ -689,7 +689,7 @@ continue; } - type = pdf_dict_get(ctx, patterndict, PDF_NAME_PatternType); + type = pdf_dict_get(ctx, patterndict, PDF_NAME(PatternType)); if (!pdf_is_int(ctx, type) || pdf_to_int(ctx, type) < 1 || pdf_to_int(ctx, type) > 2) { fz_warn(ctx, "not a pattern type (%d %d R)", pdf_to_num(ctx, patterndict), 
pdf_to_gen(ctx, patterndict)); @@ -698,14 +698,14 @@ if (pdf_to_int(ctx, type) == 1) { - paint = pdf_dict_get(ctx, patterndict, PDF_NAME_PaintType); + paint = pdf_dict_get(ctx, patterndict, PDF_NAME(PaintType)); if (!pdf_is_int(ctx, paint) || pdf_to_int(ctx, paint) < 1 || pdf_to_int(ctx, paint) > 2) { fz_warn(ctx, "not a pattern paint type (%d %d R)", pdf_to_num(ctx, patterndict), pdf_to_gen(ctx, patterndict)); paint = NULL; } - tiling = pdf_dict_get(ctx, patterndict, PDF_NAME_TilingType); + tiling = pdf_dict_get(ctx, patterndict, PDF_NAME(TilingType)); if (!pdf_is_int(ctx, tiling) || pdf_to_int(ctx, tiling) < 1 || pdf_to_int(ctx, tiling) > 3) { fz_warn(ctx, "not a pattern tiling type (%d %d R)", pdf_to_num(ctx, patterndict), pdf_to_gen(ctx, patterndict)); @@ -714,7 +714,7 @@ } else { - shading = pdf_dict_get(ctx, patterndict, PDF_NAME_Shading); + shading = pdf_dict_get(ctx, patterndict, PDF_NAME(Shading)); } for (k = 0; k < glo->patterns; k++) @@ -756,7 +756,7 @@ if (!pageobj) fz_throw(ctx, FZ_ERROR_GENERIC, "cannot retrieve info from page %d", page); - font = pdf_dict_get(ctx, rsrc, PDF_NAME_Font); + font = pdf_dict_get(ctx, rsrc, PDF_NAME(Font)); if (show & FONTS && font) { int n; @@ -767,13 +767,13 @@ { pdf_obj *obj = pdf_dict_get_val(ctx, font, i); - subrsrc = pdf_dict_get(ctx, obj, PDF_NAME_Resources); + subrsrc = pdf_dict_get(ctx, obj, PDF_NAME(Resources)); if (subrsrc && pdf_objcmp(ctx, rsrc, subrsrc)) gatherresourceinfo(ctx, glo, page, subrsrc, show); } } - xobj = pdf_dict_get(ctx, rsrc, PDF_NAME_XObject); + xobj = pdf_dict_get(ctx, rsrc, PDF_NAME(XObject)); if (show & XOBJS && xobj) { int n; @@ -785,17 +785,17 @@ for (i = 0; i < n; i++) { pdf_obj *obj = pdf_dict_get_val(ctx, xobj, i); - subrsrc = pdf_dict_get(ctx, obj, PDF_NAME_Resources); + subrsrc = pdf_dict_get(ctx, obj, PDF_NAME(Resources)); if (subrsrc && pdf_objcmp(ctx, rsrc, subrsrc)) gatherresourceinfo(ctx, glo, page, subrsrc, show); } } - shade = pdf_dict_get(ctx, rsrc, PDF_NAME_Shading); + shade 
= pdf_dict_get(ctx, rsrc, PDF_NAME(Shading)); if (show & SHADINGS && shade) gathershadings(ctx, glo, page, pageref, pageobj, shade); - pattern = pdf_dict_get(ctx, rsrc, PDF_NAME_Pattern); + pattern = pdf_dict_get(ctx, rsrc, PDF_NAME(Pattern)); if (show & PATTERNS && pattern) { int n; @@ -804,7 +804,7 @@ for (i = 0; i < n; i++) { pdf_obj *obj = pdf_dict_get_val(ctx, pattern, i); - subrsrc = pdf_dict_get(ctx, obj, PDF_NAME_Resources); + subrsrc = pdf_dict_get(ctx, obj, PDF_NAME(Resources)); if (subrsrc && pdf_objcmp(ctx, rsrc, subrsrc)) gatherresourceinfo(ctx, glo, page, subrsrc, show); } @@ -819,7 +819,7 @@ if (page > glo->pagecount) { - fz_printf(ctx,glo->out,"[Error: Page %d not found.]\n",page); + fz_write_printf(ctx,glo->out,"[Error: Page %d not found.]\n",page); return; } pageref = pdf_lookup_page_obj(ctx, glo->doc, page-1); @@ -830,7 +830,7 @@ gatherdimensions(ctx, glo, page, pageref, pageobj); - rsrc = pdf_dict_get(ctx, pageobj, PDF_NAME_Resources); + rsrc = pdf_dict_get(ctx, pageobj, PDF_NAME(Resources)); gatherresourceinfo(ctx, glo, page, rsrc, show); } @@ -845,7 +845,7 @@ if (show & DIMENSIONS && glo->dims > 0) { - fz_printf(ctx, out, "Mediaboxes (%d):\n", glo->dims); + fz_write_printf(ctx, out, "Mediaboxes (%d):\n", glo->dims); for (i = 0; i < glo->dims; i++) { char buf1[64]; @@ -853,7 +853,7 @@ sprintf(buf1,"%.2f x %.2f", (glo->dim[i].u.dim.bbox->x1-glo->dim[i].u.dim.bbox->x0)/72., (glo->dim[i].u.dim.bbox->y1-glo->dim[i].u.dim.bbox->y0)/72.); - fz_printf(ctx, out, PAGE_FMT "[ %g %g %g %g ] (%s in)\n", + fz_write_printf(ctx, out, PAGE_FMT "[ %g %g %g %g ] (%s in)\n", glo->dim[i].page, pdf_to_num(ctx, glo->dim[i].pageref), pdf_to_gen(ctx, glo->dim[i].pageref), @@ -862,15 +862,15 @@ glo->dim[i].u.dim.bbox->x1, glo->dim[i].u.dim.bbox->y1,buf1); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & FONTS && glo->fonts > 0) { - fz_printf(ctx, out, "Fonts (%d):\n", glo->fonts); + fz_write_printf(ctx, out, "Fonts (%d):\n", glo->fonts); 
for (i = 0; i < glo->fonts; i++) { - fz_printf(ctx, out, PAGE_FMT "%s '%s' (%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "%s '%s' (%d %d R)\n", glo->font[i].page, pdf_to_num(ctx, glo->font[i].pageref), pdf_to_gen(ctx, glo->font[i].pageref), @@ -879,18 +879,18 @@ pdf_to_num(ctx, glo->font[i].u.font.obj), pdf_to_gen(ctx, glo->font[i].u.font.obj)); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & IMAGES && glo->images > 0) { - fz_printf(ctx, out, "Images (%d):\n", glo->images); + fz_write_printf(ctx, out, "Images (%d):\n", glo->images); for (i = 0; i < glo->images; i++) { char *cs = NULL; char *altcs = NULL; - fz_printf(ctx, out, PAGE_FMT "[ ", + fz_write_printf(ctx, out, PAGE_FMT "[ ", glo->image[i].page, pdf_to_num(ctx, glo->image[i].pageref), pdf_to_gen(ctx, glo->image[i].pageref)); @@ -906,7 +906,7 @@ if (strstr(filter, "Decode")) *(strstr(filter, "Decode")) = '\0'; - fz_printf(ctx, out, "%s%s", + fz_write_printf(ctx, out, "%s%s", filter, j == pdf_array_len(ctx, glo->image[i].u.image.filter) - 1 ? "" : " "); fz_free(ctx, filter); @@ -920,11 +920,11 @@ if (strstr(filter, "Decode")) *(strstr(filter, "Decode")) = '\0'; - fz_printf(ctx, out, "%s", filter); + fz_write_printf(ctx, out, "%s", filter); fz_free(ctx, filter); } else - fz_printf(ctx, out, "Raw"); + fz_write_printf(ctx, out, "Raw"); if (glo->image[i].u.image.cs) { @@ -965,7 +965,7 @@ fz_strlcpy(altcs, "Sep", 4); } - fz_printf(ctx, out, " ] %dx%d %dbpc %s%s%s (%d %d R)\n", + fz_write_printf(ctx, out, " ] %dx%d %dbpc %s%s%s (%d %d R)\n", pdf_to_int(ctx, glo->image[i].u.image.width), pdf_to_int(ctx, glo->image[i].u.image.height), glo->image[i].u.image.bpc ? 
pdf_to_int(ctx, glo->image[i].u.image.bpc) : 1, @@ -978,12 +978,12 @@ fz_free(ctx, cs); fz_free(ctx, altcs); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & SHADINGS && glo->shadings > 0) { - fz_printf(ctx, out, "Shading patterns (%d):\n", glo->shadings); + fz_write_printf(ctx, out, "Shading patterns (%d):\n", glo->shadings); for (i = 0; i < glo->shadings; i++) { char *shadingtype[] = @@ -998,7 +998,7 @@ "Tensor patch", }; - fz_printf(ctx, out, PAGE_FMT "%s (%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "%s (%d %d R)\n", glo->shading[i].page, pdf_to_num(ctx, glo->shading[i].pageref), pdf_to_gen(ctx, glo->shading[i].pageref), @@ -1006,12 +1006,12 @@ pdf_to_num(ctx, glo->shading[i].u.shading.obj), pdf_to_gen(ctx, glo->shading[i].u.shading.obj)); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & PATTERNS && glo->patterns > 0) { - fz_printf(ctx, out, "Patterns (%d):\n", glo->patterns); + fz_write_printf(ctx, out, "Patterns (%d):\n", glo->patterns); for (i = 0; i < glo->patterns; i++) { if (pdf_to_int(ctx, glo->pattern[i].u.pattern.type) == 1) @@ -1030,7 +1030,7 @@ "Constant/fast tiling", }; - fz_printf(ctx, out, PAGE_FMT "Tiling %s %s (%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "Tiling %s %s (%d %d R)\n", glo->pattern[i].page, pdf_to_num(ctx, glo->pattern[i].pageref), pdf_to_gen(ctx, glo->pattern[i].pageref), @@ -1041,7 +1041,7 @@ } else { - fz_printf(ctx, out, PAGE_FMT "Shading %d %d R (%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "Shading %d %d R (%d %d R)\n", glo->pattern[i].page, pdf_to_num(ctx, glo->pattern[i].pageref), pdf_to_gen(ctx, glo->pattern[i].pageref), @@ -1051,15 +1051,15 @@ pdf_to_gen(ctx, glo->pattern[i].u.pattern.obj)); } } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & XOBJS && glo->forms > 0) { - fz_printf(ctx, out, "Form xobjects (%d):\n", glo->forms); + fz_write_printf(ctx, out, "Form xobjects (%d):\n", glo->forms); for (i = 0; i < glo->forms; 
i++) { - fz_printf(ctx, out, PAGE_FMT "Form%s%s%s%s (%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "Form%s%s%s%s (%d %d R)\n", glo->form[i].page, pdf_to_num(ctx, glo->form[i].pageref), pdf_to_gen(ctx, glo->form[i].pageref), @@ -1070,22 +1070,22 @@ pdf_to_num(ctx, glo->form[i].u.form.obj), pdf_to_gen(ctx, glo->form[i].u.form.obj)); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } if (show & XOBJS && glo->psobjs > 0) { - fz_printf(ctx, out, "Postscript xobjects (%d):\n", glo->psobjs); + fz_write_printf(ctx, out, "Postscript xobjects (%d):\n", glo->psobjs); for (i = 0; i < glo->psobjs; i++) { - fz_printf(ctx, out, PAGE_FMT "(%d %d R)\n", + fz_write_printf(ctx, out, PAGE_FMT "(%d %d R)\n", glo->psobj[i].page, pdf_to_num(ctx, glo->psobj[i].pageref), pdf_to_gen(ctx, glo->psobj[i].pageref), pdf_to_num(ctx, glo->psobj[i].u.form.obj), pdf_to_gen(ctx, glo->psobj[i].u.form.obj)); } - fz_printf(ctx, out, "\n"); + fz_write_printf(ctx, out, "\n"); } } @@ -1121,7 +1121,7 @@ glo.out = out; glo.ctx = ctx; - fz_printf(ctx,out,"FILE: %s\n",filename); + fz_write_printf(ctx,out,"FILE: %s\n",filename); glo.doc = pdf_open_document(ctx,filename); if (pdf_needs_password(ctx,glo.doc)) if (!pdf_authenticate_password(ctx, glo.doc, password)) @@ -1129,7 +1129,7 @@ glo.pagecount=pdf_count_pages(ctx,glo.doc); gather_all_info(ctx,&glo,filename,show,pagelist); showglobalinfo(ctx,&glo,filename); - fz_printf(ctx,glo.out," Page Ref Details\n"); + fz_write_printf(ctx,glo.out," Page Ref Details\n"); printinfo(ctx,&glo,filename,show); closexref(ctx,&glo); } @@ -1149,7 +1149,7 @@ closexref(ctx, &glo); filename = argv[argidx]; - fz_printf(ctx, out, "%s:\n", filename); + fz_write_printf(ctx, out, "%s:\n", filename); glo.doc = pdf_open_document(glo.ctx, filename); if (pdf_needs_password(ctx, glo.doc)) if (!pdf_authenticate_password(ctx, glo.doc, password)) @@ -1211,9 +1211,11 @@ */ (*buf)=NULL; wfile_abstmpnam(tempname); + /* fout=fopen(tempname,"w"); if (fout==NULL) return; + */ ctx 
= fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); if (!ctx) @@ -1227,13 +1229,14 @@ /* ret = 0; */ fz_try(ctx) { - out = fz_new_output_with_file_ptr(ctx,fout,1); + out = fz_new_output_with_path(ctx,tempname,0); pdfinfo_info(ctx,out,filename,password,show,pagelist); } fz_catch(ctx) { - /* ret = 1; */ + return; } + fz_close_output(ctx,out); fz_drop_output(ctx,out); fz_drop_context(ctx); /* fclose(fout); */ diff -Nru k2pdfopt-2.42+ds/willuslib/wsys.c k2pdfopt-2.51+ds/willuslib/wsys.c --- k2pdfopt-2.42+ds/willuslib/wsys.c 2016-12-31 17:35:55.000000000 +0000 +++ k2pdfopt-2.51+ds/willuslib/wsys.c 2018-11-21 17:06:05.000000000 +0000 @@ -3,7 +3,7 @@ ** ** Part of willus.com general purpose C code library. ** -** Copyright (C) 2016 http://willus.com +** Copyright (C) 2018 http://willus.com ** ** This program is free software: you can redistribute it and/or modify ** it under the terms of the GNU Affero General Public License as @@ -33,6 +33,7 @@ #include #include #endif +#include /* MinGW has this -- using for file locking */ /* Digital Mars: __DMC__ == 0x700 (7.0) 0x720 (7.2) 0x800 (8.0) @@ -363,6 +364,7 @@ } +#ifndef NO_FILELIST int wsys_most_recent_in_path(char *exename,char *wildcard) { @@ -376,6 +378,7 @@ #endif #endif } +#endif void wsys_computer_name(char *name,int maxlen) @@ -454,7 +457,7 @@ { double tz; int c,hr,min; - static char buf[8]; + static char buf[32]; tz = wsys_utc_offset(); if (tz<0) @@ -640,3 +643,32 @@ value[maxlen-1]='\0'; return(0); } + +/* +** Returns -1 for no lock, otherwise, file lock obtained +** and returns file descriptor. +*/ +int wsys_file_lock(char *filename) + + { + return(open(filename,O_CREAT|O_EXCL,0644)); + } + + +/* +** Returns -1 or -2 for failure +** Returns 0 for file correctly unlocked +*/ +int wsys_file_unlock(char *filename,int fd) + + { + int status; + + status=close(fd); + if (status!=0) + return(-1); + status=remove(filename); + if (status!=0) + return(-2); + return(0); + }