From 526e44e59f8672bc44745347b5cb84033dbeef13 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Thu, 11 Jun 2026 22:18:36 +0200 Subject: [PATCH] Improve word-space detection in `getTextContent` The gap between two glyphs was turned into a space using a hardcoded `trackingSpaceMin` of `0.102 * fontSize`. Depending on the font this was either too large or too small, producing fake spaces inside words ("Robe rt") or a space between every letter of letter-spaced text ("R E A S O N S"). The threshold is now derived from the font's space width (or the glyph's advance width for Type3 fonts, where the font size is meaningless) and raised for uniformly letter-spaced runs. A space wrongly inserted before the first glyph of such a run is retracted once the next gap confirms it, and the adaptive state is reset on spacing or text-matrix changes so it can't leak across sections. It fixes #18768 and #16752. --- src/core/evaluator.js | 187 +++++++++++++++-- src/core/fonts.js | 9 +- src/core/intersector.js | 42 +++- test/pdfs/.gitignore | 9 + test/pdfs/issue16752_reduced.pdf | Bin 0 -> 654 bytes test/pdfs/issue18768_1.pdf.link | 1 + test/pdfs/issue18768_2.pdf.link | 1 + test/pdfs/issue18768_initial_narrow_gap.pdf | Bin 0 -> 599 bytes test/pdfs/issue18768_narrow_space_font.pdf | Bin 0 -> 876 bytes test/pdfs/issue18768_stateful_spacing.pdf | Bin 0 -> 1181 bytes test/pdfs/issue18768_text_matrix_spacing.pdf | Bin 0 -> 658 bytes test/pdfs/issue18768_type3.pdf | Bin 0 -> 951 bytes test/pdfs/issue18768_type3_wide_glyph.pdf | Bin 0 -> 1059 bytes test/pdfs/issue1936_reduced.pdf | Bin 0 -> 605 bytes test/pdfs/issue19954_reduced.pdf | Bin 0 -> 965 bytes test/test_manifest.json | 16 ++ test/unit/api_spec.js | 202 +++++++++++++++++++ test/unit/pdf_find_controller_spec.js | 16 +- 18 files changed, 453 insertions(+), 30 deletions(-) create mode 100644 test/pdfs/issue16752_reduced.pdf create mode 100644 test/pdfs/issue18768_1.pdf.link create mode 100644 test/pdfs/issue18768_2.pdf.link create mode 100644 test/pdfs/issue18768_initial_narrow_gap.pdf create mode 100644 test/pdfs/issue18768_narrow_space_font.pdf create mode 100644 test/pdfs/issue18768_stateful_spacing.pdf create mode 100644 test/pdfs/issue18768_text_matrix_spacing.pdf create mode 100644 test/pdfs/issue18768_type3.pdf create mode 100644 test/pdfs/issue18768_type3_wide_glyph.pdf create mode 100644 test/pdfs/issue1936_reduced.pdf create mode 100644 test/pdfs/issue19954_reduced.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index ca5b702b3d51e..df362e1cefa8f 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2418,6 +2418,12 @@ class PartialEvaluator { spaceInFlowMin: 0, spaceInFlowMax: 0, trackingSpaceMin: Infinity, + useCharWidthThreshold: false, + wideSpaceMax: Infinity, + minAdvance: Infinity, + pendingSpaceIndex: -1, + pendingSpaceAdvance: 0, + pendingSpaceExtraCharId: -1, negativeSpaceMax: -Infinity, notASpace: -Infinity, transform: null, @@ -2483,21 +2489,37 @@ class PartialEvaluator { // even if one is present in the text stream. const NOT_A_SPACE_FACTOR = 0.03; + // A reported space wider than fontSize * MAX_SPACE_WIDTH_FACTOR is treated + // as unreliable metrics, so trackingSpaceMin isn't derived from it. + const MAX_SPACE_WIDTH_FACTOR = 1 / 3; + // A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces // a break (a new chunk of text is created). // It doesn't change anything when the text is copied but // it improves potential mismatch between text layer and canvas. const NEGATIVE_SPACE_FACTOR = -0.2; - // A white with a width in [fontSize * MIN_FACTOR; fontSize * MAX_FACTOR] - // is a space which will be inserted in the current flow of words. + // A white with a width in [trackingSpaceMin; fontSize * MAX_FACTOR] is a + // space which will be inserted in the current flow of words. // If the width is outside of this range then the flow is broken // (which means a new span in the text layer). // It's useful to adjust the best as possible the span in the layer // to what is displayed in the canvas. - const SPACE_IN_FLOW_MIN_FACTOR = 0.102; const SPACE_IN_FLOW_MAX_FACTOR = 0.6; + // When every gap within a run is wider than the space threshold (e.g. + // letter-spaced text), the threshold is raised to this multiple of the + // smallest gap, capped at fontSize * WIDE_SPACE_MAX_FACTOR, so the gaps + // between letters of a single word aren't turned into spaces. + const WIDE_SPACE_MULT = 1.3; + const WIDE_SPACE_MAX_FACTOR = 0.4; + + // For fonts without usable space metrics (Type3), the space threshold is + // derived from the current glyph's own advance width instead of the font + // size, since the latter is unreliable when the font matrix does the + // scaling. + const CHAR_WIDTH_SPACE_FACTOR = 0.25; + // If a char is too high/too low compared to the previous we just create // a new chunk. // If the advance isn't in the +/-VERTICAL_SHIFT_RATIO * height range then @@ -2516,6 +2538,7 @@ class PartialEvaluator { const preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); let textState, currentTextState; + let lastFakeSpaceExtraCharId = -1; function pushWhitespace({ width = 0, @@ -2610,10 +2633,21 @@ class PartialEvaluator { textContentItem.textAdvanceScale = scaleCtmX * scaleLineX; const { fontSize } = textState; - textContentItem.trackingSpaceMin = fontSize * TRACKING_SPACE_FACTOR; + + let trackingFactor = TRACKING_SPACE_FACTOR; + if (!font.isType3Font) { + const spaceEm = font.spaceWidth / 1000; + if (spaceEm > 0 && spaceEm <= MAX_SPACE_WIDTH_FACTOR) { + trackingFactor = Math.max(0.5 * spaceEm, TRACKING_SPACE_FACTOR); + } + } + textContentItem.trackingSpaceMin = fontSize * trackingFactor; + textContentItem.useCharWidthThreshold = font.isType3Font; + textContentItem.spaceInFlowMin = fontSize * trackingFactor; + textContentItem.wideSpaceMax = fontSize * WIDE_SPACE_MAX_FACTOR; + resetAdaptiveSpacing(); textContentItem.notASpace = fontSize * NOT_A_SPACE_FACTOR; textContentItem.negativeSpaceMax = fontSize * NEGATIVE_SPACE_FACTOR; - textContentItem.spaceInFlowMin = fontSize * SPACE_IN_FLOW_MIN_FACTOR; textContentItem.spaceInFlowMax = fontSize * SPACE_IN_FLOW_MAX_FACTOR; textContentItem.hasEOL = false; @@ -2690,6 +2724,84 @@ class PartialEvaluator { ]; } + function getTrackingSpaceMin(glyphWidth) { + if (textContentItem.useCharWidthThreshold && glyphWidth) { + return ( + Math.abs(glyphWidth * textState.textHScale) * CHAR_WIDTH_SPACE_FACTOR + ); + } + return textContentItem.trackingSpaceMin; + } + + function getSpaceThreshold(trackingSpaceMin) { + const { minAdvance, wideSpaceMax } = textContentItem; + if ( + minAdvance > trackingSpaceMin && + minAdvance < WIDE_SPACE_MULT * trackingSpaceMin + ) { + return Math.min(WIDE_SPACE_MULT * minAdvance, wideSpaceMax); + } + return trackingSpaceMin; + } + + function resetAdaptiveSpacing() { + textContentItem.minAdvance = Infinity; + textContentItem.pendingSpaceIndex = -1; + textContentItem.pendingSpaceAdvance = 0; + textContentItem.pendingSpaceExtraCharId = -1; + } + + function recordPendingSpace(advance, textOrientation) { + if ( + textContentItem.useCharWidthThreshold || + textContentItem.minAdvance < Infinity + ) { + return; + } + textContentItem.pendingSpaceIndex = textContentItem.str.length - 1; + textContentItem.pendingSpaceAdvance = textOrientation * advance; + textContentItem.pendingSpaceExtraCharId = lastFakeSpaceExtraCharId; + } + + function resolvePendingSpace( + advance, + textOrientation, + trackingSpaceMin, + spaceThreshold + ) { + if (textContentItem.pendingSpaceIndex < 0) { + return; + } + const { pendingSpaceAdvance } = textContentItem; + const forwardAdvance = textOrientation * advance; + const baseline = Math.min(pendingSpaceAdvance, forwardAdvance); + if ( + baseline > trackingSpaceMin / WIDE_SPACE_MULT && + forwardAdvance <= spaceThreshold && + pendingSpaceAdvance <= spaceThreshold + ) { + textContentItem.str.splice(textContentItem.pendingSpaceIndex, 1); + if (textContentItem.pendingSpaceExtraCharId >= 0) { + intersector?.removeExtraChar(textContentItem.pendingSpaceExtraCharId); + } + } + textContentItem.pendingSpaceIndex = -1; + textContentItem.pendingSpaceExtraCharId = -1; + } + + function recordMinAdvance(advance, textOrientation, trackingSpaceMin) { + if (textContentItem.useCharWidthThreshold) { + return; + } + const forwardAdvance = textOrientation * advance; + if ( + forwardAdvance > trackingSpaceMin && + forwardAdvance < textContentItem.minAdvance + ) { + textContentItem.minAdvance = forwardAdvance; + } + } + function compareWithLastPosition(glyphWidth) { const currentTransform = getCurrentTextTransform(); let posX = currentTransform[4]; @@ -2722,6 +2834,7 @@ class PartialEvaluator { let lastPosY = textContentItem.prevTransform[5]; if (lastPosX === posX && lastPosY === posY) { + textContentItem.minAdvance = Infinity; return true; } @@ -2806,9 +2919,18 @@ class PartialEvaluator { // The real spacing between 2 consecutive chars is thin enough to be // considered a non-space. resetLastChars(); + textContentItem.minAdvance = Infinity; } - if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) { + const trackingSpaceMin = getTrackingSpaceMin(glyphWidth); + const spaceThreshold = getSpaceThreshold(trackingSpaceMin); + resolvePendingSpace( + advanceY, + textOrientation, + trackingSpaceMin, + spaceThreshold + ); + if (advanceY <= textOrientation * spaceThreshold) { if (shouldAddWhitepsace()) { // The space is very thin, hence it deserves to have its own span in // order to avoid too much shift between the canvas and the text @@ -2831,9 +2953,12 @@ class PartialEvaluator { pushWhitespace({ height: Math.abs(advanceY) }); } else { textContentItem.height += advanceY; + recordPendingSpace(advanceY, textOrientation); } } + recordMinAdvance(advanceY, textOrientation, trackingSpaceMin); + if (Math.abs(advanceX) > textContentItem.width * VERTICAL_SHIFT_RATIO) { flushTextContentItem(); } @@ -2886,9 +3011,18 @@ class PartialEvaluator { // The real spacing between 2 consecutive chars is thin enough to be // considered a non-space. resetLastChars(); + textContentItem.minAdvance = Infinity; } - if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) { + const trackingSpaceMin = getTrackingSpaceMin(glyphWidth); + const spaceThreshold = getSpaceThreshold(trackingSpaceMin); + resolvePendingSpace( + advanceX, + textOrientation, + trackingSpaceMin, + spaceThreshold + ); + if (advanceX <= textOrientation * spaceThreshold) { if (shouldAddWhitepsace()) { // The space is very thin, hence it deserves to have its own span in // order to avoid too much shift between the canvas and the text @@ -2907,9 +3041,12 @@ class PartialEvaluator { pushWhitespace({ width: Math.abs(advanceX) }); } else { textContentItem.width += advanceX; + recordPendingSpace(advanceX, textOrientation); } } + recordMinAdvance(advanceX, textOrientation, trackingSpaceMin); + if (Math.abs(advanceY) > textContentItem.height * VERTICAL_SHIFT_RATIO) { flushTextContentItem(); } @@ -3092,6 +3229,7 @@ class PartialEvaluator { } function addFakeSpaces(width, transf, textOrientation) { + lastFakeSpaceExtraCharId = -1; if ( textOrientation * textContentItem.spaceInFlowMin <= width && width <= textOrientation * textContentItem.spaceInFlowMax @@ -3099,7 +3237,7 @@ class PartialEvaluator { if (textContentItem.initialized) { resetLastChars(); textContentItem.str.push(" "); - intersector?.addExtraChar(" "); + lastFakeSpaceExtraCharId = intersector?.addExtraChar(" ") ?? -1; } return false; } @@ -3140,6 +3278,7 @@ class PartialEvaluator { textContent.items.push(runBidiTransform(textContentItem)); textContentItem.initialized = false; + resetAdaptiveSpacing(); textContentItem.str.length = 0; } @@ -3216,7 +3355,10 @@ class PartialEvaluator { textState.textRise = args[0]; break; case OPS.setHScale: - textState.textHScale = args[0] / 100; + if (textState.textHScale !== args[0] / 100) { + textState.textHScale = args[0] / 100; + resetAdaptiveSpacing(); + } break; case OPS.setLeading: textState.leading = args[0]; @@ -3224,14 +3366,17 @@ class PartialEvaluator { case OPS.moveText: textState.translateTextLineMatrix(args[0], args[1]); textState.textMatrix = textState.textLineMatrix.slice(); + resetAdaptiveSpacing(); break; case OPS.setLeadingMoveText: textState.leading = -args[1]; textState.translateTextLineMatrix(args[0], args[1]); textState.textMatrix = textState.textLineMatrix.slice(); + resetAdaptiveSpacing(); break; case OPS.nextLine: textState.carriageReturn(); + resetAdaptiveSpacing(); break; case OPS.setTextMatrix: textState.setTextMatrix( @@ -3251,16 +3396,24 @@ class PartialEvaluator { args[5] ); updateAdvanceScale(); + resetAdaptiveSpacing(); break; case OPS.setCharSpacing: - textState.charSpacing = args[0]; + if (textState.charSpacing !== args[0]) { + textState.charSpacing = args[0]; + resetAdaptiveSpacing(); + } break; case OPS.setWordSpacing: - textState.wordSpacing = args[0]; + if (textState.wordSpacing !== args[0]) { + textState.wordSpacing = args[0]; + resetAdaptiveSpacing(); + } break; case OPS.beginText: textState.textMatrix = IDENTITY_MATRIX.slice(); textState.textLineMatrix = IDENTITY_MATRIX.slice(); + resetAdaptiveSpacing(); break; case OPS.showSpacedText: if (!stateManager.state.font) { @@ -3318,6 +3471,7 @@ class PartialEvaluator { continue; } textState.carriageReturn(); + resetAdaptiveSpacing(); buildTextContentItem({ chars: args[0], extraSpacing: 0, @@ -3328,9 +3482,16 @@ class PartialEvaluator { self.ensureStateFont(stateManager.state); continue; } - textState.wordSpacing = args[0]; - textState.charSpacing = args[1]; + if ( + textState.wordSpacing !== args[0] || + textState.charSpacing !== args[1] + ) { + textState.wordSpacing = args[0]; + textState.charSpacing = args[1]; + resetAdaptiveSpacing(); + } textState.carriageReturn(); + resetAdaptiveSpacing(); buildTextContentItem({ chars: args[2], extraSpacing: 0, diff --git a/src/core/fonts.js b/src/core/fonts.js index b858e01c26441..77bc42066f6cc 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -3462,10 +3462,7 @@ class Font { return builder.toArray(); } - /** - * @private - */ - get _spaceWidth() { + get spaceWidth() { // trying to estimate space character width const possibleSpaceReplacements = ["space", "minus", "one", "i", "I"]; let width; @@ -3500,7 +3497,7 @@ class Font { break; // the non-zero width found } } - return shadow(this, "_spaceWidth", width || this.defaultWidth); + return shadow(this, "spaceWidth", width || this.defaultWidth); } /** @@ -3552,7 +3549,7 @@ class Font { if (glyphName === "") { // Ensure that other relevant glyph properties are also updated // (fixes issue18059.pdf). - width ||= this._spaceWidth; + width ||= this.spaceWidth; unicode = String.fromCharCode(fontCharCode); } } diff --git a/src/core/intersector.js b/src/core/intersector.js index 687a15249b979..40842b927b108 100644 --- a/src/core/intersector.js +++ b/src/core/intersector.js @@ -28,8 +28,12 @@ class SingleIntersector { #text = []; + #textExtraCharIds = []; + #extraChars = []; + #extraCharIds = []; + #lastIntersectingQuadIndex = -1; #canTakeExtraChars = false; @@ -112,18 +116,39 @@ class SingleIntersector { } if (this.#extraChars.length > 0) { - this.#text.push(this.#extraChars.join("")); + for (let i = 0, ii = this.#extraChars.length; i < ii; i++) { + this.#text.push(this.#extraChars[i]); + this.#textExtraCharIds.push(this.#extraCharIds[i]); + } this.#extraChars.length = 0; + this.#extraCharIds.length = 0; } this.#text.push(glyph); + this.#textExtraCharIds.push(-1); this.#canTakeExtraChars = true; return true; } - addExtraChar(char) { + addExtraChar(char, id) { if (this.#canTakeExtraChars) { this.#extraChars.push(char); + this.#extraCharIds.push(id); + } + } + + removeExtraChar(id) { + let index = this.#extraCharIds.lastIndexOf(id); + if (index >= 0) { + this.#extraChars.splice(index, 1); + this.#extraCharIds.splice(index, 1); + return; + } + + index = this.#textExtraCharIds.lastIndexOf(id); + if (index >= 0) { + this.#text.splice(index, 1); + this.#textExtraCharIds.splice(index, 1); } } @@ -133,6 +158,7 @@ class SingleIntersector { } this.#canTakeExtraChars = false; this.#extraChars.length = 0; + this.#extraCharIds.length = 0; } setText() { @@ -148,6 +174,8 @@ class Intersector { #grid = []; + #extraCharId = 0; + #minX; #maxX; @@ -221,8 +249,16 @@ class Intersector { } addExtraChar(char) { + const id = this.#extraCharId++; + for (const intersector of this.#intersectors) { + intersector.addExtraChar(char, id); + } + return id; + } + + removeExtraChar(id) { for (const intersector of this.#intersectors) { - intersector.addExtraChar(char); + intersector.removeExtraChar(id); } } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6070988ffb8dc..c8eafd60898f2 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -935,3 +935,12 @@ !text_field_own_canvas_calc.pdf !bug1802506.pdf !checkbox_no_appearance.pdf +!issue16752_reduced.pdf +!issue18768_type3.pdf +!issue18768_initial_narrow_gap.pdf +!issue18768_stateful_spacing.pdf +!issue18768_narrow_space_font.pdf +!issue18768_text_matrix_spacing.pdf +!issue18768_type3_wide_glyph.pdf +!issue19954_reduced.pdf +!issue1936_reduced.pdf diff --git a/test/pdfs/issue16752_reduced.pdf b/test/pdfs/issue16752_reduced.pdf new file mode 100644 index 0000000000000000000000000000000000000000..be834009281298a43cbfc9d7ce0ff38edfe20cde GIT binary patch literal 654 zcmZXSOK!q25Qg_Y#cU$Mg4zxwDH1{|g{bNa6tY7s3^6nnjjPyHw3qCnH)ti8&%QFqLHsW)8irN+i_(>w}{7=L(rL?H4Lmy<8i-Fp~j9EV9K15xAP5|)8 z!dOy62WwPPBDLC(-`+ literal 0 HcmV?d00001 diff --git a/test/pdfs/issue18768_1.pdf.link b/test/pdfs/issue18768_1.pdf.link new file mode 100644 index 0000000000000..3e1d562b4b214 --- /dev/null +++ b/test/pdfs/issue18768_1.pdf.link @@ -0,0 +1 @@ +https://github.com/user-attachments/files/17071119/thames_bug_example_1.pdf diff --git a/test/pdfs/issue18768_2.pdf.link b/test/pdfs/issue18768_2.pdf.link new file mode 100644 index 0000000000000..942e535188be3 --- /dev/null +++ b/test/pdfs/issue18768_2.pdf.link @@ -0,0 +1 @@ +https://github.com/user-attachments/files/17071116/thames_bug_example_2.pdf diff --git a/test/pdfs/issue18768_initial_narrow_gap.pdf b/test/pdfs/issue18768_initial_narrow_gap.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a3962236ae1af6e46f6e8c5f00ab5090dd2b2d4b GIT binary patch literal 599 zcmZXS%WlFj5JmU?irFB+g4zywG!jB8(5UJ|MUWk0VaPyJ(YUfrMf>&jB+yV{DN1tZ zjPK3Fo!K<&%bSEr5TMuZY&Zm-e}5wIiE?VMcfeNvFJ{k+qAcW{E?<|>OIYejL#O$KD<3&>=!OTnH~TD literal 0 HcmV?d00001 diff --git a/test/pdfs/issue18768_narrow_space_font.pdf b/test/pdfs/issue18768_narrow_space_font.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8fe1386a4a128fa7a80d43de9ba099c07873c400 GIT binary patch literal 876 zcmd^8%WlFj5WM><_JRZlYCDMukq`%Hb7*OcLaH864u%*=MH6Hj_3`!fCZSOA3AJP; zo}Jm9Y}TFmZQSQKA>qKFTD_6c2x$6sP(Y{B%6+v3I+t5zz;lEI#$%$&9Eo`TD`

dk{umMgTBnX&uu*fHi6; zQKmZTwX=!-V@GISI0)pp>S*9>9+f8#GgWTw4g!G`W3`g+gvJ~=gLFeUV-UKX=49D@ zNqc<)s;Bflj1RP2@>xU41MLa;@qQ ziVJP*bSE|V9?+R=4-u=QuSITmuE~7JKsa;%U}qN-^r_W`xS=5IcfmjiU%&>=SOj$x z$}=X0R~YB+XW_cwc|mxE`GFhE6VGZ{?3J!B%cA&JFaVlV70!8kNsN`+);>mbU}>a2ARb&|z!i0>#O!wa^*iIhlD40) zk*p+V&Kb|mZY}T9u8amuf`H=bncds~&;Gm`;8X4OvDgD&>bL(VQ?&5hgp14z{)S>;QkU)UsL zJe8PAwqqiaQbD$b?)mFC8%30u0D8v)wF0siwr;13PCSZ|b~ahe=N&1cUOVb+Ap2H( zL)I=7A0;{ z@}ai`O5hV+nc8x1j=zkzTMc~Xwnc6oHvZANv8(I_2=z;CS^4Qfm(YuW&-M9dB!czP z=Kg@yrGBaZV>M@phU#P8!??k<@b`9qIO6iaS7z&BEr%cz1p}HGRnPa&lg^igr3nva zbumMOoG3b6NWifX%^2QFgYyXkCjLFpSE{FA2Ulza!3yq*so@ulhWs-x?qinpG_(nZ s{}~%54W2KxJ(@DyE)VwIzyR|v3c48TR literal 0 HcmV?d00001 diff --git a/test/pdfs/issue18768_text_matrix_spacing.pdf b/test/pdfs/issue18768_text_matrix_spacing.pdf new file mode 100644 index 0000000000000000000000000000000000000000..298812e40fc6e0a98725767b0adf0217f71660b0 GIT binary patch literal 658 zcmZWnO>e?5488YP_<{rnmZYVHNeGET(Xep5GkEltSAox zI>d-Pw2{_0>{dWnM^CZ|br}FYk&GnCh2%pbd`VIA0DxAz+sk74!y+$K>op`^yNvX>K literal 0 HcmV?d00001 diff --git a/test/pdfs/issue18768_type3.pdf b/test/pdfs/issue18768_type3.pdf new file mode 100644 index 0000000000000000000000000000000000000000..eaf3e94a57502f72007d036f8e166161fd7e2fe9 GIT binary patch literal 951 zcmZWo!EU245WV{=<`OA6q_#UpWMU#!S0Xj z3^r+#UhKWz|cw2fk8!-9mtb zHQe1XT^B^8^?88f@o!bp!e&DBEoNzswFUVpkiJ|8QW>L>>gw~1%9c4<FI3AD%0A z3VXn&V50tZG0S?_6jiABHONxBvhE literal 0 HcmV?d00001 diff --git a/test/pdfs/issue18768_type3_wide_glyph.pdf b/test/pdfs/issue18768_type3_wide_glyph.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a5b3b3ca0077c6cfb0da1ed68e60a48559721ae8 GIT binary patch literal 1059 zcmbVL!EU245WVvibBWX*TH7HeL8>Z+G#qxfyHSg@hbRX(n8a0sSH_XL`}I3xpiN0n z1riP8dGp>p^c>yX04x6f(ZCkUs~5Wiwo*IYz!(c2;Py7sbxDQv z{t?jDvG1mAU>#HW2D{i}?LnLXY2_S9G)`k_%fs$5V<{ZYHW4{O8*rVH` zAX1uJM1(-Vgcy;B9;B^3c56U(*Aj!*mYfDq_!aVI3yrtNk zsVa^B(BSJt0-FPNVT(^ZwEg0mIPwJBNoOdeGrXK5^GB4VNGyZ7)X-M%rSxd*WJKK#{ Q&W9OiB8f($<*z*Y4+_f>bpQYW literal 0 HcmV?d00001 diff --git a/test/pdfs/issue1936_reduced.pdf b/test/pdfs/issue1936_reduced.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1c95e5294c0b54fc4bd2d500fc05b837c355538d GIT binary patch literal 605 zcmZXSUu(iJ6vf~3DejA)4=zbH?F12QXBqp0qJ0w|qP@Bf*Nij{w@<&^R;R;2AwBus zdk!~g%^uROyo;Cw0cQQqhC|@l_a_2Rw9|XD13uF`tRV2f97ZFCWlkXd|1W4b{#@i0 zEPI5nd>3;l9muOfI=L+*8;c%oeE2Kog^tY;mcsL)3?Lc=D+WLnVQ|XUa?C-zdf0zUT?b9>6y#-$W{Gov_l~XV69{5V_bq4_w z)^K;nbW;(L&L0yzkAF2)2b+-STddL^8wc_%k^cFdNNub}s_(-ebY)c9zQINyKb8R` z^I*#WsAS`i*})C<=%Kal+N0f}OwwNb2Y;yX*uopy%i@p6C)fK+ zVCcoBrw64@QV4<05&c0qYsiCZA!tH)kCFGHFt&5cgR&4u!0**~m;?WADtEw1P|3;H z=jP3>txU7;t@0925YJ3qYa9r9x}n+&;3;|!JOiF@$*X=A*m`B#oi}~~F-lw$a|o7W zkc~k;1Ve!jlS2$YgMn0GG;o+vcs?U~CV!8nK_wX>r?b^if1b>n)iv@GhCUA$x)+5J z)NsNi&W|o=-mw_hW`fB8$1=f!2v>Z9g%R%c1dEe1p0molXxqPkkLH(#1o*mb>DBQI TjH{`v>$N9wz-F`jTfzPV8&mX` literal 0 HcmV?d00001 diff --git a/test/test_manifest.json b/test/test_manifest.json index 9314e31fcba77..07c0a8472dbdb 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -14405,5 +14405,21 @@ "type": "eq", "annotations": true, "forms": true + }, + { + "id": "issue18768_1", + "file": "pdfs/issue18768_1.pdf", + "md5": "9ea22e419297ee29e2255a98a71753a3", + "rounds": 1, + "link": true, + "type": "other" + }, + { + "id": "issue18768_2", + "file": "pdfs/issue18768_2.pdf", + "md5": "d8c277668182811961ed4c0e9495bfc5", + "rounds": 1, + "link": true, + "type": "other" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 86b9e2c9da093..0a25c0f427151 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -4090,6 +4090,195 @@ page 1 / 3`); await loadingTask.destroy(); }); + it("gets text content, with no spaces between letters of words (issue 18768)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_1.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text.includes("Robert Thorogood")).toEqual(true); + + await loadingTask.destroy(); + }); + + it("gets text content, with no spaces between letters of words (issue 18768, second example)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_2.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text.includes("THamesjoen murhat")).toEqual(true); + + await loadingTask.destroy(); + }); + + it("gets text content, without spurious spaces in letter-spaced text (issue 16752)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue16752_reduced.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text).toEqual("REASONS AND"); + + await loadingTask.destroy(); + }); + + it("gets text content, without spurious spaces after an initial narrow gap", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_initial_narrow_gap.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("ABCD"); + + await loadingTask.destroy(); + }); + + it("gets text content, without leaking letter spacing across text state changes", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_stateful_spacing.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual(`FOTBOLL. +ABCD E`); + + await loadingTask.destroy(); + }); + + it("gets text content, without leaking letter spacing across text matrix changes", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_text_matrix_spacing.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("ABCD E"); + + await loadingTask.destroy(); + }); + + it("gets text content, without spurious spaces in a Type3 font (issue 18768)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_type3.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text).toEqual("AB CD"); + + await loadingTask.destroy(); + }); + + it("gets text content, without dropping Type3 word spaces before wide glyphs", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_type3_wide_glyph.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("A BC"); + + await loadingTask.destroy(); + }); + + it("gets text content, without splitting a word in a narrow-space font (issue 18768)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_narrow_space_font.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("UTR 50"); + + await loadingTask.destroy(); + }); + + it("gets text content, without dropping word spaces in a Type3 font (issue 19954)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue19954_reduced.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("AB C DE"); + + await loadingTask.destroy(); + }); + + it("gets text content, without dropping word spaces in uniformly spaced text (issue 1936)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue1936_reduced.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + + expect(mergeText(items)).toEqual("AAA BBB CCC"); + + await loadingTask.destroy(); + }); + + it("gets text content, without splitting a line into extra chunks at its word spaces (issue 18768)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("chrome-text-selection-markedContent.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + const strings = items.map(item => item.str); + + expect(strings).toContain( + "strengthen in the coming quarters as the railway projects under" + ); + expect(strings).toContain( + "development enter the construction phase (estimated at around" + ); + + await loadingTask.destroy(); + }); + it("gets text content, with merged spaces (issue 10900)", async function () { const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf")); const pdfDoc = await loadingTask.promise; @@ -5868,6 +6057,19 @@ small scripts as well as for`); await loadingTask.destroy(); }); + + it("should avoid spurious spaces in overlaid text", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18768_stateful_spacing.pdf") + ); + const pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const [annot] = await page.getAnnotations(); + + expect(annot.overlaidText).toEqual("FOTBOLL."); + + await loadingTask.destroy(); + }); }); describe("Multiple documents and pages mapper", function () { diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 47495cade5ccb..bcde255630d84 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -810,14 +810,14 @@ describe("pdf_find_controller", function () { }, pageMatches: [ [ - 302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157, - 1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788, - 1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274, - 2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576, - 2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901, - 2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474, - 3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055, - 4068, + 302, 340, 418, 481, 628, 802, 983, 989, 1015, 1057, 1078, 1143, 1151, + 1272, 1340, 1388, 1396, 1418, 1494, 1518, 1524, 1680, 1770, 1782, + 1853, 1875, 1905, 1942, 2060, 2070, 2157, 2174, 2209, 2223, 2268, + 2318, 2354, 2396, 2407, 2418, 2457, 2526, 2532, 2547, 2556, 2570, + 2596, 2607, 2632, 2662, 2786, 2799, 2830, 2841, 2852, 2889, 2895, + 2909, 2933, 2953, 3083, 3230, 3240, 3330, 3378, 3385, 3459, 3468, + 3476, 3493, 3681, 3687, 3702, 3749, 3780, 3856, 3968, 4043, 4049, + 4062, ], ], pageMatchesLength: [