/** A word-level segment of a string together with the UTF-16 code unit offset at which it starts. */
type WordSegment = {index: number; segment: string};

/**
 * Fallback word segmentation for runtimes that do not provide `Intl.Segmenter`.
 *
 * Splits `text` at regular-expression word boundaries (`\b`) and reports each
 * piece with the UTF-16 code unit offset at which it starts, so that `index`
 * means the same thing as it does for segments produced by `Intl.Segmenter`.
 * Callers add `segment.length` to `index` to find the start of the next
 * segment in the original string, so an ordinal position would be wrong.
 *
 * Empty pieces (which `split` yields for an empty input) are dropped: callers
 * read the last grapheme of every segment and would fail on an empty one.
 *
 * NOTE(review): `\b` only treats ASCII letters, digits and `_` as word
 * characters, so text without any of those comes back as a single segment.
 *
 * @param text - The string to segment.
 * @returns The non-empty segments of `text`, in order of appearance.
 */
function segmentByWordBoundary(text: string): WordSegment[] {
    const segments: WordSegment[] = [];
    let offset = 0;
    for (const segment of text.split(/\b/u)) {
        if (segment.length === 0) continue;
        segments.push({index: offset, segment});
        // `\b` is zero-width, so `split` discards no characters and the
        // running sum of lengths is exactly the next segment's offset.
        offset += segment.length;
    }
    return segments;
}

/**
 * Segmenter used to split label text into word-level segments.
 *
 * Uses `Intl.Segmenter` with word granularity when the runtime has it, and
 * falls back to `segmentByWordBoundary` otherwise. Either way, iterating the
 * result of `segment(text)` yields objects whose `index` is a UTF-16 offset
 * into `text` and whose `segment` is the matching substring.
 */
const wordSegmenter: {segment(text: string): Iterable<WordSegment>} = ('Segmenter' in Intl) ?
    new Intl.Segmenter(undefined, {granularity: 'word'}) :
    {segment: segmentByWordBoundary};
calculatePenalty(codePoint: number, nextCodePoint: number, penalizableIdeographicBreak: boolean) { +function calculatePenalty(codePoint: number, nextCodePoint: number) { let penalty = 0; // Force break on newline if (codePoint === 0x0a) { penalty -= 10000; } - // Penalize breaks between characters that allow ideographic breaking because - // they are less preferable than breaks at spaces (or zero width spaces). - if (penalizableIdeographicBreak) { - penalty += 150; - } // Penalize open parenthesis at end of line if (codePoint === 0x28 || codePoint === 0xff08) { @@ -555,49 +529,28 @@ export function determineLineBreaks( const potentialLineBreaks = []; const targetWidth = determineAverageLineWidth(logicalInput, spacing, maxWidth, glyphMap, imagePositions, layoutTextSize); - const hasServerSuggestedBreakpoints = logicalInput.text.indexOf('\u200b') >= 0; - let currentX = 0; - - let i = 0; - const chars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator](); - let char = chars.next(); - const nextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator](); - nextChars.next(); - let nextChar = nextChars.next(); - const nextNextChars = splitByGraphemeCluster(logicalInput.text)[Symbol.iterator](); - nextNextChars.next(); - nextNextChars.next(); - let nextNextChar = nextNextChars.next(); - - while (!char.done) { - const section = logicalInput.getSection(i); - const segment = char.value; - const codePoint = segment.codePointAt(0); - if (!whitespace[codePoint]) currentX += getGlyphAdvance(segment, section, glyphMap, imagePositions, spacing, layoutTextSize); - - // Ideographic characters, spaces, and word-breaking punctuation that often appear without - // surrounding spaces. 
- if (!nextChar.done) { - const ideographicBreak = charAllowsIdeographicBreaking(codePoint); - const nextSegment = nextChar.value; - const nextCodePoint = nextSegment.codePointAt(0); - if (breakable[codePoint] || ideographicBreak || section.imageName || (!nextNextChar.done && breakableBefore[nextCodePoint])) { - - potentialLineBreaks.push( - evaluateBreak( - i + 1, - currentX, - targetWidth, - potentialLineBreaks, - calculatePenalty(codePoint, nextCodePoint, ideographicBreak && hasServerSuggestedBreakpoints), - false)); + let graphemeIndex = 0; + for (const {index: wordIndex, segment: word} of wordSegmenter.segment(logicalInput.text)) { + const graphemes = splitByGraphemeCluster(word); + for (const grapheme of graphemes) { + const section = logicalInput.getSection(graphemeIndex); + if (!!grapheme.trim()) { + currentX += getGlyphAdvance(grapheme, section, glyphMap, imagePositions, spacing, layoutTextSize); } + graphemeIndex++; } - i++; - char = chars.next(); - nextChar = nextChars.next(); - nextNextChar = nextNextChars.next(); + + const nextWordIndex = wordIndex + word.length; + const lastCodePoint = graphemes.at(-1).codePointAt(0); + const nextWordCodePoint = logicalInput.text.codePointAt(nextWordIndex); + if (!nextWordCodePoint) { + continue; + } + + const penalty = calculatePenalty(lastCodePoint, nextWordCodePoint); + const lineBreak = evaluateBreak(graphemeIndex, currentX, targetWidth, potentialLineBreaks, penalty, false) + potentialLineBreaks.push(lineBreak); } return leastBadBreaks( diff --git a/src/util/script_detection.test.ts b/src/util/script_detection.test.ts index 2475420f65..cbb4eb42ff 100644 --- a/src/util/script_detection.test.ts +++ b/src/util/script_detection.test.ts @@ -1,39 +1,4 @@ -import {allowsLetterSpacing, charAllowsIdeographicBreaking, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection'; - -describe('charAllowsIdeographicBreaking', () => { - test('disallows ideographic breaking of 
Latin text', () => { - expect(charAllowsIdeographicBreaking('A'.codePointAt(0))).toBe(false); - expect(charAllowsIdeographicBreaking('3'.codePointAt(0))).toBe(false); - }); - - test('allows ideographic breaking of ideographic punctuation', () => { - expect(charAllowsIdeographicBreaking('〈'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Bopomofo text', () => { - expect(charAllowsIdeographicBreaking('ㄎ'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Chinese and Vietnamese text', () => { - expect(charAllowsIdeographicBreaking('市'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('𡔖'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('麵'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('𪚥'.codePointAt(0))).toBe(true); - }); - - test('disallows ideographic breaking of Korean text', () => { - expect(charAllowsIdeographicBreaking('아'.codePointAt(0))).toBe(false); - }); - - test('allows ideographic breaking of Japanese text', () => { - expect(charAllowsIdeographicBreaking('あ'.codePointAt(0))).toBe(true); - expect(charAllowsIdeographicBreaking('カ'.codePointAt(0))).toBe(true); - }); - - test('allows ideographic breaking of Yi text', () => { - expect(charAllowsIdeographicBreaking('ꉆ'.codePointAt(0))).toBe(true); - }); -}); +import {allowsLetterSpacing, charHasUprightVerticalOrientation, charInComplexShapingScript, stringContainsRTLText} from './script_detection'; describe('allowsLetterSpacing', () => { test('allows letter spacing of Latin text', () => { diff --git a/src/util/script_detection.ts b/src/util/script_detection.ts index 88309313bd..9bedd5dd00 100644 --- a/src/util/script_detection.ts +++ b/src/util/script_detection.ts @@ -35,13 +35,6 @@ export function splitByGraphemeCluster(text: string) { return baseSegments; } -export function allowsIdeographicBreaking(chars: string) { - for (const char of chars) { - if 
(!charAllowsIdeographicBreaking(char.codePointAt(0))) return false; - } - return true; -} - export function allowsVerticalWritingMode(chars: string) { for (const char of chars) { if (charHasUprightVerticalOrientation(char.codePointAt(0))) return true; @@ -101,30 +94,6 @@ const ideographicBreakingScriptCodes = [ const ideographicBreakingRegExp = sanitizedRegExpFromScriptCodes(ideographicBreakingScriptCodes); -export function charAllowsIdeographicBreaking(char: number) { - // Return early for characters outside all ideographic ranges. - if (char < 0x2E80) return false; - - if (isChar['CJK Compatibility'](char)) return true; - if (isChar['CJK Compatibility Forms'](char)) return true; - if (isChar['CJK Radicals Supplement'](char)) return true; - if (isChar['CJK Strokes'](char)) return true; - if (isChar['CJK Symbols and Punctuation'](char)) return true; - if (isChar['Enclosed CJK Letters and Months'](char)) return true; - if (isChar['Enclosed Ideographic Supplement'](char)) return true; - if (isChar['Halfwidth and Fullwidth Forms'](char)) return true; - if (isChar['Ideographic Description Characters'](char)) return true; - if (isChar['Ideographic Symbols and Punctuation'](char)) return true; - if (isChar['Kana Extended-A'](char)) return true; - if (isChar['Kana Extended-B'](char)) return true; - if (isChar['Kana Supplement'](char)) return true; - if (isChar['Kangxi Radicals'](char)) return true; - if (isChar['Katakana Phonetic Extensions'](char)) return true; - if (isChar['Small Kana Extension'](char)) return true; - if (isChar['Vertical Forms'](char)) return true; - return ideographicBreakingRegExp.test(String.fromCodePoint(char)); -} - // The following logic comes from // . 
// Keep it synchronized with @@ -227,9 +196,25 @@ export function charHasUprightVerticalOrientation(char: number) { if (/* Canadian Aboriginal */ /\p{sc=Cans}/u.test(String.fromCodePoint(char))) return true; if (/* Egyptian Hieroglyphs */ /\p{sc=Egyp}/u.test(String.fromCodePoint(char))) return true; if (/* Hangul */ /\p{sc=Hang}/u.test(String.fromCodePoint(char))) return true; - if (charAllowsIdeographicBreaking(char)) return true; - return false; + if (isChar['CJK Compatibility'](char)) return true; + if (isChar['CJK Compatibility Forms'](char)) return true; + if (isChar['CJK Radicals Supplement'](char)) return true; + if (isChar['CJK Strokes'](char)) return true; + if (isChar['CJK Symbols and Punctuation'](char)) return true; + if (isChar['Enclosed CJK Letters and Months'](char)) return true; + if (isChar['Enclosed Ideographic Supplement'](char)) return true; + if (isChar['Halfwidth and Fullwidth Forms'](char)) return true; + if (isChar['Ideographic Description Characters'](char)) return true; + if (isChar['Ideographic Symbols and Punctuation'](char)) return true; + if (isChar['Kana Extended-A'](char)) return true; + if (isChar['Kana Extended-B'](char)) return true; + if (isChar['Kana Supplement'](char)) return true; + if (isChar['Kangxi Radicals'](char)) return true; + if (isChar['Katakana Phonetic Extensions'](char)) return true; + if (isChar['Small Kana Extension'](char)) return true; + if (isChar['Vertical Forms'](char)) return true; + return ideographicBreakingRegExp.test(String.fromCodePoint(char)); } /**