Skip to content

Commit a08d3fc

Browse files
authored
[browser][HybridGlobalization] Improve speed performance of IndexOf and LastIndexOf text APIs with HybridGlobalization mode (dotnet#95583)
* Re-implement the grapheme segmenter from Intl. * Load segmentation rules as static json asset
1 parent e42a873 commit a08d3fc

19 files changed

+343
-71
lines changed

THIRD-PARTY-NOTICES.TXT

+14
Original file line numberDiff line numberDiff line change
@@ -1331,3 +1331,17 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13311331

13321332
Aspects of base64 encoding / decoding are based on algorithm described in "Base64 encoding and decoding at almost the speed of a memory
13331333
copy", Wojciech Muła and Daniel Lemire. https://arxiv.org/pdf/1910.05109.pdf
1334+
1335+
License for FormatJS Intl.Segmenter grapheme segmentation algorithm
1336+
--------------------------------------------------------------------------
1337+
Available at https://github.com/formatjs/formatjs/blob/58d6a7b398d776ca3d2726d72ae1573b65cc3bef/packages/intl-segmenter/LICENSE.md
1338+
1339+
MIT License
1340+
1341+
Copyright (c) 2022 FormatJS
1342+
1343+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1344+
1345+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
1346+
1347+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

eng/liveBuilds.targets

+2-1
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,8 @@
208208
$(LibrariesNativeArtifactsPath)package.json;
209209
$(LibrariesNativeArtifactsPath)dotnet.native.wasm;
210210
$(LibrariesNativeArtifactsPath)dotnet.native.js.symbols;
211-
$(LibrariesNativeArtifactsPath)*.dat;"
211+
$(LibrariesNativeArtifactsPath)*.dat;
212+
$(LibrariesNativeArtifactsPath)segmentation-rules.json;"
212213
IsNative="true" />
213214
<!-- for threaded wasm -->
214215
<LibrariesRuntimeFiles Condition="'$(TargetOS)' == 'browser' and Exists('$(LibrariesNativeArtifactsPath)dotnet.native.worker.js')"

src/installer/pkg/sfx/Microsoft.NETCore.App/Directory.Build.props

+1
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@
240240
<PlatformManifestFileEntry Include="icudt_optimal.dat" IsNative="true" />
241241
<PlatformManifestFileEntry Include="icudt_optimal_no_CJK.dat" IsNative="true" />
242242
<PlatformManifestFileEntry Include="icudt_hybrid.dat" IsNative="true" />
243+
<PlatformManifestFileEntry Include="segmentation-rules.json" IsNative="true" />
243244
<PlatformManifestFileEntry Include="package.json" IsNative="true" />
244245
<PlatformManifestFileEntry Include="dotnet.es6.pre.js" IsNative="true" />
245246
<PlatformManifestFileEntry Include="dotnet.es6.lib.js" IsNative="true" />

src/mono/browser/browser.proj

+2-1
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,8 @@
357357
<ItemGroup>
358358
<ICULibNativeFiles Include="$(ICULibDir)/libicuuc.a;
359359
$(ICULibDir)/libicui18n.a;
360-
$(ICULibDir)/libicudata.a" />
360+
$(ICULibDir)/libicudata.a;
361+
$(BrowserProjectRoot)runtime/hybrid-globalization/segmentation-rules.json" />
361362
<ICULibFiles Include="$(ICULibDir)/*.dat" />
362363
</ItemGroup>
363364
<PropertyGroup>

src/mono/browser/build/BrowserWasmApp.targets

+2-1
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,9 @@
104104

105105
<ItemGroup Condition="'$(InvariantGlobalization)' != 'true'">
106106
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
107+
<_HybridGlobalizationDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)segmentation-rules.json"/>
107108
<_IcuAvailableDataFiles Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_*" Exclude="@(_HybridGlobalizationDataFiles);$(_WasmIcuDataFileName)"/>
108-
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt_hybrid.dat"/>
109+
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' == 'true'" Include="@(_HybridGlobalizationDataFiles)"/>
109110
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' == 'true'" Include="$(MicrosoftNetCoreAppRuntimePackRidNativeDir)icudt.dat"/>
110111
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' == ''" Include="@(_IcuAvailableDataFiles)"/>
111112
<WasmIcuDataFileNames Condition="'$(HybridGlobalization)' != 'true' and '$(WasmIncludeFullIcuData)' != 'true' and '$(_WasmIcuDataFileName)' != ''" Include="$(_WasmIcuDataFileName)"/>

src/mono/browser/runtime/assets.ts

+12
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { endMeasure, MeasuredBlock, startMeasure } from "./profiler";
1010
import { AssetEntryInternal } from "./types/internal";
1111
import { AssetEntry } from "./types";
1212
import { VoidPtr } from "./types/emscripten";
13+
import { setSegmentationRulesFromJson } from "./hybrid-globalization/grapheme-segmenter";
1314

1415
// this needs to be run only after the onRuntimeInitialized event, when the memory is ready
1516
export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Array): void {
@@ -25,6 +26,7 @@ export function instantiate_asset(asset: AssetEntry, url: string, bytes: Uint8Ar
2526
case "dotnetwasm":
2627
case "js-module-threads":
2728
case "symbols":
29+
case "segmentation-rules":
2830
// do nothing
2931
break;
3032
case "resource":
@@ -104,6 +106,16 @@ export async function instantiate_symbols_asset(pendingAsset: AssetEntryInternal
104106
}
105107
}
106108

109+
/**
 * Awaits the pending download of the segmentation-rules asset, parses the response body as JSON
 * and hands the parsed value to `setSegmentationRulesFromJson`.
 *
 * Every failure inside the try block (missing pending download, rejected response, invalid JSON,
 * or an exception thrown while applying the rules) is caught and logged via `mono_log_info`
 * rather than propagated to the caller.
 *
 * @param pendingAsset asset entry whose `pendingDownloadInternal.response` resolves to the fetched response.
 * @returns a promise that resolves once the rules were applied or the failure was logged.
 */
export async function instantiate_segmentation_rules_asset(pendingAsset: AssetEntryInternal): Promise<void> {
    try {
        const response = await pendingAsset.pendingDownloadInternal!.response;
        const json = await response.json();
        setSegmentationRulesFromJson(json);
    } catch (error: unknown) {
        // Error instances expose `name`/`message` as non-enumerable properties, so
        // JSON.stringify(error) produces "{}" and hides the reason; format them explicitly.
        const reason = error instanceof Error
            ? `${error.name}: ${error.message}`
            : JSON.stringify(error);
        mono_log_info(`Error loading static json asset ${pendingAsset.name}: ${reason}`);
    }
}
118+
107119
export async function wait_for_all_assets() {
108120
// wait for all assets in memory
109121
await runtimeHelpers.allAssetsInMemory.promise;

src/mono/browser/runtime/exports.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import { mono_bind_static_method } from "./net6-legacy/method-calls";
2020
import { export_binding_api, export_internal_api, export_mono_api } from "./net6-legacy/exports-legacy";
2121
import { initializeLegacyExports } from "./net6-legacy/globals";
2222
import { mono_log_warn, mono_wasm_stringify_as_error_with_stack } from "./logging";
23-
import { instantiate_asset, instantiate_symbols_asset } from "./assets";
23+
import { instantiate_asset, instantiate_symbols_asset, instantiate_segmentation_rules_asset } from "./assets";
2424
import { jiterpreter_dump_stats } from "./jiterpreter";
2525
import { forceDisposeProxies } from "./gc-handles";
2626

@@ -46,6 +46,7 @@ function initializeExports(globalObjects: GlobalObjects): RuntimeAPI {
4646
instantiate_asset,
4747
jiterpreter_dump_stats,
4848
forceDisposeProxies,
49+
instantiate_segmentation_rules_asset,
4950
});
5051

5152
const API = export_api();

src/mono/browser/runtime/hybrid-globalization/change-case.ts

+1-14
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,7 @@ import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/i
77
import { Int32Ptr } from "../types/emscripten";
88
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
99
import { localHeapViewU16, setU16_local } from "../memory";
10-
11-
const SURROGATE_HIGHER_START = "\uD800";
12-
const SURROGATE_HIGHER_END = "\uDBFF";
13-
const SURROGATE_LOWER_START = "\uDC00";
14-
const SURROGATE_LOWER_END = "\uDFFF";
10+
import { isSurrogate } from "./helpers";
1511

1612
export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
1713
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
@@ -160,15 +156,6 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
160156
}
161157
}
162158

163-
function isSurrogate(str: string, startIdx: number) : boolean
164-
{
165-
return SURROGATE_HIGHER_START <= str[startIdx] &&
166-
str[startIdx] <= SURROGATE_HIGHER_END &&
167-
startIdx+1 < str.length &&
168-
SURROGATE_LOWER_START <= str[startIdx+1] &&
169-
str[startIdx+1] <= SURROGATE_LOWER_END;
170-
}
171-
172159
function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number)
173160
{
174161
setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0));

src/mono/browser/runtime/hybrid-globalization/collations.ts

+44-53
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ import { monoStringToString, utf16ToString } from "../strings";
66
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
77
import { Int32Ptr } from "../types/emscripten";
88
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
9+
import { GraphemeSegmenter } from "./grapheme-segmenter";
910

1011
const COMPARISON_ERROR = -2;
1112
const INDEXING_ERROR = -1;
13+
let graphemeSegmenterCached: GraphemeSegmenter | null;
1214

1315
export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, str1Length: number, str2: number, str2Length: number, options: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): number {
1416
const cultureRoot = mono_wasm_new_external_root<MonoString>(culture),
@@ -20,7 +22,7 @@ export function mono_wasm_compare_string(culture: MonoStringRef, str1: number, s
2022
const casePicker = (options & 0x1f);
2123
const locale = cultureName ? cultureName : undefined;
2224
wrap_no_error_root(is_exception, exceptionRoot);
23-
return compare_strings(string1, string2, locale, casePicker);
25+
return compareStrings(string1, string2, locale, casePicker);
2426
}
2527
catch (ex: any) {
2628
wrap_error_root(is_exception, ex, exceptionRoot);
@@ -37,19 +39,19 @@ export function mono_wasm_starts_with(culture: MonoStringRef, str1: number, str1
3739
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
3840
try {
3941
const cultureName = monoStringToString(cultureRoot);
40-
const prefix = decode_to_clean_string(str2, str2Length);
42+
const prefix = decodeToCleanString(str2, str2Length);
4143
// no need to look for an empty string
4244
if (prefix.length == 0)
4345
return 1; // true
4446

45-
const source = decode_to_clean_string(str1, str1Length);
47+
const source = decodeToCleanString(str1, str1Length);
4648
if (source.length < prefix.length)
4749
return 0; //false
4850
const sourceOfPrefixLength = source.slice(0, prefix.length);
4951

5052
const casePicker = (options & 0x1f);
5153
const locale = cultureName ? cultureName : undefined;
52-
const result = compare_strings(sourceOfPrefixLength, prefix, locale, casePicker);
54+
const result = compareStrings(sourceOfPrefixLength, prefix, locale, casePicker);
5355
wrap_no_error_root(is_exception, exceptionRoot);
5456
return result === 0 ? 1 : 0; // equals ? true : false
5557
}
@@ -68,19 +70,19 @@ export function mono_wasm_ends_with(culture: MonoStringRef, str1: number, str1Le
6870
exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
6971
try {
7072
const cultureName = monoStringToString(cultureRoot);
71-
const suffix = decode_to_clean_string(str2, str2Length);
73+
const suffix = decodeToCleanString(str2, str2Length);
7274
if (suffix.length == 0)
7375
return 1; // true
7476

75-
const source = decode_to_clean_string(str1, str1Length);
77+
const source = decodeToCleanString(str1, str1Length);
7678
const diff = source.length - suffix.length;
7779
if (diff < 0)
7880
return 0; //false
7981
const sourceOfSuffixLength = source.slice(diff, source.length);
8082

8183
const casePicker = (options & 0x1f);
8284
const locale = cultureName ? cultureName : undefined;
83-
const result = compare_strings(sourceOfSuffixLength, suffix, locale, casePicker);
85+
const result = compareStrings(sourceOfSuffixLength, suffix, locale, casePicker);
8486
wrap_no_error_root(is_exception, exceptionRoot);
8587
return result === 0 ? 1 : 0; // equals ? true : false
8688
}
@@ -100,68 +102,57 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
100102
try {
101103
const needle = utf16ToString(<any>needlePtr, <any>(needlePtr + 2 * needleLength));
102104
// no need to look for an empty string
103-
if (clean_string(needle).length == 0) {
105+
if (cleanString(needle).length == 0) {
104106
wrap_no_error_root(is_exception, exceptionRoot);
105107
return fromBeginning ? 0 : srcLength;
106108
}
107109

108110
const source = utf16ToString(<any>srcPtr, <any>(srcPtr + 2 * srcLength));
109111
// no need to look in an empty string
110-
if (clean_string(source).length == 0) {
112+
if (cleanString(source).length == 0) {
111113
wrap_no_error_root(is_exception, exceptionRoot);
112114
return fromBeginning ? 0 : srcLength;
113115
}
114116
const cultureName = monoStringToString(cultureRoot);
115117
const locale = cultureName ? cultureName : undefined;
116118
const casePicker = (options & 0x1f);
117-
118-
const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
119-
const needleSegments = Array.from(segmenter.segment(needle)).map(s => s.segment);
120-
let i = 0;
121-
let stop = false;
122119
let result = -1;
123-
let segmentWidth = 0;
124-
let index = 0;
125-
let nextIndex = 0;
126-
while (!stop) {
127-
// we need to restart the iterator in this outer loop because we have shifted it in the inner loop
128-
const iteratorSrc = segmenter.segment(source.slice(i, source.length))[Symbol.iterator]();
129-
let srcNext = iteratorSrc.next();
130120

131-
if (srcNext.done)
132-
break;
121+
const graphemeSegmenter = graphemeSegmenterCached || (graphemeSegmenterCached = new GraphemeSegmenter());
122+
const needleSegments = [];
123+
let needleIdx = 0;
124+
125+
// Grapheme segmentation of needle string
126+
while (needleIdx < needle.length) {
127+
const needleGrapheme = graphemeSegmenter.nextGrapheme(needle, needleIdx);
128+
needleSegments.push(needleGrapheme);
129+
needleIdx += needleGrapheme.length;
130+
}
131+
132+
let srcIdx = 0;
133+
while (srcIdx < source.length) {
134+
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcIdx);
135+
srcIdx += srcGrapheme.length;
133136

134-
let matchFound = check_match_found(srcNext.value.segment, needleSegments[0], locale, casePicker);
135-
index = nextIndex;
136-
srcNext = iteratorSrc.next();
137-
if (srcNext.done) {
138-
result = matchFound ? index : result;
139-
break;
137+
if (!checkMatchFound(srcGrapheme, needleSegments[0], locale, casePicker)) {
138+
continue;
140139
}
141-
segmentWidth = srcNext.value.index;
142-
nextIndex = index + segmentWidth;
143-
if (matchFound) {
144-
for (let j = 1; j < needleSegments.length; j++) {
145-
if (srcNext.done) {
146-
stop = true;
147-
break;
148-
}
149-
matchFound = check_match_found(srcNext.value.segment, needleSegments[j], locale, casePicker);
150-
if (!matchFound)
151-
break;
152140

153-
srcNext = iteratorSrc.next();
154-
}
155-
if (stop)
141+
let j;
142+
let srcNextIdx = srcIdx;
143+
for (j = 1; j < needleSegments.length; j++) {
144+
const srcGrapheme = graphemeSegmenter.nextGrapheme(source, srcNextIdx);
145+
146+
if (!checkMatchFound(srcGrapheme, needleSegments[j], locale, casePicker)) {
156147
break;
148+
}
149+
srcNextIdx += srcGrapheme.length;
157150
}
158-
159-
if (matchFound) {
160-
result = index;
151+
if (j === needleSegments.length) {
152+
result = srcIdx - srcGrapheme.length;
161153
if (fromBeginning)
162154
break;
163155
}
164-
i = nextIndex;
165156
}
166157
wrap_no_error_root(is_exception, exceptionRoot);
167158
return result;
@@ -175,12 +166,12 @@ export function mono_wasm_index_of(culture: MonoStringRef, needlePtr: number, ne
175166
exceptionRoot.release();
176167
}
177168

178-
function check_match_found(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
179-
return compare_strings(str1, str2, locale, casePicker) === 0;
169+
function checkMatchFound(str1: string, str2: string, locale: string | undefined, casePicker: number): boolean {
170+
return compareStrings(str1, str2, locale, casePicker) === 0;
180171
}
181172
}
182173

183-
function compare_strings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
174+
function compareStrings(string1: string, string2: string, locale: string | undefined, casePicker: number): number {
184175
switch (casePicker) {
185176
case 0:
186177
// 0: None - default algorithm for the platform OR
@@ -272,12 +263,12 @@ function compare_strings(string1: string, string2: string, locale: string | unde
272263
}
273264
}
274265

275-
function decode_to_clean_string(strPtr: number, strLen: number) {
266+
function decodeToCleanString(strPtr: number, strLen: number) {
276267
const str = utf16ToString(<any>strPtr, <any>(strPtr + 2 * strLen));
277-
return clean_string(str);
268+
return cleanString(str);
278269
}
279270

280-
function clean_string(str: string) {
271+
function cleanString(str: string) {
281272
const nStr = str.normalize();
282273
return nStr.replace(/[\u200B-\u200D\uFEFF\0]/g, "");
283274
}

0 commit comments

Comments
 (0)