From 05f6dc892b16648146c96b689f5937be1aad22ab Mon Sep 17 00:00:00 2001
From: xbotter <xbotter@live.cn>
Date: Thu, 17 Aug 2023 12:59:50 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=94=80=20Add=20Unicode=20chunking=20for?=
 =?UTF-8?q?=20handling=20limited=20text?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added a new method `ChunkStringByUnicode` in `MessageHandlerHelper` to split text into chunks based on Unicode characters.
- Added a new method `TryHandleLimitedText` in `MessageHandlerHelper` to handle long text by splitting it into chunks and processing each chunk separately.
- Added unit tests for the new methods in `MessageHandlerHelperTests`.

The changes enable handling of long text content by splitting it into smaller chunks based on Unicode characters. This allows for more efficient processing and handling of text content within the specified byte limit.
---
 .../Helpers/MessageHandlerHelperTests.cs      | 33 ++++++++
 .../Helpers/MessageHandlerHelper.cs           | 78 +++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/src/Senparc.NeuChar.Tests/Helpers/MessageHandlerHelperTests.cs b/src/Senparc.NeuChar.Tests/Helpers/MessageHandlerHelperTests.cs
index e7f3b1f..70023dd 100644
--- a/src/Senparc.NeuChar.Tests/Helpers/MessageHandlerHelperTests.cs
+++ b/src/Senparc.NeuChar.Tests/Helpers/MessageHandlerHelperTests.cs
@@ -36,5 +36,38 @@ public void SubstringByByteTest()
 
         }
 
+
+        [TestMethod]
+        public void ChunkStringByUnicode()
+        {
+            var limit = 10;
+            var text = "Senparc.NeuChar🤝 跨平台信息交互🔄标准🌍。使用 NeuChar 标准💬可以跨平台🌐兼容不同平台的交互🔀信息设置,一次设置🚀,多平台共享📱🖥。";
+
+            var results = MessageHandlerHelper.ChunkStringByUnicode(text, limit);
+
+            foreach (var result in results)
+            {
+                var bytes = Encoding.UTF8.GetBytes(result);
+                Assert.IsTrue(bytes.Length <= limit);
+                Assert.IsFalse(string.IsNullOrEmpty(result));
+            }
+
+            Assert.AreEqual(text, string.Concat(results));
+        }
+
+        [TestMethod]
+        public async Task HandleLimitedTextAsync()
+        {
+            var limit = 10;
+            var text = "Senparc.NeuChar🤝 跨平台信息交互🔄标准🌍。使用 NeuChar 标准💬可以跨平台🌐兼容不同平台的交互🔀信息设置,一次设置🚀,多平台共享📱🖥。";
+
+            var results = await MessageHandlerHelper.TryHandleLimitedText(text, limit, chunk =>
+            {
+                return Task.FromResult(chunk);
+            });
+
+            Assert.AreEqual(text, string.Join(string.Empty, results));
+        }
+
     }
 }
\ No newline at end of file
diff --git a/src/Senparc.NeuChar/Helpers/MessageHandlerHelper.cs b/src/Senparc.NeuChar/Helpers/MessageHandlerHelper.cs
index 42b08e9..f223f8a 100644
--- a/src/Senparc.NeuChar/Helpers/MessageHandlerHelper.cs
+++ b/src/Senparc.NeuChar/Helpers/MessageHandlerHelper.cs
@@ -31,6 +31,7 @@ and limitations under the License.
 
 using System;
 using System.Collections.Generic;
+using System.Globalization;
 using System.Linq;
 using System.Security.Principal;
 using System.Text;
@@ -219,5 +220,82 @@ public static async Task<T> TrySendLimistedText<T>(string accessTokenOrAppId, st
 
             return null;//不做处理
         }
+
+        /// <summary>
+        /// Processes over-long text in batches: the content is split into chunks with
+        /// <see cref="ChunkStringByUnicode"/> and each chunk is passed, in order, to the handler.
+        /// </summary>
+        /// <typeparam name="T">Type of the result produced by the handler for each chunk.</typeparam>
+        /// <param name="content">Text content to process.</param>
+        /// <param name="limitedBytes">Maximum UTF-8 byte length of each chunk; when it is not positive nothing is processed.</param>
+        /// <param name="handleTextFuncAsync">Asynchronous handler invoked once per chunk.</param>
+        /// <returns>The handler results in chunk order; empty when nothing was processed.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="handleTextFuncAsync"/> is null.</exception>
+        public static async Task<IEnumerable<T>> TryHandleLimitedText<T>(string content, int limitedBytes, Func<string, Task<T>> handleTextFuncAsync)
+            where T : class
+        {
+            if (handleTextFuncAsync == null)
+            {
+                throw new ArgumentNullException(nameof(handleTextFuncAsync));
+            }
+
+            List<T> results = new();
+
+            if (limitedBytes > 0)
+            {
+                // Chunks are awaited one at a time so the handler sees them in their original order.
+                foreach (var chunk in ChunkStringByUnicode(content, limitedBytes))
+                {
+                    results.Add(await handleTextFuncAsync(chunk));
+                }
+            }
+
+            return results;
+        }
+
+        /// <summary>
+        /// Splits text into chunks whose UTF-8 encoding does not exceed <paramref name="chunkSize"/> bytes,
+        /// never breaking a text element (surrogate pair or combining sequence) apart.
+        /// </summary>
+        /// <param name="text">Text content to split; null or empty text yields no chunks.</param>
+        /// <param name="chunkSize">Maximum size of each chunk, in UTF-8 bytes.</param>
+        /// <returns>
+        /// The non-empty chunks in order; concatenating them reproduces <paramref name="text"/>.
+        /// A single text element larger than <paramref name="chunkSize"/> is returned as its own oversized chunk.
+        /// </returns>
+        public static IEnumerable<string> ChunkStringByUnicode(string text, int chunkSize)
+        {
+            if (string.IsNullOrEmpty(text))
+            {
+                yield break;
+            }
+
+            var stringBuilder = new StringBuilder();
+            var byteSize = 0;
+            TextElementEnumerator enumerator = StringInfo.GetTextElementEnumerator(text);
+
+            while (enumerator.MoveNext())
+            {
+                string textElement = enumerator.GetTextElement();
+                var elementSize = Encoding.UTF8.GetByteCount(textElement);
+
+                // Flush only when the next element would push the chunk past the limit (a chunk may
+                // fill the limit exactly), and never flush an empty buffer.
+                if (stringBuilder.Length > 0 && byteSize + elementSize > chunkSize)
+                {
+                    yield return stringBuilder.ToString();
+                    stringBuilder.Clear();
+                    byteSize = 0;
+                }
+
+                byteSize += elementSize;
+                stringBuilder.Append(textElement);
+            }
+
+            if (stringBuilder.Length > 0)
+            {
+                yield return stringBuilder.ToString();
+            }
+        }
     }
 }