|
5 | 5 | import inspect |
6 | 6 |
|
7 | 7 | from typing import Callable, Sequence, TYPE_CHECKING |
| 8 | +from functools import lru_cache |
8 | 9 | from itertools import accumulate |
9 | 10 | from contextlib import suppress |
10 | | -from functools import lru_cache |
11 | 11 |
|
12 | 12 | import mpire |
13 | 13 |
|
@@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]: |
68 | 68 | # - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a semantically meaningful non-whitespace splitter, then that whitespace character; |
69 | 69 | # - A semantically meaningful non-whitespace splitter. |
70 | 70 | if "\n" in text or "\r" in text: |
71 | | - splitter = max(re.findall(r"[\r\n]+", text)) |
| 71 | + splitter = max(re.findall(r"[\r\n]+", text), key=len) |
72 | 72 |
|
73 | 73 | elif "\t" in text: |
74 | | - splitter = max(re.findall(r"\t+", text)) |
| 74 | + splitter = max(re.findall(r"\t+", text), key=len) |
75 | 75 |
|
76 | 76 | elif re.search(r"\s", text): |
77 | | - splitter = max(re.findall(r"\s+", text)) |
| 77 | + splitter = max(re.findall(r"\s+", text), key=len) |
78 | 78 |
|
79 | 79 | # If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences. |
80 | 80 | if len(splitter) == 1: |
@@ -216,7 +216,7 @@ def chunk( |
216 | 216 | text=split, |
217 | 217 | chunk_size=local_chunk_size, |
218 | 218 | token_counter=token_counter, |
219 | | - offsets=return_offsets, |
| 219 | + offsets=True, |
220 | 220 | _recursion_depth=_recursion_depth + 1, |
221 | 221 | _start=split_start, |
222 | 222 | ) |
|
0 commit comments