Skip to content

Commit e14506a

Browse files
authored
Merge pull request #20 from isaacus-dev/dev
feat: release v3.2.4
2 parents 70c6fa6 + 292e4ba commit e14506a

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
## Changelog 🔄
22
All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
33

4+
## [3.2.4] - 2025-10-26
5+
### Fixed
6+
- Fixed splitters being sorted lexicographically rather than by length, which should improve the meaningfulness of chunks.
7+
8+
### Fixed
9+
- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).
10+
411
## [3.2.3] - 2025-08-13
512
### Fixed
613
- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "semchunk"
7-
version = "3.2.3"
7+
version = "3.2.4"
88
authors = [
99
{name="Isaacus", email="[email protected]"},
1010
{name="Umar Butler", email="[email protected]"},
@@ -122,6 +122,8 @@ target-version = "py312"
122122
dev = [
123123
"build>=1.2.2.post1",
124124
"hatch>=1.14.1",
125+
"ipykernel>=6.31.0",
126+
"isort>=6.1.0",
125127
"nltk>=3.9.1",
126128
"pytest>=8.4.0",
127129
"pytest-cov>=6.1.1",

src/semchunk/semchunk.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
import inspect
66

77
from typing import Callable, Sequence, TYPE_CHECKING
8+
from functools import lru_cache
89
from itertools import accumulate
910
from contextlib import suppress
10-
from functools import lru_cache
1111

1212
import mpire
1313

@@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
6868
# - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a semantically meaningful non-whitespace splitter, then that whitespace character;
6969
# - A semantically meaningful non-whitespace splitter.
7070
if "\n" in text or "\r" in text:
71-
splitter = max(re.findall(r"[\r\n]+", text))
71+
splitter = max(re.findall(r"[\r\n]+", text), key=len)
7272

7373
elif "\t" in text:
74-
splitter = max(re.findall(r"\t+", text))
74+
splitter = max(re.findall(r"\t+", text), key=len)
7575

7676
elif re.search(r"\s", text):
77-
splitter = max(re.findall(r"\s+", text))
77+
splitter = max(re.findall(r"\s+", text), key=len)
7878

7979
# If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences.
8080
if len(splitter) == 1:
@@ -216,7 +216,7 @@ def chunk(
216216
text=split,
217217
chunk_size=local_chunk_size,
218218
token_counter=token_counter,
219-
offsets=return_offsets,
219+
offsets=True,
220220
_recursion_depth=_recursion_depth + 1,
221221
_start=split_start,
222222
)

0 commit comments

Comments
 (0)