Merge pull request #20 from isaacus-dev/dev

umarbutler · web-flow · commit e14506a8d0fd · 2025-10-26T14:55:36.000+11:00
feat: release v3.2.4
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.2.4] - 2025-10-26
+### Fixed
+- Fixed splitters being sorted lexicographically rather than by length, which should improve the meaningfulness of chunks.
+
+### Fixed
+- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).
+
 ## [3.2.3] - 2025-08-13
 ### Fixed
 - Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "3.2.3"
+version = "3.2.4"
 authors = [
     {name="Isaacus", email="support@isaacus.com"},
     {name="Umar Butler", email="umar@umar.au"},
@@ -122,6 +122,8 @@ target-version = "py312"
 dev = [
     "build>=1.2.2.post1",
     "hatch>=1.14.1",
+    "ipykernel>=6.31.0",
+    "isort>=6.1.0",
     "nltk>=3.9.1",
     "pytest>=8.4.0",
     "pytest-cov>=6.1.1",
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
@@ -5,9 +5,9 @@
 import inspect
 
 from typing import Callable, Sequence, TYPE_CHECKING
+from functools import lru_cache
 from itertools import accumulate
 from contextlib import suppress
-from functools import lru_cache
 
 import mpire
 
@@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
     # - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a semantically meaningful non-whitespace splitter, then that whitespace character;
     # - A semantically meaningful non-whitespace splitter.
     if "\n" in text or "\r" in text:
-        splitter = max(re.findall(r"[\r\n]+", text))
+        splitter = max(re.findall(r"[\r\n]+", text), key=len)
 
     elif "\t" in text:
-        splitter = max(re.findall(r"\t+", text))
+        splitter = max(re.findall(r"\t+", text), key=len)
 
     elif re.search(r"\s", text):
-        splitter = max(re.findall(r"\s+", text))
+        splitter = max(re.findall(r"\s+", text), key=len)
         
         # If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences.
         if len(splitter) == 1:
@@ -216,7 +216,7 @@ def chunk(
                 text=split,
                 chunk_size=local_chunk_size,
                 token_counter=token_counter,
-                offsets=return_offsets,
+                offsets=True,
                 _recursion_depth=_recursion_depth + 1,
                 _start=split_start,
             )