Bumped version to 3.4.5, bumped min Python version to 3.9, fmt, cleanup, etc.
sveinbjornt committed Aug 22, 2024
1 parent 8750e9c commit cdba944
Showing 9 changed files with 81 additions and 43 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
@@ -29,10 +30,10 @@ jobs:
python -m pip install --upgrade pip wheel setuptools
python -m pip install -e ".[dev]"
-    - name: Type check with mypy (only on Python 3.8)
+    - name: Type check with mypy (only on Python 3.9)
      run: |
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
- name: Test with pytest
run: |
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,6 +1,6 @@
MIT License

-Copyright (C) 2023 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson

Permission is hereby granted, free of charge, to any person obtaining a copy
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
graft src
prune src/tokenizer/__pycache__
prune src/tokenizer/.mypy_cache
+prune src/tokenizer/.DS_Store
3 changes: 2 additions & 1 deletion README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
tasks, such as word counting, parsing, spell checking, corpus generation, and
statistical analysis of text.

-**Tokenizer** is a compact pure-Python (>= 3.8) executable
+**Tokenizer** is a compact pure-Python (>=3.9) executable
program and module for tokenizing Icelandic text. It converts input text to
streams of *tokens*, where each token is a separate word, punctuation sign,
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------

+* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
* Version 3.4.4: Better handling of abbreviations
* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
* Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
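
As context for the README excerpt above, here is a minimal usage sketch based on the documented tokenize() generator (illustrative only; the authoritative API is described in the project README):

    from tokenizer import TOK, tokenize

    # Illustrative sketch: tokenize() yields Tok objects lazily
    for token in tokenize("Hann kom kl. 15:30 með 100,5 kg af dóti."):
        # token.kind is the token type; TOK.descr maps it to a readable name;
        # token.txt is the original text span (may be empty for sentence markers)
        print(TOK.descr[token.kind], token.txt or "-")
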
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -1,11 +1,10 @@
[project]
name = "tokenizer"
-version = "3.4.4"
+version = "3.4.5"
description = "A tokenizer for Icelandic text"
authors = [{ name = "Miðeind ehf.", email = "[email protected]" }]
readme = { file = "README.rst", content-type = "text/x-rst" }
-license = { file = "LICENSE.txt" }
-# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
+license = "MIT"
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
@@ -16,7 +15,6 @@ classifiers = [
    "Natural Language :: Icelandic",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -27,7 +25,7 @@ classifiers = [
"Topic :: Utilities",
"Topic :: Text Processing :: Linguistic",
]
-requires-python = ">=3.8"
+requires-python = ">=3.9"

[project.urls]
Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +49,17 @@ where = ["src"]
[tool.pytest.ini_options]
filterwarnings = [
    # Ignore deprecation warnings in libraries, their problem not ours
-    "ignore::DeprecationWarning",
+    # "ignore::DeprecationWarning",
]

[tool.ruff]
-line-length = 120
+line-length = 88

[tool.black]
-line-length = 120
+line-length = 88

[tool.isort]
# This forces these imports to placed at the top
known_future_library = ["__future__", "typing", "typing_extensions"]
profile = "black"
-line_length = 120
+line_length = 88
2 changes: 1 addition & 1 deletion src/tokenizer/__init__.py
@@ -63,7 +63,7 @@
from .abbrev import Abbreviations, ConfigError

__author__ = "Miðeind ehf."
-__copyright__ = "(C) 2023 Miðeind ehf."
+__copyright__ = "(C) 2016-2024 Miðeind ehf."
__version__ = importlib.metadata.version("tokenizer")


69 changes: 48 additions & 21 deletions src/tokenizer/abbrev.py
@@ -43,25 +43,23 @@


class ConfigError(Exception):
-
    pass


_T = TypeVar("_T")


class OrderedSet(Generic[_T]):
-
-    """ Shim class to provide an ordered set API on top
-        of an OrderedDict. This is necessary to make abbreviation
-        lookups predictable and repeatable, which they would not be
-        if a standard Python set() was used. """
+    """Shim class to provide an ordered set API on top
+    of an OrderedDict. This is necessary to make abbreviation
+    lookups predictable and repeatable, which they would not be
+    if a standard Python set() was used."""

def __init__(self) -> None:
self._dict: Dict[_T, None] = OrderedDict()

    def add(self, item: _T) -> None:
-        """ Add an item at the end of the ordered set """
+        """Add an item at the end of the ordered set"""
if item not in self._dict:
self._dict[item] = None
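
As an aside on the OrderedSet shim above: a hypothetical snippet (not part of this commit) illustrating the repeatable iteration order it exists to guarantee:

    # Hypothetical usage: insertion order is preserved and duplicates are
    # ignored, so iteration over meanings is deterministic across runs
    s: OrderedSet[str] = OrderedSet()
    for meaning in ("herra", "hæstvirtur", "herra"):
        s.add(meaning)  # the second "herra" is a no-op
    print(list(s))  # ['herra', 'hæstvirtur'], always in insertion order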

@@ -73,9 +71,8 @@ def __iter__(self) -> Iterator[_T]:


class Abbreviations:
-
-    """ Wrapper around dictionary of abbreviations,
-        initialized from the config file """
+    """Wrapper around dictionary of abbreviations,
+    initialized from the config file"""

# Dictionary of abbreviations and their meanings
DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
@@ -107,8 +104,8 @@ class Abbreviations:

@staticmethod
    def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
-        """ Add an abbreviation to the dictionary.
-            Called from the config file handler. """
+        """Add an abbreviation to the dictionary.
+        Called from the config file handler."""
# Check for sentence finishers
finisher = False
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# Append the abbreviation and its meaning in tuple form
# Multiple meanings are supported for each abbreviation
Abbreviations.DICT[abbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                abbrev,
+                "-",
+            )
)
Abbreviations.MEANINGS.add(meaning)
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# as abbreviations, even though they are listed as such
# in the form 'Í.' and 'Á.' for use within person names
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)

elif "." in abbrev:
@@ -182,7 +193,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
wabbrev = abbrev[:i] + abbrev[i + 1 :]
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)
if len(indices) > 2:
# 3 or 4 dots currently in vocabulary
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
Abbreviations.WRONGSINGLES.add(wabbrev)
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)
if finisher:
Abbreviations.FINISHERS.add(abbrev)
@@ -233,15 +258,15 @@ def has_abbreviation(meaning: str) -> bool:

@staticmethod
    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
-        """ Lookup meaning(s) of abbreviation, if available. """
+        """Look up meaning(s) of abbreviation, if available."""
m = Abbreviations.DICT.get(abbrev)
if not m:
m = Abbreviations.WRONGDICT.get(abbrev)
return list(m) if m else None

@staticmethod
    def _handle_abbreviations(s: str) -> None:
-        """ Handle abbreviations in the settings section """
+        """Handle abbreviations in the settings section"""
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
# An asterisk after an abbreviation ending with a period
# indicates that the abbreviation may finish a sentence
@@ -272,21 +297,23 @@ def _handle_abbreviations(s: str) -> None:

@staticmethod
    def _handle_not_abbreviations(s: str) -> None:
-        """ Handle not_abbreviations in the settings section """
+        """Handle not_abbreviations in the settings section"""
if len(s) < 3 or s[0] != '"' or s[-1] != '"':
raise ConfigError("not_abbreviations should be enclosed in double quotes")
Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])

@staticmethod
    def initialize():
-        """ Read the abbreviations config file """
+        """Read the abbreviations config file"""
with Abbreviations._lock:
if len(Abbreviations.DICT):
# Already initialized
return

section = None
-            config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
+            config = open_text(
+                package="tokenizer", resource="Abbrev.conf", encoding="utf-8"
+            )  # TODO: Deprecated in Python 3.13
for s in config:
# Ignore comments
ix = s.find("#")
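
Regarding the TODO above: importlib.resources.open_text() is deprecated; one possible replacement (a sketch, assuming Abbrev.conf ships inside the tokenizer package) is the files() API, available since Python 3.9:

    from importlib.resources import files

    # Sketch of a non-deprecated equivalent to the open_text() call above
    config = (files("tokenizer") / "Abbrev.conf").open("r", encoding="utf-8")
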
4 changes: 2 additions & 2 deletions src/tokenizer/definitions.py
@@ -534,8 +534,8 @@ class PersonNameTuple(NamedTuple):
"N": "Norður",
}

-_unit_lambda: Callable[[str], str] = (
-    lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit
+_unit_lambda: Callable[[str], str] = lambda unit: (
+    unit + r"(?!\w)" if unit[-1].isalpha() else unit
)

SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
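
The reformatted lambda above appends a negative lookahead to letter-final units so that e.g. "km" does not match when another word character follows. A quick hypothetical check of the generated pattern:

    import re

    # Hypothetical check mirroring the lambda above: letter-final units
    # must not be followed by another word character
    print(bool(re.search("km" + r"(?!\w)", "10 km heim")))       # True
    print(bool(re.search("km" + r"(?!\w)", "10 km2 af landi")))  # False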
18 changes: 14 additions & 4 deletions src/tokenizer/main.py
@@ -71,8 +71,12 @@

group = parser.add_mutually_exclusive_group()

-group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true")
-group.add_argument("--json", help="Output one token per line in JSON format", action="store_true")
+group.add_argument(
+    "--csv", help="Output one token per line in CSV format", action="store_true"
+)
+group.add_argument(
+    "--json", help="Output one token per line in JSON format", action="store_true"
+)

parser.add_argument(
"-s",
@@ -92,7 +96,10 @@
"-p",
"--coalesce_percent",
action="store_true",
-    help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"),
+    help=(
+        "Numbers combined into one token with percentage word forms "
+        "(prósent/prósentustig/hundraðshlutar)"
+    ),
)

parser.add_argument(
@@ -127,7 +134,10 @@
"-c",
"--convert_numbers",
action="store_true",
-    help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"),
+    help=(
+        "English-style decimal points and thousands separators "
+        "in numbers changed to Icelandic style"
+    ),
)

parser.add_argument(
