diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index e432efc..db75859 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]

     steps:
     - uses: actions/checkout@v4
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
@@ -29,10 +30,10 @@ jobs:
         python -m pip install --upgrade pip wheel setuptools
        python -m pip install -e ".[dev]"

-    - name: Type check with mypy (only on Python 3.8)
+    - name: Type check with mypy (only on Python 3.9)
       run: |
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi

     - name: Test with pytest
       run: |
diff --git a/LICENSE.txt b/LICENSE.txt
index 6eebeb7..a3fd327 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License

-Copyright (C) 2023 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
 Original author: Vilhjálmur Þorsteinsson

 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/MANIFEST.in b/MANIFEST.in
index 04cc9cf..0c93fb0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 graft src
 prune src/tokenizer/__pycache__
 prune src/tokenizer/.mypy_cache
+prune src/tokenizer/.DS_Store
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 316697e..50bb5ff 100644
--- a/README.rst
+++ b/README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language
 processing tasks, such as word counting, parsing, spell checking,
 corpus generation, and statistical analysis of text.

-**Tokenizer** is a compact pure-Python (>= 3.8) executable
+**Tokenizer** is a compact pure-Python (>=3.9) executable
 program and module for tokenizing Icelandic text. It converts input text to
 streams of *tokens*, where each token is a separate word, punctuation sign,
 number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------

+* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
 * Version 3.4.4: Better handling of abbreviations
 * Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
 * Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
diff --git a/pyproject.toml b/pyproject.toml
index 5bd7107..2365a00 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,11 +1,10 @@
 [project]
 name = "tokenizer"
-version = "3.4.4"
+version = "3.4.5"
 description = "A tokenizer for Icelandic text"
 authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
 readme = { file = "README.rst", content-type = "text/x-rst" }
-license = { file = "LICENSE.txt" }
-# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
+license = "MIT"
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Developers",
@@ -16,7 +15,6 @@ classifiers = [
     "Natural Language :: Icelandic",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -27,7 +25,7 @@ classifiers = [
     "Topic :: Utilities",
     "Topic :: Text Processing :: Linguistic",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"

 [project.urls]
 Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +49,17 @@ where = ["src"]
 [tool.pytest.ini_options]
 filterwarnings = [
     # Ignore deprecation warnings in libraries, their problem not ours
-    "ignore::DeprecationWarning",
+    # "ignore::DeprecationWarning",
 ]

 [tool.ruff]
-line-length = 120
+line-length = 88

 [tool.black]
-line-length = 120
+line-length = 88

 [tool.isort]
 # This forces these imports to placed at the top
 known_future_library = ["__future__", "typing", "typing_extensions"]
 profile = "black"
-line_length = 120
+line_length = 88
diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
index 51fba02..6f14e9e 100644
--- a/src/tokenizer/__init__.py
+++ b/src/tokenizer/__init__.py
@@ -63,7 +63,7 @@ from .abbrev import Abbreviations, ConfigError


 __author__ = "Miðeind ehf."
-__copyright__ = "(C) 2023 Miðeind ehf."
+__copyright__ = "(C) 2016-2024 Miðeind ehf."
 __version__ = importlib.metadata.version("tokenizer")


diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index a08ce5b..a57c954 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -43,7 +43,6 @@


 class ConfigError(Exception):
-
     pass


@@ -51,17 +50,16 @@ class ConfigError(Exception):


 class OrderedSet(Generic[_T]):
-
-    """ Shim class to provide an ordered set API on top
-        of an OrderedDict. This is necessary to make abbreviation
-        lookups predictable and repeatable, which they would not be
-        if a standard Python set() was used. """
+    """Shim class to provide an ordered set API on top
+    of an OrderedDict. This is necessary to make abbreviation
+    lookups predictable and repeatable, which they would not be
+    if a standard Python set() was used."""

     def __init__(self) -> None:
         self._dict: Dict[_T, None] = OrderedDict()

     def add(self, item: _T) -> None:
-        """ Add an item at the end of the ordered set """
+        """Add an item at the end of the ordered set"""
         if item not in self._dict:
             self._dict[item] = None

@@ -73,9 +71,8 @@ def __iter__(self) -> Iterator[_T]:


 class Abbreviations:
-
-    """ Wrapper around dictionary of abbreviations,
-        initialized from the config file """
+    """Wrapper around dictionary of abbreviations,
+    initialized from the config file"""

     # Dictionary of abbreviations and their meanings
     DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
@@ -107,8 +104,8 @@ class Abbreviations:

     @staticmethod
     def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
-        """ Add an abbreviation to the dictionary.
-            Called from the config file handler. """
+        """Add an abbreviation to the dictionary.
+        Called from the config file handler."""
         # Check for sentence finishers
         finisher = False
         not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
         # Append the abbreviation and its meaning in tuple form
         # Multiple meanings are supported for each abbreviation
         Abbreviations.DICT[abbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                abbrev,
+                "-",
+            )
         )
         Abbreviations.MEANINGS.add(meaning)
         # Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 # as abbreviations, even though they are listed as such
                 # in the form 'Í.' and 'Á.' for use within person names
                 Abbreviations.WRONGDICT[wabbrev].add(
-                    BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                    BIN_Tuple(
+                        meaning,
+                        0,
+                        gender,
+                        "skst" if fl is None else fl,
+                        wabbrev,
+                        "-",
+                    )
                 )

         elif "." in abbrev:
@@ -182,7 +193,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 wabbrev = abbrev[:i] + abbrev[i + 1 :]
                 Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
                 Abbreviations.WRONGDICT[wabbrev].add(
-                    BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                    BIN_Tuple(
+                        meaning,
+                        0,
+                        gender,
+                        "skst" if fl is None else fl,
+                        wabbrev,
+                        "-",
+                    )
                 )
             if len(indices) > 2:
                 # 3 or 4 dots currently in vocabulary
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
                 Abbreviations.WRONGSINGLES.add(wabbrev)
                 Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
                 Abbreviations.WRONGDICT[wabbrev].add(
-                    BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+                    BIN_Tuple(
+                        meaning,
+                        0,
+                        gender,
+                        "skst" if fl is None else fl,
+                        wabbrev,
+                        "-",
+                    )
                 )
         if finisher:
             Abbreviations.FINISHERS.add(abbrev)
@@ -233,7 +258,7 @@ def has_abbreviation(meaning: str) -> bool:

     @staticmethod
     def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
-        """ Lookup meaning(s) of abbreviation, if available. """
""" + """Look up meaning(s) of abbreviation, if available.""" m = Abbreviations.DICT.get(abbrev) if not m: m = Abbreviations.WRONGDICT.get(abbrev) @@ -241,7 +266,7 @@ def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]: @staticmethod def _handle_abbreviations(s: str) -> None: - """ Handle abbreviations in the settings section """ + """Handle abbreviations in the settings section""" # Format: abbrev[*] = "meaning" gender (kk|kvk|hk) # An asterisk after an abbreviation ending with a period # indicates that the abbreviation may finish a sentence @@ -272,21 +297,23 @@ def _handle_abbreviations(s: str) -> None: @staticmethod def _handle_not_abbreviations(s: str) -> None: - """ Handle not_abbreviations in the settings section """ + """Handle not_abbreviations in the settings section""" if len(s) < 3 or s[0] != '"' or s[-1] != '"': raise ConfigError("not_abbreviations should be enclosed in double quotes") Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1]) @staticmethod def initialize(): - """ Read the abbreviations config file """ + """Read the abbreviations config file""" with Abbreviations._lock: if len(Abbreviations.DICT): # Already initialized return section = None - config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8") + config = open_text( + package="tokenizer", resource="Abbrev.conf", encoding="utf-8" + ) # TODO: Deprecated in Python 3.13 for s in config: # Ignore comments ix = s.find("#") diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py index 9811192..812aeb7 100644 --- a/src/tokenizer/definitions.py +++ b/src/tokenizer/definitions.py @@ -534,8 +534,8 @@ class PersonNameTuple(NamedTuple): "N": "Norður", } -_unit_lambda: Callable[[str], str] = ( - lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit +_unit_lambda: Callable[[str], str] = lambda unit: ( + unit + r"(?!\w)" if unit[-1].isalpha() else unit ) SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys()) diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py index 95e0e63..b6a94eb 100755 --- a/src/tokenizer/main.py +++ b/src/tokenizer/main.py @@ -71,8 +71,12 @@ group = parser.add_mutually_exclusive_group() -group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true") -group.add_argument("--json", help="Output one token per line in JSON format", action="store_true") +group.add_argument( + "--csv", help="Output one token per line in CSV format", action="store_true" +) +group.add_argument( + "--json", help="Output one token per line in JSON format", action="store_true" +) parser.add_argument( "-s", @@ -92,7 +96,10 @@ "-p", "--coalesce_percent", action="store_true", - help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"), + help=( + "Numbers combined into one token with percentage word forms " + "(prósent/prósentustig/hundraðshlutar)" + ), ) parser.add_argument( @@ -127,7 +134,10 @@ "-c", "--convert_numbers", action="store_true", - help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"), + help=( + "English-style decimal points and thousands separators " + "in numbers changed to Icelandic style" + ), ) parser.add_argument(