Bumped version to 3.4.5, bumped min Python version to 3.9, fmt, cleanup, etc.
sveinbjornt committed Aug 22, 2024
1 parent 8750e9c commit cdba944
Showing 9 changed files with 81 additions and 43 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
@@ -29,10 +30,10 @@ jobs:
python -m pip install --upgrade pip wheel setuptools
python -m pip install -e ".[dev]"
-    - name: Type check with mypy (only on Python 3.8)
+    - name: Type check with mypy (only on Python 3.9)
      run: |
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
-        if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
+        if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
- name: Test with pytest
run: |
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,6 +1,6 @@
MIT License

-Copyright (C) 2023 Miðeind ehf.
+Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson

Permission is hereby granted, free of charge, to any person obtaining a copy
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
graft src
prune src/tokenizer/__pycache__
prune src/tokenizer/.mypy_cache
+prune src/tokenizer/.DS_Store
3 changes: 2 additions & 1 deletion README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
tasks, such as word counting, parsing, spell checking, corpus generation, and
statistical analysis of text.

-**Tokenizer** is a compact pure-Python (>= 3.8) executable
+**Tokenizer** is a compact pure-Python (>=3.9) executable
program and module for tokenizing Icelandic text. It converts input text to
streams of *tokens*, where each token is a separate word, punctuation sign,
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------

+* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
* Version 3.4.4: Better handling of abbreviations
* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
* Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
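
As context for the README excerpt above, here is a minimal usage sketch based on the documented tokenize() generator (illustrative only; the authoritative API is described in the project README):

    from tokenizer import TOK, tokenize

    # Illustrative sketch: tokenize() yields Tok objects lazily
    for token in tokenize("Hann kom kl. 15:30 með 100,5 kg af dóti."):
        # token.kind is the token type; TOK.descr maps it to a readable name;
        # token.txt is the original text span (may be empty for sentence markers)
        print(TOK.descr[token.kind], token.txt or "-")
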
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -1,11 +1,10 @@
[project]
name = "tokenizer"
-version = "3.4.4"
+version = "3.4.5"
description = "A tokenizer for Icelandic text"
authors = [{ name = "Miðeind ehf.", email = "[email protected]" }]
readme = { file = "README.rst", content-type = "text/x-rst" }
-license = { file = "LICENSE.txt" }
-# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
+license = "MIT"
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
@@ -16,7 +15,6 @@ classifiers = [
    "Natural Language :: Icelandic",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -27,7 +25,7 @@ classifiers = [
"Topic :: Utilities",
"Topic :: Text Processing :: Linguistic",
]
-requires-python = ">=3.8"
+requires-python = ">=3.9"

[project.urls]
Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +49,17 @@ where = ["src"]
[tool.pytest.ini_options]
filterwarnings = [
    # Ignore deprecation warnings in libraries, their problem not ours
-    "ignore::DeprecationWarning",
+    # "ignore::DeprecationWarning",
]

[tool.ruff]
-line-length = 120
+line-length = 88

[tool.black]
-line-length = 120
+line-length = 88

[tool.isort]
# This forces these imports to placed at the top
known_future_library = ["__future__", "typing", "typing_extensions"]
profile = "black"
-line_length = 120
+line_length = 88
2 changes: 1 addition & 1 deletion src/tokenizer/__init__.py
@@ -63,7 +63,7 @@
from .abbrev import Abbreviations, ConfigError

__author__ = "Miðeind ehf."
-__copyright__ = "(C) 2023 Miðeind ehf."
+__copyright__ = "(C) 2016-2024 Miðeind ehf."
__version__ = importlib.metadata.version("tokenizer")


69 changes: 48 additions & 21 deletions src/tokenizer/abbrev.py
@@ -43,25 +43,23 @@


class ConfigError(Exception):
-
    pass


_T = TypeVar("_T")


class OrderedSet(Generic[_T]):
-
-    """ Shim class to provide an ordered set API on top
-        of an OrderedDict. This is necessary to make abbreviation
-        lookups predictable and repeatable, which they would not be
-        if a standard Python set() was used. """
+    """Shim class to provide an ordered set API on top
+    of an OrderedDict. This is necessary to make abbreviation
+    lookups predictable and repeatable, which they would not be
+    if a standard Python set() was used."""

def __init__(self) -> None:
self._dict: Dict[_T, None] = OrderedDict()

    def add(self, item: _T) -> None:
-        """ Add an item at the end of the ordered set """
+        """Add an item at the end of the ordered set"""
if item not in self._dict:
self._dict[item] = None
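
As an aside on the OrderedSet shim above: a hypothetical snippet (not part of this commit) illustrating the repeatable iteration order it exists to guarantee:

    # Hypothetical usage: insertion order is preserved and duplicates are
    # ignored, so iteration over meanings is deterministic across runs
    s: OrderedSet[str] = OrderedSet()
    for meaning in ("herra", "hæstvirtur", "herra"):
        s.add(meaning)  # the second "herra" is a no-op
    print(list(s))  # ['herra', 'hæstvirtur'], always in insertion order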

@@ -73,9 +71,8 @@ def __iter__(self) -> Iterator[_T]:


class Abbreviations:
-
-    """ Wrapper around dictionary of abbreviations,
-        initialized from the config file """
+    """Wrapper around dictionary of abbreviations,
+    initialized from the config file"""

# Dictionary of abbreviations and their meanings
DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
@@ -107,8 +104,8 @@ class Abbreviations:

@staticmethod
    def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
-        """ Add an abbreviation to the dictionary.
-            Called from the config file handler. """
+        """Add an abbreviation to the dictionary.
+        Called from the config file handler."""
# Check for sentence finishers
finisher = False
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# Append the abbreviation and its meaning in tuple form
# Multiple meanings are supported for each abbreviation
Abbreviations.DICT[abbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                abbrev,
+                "-",
+            )
)
Abbreviations.MEANINGS.add(meaning)
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# as abbreviations, even though they are listed as such
# in the form 'Í.' and 'Á.' for use within person names
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)

elif "." in abbrev:
@@ -182,7 +193,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
wabbrev = abbrev[:i] + abbrev[i + 1 :]
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)
if len(indices) > 2:
# 3 or 4 dots currently in vocabulary
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
Abbreviations.WRONGSINGLES.add(wabbrev)
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
-            BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
+            BIN_Tuple(
+                meaning,
+                0,
+                gender,
+                "skst" if fl is None else fl,
+                wabbrev,
+                "-",
+            )
)
if finisher:
Abbreviations.FINISHERS.add(abbrev)
@@ -233,15 +258,15 @@ def has_abbreviation(meaning: str) -> bool:

@staticmethod
    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
-        """ Lookup meaning(s) of abbreviation, if available. """
+        """Look up meaning(s) of abbreviation, if available."""
m = Abbreviations.DICT.get(abbrev)
if not m:
m = Abbreviations.WRONGDICT.get(abbrev)
return list(m) if m else None

@staticmethod
    def _handle_abbreviations(s: str) -> None:
-        """ Handle abbreviations in the settings section """
+        """Handle abbreviations in the settings section"""
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
# An asterisk after an abbreviation ending with a period
# indicates that the abbreviation may finish a sentence
@@ -272,21 +297,23 @@ def _handle_abbreviations(s: str) -> None:

@staticmethod
    def _handle_not_abbreviations(s: str) -> None:
-        """ Handle not_abbreviations in the settings section """
+        """Handle not_abbreviations in the settings section"""
if len(s) < 3 or s[0] != '"' or s[-1] != '"':
raise ConfigError("not_abbreviations should be enclosed in double quotes")
Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])

@staticmethod
    def initialize():
-        """ Read the abbreviations config file """
+        """Read the abbreviations config file"""
with Abbreviations._lock:
if len(Abbreviations.DICT):
# Already initialized
return

section = None
-            config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
+            config = open_text(
+                package="tokenizer", resource="Abbrev.conf", encoding="utf-8"
+            )  # TODO: Deprecated in Python 3.13
for s in config:
# Ignore comments
ix = s.find("#")
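
Regarding the TODO above: importlib.resources.open_text() is deprecated; one possible replacement (a sketch, assuming Abbrev.conf ships inside the tokenizer package) is the files() API, available since Python 3.9:

    from importlib.resources import files

    # Sketch of a non-deprecated equivalent to the open_text() call above
    config = (files("tokenizer") / "Abbrev.conf").open("r", encoding="utf-8")
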
4 changes: 2 additions & 2 deletions src/tokenizer/definitions.py
@@ -534,8 +534,8 @@ class PersonNameTuple(NamedTuple):
"N": "Norður",
}

-_unit_lambda: Callable[[str], str] = (
-    lambda unit: unit + r"(?!\w)" if unit[-1].isalpha() else unit
+_unit_lambda: Callable[[str], str] = lambda unit: (
+    unit + r"(?!\w)" if unit[-1].isalpha() else unit
)

SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
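
The reformatted lambda above appends a negative lookahead to letter-final units so that e.g. "km" does not match when another word character follows. A quick hypothetical check of the generated pattern:

    import re

    # Hypothetical check mirroring the lambda above: letter-final units
    # must not be followed by another word character
    print(bool(re.search("km" + r"(?!\w)", "10 km heim")))       # True
    print(bool(re.search("km" + r"(?!\w)", "10 km2 af landi")))  # False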
18 changes: 14 additions & 4 deletions src/tokenizer/main.py
@@ -71,8 +71,12 @@

group = parser.add_mutually_exclusive_group()

-group.add_argument("--csv", help="Output one token per line in CSV format", action="store_true")
-group.add_argument("--json", help="Output one token per line in JSON format", action="store_true")
+group.add_argument(
+    "--csv", help="Output one token per line in CSV format", action="store_true"
+)
+group.add_argument(
+    "--json", help="Output one token per line in JSON format", action="store_true"
+)

parser.add_argument(
"-s",
@@ -92,7 +96,10 @@
"-p",
"--coalesce_percent",
action="store_true",
-    help=("Numbers combined into one token with percentage word forms " "(prósent/prósentustig/hundraðshlutar)"),
+    help=(
+        "Numbers combined into one token with percentage word forms "
+        "(prósent/prósentustig/hundraðshlutar)"
+    ),
)

parser.add_argument(
@@ -127,7 +134,10 @@
"-c",
"--convert_numbers",
action="store_true",
-    help=("English-style decimal points and thousands separators " "in numbers changed to Icelandic style"),
+    help=(
+        "English-style decimal points and thousands separators "
+        "in numbers changed to Icelandic style"
+    ),
)

parser.add_argument(
