diff --git a/src/tokenizer/__init__.py b/src/tokenizer/__init__.py
index 6f14e9e..d57468b 100644
--- a/src/tokenizer/__init__.py
+++ b/src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
 """
 
-    Copyright(C) 2022 Miðeind ehf.
+    Copyright(C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -64,8 +64,7 @@
 __author__ = "Miðeind ehf."
 __copyright__ = "(C) 2016-2024 Miðeind ehf."
 
-__version__ = importlib.metadata.version("tokenizer")
-
+__version__ = importlib.metadata.version(__name__)
 
 __all__ = (
     "__author__",
diff --git a/src/tokenizer/abbrev.py b/src/tokenizer/abbrev.py
index 010cc29..bcee6b2 100644
--- a/src/tokenizer/abbrev.py
+++ b/src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
 
     Abbreviations module for tokenization of Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/definitions.py b/src/tokenizer/definitions.py
index 1093d78..fd3eb42 100644
--- a/src/tokenizer/definitions.py
+++ b/src/tokenizer/definitions.py
@@ -2,7 +2,7 @@
 
     Definitions used for tokenization of Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/main.py b/src/tokenizer/main.py
index dea083e..55d74c1 100755
--- a/src/tokenizer/main.py
+++ b/src/tokenizer/main.py
@@ -3,7 +3,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/src/tokenizer/tokenizer.py b/src/tokenizer/tokenizer.py
index 4108108..1089581 100644
--- a/src/tokenizer/tokenizer.py
+++ b/src/tokenizer/tokenizer.py
@@ -2,7 +2,7 @@
 
     Tokenizer for Icelandic text
 
-    Copyright (C) 2022 Miðeind ehf.
+    Copyright (C) 2016-2024 Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/test/test_detokenize.py b/test/test_detokenize.py
index db5f413..09545e1 100644
--- a/test/test_detokenize.py
+++ b/test/test_detokenize.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
@@ -37,7 +37,7 @@ def test_detokenize() -> None:
 
-    options = { "normalize": True }
+    options = {"normalize": True}
 
     def should_be_equal(s: str) -> None:
         toklist = t.tokenize(s, **options)
@@ -58,19 +58,18 @@ def should_be(s1: str, s2: str) -> None:
 
     should_be_equal("Páll veiddi 74 cm. lax í Norðurá þann 1.3.")
     should_be(
-        "Páll var með \"netfangið\" palli@einn.i.heiminum.is.",
-        "Páll var með „netfangið“ palli@einn.i.heiminum.is."
+        'Páll var með "netfangið" palli@einn.i.heiminum.is.',
+        "Páll var með „netfangið“ palli@einn.i.heiminum.is.",
     )
 
     # !!! BUG
-    #should_be(
+    # should_be(
     #    "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",
     #    "Páll var með „netfangið“, þ.e.a.s. („þetta“).",
-    #)
+    # )
 
-    options = { "normalize": False }
+    options = {"normalize": False}
     should_be_equal("Páll var með „netfangið“, þ.e.a.s. („þetta“).")
-    should_be_equal("Páll var með \"netfangið\" palli@einn.i.heiminum.is.")
-    should_be_equal("Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").")
-
+    should_be_equal('Páll var með "netfangið" palli@einn.i.heiminum.is.')
+    should_be_equal('Páll var með "netfangið", þ.e.a.s. ("þetta").')
diff --git a/test/test_index_calculation.py b/test/test_index_calculation.py
index 0b59e35..60a81a8 100644
--- a/test/test_index_calculation.py
+++ b/test/test_index_calculation.py
@@ -6,7 +6,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
 
     This software is licensed under the MIT License:
@@ -169,7 +169,6 @@ def test_small_difficult_cases() -> None:
     assert char_indexes == [0, 2, 4]
     assert byte_indexes == [0, 2, 4]
 
-
    # Two byte characters
     for x in ["þ", "æ", "á"]:
         s = x
@@ -230,12 +229,11 @@ def test_small_difficult_cases() -> None:
     assert char_indexes == [0, 2, 4]
     assert byte_indexes == [0, 3, 6]
 
-
     # Two character characters
     # These strings contain two unicode code points that are rendered as one letter.
     # They are counted as two characters in python.
     # In addition the accent and umlaut characters are two bytes.
-    for x in ["a"+ACCENT, "o"+UMLAUT]:
+    for x in ["a" + ACCENT, "o" + UMLAUT]:
         s = x
@@ -288,11 +286,11 @@ def test_small_difficult_cases() -> None:
     # example chars:
     # " a´ a´"
     # 012345
-    # ^  ^ 
+    # ^  ^
     # example bytes:
     # " a´_ a´_"
     # 01234567
-    # ^   ^ 
+    # ^   ^
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
     assert char_indexes == [0, 3]
     assert byte_indexes == [0, 4]
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
     assert char_indexes == [0, 3, 6]
     assert byte_indexes == [0, 4, 8]
@@ -302,7 +300,6 @@ def test_small_difficult_cases() -> None:
 
-
     # The em-dash is 3 bytes
     for x in [EM_DASH]:
         s = x
@@ -361,7 +358,7 @@ def test_small_difficult_cases() -> None:
     # example bytes:
     # " a__ a__"
     # 01234567
-    # ^   ^ 
+    # ^   ^
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
     assert char_indexes == [0, 2]
@@ -379,25 +376,181 @@ def test_larger_case() -> None:
     #  x           x             x  xx      x
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens([s])
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]
 
 
 def test_iterator_cases() -> None:
-    s = ["Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ", "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."]
+    s = [
+        "Þessi ",
+        "setning ",
+        "er ",
+        "í ",
+        "lengra ",
+        "lagi ",
+        "og ",
+        "er ",
+        "með ",
+        "bæði ",
+        "eins ",
+        "og ",
+        "tveggja ",
+        "bæta ",
+        "stafi.",
+    ]
     # (char and byte indexes in a similar test above)
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+    ]
     toks = tokenizer.parse_tokens(s)
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
-    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
-    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
+    assert char_indexes == [
+        0,
+        5,
+        13,
+        16,
+        18,
+        25,
+        30,
+        33,
+        36,
+        40,
+        45,
+        50,
+        53,
+        61,
+        66,
+        72,
+        73,
+    ]
+    assert byte_indexes == [
+        0,
+        6,
+        14,
+        17,
+        20,
+        27,
+        32,
+        35,
+        38,
+        43,
+        50,
+        55,
+        58,
+        66,
+        72,
+        78,
+        79,
+    ]
 
     s = ["Stutt setning.", "", "Önnur setning."]
     # 01234567890123 45678901234567
@@ -493,11 +646,15 @@ def test_lengthening_substitutions() -> None:
     # ^    ^  ^   ^        ^
     #          x        x
     # ! lengthening happens here (3ji->þriðji)
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
     assert char_indexes == [0, 5, 8, 12, 21]
     assert byte_indexes == [0, 6, 9, 13, 23]
-    toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
+    toks = tokenizer.parse_tokens(
+        s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
+    )
     char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
     assert char_indexes == [0, 5, 8, 12, 21, 22]
     assert byte_indexes == [0, 6, 9, 13, 23, 24]
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
index 21431d8..7b37088 100755
--- a/test/test_tokenizer.py
+++ b/test/test_tokenizer.py
@@ -5,7 +5,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
     Original author: Vilhjálmur Þorsteinsson
 
     This software is licensed under the MIT License:
diff --git a/test/test_tokenizer_tok.py b/test/test_tokenizer_tok.py
index 3ee1f46..c0caa7e 100644
--- a/test/test_tokenizer_tok.py
+++ b/test/test_tokenizer_tok.py
@@ -3,7 +3,7 @@
 
     Tests for Tokenizer module
 
-    Copyright (C) 2022 by Miðeind ehf.
+    Copyright (C) 2016-2024 by Miðeind ehf.
 
     This software is licensed under the MIT License:
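
Reviewer sketch (not part of the diff): apart from the copyright-year and formatting cleanups, the one behavioral change above is in src/tokenizer/__init__.py, where __version__ is now looked up via importlib.metadata.version(__name__) instead of the hard-coded importlib.metadata.version("tokenizer"). Inside the package's __init__.py, __name__ evaluates to "tokenizer", which is the same distribution name the old call spelled out, so the lookup result is unchanged. A minimal check of the behavior, assuming the tokenizer distribution is installed in the current environment:

    import importlib.metadata

    # Reads the version string from the installed distribution's metadata;
    # inside src/tokenizer/__init__.py, __name__ == "tokenizer", so this is
    # exactly the lookup the old code wrote out as version("tokenizer").
    print(importlib.metadata.version("tokenizer"))

Note that version(__name__) only works while the import name and the distribution name coincide, as they do for this package.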