Commit 12a2db2

Updated header + formatting
sveinbjornt committed Aug 22, 2024
1 parent 7c2bbb8 commit 12a2db2
Showing 9 changed files with 193 additions and 38 deletions.
5 changes: 2 additions & 3 deletions src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
"""
Copyright(C) 2022 Miðeind ehf.
Copyright(C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -64,8 +64,7 @@

__author__ = "Miðeind ehf."
__copyright__ = "(C) 2016-2024 Miðeind ehf."
__version__ = importlib.metadata.version("tokenizer")

__version__ = importlib.metadata.version(__name__)

__all__ = (
"__author__",
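Aside from the copyright-year bumps, the one functional change in this commit is in src/tokenizer/__init__.py: the package version is now read from the installed distribution metadata via the module's own __name__ ("tokenizer") rather than a hard-coded distribution name. A minimal sketch of that pattern follows; the try/except fallback and its placeholder version string are assumptions for illustration, not part of the commit.

```python
# Resolve the package version from installed distribution metadata.
# Assumes the package has been installed (e.g. via pip), so that a
# distribution record named "tokenizer" exists for importlib.metadata.
import importlib.metadata

try:
    # Inside src/tokenizer/__init__.py, __name__ == "tokenizer", so this
    # looks up the same distribution as the previous hard-coded string.
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
    # Hypothetical fallback for an uninstalled source checkout.
    __version__ = "0.0.0+unknown"
```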
2 changes: 1 addition & 1 deletion src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
Abbreviations module for tokenization of Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
2 changes: 1 addition & 1 deletion src/tokenizer/definitions.py
@@ -2,7 +2,7 @@
Definitions used for tokenization of Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
2 changes: 1 addition & 1 deletion src/tokenizer/main.py
@@ -3,7 +3,7 @@
Tokenizer for Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
2 changes: 1 addition & 1 deletion src/tokenizer/tokenizer.py
@@ -2,7 +2,7 @@
Tokenizer for Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
19 changes: 9 additions & 10 deletions test/test_detokenize.py
@@ -6,7 +6,7 @@
Tests for Tokenizer module
Copyright (C) 2022 by Miðeind ehf.
Copyright (C) 2016-2024 by Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -37,7 +37,7 @@

def test_detokenize() -> None:

options = { "normalize": True }
options = {"normalize": True}

def should_be_equal(s: str) -> None:
toklist = t.tokenize(s, **options)
@@ -58,19 +58,18 @@ def should_be(s1: str, s2: str) -> None:
should_be_equal("Páll veiddi 74 cm. lax í Norðurá þann 1.3.")

should_be(
"Páll var með \"netfangið\" [email protected].",
"Páll var með „netfangið“ [email protected]."
'Páll var með "netfangið" [email protected].',
"Páll var með „netfangið“ [email protected].",
)

# !!! BUG
#should_be(
# should_be(
# "Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").",
# "Páll var með „netfangið“, þ.e.a.s. („þetta“).",
#)
# )

options = { "normalize": False }
options = {"normalize": False}

should_be_equal("Páll var með „netfangið“, þ.e.a.s. („þetta“).")
should_be_equal("Páll var með \"netfangið\" [email protected].")
should_be_equal("Páll var með \"netfangið\", þ.e.a.s. (\"þetta\").")

should_be_equal('Páll var með "netfangið" [email protected].')
should_be_equal('Páll var með "netfangið", þ.e.a.s. ("þetta").')
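The test_detokenize.py hunks are formatting-only: Black-style dict literals and single-quoted strings instead of escaped double quotes. The behaviour under test is the tokenize/detokenize round trip with the normalize option. A rough sketch of that round trip is below, assuming the module-level tokenize() and detokenize() helpers accept the same normalize flag the test passes; the example sentence is illustrative, not taken from the test.

```python
import tokenizer as t

options = {"normalize": True}

def should_be(s1: str, s2: str) -> None:
    # Tokenize s1 and detokenize again; with normalize=True the ASCII
    # quotes are expected to come back as Icelandic „…“ quotes in s2.
    toklist = t.tokenize(s1, **options)
    assert t.detokenize(toklist, **options) == s2

should_be(
    'Páll sagði "halló" við mig.',
    "Páll sagði „halló“ við mig.",
)
```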
195 changes: 176 additions & 19 deletions test/test_index_calculation.py
@@ -6,7 +6,7 @@
Tests for Tokenizer module
Copyright (C) 2022 by Miðeind ehf.
Copyright (C) 2016-2024 by Miðeind ehf.
This software is licensed under the MIT License:
@@ -169,7 +169,6 @@ def test_small_difficult_cases() -> None:
assert char_indexes == [0, 2, 4]
assert byte_indexes == [0, 2, 4]


# Two byte characters
for x in ["þ", "æ", "á"]:
s = x
@@ -230,12 +229,11 @@ def test_small_difficult_cases() -> None:
assert char_indexes == [0, 2, 4]
assert byte_indexes == [0, 3, 6]


# Two character characters
# These strings contain two unicode code points that are rendered as one letter.
# They are counted as two characters in python.
# In addition the accent and umlaut characters are two bytes.
for x in ["a"+ACCENT, "o"+UMLAUT]:
for x in ["a" + ACCENT, "o" + UMLAUT]:
s = x
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
@@ -288,11 +286,11 @@ def test_small_difficult_cases() -> None:
# example chars:
# " a´ a´"
# 012345
# ^ ^
# ^ ^
# example bytes:
# " a´_ a´_"
# 01234567
# ^ ^
# ^ ^
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 3]
@@ -302,7 +300,6 @@
assert char_indexes == [0, 3, 6]
assert byte_indexes == [0, 4, 8]


# The em-dash is 3 bytes
for x in [EM_DASH]:
s = x
@@ -361,7 +358,7 @@ def test_small_difficult_cases() -> None:
# example bytes:
# " a__ a__"
# 01234567
# ^ ^
# ^ ^
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 2]
@@ -379,25 +376,181 @@ def test_larger_case() -> None:
# x x x xx x
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
assert char_indexes == [
0,
5,
13,
16,
18,
25,
30,
33,
36,
40,
45,
50,
53,
61,
66,
72,
]
assert byte_indexes == [
0,
6,
14,
17,
20,
27,
32,
35,
38,
43,
50,
55,
58,
66,
72,
78,
]
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
assert char_indexes == [
0,
5,
13,
16,
18,
25,
30,
33,
36,
40,
45,
50,
53,
61,
66,
72,
73,
]
assert byte_indexes == [
0,
6,
14,
17,
20,
27,
32,
35,
38,
43,
50,
55,
58,
66,
72,
78,
79,
]


def test_iterator_cases() -> None:
s = ["Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ", "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."]
s = [
"Þessi ",
"setning ",
"er ",
"í ",
"lengra ",
"lagi ",
"og ",
"er ",
"með ",
"bæði ",
"eins ",
"og ",
"tveggja ",
"bæta ",
"stafi.",
]
# (char and byte indexes in a similar test above)
toks = tokenizer.parse_tokens(s)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
assert char_indexes == [
0,
5,
13,
16,
18,
25,
30,
33,
36,
40,
45,
50,
53,
61,
66,
72,
]
assert byte_indexes == [
0,
6,
14,
17,
20,
27,
32,
35,
38,
43,
50,
55,
58,
66,
72,
78,
]
toks = tokenizer.parse_tokens(s)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]
assert char_indexes == [
0,
5,
13,
16,
18,
25,
30,
33,
36,
40,
45,
50,
53,
61,
66,
72,
73,
]
assert byte_indexes == [
0,
6,
14,
17,
20,
27,
32,
35,
38,
43,
50,
55,
58,
66,
72,
78,
79,
]

s = ["Stutt setning.", "", "Önnur setning."]
# 01234567890123 45678901234567
@@ -493,11 +646,15 @@ def test_lengthening_substitutions() -> None:
# ^ ^ ^ ^ ^
# x x
# ! lengthening happens here (3ji->þriðji)
toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
toks = tokenizer.parse_tokens(
s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
assert char_indexes == [0, 5, 8, 12, 21]
assert byte_indexes == [0, 6, 9, 13, 23]
toks = tokenizer.parse_tokens(s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY)
toks = tokenizer.parse_tokens(
s, handle_kludgy_ordinals=tokenizer.KLUDGY_ORDINALS_MODIFY
)
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
assert char_indexes == [0, 5, 8, 12, 21, 22]
assert byte_indexes == [0, 6, 9, 13, 23, 24]
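Most of this file's diff is likewise Black reformatting: the long expected-index lists and call arguments are split across lines without changing any values. The assertions themselves are about the gap between character offsets and UTF-8 byte offsets, which diverge on Icelandic letters (two bytes per character) and on combining accents (two code points rendered as one letter). A small sketch of that bookkeeping, built on the parse_tokens() and calculate_indexes() calls shown in the diff; the printed index values are indicative, not copied from the tests.

```python
import tokenizer

s = "þú átt"  # 6 characters, but 9 bytes in UTF-8 ("þ", "ú", "á" take 2 bytes each)
toks = tokenizer.parse_tokens([s])
char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)

# The character and byte offsets mark the same token boundaries, but the
# byte offsets grow faster because of the two-byte letters.
print(char_indexes)               # e.g. [0, 2]
print(byte_indexes)               # e.g. [0, 4]
print(len(s), len(s.encode("utf-8")))  # 6 9
```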
2 changes: 1 addition & 1 deletion test/test_tokenizer.py
@@ -5,7 +5,7 @@
Tests for Tokenizer module
Copyright (C) 2022 by Miðeind ehf.
Copyright (C) 2016-2024 by Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
2 changes: 1 addition & 1 deletion test/test_tokenizer_tok.py
@@ -3,7 +3,7 @@
Tests for Tokenizer module
Copyright (C) 2022 by Miðeind ehf.
Copyright (C) 2016-2024 by Miðeind ehf.
This software is licensed under the MIT License:
