Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs in translation & add unique forms check #551

Merged
merged 16 commits into from
Jan 25, 2025
Merged
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ scribe_data_csv_export/*
scribe_data_json_export/*
scribe_data_sqlite_export/*
scribe_data_tsv_export/*
scribe_data_mediawiki_export/*
scribe_data_wikidata_dumps_export/*

# MARK: Wiki Dumps

Expand Down
13 changes: 12 additions & 1 deletion src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_SQLITE_EXPORT_DIR,
DEFAULT_TSV_EXPORT_DIR,
DEFAULT_DUMP_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump
Expand Down Expand Up @@ -122,6 +123,8 @@ def prompt_user_download_all():
wikidata_dump_type=["form"],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:
language_or_sub_language = language.split(" ")[0]
Expand All @@ -143,6 +146,8 @@ def prompt_user_download_all():
wikidata_dump_type=["form"],
data_types=[data_type],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:
print(f"Updating all languages for data type: {data_type.capitalize()}")
Expand All @@ -167,6 +172,7 @@ def prompt_user_download_all():
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)

# MARK: Emojis
Expand All @@ -177,25 +183,30 @@ def prompt_user_download_all():
# MARK: Translations

elif data_type == "translations":
# If no language specified, use "all".
if language is None:
language = "all"
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["translations"],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
return

# MARK: Form Dump

elif wikidata_dump:
elif wikidata_dump is not None:
if not wikidata_dump:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering for myself:
is this simply a check for whether wikidata_dump is an empty string ""? Did I get that right?

wikidata_dump = DEFAULT_DUMP_EXPORT_DIR
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types=data_types,
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
return

Expand Down
10 changes: 7 additions & 3 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,9 @@ def main() -> None:
get_parser.add_argument(
"-wdp",
"--wikidata-dump-path",
type=str,
help="Path to a local Wikidata lexemes dump for running with '--all'.",
nargs="?",
const="",
help="Path to a local Wikidata lexemes dump. Uses default directory if no path provided.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This made me wonder:
Should we specify in the help message what the default path/directory is, both for this option and for the other default directories?

)
get_parser.add_argument(
"-t", "--translation", type=str, help="parse a single word using MediaWiki API"
Expand Down Expand Up @@ -364,8 +365,11 @@ def main() -> None:
if args.interactive:
start_interactive_mode(operation="get")
if args.translation:
parse_wiktionary_translations(args.translation)
parse_wiktionary_translations(args.translation, args.output_dir)
else:
print(
f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}"
)
get_data(
language=args.language.lower()
if args.language is not None
Expand Down
7 changes: 6 additions & 1 deletion src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,11 +367,15 @@ def total_wrapper(
"""
# Handle --all flag
if all_bool and wikidata_dump:
language = "all"
if data_type is None:
data_type = "all"
if language is None:
language = "all"
Comment on lines 351 to +355
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit - a quick suggestion:

The in-source docs here and elsewhere in the repo mention the following:

    all_bool : boolean
        Whether all languages and data types should be listed.

This seems a tad misleading, since setting a specific language and/or data_type limits "all" to only the language and/or data_type specified. Perhaps something like the slight adjustment below might make sense?

    all_bool : boolean
        Whether all languages and data types should be listed, unless otherwise specified.


if wikidata_dump is True: # flag without a wikidata lexeme dump path
parse_wd_lexeme_dump(
language=language,
data_types=[data_type],
wikidata_dump_type=["total"],
wikidata_dump_path=None,
)
Expand All @@ -380,6 +384,7 @@ def total_wrapper(
if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path
parse_wd_lexeme_dump(
language=language,
data_types=[data_type],
wikidata_dump_type=["total"],
wikidata_dump_path=wikidata_dump,
)
Expand Down
13 changes: 13 additions & 0 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
DEFAULT_TSV_EXPORT_DIR = "scribe_data_tsv_export"
DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export"
DEFAULT_DUMP_EXPORT_DIR = "scribe_data_wikidata_dumps_export"
DEFAULT_MEDIAWIKI_EXPORT_DIR = "scribe_data_mediawiki_export"

LANGUAGE_DATA_EXTRACTION_DIR = (
Path(__file__).parent / "wikidata" / "language_data_extraction"
Expand Down Expand Up @@ -713,6 +714,18 @@ def check_lexeme_dump_prompt_download(output_dir: str):
rprint("[bold red]No valid dumps found.[/bold red]")
return None

elif user_input == "Download new version":
# Rename existing latest dump if it exists
latest_dump = Path(output_dir) / "latest-lexemes.json.bz2"
if latest_dump.exists():
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"old_latest-lexemes_{timestamp}.json.bz2"
latest_dump.rename(Path(output_dir) / backup_name)
rprint(
f"[bold green]Renamed existing dump to {backup_name}[/bold green]"
)
return False

else:
rprint("[bold blue]Skipping download.[/bold blue]")
return True
Expand Down
13 changes: 10 additions & 3 deletions src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def parse_wd_lexeme_dump(
data_types: List[str] = None,
type_output_dir: str = None,
wikidata_dump_path: str = None,
overwrite_all: bool = False,
):
"""
Checks for the existence of a Wikidata lexeme dump and parses it if possible.
Expand All @@ -84,18 +85,25 @@ def parse_wd_lexeme_dump(

wikidata_dump_path : str, optional
The local Wikidata lexeme dump directory that should be used to get data.

overwrite_all : bool, default=False
If True, automatically overwrite existing files without prompting
"""
# Convert "all" to list of all languages
if isinstance(language, str) and language.lower() == "all":
language = list(language_metadata.keys())

# For processing: exclude translations and emoji-keywords
if isinstance(data_types, str) and data_types.lower() == "all":
# Exclude translations as it's a separate section
data_types = [
dt
for dt in data_type_metadata.keys()
if dt != "translations" and dt != "emoji-keywords"
]

print(f"Languages to process: {language}")
print(f"Data types to process: {data_types}")

file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path)

if isinstance(file_path, (str, Path)):
Expand All @@ -111,7 +119,6 @@ def parse_wd_lexeme_dump(
data_types=data_types,
file_path=file_path,
output_dir=type_output_dir,
overwrite_all=overwrite_all,
)
return

rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]")
Loading
Loading