Skip to content

Commit

Permalink
Fixes to the query-all user flow and its outputs, plus test changes
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Jan 25, 2025
1 parent 789b178 commit 06d77b2
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 43 deletions.
38 changes: 21 additions & 17 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Functions for getting language data-type packs for the Scribe-Data CLI.
"""

import os # for removing original JSON files
import os
from pathlib import Path
from typing import List, Union

Expand Down Expand Up @@ -94,22 +94,13 @@ def prompt_user_download_all():
Checks with the user whether they'd rather use Wikidata lexeme dumps before a download-all call.
"""
return questionary.confirm(
"Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)",
"Do you want to query Wikidata directly? (selecting 'no' will use a Wikidata lexemes dump locally to avoid large Query Service calls)",
default=False,
).ask()

if all_bool:
if language:
if prompt_user_download_all():
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language: {language.title()}")
query_data(
Expand All @@ -122,17 +113,18 @@ def prompt_user_download_all():
f"Query completed for all data types for language {language.title()}."
)

elif data_type:
if prompt_user_download_all():
else:
parse_wd_lexeme_dump(
language="all",
language=language,
wikidata_dump_type=["form"],
data_types=[data_type],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:

elif data_type:
if prompt_user_download_all():
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
Expand All @@ -144,6 +136,16 @@ def prompt_user_download_all():
f"Query completed for all languages for data type {data_type.capitalize()}."
)

else:
parse_wd_lexeme_dump(
language="all",
wikidata_dump_type=["form"],
data_types=[data_type],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)

else:
print("Updating all languages and data types...")
rprint(
Expand All @@ -169,6 +171,7 @@ def prompt_user_download_all():
# If no language specified, use "all".
if language is None:
language = "all"

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["translations"],
Expand All @@ -182,8 +185,9 @@ def prompt_user_download_all():

elif wikidata_dump is not None:
# If wikidata_dump is an empty string, use the default path.
if wikidata_dump == "":
if not wikidata_dump:
wikidata_dump = DEFAULT_DUMP_EXPORT_DIR

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
Expand Down
3 changes: 0 additions & 3 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,6 @@ def main() -> None:
parse_wiktionary_translations(args.translation, args.output_dir)

else:
print(
f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..."
)
get_data(
language=args.language.lower()
if args.language is not None
Expand Down
36 changes: 19 additions & 17 deletions src/scribe_data/wikidata/parse_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N

self._save_by_language(filtered, filepath, language_iso, "translations")

# MARK: export forms
# MARK: Export Forms

def export_forms_json(
self, filepath: str, language_iso: str = None, data_type: str = None
) -> None:
Expand All @@ -393,7 +394,7 @@ def export_forms_json(
Notes
-----
Creates a directory structure: <filepath>/<language_name>/lexeme_<data_type>.json
Creates a directory structure: <filepath>/<language_name>/<data_type>.json
Skips export if no forms are found for the specified language and data type.
"""
if language_iso:
Expand Down Expand Up @@ -421,7 +422,9 @@ def export_forms_json(

# Check if filtered data is empty before saving.
if not filtered:
print(f"No forms found for {lang_name} {data_type}, skipping export...")
print(
f"No forms found for {lang_name.capitalize()} {data_type}, skipping export..."
)
return

# Create the output directory structure.
Expand All @@ -445,17 +448,19 @@ def export_forms_json(
output_path.mkdir(parents=True, exist_ok=True)

# Create the full output filepath.
output_file = output_path / f"lexeme_{data_type}.json"
output_file = output_path / f"{data_type}.json"

# Save the filtered data to JSON file.
try:
with open(output_file, "wb") as f:
f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2))
print(
f"Successfully exported forms for {lang_name} {data_type} to {output_file}"
f"Successfully exported forms for {lang_name.capitalize()} {data_type} to {output_file}"
)
except Exception as e:
print(f"Error saving forms for {lang_name} {data_type}: {e}")
print(
f"Error saving forms for {lang_name.capitalize()} {data_type}: {e}"
)

def _save_by_language(self, filtered, filepath, language_iso, data_type):
"""
Expand Down Expand Up @@ -558,7 +563,7 @@ def parse_dump(
if "translations" in parse_type:
languages_to_process = []
for lang in languages:
index_path = Path(output_dir) / lang / "lexeme_translations.json"
index_path = Path(output_dir) / lang / "translations.json"

if not check_index_exists(index_path, overwrite_all):
languages_to_process.append(lang)
Expand Down Expand Up @@ -591,19 +596,16 @@ def parse_dump(
# Create appropriate path based on whether it's a sub-language.
if main_lang:
index_path = (
Path(output_dir)
/ main_lang
/ lang
/ f"lexeme_{data_type}.json"
Path(output_dir) / main_lang / lang / f"{data_type}.json"
)

else:
index_path = (
Path(output_dir) / lang / f"lexeme_{data_type}.json"
)
index_path = Path(output_dir) / lang / f"{data_type}.json"

if not check_index_exists(index_path, overwrite_all):
needs_processing = True
data_types_to_process.add(data_type)

else:
# Update path display in skip message.
skip_path = (
Expand Down Expand Up @@ -644,7 +646,7 @@ def parse_dump(
),
None,
):
index_path = Path(output_dir) / language / "lexeme_translations.json"
index_path = Path(output_dir) / language / "translations.json"
# Ensure parent directory exists.
index_path.parent.mkdir(parents=True, exist_ok=True)
# print(f"Exporting translations for {language} to {index_path}").
Expand All @@ -654,9 +656,9 @@ def parse_dump(

# (b) If "form" in parse_type -> export forms for each data_type in data_types.
if "form" in parse_type:
# For each data_type, we create a separate file, e.g. lexeme_nouns.json.
# For each data_type, we create a separate file, e.g. nouns.json.
for dt in data_types:
index_path = Path(output_dir) / f"lexeme_{dt}.json"
index_path = Path(output_dir) / f"{dt}.json"
iso_codes = set()
for word_data in processor.forms_index.values():
iso_codes.update(word_data.keys())
Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def parse_wd_lexeme_dump(
if dt != "translations" and dt != "emoji-keywords"
]

print(f"Languages to process: {language}")
print(f"Languages to process: {[lang.capitalize() for lang in language]}")

if "translations" not in wikidata_dump_type:
print(f"Data types to process: {data_types}")
Expand Down
10 changes: 5 additions & 5 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,16 @@ def test_invalid_arguments(self):
@patch("scribe_data.cli.get.query_data")
@patch("scribe_data.cli.get.parse_wd_lexeme_dump")
@patch("scribe_data.cli.get.questionary.confirm")
def test_get_all_data_types_for_language_user_says_yes(
def test_get_all_data_types_for_language_user_says_no(
self, mock_questionary_confirm, mock_parse, mock_query_data
):
"""
Test the behavior when the user agrees to query Wikidata directly.
Test the behavior when the user agrees to use Wikidata lexeme dumps.
This test checks that `parse_wd_lexeme_dump` is called with the correct parameters
when the user confirms they want to query Wikidata.
when the user confirms they don't want to query Wikidata.
"""
mock_questionary_confirm.return_value.ask.return_value = True
mock_questionary_confirm.return_value.ask.return_value = False

get_data(all_bool=True, language="English")

Expand Down Expand Up @@ -321,7 +321,7 @@ def test_get_data_with_wikidata_identifier(
when a Wikidata identifier is used.
"""
# Mock the user confirmation to return True (query Wikidata directly).
mock_questionary_confirm.return_value.ask.return_value = True
mock_questionary_confirm.return_value.ask.return_value = False

get_data(
language="Q9217",
Expand Down

0 comments on commit 06d77b2

Please sign in to comment.