Skip to content

Commit

Permalink
Fixes to the query-all user flow and its outputs, plus test changes
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Jan 25, 2025
1 parent 789b178 commit 06d77b2
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 43 deletions.
38 changes: 21 additions & 17 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Functions for getting language data-type packs for the Scribe-Data CLI.
"""

import os # for removing original JSON files
import os
from pathlib import Path
from typing import List, Union

Expand Down Expand Up @@ -94,22 +94,13 @@ def prompt_user_download_all():
Checks with the user whether they'd rather use Wikidata lexeme dumps before a download-all call.
"""
return questionary.confirm(
"Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)",
"Do you want to query Wikidata directly? (selecting 'no' will use a Wikidata lexemes dump locally to avoid large Query Service calls)",
default=False,
).ask()

if all_bool:
if language:
if prompt_user_download_all():
parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:
language_or_sub_language = language.split(" ")[0]
print(f"Updating all data types for language: {language.title()}")
query_data(
Expand All @@ -122,17 +113,18 @@ def prompt_user_download_all():
f"Query completed for all data types for language {language.title()}."
)

elif data_type:
if prompt_user_download_all():
else:
parse_wd_lexeme_dump(
language="all",
language=language,
wikidata_dump_type=["form"],
data_types=[data_type],
data_types="all",
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)
else:

elif data_type:
if prompt_user_download_all():
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
Expand All @@ -144,6 +136,16 @@ def prompt_user_download_all():
f"Query completed for all languages for data type {data_type.capitalize()}."
)

else:
parse_wd_lexeme_dump(
language="all",
wikidata_dump_type=["form"],
data_types=[data_type],
type_output_dir=output_dir,
wikidata_dump_path=wikidata_dump,
overwrite_all=overwrite,
)

else:
print("Updating all languages and data types...")
rprint(
Expand All @@ -169,6 +171,7 @@ def prompt_user_download_all():
# If no language specified, use "all".
if language is None:
language = "all"

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["translations"],
Expand All @@ -182,8 +185,9 @@ def prompt_user_download_all():

elif wikidata_dump is not None:
# If wikidata_dump is an empty string, use the default path.
if wikidata_dump == "":
if not wikidata_dump:
wikidata_dump = DEFAULT_DUMP_EXPORT_DIR

parse_wd_lexeme_dump(
language=language,
wikidata_dump_type=["form"],
Expand Down
3 changes: 0 additions & 3 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,6 @@ def main() -> None:
parse_wiktionary_translations(args.translation, args.output_dir)

else:
print(
f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..."
)
get_data(
language=args.language.lower()
if args.language is not None
Expand Down
36 changes: 19 additions & 17 deletions src/scribe_data/wikidata/parse_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N

self._save_by_language(filtered, filepath, language_iso, "translations")

# MARK: export forms
# MARK: Export Forms

def export_forms_json(
self, filepath: str, language_iso: str = None, data_type: str = None
) -> None:
Expand All @@ -393,7 +394,7 @@ def export_forms_json(
Notes
-----
Creates a directory structure: <filepath>/<language_name>/lexeme_<data_type>.json
Creates a directory structure: <filepath>/<language_name>/<data_type>.json
Skips export if no forms are found for the specified language and data type.
"""
if language_iso:
Expand Down Expand Up @@ -421,7 +422,9 @@ def export_forms_json(

# Check if filtered data is empty before saving.
if not filtered:
print(f"No forms found for {lang_name} {data_type}, skipping export...")
print(
f"No forms found for {lang_name.capitalize()} {data_type}, skipping export..."
)
return

# Create the output directory structure.
Expand All @@ -445,17 +448,19 @@ def export_forms_json(
output_path.mkdir(parents=True, exist_ok=True)

# Create the full output filepath.
output_file = output_path / f"lexeme_{data_type}.json"
output_file = output_path / f"{data_type}.json"

# Save the filtered data to JSON file.
try:
with open(output_file, "wb") as f:
f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2))
print(
f"Successfully exported forms for {lang_name} {data_type} to {output_file}"
f"Successfully exported forms for {lang_name.capitalize()} {data_type} to {output_file}"
)
except Exception as e:
print(f"Error saving forms for {lang_name} {data_type}: {e}")
print(
f"Error saving forms for {lang_name.capitalize()} {data_type}: {e}"
)

def _save_by_language(self, filtered, filepath, language_iso, data_type):
"""
Expand Down Expand Up @@ -558,7 +563,7 @@ def parse_dump(
if "translations" in parse_type:
languages_to_process = []
for lang in languages:
index_path = Path(output_dir) / lang / "lexeme_translations.json"
index_path = Path(output_dir) / lang / "translations.json"

if not check_index_exists(index_path, overwrite_all):
languages_to_process.append(lang)
Expand Down Expand Up @@ -591,19 +596,16 @@ def parse_dump(
# Create appropriate path based on whether it's a sub-language.
if main_lang:
index_path = (
Path(output_dir)
/ main_lang
/ lang
/ f"lexeme_{data_type}.json"
Path(output_dir) / main_lang / lang / f"{data_type}.json"
)

else:
index_path = (
Path(output_dir) / lang / f"lexeme_{data_type}.json"
)
index_path = Path(output_dir) / lang / f"{data_type}.json"

if not check_index_exists(index_path, overwrite_all):
needs_processing = True
data_types_to_process.add(data_type)

else:
# Update path display in skip message.
skip_path = (
Expand Down Expand Up @@ -644,7 +646,7 @@ def parse_dump(
),
None,
):
index_path = Path(output_dir) / language / "lexeme_translations.json"
index_path = Path(output_dir) / language / "translations.json"
# Ensure parent directory exists.
index_path.parent.mkdir(parents=True, exist_ok=True)
# print(f"Exporting translations for {language} to {index_path}").
Expand All @@ -654,9 +656,9 @@ def parse_dump(

# (b) If "form" in parse_type -> export forms for each data_type in data_types.
if "form" in parse_type:
# For each data_type, we create a separate file, e.g. lexeme_nouns.json.
# For each data_type, we create a separate file, e.g. nouns.json.
for dt in data_types:
index_path = Path(output_dir) / f"lexeme_{dt}.json"
index_path = Path(output_dir) / f"{dt}.json"
iso_codes = set()
for word_data in processor.forms_index.values():
iso_codes.update(word_data.keys())
Expand Down
2 changes: 1 addition & 1 deletion src/scribe_data/wikidata/wikidata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def parse_wd_lexeme_dump(
if dt != "translations" and dt != "emoji-keywords"
]

print(f"Languages to process: {language}")
print(f"Languages to process: {[lang.capitalize() for lang in language]}")

if "translations" not in wikidata_dump_type:
print(f"Data types to process: {data_types}")
Expand Down
10 changes: 5 additions & 5 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,16 @@ def test_invalid_arguments(self):
@patch("scribe_data.cli.get.query_data")
@patch("scribe_data.cli.get.parse_wd_lexeme_dump")
@patch("scribe_data.cli.get.questionary.confirm")
def test_get_all_data_types_for_language_user_says_yes(
def test_get_all_data_types_for_language_user_says_no(
self, mock_questionary_confirm, mock_parse, mock_query_data
):
"""
Test the behavior when the user agrees to query Wikidata directly.
Test the behavior when the user agrees to use Wikidata lexeme dumps.
This test checks that `parse_wd_lexeme_dump` is called with the correct parameters
when the user confirms they want to query Wikidata.
when the user confirms they don't want to query Wikidata.
"""
mock_questionary_confirm.return_value.ask.return_value = True
mock_questionary_confirm.return_value.ask.return_value = False

get_data(all_bool=True, language="English")

Expand Down Expand Up @@ -321,7 +321,7 @@ def test_get_data_with_wikidata_identifier(
when a Wikidata identifier is used.
"""
# Mock the user confirmation to return True (query Wikidata directly).
mock_questionary_confirm.return_value.ask.return_value = True
mock_questionary_confirm.return_value.ask.return_value = False

get_data(
language="Q9217",
Expand Down

0 comments on commit 06d77b2

Please sign in to comment.