From 0fe68642de07d3657f0924d1d871eb19ba1e3861 Mon Sep 17 00:00:00 2001 From: axif Date: Sun, 12 Jan 2025 03:03:28 +0600 Subject: [PATCH 01/13] fix small bugs --- .gitignore | 2 + src/scribe_data/cli/get.py | 5 +- src/scribe_data/cli/main.py | 10 ++-- src/scribe_data/utils.py | 13 +++++ src/scribe_data/wikidata/wikidata_utils.py | 2 - src/scribe_data/wiktionary/parse_dump.py | 21 +------- src/scribe_data/wiktionary/parse_mediaWiki.py | 53 ++++++++++++++++--- 7 files changed, 72 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 610b9da8..4bcc3809 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ scribe_data_csv_export/* scribe_data_json_export/* scribe_data_sqlite_export/* scribe_data_tsv_export/* +scribe_data_mediawiki_export/* +scribe_data_wikidata_dumps_export/* # MARK: Wiki Dumps diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index c3e98e6d..9be44075 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -34,6 +34,7 @@ DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump @@ -189,7 +190,9 @@ def prompt_user_download_all(): # MARK: Form Dump - elif wikidata_dump: + elif wikidata_dump is not None: + if not wikidata_dump: + wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( language=language, wikidata_dump_type=["form"], diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index e22f4aea..d51712d2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -166,8 +166,9 @@ def main() -> None: get_parser.add_argument( "-wdp", "--wikidata-dump-path", - type=str, - help="Path to a local Wikidata lexemes dump for running with '--all'.", + nargs="?", + const="", + help="Path to a local Wikidata lexemes dump. 
Uses default directory if no path provided.", ) get_parser.add_argument( "-t", "--translation", type=str, help="parse a single word using MediaWiki API" @@ -364,8 +365,11 @@ def main() -> None: if args.interactive: start_interactive_mode(operation="get") if args.translation: - parse_wiktionary_translations(args.translation) + parse_wiktionary_translations(args.translation, args.output_dir) else: + print( + f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}" + ) get_data( language=args.language.lower() if args.language is not None diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 153fc293..fe4b89db 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -43,6 +43,7 @@ DEFAULT_TSV_EXPORT_DIR = "scribe_data_tsv_export" DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export" DEFAULT_DUMP_EXPORT_DIR = "scribe_data_wikidata_dumps_export" +DEFAULT_MEDIAWIKI_EXPORT_DIR = "scribe_data_mediawiki_export" LANGUAGE_DATA_EXTRACTION_DIR = ( Path(__file__).parent / "wikidata" / "language_data_extraction" @@ -713,6 +714,18 @@ def check_lexeme_dump_prompt_download(output_dir: str): rprint("[bold red]No valid dumps found.[/bold red]") return None + elif user_input == "Download new version": + # Rename existing latest dump if it exists + latest_dump = Path(output_dir) / "latest-lexemes.json.bz2" + if latest_dump.exists(): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_name = f"old_latest-lexemes_{timestamp}.json.bz2" + latest_dump.rename(Path(output_dir) / backup_name) + rprint( + f"[bold green]Renamed existing dump to {backup_name}[/bold green]" + ) + return False + else: rprint("[bold blue]Skipping download.[/bold blue]") return True diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 29182070..036f58c9 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -113,5 +113,3 @@ def parse_wd_lexeme_dump( output_dir=type_output_dir, ) return - - rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index cea8de12..cffab046 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -27,7 +27,6 @@ from typing import List, Union import orjson -import questionary from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, check_index_exists, @@ -103,10 +102,7 @@ def _build_iso_mapping(self) -> dict: iso_mapping[iso_code] = lang_name for language in self.target_iso: - if ( - language.lower().startswith("q") - and language[1:].isdigit() - ): + if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: iso_code = get_language_iso_code(language.upper()) @@ -415,20 +411,7 @@ def parse_dump( parse_type = parse_type or [] data_types = data_types or [] - print(f"Languages: {languages}") - print(f"parse_type: {parse_type}") - if data_types: - print(f"data_types for forms: {data_types}") - if "total" not in parse_type: - choice = questionary.select( - "Choose an action:", - choices=["Overwrite existing data", "Skip process"], - default="Skip process", - ).ask() - if choice == "Overwrite existing data": - overwrite_all = True - # For translations, we only need to check the translations index. 
if "translations" in parse_type: languages_to_process = [] @@ -500,8 +483,6 @@ def parse_dump( # For each data_type, we create a separate file, e.g. lexeme_nouns.json. for dt in data_types: index_path = Path(output_dir) / f"lexeme_{dt}.json" - print(f"Exporting forms for {dt} to {index_path}...") - iso_codes = set() for word_data in processor.forms_index.values(): iso_codes.update(word_data.keys()) diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 6968c8ad..668c44a4 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -22,8 +22,8 @@ import json import re - -from scribe_data.utils import get_language_from_iso +from pathlib import Path +from scribe_data.utils import get_language_from_iso, DEFAULT_MEDIAWIKI_EXPORT_DIR from scribe_data.wikidata.wikidata_utils import mediaWiki_query @@ -121,16 +121,53 @@ def build_json_format(word, translations_by_lang): return book_translations -def parse_wiktionary_translations(word): +def parse_wiktionary_translations(word, output_dir=DEFAULT_MEDIAWIKI_EXPORT_DIR): """ - Parse the translations of a word from Wiktionary. + Parse translations from Wiktionary and save them to a JSON file. + + Fetches the Wiktionary page for the given word, extracts translations + across different languages, and saves them in a structured JSON format. + + Parameters + ---------- + word : str + The word to fetch translations for. + output_dir : str or Path, optional + Directory to save JSON output (default is DEFAULT_MEDIAWIKI_EXPORT_DIR). + Will be created if it doesn't exist. + + Notes + ----- + The output JSON structure follows the format: + { + "word": { + "language": { + "part_of_speech": { + "1": { + "description": "context", + "translations": "translated_text" + } + } + } + } + } """ - wikitext = fetch_translation_page(word) - translations_by_lang = parse_wikitext_for_translations(wikitext) + output_dir = output_dir or DEFAULT_MEDIAWIKI_EXPORT_DIR + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + translations_by_lang = parse_wikitext_for_translations(fetch_translation_page(word)) if not translations_by_lang: print("No translations found") return - final_json = build_json_format(word, translations_by_lang) - print(json.dumps(final_json, indent=4, ensure_ascii=False)) + json_path = output_path / f"{word}.json" + with open(json_path, "w", encoding="utf-8") as file: + json.dump( + build_json_format(word, translations_by_lang), + file, + indent=4, + ensure_ascii=False, + ) + + print(f"JSON file saved to {json_path}") From 15735d3a02a1981a17de820551ac1501fcf22fe8 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 02:16:57 +0600 Subject: [PATCH 02/13] fix small bugs --- src/scribe_data/cli/get.py | 5 + src/scribe_data/wikidata/wikidata_utils.py | 5 + src/scribe_data/wiktionary/parse_dump.py | 268 ++++++++++++++------- tests/cli/test_get.py | 9 +- 4 files changed, 192 insertions(+), 95 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 9be44075..67e603d7 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -123,6 +123,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types="all", type_output_dir=output_dir, + overwrite_all=overwrite, ) else: language_or_sub_language = language.split(" ")[0] @@ -144,6 +145,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types=[data_type], type_output_dir=output_dir, + 
overwrite_all=overwrite, ) else: print(f"Updating all languages for data type: {data_type.capitalize()}") @@ -168,6 +170,7 @@ def prompt_user_download_all(): data_types="all", type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) # MARK: Emojis @@ -185,6 +188,7 @@ def prompt_user_download_all(): wikidata_dump_type=["translations"], type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) return @@ -199,6 +203,7 @@ def prompt_user_download_all(): data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) return diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 036f58c9..afa9e6f9 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -64,6 +64,7 @@ def parse_wd_lexeme_dump( data_types: List[str] = None, type_output_dir: str = None, wikidata_dump_path: str = None, + overwrite_all: bool = False, ): """ Checks for the existence of a Wikidata lexeme dump and parses it if possible. @@ -84,6 +85,9 @@ def parse_wd_lexeme_dump( wikidata_dump_path : str, optional The local Wikidata lexeme dump directory that should be used to get data. + + overwrite_all : bool, default=False + If True, automatically overwrite existing files without prompting """ # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": @@ -111,5 +115,6 @@ def parse_wd_lexeme_dump( data_types=data_types, file_path=file_path, output_dir=type_output_dir, + overwrite_all=overwrite_all, ) return diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index cffab046..97ce3eca 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -34,6 +34,7 @@ language_metadata, get_language_iso_code, check_qid_is_language, + lexeme_form_metadata, ) from tqdm import tqdm @@ -74,19 +75,22 @@ def __init__( self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) # Stats. - self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} + self.stats = {"processed_entries": 0, "processing_time": 0} - # For category lookups, invert data_type_metadata. - # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...}. - self._category_lookup = {v: k for k, v in data_type_metadata.items()} - - # Build map from ISO to full language name. - self.iso_to_name = self._build_iso_mapping() # For "total" usage. self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) + # Cache for feature labels. + self._feature_label_cache = {} + for category, items in lexeme_form_metadata.items(): + for item_data in items.values(): + self._feature_label_cache[item_data["qid"]] = ( + category, + item_data["label"], + ) + # MARK: build iso mapping def _build_iso_mapping(self) -> dict: """ @@ -114,29 +118,29 @@ def _build_iso_mapping(self) -> dict: # MARK: process lines def process_lines(self, line: str) -> None: """ - Process one line of data. Depending on parse_type, we do: - - total stats - - translations - - form categories (filtered by data_types) + Process one line of data with optimized parsing. """ try: + # Use faster exception handling. lexeme = orjson.loads(line.strip().rstrip(",")) if not lexeme: return - # Get common values once. 
- lemmas = lexeme.get("lemmas", {}) - lexical_category = lexeme.get("lexicalCategory") + # Combine field checks into single lookup. + required_fields = ("lemmas", "lexicalCategory") + if not all(field in lexeme for field in required_fields): + return - if not (lemmas and lexical_category in self.valid_categories): + lexical_category = lexeme["lexicalCategory"] + if lexical_category not in self.valid_categories: return category_name = self._category_lookup.get(lexical_category) if not category_name: return - # Process each type in a single pass through the data. - for lang_code, lemma_data in lemmas.items(): + # Process first valid lemma only. + for lang_code, lemma_data in lexeme["lemmas"].items(): if lang_code not in self.valid_iso_codes: continue @@ -144,50 +148,92 @@ def process_lines(self, line: str) -> None: if not word: continue - if "total" in self.parse_type: - self.lexical_category_counts[lang_code][category_name] += 1 - translation_count = sum( - len(sense.get("glosses", {})) - for sense in lexeme.get("senses", []) - ) - self.translation_counts[lang_code][category_name] += ( - translation_count - ) + parse_types = self.parse_type + if "translations" in parse_types and lexeme.get("senses"): + self._process_translations(lexeme, word, lang_code, category_name) + + if "form" in parse_types and category_name in self.data_types: + self._process_forms(lexeme, lang_code, category_name) - if "translations" in self.parse_type: - if translations := { - lang: gloss["value"] - for sense in lexeme.get("senses", []) - for lang, gloss in sense.get("glosses", {}).items() - if lang in self.valid_iso_codes - }: - self.translations_index[word][lang_code][category_name] = ( - translations - ) - - if "form" in self.parse_type and category_name in self.data_types: - forms_data = defaultdict(list) - for form in lexeme.get("forms", []): - for rep_lang, rep_data in form.get( - "representations", {} - ).items(): - if rep_lang == lang_code: - if form_value := rep_data.get("value"): - forms_data[form_value].extend( - form.get("grammaticalFeatures", []) - ) - - if forms_data: - self.forms_index[word][lang_code][category_name] = dict( - forms_data - ) - self.forms_counts[lang_code][category_name] += len(forms_data) - - break # only process first valid lemma + if "total" in parse_types: + self._process_totals(lexeme, lang_code, category_name) + + break except Exception as e: print(f"Error processing line: {e}") + def _process_translations(self, lexeme, word, lang_code, category_name): + """ + Optimized translations processing + """ + translations = {} + valid_iso_codes = self.valid_iso_codes + + # Pre-fetch senses to avoid repeated lookups. + for sense in lexeme["senses"]: + if glosses := sense.get("glosses"): + translations.update( + (lang, gloss["value"]) + for lang, gloss in glosses.items() + if lang in valid_iso_codes + ) + + if translations: + self.translations_index[word][lang_code][category_name] = translations + + def _process_forms(self, lexeme, lang_code, category_name): + """ + Optimized forms processing + """ + lexeme_id = lexeme["id"] + forms_data = {} + + # Pre-compute form data structure. 
+ forms_dict = forms_data.setdefault(lexeme_id, {}) + lang_dict = forms_dict.setdefault(lang_code, {}) + cat_dict = lang_dict.setdefault(category_name, {}) + + for form in lexeme.get("forms", []): + if not (representations := form.get("representations")): + continue + + for rep_data in representations.values(): + if form_value := rep_data.get("value"): + if features := form.get("grammaticalFeatures"): + if form_name := self._get_form_name(features): + cat_dict[form_name] = form_value + break # Only process first representation + + if forms_data: + self.forms_index.update(forms_data) + self.forms_counts[lang_code][category_name] += len(forms_data) + + def _get_form_name(self, features): + """ + Optimized form name generation + """ + if not features: + return "" + + categorized_features = defaultdict(list) + for feature in features: + if feature_info := self._feature_label_cache.get(feature): + category, label = feature_info + categorized_features[category].append((label, feature)) + + form_parts = [] + is_first = True + for category in sorted(categorized_features.keys()): + for label, _ in sorted(categorized_features[category]): + if is_first: + form_parts.append(label.lower()) + is_first = False + else: + form_parts.append(label) + + return "".join(form_parts) + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): """ @@ -293,11 +339,21 @@ def export_forms_json( self, filepath: str, language_iso: str = None, data_type: str = None ) -> None: """ - Save forms_index to file, optionally filtering by: - - language_iso - - data_type (e.g. "nouns", "adverbs") - - If data_type is given, we only export that one category from forms. + Export grammatical forms to a JSON file with readable feature labels. + + Parameters + ---------- + filepath : str + Base path where the JSON file will be saved. + language_iso : str, optional + ISO code of the language to export. If None, exports all languages. + data_type : str, optional + Category of forms to export (e.g., "nouns", "verbs"). If None, exports all types. + + Notes + ----- + Creates a directory structure: //lexeme_.json + Skips export if no forms are found for the specified language and data type. """ if language_iso: if language_iso not in self.iso_to_name: @@ -305,57 +361,83 @@ def export_forms_json( return filtered = {} - for word, lang_data in self.forms_index.items(): - if language_iso in lang_data: - # If data_type is given, only keep that category. - if data_type: - if data_type in lang_data[language_iso]: - filtered[word] = { - language_iso: { - data_type: lang_data[language_iso][data_type] - } - } - - else: - filtered[word] = {language_iso: lang_data[language_iso]} - - # Check if filtered data is empty before saving. 
+ for id, lang_data in self.forms_index.items(): + if ( + language_iso in lang_data and data_type + ): # Only process if we have a data_type + if ( + data_type in lang_data[language_iso] + ): # Check if this data_type exists + # Initialize the nested dictionary for this ID if it doesn't exist + if id not in filtered: + filtered[id] = {} + + form_data = lang_data[language_iso][data_type] + for form_name, word in form_data.items(): + filtered[id][form_name] = word + + lang_name = self.iso_to_name[language_iso] + + # Check if filtered data is empty before saving if not filtered: - print(f"No forms found for {language_iso}, skipping export...") + print(f"No forms found for {lang_name} {data_type}, skipping export...") return - self._save_by_language( - filtered, filepath, language_iso, data_type or "forms" - ) + # Create the output directory structure + output_path = Path(filepath).parent / lang_name + output_path.mkdir(parents=True, exist_ok=True) + + # Create the full output filepath + output_file = output_path / f"lexeme_{data_type}.json" + + # Save the filtered data to JSON file + try: + with open(output_file, "wb") as f: + f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) + print( + f"Successfully exported forms for {lang_name} {data_type} to {output_file}" + ) + except Exception as e: + print(f"Error saving forms for {lang_name} {data_type}: {e}") - def _save_by_language(self, data, filepath, language_iso, category_type): + def _save_by_language(self, filtered, filepath, language_iso, data_type): """ - Save data to exports//filename. + Save filtered data to language-specific directory. + + Parameters + ---------- + filtered : dict + Dictionary with form features as keys and words as values. + filepath : Path + Base path for saving the file. + language_iso : str + ISO code of the language. + data_type : str + Type of data being saved (e.g., "nouns", "verbs"). + + Notes + ----- + Creates directory structure: exports//filename + and saves the filtered data as a JSON file. """ base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] + # Create language-specific directory lang_filepath = base_path.parent / lang_name / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) - print(f"Saving {lang_name} {category_type} index to {lang_filepath}...") + print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") + + # Save the filtered data with pretty printing with open(lang_filepath, "wb") as f: f.write( orjson.dumps( - self._to_dict(data), + filtered, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS, ) ) - def _to_dict(self, dd): - """ - Recursively convert defaultdict to dict. 
- """ - if isinstance(dd, defaultdict): - dd = {k: self._to_dict(v) for k, v in dd.items()} - - return dd - # MARK: parse dump def parse_dump( diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 54cf389d..f6c76d59 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -83,6 +83,7 @@ def test_get_all_data_types_for_language_user_says_yes( wikidata_dump_type=["form"], data_types="all", # because if only language given, data_types is None type_output_dir="scribe_data_json_export", # default for JSON + overwrite_all=False, ) mock_query_data.assert_not_called() @@ -101,6 +102,7 @@ def test_get_all_languages_and_data_types(self, mock_parse): data_types="all", type_output_dir="scribe_data_json_export", wikidata_dump_path=None, + overwrite_all=False, ) # MARK: Language and Data Type @@ -281,8 +283,9 @@ def test_get_translations_no_language_specified(self, mock_parse): mock_parse.assert_called_once_with( language="all", wikidata_dump_type=["translations"], - type_output_dir="scribe_data_json_export", # default output dir for JSON + type_output_dir="scribe_data_json_export", wikidata_dump_path=None, + overwrite_all=False, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @@ -299,6 +302,7 @@ def test_get_translations_with_specific_language(self, mock_parse): wikidata_dump_type=["translations"], type_output_dir="./test_output", wikidata_dump_path=None, + overwrite_all=False, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @@ -314,6 +318,7 @@ def test_get_translations_with_dump(self, mock_parse): mock_parse.assert_called_once_with( language="German", wikidata_dump_type=["translations"], - type_output_dir="scribe_data_json_export", # default for JSON + type_output_dir="scribe_data_json_export", wikidata_dump_path="./wikidump.json", + overwrite_all=False, ) From c1bec8775b707e843d4ca9f4a597bb08e1962e3d Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 20:35:18 +0600 Subject: [PATCH 03/13] fix small bugs --- src/scribe_data/cli/get.py | 1 + src/scribe_data/wikidata/wikidata_utils.py | 12 +++- src/scribe_data/wiktionary/parse_dump.py | 66 ++++++++++++++-------- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 67e603d7..acb7b1ad 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -186,6 +186,7 @@ def prompt_user_download_all(): parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], + data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index afa9e6f9..18d01895 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -92,13 +92,23 @@ def parse_wd_lexeme_dump( # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": language = list(language_metadata.keys()) + + # For printing: include all data types including translations + display_data_types = list(data_type_metadata.keys()) + + # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": - # Exclude translations as it's a separate section data_types = [ dt for dt in data_type_metadata.keys() if dt != "translations" and dt != "emoji-keywords" ] + display_data_types += ["translations"] + else: + display_data_types = data_types + + print(f"Languages to process: {language}") + print(f"Data 
types to process: {display_data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 97ce3eca..d2a359c9 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -70,7 +70,7 @@ def __init__( # Separate data structures. self.translations_index = defaultdict( - lambda: defaultdict(lambda: defaultdict(dict)) + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) ) self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) @@ -169,8 +169,9 @@ def _process_translations(self, lexeme, word, lang_code, category_name): """ translations = {} valid_iso_codes = self.valid_iso_codes + lexeme_id = lexeme["id"] - # Pre-fetch senses to avoid repeated lookups. + # Pre-fetch senses to avoid repeated lookups for sense in lexeme["senses"]: if glosses := sense.get("glosses"): translations.update( @@ -180,7 +181,21 @@ def _process_translations(self, lexeme, word, lang_code, category_name): ) if translations: - self.translations_index[word][lang_code][category_name] = translations + self.translations_index[lang_code][category_name][lexeme_id][word] = ( + translations + ) + + # Debug: Print translations_index for specific words + # if word.lower() in ["ändern", "cat", "dog"]: # Add any words to debug + # print("\nStored in translations_index:") + # print(f"Word: {word}") + # print(f"ID: {lexeme_id}") + # print(f"Language: {lang_code}") + # print(f"Category: {category_name}") + # print("Translations:", orjson.dumps( + # translations, + # option=orjson.OPT_INDENT_2 + # ).decode('utf-8')) def _process_forms(self, lexeme, lang_code, category_name): """ @@ -321,13 +336,13 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N ) return - filtered = { - word: {language_iso: lang_data[language_iso]} - for word, lang_data in self.translations_index.items() - if language_iso in lang_data - } + # Flatten the category level + filtered = {} + for category_data in self.translations_index[language_iso].values(): + for lexeme_id, word_data in category_data.items(): + filtered[lexeme_id] = word_data - # Check if filtered data is empty before saving. + # Check if filtered data is empty before saving if not filtered: print(f"No translations found for {language_iso}, skipping export...") return @@ -424,7 +439,7 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): lang_name = self.iso_to_name[language_iso] # Create language-specific directory - lang_filepath = base_path.parent / lang_name / base_path.name + lang_filepath = base_path.parent / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") @@ -533,9 +548,9 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - print(f"Languages to process: {languages}") - if data_types: - print(f"Data types to process: {data_types}") + if not data_types or not languages: + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") @@ -547,18 +562,23 @@ def parse_dump( processor.process_file(file_path) # MARK: Handle JSON exports - - # (a) If "translations" in parse_type -> export them. 
if "translations" in parse_type: - index_path = Path(output_dir) / "lexeme_translations.json" - - # Export translations for each ISO found. - iso_codes = set() - for word_data in processor.translations_index.values(): - iso_codes.update(word_data.keys()) - for iso_code in iso_codes: - if iso_code in processor.iso_to_name: + for language in languages: + # Get the ISO code for the language + iso_code = None + for iso, name in processor.iso_to_name.items(): + if name.lower() == language.lower(): + iso_code = iso + break + + if iso_code: + index_path = Path(output_dir) / language / "lexeme_translations.json" + # Ensure parent directory exists + index_path.parent.mkdir(parents=True, exist_ok=True) + # print(f"Exporting translations for {language} to {index_path}") processor.export_translations_json(str(index_path), iso_code) + else: + print(f"Warning: Could not find ISO code for {language}") # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: From 78b82c05d09d11dd833c7fda1e510b95920292c3 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 23:41:06 +0600 Subject: [PATCH 04/13] translation add L:id --- src/scribe_data/wiktionary/parse_dump.py | 43 +++++++++++++++++------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index d2a359c9..f9fcb158 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -72,7 +72,7 @@ def __init__( self.translations_index = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) ) - self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.forms_index = defaultdict(lambda: defaultdict(list)) # Stats. self.stats = {"processed_entries": 0, "processing_time": 0} @@ -82,6 +82,9 @@ def __init__( self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) + # For "unique_forms" usage. + self.unique_forms = defaultdict(lambda: defaultdict(list)) + # Cache for feature labels. 
self._feature_label_cache = {} for category, items in lexeme_form_metadata.items(): @@ -185,18 +188,6 @@ def _process_translations(self, lexeme, word, lang_code, category_name): translations ) - # Debug: Print translations_index for specific words - # if word.lower() in ["ändern", "cat", "dog"]: # Add any words to debug - # print("\nStored in translations_index:") - # print(f"Word: {word}") - # print(f"ID: {lexeme_id}") - # print(f"Language: {lang_code}") - # print(f"Category: {category_name}") - # print("Translations:", orjson.dumps( - # translations, - # option=orjson.OPT_INDENT_2 - # ).decode('utf-8')) - def _process_forms(self, lexeme, lang_code, category_name): """ Optimized forms processing @@ -215,6 +206,15 @@ def _process_forms(self, lexeme, lang_code, category_name): for rep_data in representations.values(): if form_value := rep_data.get("value"): + features = form.get("grammaticalFeatures", []) + + # If features are not empty and not already in the list + if ( + features + and features not in self.unique_forms[lang_code][category_name] + ): + self.unique_forms[lang_code][category_name].append(features) + if features := form.get("grammaticalFeatures"): if form_name := self._get_form_name(features): cat_dict[form_name] = form_value @@ -594,3 +594,20 @@ def parse_dump( processor.export_forms_json( filepath=str(index_path), language_iso=iso_code, data_type=dt ) + + # def print_unique_forms(unique_forms): + # """ + # Pretty print unique grammatical feature sets + # """ + # for lang, lang_data in unique_forms.items(): + # print(f"\nLanguage: {lang}") + # for category, features_list in lang_data.items(): + # print(f" Category: {category}") + # print(f" Total unique feature sets: {len(features_list)}") + # print(" Feature Sets:") + # for i, feature_set in enumerate(features_list, 1): + # # Convert QIDs to a more readable format + # readable_features = [f"Q{qid}" for qid in feature_set] + # print(f" {i}. {readable_features}") + + # print_unique_forms(processor.unique_forms) From 192b09cddc46eb2fd56c9b8fd85cff858f7fd108 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 23:56:40 +0600 Subject: [PATCH 05/13] fix tests and add tests for QID --- src/scribe_data/cli/get.py | 4 ++- tests/cli/test_get.py | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index acb7b1ad..6659c0ba 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -123,6 +123,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types="all", type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) else: @@ -145,6 +146,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types=[data_type], type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) else: @@ -181,12 +183,12 @@ def prompt_user_download_all(): # MARK: Translations elif data_type == "translations": + # If no language specified, use "all". 
if language is None: language = "all" parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], - data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index f6c76d59..836a5699 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -83,6 +83,7 @@ def test_get_all_data_types_for_language_user_says_yes( wikidata_dump_type=["form"], data_types="all", # because if only language given, data_types is None type_output_dir="scribe_data_json_export", # default for JSON + wikidata_dump_path=None, # explicitly set to None overwrite_all=False, ) mock_query_data.assert_not_called() @@ -322,3 +323,57 @@ def test_get_translations_with_dump(self, mock_parse): wikidata_dump_path="./wikidump.json", overwrite_all=False, ) + + # MARK: Use QID as language + + @patch("scribe_data.cli.get.parse_wd_lexeme_dump") + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_data_with_wikidata_identifier( + self, mock_questionary_confirm, mock_parse + ): + """ + Test retrieving data with a Wikidata identifier as language. + + Ensures that `parse_wd_lexeme_dump` is called with the correct parameters + when a Wikidata identifier is used. + """ + # Mock the user confirmation to return True (query Wikidata directly). + mock_questionary_confirm.return_value.ask.return_value = True + + get_data( + language="Q9217", + wikidata_dump="scribe", + output_dir="exported_json", + all_bool=True, + ) + mock_parse.assert_called_once_with( + language="Q9217", + wikidata_dump_type=["form"], + data_types="all", + type_output_dir="exported_json", + wikidata_dump_path="scribe", + overwrite_all=False, + ) + + @patch("scribe_data.cli.get.parse_wd_lexeme_dump") + def test_get_data_with_wikidata_identifier_and_data_type(self, mock_parse): + """ + Test retrieving a specific data type with a Wikidata identifier. + + Ensures that `parse_wd_lexeme_dump` is called with the correct parameters + when a Wikidata identifier and specific data type are used. 
+ """ + get_data( + language="Q9217", + data_type="nouns", + wikidata_dump="scribe", + output_dir="exported_json", + ) + mock_parse.assert_called_once_with( + language="Q9217", + wikidata_dump_type=["form"], + data_types=["nouns"], + type_output_dir="exported_json", + wikidata_dump_path="scribe", + overwrite_all=False, + ) From cfc2777729d180293b2e55295f87cdd09fb1c509 Mon Sep 17 00:00:00 2001 From: axif Date: Thu, 16 Jan 2025 00:51:01 +0600 Subject: [PATCH 06/13] fix total --- src/scribe_data/cli/total.py | 7 +- src/scribe_data/wikidata/wikidata_utils.py | 8 +- src/scribe_data/wiktionary/parse_dump.py | 27 +++++++ tests/cli/test_total.py | 90 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 89396f72..b867a48f 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -367,11 +367,15 @@ def total_wrapper( """ # Handle --all flag if all_bool and wikidata_dump: - language = "all" + if data_type is None: + data_type = "all" + if language is None: + language = "all" if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -380,6 +384,7 @@ def total_wrapper( if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=wikidata_dump, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 18d01895..cf6fb872 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -93,9 +93,6 @@ def parse_wd_lexeme_dump( if isinstance(language, str) and language.lower() == "all": language = list(language_metadata.keys()) - # For printing: include all data types including translations - display_data_types = list(data_type_metadata.keys()) - # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": data_types = [ @@ -103,12 +100,9 @@ def parse_wd_lexeme_dump( for dt in data_type_metadata.keys() if dt != "translations" and dt != "emoji-keywords" ] - display_data_types += ["translations"] - else: - display_data_types = data_types print(f"Languages to process: {language}") - print(f"Data types to process: {display_data_types}") + print(f"Data types to process: {data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index f9fcb158..fa6bd0f6 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -249,6 +249,32 @@ def _get_form_name(self, features): return "".join(form_parts) + def _process_totals(self, lexeme, lang_code, category_name): + """ + Process totals for statistical counting. 
+ """ + # Skip if we have specific data types and this category isn't in them + if self.data_types and category_name.lower() not in [ + dt.lower() for dt in self.data_types + ]: + return + + # Increment lexeme count for this language and category + self.lexical_category_counts[lang_code][category_name] += 1 + + # Count translations if they exist + if lexeme.get("senses"): + translation_count = sum( + 1 + for sense in lexeme["senses"] + if sense.get("glosses") + and any( + lang in self.valid_iso_codes for lang in sense["glosses"].keys() + ) + ) + if translation_count > 0: + self.translation_counts[lang_code][category_name] += translation_count + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): """ @@ -611,3 +637,4 @@ def parse_dump( # print(f" {i}. {readable_features}") # print_unique_forms(processor.unique_forms) + # print(processor.unique_forms) diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 7ede34b4..a9640142 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -274,3 +274,93 @@ def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes): def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): total_wrapper() + + # MARK: Using wikidata_dump + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): + """Test when wikidata_dump is True (flag without path)""" + total_wrapper(wikidata_dump=True) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_path(self, mock_parse_dump): + """Test when wikidata_dump is a file path""" + dump_path = "/path/to/dump.json" + total_wrapper(wikidata_dump=dump_path) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=dump_path, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump): + """Test when both wikidata_dump and all_bool are True""" + total_wrapper(wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="all", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dump): + """Test wikidata_dump with specific language and data type""" + total_wrapper( + language="English", data_type="nouns", wikidata_dump="/path/to/dump.json" + ) + mock_parse_dump.assert_called_once_with( + language="English", + data_types=["nouns"], + wikidata_dump_type=["total"], + wikidata_dump_path="/path/to/dump.json", + ) + + # MARK: Using QID + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.print_total_lexemes") + def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid): + """ + Test when language is provided as a QID + """ + mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217") + mock_print_total.assert_called_once_with(language="Q9217") + + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.get_total_lexemes") + def test_total_wrapper_with_qid_and_datatype(self, mock_get_total, mock_check_qid): + """ + Test when language QID and data type are provided + """ + 
mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217", data_type="nouns") + mock_get_total.assert_called_once_with(language="Q9217", data_type="nouns") + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump): + """ + Test QID with wikidata dump + """ + total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="Q9217", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.get_total_lexemes") + def test_get_total_lexemes_with_qid(self, mock_get_total): + """ + Test get_total_lexemes with QID input + """ + total_wrapper(language="Q9217", data_type="Q1084") # Q1084 is noun QID + mock_get_total.assert_called_once_with(language="Q9217", data_type="Q1084") From e302a9b35041b910213cee3c49a6e628a13ebbee Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 20 Jan 2025 03:18:03 +0600 Subject: [PATCH 07/13] fix small bugs --- .../workflows/missing_form_check&update.yaml | 133 ++++++++++ .github/workflows/update_emojis.yaml | 150 +++++++++++ .gitignore | 4 + .../check_missing_forms.py | 239 ++++++++++++++++++ .../check/check_missing_forms/download_wd.py | 102 ++++++++ .../check_missing_forms/generate_query.py | 166 ++++++++++++ .../check/check_missing_forms/get_forms.py | 173 +++++++++++++ .../check/check_missing_forms/pr_body.py | 109 ++++++++ src/scribe_data/cli/get.py | 3 +- src/scribe_data/cli/main.py | 16 +- src/scribe_data/wikidata/wikidata_utils.py | 12 +- src/scribe_data/wiktionary/parse_dump.py | 62 ++++- 12 files changed, 1158 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/missing_form_check&update.yaml create mode 100644 .github/workflows/update_emojis.yaml create mode 100644 src/scribe_data/check/check_missing_forms/check_missing_forms.py create mode 100644 src/scribe_data/check/check_missing_forms/download_wd.py create mode 100644 src/scribe_data/check/check_missing_forms/generate_query.py create mode 100644 src/scribe_data/check/check_missing_forms/get_forms.py create mode 100644 src/scribe_data/check/check_missing_forms/pr_body.py diff --git a/.github/workflows/missing_form_check&update.yaml b/.github/workflows/missing_form_check&update.yaml new file mode 100644 index 00000000..1e51ca3f --- /dev/null +++ b/.github/workflows/missing_form_check&update.yaml @@ -0,0 +1,133 @@ +name: Create Automated PR +on: + schedule: + - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month + # Allow manual trigger + workflow_dispatch: + +jobs: + check-repository: + runs-on: ubuntu-latest + outputs: + is_correct_repo: ${{ steps.check.outputs.is_correct_repo }} + steps: + - name: Check repository + id: check + run: | + if [ "$GITHUB_REPOSITORY" = "scribe-org/Scribe-Data" ]; then + echo "is_correct_repo=true" >> "$GITHUB_OUTPUT" + else + echo "is_correct_repo=false" >> "$GITHUB_OUTPUT" + echo "::warning::This workflow should only run in scribe-org/Scribe-Data repository." + fi + + create-pull-request: + needs: check-repository + if: needs.check-repository.outputs.is_correct_repo == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install rich requests tqdm + pip install -e . 
+ + - name: Generate Missing Features Data + run: | + # Set up paths + DUMP_PATH=$(PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/download_wd.py | grep "DOWNLOAD_PATH=" | cut -d'=' -f2) + QUERY_DIR="$(pwd)/src/scribe_data/wikidata/language_data_extraction" + + echo "Dump path: ${DUMP_PATH}" + echo "Query directory: ${QUERY_DIR}" + + # Check if paths exist + if [ -n "${DUMP_PATH}" ] && [ -d "${QUERY_DIR}" ]; then + # Generate the missing features data with all keys processing + PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/check_missing_forms.py "${DUMP_PATH}" "${QUERY_DIR}" --process-all-keys + else + echo "Required paths not found:" + echo "Dump path exists: $([ -n "${DUMP_PATH}" ] && echo "Yes" || echo "No")" + echo "Query directory exists: $([ -d "${QUERY_DIR}" ] && echo "Yes" || echo "No")" + exit 1 + fi + + # Debug steps to understand the state + - name: Debug Info + run: | + echo "Current branch: $(git branch --show-current)" + echo "List of changes:" + git status + + - name: Make changes + run: | + git add src/scribe_data/wikidata/language_data_extraction/**/*.sparql + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Debug Missing Features Data + if: always() + run: | + # Print the contents of the missing features JSON file if it exists + if [ -f missing_features.json ]; then + echo "Contents of missing_features.json:" + cat missing_features.json + else + echo "missing_features.json not found" + fi + + - name: Generate PR Body + id: pr-body + run: | + # Run the pr_body.py script with the missing features data + PR_BODY_CONTENT=$(python src/scribe_data/check/check_missing_forms/pr_body.py missing_features.json) + + # Debug output + echo "PR Body Content:" + echo "$PR_BODY_CONTENT" + + # Initialize PR body with delimiter + { + echo "body<> $GITHUB_OUTPUT + + - name: Debug PR Body Output + run: | + # Print the PR body content from the output + echo "PR Body from GITHUB_OUTPUT:" + cat $GITHUB_OUTPUT + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + title: 'Automated PR: Updated Language Data Files' + body: ${{ steps.pr-body.outputs.body }} + base: master + branch: automated-missing-forms-pr + delete-branch: true + draft: false + commit-message: '[create-pull-request] automated change' + committer: GitHub + author: github-actions[bot] + + # Debug step to verify PR creation attempt + - name: Check PR Creation + run: | + echo "Checking if PR was created..." + gh pr list + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/update_emojis.yaml b/.github/workflows/update_emojis.yaml new file mode 100644 index 00000000..8465147e --- /dev/null +++ b/.github/workflows/update_emojis.yaml @@ -0,0 +1,150 @@ +name: Check and Update Emoji Data +on: + schedule: + - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month + # Allow manual trigger + workflow_dispatch: + +jobs: + check-repository: + runs-on: ubuntu-latest + outputs: + is_correct_repo: ${{ steps.check.outputs.is_correct_repo }} + steps: + - name: Check repository + id: check + run: | + if [ "$GITHUB_REPOSITORY" = "scribe-org/Scribe-Data" ]; then + echo "is_correct_repo=true" >> "$GITHUB_OUTPUT" + else + echo "is_correct_repo=false" >> "$GITHUB_OUTPUT" + echo "::warning::This workflow should only run in scribe-org/Scribe-Data repository." 
+ fi + + check-and-update: + needs: check-repository + if: needs.check-repository.outputs.is_correct_repo == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + sudo apt-get install jq + + - name: Get language list + id: get-langs + run: | + # Fetch language list from GitHub API + DERIVED_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-derived-full/annotationsDerived | jq -r '.[].name') + FULL_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-full/annotations | jq -r '.[].name') + + # Combine and deduplicate language lists + LANG_LIST=$(echo "$DERIVED_LANGS $FULL_LANGS" | tr ' ' '\n' | sort -u | tr '\n' ' ') + echo "lang_list=${LANG_LIST}" >> $GITHUB_OUTPUT + echo "Detected languages: ${LANG_LIST}" + + - name: Download and check emoji data + id: check-updates + run: | + # Create directories if they don't exist + mkdir -p src/scribe_data/unicode/cldr-annotations-derived-full + mkdir -p src/scribe_data/unicode/cldr-annotations-full + + CHANGES_EXIST=false + CHANGE_SUMMARY="| Language | Derived Changes | Full Changes |\n|----------|-----------------|--------------|" + + # Use dynamic language list from previous step + for lang in ${{ steps.get-langs.outputs.lang_list }}; do + DERIVED_CHANGED="No" + FULL_CHANGED="No" + + # Download latest data for each language + mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" + mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" + + curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" -o "new_derived_$lang.json" + curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-full/annotations/$lang/annotations.json" -o "new_full_$lang.json" + + # Check derived annotations + if [ -f "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" ]; then + if ! cmp -s "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json"; then + CHANGES_EXIST=true + DERIVED_CHANGED="Yes" + fi + else + CHANGES_EXIST=true + DERIVED_CHANGED="New" + fi + + # Check full annotations + if [ -f "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" ]; then + if ! 
cmp -s "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json"; then + CHANGES_EXIST=true + FULL_CHANGED="Yes" + fi + else + CHANGES_EXIST=true + FULL_CHANGED="New" + fi + + # Only add to summary if there are changes + if [ "$DERIVED_CHANGED" != "No" ] || [ "$FULL_CHANGED" != "No" ]; then + CHANGE_SUMMARY="$CHANGE_SUMMARY\n| $lang | $DERIVED_CHANGED | $FULL_CHANGED |" + fi + done + + echo "changes_exist=${CHANGES_EXIST}" >> $GITHUB_OUTPUT + echo "change_summary<> $GITHUB_OUTPUT + echo -e "$CHANGE_SUMMARY" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Update files if changed + if: steps.check-updates.outputs.changes_exist == 'true' + run: | + # Use dynamic language list + for lang in ${{ steps.get-langs.outputs.lang_list }}; do + mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" + mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" + + mv "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" + mv "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" + done + + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Create Pull Request + if: steps.check-updates.outputs.changes_exist == 'true' + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + title: 'chore: Update emoji annotations data' + body: | + This PR updates the emoji annotations data from CLDR. + + ## Changes Summary + ${{ steps.check-updates.outputs.change_summary }} + + ### Legend: + - Yes: File was updated + - New: File was newly added + - No: No changes + + This is an automated PR created by the emoji data update workflow. + branch: update-emoji-data # Branch name + delete-branch: true + commit-message: 'chore: Update emoji annotations data' + labels: | + automated pr + emoji-data diff --git a/.gitignore b/.gitignore index 4bcc3809..475ba504 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,7 @@ scribe_data_wikidata_dumps_export/* # MARK: Wiki Dumps *.json.bz2 + +# MARK: GitHub Actions + +missing_features.json diff --git a/src/scribe_data/check/check_missing_forms/check_missing_forms.py b/src/scribe_data/check/check_missing_forms/check_missing_forms.py new file mode 100644 index 00000000..ac57e498 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/check_missing_forms.py @@ -0,0 +1,239 @@ +""" +Check for missing forms in Wikidata. + +.. raw:: html + +""" + +import json +import sys +import argparse +from pathlib import Path +from get_forms import parse_sparql_files, extract_dump_forms +from generate_query import generate_query +from collections import defaultdict +from scribe_data.utils import ( + lexeme_form_metadata, + language_metadata, + data_type_metadata, +) + + +def get_all_languages(): + """ + Extract all languages and sublanguages from language metadata. + + Returns + ------- + list of str + List of language codes for all languages and sublanguages that have + both ISO codes and QIDs defined. + + Notes + ----- + Only includes languages and sublanguages that have both 'iso' and 'qid' + fields in their metadata. + """ + languages = [] + + for lang, lang_data in language_metadata.items(): + # Add main language if it has ISO and QID. + if "iso" in lang_data and "qid" in lang_data: + languages.append(lang) + + # Add sublanguages. 
+ if "sub_languages" in lang_data: + for sublang, sublang_data in lang_data["sub_languages"].items(): + if "iso" in sublang_data and "qid" in sublang_data: + languages.append(sublang) + + return languages + + +def get_missing_features(result_sparql, result_dump): + """ + Compare features between SPARQL results and dump data to find missing ones. + + Parameters + ---------- + result_sparql : dict + Features extracted from SPARQL queries. + Format: {language: {data_type: [features]}} + result_dump : dict + Features extracted from Wikidata dump. + Format: {language: {data_type: [features]}} + + Returns + ------- + dict or None + Dictionary of missing features by language and data type if any found, + otherwise None. + Format: {language: {data_type: [missing_features]}} + + Notes + ----- + Only includes features that have valid QIDs present in lexeme_form_metadata. + """ + missing_by_lang_type = defaultdict(lambda: defaultdict(list)) + + # Extract all QIDs from the metadata. + all_qids = set() + for category, items in lexeme_form_metadata.items(): + for key, value in items.items(): + all_qids.add(value["qid"]) + + # Compare features for each language and data type. + for lang in result_sparql: + if lang in result_dump: + # Get all unique data types from both sources. + all_data_types = set(result_sparql[lang].keys()) | set( + result_dump[lang].keys() + ) + + for dt in all_data_types: + sparql_values = set() + dump_values = set() + + # Get values from SPARQL if available. + if dt in result_sparql[lang]: + sparql_values = set(tuple(item) for item in result_sparql[lang][dt]) + + # Get values from dump if available. + if dt in result_dump[lang]: + dump_values = set(tuple(item) for item in result_dump[lang][dt]) + + # Get unique values from both sources. + unique_dump_values = dump_values - sparql_values + unique_sparql_values = sparql_values - dump_values + + # Store valid missing features from dump. + for item in unique_dump_values: + if all(qid in all_qids for qid in item): + missing_by_lang_type[lang][dt].append(list(item)) + + # Store valid missing features from SPARQL. + for item in unique_sparql_values: + if all(qid in all_qids for qid in item): + missing_by_lang_type[lang][dt].append(list(item)) + + return missing_by_lang_type if missing_by_lang_type else None + + +def process_missing_features(missing_features, query_dir): + """ + Generate SPARQL queries for missing features by language and data type. + + Parameters + ---------- + missing_features : dict + Dictionary of missing features by language and data type. + Format: {language: {data_type: [features]}} + query_dir : str or Path + Directory where generated query files should be saved. + + Notes + ----- + Generates separate queries for each data type within each language. + """ + if not missing_features: + return + + for language, data_types in missing_features.items(): + print(f"Processing language: {language}") + print(f"Data types: {list(data_types.keys())}") + + # Create a separate entry for each data type. + for data_type, features in data_types.items(): + language_entry = {language: {data_type: features}} + print(f"Generating query for {language} - {data_type}") + generate_query(language_entry, query_dir) + + +def main(): + """ + Main function to check for missing forms in Wikidata. + + Processes command line arguments, downloads and compares Wikidata dump data + with SPARQL query results to identify missing features, and generates + appropriate SPARQL queries. 
+ + Notes + ----- + Required command line arguments: + - dump_path: Path to the Wikidata dump file + - query_dir: Directory for storing generated queries + + Optional arguments: + - --process-all-keys: Flag to process all nested keys in missing features + """ + parser = argparse.ArgumentParser(description="Check missing forms in Wikidata") + parser.add_argument("dump_path", type=str, help="Path to the dump file") + parser.add_argument("query_dir", type=str, help="Path to the query directory") + parser.add_argument( + "--process-all-keys", + action="store_true", + help="Process all nested keys in the missing features", + ) + + args = parser.parse_args() + + dump_path = Path(args.dump_path) + query_dir = Path(args.query_dir) + + if not dump_path.exists(): + print(f"Error: Dump path does not exist: {dump_path}") + sys.exit(1) + + if not query_dir.exists(): + print(f"Error: Query directory does not exist: {query_dir}") + sys.exit(1) + + # Get all languages including sublanguages. + languages = get_all_languages() + + print("Parsing SPARQL files...") + result_sparql = parse_sparql_files() + + print("Extracting Wiki lexeme dump...") + result_dump = extract_dump_forms( + languages=languages, + data_types=list(data_type_metadata.keys()), + file_path=dump_path, + ) + + missing_features = get_missing_features(result_sparql, result_dump) + + try: + print("Generated missing features:", missing_features) + + # Save the missing features to a JSON file. + with open("missing_features.json", "w") as f: + json.dump(missing_features, f, indent=4) + print("Missing features data has been saved to missing_features.json") + + if missing_features: + # Process all data types for each language. + process_missing_features(missing_features, query_dir) + + except Exception as e: + print(f"An error occurred: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/scribe_data/check/check_missing_forms/download_wd.py b/src/scribe_data/check/check_missing_forms/download_wd.py new file mode 100644 index 00000000..ccf6b797 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/download_wd.py @@ -0,0 +1,102 @@ +""" +Download Wikidata lexeme dump. + +.. raw:: html + +""" + +from pathlib import Path +from scribe_data.cli.download import download_wd_lexeme_dump +from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR +import requests +import os + + +def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): + """ + Download Wikidata lexeme dumps automatically. + + Parameters + ---------- + wikidata_dump : str, optional + Date string in YYYYMMDD format for specific dumps. + If None, downloads the latest dump. + output_dir : str, optional + Directory path for the downloaded file. + If None, uses DEFAULT_DUMP_EXPORT_DIR. + + Returns + ------- + str or False + Path to downloaded file if successful, False otherwise. + + Notes + ----- + - Downloads are skipped if the file already exists in the output directory + - Progress is displayed every 50MB during download + - Creates output directory if it doesn't exist + """ + dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") + + if not dump_url: + print("No dump URL found.") + return False + + output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR + os.makedirs(output_dir, exist_ok=True) + + filename = dump_url.split("/")[-1] + output_path = str(Path(output_dir) / filename) + + # Check if the file already exists. + if os.path.exists(output_path): + print(f"File already exists: {output_path}. 
Skipping download.") + return output_path + + # Proceed with the download if the file does not exist. + print(f"Downloading dump to {output_path}...") + + try: + response = requests.get(dump_url, stream=True) + total_size = int(response.headers.get("content-length", 0)) + downloaded_size = 0 + + with open(output_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + downloaded_size += len(chunk) + # Print progress percentage every 50MB. + if total_size and downloaded_size % (50 * 1024 * 1024) < 8192: + progress = (downloaded_size / total_size) * 100 + print(f"Download progress: {progress:.1f}%") + + print("Download completed successfully!") + return output_path + + except requests.exceptions.RequestException as e: + print(f"Error downloading dump: {e}") + + except Exception as e: + print(f"An error occurred: {e}") + + +if __name__ == "__main__": + output_path = wd_lexeme_dump_download() + if output_path: + print(f"DOWNLOAD_PATH={output_path}") diff --git a/src/scribe_data/check/check_missing_forms/generate_query.py b/src/scribe_data/check/check_missing_forms/generate_query.py new file mode 100644 index 00000000..71bfde50 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/generate_query.py @@ -0,0 +1,166 @@ +""" +Generate SPARQL queries for missing lexeme forms. + +.. raw:: html + +""" + +from scribe_data.utils import ( + lexeme_form_metadata, + language_metadata, + data_type_metadata, + LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, +) + +import os +from pathlib import Path + + +def generate_query(missing_features, query_dir=None): + """ + Generate SPARQL queries for missing lexeme forms. + + Parameters + ---------- + missing_features : dict + Dictionary containing missing features by language and data type. + Format: {language_qid: {data_type_qid: [[form_qids]]}} + query_dir : str or Path, optional + Directory where query files should be saved. + If None, uses default language_data_extraction directory. + + Returns + ------- + str + Path to the generated query file. + + Notes + ----- + - Generates a single query file combining all forms for a given + language and data type combination + - Query files are named incrementally if duplicates exist + - Creates necessary directories if they don't exist + """ + language_qid = next(iter(missing_features.keys())) + data_type_qid = next(iter(missing_features[language_qid].keys())) + + # Find the language entry by QID. + language_entry = next( + (name, data) + for name, data in language_metadata.items() + if data.get("qid") == language_qid + ) + language = language_entry[0] # The language name. + + data_type = next( + name for name, qid in data_type_metadata.items() if qid == data_type_qid + ) + + iso_code = language_metadata[language]["iso"] + + # Create a QID to label mapping from the metadata. + qid_to_label = {} + for category in lexeme_form_metadata.values(): + for item in category.values(): + qid_to_label[item["qid"]] = item["label"] + + # Process all forms at once + forms_query = [] + all_form_combinations = missing_features[language_qid][data_type_qid] + for form_qids in all_form_combinations: + # Convert QIDs to labels and join them together. + labels = [qid_to_label.get(qid, qid) for qid in form_qids] + concatenated_label = "".join(labels) + # Make first letter lowercase + concatenated_label = concatenated_label[0].lower() + concatenated_label[1:] + forms_query.append({"label": concatenated_label, "qids": form_qids}) + + # Generate a single query for all forms. 
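# A minimal standalone sketch (assumed form labels and QIDs): each concatenated
# label built above becomes one SELECT variable in the query assembled below.
_forms_query_example = [
    {"label": "nominativeSingular", "qids": ["Q131105", "Q110786"]},
    {"label": "nominativePlural", "qids": ["Q131105", "Q146786"]},
]
_select_vars_example = [f'?{form["label"]}' for form in _forms_query_example]
# _select_vars_example == ["?nominativeSingular", "?nominativePlural"]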
+ main_body = f"""# tool: scribe-data +# All {language} ({language_qid}) {data_type} ({data_type_qid}) and their forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?{data_type} + """ + "\n ".join(f'?{form["label"]}' for form in forms_query) + + where_clause = f""" + WHERE {{ + ?lexeme dct:language wd:{language_qid} ; + wikibase:lexicalCategory wd:{data_type_qid} ; + wikibase:lemma ?{data_type} . + FILTER(lang(?{data_type}) = "{iso_code}") + """ + + # Generate OPTIONAL clauses for all forms in one query. + optional_clauses = "" + for form in forms_query: + qids = ", ".join(f"wd:{qid}" for qid in form["qids"]) + optional_clauses += f""" + OPTIONAL {{ + ?lexeme ontolex:lexicalForm ?{form['label']}Form . + ?{form['label']}Form ontolex:representation ?{form['label']} ; + wikibase:grammaticalFeature {qids} . + }} +""" + + # Print the complete query. + final_query = main_body + where_clause + optional_clauses + "}" + + def get_available_filename(base_path): + """Helper function to find the next available filename""" + if not os.path.exists(base_path): + return base_path + + base, ext = os.path.splitext(base_path) + counter = 1 + + # If the base already ends with _N, start from that number. + import re + + if match := re.search(r"_(\d+)$", base): + counter = int(match.group(1)) + 1 + base = base[: match.start()] + + while True: + new_path = f"{base}_{counter}{ext}" + if not os.path.exists(new_path): + return new_path + counter += 1 + + # Create base filename using the provided query_dir or default. + if query_dir: + base_file_name = ( + Path(query_dir) / language / data_type / f"query_{data_type}.sparql" + ) + else: + base_file_name = f"{language_data_extraction}/{language}/{data_type}/query_{data_type}.sparql" + + # Get the next available filename. + file_name = get_available_filename(str(base_file_name)) + + # Create directory if it doesn't exist. + os.makedirs(os.path.dirname(file_name), exist_ok=True) + + # Write the file. + with open(file_name, "w") as file: + file.write(final_query) + + print(f"Query file created: {file_name}") + return file_name diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py new file mode 100644 index 00000000..909abf5e --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -0,0 +1,173 @@ +""" +Get forms from Wikidata. +.. raw:: html + +""" + +from scribe_data.wiktionary.parse_dump import LexemeProcessor +import re +from collections import defaultdict +from scribe_data.utils import ( + language_metadata, + data_type_metadata, + LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, +) + +iso_to_qid = { + lang_data["iso"]: lang_data["qid"] + for lang, lang_data in language_metadata.items() + if "iso" in lang_data and "qid" in lang_data +} + +all_forms = defaultdict(lambda: defaultdict(list)) + + +def parse_sparql_files(): + """ + Read and parse all SPARQL query files to extract form information. + + Returns + ------- + dict + Accumulated forms for each language and lexical category. + Format: {language: {lexical_category: [forms]}} + + Notes + ----- + Recursively searches through language_data_extraction directory + for .sparql files and accumulates all form information. 
+ """ + for sub_sub_file in language_data_extraction.rglob("*.sparql"): + with open(sub_sub_file, "r", encoding="utf-8") as query_text: + result = parse_sparql_query(query_text.read()) + + # Accumulate forms for each language and lexical category. + for lang, categories in result.items(): + for category, forms in categories.items(): + if forms: + all_forms[lang][category].extend(forms) + + return all_forms + + +def parse_sparql_query(query_text): + """ + Parse a SPARQL query to extract lexical categories and features. + + Parameters + ---------- + query_text : str + Content of the SPARQL query file. + + Returns + ------- + dict + Dictionary containing parsed information. + Format: {language: {lexical_category: [forms]}} + + Notes + ----- + Extracts: + - Language QID + - Lexical category QID + - Grammatical features from OPTIONAL blocks + """ + # Get language and category first. + language = None + lexical_category = None + + # Parse lexical category. + lexical_matches = re.finditer(r"wikibase:lexicalCategory\s+wd:(Q\d+)", query_text) + for match in lexical_matches: + lexical_category = match.group(1) + + # Parse language. + language_matches = re.finditer(r"dct:language\s+wd:(Q\d+)", query_text) + for match in language_matches: + language = match.group(1) + + result = {language: {lexical_category: []}} + + # Parse optional blocks for forms and features. + optional_blocks = re.finditer(r"OPTIONAL\s*{([^}]+)}", query_text) + + for block in optional_blocks: + block_text = block.group(1) + + # Extract grammatical features. + features = re.finditer(r"wd:(Q\d+)", block_text) + feature_list = [f.group(1) for f in features] + + if feature_list: + result[language][lexical_category].append(feature_list) + + return result + + +parse_sparql_files() + + +def extract_dump_forms( + languages=None, data_types=None, file_path="latest-lexemes.json.bz2" +): + """ + Extract unique grammatical features from Wikidata lexeme dump. + + Parameters + ---------- + languages : list of str, optional + List of language ISO codes (e.g., ['en', 'fr']) + data_types : list of str, optional + List of lexical categories (e.g., ['nouns', 'verbs']) + file_path : str, optional + Path to the lexeme dump file, by default "latest-lexemes.json.bz2" + + Returns + ------- + dict + Dictionary of unique grammatical features per language and lexical category. + Format: {language_qid: {data_type_qid: features}} + + Notes + ----- + - Converts ISO codes to QIDs in the output + - Converts data type names to their corresponding QIDs + - Only includes languages and data types that have valid QID mappings + """ + processor = LexemeProcessor( + target_iso=languages, parse_type=["form"], data_types=data_types + ) + + processor.process_file(file_path) + + unique_features = dict(processor.unique_forms) + + # Convert ISO codes to QIDs and data types to QIDs. + converted_features = {} + for iso_code, data_types_dict in unique_features.items(): + if iso_code in iso_to_qid: + lang_qid = iso_to_qid[iso_code] + converted_features[lang_qid] = {} + + for data_type, features in data_types_dict.items(): + # Get QID from data_type_metadata. 
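# A minimal standalone sketch (assumed mappings; QIDs are illustrative): the
# dump output keyed by ISO code and data type name is re-keyed by language QID
# and data type QID, as the loop below does.
_iso_to_qid_example = {"en": "Q1860"}
_data_type_metadata_example = {"nouns": "Q1084"}
_unique_forms_example = {"en": {"nouns": [["Q110786"]]}}
_converted_example = {
    _iso_to_qid_example[iso]: {
        _data_type_metadata_example[dt]: feats
        for dt, feats in dts.items()
        if dt in _data_type_metadata_example
    }
    for iso, dts in _unique_forms_example.items()
    if iso in _iso_to_qid_example
}
# _converted_example == {"Q1860": {"Q1084": [["Q110786"]]}}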
+ data_type_qid = data_type_metadata.get(data_type) + if data_type_qid: + converted_features[lang_qid][data_type_qid] = features + + return converted_features diff --git a/src/scribe_data/check/check_missing_forms/pr_body.py b/src/scribe_data/check/check_missing_forms/pr_body.py new file mode 100644 index 00000000..11aadbe9 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/pr_body.py @@ -0,0 +1,109 @@ +""" +Generate a formatted PR body describing missing features for each language. + +.. raw:: html + +""" + +import json +import sys +from scribe_data.utils import ( + language_metadata, + data_type_metadata, +) + + +def pr_body(missing_features): + """ + Generate a formatted PR body describing missing features for each language. + + Parameters + ---------- + missing_features : dict + Dictionary mapping language QIDs to their missing features. + Format: {language_qid: {feature_type: [features]}} + + Returns + ------- + str + Formatted PR body content in markdown format containing a table of + missing features grouped by language. + + Notes + ----- + The PR body includes: + - A header indicating this is an automated PR + - A table showing languages and their missing feature types + - Features are grouped by language for better readability + """ + # Initialize PR body with a header. + pr_body_content = "## Automated PR: Missing Features\n\n" + pr_body_content += "This PR was automatically created by a GitHub Action.\n\n" + pr_body_content += "### Missing Features Summary\n" + pr_body_content += "| **Language** | **Feature Type** |\n" + pr_body_content += "|--------------|------------------|\n" + + # Create a dictionary to group features by language. + grouped_features = {} + + # Iterate over the missing features to populate the table. + for entity, features in missing_features.items(): + # Check for sub-languages. + language_name = None + for name, data in language_metadata.items(): + if data.get("qid") == entity: + language_name = name + break + if "sub_languages" in data: + for sub_name, sub_data in data["sub_languages"].items(): + if sub_data.get("qid") == entity: + language_name = f"{name} ({sub_name})" + break + if language_name: + break + + # Default to entity if no name is found. + language_name = language_name or entity + + # Group features by language. + if language_name not in grouped_features: + grouped_features[language_name] = set() + + for feature in features.keys(): + feature_name = next( + (name for name, qid in data_type_metadata.items() if qid == feature), + feature, + ) + grouped_features[language_name].add(feature_name) + + # Add grouped features to the PR body. 
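# A minimal standalone sketch (assumed grouped data): each language becomes one
# markdown table row, matching the loop below.
_grouped_example = {"English": {"nouns", "verbs"}, "Basque": {"adjectives"}}
_rows_example = "".join(
    f"| **{lang}** | {', '.join(sorted(features))} |\n"
    for lang, features in sorted(_grouped_example.items())
)
# _rows_example ==
# | **Basque** | adjectives |
# | **English** | nouns, verbs |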
+ for language, features in sorted(grouped_features.items()): + feature_list = ", ".join(sorted(features)) + pr_body_content += f"| **{language}** | {feature_list} |\n" + + pr_body_content += "\nPlease review the changes and provide feedback.\n" + + print(pr_body_content) + return pr_body_content + + +if __name__ == "__main__": + with open(sys.argv[1], "r") as f: + missing_features = json.load(f) + + pr_body(missing_features) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 6659c0ba..dc56fb8e 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -198,7 +198,8 @@ def prompt_user_download_all(): # MARK: Form Dump elif wikidata_dump is not None: - if not wikidata_dump: + # If wikidata_dump is an empty string, use the default path + if wikidata_dump == "": wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( language=language, diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d51712d2..fbc67d8e 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -37,6 +37,11 @@ from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations +from scribe_data.utils import ( + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_CSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, +) LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -132,7 +137,10 @@ def main() -> None: help="The output file type.", ) get_parser.add_argument( - "-od", "--output-dir", type=str, help="The output directory path for results." + "-od", + "--output-dir", + type=str, + help=f"The output directory path for results (default: ./{DEFAULT_JSON_EXPORT_DIR} for JSON, ./{DEFAULT_CSV_EXPORT_DIR} for CSV, etc.).", ) get_parser.add_argument( "-ope", @@ -168,7 +176,7 @@ def main() -> None: "--wikidata-dump-path", nargs="?", const="", - help="Path to a local Wikidata lexemes dump. Uses default directory if no path provided.", + help=f"Path to a local Wikidata lexemes dump. 
Uses default directory (./{DEFAULT_DUMP_EXPORT_DIR}) if no path provided.", ) get_parser.add_argument( "-t", "--translation", type=str, help="parse a single word using MediaWiki API" @@ -208,7 +216,7 @@ def main() -> None: "--wikidata-dump-path", nargs="?", const=True, - help="Path to a local Wikidata lexemes dump for running with '--all'.", + help=f"Path to a local Wikidata lexemes dump for running with '--all' (default: ./{DEFAULT_DUMP_EXPORT_DIR}).", ) # MARK: Convert @@ -308,7 +316,7 @@ def main() -> None: "-wdp", "--wikidata-dump-path", type=str, - help="The output directory path for the downloaded dump.", + help=f"The output directory path for the downloaded dump (default: ./{DEFAULT_DUMP_EXPORT_DIR}).", ) # MARK: Interactive diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index cf6fb872..9e481d10 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -89,9 +89,17 @@ def parse_wd_lexeme_dump( overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting """ - # Convert "all" to list of all languages + # Convert "all" to list of all languages including sub-languages if isinstance(language, str) and language.lower() == "all": - language = list(language_metadata.keys()) + languages = [] + for main_lang, lang_data in language_metadata.items(): + # Add sub-languages if they exist + if "sub_languages" in lang_data: + for sub_lang in lang_data["sub_languages"]: + main_lang = sub_lang + languages.append(main_lang) + + language = languages # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index fa6bd0f6..9d66100a 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -102,6 +102,17 @@ def _build_iso_mapping(self) -> dict: """ iso_mapping = {} for lang_name, data in language_metadata.items(): + # Handle sub-languages if they exist + if "sub_languages" in data: + for sub_lang, sub_data in data["sub_languages"].items(): + if self.target_iso and sub_lang not in self.target_iso: + continue + + if iso_code := sub_data.get("iso"): + iso_mapping[iso_code] = sub_lang + continue # Skip main language if it only has sub-languages + + # Handle main languages if self.target_iso and lang_name not in self.target_iso: continue @@ -425,7 +436,23 @@ def export_forms_json( return # Create the output directory structure - output_path = Path(filepath).parent / lang_name + # Check if this is a sub-language and get its main language + main_lang = None + for lang, data in language_metadata.items(): + if "sub_languages" in data: + for sub_lang, sub_data in data["sub_languages"].items(): + if sub_lang == lang_name: + main_lang = lang + break + if main_lang: + break + + # If it's a sub-language, create path like: parent/chinese/mandarin/ + if main_lang: + output_path = Path(filepath).parent / main_lang / lang_name + else: + output_path = Path(filepath).parent / lang_name + output_path.mkdir(parents=True, exist_ok=True) # Create the full output filepath @@ -557,15 +584,42 @@ def parse_dump( for lang in languages: needs_processing = False + # Check if this is a sub-language + main_lang = None + for lang_name, data in language_metadata.items(): + if "sub_languages" in data: + for sub_lang in data["sub_languages"]: + if sub_lang == lang: + main_lang = lang_name + 
break + if main_lang: + break + for data_type in data_types: - index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + # Create appropriate path based on whether it's a sub-language + if main_lang: + index_path = ( + Path(output_dir) + / main_lang + / lang + / f"lexeme_{data_type}.json" + ) + else: + index_path = ( + Path(output_dir) / lang / f"lexeme_{data_type}.json" + ) if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) - else: - print(f"Skipping {lang}/{data_type}.json - already exists") + # Update path display in skip message + skip_path = ( + f"{main_lang}/{lang}/{data_type}.json" + if main_lang + else f"{lang}/{data_type}.json" + ) + print(f"Skipping {skip_path} - already exists") if needs_processing: languages_to_process.append(lang) From 04573726e18a8643c94bdb189dc5f8ef64295646 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:30:12 +0600 Subject: [PATCH 08/13] Refactor parameter names and fix single langugage translation error --- src/scribe_data/cli/total.py | 2 +- src/scribe_data/wikidata/wikidata_utils.py | 4 +++- src/scribe_data/wiktionary/parse_dump.py | 23 +++++++++++----------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index a0fb5105..cc98cf9a 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -373,7 +373,7 @@ def total_wrapper( if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, - data_types=[data_type], + data_types=data_type, wikidata_dump_type=["total"], wikidata_dump_path=None, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 9e481d10..c302ea78 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -110,7 +110,9 @@ def parse_wd_lexeme_dump( ] print(f"Languages to process: {language}") - print(f"Data types to process: {data_types}") + + if "translations" not in wikidata_dump_type: + print(f"Data types to process: {data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 9d66100a..b0a96abe 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -42,7 +42,7 @@ class LexemeProcessor: def __init__( self, - target_iso: Union[str, List[str]] = None, + target_lang: Union[str, List[str]] = None, parse_type: List[str] = None, data_types: List[str] = None, ): @@ -56,8 +56,8 @@ def __init__( # Pre-compute sets for faster lookups. self.parse_type = set(parse_type or []) self.data_types = set(data_types or []) - self.target_iso = set( - [target_iso] if isinstance(target_iso, str) else target_iso or [] + self.target_lang = set( + [target_lang] if isinstance(target_lang, str) else target_lang or [] ) # Pre-compute valid categories and languages. @@ -98,14 +98,14 @@ def __init__( def _build_iso_mapping(self) -> dict: """ Build mapping of ISO codes to language names based on language_metadata. - If self.target_iso is non-null, only include those iso codes. + If self.target_lang is non-null, only include those iso codes. 
""" iso_mapping = {} for lang_name, data in language_metadata.items(): # Handle sub-languages if they exist if "sub_languages" in data: for sub_lang, sub_data in data["sub_languages"].items(): - if self.target_iso and sub_lang not in self.target_iso: + if self.target_lang and sub_lang not in self.target_lang: continue if iso_code := sub_data.get("iso"): @@ -113,13 +113,13 @@ def _build_iso_mapping(self) -> dict: continue # Skip main language if it only has sub-languages # Handle main languages - if self.target_iso and lang_name not in self.target_iso: + if self.target_lang and lang_name not in self.target_lang: continue if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name - for language in self.target_iso: + for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: @@ -628,16 +628,17 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - if not data_types or not languages: - print("No data types or languages provided. Nothing to process.") - return + if "translations" not in parse_type: + if not data_types or not languages: + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") return processor = LexemeProcessor( - target_iso=languages, parse_type=parse_type, data_types=data_types + target_lang=languages, parse_type=parse_type, data_types=data_types ) processor.process_file(file_path) From 2479fe22609ba1cac04252a398cad206c79e96a5 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:35:20 +0600 Subject: [PATCH 09/13] fix target_lang occored when fixing conflict --- src/scribe_data/wiktionary/parse_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index edda3176..107e5e98 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -102,7 +102,7 @@ def _build_iso_mapping(self) -> dict: if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name - for language in self.target_iso: + for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: From 386d6a0343a79ce605c1657eeb5763cc53582610 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:37:52 +0600 Subject: [PATCH 10/13] fix total tests --- tests/cli/test_total.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 6893f15d..089fde97 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -265,7 +265,7 @@ def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): total_wrapper(wikidata_dump=True) mock_parse_dump.assert_called_once_with( language=None, - data_types=[None], + data_types=None, wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -288,7 +288,7 @@ def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump): total_wrapper(wikidata_dump=True, all_bool=True) mock_parse_dump.assert_called_once_with( language="all", - data_types=["all"], + data_types="all", wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -335,7 +335,7 @@ def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump): total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True) 
mock_parse_dump.assert_called_once_with( language="Q9217", - data_types=["all"], + data_types="all", wikidata_dump_type=["total"], wikidata_dump_path=None, ) From 8c109621d749de8605214b3db133d4b330edc6d4 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 09:53:22 +0100 Subject: [PATCH 11/13] Misc formatting + simplifying code where possible --- ...te.yaml => missing_form_check_update.yaml} | 26 ++-- .github/workflows/update_emojis.yaml | 32 ++--- src/scribe_data/check/__init__.py | 0 .../check/check_missing_forms/__init__.py | 0 .../check_missing_forms.py | 57 ++++----- .../check/check_missing_forms/download_wd.py | 35 ++---- .../check_missing_forms/generate_query.py | 54 ++++----- .../check/check_missing_forms/get_forms.py | 36 ++---- .../check/check_missing_forms/pr_body.py | 38 ++---- src/scribe_data/cli/get.py | 4 +- src/scribe_data/cli/list.py | 2 + src/scribe_data/cli/main.py | 8 +- src/scribe_data/cli/total.py | 1 + src/scribe_data/utils.py | 3 +- .../{wiktionary => wikidata}/parse_dump.py | 114 ++++++++++-------- src/scribe_data/wikidata/wikidata_utils.py | 8 +- src/scribe_data/wiktionary/parse_mediaWiki.py | 4 +- tests/cli/test_total.py | 4 +- 18 files changed, 190 insertions(+), 236 deletions(-) rename .github/workflows/{missing_form_check&update.yaml => missing_form_check_update.yaml} (89%) create mode 100644 src/scribe_data/check/__init__.py create mode 100644 src/scribe_data/check/check_missing_forms/__init__.py rename src/scribe_data/{wiktionary => wikidata}/parse_dump.py (91%) diff --git a/.github/workflows/missing_form_check&update.yaml b/.github/workflows/missing_form_check_update.yaml similarity index 89% rename from .github/workflows/missing_form_check&update.yaml rename to .github/workflows/missing_form_check_update.yaml index 1e51ca3f..96219e1a 100644 --- a/.github/workflows/missing_form_check&update.yaml +++ b/.github/workflows/missing_form_check_update.yaml @@ -1,9 +1,9 @@ name: Create Automated PR on: schedule: - - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month - # Allow manual trigger - workflow_dispatch: + # Runs at 00:00 UTC on the first day of every month. + - cron: "0 0 1 * *" + workflow_dispatch: # allow manual trigger jobs: check-repository: @@ -34,7 +34,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | @@ -54,7 +54,7 @@ jobs: # Check if paths exist if [ -n "${DUMP_PATH}" ] && [ -d "${QUERY_DIR}" ]; then - # Generate the missing features data with all keys processing + # Generate the missing features data with all keys processing. PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/check_missing_forms.py "${DUMP_PATH}" "${QUERY_DIR}" --process-all-keys else echo "Required paths not found:" @@ -63,7 +63,7 @@ jobs: exit 1 fi - # Debug steps to understand the state + # Debug steps to understand the state. - name: Debug Info run: | echo "Current branch: $(git branch --show-current)" @@ -79,7 +79,7 @@ jobs: - name: Debug Missing Features Data if: always() run: | - # Print the contents of the missing features JSON file if it exists + # Print the contents of the missing features JSON file if it exists. 
if [ -f missing_features.json ]; then echo "Contents of missing_features.json:" cat missing_features.json @@ -90,10 +90,10 @@ jobs: - name: Generate PR Body id: pr-body run: | - # Run the pr_body.py script with the missing features data + # Run the pr_body.py script with the missing features data. PR_BODY_CONTENT=$(python src/scribe_data/check/check_missing_forms/pr_body.py missing_features.json) - # Debug output + # Debug output. echo "PR Body Content:" echo "$PR_BODY_CONTENT" @@ -106,7 +106,7 @@ jobs: - name: Debug PR Body Output run: | - # Print the PR body content from the output + # Print the PR body content from the output. echo "PR Body from GITHUB_OUTPUT:" cat $GITHUB_OUTPUT @@ -114,17 +114,17 @@ jobs: uses: peter-evans/create-pull-request@v5 with: token: ${{ secrets.GITHUB_TOKEN }} - title: 'Automated PR: Updated Language Data Files' + title: "Automated PR: Updated Language Data Files" body: ${{ steps.pr-body.outputs.body }} base: master branch: automated-missing-forms-pr delete-branch: true draft: false - commit-message: '[create-pull-request] automated change' + commit-message: "[create-pull-request] automated change" committer: GitHub author: github-actions[bot] - # Debug step to verify PR creation attempt + # Debug step to verify PR creation attempt. - name: Check PR Creation run: | echo "Checking if PR was created..." diff --git a/.github/workflows/update_emojis.yaml b/.github/workflows/update_emojis.yaml index 8465147e..14514cec 100644 --- a/.github/workflows/update_emojis.yaml +++ b/.github/workflows/update_emojis.yaml @@ -1,9 +1,9 @@ name: Check and Update Emoji Data on: schedule: - - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month - # Allow manual trigger - workflow_dispatch: + # Runs at 00:00 UTC on the first day of every month. + - cron: "0 0 1 * *" + workflow_dispatch: # allow manual trigger jobs: check-repository: @@ -34,7 +34,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | @@ -45,11 +45,11 @@ jobs: - name: Get language list id: get-langs run: | - # Fetch language list from GitHub API + # Fetch language list from GitHub API. DERIVED_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-derived-full/annotationsDerived | jq -r '.[].name') FULL_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-full/annotations | jq -r '.[].name') - # Combine and deduplicate language lists + # Combine and deduplicate language lists. LANG_LIST=$(echo "$DERIVED_LANGS $FULL_LANGS" | tr ' ' '\n' | sort -u | tr '\n' ' ') echo "lang_list=${LANG_LIST}" >> $GITHUB_OUTPUT echo "Detected languages: ${LANG_LIST}" @@ -57,26 +57,26 @@ jobs: - name: Download and check emoji data id: check-updates run: | - # Create directories if they don't exist + # Create directories if they don't exist. mkdir -p src/scribe_data/unicode/cldr-annotations-derived-full mkdir -p src/scribe_data/unicode/cldr-annotations-full CHANGES_EXIST=false CHANGE_SUMMARY="| Language | Derived Changes | Full Changes |\n|----------|-----------------|--------------|" - # Use dynamic language list from previous step + # Use dynamic language list from previous step. for lang in ${{ steps.get-langs.outputs.lang_list }}; do DERIVED_CHANGED="No" FULL_CHANGED="No" - # Download latest data for each language + # Download latest data for each language. 
mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" -o "new_derived_$lang.json" curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-full/annotations/$lang/annotations.json" -o "new_full_$lang.json" - # Check derived annotations + # Check derived annotations. if [ -f "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" ]; then if ! cmp -s "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json"; then CHANGES_EXIST=true @@ -87,7 +87,7 @@ jobs: DERIVED_CHANGED="New" fi - # Check full annotations + # Check full annotations. if [ -f "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" ]; then if ! cmp -s "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json"; then CHANGES_EXIST=true @@ -98,7 +98,7 @@ jobs: FULL_CHANGED="New" fi - # Only add to summary if there are changes + # Only add to summary if there are changes. if [ "$DERIVED_CHANGED" != "No" ] || [ "$FULL_CHANGED" != "No" ]; then CHANGE_SUMMARY="$CHANGE_SUMMARY\n| $lang | $DERIVED_CHANGED | $FULL_CHANGED |" fi @@ -112,7 +112,7 @@ jobs: - name: Update files if changed if: steps.check-updates.outputs.changes_exist == 'true' run: | - # Use dynamic language list + # Use dynamic language list. for lang in ${{ steps.get-langs.outputs.lang_list }}; do mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" @@ -129,7 +129,7 @@ jobs: uses: peter-evans/create-pull-request@v5 with: token: ${{ secrets.GITHUB_TOKEN }} - title: 'chore: Update emoji annotations data' + title: "chore: Update emoji annotations data" body: | This PR updates the emoji annotations data from CLDR. @@ -142,9 +142,9 @@ jobs: - No: No changes This is an automated PR created by the emoji data update workflow. - branch: update-emoji-data # Branch name + branch: update-emoji-data # branch name delete-branch: true - commit-message: 'chore: Update emoji annotations data' + commit-message: "chore: Update emoji annotations data" labels: | automated pr emoji-data diff --git a/src/scribe_data/check/__init__.py b/src/scribe_data/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scribe_data/check/check_missing_forms/__init__.py b/src/scribe_data/check/check_missing_forms/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scribe_data/check/check_missing_forms/check_missing_forms.py b/src/scribe_data/check/check_missing_forms/check_missing_forms.py index ac57e498..43039976 100644 --- a/src/scribe_data/check/check_missing_forms/check_missing_forms.py +++ b/src/scribe_data/check/check_missing_forms/check_missing_forms.py @@ -1,52 +1,37 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Check for missing forms in Wikidata. - -.. 
raw:: html - """ +import argparse import json import sys -import argparse +from collections import defaultdict from pathlib import Path -from get_forms import parse_sparql_files, extract_dump_forms + from generate_query import generate_query -from collections import defaultdict +from get_forms import extract_dump_forms, parse_sparql_files + from scribe_data.utils import ( - lexeme_form_metadata, - language_metadata, data_type_metadata, + language_metadata, + lexeme_form_metadata, ) def get_all_languages(): """ - Extract all languages and sublanguages from language metadata. + Extract all languages and sub languages from language metadata. Returns ------- list of str - List of language codes for all languages and sublanguages that have + List of language codes for all languages and sub languages that have both ISO codes and QIDs defined. Notes ----- - Only includes languages and sublanguages that have both 'iso' and 'qid' + Only includes languages and sub languages that have both 'iso' and 'qid' fields in their metadata. """ languages = [] @@ -56,11 +41,13 @@ def get_all_languages(): if "iso" in lang_data and "qid" in lang_data: languages.append(lang) - # Add sublanguages. + # Add sub languages. if "sub_languages" in lang_data: - for sublang, sublang_data in lang_data["sub_languages"].items(): - if "iso" in sublang_data and "qid" in sublang_data: - languages.append(sublang) + languages.extend( + sublang + for sublang, sublang_data in lang_data["sub_languages"].items() + if "iso" in sublang_data and "qid" in sublang_data + ) return languages @@ -74,6 +61,7 @@ def get_missing_features(result_sparql, result_dump): result_sparql : dict Features extracted from SPARQL queries. Format: {language: {data_type: [features]}} + result_dump : dict Features extracted from Wikidata dump. Format: {language: {data_type: [features]}} @@ -111,11 +99,11 @@ def get_missing_features(result_sparql, result_dump): # Get values from SPARQL if available. if dt in result_sparql[lang]: - sparql_values = set(tuple(item) for item in result_sparql[lang][dt]) + sparql_values = {tuple(item) for item in result_sparql[lang][dt]} # Get values from dump if available. if dt in result_dump[lang]: - dump_values = set(tuple(item) for item in result_dump[lang][dt]) + dump_values = {tuple(item) for item in result_dump[lang][dt]} # Get unique values from both sources. unique_dump_values = dump_values - sparql_values @@ -131,7 +119,7 @@ def get_missing_features(result_sparql, result_dump): if all(qid in all_qids for qid in item): missing_by_lang_type[lang][dt].append(list(item)) - return missing_by_lang_type if missing_by_lang_type else None + return missing_by_lang_type or None def process_missing_features(missing_features, query_dir): @@ -143,6 +131,7 @@ def process_missing_features(missing_features, query_dir): missing_features : dict Dictionary of missing features by language and data type. Format: {language: {data_type: [features]}} + query_dir : str or Path Directory where generated query files should be saved. @@ -203,7 +192,7 @@ def main(): print(f"Error: Query directory does not exist: {query_dir}") sys.exit(1) - # Get all languages including sublanguages. + # Get all languages including sub languages. 
languages = get_all_languages() print("Parsing SPARQL files...") diff --git a/src/scribe_data/check/check_missing_forms/download_wd.py b/src/scribe_data/check/check_missing_forms/download_wd.py index ccf6b797..c8efb137 100644 --- a/src/scribe_data/check/check_missing_forms/download_wd.py +++ b/src/scribe_data/check/check_missing_forms/download_wd.py @@ -1,30 +1,15 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Download Wikidata lexeme dump. - -.. raw:: html - """ +import os from pathlib import Path + +import requests + from scribe_data.cli.download import download_wd_lexeme_dump from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR -import requests -import os def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): @@ -36,6 +21,7 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): wikidata_dump : str, optional Date string in YYYYMMDD format for specific dumps. If None, downloads the latest dump. + output_dir : str, optional Directory path for the downloaded file. If None, uses DEFAULT_DUMP_EXPORT_DIR. @@ -47,9 +33,9 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): Notes ----- - - Downloads are skipped if the file already exists in the output directory - - Progress is displayed every 50MB during download - - Creates output directory if it doesn't exist + - Downloads are skipped if the file already exists in the output directory. + - Progress is displayed every 50MB during download. + - Creates output directory if it doesn't exist. """ dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") @@ -97,6 +83,5 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): if __name__ == "__main__": - output_path = wd_lexeme_dump_download() - if output_path: + if output_path := wd_lexeme_dump_download(): print(f"DOWNLOAD_PATH={output_path}") diff --git a/src/scribe_data/check/check_missing_forms/generate_query.py b/src/scribe_data/check/check_missing_forms/generate_query.py index 71bfde50..42f04eac 100644 --- a/src/scribe_data/check/check_missing_forms/generate_query.py +++ b/src/scribe_data/check/check_missing_forms/generate_query.py @@ -1,34 +1,19 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Generate SPARQL queries for missing lexeme forms. - -.. raw:: html - """ +import os +from pathlib import Path + from scribe_data.utils import ( - lexeme_form_metadata, - language_metadata, - data_type_metadata, LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, ) - -import os -from pathlib import Path +from scribe_data.utils import ( + data_type_metadata, + language_metadata, + lexeme_form_metadata, +) def generate_query(missing_features, query_dir=None): @@ -40,6 +25,7 @@ def generate_query(missing_features, query_dir=None): missing_features : dict Dictionary containing missing features by language and data type. Format: {language_qid: {data_type_qid: [[form_qids]]}} + query_dir : str or Path, optional Directory where query files should be saved. If None, uses default language_data_extraction directory. @@ -51,10 +37,9 @@ def generate_query(missing_features, query_dir=None): Notes ----- - - Generates a single query file combining all forms for a given - language and data type combination - - Query files are named incrementally if duplicates exist - - Creates necessary directories if they don't exist + - Generates a single query file combining all forms for a given language and data type combination. + - Query files are named incrementally if duplicates exist. + - Creates necessary directories if they don't exist. 
""" language_qid = next(iter(missing_features.keys())) data_type_qid = next(iter(missing_features[language_qid].keys())) @@ -65,7 +50,7 @@ def generate_query(missing_features, query_dir=None): for name, data in language_metadata.items() if data.get("qid") == language_qid ) - language = language_entry[0] # The language name. + language = language_entry[0] # the language name data_type = next( name for name, qid in data_type_metadata.items() if qid == data_type_qid @@ -79,14 +64,15 @@ def generate_query(missing_features, query_dir=None): for item in category.values(): qid_to_label[item["qid"]] = item["label"] - # Process all forms at once + # Process all forms at once. forms_query = [] all_form_combinations = missing_features[language_qid][data_type_qid] for form_qids in all_form_combinations: # Convert QIDs to labels and join them together. labels = [qid_to_label.get(qid, qid) for qid in form_qids] concatenated_label = "".join(labels) - # Make first letter lowercase + + # Make first letter lowercase. concatenated_label = concatenated_label[0].lower() + concatenated_label[1:] forms_query.append({"label": concatenated_label, "qids": form_qids}) @@ -96,12 +82,12 @@ def generate_query(missing_features, query_dir=None): # Enter this query at https://query.wikidata.org/. SELECT - (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?{data_type} """ + "\n ".join(f'?{form["label"]}' for form in forms_query) where_clause = f""" - WHERE {{ + WHERE {{ ?lexeme dct:language wd:{language_qid} ; wikibase:lexicalCategory wd:{data_type_qid} ; wikibase:lemma ?{data_type} . @@ -149,6 +135,7 @@ def get_available_filename(base_path): base_file_name = ( Path(query_dir) / language / data_type / f"query_{data_type}.sparql" ) + else: base_file_name = f"{language_data_extraction}/{language}/{data_type}/query_{data_type}.sparql" @@ -163,4 +150,5 @@ def get_available_filename(base_path): file.write(final_query) print(f"Query file created: {file_name}") + return file_name diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py index 909abf5e..6b061411 100644 --- a/src/scribe_data/check/check_missing_forms/get_forms.py +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -1,32 +1,19 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Get forms from Wikidata. -.. raw:: html - """ -from scribe_data.wiktionary.parse_dump import LexemeProcessor import re from collections import defaultdict + from scribe_data.utils import ( - language_metadata, - data_type_metadata, LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, ) +from scribe_data.utils import ( + data_type_metadata, + language_metadata, +) +from scribe_data.wikidata.parse_dump import LexemeProcessor iso_to_qid = { lang_data["iso"]: lang_data["qid"] @@ -111,9 +98,7 @@ def parse_sparql_query(query_text): # Extract grammatical features. 
features = re.finditer(r"wd:(Q\d+)", block_text) - feature_list = [f.group(1) for f in features] - - if feature_list: + if feature_list := [f.group(1) for f in features]: result[language][lexical_category].append(feature_list) return result @@ -132,8 +117,10 @@ def extract_dump_forms( ---------- languages : list of str, optional List of language ISO codes (e.g., ['en', 'fr']) + data_types : list of str, optional List of lexical categories (e.g., ['nouns', 'verbs']) + file_path : str, optional Path to the lexeme dump file, by default "latest-lexemes.json.bz2" @@ -166,8 +153,7 @@ def extract_dump_forms( for data_type, features in data_types_dict.items(): # Get QID from data_type_metadata. - data_type_qid = data_type_metadata.get(data_type) - if data_type_qid: + if data_type_qid := data_type_metadata.get(data_type): converted_features[lang_qid][data_type_qid] = features return converted_features diff --git a/src/scribe_data/check/check_missing_forms/pr_body.py b/src/scribe_data/check/check_missing_forms/pr_body.py index 11aadbe9..822ef9ec 100644 --- a/src/scribe_data/check/check_missing_forms/pr_body.py +++ b/src/scribe_data/check/check_missing_forms/pr_body.py @@ -1,30 +1,14 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Generate a formatted PR body describing missing features for each language. - -.. raw:: html - """ import json import sys + from scribe_data.utils import ( - language_metadata, data_type_metadata, + language_metadata, ) @@ -51,12 +35,13 @@ def pr_body(missing_features): - A table showing languages and their missing feature types - Features are grouped by language for better readability """ - # Initialize PR body with a header. - pr_body_content = "## Automated PR: Missing Features\n\n" - pr_body_content += "This PR was automatically created by a GitHub Action.\n\n" - pr_body_content += "### Missing Features Summary\n" - pr_body_content += "| **Language** | **Feature Type** |\n" - pr_body_content += "|--------------|------------------|\n" + pr_body_content = ( + "## Automated PR: Missing Features\n\n" + + "This PR was automatically created by a GitHub Action.\n\n" + + "### Missing Features Summary\n" + + "| **Language** | **Feature Type** |\n" + + "|--------------|------------------|\n" + ) # Create a dictionary to group features by language. 
grouped_features = {} @@ -69,11 +54,13 @@ def pr_body(missing_features): if data.get("qid") == entity: language_name = name break + if "sub_languages" in data: for sub_name, sub_data in data["sub_languages"].items(): if sub_data.get("qid") == entity: language_name = f"{name} ({sub_name})" break + if language_name: break @@ -99,6 +86,7 @@ def pr_body(missing_features): pr_body_content += "\nPlease review the changes and provide feedback.\n" print(pr_body_content) + return pr_body_content diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 7277d04d..589bc7f4 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -14,10 +14,10 @@ from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, - DEFAULT_DUMP_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump @@ -181,7 +181,7 @@ def prompt_user_download_all(): # MARK: Form Dump elif wikidata_dump is not None: - # If wikidata_dump is an empty string, use the default path + # If wikidata_dump is an empty string, use the default path. if wikidata_dump == "": wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 4a1b09a5..287df047 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -174,8 +174,10 @@ def list_wrapper( ---------- language : str The language to potentially list data types for. + data_type : str The data type to check for. + all_bool : bool Whether all languages and data types should be listed. diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 7a37e5c0..89ec60c2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -19,12 +19,12 @@ from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message -from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, DEFAULT_CSV_EXPORT_DIR, DEFAULT_DUMP_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, ) +from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -355,11 +355,13 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: start_interactive_mode(operation="get") + if args.translation: parse_wiktionary_translations(args.translation, args.output_dir) + else: print( - f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}" + f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..." ) get_data( language=args.language.lower() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index a53f2412..c1699832 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -339,6 +339,7 @@ def total_wrapper( data_type : Union[str, List[str]] The data type(s) to check for. + all_bool : bool Whether all languages and data types should be listed. 
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 6a8c270a..3ef45ac5 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -697,7 +697,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): return None elif user_input == "Download new version": - # Rename existing latest dump if it exists + # Rename existing latest dump if it exists. latest_dump = Path(output_dir) / "latest-lexemes.json.bz2" if latest_dump.exists(): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -706,6 +706,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): rprint( f"[bold green]Renamed existing dump to {backup_name}[/bold green]" ) + return False else: diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py similarity index 91% rename from src/scribe_data/wiktionary/parse_dump.py rename to src/scribe_data/wikidata/parse_dump.py index 107e5e98..d0e8689c 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -10,6 +10,8 @@ from typing import List, Union import orjson +from tqdm import tqdm + from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, check_index_exists, @@ -19,7 +21,6 @@ language_metadata, lexeme_form_metadata, ) -from tqdm import tqdm class LexemeProcessor: @@ -77,7 +78,8 @@ def __init__( item_data["label"], ) - # MARK: build iso mapping + # MARK: Build ISO Mapping + def _build_iso_mapping(self) -> dict: """ Build mapping of ISO codes to language names based on language_metadata. @@ -85,7 +87,7 @@ def _build_iso_mapping(self) -> dict: """ iso_mapping = {} for lang_name, data in language_metadata.items(): - # Handle sub-languages if they exist + # Handle sub-languages if they exist. if "sub_languages" in data: for sub_lang, sub_data in data["sub_languages"].items(): if self.target_lang and sub_lang not in self.target_lang: @@ -93,9 +95,9 @@ def _build_iso_mapping(self) -> dict: if iso_code := sub_data.get("iso"): iso_mapping[iso_code] = sub_lang - continue # Skip main language if it only has sub-languages + continue # skip main language if it only has sub-languages - # Handle main languages + # Handle main languages. if self.target_lang and lang_name not in self.target_lang: continue @@ -104,15 +106,14 @@ def _build_iso_mapping(self) -> dict: for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): - qid_to_lang = check_qid_is_language(language) - if qid_to_lang: + if qid_to_lang := check_qid_is_language(language): iso_code = get_language_iso_code(language.upper()) iso_mapping[iso_code] = qid_to_lang print(f"ISO code for {language} is {iso_code}") return iso_mapping - # MARK: process lines + # MARK: Process Lines def process_lines(self, line: str) -> None: """ Process one line of data with optimized parsing. @@ -125,7 +126,7 @@ def process_lines(self, line: str) -> None: # Combine field checks into single lookup. required_fields = ("lemmas", "lexicalCategory") - if not all(field in lexeme for field in required_fields): + if any(field not in lexeme for field in required_fields): return lexical_category = lexeme["lexicalCategory"] @@ -162,13 +163,13 @@ def process_lines(self, line: str) -> None: def _process_translations(self, lexeme, word, lang_code, category_name): """ - Optimized translations processing + Optimized translations processing. """ translations = {} valid_iso_codes = self.valid_iso_codes lexeme_id = lexeme["id"] - # Pre-fetch senses to avoid repeated lookups + # Pre-fetch senses to avoid repeated lookups. 
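# A minimal standalone sketch (assumed sense/gloss shape): only glosses whose
# language code is in the valid ISO set are kept, as in the loop below.
_senses_example = [
    {"glosses": {"en": {"value": "house"}, "fr": {"value": "maison"}, "zz": {"value": "?"}}}
]
_valid_iso_codes_example = {"en", "fr", "de"}
_translations_example = {}
for _sense in _senses_example:
    if _glosses := _sense.get("glosses"):
        _translations_example.update(
            {
                code: gloss["value"]
                for code, gloss in _glosses.items()
                if code in _valid_iso_codes_example
            }
        )
# _translations_example == {"en": "house", "fr": "maison"}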
for sense in lexeme["senses"]: if glosses := sense.get("glosses"): translations.update( @@ -184,7 +185,7 @@ def _process_translations(self, lexeme, word, lang_code, category_name): def _process_forms(self, lexeme, lang_code, category_name): """ - Optimized forms processing + Optimized forms processing. """ lexeme_id = lexeme["id"] forms_data = {} @@ -202,7 +203,7 @@ def _process_forms(self, lexeme, lang_code, category_name): if form_value := rep_data.get("value"): features = form.get("grammaticalFeatures", []) - # If features are not empty and not already in the list + # If features are not empty and not already in the list. if ( features and features not in self.unique_forms[lang_code][category_name] @@ -212,7 +213,8 @@ def _process_forms(self, lexeme, lang_code, category_name): if features := form.get("grammaticalFeatures"): if form_name := self._get_form_name(features): cat_dict[form_name] = form_value - break # Only process first representation + + break # only process first representation if forms_data: self.forms_index.update(forms_data) @@ -220,7 +222,7 @@ def _process_forms(self, lexeme, lang_code, category_name): def _get_form_name(self, features): """ - Optimized form name generation + Optimized form name generation. """ if not features: return "" @@ -238,6 +240,7 @@ def _get_form_name(self, features): if is_first: form_parts.append(label.lower()) is_first = False + else: form_parts.append(label) @@ -247,24 +250,25 @@ def _process_totals(self, lexeme, lang_code, category_name): """ Process totals for statistical counting. """ - # Skip if we have specific data types and this category isn't in them + # Skip if we have specific data types and this category isn't in them. if self.data_types and category_name.lower() not in [ dt.lower() for dt in self.data_types ]: return - # Increment lexeme count for this language and category + # Increment lexeme count for this language and category. self.lexical_category_counts[lang_code][category_name] += 1 - # Count translations if they exist + # Count translations if they exist. if lexeme.get("senses"): translation_count = sum( - 1 - for sense in lexeme["senses"] - if sense.get("glosses") - and any( - lang in self.valid_iso_codes for lang in sense["glosses"].keys() + bool( + sense.get("glosses") + and any( + lang in self.valid_iso_codes for lang in sense["glosses"].keys() + ) ) + for sense in lexeme["senses"] ) if translation_count > 0: self.translation_counts[lang_code][category_name] += translation_count @@ -356,13 +360,13 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N ) return - # Flatten the category level + # Flatten the category level. filtered = {} for category_data in self.translations_index[language_iso].values(): for lexeme_id, word_data in category_data.items(): filtered[lexeme_id] = word_data - # Check if filtered data is empty before saving + # Check if filtered data is empty before saving. if not filtered: print(f"No translations found for {language_iso}, skipping export...") return @@ -380,8 +384,10 @@ def export_forms_json( ---------- filepath : str Base path where the JSON file will be saved. + language_iso : str, optional ISO code of the language to export. If None, exports all languages. + data_type : str, optional Category of forms to export (e.g., "nouns", "verbs"). If None, exports all types. 
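# Aside: a self-contained illustration (not lines from the diff) of the sense-counting
# idiom used in _process_totals above -- it counts senses that carry at least one gloss
# in an accepted language, not the total number of glosses. The sample lexeme and ISO
# set below are invented for the example.
valid_iso_codes = {"en", "de", "fr"}

sample_lexeme = {
    "senses": [
        {"glosses": {"en": {"value": "cat"}, "ja": {"value": "neko"}}},  # counted once
        {"glosses": {"ja": {"value": "neko"}}},  # no accepted language -> not counted
        {"glosses": {}},  # empty glosses -> not counted
    ]
}

translation_count = sum(
    bool(
        sense.get("glosses")
        and any(lang in valid_iso_codes for lang in sense["glosses"].keys())
    )
    for sense in sample_lexeme["senses"]
)

assert translation_count == 1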
@@ -399,11 +405,11 @@ def export_forms_json( for id, lang_data in self.forms_index.items(): if ( language_iso in lang_data and data_type - ): # Only process if we have a data_type + ): # only process if we have a data_type if ( data_type in lang_data[language_iso] - ): # Check if this data_type exists - # Initialize the nested dictionary for this ID if it doesn't exist + ): # Check if this data_type exists. + # Initialize the nested dictionary for this ID if it doesn't exist. if id not in filtered: filtered[id] = {} @@ -413,13 +419,13 @@ def export_forms_json( lang_name = self.iso_to_name[language_iso] - # Check if filtered data is empty before saving + # Check if filtered data is empty before saving. if not filtered: print(f"No forms found for {lang_name} {data_type}, skipping export...") return - # Create the output directory structure - # Check if this is a sub-language and get its main language + # Create the output directory structure. + # Check if this is a sub-language and get its main language. main_lang = None for lang, data in language_metadata.items(): if "sub_languages" in data: @@ -430,7 +436,7 @@ def export_forms_json( if main_lang: break - # If it's a sub-language, create path like: parent/chinese/mandarin/ + # If it's a sub-language, create path like: parent/chinese/mandarin/. if main_lang: output_path = Path(filepath).parent / main_lang / lang_name else: @@ -438,10 +444,10 @@ def export_forms_json( output_path.mkdir(parents=True, exist_ok=True) - # Create the full output filepath + # Create the full output filepath. output_file = output_path / f"lexeme_{data_type}.json" - # Save the filtered data to JSON file + # Save the filtered data to JSON file. try: with open(output_file, "wb") as f: f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) @@ -459,10 +465,13 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): ---------- filtered : dict Dictionary with form features as keys and words as values. + filepath : Path Base path for saving the file. + language_iso : str ISO code of the language. + data_type : str Type of data being saved (e.g., "nouns", "verbs"). @@ -474,13 +483,13 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] - # Create language-specific directory + # Create language-specific directory. lang_filepath = base_path.parent / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") - # Save the filtered data with pretty printing + # Save the filtered data with pretty printing. with open(lang_filepath, "wb") as f: f.write( orjson.dumps( @@ -579,7 +588,7 @@ def parse_dump( break for data_type in data_types: - # Create appropriate path based on whether it's a sub-language + # Create appropriate path based on whether it's a sub-language. if main_lang: index_path = ( Path(output_dir) @@ -596,7 +605,7 @@ def parse_dump( needs_processing = True data_types_to_process.add(data_type) else: - # Update path display in skip message + # Update path display in skip message. skip_path = ( f"{main_lang}/{lang}/{data_type}.json" if main_lang @@ -611,10 +620,9 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - if "translations" not in parse_type: - if not data_types or not languages: - print("No data types or languages provided. 
Nothing to process.") - return + if "translations" not in parse_type and (not data_types or not languages): + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") @@ -628,18 +636,18 @@ def parse_dump( # MARK: Handle JSON exports if "translations" in parse_type: for language in languages: - # Get the ISO code for the language - iso_code = None - for iso, name in processor.iso_to_name.items(): - if name.lower() == language.lower(): - iso_code = iso - break - - if iso_code: + if iso_code := next( + ( + iso + for iso, name in processor.iso_to_name.items() + if name.lower() == language.lower() + ), + None, + ): index_path = Path(output_dir) / language / "lexeme_translations.json" - # Ensure parent directory exists + # Ensure parent directory exists. index_path.parent.mkdir(parents=True, exist_ok=True) - # print(f"Exporting translations for {language} to {index_path}") + # print(f"Exporting translations for {language} to {index_path}"). processor.export_translations_json(str(index_path), iso_code) else: print(f"Warning: Could not find ISO code for {language}") @@ -661,7 +669,7 @@ def parse_dump( # def print_unique_forms(unique_forms): # """ - # Pretty print unique grammatical feature sets + # Pretty print unique grammatical feature sets. # """ # for lang, lang_data in unique_forms.items(): # print(f"\nLanguage: {lang}") diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index b29a7450..36cc6bba 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -12,7 +12,7 @@ from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.utils import data_type_metadata, language_metadata -from scribe_data.wiktionary.parse_dump import parse_dump +from scribe_data.wikidata.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -72,11 +72,11 @@ def parse_wd_lexeme_dump( overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting """ - # Convert "all" to list of all languages including sub-languages + # Convert "all" to list of all languages including sub-languages. if isinstance(language, str) and language.lower() == "all": languages = [] for main_lang, lang_data in language_metadata.items(): - # Add sub-languages if they exist + # Add sub-languages if they exist. if "sub_languages" in lang_data: for sub_lang in lang_data["sub_languages"]: main_lang = sub_lang @@ -84,7 +84,7 @@ def parse_wd_lexeme_dump( language = languages - # For processing: exclude translations and emoji-keywords + # For processing: exclude translations and emoji-keywords. 
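# Aside: the translations-export loop above replaces a manual search loop with a
# next()-plus-walrus reverse lookup from language name to ISO code. A minimal standalone
# version of that pattern, using an invented iso_to_name mapping:
iso_to_name = {"en": "English", "de": "German", "sv": "Swedish"}

language = "german"
if iso_code := next(
    (iso for iso, name in iso_to_name.items() if name.lower() == language.lower()),
    None,
):
    print(f"ISO code for {language} is {iso_code}")  # ISO code for german is de
else:
    print(f"Warning: Could not find ISO code for {language}")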
if isinstance(data_types, str) and data_types.lower() == "all": data_types = [ dt diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index d6228e93..319fe4da 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -6,7 +6,8 @@ import json import re from pathlib import Path -from scribe_data.utils import get_language_from_iso, DEFAULT_MEDIAWIKI_EXPORT_DIR + +from scribe_data.utils import DEFAULT_MEDIAWIKI_EXPORT_DIR, get_language_from_iso from scribe_data.wikidata.wikidata_utils import mediaWiki_query @@ -115,6 +116,7 @@ def parse_wiktionary_translations(word, output_dir=DEFAULT_MEDIAWIKI_EXPORT_DIR) ---------- word : str The word to fetch translations for. + output_dir : str or Path, optional Directory to save JSON output (default is DEFAULT_MEDIAWIKI_EXPORT_DIR). Will be created if it doesn't exist. diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 089fde97..2d6f56e9 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -258,7 +258,8 @@ def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): total_wrapper() - # MARK: Using wikidata_dump + # MARK: Using Dump + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): """Test when wikidata_dump is True (flag without path)""" @@ -307,6 +308,7 @@ def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dum ) # MARK: Using QID + @patch("scribe_data.cli.total.check_qid_is_language") @patch("scribe_data.cli.total.print_total_lexemes") def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid): From 789b178725e7267d7aa8c8552ba5ed1c7615bc8a Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 11:14:39 +0100 Subject: [PATCH 12/13] Adding comments and minor name changes --- src/scribe_data/check/check_missing_forms/get_forms.py | 1 + src/scribe_data/wikidata/wikidata_utils.py | 8 ++++---- src/scribe_data/wiktionary/parse_mediaWiki.py | 9 ++++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py index 6b061411..30208b62 100644 --- a/src/scribe_data/check/check_missing_forms/get_forms.py +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -104,6 +104,7 @@ def parse_sparql_query(query_text): return result +# Debug line to parsed file. parse_sparql_files() diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 36cc6bba..d33fa096 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -19,13 +19,13 @@ sparql.setMethod(POST) -def mediaWiki_query(query: str) -> dict: +def mediawiki_query(word: str) -> dict: """ Query the Wikidata API using a MediaWiki query. Parameters ---------- - query : str + word : str The MediaWiki query to execute. Returns @@ -34,8 +34,8 @@ def mediaWiki_query(query: str) -> dict: The JSON response from the API. """ url = ( - f"https://en.wiktionary.org/w/api.php?" - f"action=query&format=json&titles={query}/translations&prop=revisions&rvprop=content" + f"https://wikidata.org/w/api.php?" 
+ f"action=query&format=json&titles={word}/translations&prop=revisions&rvprop=content" ) response = requests.get(url) return response.json() diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 319fe4da..e451830d 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -8,11 +8,14 @@ from pathlib import Path from scribe_data.utils import DEFAULT_MEDIAWIKI_EXPORT_DIR, get_language_from_iso -from scribe_data.wikidata.wikidata_utils import mediaWiki_query +from scribe_data.wikidata.wikidata_utils import mediawiki_query -def fetch_translation_page(word): - data = mediaWiki_query(word) +def fetch_translation_page(word: str): + """ + Fetches the translation for a given word via the Wiktionary MediaWiki API. + """ + data = mediawiki_query(word=word) pages = data.get("query", {}).get("pages", {}) # Extract page object from dictionary. From 06d77b284ab0c4aa1459e4723437dda820ac92bc Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 12:24:04 +0100 Subject: [PATCH 13/13] Fixes to query all user flow and outputs and test changes --- src/scribe_data/cli/get.py | 38 ++++++++++++---------- src/scribe_data/cli/main.py | 3 -- src/scribe_data/wikidata/parse_dump.py | 36 ++++++++++---------- src/scribe_data/wikidata/wikidata_utils.py | 2 +- tests/cli/test_get.py | 10 +++--- 5 files changed, 46 insertions(+), 43 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 589bc7f4..c1f47756 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -3,7 +3,7 @@ Functions for getting languages-data types packs for the Scribe-Data CLI. """ -import os # for removing original JSON files +import os from pathlib import Path from typing import List, Union @@ -94,22 +94,13 @@ def prompt_user_download_all(): Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call. """ return questionary.confirm( - "Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)", + "Do you want to query Wikidata directly? (selecting 'no' will use a Wikidata lexemes dump locally to avoid large Query Service calls)", default=False, ).ask() if all_bool: if language: if prompt_user_download_all(): - parse_wd_lexeme_dump( - language=language, - wikidata_dump_type=["form"], - data_types="all", - type_output_dir=output_dir, - wikidata_dump_path=wikidata_dump, - overwrite_all=overwrite, - ) - else: language_or_sub_language = language.split(" ")[0] print(f"Updating all data types for language: {language.title()}") query_data( @@ -122,17 +113,18 @@ def prompt_user_download_all(): f"Query completed for all data types for language {language.title()}." ) - elif data_type: - if prompt_user_download_all(): + else: parse_wd_lexeme_dump( - language="all", + language=language, wikidata_dump_type=["form"], - data_types=[data_type], + data_types="all", type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) - else: + + elif data_type: + if prompt_user_download_all(): print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( languages=None, @@ -144,6 +136,16 @@ def prompt_user_download_all(): f"Query completed for all languages for data type {data_type.capitalize()}." 
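# Aside: the "--all" flow above follows a confirm-then-branch pattern -- one questionary
# prompt decides between querying Wikidata directly and parsing a local lexemes dump.
# A stripped-down sketch; query_wikidata and parse_local_dump are placeholder callables,
# not functions from the codebase.
import questionary

def query_wikidata() -> None:
    print("Querying the Wikidata Query Service...")

def parse_local_dump() -> None:
    print("Parsing a local Wikidata lexemes dump...")

def run() -> None:
    # Defaults to "no", i.e. prefer the local dump over large Query Service calls.
    if questionary.confirm(
        "Do you want to query Wikidata directly?", default=False
    ).ask():
        query_wikidata()
    else:
        parse_local_dump()

if __name__ == "__main__":
    run()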
) + else: + parse_wd_lexeme_dump( + language="all", + wikidata_dump_type=["form"], + data_types=[data_type], + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, + ) + else: print("Updating all languages and data types...") rprint( @@ -169,6 +171,7 @@ def prompt_user_download_all(): # If no language specified, use "all". if language is None: language = "all" + parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], @@ -182,8 +185,9 @@ def prompt_user_download_all(): elif wikidata_dump is not None: # If wikidata_dump is an empty string, use the default path. - if wikidata_dump == "": + if not wikidata_dump: wikidata_dump = DEFAULT_DUMP_EXPORT_DIR + parse_wd_lexeme_dump( language=language, wikidata_dump_type=["form"], diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 89ec60c2..1c08ca52 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -360,9 +360,6 @@ def main() -> None: parse_wiktionary_translations(args.translation, args.output_dir) else: - print( - f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..." - ) get_data( language=args.language.lower() if args.language is not None diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index d0e8689c..e39b1ec3 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -373,7 +373,8 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N self._save_by_language(filtered, filepath, language_iso, "translations") - # MARK: export forms + # MARK: Export Forms + def export_forms_json( self, filepath: str, language_iso: str = None, data_type: str = None ) -> None: @@ -393,7 +394,7 @@ def export_forms_json( Notes ----- - Creates a directory structure: //lexeme_.json + Creates a directory structure: //.json Skips export if no forms are found for the specified language and data type. """ if language_iso: @@ -421,7 +422,9 @@ def export_forms_json( # Check if filtered data is empty before saving. if not filtered: - print(f"No forms found for {lang_name} {data_type}, skipping export...") + print( + f"No forms found for {lang_name.capitalize()} {data_type}, skipping export..." + ) return # Create the output directory structure. @@ -445,17 +448,19 @@ def export_forms_json( output_path.mkdir(parents=True, exist_ok=True) # Create the full output filepath. - output_file = output_path / f"lexeme_{data_type}.json" + output_file = output_path / f"{data_type}.json" # Save the filtered data to JSON file. 
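# Aside: a pathlib sketch (not lines from the diff) of the export layout described above --
# sub-languages nest under their main language and each data type gets its own
# <data_type>.json file. language_metadata here is a tiny made-up stand-in, and
# build_output_file is an illustrative helper, not project code.
from pathlib import Path

language_metadata = {"chinese": {"sub_languages": {"mandarin": {"iso": "zh"}}}}

def build_output_file(filepath: str, lang_name: str, data_type: str) -> Path:
    # Find the parent language if lang_name is a sub-language.
    main_lang = next(
        (
            lang
            for lang, data in language_metadata.items()
            if lang_name in data.get("sub_languages", {})
        ),
        None,
    )
    base = Path(filepath).parent
    output_path = base / main_lang / lang_name if main_lang else base / lang_name
    return output_path / f"{data_type}.json"

print(build_output_file("scribe_data_json_export/placeholder", "mandarin", "nouns"))
# -> scribe_data_json_export/chinese/mandarin/nouns.json (POSIX path form)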
try: with open(output_file, "wb") as f: f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) print( - f"Successfully exported forms for {lang_name} {data_type} to {output_file}" + f"Successfully exported forms for {lang_name.capitalize()} {data_type} to {output_file}" ) except Exception as e: - print(f"Error saving forms for {lang_name} {data_type}: {e}") + print( + f"Error saving forms for {lang_name.capitalize()} {data_type}: {e}" + ) def _save_by_language(self, filtered, filepath, language_iso, data_type): """ @@ -558,7 +563,7 @@ def parse_dump( if "translations" in parse_type: languages_to_process = [] for lang in languages: - index_path = Path(output_dir) / lang / "lexeme_translations.json" + index_path = Path(output_dir) / lang / "translations.json" if not check_index_exists(index_path, overwrite_all): languages_to_process.append(lang) @@ -591,19 +596,16 @@ def parse_dump( # Create appropriate path based on whether it's a sub-language. if main_lang: index_path = ( - Path(output_dir) - / main_lang - / lang - / f"lexeme_{data_type}.json" + Path(output_dir) / main_lang / lang / f"{data_type}.json" ) + else: - index_path = ( - Path(output_dir) / lang / f"lexeme_{data_type}.json" - ) + index_path = Path(output_dir) / lang / f"{data_type}.json" if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) + else: # Update path display in skip message. skip_path = ( @@ -644,7 +646,7 @@ def parse_dump( ), None, ): - index_path = Path(output_dir) / language / "lexeme_translations.json" + index_path = Path(output_dir) / language / "translations.json" # Ensure parent directory exists. index_path.parent.mkdir(parents=True, exist_ok=True) # print(f"Exporting translations for {language} to {index_path}"). @@ -654,9 +656,9 @@ def parse_dump( # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: - # For each data_type, we create a separate file, e.g. lexeme_nouns.json. + # For each data_type, we create a separate file, e.g. nouns.json. for dt in data_types: - index_path = Path(output_dir) / f"lexeme_{dt}.json" + index_path = Path(output_dir) / f"{dt}.json" iso_codes = set() for word_data in processor.forms_index.values(): iso_codes.update(word_data.keys()) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index d33fa096..7109620e 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -92,7 +92,7 @@ def parse_wd_lexeme_dump( if dt != "translations" and dt != "emoji-keywords" ] - print(f"Languages to process: {language}") + print(f"Languages to process: {[lang.capitalize() for lang in language]}") if "translations" not in wikidata_dump_type: print(f"Data types to process: {data_types}") diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 63af75a8..2cb52061 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -48,16 +48,16 @@ def test_invalid_arguments(self): @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @patch("scribe_data.cli.get.questionary.confirm") - def test_get_all_data_types_for_language_user_says_yes( + def test_get_all_data_types_for_language_user_says_no( self, mock_questionary_confirm, mock_parse, mock_query_data ): """ - Test the behavior when the user agrees to query Wikidata directly. + Test the behavior when the user agrees to use Wikidata lexeme dumps. 
This test checks that `parse_wd_lexeme_dump` is called with the correct parameters - when the user confirms they want to query Wikidata. + when the user confirms they don't want to query Wikidata. """ - mock_questionary_confirm.return_value.ask.return_value = True + mock_questionary_confirm.return_value.ask.return_value = False get_data(all_bool=True, language="English") @@ -321,7 +321,7 @@ def test_get_data_with_wikidata_identifier( when a Wikidata identifier is used. """ # Mock the user confirmation to return True (query Wikidata directly). - mock_questionary_confirm.return_value.ask.return_value = True + mock_questionary_confirm.return_value.ask.return_value = False get_data( language="Q9217",