From 0fe68642de07d3657f0924d1d871eb19ba1e3861 Mon Sep 17 00:00:00 2001 From: axif Date: Sun, 12 Jan 2025 03:03:28 +0600 Subject: [PATCH 01/13] fix small bugs --- .gitignore | 2 + src/scribe_data/cli/get.py | 5 +- src/scribe_data/cli/main.py | 10 ++-- src/scribe_data/utils.py | 13 +++++ src/scribe_data/wikidata/wikidata_utils.py | 2 - src/scribe_data/wiktionary/parse_dump.py | 21 +------- src/scribe_data/wiktionary/parse_mediaWiki.py | 53 ++++++++++++++++--- 7 files changed, 72 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 610b9da8..4bcc3809 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ scribe_data_csv_export/* scribe_data_json_export/* scribe_data_sqlite_export/* scribe_data_tsv_export/* +scribe_data_mediawiki_export/* +scribe_data_wikidata_dumps_export/* # MARK: Wiki Dumps diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index c3e98e6d..9be44075 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -34,6 +34,7 @@ DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump @@ -189,7 +190,9 @@ def prompt_user_download_all(): # MARK: Form Dump - elif wikidata_dump: + elif wikidata_dump is not None: + if not wikidata_dump: + wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( language=language, wikidata_dump_type=["form"], diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index e22f4aea..d51712d2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -166,8 +166,9 @@ def main() -> None: get_parser.add_argument( "-wdp", "--wikidata-dump-path", - type=str, - help="Path to a local Wikidata lexemes dump for running with '--all'.", + nargs="?", + const="", + help="Path to a local Wikidata lexemes dump. 
Uses default directory if no path provided.", ) get_parser.add_argument( "-t", "--translation", type=str, help="parse a single word using MediaWiki API" @@ -364,8 +365,11 @@ def main() -> None: if args.interactive: start_interactive_mode(operation="get") if args.translation: - parse_wiktionary_translations(args.translation) + parse_wiktionary_translations(args.translation, args.output_dir) else: + print( + f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}" + ) get_data( language=args.language.lower() if args.language is not None diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 153fc293..fe4b89db 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -43,6 +43,7 @@ DEFAULT_TSV_EXPORT_DIR = "scribe_data_tsv_export" DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export" DEFAULT_DUMP_EXPORT_DIR = "scribe_data_wikidata_dumps_export" +DEFAULT_MEDIAWIKI_EXPORT_DIR = "scribe_data_mediawiki_export" LANGUAGE_DATA_EXTRACTION_DIR = ( Path(__file__).parent / "wikidata" / "language_data_extraction" @@ -713,6 +714,18 @@ def check_lexeme_dump_prompt_download(output_dir: str): rprint("[bold red]No valid dumps found.[/bold red]") return None + elif user_input == "Download new version": + # Rename existing latest dump if it exists + latest_dump = Path(output_dir) / "latest-lexemes.json.bz2" + if latest_dump.exists(): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_name = f"old_latest-lexemes_{timestamp}.json.bz2" + latest_dump.rename(Path(output_dir) / backup_name) + rprint( + f"[bold green]Renamed existing dump to {backup_name}[/bold green]" + ) + return False + else: rprint("[bold blue]Skipping download.[/bold blue]") return True diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 29182070..036f58c9 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -113,5 +113,3 @@ def parse_wd_lexeme_dump( output_dir=type_output_dir, ) return - - rprint(f"[bold red]No valid dumps found in {file_path}.[/bold red]") diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index cea8de12..cffab046 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -27,7 +27,6 @@ from typing import List, Union import orjson -import questionary from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, check_index_exists, @@ -103,10 +102,7 @@ def _build_iso_mapping(self) -> dict: iso_mapping[iso_code] = lang_name for language in self.target_iso: - if ( - language.lower().startswith("q") - and language[1:].isdigit() - ): + if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: iso_code = get_language_iso_code(language.upper()) @@ -415,20 +411,7 @@ def parse_dump( parse_type = parse_type or [] data_types = data_types or [] - print(f"Languages: {languages}") - print(f"parse_type: {parse_type}") - if data_types: - print(f"data_types for forms: {data_types}") - if "total" not in parse_type: - choice = questionary.select( - "Choose an action:", - choices=["Overwrite existing data", "Skip process"], - default="Skip process", - ).ask() - if choice == "Overwrite existing data": - overwrite_all = True - # For translations, we only need to check the translations index. 
if "translations" in parse_type: languages_to_process = [] @@ -500,8 +483,6 @@ def parse_dump( # For each data_type, we create a separate file, e.g. lexeme_nouns.json. for dt in data_types: index_path = Path(output_dir) / f"lexeme_{dt}.json" - print(f"Exporting forms for {dt} to {index_path}...") - iso_codes = set() for word_data in processor.forms_index.values(): iso_codes.update(word_data.keys()) diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 6968c8ad..668c44a4 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -22,8 +22,8 @@ import json import re - -from scribe_data.utils import get_language_from_iso +from pathlib import Path +from scribe_data.utils import get_language_from_iso, DEFAULT_MEDIAWIKI_EXPORT_DIR from scribe_data.wikidata.wikidata_utils import mediaWiki_query @@ -121,16 +121,53 @@ def build_json_format(word, translations_by_lang): return book_translations -def parse_wiktionary_translations(word): +def parse_wiktionary_translations(word, output_dir=DEFAULT_MEDIAWIKI_EXPORT_DIR): """ - Parse the translations of a word from Wiktionary. + Parse translations from Wiktionary and save them to a JSON file. + + Fetches the Wiktionary page for the given word, extracts translations + across different languages, and saves them in a structured JSON format. + + Parameters + ---------- + word : str + The word to fetch translations for. + output_dir : str or Path, optional + Directory to save JSON output (default is DEFAULT_MEDIAWIKI_EXPORT_DIR). + Will be created if it doesn't exist. + + Notes + ----- + The output JSON structure follows the format: + { + "word": { + "language": { + "part_of_speech": { + "1": { + "description": "context", + "translations": "translated_text" + } + } + } + } + } """ - wikitext = fetch_translation_page(word) - translations_by_lang = parse_wikitext_for_translations(wikitext) + output_dir = output_dir or DEFAULT_MEDIAWIKI_EXPORT_DIR + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + translations_by_lang = parse_wikitext_for_translations(fetch_translation_page(word)) if not translations_by_lang: print("No translations found") return - final_json = build_json_format(word, translations_by_lang) - print(json.dumps(final_json, indent=4, ensure_ascii=False)) + json_path = output_path / f"{word}.json" + with open(json_path, "w", encoding="utf-8") as file: + json.dump( + build_json_format(word, translations_by_lang), + file, + indent=4, + ensure_ascii=False, + ) + + print(f"JSON file saved to {json_path}") From 15735d3a02a1981a17de820551ac1501fcf22fe8 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 02:16:57 +0600 Subject: [PATCH 02/13] fix small bugs --- src/scribe_data/cli/get.py | 5 + src/scribe_data/wikidata/wikidata_utils.py | 5 + src/scribe_data/wiktionary/parse_dump.py | 268 ++++++++++++++------- tests/cli/test_get.py | 9 +- 4 files changed, 192 insertions(+), 95 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 9be44075..67e603d7 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -123,6 +123,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types="all", type_output_dir=output_dir, + overwrite_all=overwrite, ) else: language_or_sub_language = language.split(" ")[0] @@ -144,6 +145,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types=[data_type], type_output_dir=output_dir, + 
overwrite_all=overwrite, ) else: print(f"Updating all languages for data type: {data_type.capitalize()}") @@ -168,6 +170,7 @@ def prompt_user_download_all(): data_types="all", type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) # MARK: Emojis @@ -185,6 +188,7 @@ def prompt_user_download_all(): wikidata_dump_type=["translations"], type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) return @@ -199,6 +203,7 @@ def prompt_user_download_all(): data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, ) return diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 036f58c9..afa9e6f9 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -64,6 +64,7 @@ def parse_wd_lexeme_dump( data_types: List[str] = None, type_output_dir: str = None, wikidata_dump_path: str = None, + overwrite_all: bool = False, ): """ Checks for the existence of a Wikidata lexeme dump and parses it if possible. @@ -84,6 +85,9 @@ def parse_wd_lexeme_dump( wikidata_dump_path : str, optional The local Wikidata lexeme dump directory that should be used to get data. + + overwrite_all : bool, default=False + If True, automatically overwrite existing files without prompting """ # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": @@ -111,5 +115,6 @@ def parse_wd_lexeme_dump( data_types=data_types, file_path=file_path, output_dir=type_output_dir, + overwrite_all=overwrite_all, ) return diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index cffab046..97ce3eca 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -34,6 +34,7 @@ language_metadata, get_language_iso_code, check_qid_is_language, + lexeme_form_metadata, ) from tqdm import tqdm @@ -74,19 +75,22 @@ def __init__( self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) # Stats. - self.stats = {"processed_entries": 0, "unique_words": 0, "processing_time": 0} + self.stats = {"processed_entries": 0, "processing_time": 0} - # For category lookups, invert data_type_metadata. - # E.g., {"Q1084": "nouns", "Q24905": "verbs", ...}. - self._category_lookup = {v: k for k, v in data_type_metadata.items()} - - # Build map from ISO to full language name. - self.iso_to_name = self._build_iso_mapping() # For "total" usage. self.lexical_category_counts = defaultdict(Counter) self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) + # Cache for feature labels. + self._feature_label_cache = {} + for category, items in lexeme_form_metadata.items(): + for item_data in items.values(): + self._feature_label_cache[item_data["qid"]] = ( + category, + item_data["label"], + ) + # MARK: build iso mapping def _build_iso_mapping(self) -> dict: """ @@ -114,29 +118,29 @@ def _build_iso_mapping(self) -> dict: # MARK: process lines def process_lines(self, line: str) -> None: """ - Process one line of data. Depending on parse_type, we do: - - total stats - - translations - - form categories (filtered by data_types) + Process one line of data with optimized parsing. """ try: + # Use faster exception handling. lexeme = orjson.loads(line.strip().rstrip(",")) if not lexeme: return - # Get common values once. 
- lemmas = lexeme.get("lemmas", {}) - lexical_category = lexeme.get("lexicalCategory") + # Combine field checks into single lookup. + required_fields = ("lemmas", "lexicalCategory") + if not all(field in lexeme for field in required_fields): + return - if not (lemmas and lexical_category in self.valid_categories): + lexical_category = lexeme["lexicalCategory"] + if lexical_category not in self.valid_categories: return category_name = self._category_lookup.get(lexical_category) if not category_name: return - # Process each type in a single pass through the data. - for lang_code, lemma_data in lemmas.items(): + # Process first valid lemma only. + for lang_code, lemma_data in lexeme["lemmas"].items(): if lang_code not in self.valid_iso_codes: continue @@ -144,50 +148,92 @@ def process_lines(self, line: str) -> None: if not word: continue - if "total" in self.parse_type: - self.lexical_category_counts[lang_code][category_name] += 1 - translation_count = sum( - len(sense.get("glosses", {})) - for sense in lexeme.get("senses", []) - ) - self.translation_counts[lang_code][category_name] += ( - translation_count - ) + parse_types = self.parse_type + if "translations" in parse_types and lexeme.get("senses"): + self._process_translations(lexeme, word, lang_code, category_name) + + if "form" in parse_types and category_name in self.data_types: + self._process_forms(lexeme, lang_code, category_name) - if "translations" in self.parse_type: - if translations := { - lang: gloss["value"] - for sense in lexeme.get("senses", []) - for lang, gloss in sense.get("glosses", {}).items() - if lang in self.valid_iso_codes - }: - self.translations_index[word][lang_code][category_name] = ( - translations - ) - - if "form" in self.parse_type and category_name in self.data_types: - forms_data = defaultdict(list) - for form in lexeme.get("forms", []): - for rep_lang, rep_data in form.get( - "representations", {} - ).items(): - if rep_lang == lang_code: - if form_value := rep_data.get("value"): - forms_data[form_value].extend( - form.get("grammaticalFeatures", []) - ) - - if forms_data: - self.forms_index[word][lang_code][category_name] = dict( - forms_data - ) - self.forms_counts[lang_code][category_name] += len(forms_data) - - break # only process first valid lemma + if "total" in parse_types: + self._process_totals(lexeme, lang_code, category_name) + + break except Exception as e: print(f"Error processing line: {e}") + def _process_translations(self, lexeme, word, lang_code, category_name): + """ + Optimized translations processing + """ + translations = {} + valid_iso_codes = self.valid_iso_codes + + # Pre-fetch senses to avoid repeated lookups. + for sense in lexeme["senses"]: + if glosses := sense.get("glosses"): + translations.update( + (lang, gloss["value"]) + for lang, gloss in glosses.items() + if lang in valid_iso_codes + ) + + if translations: + self.translations_index[word][lang_code][category_name] = translations + + def _process_forms(self, lexeme, lang_code, category_name): + """ + Optimized forms processing + """ + lexeme_id = lexeme["id"] + forms_data = {} + + # Pre-compute form data structure. 
+ forms_dict = forms_data.setdefault(lexeme_id, {}) + lang_dict = forms_dict.setdefault(lang_code, {}) + cat_dict = lang_dict.setdefault(category_name, {}) + + for form in lexeme.get("forms", []): + if not (representations := form.get("representations")): + continue + + for rep_data in representations.values(): + if form_value := rep_data.get("value"): + if features := form.get("grammaticalFeatures"): + if form_name := self._get_form_name(features): + cat_dict[form_name] = form_value + break # Only process first representation + + if forms_data: + self.forms_index.update(forms_data) + self.forms_counts[lang_code][category_name] += len(forms_data) + + def _get_form_name(self, features): + """ + Optimized form name generation + """ + if not features: + return "" + + categorized_features = defaultdict(list) + for feature in features: + if feature_info := self._feature_label_cache.get(feature): + category, label = feature_info + categorized_features[category].append((label, feature)) + + form_parts = [] + is_first = True + for category in sorted(categorized_features.keys()): + for label, _ in sorted(categorized_features[category]): + if is_first: + form_parts.append(label.lower()) + is_first = False + else: + form_parts.append(label) + + return "".join(form_parts) + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): """ @@ -293,11 +339,21 @@ def export_forms_json( self, filepath: str, language_iso: str = None, data_type: str = None ) -> None: """ - Save forms_index to file, optionally filtering by: - - language_iso - - data_type (e.g. "nouns", "adverbs") - - If data_type is given, we only export that one category from forms. + Export grammatical forms to a JSON file with readable feature labels. + + Parameters + ---------- + filepath : str + Base path where the JSON file will be saved. + language_iso : str, optional + ISO code of the language to export. If None, exports all languages. + data_type : str, optional + Category of forms to export (e.g., "nouns", "verbs"). If None, exports all types. + + Notes + ----- + Creates a directory structure: //lexeme_.json + Skips export if no forms are found for the specified language and data type. """ if language_iso: if language_iso not in self.iso_to_name: @@ -305,57 +361,83 @@ def export_forms_json( return filtered = {} - for word, lang_data in self.forms_index.items(): - if language_iso in lang_data: - # If data_type is given, only keep that category. - if data_type: - if data_type in lang_data[language_iso]: - filtered[word] = { - language_iso: { - data_type: lang_data[language_iso][data_type] - } - } - - else: - filtered[word] = {language_iso: lang_data[language_iso]} - - # Check if filtered data is empty before saving. 
+ for id, lang_data in self.forms_index.items(): + if ( + language_iso in lang_data and data_type + ): # Only process if we have a data_type + if ( + data_type in lang_data[language_iso] + ): # Check if this data_type exists + # Initialize the nested dictionary for this ID if it doesn't exist + if id not in filtered: + filtered[id] = {} + + form_data = lang_data[language_iso][data_type] + for form_name, word in form_data.items(): + filtered[id][form_name] = word + + lang_name = self.iso_to_name[language_iso] + + # Check if filtered data is empty before saving if not filtered: - print(f"No forms found for {language_iso}, skipping export...") + print(f"No forms found for {lang_name} {data_type}, skipping export...") return - self._save_by_language( - filtered, filepath, language_iso, data_type or "forms" - ) + # Create the output directory structure + output_path = Path(filepath).parent / lang_name + output_path.mkdir(parents=True, exist_ok=True) + + # Create the full output filepath + output_file = output_path / f"lexeme_{data_type}.json" + + # Save the filtered data to JSON file + try: + with open(output_file, "wb") as f: + f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) + print( + f"Successfully exported forms for {lang_name} {data_type} to {output_file}" + ) + except Exception as e: + print(f"Error saving forms for {lang_name} {data_type}: {e}") - def _save_by_language(self, data, filepath, language_iso, category_type): + def _save_by_language(self, filtered, filepath, language_iso, data_type): """ - Save data to exports//filename. + Save filtered data to language-specific directory. + + Parameters + ---------- + filtered : dict + Dictionary with form features as keys and words as values. + filepath : Path + Base path for saving the file. + language_iso : str + ISO code of the language. + data_type : str + Type of data being saved (e.g., "nouns", "verbs"). + + Notes + ----- + Creates directory structure: exports//filename + and saves the filtered data as a JSON file. """ base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] + # Create language-specific directory lang_filepath = base_path.parent / lang_name / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) - print(f"Saving {lang_name} {category_type} index to {lang_filepath}...") + print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") + + # Save the filtered data with pretty printing with open(lang_filepath, "wb") as f: f.write( orjson.dumps( - self._to_dict(data), + filtered, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS, ) ) - def _to_dict(self, dd): - """ - Recursively convert defaultdict to dict. 
- """ - if isinstance(dd, defaultdict): - dd = {k: self._to_dict(v) for k, v in dd.items()} - - return dd - # MARK: parse dump def parse_dump( diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 54cf389d..f6c76d59 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -83,6 +83,7 @@ def test_get_all_data_types_for_language_user_says_yes( wikidata_dump_type=["form"], data_types="all", # because if only language given, data_types is None type_output_dir="scribe_data_json_export", # default for JSON + overwrite_all=False, ) mock_query_data.assert_not_called() @@ -101,6 +102,7 @@ def test_get_all_languages_and_data_types(self, mock_parse): data_types="all", type_output_dir="scribe_data_json_export", wikidata_dump_path=None, + overwrite_all=False, ) # MARK: Language and Data Type @@ -281,8 +283,9 @@ def test_get_translations_no_language_specified(self, mock_parse): mock_parse.assert_called_once_with( language="all", wikidata_dump_type=["translations"], - type_output_dir="scribe_data_json_export", # default output dir for JSON + type_output_dir="scribe_data_json_export", wikidata_dump_path=None, + overwrite_all=False, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @@ -299,6 +302,7 @@ def test_get_translations_with_specific_language(self, mock_parse): wikidata_dump_type=["translations"], type_output_dir="./test_output", wikidata_dump_path=None, + overwrite_all=False, ) @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @@ -314,6 +318,7 @@ def test_get_translations_with_dump(self, mock_parse): mock_parse.assert_called_once_with( language="German", wikidata_dump_type=["translations"], - type_output_dir="scribe_data_json_export", # default for JSON + type_output_dir="scribe_data_json_export", wikidata_dump_path="./wikidump.json", + overwrite_all=False, ) From c1bec8775b707e843d4ca9f4a597bb08e1962e3d Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 20:35:18 +0600 Subject: [PATCH 03/13] fix small bugs --- src/scribe_data/cli/get.py | 1 + src/scribe_data/wikidata/wikidata_utils.py | 12 +++- src/scribe_data/wiktionary/parse_dump.py | 66 ++++++++++++++-------- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 67e603d7..acb7b1ad 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -186,6 +186,7 @@ def prompt_user_download_all(): parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], + data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index afa9e6f9..18d01895 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -92,13 +92,23 @@ def parse_wd_lexeme_dump( # Convert "all" to list of all languages if isinstance(language, str) and language.lower() == "all": language = list(language_metadata.keys()) + + # For printing: include all data types including translations + display_data_types = list(data_type_metadata.keys()) + + # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": - # Exclude translations as it's a separate section data_types = [ dt for dt in data_type_metadata.keys() if dt != "translations" and dt != "emoji-keywords" ] + display_data_types += ["translations"] + else: + display_data_types = data_types + + print(f"Languages to process: {language}") + print(f"Data 
types to process: {display_data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 97ce3eca..d2a359c9 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -70,7 +70,7 @@ def __init__( # Separate data structures. self.translations_index = defaultdict( - lambda: defaultdict(lambda: defaultdict(dict)) + lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) ) self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) @@ -169,8 +169,9 @@ def _process_translations(self, lexeme, word, lang_code, category_name): """ translations = {} valid_iso_codes = self.valid_iso_codes + lexeme_id = lexeme["id"] - # Pre-fetch senses to avoid repeated lookups. + # Pre-fetch senses to avoid repeated lookups for sense in lexeme["senses"]: if glosses := sense.get("glosses"): translations.update( @@ -180,7 +181,21 @@ def _process_translations(self, lexeme, word, lang_code, category_name): ) if translations: - self.translations_index[word][lang_code][category_name] = translations + self.translations_index[lang_code][category_name][lexeme_id][word] = ( + translations + ) + + # Debug: Print translations_index for specific words + # if word.lower() in ["ändern", "cat", "dog"]: # Add any words to debug + # print("\nStored in translations_index:") + # print(f"Word: {word}") + # print(f"ID: {lexeme_id}") + # print(f"Language: {lang_code}") + # print(f"Category: {category_name}") + # print("Translations:", orjson.dumps( + # translations, + # option=orjson.OPT_INDENT_2 + # ).decode('utf-8')) def _process_forms(self, lexeme, lang_code, category_name): """ @@ -321,13 +336,13 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N ) return - filtered = { - word: {language_iso: lang_data[language_iso]} - for word, lang_data in self.translations_index.items() - if language_iso in lang_data - } + # Flatten the category level + filtered = {} + for category_data in self.translations_index[language_iso].values(): + for lexeme_id, word_data in category_data.items(): + filtered[lexeme_id] = word_data - # Check if filtered data is empty before saving. + # Check if filtered data is empty before saving if not filtered: print(f"No translations found for {language_iso}, skipping export...") return @@ -424,7 +439,7 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): lang_name = self.iso_to_name[language_iso] # Create language-specific directory - lang_filepath = base_path.parent / lang_name / base_path.name + lang_filepath = base_path.parent / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") @@ -533,9 +548,9 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - print(f"Languages to process: {languages}") - if data_types: - print(f"Data types to process: {data_types}") + if not data_types or not languages: + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") @@ -547,18 +562,23 @@ def parse_dump( processor.process_file(file_path) # MARK: Handle JSON exports - - # (a) If "translations" in parse_type -> export them. 
if "translations" in parse_type: - index_path = Path(output_dir) / "lexeme_translations.json" - - # Export translations for each ISO found. - iso_codes = set() - for word_data in processor.translations_index.values(): - iso_codes.update(word_data.keys()) - for iso_code in iso_codes: - if iso_code in processor.iso_to_name: + for language in languages: + # Get the ISO code for the language + iso_code = None + for iso, name in processor.iso_to_name.items(): + if name.lower() == language.lower(): + iso_code = iso + break + + if iso_code: + index_path = Path(output_dir) / language / "lexeme_translations.json" + # Ensure parent directory exists + index_path.parent.mkdir(parents=True, exist_ok=True) + # print(f"Exporting translations for {language} to {index_path}") processor.export_translations_json(str(index_path), iso_code) + else: + print(f"Warning: Could not find ISO code for {language}") # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: From 78b82c05d09d11dd833c7fda1e510b95920292c3 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 23:41:06 +0600 Subject: [PATCH 04/13] translation add L:id --- src/scribe_data/wiktionary/parse_dump.py | 43 +++++++++++++++++------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index d2a359c9..f9fcb158 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -72,7 +72,7 @@ def __init__( self.translations_index = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) ) - self.forms_index = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) + self.forms_index = defaultdict(lambda: defaultdict(list)) # Stats. self.stats = {"processed_entries": 0, "processing_time": 0} @@ -82,6 +82,9 @@ def __init__( self.translation_counts = defaultdict(Counter) self.forms_counts = defaultdict(Counter) + # For "unique_forms" usage. + self.unique_forms = defaultdict(lambda: defaultdict(list)) + # Cache for feature labels. 
self._feature_label_cache = {} for category, items in lexeme_form_metadata.items(): @@ -185,18 +188,6 @@ def _process_translations(self, lexeme, word, lang_code, category_name): translations ) - # Debug: Print translations_index for specific words - # if word.lower() in ["ändern", "cat", "dog"]: # Add any words to debug - # print("\nStored in translations_index:") - # print(f"Word: {word}") - # print(f"ID: {lexeme_id}") - # print(f"Language: {lang_code}") - # print(f"Category: {category_name}") - # print("Translations:", orjson.dumps( - # translations, - # option=orjson.OPT_INDENT_2 - # ).decode('utf-8')) - def _process_forms(self, lexeme, lang_code, category_name): """ Optimized forms processing @@ -215,6 +206,15 @@ def _process_forms(self, lexeme, lang_code, category_name): for rep_data in representations.values(): if form_value := rep_data.get("value"): + features = form.get("grammaticalFeatures", []) + + # If features are not empty and not already in the list + if ( + features + and features not in self.unique_forms[lang_code][category_name] + ): + self.unique_forms[lang_code][category_name].append(features) + if features := form.get("grammaticalFeatures"): if form_name := self._get_form_name(features): cat_dict[form_name] = form_value @@ -594,3 +594,20 @@ def parse_dump( processor.export_forms_json( filepath=str(index_path), language_iso=iso_code, data_type=dt ) + + # def print_unique_forms(unique_forms): + # """ + # Pretty print unique grammatical feature sets + # """ + # for lang, lang_data in unique_forms.items(): + # print(f"\nLanguage: {lang}") + # for category, features_list in lang_data.items(): + # print(f" Category: {category}") + # print(f" Total unique feature sets: {len(features_list)}") + # print(" Feature Sets:") + # for i, feature_set in enumerate(features_list, 1): + # # Convert QIDs to a more readable format + # readable_features = [f"Q{qid}" for qid in feature_set] + # print(f" {i}. {readable_features}") + + # print_unique_forms(processor.unique_forms) From 192b09cddc46eb2fd56c9b8fd85cff858f7fd108 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 14 Jan 2025 23:56:40 +0600 Subject: [PATCH 05/13] fix tests and add tests for QID --- src/scribe_data/cli/get.py | 4 ++- tests/cli/test_get.py | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index acb7b1ad..6659c0ba 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -123,6 +123,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types="all", type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) else: @@ -145,6 +146,7 @@ def prompt_user_download_all(): wikidata_dump_type=["form"], data_types=[data_type], type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) else: @@ -181,12 +183,12 @@ def prompt_user_download_all(): # MARK: Translations elif data_type == "translations": + # If no language specified, use "all". 
if language is None: language = "all" parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], - data_types=data_types, type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index f6c76d59..836a5699 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -83,6 +83,7 @@ def test_get_all_data_types_for_language_user_says_yes( wikidata_dump_type=["form"], data_types="all", # because if only language given, data_types is None type_output_dir="scribe_data_json_export", # default for JSON + wikidata_dump_path=None, # explicitly set to None overwrite_all=False, ) mock_query_data.assert_not_called() @@ -322,3 +323,57 @@ def test_get_translations_with_dump(self, mock_parse): wikidata_dump_path="./wikidump.json", overwrite_all=False, ) + + # MARK: Use QID as language + + @patch("scribe_data.cli.get.parse_wd_lexeme_dump") + @patch("scribe_data.cli.get.questionary.confirm") + def test_get_data_with_wikidata_identifier( + self, mock_questionary_confirm, mock_parse + ): + """ + Test retrieving data with a Wikidata identifier as language. + + Ensures that `parse_wd_lexeme_dump` is called with the correct parameters + when a Wikidata identifier is used. + """ + # Mock the user confirmation to return True (query Wikidata directly). + mock_questionary_confirm.return_value.ask.return_value = True + + get_data( + language="Q9217", + wikidata_dump="scribe", + output_dir="exported_json", + all_bool=True, + ) + mock_parse.assert_called_once_with( + language="Q9217", + wikidata_dump_type=["form"], + data_types="all", + type_output_dir="exported_json", + wikidata_dump_path="scribe", + overwrite_all=False, + ) + + @patch("scribe_data.cli.get.parse_wd_lexeme_dump") + def test_get_data_with_wikidata_identifier_and_data_type(self, mock_parse): + """ + Test retrieving a specific data type with a Wikidata identifier. + + Ensures that `parse_wd_lexeme_dump` is called with the correct parameters + when a Wikidata identifier and specific data type are used. 
+ """ + get_data( + language="Q9217", + data_type="nouns", + wikidata_dump="scribe", + output_dir="exported_json", + ) + mock_parse.assert_called_once_with( + language="Q9217", + wikidata_dump_type=["form"], + data_types=["nouns"], + type_output_dir="exported_json", + wikidata_dump_path="scribe", + overwrite_all=False, + ) From cfc2777729d180293b2e55295f87cdd09fb1c509 Mon Sep 17 00:00:00 2001 From: axif Date: Thu, 16 Jan 2025 00:51:01 +0600 Subject: [PATCH 06/13] fix total --- src/scribe_data/cli/total.py | 7 +- src/scribe_data/wikidata/wikidata_utils.py | 8 +- src/scribe_data/wiktionary/parse_dump.py | 27 +++++++ tests/cli/test_total.py | 90 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 89396f72..b867a48f 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -367,11 +367,15 @@ def total_wrapper( """ # Handle --all flag if all_bool and wikidata_dump: - language = "all" + if data_type is None: + data_type = "all" + if language is None: + language = "all" if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -380,6 +384,7 @@ def total_wrapper( if isinstance(wikidata_dump, str): # if user provided a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, + data_types=[data_type], wikidata_dump_type=["total"], wikidata_dump_path=wikidata_dump, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 18d01895..cf6fb872 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -93,9 +93,6 @@ def parse_wd_lexeme_dump( if isinstance(language, str) and language.lower() == "all": language = list(language_metadata.keys()) - # For printing: include all data types including translations - display_data_types = list(data_type_metadata.keys()) - # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": data_types = [ @@ -103,12 +100,9 @@ def parse_wd_lexeme_dump( for dt in data_type_metadata.keys() if dt != "translations" and dt != "emoji-keywords" ] - display_data_types += ["translations"] - else: - display_data_types = data_types print(f"Languages to process: {language}") - print(f"Data types to process: {display_data_types}") + print(f"Data types to process: {data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index f9fcb158..fa6bd0f6 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -249,6 +249,32 @@ def _get_form_name(self, features): return "".join(form_parts) + def _process_totals(self, lexeme, lang_code, category_name): + """ + Process totals for statistical counting. 
+ """ + # Skip if we have specific data types and this category isn't in them + if self.data_types and category_name.lower() not in [ + dt.lower() for dt in self.data_types + ]: + return + + # Increment lexeme count for this language and category + self.lexical_category_counts[lang_code][category_name] += 1 + + # Count translations if they exist + if lexeme.get("senses"): + translation_count = sum( + 1 + for sense in lexeme["senses"] + if sense.get("glosses") + and any( + lang in self.valid_iso_codes for lang in sense["glosses"].keys() + ) + ) + if translation_count > 0: + self.translation_counts[lang_code][category_name] += translation_count + # MARK: process file def process_file(self, file_path: str, batch_size: int = 50000): """ @@ -611,3 +637,4 @@ def parse_dump( # print(f" {i}. {readable_features}") # print_unique_forms(processor.unique_forms) + # print(processor.unique_forms) diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 7ede34b4..a9640142 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -274,3 +274,93 @@ def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes): def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): total_wrapper() + + # MARK: Using wikidata_dump + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): + """Test when wikidata_dump is True (flag without path)""" + total_wrapper(wikidata_dump=True) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_path(self, mock_parse_dump): + """Test when wikidata_dump is a file path""" + dump_path = "/path/to/dump.json" + total_wrapper(wikidata_dump=dump_path) + mock_parse_dump.assert_called_once_with( + language=None, + data_types=[None], + wikidata_dump_type=["total"], + wikidata_dump_path=dump_path, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump): + """Test when both wikidata_dump and all_bool are True""" + total_wrapper(wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="all", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dump): + """Test wikidata_dump with specific language and data type""" + total_wrapper( + language="English", data_type="nouns", wikidata_dump="/path/to/dump.json" + ) + mock_parse_dump.assert_called_once_with( + language="English", + data_types=["nouns"], + wikidata_dump_type=["total"], + wikidata_dump_path="/path/to/dump.json", + ) + + # MARK: Using QID + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.print_total_lexemes") + def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid): + """ + Test when language is provided as a QID + """ + mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217") + mock_print_total.assert_called_once_with(language="Q9217") + + @patch("scribe_data.cli.total.check_qid_is_language") + @patch("scribe_data.cli.total.get_total_lexemes") + def test_total_wrapper_with_qid_and_datatype(self, mock_get_total, mock_check_qid): + """ + Test when language QID and data type are provided + """ + 
mock_check_qid.return_value = "Thai" + total_wrapper(language="Q9217", data_type="nouns") + mock_get_total.assert_called_once_with(language="Q9217", data_type="nouns") + + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") + def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump): + """ + Test QID with wikidata dump + """ + total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True) + mock_parse_dump.assert_called_once_with( + language="Q9217", + data_types=["all"], + wikidata_dump_type=["total"], + wikidata_dump_path=None, + ) + + @patch("scribe_data.cli.total.get_total_lexemes") + def test_get_total_lexemes_with_qid(self, mock_get_total): + """ + Test get_total_lexemes with QID input + """ + total_wrapper(language="Q9217", data_type="Q1084") # Q1084 is noun QID + mock_get_total.assert_called_once_with(language="Q9217", data_type="Q1084") From e302a9b35041b910213cee3c49a6e628a13ebbee Mon Sep 17 00:00:00 2001 From: axif Date: Mon, 20 Jan 2025 03:18:03 +0600 Subject: [PATCH 07/13] fix small bugs --- .../workflows/missing_form_check&update.yaml | 133 ++++++++++ .github/workflows/update_emojis.yaml | 150 +++++++++++ .gitignore | 4 + .../check_missing_forms.py | 239 ++++++++++++++++++ .../check/check_missing_forms/download_wd.py | 102 ++++++++ .../check_missing_forms/generate_query.py | 166 ++++++++++++ .../check/check_missing_forms/get_forms.py | 173 +++++++++++++ .../check/check_missing_forms/pr_body.py | 109 ++++++++ src/scribe_data/cli/get.py | 3 +- src/scribe_data/cli/main.py | 16 +- src/scribe_data/wikidata/wikidata_utils.py | 12 +- src/scribe_data/wiktionary/parse_dump.py | 62 ++++- 12 files changed, 1158 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/missing_form_check&update.yaml create mode 100644 .github/workflows/update_emojis.yaml create mode 100644 src/scribe_data/check/check_missing_forms/check_missing_forms.py create mode 100644 src/scribe_data/check/check_missing_forms/download_wd.py create mode 100644 src/scribe_data/check/check_missing_forms/generate_query.py create mode 100644 src/scribe_data/check/check_missing_forms/get_forms.py create mode 100644 src/scribe_data/check/check_missing_forms/pr_body.py diff --git a/.github/workflows/missing_form_check&update.yaml b/.github/workflows/missing_form_check&update.yaml new file mode 100644 index 00000000..1e51ca3f --- /dev/null +++ b/.github/workflows/missing_form_check&update.yaml @@ -0,0 +1,133 @@ +name: Create Automated PR +on: + schedule: + - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month + # Allow manual trigger + workflow_dispatch: + +jobs: + check-repository: + runs-on: ubuntu-latest + outputs: + is_correct_repo: ${{ steps.check.outputs.is_correct_repo }} + steps: + - name: Check repository + id: check + run: | + if [ "$GITHUB_REPOSITORY" = "scribe-org/Scribe-Data" ]; then + echo "is_correct_repo=true" >> "$GITHUB_OUTPUT" + else + echo "is_correct_repo=false" >> "$GITHUB_OUTPUT" + echo "::warning::This workflow should only run in scribe-org/Scribe-Data repository." + fi + + create-pull-request: + needs: check-repository + if: needs.check-repository.outputs.is_correct_repo == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install rich requests tqdm + pip install -e . 
+ + - name: Generate Missing Features Data + run: | + # Set up paths + DUMP_PATH=$(PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/download_wd.py | grep "DOWNLOAD_PATH=" | cut -d'=' -f2) + QUERY_DIR="$(pwd)/src/scribe_data/wikidata/language_data_extraction" + + echo "Dump path: ${DUMP_PATH}" + echo "Query directory: ${QUERY_DIR}" + + # Check if paths exist + if [ -n "${DUMP_PATH}" ] && [ -d "${QUERY_DIR}" ]; then + # Generate the missing features data with all keys processing + PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/check_missing_forms.py "${DUMP_PATH}" "${QUERY_DIR}" --process-all-keys + else + echo "Required paths not found:" + echo "Dump path exists: $([ -n "${DUMP_PATH}" ] && echo "Yes" || echo "No")" + echo "Query directory exists: $([ -d "${QUERY_DIR}" ] && echo "Yes" || echo "No")" + exit 1 + fi + + # Debug steps to understand the state + - name: Debug Info + run: | + echo "Current branch: $(git branch --show-current)" + echo "List of changes:" + git status + + - name: Make changes + run: | + git add src/scribe_data/wikidata/language_data_extraction/**/*.sparql + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Debug Missing Features Data + if: always() + run: | + # Print the contents of the missing features JSON file if it exists + if [ -f missing_features.json ]; then + echo "Contents of missing_features.json:" + cat missing_features.json + else + echo "missing_features.json not found" + fi + + - name: Generate PR Body + id: pr-body + run: | + # Run the pr_body.py script with the missing features data + PR_BODY_CONTENT=$(python src/scribe_data/check/check_missing_forms/pr_body.py missing_features.json) + + # Debug output + echo "PR Body Content:" + echo "$PR_BODY_CONTENT" + + # Initialize PR body with delimiter + { + echo "body<> $GITHUB_OUTPUT + + - name: Debug PR Body Output + run: | + # Print the PR body content from the output + echo "PR Body from GITHUB_OUTPUT:" + cat $GITHUB_OUTPUT + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + title: 'Automated PR: Updated Language Data Files' + body: ${{ steps.pr-body.outputs.body }} + base: master + branch: automated-missing-forms-pr + delete-branch: true + draft: false + commit-message: '[create-pull-request] automated change' + committer: GitHub + author: github-actions[bot] + + # Debug step to verify PR creation attempt + - name: Check PR Creation + run: | + echo "Checking if PR was created..." + gh pr list + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/update_emojis.yaml b/.github/workflows/update_emojis.yaml new file mode 100644 index 00000000..8465147e --- /dev/null +++ b/.github/workflows/update_emojis.yaml @@ -0,0 +1,150 @@ +name: Check and Update Emoji Data +on: + schedule: + - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month + # Allow manual trigger + workflow_dispatch: + +jobs: + check-repository: + runs-on: ubuntu-latest + outputs: + is_correct_repo: ${{ steps.check.outputs.is_correct_repo }} + steps: + - name: Check repository + id: check + run: | + if [ "$GITHUB_REPOSITORY" = "scribe-org/Scribe-Data" ]; then + echo "is_correct_repo=true" >> "$GITHUB_OUTPUT" + else + echo "is_correct_repo=false" >> "$GITHUB_OUTPUT" + echo "::warning::This workflow should only run in scribe-org/Scribe-Data repository." 
+ fi + + check-and-update: + needs: check-repository + if: needs.check-repository.outputs.is_correct_repo == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + sudo apt-get install jq + + - name: Get language list + id: get-langs + run: | + # Fetch language list from GitHub API + DERIVED_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-derived-full/annotationsDerived | jq -r '.[].name') + FULL_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-full/annotations | jq -r '.[].name') + + # Combine and deduplicate language lists + LANG_LIST=$(echo "$DERIVED_LANGS $FULL_LANGS" | tr ' ' '\n' | sort -u | tr '\n' ' ') + echo "lang_list=${LANG_LIST}" >> $GITHUB_OUTPUT + echo "Detected languages: ${LANG_LIST}" + + - name: Download and check emoji data + id: check-updates + run: | + # Create directories if they don't exist + mkdir -p src/scribe_data/unicode/cldr-annotations-derived-full + mkdir -p src/scribe_data/unicode/cldr-annotations-full + + CHANGES_EXIST=false + CHANGE_SUMMARY="| Language | Derived Changes | Full Changes |\n|----------|-----------------|--------------|" + + # Use dynamic language list from previous step + for lang in ${{ steps.get-langs.outputs.lang_list }}; do + DERIVED_CHANGED="No" + FULL_CHANGED="No" + + # Download latest data for each language + mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" + mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" + + curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" -o "new_derived_$lang.json" + curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-full/annotations/$lang/annotations.json" -o "new_full_$lang.json" + + # Check derived annotations + if [ -f "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" ]; then + if ! cmp -s "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json"; then + CHANGES_EXIST=true + DERIVED_CHANGED="Yes" + fi + else + CHANGES_EXIST=true + DERIVED_CHANGED="New" + fi + + # Check full annotations + if [ -f "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" ]; then + if ! 
cmp -s "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json"; then + CHANGES_EXIST=true + FULL_CHANGED="Yes" + fi + else + CHANGES_EXIST=true + FULL_CHANGED="New" + fi + + # Only add to summary if there are changes + if [ "$DERIVED_CHANGED" != "No" ] || [ "$FULL_CHANGED" != "No" ]; then + CHANGE_SUMMARY="$CHANGE_SUMMARY\n| $lang | $DERIVED_CHANGED | $FULL_CHANGED |" + fi + done + + echo "changes_exist=${CHANGES_EXIST}" >> $GITHUB_OUTPUT + echo "change_summary<> $GITHUB_OUTPUT + echo -e "$CHANGE_SUMMARY" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Update files if changed + if: steps.check-updates.outputs.changes_exist == 'true' + run: | + # Use dynamic language list + for lang in ${{ steps.get-langs.outputs.lang_list }}; do + mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" + mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" + + mv "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" + mv "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" + done + + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Create Pull Request + if: steps.check-updates.outputs.changes_exist == 'true' + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + title: 'chore: Update emoji annotations data' + body: | + This PR updates the emoji annotations data from CLDR. + + ## Changes Summary + ${{ steps.check-updates.outputs.change_summary }} + + ### Legend: + - Yes: File was updated + - New: File was newly added + - No: No changes + + This is an automated PR created by the emoji data update workflow. + branch: update-emoji-data # Branch name + delete-branch: true + commit-message: 'chore: Update emoji annotations data' + labels: | + automated pr + emoji-data diff --git a/.gitignore b/.gitignore index 4bcc3809..475ba504 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,7 @@ scribe_data_wikidata_dumps_export/* # MARK: Wiki Dumps *.json.bz2 + +# MARK: GitHub Actions + +missing_features.json diff --git a/src/scribe_data/check/check_missing_forms/check_missing_forms.py b/src/scribe_data/check/check_missing_forms/check_missing_forms.py new file mode 100644 index 00000000..ac57e498 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/check_missing_forms.py @@ -0,0 +1,239 @@ +""" +Check for missing forms in Wikidata. + +.. raw:: html + +""" + +import json +import sys +import argparse +from pathlib import Path +from get_forms import parse_sparql_files, extract_dump_forms +from generate_query import generate_query +from collections import defaultdict +from scribe_data.utils import ( + lexeme_form_metadata, + language_metadata, + data_type_metadata, +) + + +def get_all_languages(): + """ + Extract all languages and sublanguages from language metadata. + + Returns + ------- + list of str + List of language codes for all languages and sublanguages that have + both ISO codes and QIDs defined. + + Notes + ----- + Only includes languages and sublanguages that have both 'iso' and 'qid' + fields in their metadata. + """ + languages = [] + + for lang, lang_data in language_metadata.items(): + # Add main language if it has ISO and QID. + if "iso" in lang_data and "qid" in lang_data: + languages.append(lang) + + # Add sublanguages. 
+ if "sub_languages" in lang_data: + for sublang, sublang_data in lang_data["sub_languages"].items(): + if "iso" in sublang_data and "qid" in sublang_data: + languages.append(sublang) + + return languages + + +def get_missing_features(result_sparql, result_dump): + """ + Compare features between SPARQL results and dump data to find missing ones. + + Parameters + ---------- + result_sparql : dict + Features extracted from SPARQL queries. + Format: {language: {data_type: [features]}} + result_dump : dict + Features extracted from Wikidata dump. + Format: {language: {data_type: [features]}} + + Returns + ------- + dict or None + Dictionary of missing features by language and data type if any found, + otherwise None. + Format: {language: {data_type: [missing_features]}} + + Notes + ----- + Only includes features that have valid QIDs present in lexeme_form_metadata. + """ + missing_by_lang_type = defaultdict(lambda: defaultdict(list)) + + # Extract all QIDs from the metadata. + all_qids = set() + for category, items in lexeme_form_metadata.items(): + for key, value in items.items(): + all_qids.add(value["qid"]) + + # Compare features for each language and data type. + for lang in result_sparql: + if lang in result_dump: + # Get all unique data types from both sources. + all_data_types = set(result_sparql[lang].keys()) | set( + result_dump[lang].keys() + ) + + for dt in all_data_types: + sparql_values = set() + dump_values = set() + + # Get values from SPARQL if available. + if dt in result_sparql[lang]: + sparql_values = set(tuple(item) for item in result_sparql[lang][dt]) + + # Get values from dump if available. + if dt in result_dump[lang]: + dump_values = set(tuple(item) for item in result_dump[lang][dt]) + + # Get unique values from both sources. + unique_dump_values = dump_values - sparql_values + unique_sparql_values = sparql_values - dump_values + + # Store valid missing features from dump. + for item in unique_dump_values: + if all(qid in all_qids for qid in item): + missing_by_lang_type[lang][dt].append(list(item)) + + # Store valid missing features from SPARQL. + for item in unique_sparql_values: + if all(qid in all_qids for qid in item): + missing_by_lang_type[lang][dt].append(list(item)) + + return missing_by_lang_type if missing_by_lang_type else None + + +def process_missing_features(missing_features, query_dir): + """ + Generate SPARQL queries for missing features by language and data type. + + Parameters + ---------- + missing_features : dict + Dictionary of missing features by language and data type. + Format: {language: {data_type: [features]}} + query_dir : str or Path + Directory where generated query files should be saved. + + Notes + ----- + Generates separate queries for each data type within each language. + """ + if not missing_features: + return + + for language, data_types in missing_features.items(): + print(f"Processing language: {language}") + print(f"Data types: {list(data_types.keys())}") + + # Create a separate entry for each data type. + for data_type, features in data_types.items(): + language_entry = {language: {data_type: features}} + print(f"Generating query for {language} - {data_type}") + generate_query(language_entry, query_dir) + + +def main(): + """ + Main function to check for missing forms in Wikidata. + + Processes command line arguments, downloads and compares Wikidata dump data + with SPARQL query results to identify missing features, and generates + appropriate SPARQL queries. 
+ + Notes + ----- + Required command line arguments: + - dump_path: Path to the Wikidata dump file + - query_dir: Directory for storing generated queries + + Optional arguments: + - --process-all-keys: Flag to process all nested keys in missing features + """ + parser = argparse.ArgumentParser(description="Check missing forms in Wikidata") + parser.add_argument("dump_path", type=str, help="Path to the dump file") + parser.add_argument("query_dir", type=str, help="Path to the query directory") + parser.add_argument( + "--process-all-keys", + action="store_true", + help="Process all nested keys in the missing features", + ) + + args = parser.parse_args() + + dump_path = Path(args.dump_path) + query_dir = Path(args.query_dir) + + if not dump_path.exists(): + print(f"Error: Dump path does not exist: {dump_path}") + sys.exit(1) + + if not query_dir.exists(): + print(f"Error: Query directory does not exist: {query_dir}") + sys.exit(1) + + # Get all languages including sublanguages. + languages = get_all_languages() + + print("Parsing SPARQL files...") + result_sparql = parse_sparql_files() + + print("Extracting Wiki lexeme dump...") + result_dump = extract_dump_forms( + languages=languages, + data_types=list(data_type_metadata.keys()), + file_path=dump_path, + ) + + missing_features = get_missing_features(result_sparql, result_dump) + + try: + print("Generated missing features:", missing_features) + + # Save the missing features to a JSON file. + with open("missing_features.json", "w") as f: + json.dump(missing_features, f, indent=4) + print("Missing features data has been saved to missing_features.json") + + if missing_features: + # Process all data types for each language. + process_missing_features(missing_features, query_dir) + + except Exception as e: + print(f"An error occurred: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/scribe_data/check/check_missing_forms/download_wd.py b/src/scribe_data/check/check_missing_forms/download_wd.py new file mode 100644 index 00000000..ccf6b797 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/download_wd.py @@ -0,0 +1,102 @@ +""" +Download Wikidata lexeme dump. + +.. raw:: html + +""" + +from pathlib import Path +from scribe_data.cli.download import download_wd_lexeme_dump +from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR +import requests +import os + + +def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): + """ + Download Wikidata lexeme dumps automatically. + + Parameters + ---------- + wikidata_dump : str, optional + Date string in YYYYMMDD format for specific dumps. + If None, downloads the latest dump. + output_dir : str, optional + Directory path for the downloaded file. + If None, uses DEFAULT_DUMP_EXPORT_DIR. + + Returns + ------- + str or False + Path to downloaded file if successful, False otherwise. + + Notes + ----- + - Downloads are skipped if the file already exists in the output directory + - Progress is displayed every 50MB during download + - Creates output directory if it doesn't exist + """ + dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") + + if not dump_url: + print("No dump URL found.") + return False + + output_dir = output_dir or DEFAULT_DUMP_EXPORT_DIR + os.makedirs(output_dir, exist_ok=True) + + filename = dump_url.split("/")[-1] + output_path = str(Path(output_dir) / filename) + + # Check if the file already exists. + if os.path.exists(output_path): + print(f"File already exists: {output_path}. 
Skipping download.") + return output_path + + # Proceed with the download if the file does not exist. + print(f"Downloading dump to {output_path}...") + + try: + response = requests.get(dump_url, stream=True) + total_size = int(response.headers.get("content-length", 0)) + downloaded_size = 0 + + with open(output_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + downloaded_size += len(chunk) + # Print progress percentage every 50MB. + if total_size and downloaded_size % (50 * 1024 * 1024) < 8192: + progress = (downloaded_size / total_size) * 100 + print(f"Download progress: {progress:.1f}%") + + print("Download completed successfully!") + return output_path + + except requests.exceptions.RequestException as e: + print(f"Error downloading dump: {e}") + + except Exception as e: + print(f"An error occurred: {e}") + + +if __name__ == "__main__": + output_path = wd_lexeme_dump_download() + if output_path: + print(f"DOWNLOAD_PATH={output_path}") diff --git a/src/scribe_data/check/check_missing_forms/generate_query.py b/src/scribe_data/check/check_missing_forms/generate_query.py new file mode 100644 index 00000000..71bfde50 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/generate_query.py @@ -0,0 +1,166 @@ +""" +Generate SPARQL queries for missing lexeme forms. + +.. raw:: html + +""" + +from scribe_data.utils import ( + lexeme_form_metadata, + language_metadata, + data_type_metadata, + LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, +) + +import os +from pathlib import Path + + +def generate_query(missing_features, query_dir=None): + """ + Generate SPARQL queries for missing lexeme forms. + + Parameters + ---------- + missing_features : dict + Dictionary containing missing features by language and data type. + Format: {language_qid: {data_type_qid: [[form_qids]]}} + query_dir : str or Path, optional + Directory where query files should be saved. + If None, uses default language_data_extraction directory. + + Returns + ------- + str + Path to the generated query file. + + Notes + ----- + - Generates a single query file combining all forms for a given + language and data type combination + - Query files are named incrementally if duplicates exist + - Creates necessary directories if they don't exist + """ + language_qid = next(iter(missing_features.keys())) + data_type_qid = next(iter(missing_features[language_qid].keys())) + + # Find the language entry by QID. + language_entry = next( + (name, data) + for name, data in language_metadata.items() + if data.get("qid") == language_qid + ) + language = language_entry[0] # The language name. + + data_type = next( + name for name, qid in data_type_metadata.items() if qid == data_type_qid + ) + + iso_code = language_metadata[language]["iso"] + + # Create a QID to label mapping from the metadata. + qid_to_label = {} + for category in lexeme_form_metadata.values(): + for item in category.values(): + qid_to_label[item["qid"]] = item["label"] + + # Process all forms at once + forms_query = [] + all_form_combinations = missing_features[language_qid][data_type_qid] + for form_qids in all_form_combinations: + # Convert QIDs to labels and join them together. + labels = [qid_to_label.get(qid, qid) for qid in form_qids] + concatenated_label = "".join(labels) + # Make first letter lowercase + concatenated_label = concatenated_label[0].lower() + concatenated_label[1:] + forms_query.append({"label": concatenated_label, "qids": form_qids}) + + # Generate a single query for all forms. 
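# A minimal standalone sketch (assumed form labels and QIDs): each concatenated
# label built above becomes one SELECT variable in the query assembled below.
_forms_query_example = [
    {"label": "nominativeSingular", "qids": ["Q131105", "Q110786"]},
    {"label": "nominativePlural", "qids": ["Q131105", "Q146786"]},
]
_select_vars_example = [f'?{form["label"]}' for form in _forms_query_example]
# _select_vars_example == ["?nominativeSingular", "?nominativePlural"]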
+ main_body = f"""# tool: scribe-data +# All {language} ({language_qid}) {data_type} ({data_type_qid}) and their forms. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?{data_type} + """ + "\n ".join(f'?{form["label"]}' for form in forms_query) + + where_clause = f""" + WHERE {{ + ?lexeme dct:language wd:{language_qid} ; + wikibase:lexicalCategory wd:{data_type_qid} ; + wikibase:lemma ?{data_type} . + FILTER(lang(?{data_type}) = "{iso_code}") + """ + + # Generate OPTIONAL clauses for all forms in one query. + optional_clauses = "" + for form in forms_query: + qids = ", ".join(f"wd:{qid}" for qid in form["qids"]) + optional_clauses += f""" + OPTIONAL {{ + ?lexeme ontolex:lexicalForm ?{form['label']}Form . + ?{form['label']}Form ontolex:representation ?{form['label']} ; + wikibase:grammaticalFeature {qids} . + }} +""" + + # Print the complete query. + final_query = main_body + where_clause + optional_clauses + "}" + + def get_available_filename(base_path): + """Helper function to find the next available filename""" + if not os.path.exists(base_path): + return base_path + + base, ext = os.path.splitext(base_path) + counter = 1 + + # If the base already ends with _N, start from that number. + import re + + if match := re.search(r"_(\d+)$", base): + counter = int(match.group(1)) + 1 + base = base[: match.start()] + + while True: + new_path = f"{base}_{counter}{ext}" + if not os.path.exists(new_path): + return new_path + counter += 1 + + # Create base filename using the provided query_dir or default. + if query_dir: + base_file_name = ( + Path(query_dir) / language / data_type / f"query_{data_type}.sparql" + ) + else: + base_file_name = f"{language_data_extraction}/{language}/{data_type}/query_{data_type}.sparql" + + # Get the next available filename. + file_name = get_available_filename(str(base_file_name)) + + # Create directory if it doesn't exist. + os.makedirs(os.path.dirname(file_name), exist_ok=True) + + # Write the file. + with open(file_name, "w") as file: + file.write(final_query) + + print(f"Query file created: {file_name}") + return file_name diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py new file mode 100644 index 00000000..909abf5e --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -0,0 +1,173 @@ +""" +Get forms from Wikidata. +.. raw:: html + +""" + +from scribe_data.wiktionary.parse_dump import LexemeProcessor +import re +from collections import defaultdict +from scribe_data.utils import ( + language_metadata, + data_type_metadata, + LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, +) + +iso_to_qid = { + lang_data["iso"]: lang_data["qid"] + for lang, lang_data in language_metadata.items() + if "iso" in lang_data and "qid" in lang_data +} + +all_forms = defaultdict(lambda: defaultdict(list)) + + +def parse_sparql_files(): + """ + Read and parse all SPARQL query files to extract form information. + + Returns + ------- + dict + Accumulated forms for each language and lexical category. + Format: {language: {lexical_category: [forms]}} + + Notes + ----- + Recursively searches through language_data_extraction directory + for .sparql files and accumulates all form information. 
+ """ + for sub_sub_file in language_data_extraction.rglob("*.sparql"): + with open(sub_sub_file, "r", encoding="utf-8") as query_text: + result = parse_sparql_query(query_text.read()) + + # Accumulate forms for each language and lexical category. + for lang, categories in result.items(): + for category, forms in categories.items(): + if forms: + all_forms[lang][category].extend(forms) + + return all_forms + + +def parse_sparql_query(query_text): + """ + Parse a SPARQL query to extract lexical categories and features. + + Parameters + ---------- + query_text : str + Content of the SPARQL query file. + + Returns + ------- + dict + Dictionary containing parsed information. + Format: {language: {lexical_category: [forms]}} + + Notes + ----- + Extracts: + - Language QID + - Lexical category QID + - Grammatical features from OPTIONAL blocks + """ + # Get language and category first. + language = None + lexical_category = None + + # Parse lexical category. + lexical_matches = re.finditer(r"wikibase:lexicalCategory\s+wd:(Q\d+)", query_text) + for match in lexical_matches: + lexical_category = match.group(1) + + # Parse language. + language_matches = re.finditer(r"dct:language\s+wd:(Q\d+)", query_text) + for match in language_matches: + language = match.group(1) + + result = {language: {lexical_category: []}} + + # Parse optional blocks for forms and features. + optional_blocks = re.finditer(r"OPTIONAL\s*{([^}]+)}", query_text) + + for block in optional_blocks: + block_text = block.group(1) + + # Extract grammatical features. + features = re.finditer(r"wd:(Q\d+)", block_text) + feature_list = [f.group(1) for f in features] + + if feature_list: + result[language][lexical_category].append(feature_list) + + return result + + +parse_sparql_files() + + +def extract_dump_forms( + languages=None, data_types=None, file_path="latest-lexemes.json.bz2" +): + """ + Extract unique grammatical features from Wikidata lexeme dump. + + Parameters + ---------- + languages : list of str, optional + List of language ISO codes (e.g., ['en', 'fr']) + data_types : list of str, optional + List of lexical categories (e.g., ['nouns', 'verbs']) + file_path : str, optional + Path to the lexeme dump file, by default "latest-lexemes.json.bz2" + + Returns + ------- + dict + Dictionary of unique grammatical features per language and lexical category. + Format: {language_qid: {data_type_qid: features}} + + Notes + ----- + - Converts ISO codes to QIDs in the output + - Converts data type names to their corresponding QIDs + - Only includes languages and data types that have valid QID mappings + """ + processor = LexemeProcessor( + target_iso=languages, parse_type=["form"], data_types=data_types + ) + + processor.process_file(file_path) + + unique_features = dict(processor.unique_forms) + + # Convert ISO codes to QIDs and data types to QIDs. + converted_features = {} + for iso_code, data_types_dict in unique_features.items(): + if iso_code in iso_to_qid: + lang_qid = iso_to_qid[iso_code] + converted_features[lang_qid] = {} + + for data_type, features in data_types_dict.items(): + # Get QID from data_type_metadata. 
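# A minimal standalone sketch (assumed mappings; QIDs are illustrative): the
# dump output keyed by ISO code and data type name is re-keyed by language QID
# and data type QID, as the loop below does.
_iso_to_qid_example = {"en": "Q1860"}
_data_type_metadata_example = {"nouns": "Q1084"}
_unique_forms_example = {"en": {"nouns": [["Q110786"]]}}
_converted_example = {
    _iso_to_qid_example[iso]: {
        _data_type_metadata_example[dt]: feats
        for dt, feats in dts.items()
        if dt in _data_type_metadata_example
    }
    for iso, dts in _unique_forms_example.items()
    if iso in _iso_to_qid_example
}
# _converted_example == {"Q1860": {"Q1084": [["Q110786"]]}}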
+ data_type_qid = data_type_metadata.get(data_type) + if data_type_qid: + converted_features[lang_qid][data_type_qid] = features + + return converted_features diff --git a/src/scribe_data/check/check_missing_forms/pr_body.py b/src/scribe_data/check/check_missing_forms/pr_body.py new file mode 100644 index 00000000..11aadbe9 --- /dev/null +++ b/src/scribe_data/check/check_missing_forms/pr_body.py @@ -0,0 +1,109 @@ +""" +Generate a formatted PR body describing missing features for each language. + +.. raw:: html + +""" + +import json +import sys +from scribe_data.utils import ( + language_metadata, + data_type_metadata, +) + + +def pr_body(missing_features): + """ + Generate a formatted PR body describing missing features for each language. + + Parameters + ---------- + missing_features : dict + Dictionary mapping language QIDs to their missing features. + Format: {language_qid: {feature_type: [features]}} + + Returns + ------- + str + Formatted PR body content in markdown format containing a table of + missing features grouped by language. + + Notes + ----- + The PR body includes: + - A header indicating this is an automated PR + - A table showing languages and their missing feature types + - Features are grouped by language for better readability + """ + # Initialize PR body with a header. + pr_body_content = "## Automated PR: Missing Features\n\n" + pr_body_content += "This PR was automatically created by a GitHub Action.\n\n" + pr_body_content += "### Missing Features Summary\n" + pr_body_content += "| **Language** | **Feature Type** |\n" + pr_body_content += "|--------------|------------------|\n" + + # Create a dictionary to group features by language. + grouped_features = {} + + # Iterate over the missing features to populate the table. + for entity, features in missing_features.items(): + # Check for sub-languages. + language_name = None + for name, data in language_metadata.items(): + if data.get("qid") == entity: + language_name = name + break + if "sub_languages" in data: + for sub_name, sub_data in data["sub_languages"].items(): + if sub_data.get("qid") == entity: + language_name = f"{name} ({sub_name})" + break + if language_name: + break + + # Default to entity if no name is found. + language_name = language_name or entity + + # Group features by language. + if language_name not in grouped_features: + grouped_features[language_name] = set() + + for feature in features.keys(): + feature_name = next( + (name for name, qid in data_type_metadata.items() if qid == feature), + feature, + ) + grouped_features[language_name].add(feature_name) + + # Add grouped features to the PR body. 
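# A minimal standalone sketch (assumed grouped data): each language becomes one
# markdown table row, matching the loop below.
_grouped_example = {"English": {"nouns", "verbs"}, "Basque": {"adjectives"}}
_rows_example = "".join(
    f"| **{lang}** | {', '.join(sorted(features))} |\n"
    for lang, features in sorted(_grouped_example.items())
)
# _rows_example ==
# | **Basque** | adjectives |
# | **English** | nouns, verbs |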
+ for language, features in sorted(grouped_features.items()): + feature_list = ", ".join(sorted(features)) + pr_body_content += f"| **{language}** | {feature_list} |\n" + + pr_body_content += "\nPlease review the changes and provide feedback.\n" + + print(pr_body_content) + return pr_body_content + + +if __name__ == "__main__": + with open(sys.argv[1], "r") as f: + missing_features = json.load(f) + + pr_body(missing_features) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 6659c0ba..dc56fb8e 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -198,7 +198,8 @@ def prompt_user_download_all(): # MARK: Form Dump elif wikidata_dump is not None: - if not wikidata_dump: + # If wikidata_dump is an empty string, use the default path + if wikidata_dump == "": wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( language=language, diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d51712d2..fbc67d8e 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -37,6 +37,11 @@ from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations +from scribe_data.utils import ( + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_CSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, +) LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -132,7 +137,10 @@ def main() -> None: help="The output file type.", ) get_parser.add_argument( - "-od", "--output-dir", type=str, help="The output directory path for results." + "-od", + "--output-dir", + type=str, + help=f"The output directory path for results (default: ./{DEFAULT_JSON_EXPORT_DIR} for JSON, ./{DEFAULT_CSV_EXPORT_DIR} for CSV, etc.).", ) get_parser.add_argument( "-ope", @@ -168,7 +176,7 @@ def main() -> None: "--wikidata-dump-path", nargs="?", const="", - help="Path to a local Wikidata lexemes dump. Uses default directory if no path provided.", + help=f"Path to a local Wikidata lexemes dump. 
Uses default directory (./{DEFAULT_DUMP_EXPORT_DIR}) if no path provided.", ) get_parser.add_argument( "-t", "--translation", type=str, help="parse a single word using MediaWiki API" @@ -208,7 +216,7 @@ def main() -> None: "--wikidata-dump-path", nargs="?", const=True, - help="Path to a local Wikidata lexemes dump for running with '--all'.", + help=f"Path to a local Wikidata lexemes dump for running with '--all' (default: ./{DEFAULT_DUMP_EXPORT_DIR}).", ) # MARK: Convert @@ -308,7 +316,7 @@ def main() -> None: "-wdp", "--wikidata-dump-path", type=str, - help="The output directory path for the downloaded dump.", + help=f"The output directory path for the downloaded dump (default: ./{DEFAULT_DUMP_EXPORT_DIR}).", ) # MARK: Interactive diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index cf6fb872..9e481d10 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -89,9 +89,17 @@ def parse_wd_lexeme_dump( overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting """ - # Convert "all" to list of all languages + # Convert "all" to list of all languages including sub-languages if isinstance(language, str) and language.lower() == "all": - language = list(language_metadata.keys()) + languages = [] + for main_lang, lang_data in language_metadata.items(): + # Add sub-languages if they exist + if "sub_languages" in lang_data: + for sub_lang in lang_data["sub_languages"]: + main_lang = sub_lang + languages.append(main_lang) + + language = languages # For processing: exclude translations and emoji-keywords if isinstance(data_types, str) and data_types.lower() == "all": diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index fa6bd0f6..9d66100a 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -102,6 +102,17 @@ def _build_iso_mapping(self) -> dict: """ iso_mapping = {} for lang_name, data in language_metadata.items(): + # Handle sub-languages if they exist + if "sub_languages" in data: + for sub_lang, sub_data in data["sub_languages"].items(): + if self.target_iso and sub_lang not in self.target_iso: + continue + + if iso_code := sub_data.get("iso"): + iso_mapping[iso_code] = sub_lang + continue # Skip main language if it only has sub-languages + + # Handle main languages if self.target_iso and lang_name not in self.target_iso: continue @@ -425,7 +436,23 @@ def export_forms_json( return # Create the output directory structure - output_path = Path(filepath).parent / lang_name + # Check if this is a sub-language and get its main language + main_lang = None + for lang, data in language_metadata.items(): + if "sub_languages" in data: + for sub_lang, sub_data in data["sub_languages"].items(): + if sub_lang == lang_name: + main_lang = lang + break + if main_lang: + break + + # If it's a sub-language, create path like: parent/chinese/mandarin/ + if main_lang: + output_path = Path(filepath).parent / main_lang / lang_name + else: + output_path = Path(filepath).parent / lang_name + output_path.mkdir(parents=True, exist_ok=True) # Create the full output filepath @@ -557,15 +584,42 @@ def parse_dump( for lang in languages: needs_processing = False + # Check if this is a sub-language + main_lang = None + for lang_name, data in language_metadata.items(): + if "sub_languages" in data: + for sub_lang in data["sub_languages"]: + if sub_lang == lang: + main_lang = lang_name + 
break + if main_lang: + break + for data_type in data_types: - index_path = Path(output_dir) / lang / f"lexeme_{data_type}.json" + # Create appropriate path based on whether it's a sub-language + if main_lang: + index_path = ( + Path(output_dir) + / main_lang + / lang + / f"lexeme_{data_type}.json" + ) + else: + index_path = ( + Path(output_dir) / lang / f"lexeme_{data_type}.json" + ) if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) - else: - print(f"Skipping {lang}/{data_type}.json - already exists") + # Update path display in skip message + skip_path = ( + f"{main_lang}/{lang}/{data_type}.json" + if main_lang + else f"{lang}/{data_type}.json" + ) + print(f"Skipping {skip_path} - already exists") if needs_processing: languages_to_process.append(lang) From 04573726e18a8643c94bdb189dc5f8ef64295646 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:30:12 +0600 Subject: [PATCH 08/13] Refactor parameter names and fix single langugage translation error --- src/scribe_data/cli/total.py | 2 +- src/scribe_data/wikidata/wikidata_utils.py | 4 +++- src/scribe_data/wiktionary/parse_dump.py | 23 +++++++++++----------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index a0fb5105..cc98cf9a 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -373,7 +373,7 @@ def total_wrapper( if wikidata_dump is True: # flag without a wikidata lexeme dump path parse_wd_lexeme_dump( language=language, - data_types=[data_type], + data_types=data_type, wikidata_dump_type=["total"], wikidata_dump_path=None, ) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 9e481d10..c302ea78 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -110,7 +110,9 @@ def parse_wd_lexeme_dump( ] print(f"Languages to process: {language}") - print(f"Data types to process: {data_types}") + + if "translations" not in wikidata_dump_type: + print(f"Data types to process: {data_types}") file_path = wd_lexeme_dump_download_wrapper(None, wikidata_dump_path) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index 9d66100a..b0a96abe 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -42,7 +42,7 @@ class LexemeProcessor: def __init__( self, - target_iso: Union[str, List[str]] = None, + target_lang: Union[str, List[str]] = None, parse_type: List[str] = None, data_types: List[str] = None, ): @@ -56,8 +56,8 @@ def __init__( # Pre-compute sets for faster lookups. self.parse_type = set(parse_type or []) self.data_types = set(data_types or []) - self.target_iso = set( - [target_iso] if isinstance(target_iso, str) else target_iso or [] + self.target_lang = set( + [target_lang] if isinstance(target_lang, str) else target_lang or [] ) # Pre-compute valid categories and languages. @@ -98,14 +98,14 @@ def __init__( def _build_iso_mapping(self) -> dict: """ Build mapping of ISO codes to language names based on language_metadata. - If self.target_iso is non-null, only include those iso codes. + If self.target_lang is non-null, only include those iso codes. 
""" iso_mapping = {} for lang_name, data in language_metadata.items(): # Handle sub-languages if they exist if "sub_languages" in data: for sub_lang, sub_data in data["sub_languages"].items(): - if self.target_iso and sub_lang not in self.target_iso: + if self.target_lang and sub_lang not in self.target_lang: continue if iso_code := sub_data.get("iso"): @@ -113,13 +113,13 @@ def _build_iso_mapping(self) -> dict: continue # Skip main language if it only has sub-languages # Handle main languages - if self.target_iso and lang_name not in self.target_iso: + if self.target_lang and lang_name not in self.target_lang: continue if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name - for language in self.target_iso: + for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: @@ -628,16 +628,17 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - if not data_types or not languages: - print("No data types or languages provided. Nothing to process.") - return + if "translations" not in parse_type: + if not data_types or not languages: + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") return processor = LexemeProcessor( - target_iso=languages, parse_type=parse_type, data_types=data_types + target_lang=languages, parse_type=parse_type, data_types=data_types ) processor.process_file(file_path) From 2479fe22609ba1cac04252a398cad206c79e96a5 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:35:20 +0600 Subject: [PATCH 09/13] fix target_lang occored when fixing conflict --- src/scribe_data/wiktionary/parse_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wiktionary/parse_dump.py index edda3176..107e5e98 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wiktionary/parse_dump.py @@ -102,7 +102,7 @@ def _build_iso_mapping(self) -> dict: if iso_code := data.get("iso"): iso_mapping[iso_code] = lang_name - for language in self.target_iso: + for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): qid_to_lang = check_qid_is_language(language) if qid_to_lang: From 386d6a0343a79ce605c1657eeb5763cc53582610 Mon Sep 17 00:00:00 2001 From: axif Date: Wed, 22 Jan 2025 04:37:52 +0600 Subject: [PATCH 10/13] fix total tests --- tests/cli/test_total.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 6893f15d..089fde97 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -265,7 +265,7 @@ def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): total_wrapper(wikidata_dump=True) mock_parse_dump.assert_called_once_with( language=None, - data_types=[None], + data_types=None, wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -288,7 +288,7 @@ def test_total_wrapper_wikidata_dump_with_all(self, mock_parse_dump): total_wrapper(wikidata_dump=True, all_bool=True) mock_parse_dump.assert_called_once_with( language="all", - data_types=["all"], + data_types="all", wikidata_dump_type=["total"], wikidata_dump_path=None, ) @@ -335,7 +335,7 @@ def test_total_wrapper_qid_with_wikidata_dump(self, mock_parse_dump): total_wrapper(language="Q9217", wikidata_dump=True, all_bool=True) 
mock_parse_dump.assert_called_once_with( language="Q9217", - data_types=["all"], + data_types="all", wikidata_dump_type=["total"], wikidata_dump_path=None, ) From 8c109621d749de8605214b3db133d4b330edc6d4 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 09:53:22 +0100 Subject: [PATCH 11/13] Misc formatting + simplifying code where possible --- ...te.yaml => missing_form_check_update.yaml} | 26 ++-- .github/workflows/update_emojis.yaml | 32 ++--- src/scribe_data/check/__init__.py | 0 .../check/check_missing_forms/__init__.py | 0 .../check_missing_forms.py | 57 ++++----- .../check/check_missing_forms/download_wd.py | 35 ++---- .../check_missing_forms/generate_query.py | 54 ++++----- .../check/check_missing_forms/get_forms.py | 36 ++---- .../check/check_missing_forms/pr_body.py | 38 ++---- src/scribe_data/cli/get.py | 4 +- src/scribe_data/cli/list.py | 2 + src/scribe_data/cli/main.py | 8 +- src/scribe_data/cli/total.py | 1 + src/scribe_data/utils.py | 3 +- .../{wiktionary => wikidata}/parse_dump.py | 114 ++++++++++-------- src/scribe_data/wikidata/wikidata_utils.py | 8 +- src/scribe_data/wiktionary/parse_mediaWiki.py | 4 +- tests/cli/test_total.py | 4 +- 18 files changed, 190 insertions(+), 236 deletions(-) rename .github/workflows/{missing_form_check&update.yaml => missing_form_check_update.yaml} (89%) create mode 100644 src/scribe_data/check/__init__.py create mode 100644 src/scribe_data/check/check_missing_forms/__init__.py rename src/scribe_data/{wiktionary => wikidata}/parse_dump.py (91%) diff --git a/.github/workflows/missing_form_check&update.yaml b/.github/workflows/missing_form_check_update.yaml similarity index 89% rename from .github/workflows/missing_form_check&update.yaml rename to .github/workflows/missing_form_check_update.yaml index 1e51ca3f..96219e1a 100644 --- a/.github/workflows/missing_form_check&update.yaml +++ b/.github/workflows/missing_form_check_update.yaml @@ -1,9 +1,9 @@ name: Create Automated PR on: schedule: - - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month - # Allow manual trigger - workflow_dispatch: + # Runs at 00:00 UTC on the first day of every month. + - cron: "0 0 1 * *" + workflow_dispatch: # allow manual trigger jobs: check-repository: @@ -34,7 +34,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | @@ -54,7 +54,7 @@ jobs: # Check if paths exist if [ -n "${DUMP_PATH}" ] && [ -d "${QUERY_DIR}" ]; then - # Generate the missing features data with all keys processing + # Generate the missing features data with all keys processing. PYTHONPATH=$PYTHONPATH:$(pwd)/src python src/scribe_data/check/check_missing_forms/check_missing_forms.py "${DUMP_PATH}" "${QUERY_DIR}" --process-all-keys else echo "Required paths not found:" @@ -63,7 +63,7 @@ jobs: exit 1 fi - # Debug steps to understand the state + # Debug steps to understand the state. - name: Debug Info run: | echo "Current branch: $(git branch --show-current)" @@ -79,7 +79,7 @@ jobs: - name: Debug Missing Features Data if: always() run: | - # Print the contents of the missing features JSON file if it exists + # Print the contents of the missing features JSON file if it exists. 
if [ -f missing_features.json ]; then echo "Contents of missing_features.json:" cat missing_features.json @@ -90,10 +90,10 @@ jobs: - name: Generate PR Body id: pr-body run: | - # Run the pr_body.py script with the missing features data + # Run the pr_body.py script with the missing features data. PR_BODY_CONTENT=$(python src/scribe_data/check/check_missing_forms/pr_body.py missing_features.json) - # Debug output + # Debug output. echo "PR Body Content:" echo "$PR_BODY_CONTENT" @@ -106,7 +106,7 @@ jobs: - name: Debug PR Body Output run: | - # Print the PR body content from the output + # Print the PR body content from the output. echo "PR Body from GITHUB_OUTPUT:" cat $GITHUB_OUTPUT @@ -114,17 +114,17 @@ jobs: uses: peter-evans/create-pull-request@v5 with: token: ${{ secrets.GITHUB_TOKEN }} - title: 'Automated PR: Updated Language Data Files' + title: "Automated PR: Updated Language Data Files" body: ${{ steps.pr-body.outputs.body }} base: master branch: automated-missing-forms-pr delete-branch: true draft: false - commit-message: '[create-pull-request] automated change' + commit-message: "[create-pull-request] automated change" committer: GitHub author: github-actions[bot] - # Debug step to verify PR creation attempt + # Debug step to verify PR creation attempt. - name: Check PR Creation run: | echo "Checking if PR was created..." diff --git a/.github/workflows/update_emojis.yaml b/.github/workflows/update_emojis.yaml index 8465147e..14514cec 100644 --- a/.github/workflows/update_emojis.yaml +++ b/.github/workflows/update_emojis.yaml @@ -1,9 +1,9 @@ name: Check and Update Emoji Data on: schedule: - - cron: '0 0 1 * *' # Runs at 00:00 UTC on the first day of every month - # Allow manual trigger - workflow_dispatch: + # Runs at 00:00 UTC on the first day of every month. + - cron: "0 0 1 * *" + workflow_dispatch: # allow manual trigger jobs: check-repository: @@ -34,7 +34,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | @@ -45,11 +45,11 @@ jobs: - name: Get language list id: get-langs run: | - # Fetch language list from GitHub API + # Fetch language list from GitHub API. DERIVED_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-derived-full/annotationsDerived | jq -r '.[].name') FULL_LANGS=$(curl -s https://api.github.com/repos/unicode-org/cldr-json/contents/cldr-json/cldr-annotations-full/annotations | jq -r '.[].name') - # Combine and deduplicate language lists + # Combine and deduplicate language lists. LANG_LIST=$(echo "$DERIVED_LANGS $FULL_LANGS" | tr ' ' '\n' | sort -u | tr '\n' ' ') echo "lang_list=${LANG_LIST}" >> $GITHUB_OUTPUT echo "Detected languages: ${LANG_LIST}" @@ -57,26 +57,26 @@ jobs: - name: Download and check emoji data id: check-updates run: | - # Create directories if they don't exist + # Create directories if they don't exist. mkdir -p src/scribe_data/unicode/cldr-annotations-derived-full mkdir -p src/scribe_data/unicode/cldr-annotations-full CHANGES_EXIST=false CHANGE_SUMMARY="| Language | Derived Changes | Full Changes |\n|----------|-----------------|--------------|" - # Use dynamic language list from previous step + # Use dynamic language list from previous step. for lang in ${{ steps.get-langs.outputs.lang_list }}; do DERIVED_CHANGED="No" FULL_CHANGED="No" - # Download latest data for each language + # Download latest data for each language. 
mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" -o "new_derived_$lang.json" curl -L "https://raw.githubusercontent.com/unicode-org/cldr-json/main/cldr-json/cldr-annotations-full/annotations/$lang/annotations.json" -o "new_full_$lang.json" - # Check derived annotations + # Check derived annotations. if [ -f "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json" ]; then if ! cmp -s "new_derived_$lang.json" "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang/annotations.json"; then CHANGES_EXIST=true @@ -87,7 +87,7 @@ jobs: DERIVED_CHANGED="New" fi - # Check full annotations + # Check full annotations. if [ -f "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json" ]; then if ! cmp -s "new_full_$lang.json" "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang/annotations.json"; then CHANGES_EXIST=true @@ -98,7 +98,7 @@ jobs: FULL_CHANGED="New" fi - # Only add to summary if there are changes + # Only add to summary if there are changes. if [ "$DERIVED_CHANGED" != "No" ] || [ "$FULL_CHANGED" != "No" ]; then CHANGE_SUMMARY="$CHANGE_SUMMARY\n| $lang | $DERIVED_CHANGED | $FULL_CHANGED |" fi @@ -112,7 +112,7 @@ jobs: - name: Update files if changed if: steps.check-updates.outputs.changes_exist == 'true' run: | - # Use dynamic language list + # Use dynamic language list. for lang in ${{ steps.get-langs.outputs.lang_list }}; do mkdir -p "src/scribe_data/unicode/cldr-annotations-derived-full/annotationsDerived/$lang" mkdir -p "src/scribe_data/unicode/cldr-annotations-full/annotations/$lang" @@ -129,7 +129,7 @@ jobs: uses: peter-evans/create-pull-request@v5 with: token: ${{ secrets.GITHUB_TOKEN }} - title: 'chore: Update emoji annotations data' + title: "chore: Update emoji annotations data" body: | This PR updates the emoji annotations data from CLDR. @@ -142,9 +142,9 @@ jobs: - No: No changes This is an automated PR created by the emoji data update workflow. - branch: update-emoji-data # Branch name + branch: update-emoji-data # branch name delete-branch: true - commit-message: 'chore: Update emoji annotations data' + commit-message: "chore: Update emoji annotations data" labels: | automated pr emoji-data diff --git a/src/scribe_data/check/__init__.py b/src/scribe_data/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scribe_data/check/check_missing_forms/__init__.py b/src/scribe_data/check/check_missing_forms/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/scribe_data/check/check_missing_forms/check_missing_forms.py b/src/scribe_data/check/check_missing_forms/check_missing_forms.py index ac57e498..43039976 100644 --- a/src/scribe_data/check/check_missing_forms/check_missing_forms.py +++ b/src/scribe_data/check/check_missing_forms/check_missing_forms.py @@ -1,52 +1,37 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Check for missing forms in Wikidata. - -.. 
raw:: html - """ +import argparse import json import sys -import argparse +from collections import defaultdict from pathlib import Path -from get_forms import parse_sparql_files, extract_dump_forms + from generate_query import generate_query -from collections import defaultdict +from get_forms import extract_dump_forms, parse_sparql_files + from scribe_data.utils import ( - lexeme_form_metadata, - language_metadata, data_type_metadata, + language_metadata, + lexeme_form_metadata, ) def get_all_languages(): """ - Extract all languages and sublanguages from language metadata. + Extract all languages and sub languages from language metadata. Returns ------- list of str - List of language codes for all languages and sublanguages that have + List of language codes for all languages and sub languages that have both ISO codes and QIDs defined. Notes ----- - Only includes languages and sublanguages that have both 'iso' and 'qid' + Only includes languages and sub languages that have both 'iso' and 'qid' fields in their metadata. """ languages = [] @@ -56,11 +41,13 @@ def get_all_languages(): if "iso" in lang_data and "qid" in lang_data: languages.append(lang) - # Add sublanguages. + # Add sub languages. if "sub_languages" in lang_data: - for sublang, sublang_data in lang_data["sub_languages"].items(): - if "iso" in sublang_data and "qid" in sublang_data: - languages.append(sublang) + languages.extend( + sublang + for sublang, sublang_data in lang_data["sub_languages"].items() + if "iso" in sublang_data and "qid" in sublang_data + ) return languages @@ -74,6 +61,7 @@ def get_missing_features(result_sparql, result_dump): result_sparql : dict Features extracted from SPARQL queries. Format: {language: {data_type: [features]}} + result_dump : dict Features extracted from Wikidata dump. Format: {language: {data_type: [features]}} @@ -111,11 +99,11 @@ def get_missing_features(result_sparql, result_dump): # Get values from SPARQL if available. if dt in result_sparql[lang]: - sparql_values = set(tuple(item) for item in result_sparql[lang][dt]) + sparql_values = {tuple(item) for item in result_sparql[lang][dt]} # Get values from dump if available. if dt in result_dump[lang]: - dump_values = set(tuple(item) for item in result_dump[lang][dt]) + dump_values = {tuple(item) for item in result_dump[lang][dt]} # Get unique values from both sources. unique_dump_values = dump_values - sparql_values @@ -131,7 +119,7 @@ def get_missing_features(result_sparql, result_dump): if all(qid in all_qids for qid in item): missing_by_lang_type[lang][dt].append(list(item)) - return missing_by_lang_type if missing_by_lang_type else None + return missing_by_lang_type or None def process_missing_features(missing_features, query_dir): @@ -143,6 +131,7 @@ def process_missing_features(missing_features, query_dir): missing_features : dict Dictionary of missing features by language and data type. Format: {language: {data_type: [features]}} + query_dir : str or Path Directory where generated query files should be saved. @@ -203,7 +192,7 @@ def main(): print(f"Error: Query directory does not exist: {query_dir}") sys.exit(1) - # Get all languages including sublanguages. + # Get all languages including sub languages. 
languages = get_all_languages() print("Parsing SPARQL files...") diff --git a/src/scribe_data/check/check_missing_forms/download_wd.py b/src/scribe_data/check/check_missing_forms/download_wd.py index ccf6b797..c8efb137 100644 --- a/src/scribe_data/check/check_missing_forms/download_wd.py +++ b/src/scribe_data/check/check_missing_forms/download_wd.py @@ -1,30 +1,15 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Download Wikidata lexeme dump. - -.. raw:: html - """ +import os from pathlib import Path + +import requests + from scribe_data.cli.download import download_wd_lexeme_dump from scribe_data.utils import DEFAULT_DUMP_EXPORT_DIR -import requests -import os def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): @@ -36,6 +21,7 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): wikidata_dump : str, optional Date string in YYYYMMDD format for specific dumps. If None, downloads the latest dump. + output_dir : str, optional Directory path for the downloaded file. If None, uses DEFAULT_DUMP_EXPORT_DIR. @@ -47,9 +33,9 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): Notes ----- - - Downloads are skipped if the file already exists in the output directory - - Progress is displayed every 50MB during download - - Creates output directory if it doesn't exist + - Downloads are skipped if the file already exists in the output directory. + - Progress is displayed every 50MB during download. + - Creates output directory if it doesn't exist. """ dump_url = download_wd_lexeme_dump(wikidata_dump or "latest-lexemes") @@ -97,6 +83,5 @@ def wd_lexeme_dump_download(wikidata_dump=None, output_dir=None): if __name__ == "__main__": - output_path = wd_lexeme_dump_download() - if output_path: + if output_path := wd_lexeme_dump_download(): print(f"DOWNLOAD_PATH={output_path}") diff --git a/src/scribe_data/check/check_missing_forms/generate_query.py b/src/scribe_data/check/check_missing_forms/generate_query.py index 71bfde50..42f04eac 100644 --- a/src/scribe_data/check/check_missing_forms/generate_query.py +++ b/src/scribe_data/check/check_missing_forms/generate_query.py @@ -1,34 +1,19 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Generate SPARQL queries for missing lexeme forms. - -.. raw:: html - """ +import os +from pathlib import Path + from scribe_data.utils import ( - lexeme_form_metadata, - language_metadata, - data_type_metadata, LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, ) - -import os -from pathlib import Path +from scribe_data.utils import ( + data_type_metadata, + language_metadata, + lexeme_form_metadata, +) def generate_query(missing_features, query_dir=None): @@ -40,6 +25,7 @@ def generate_query(missing_features, query_dir=None): missing_features : dict Dictionary containing missing features by language and data type. Format: {language_qid: {data_type_qid: [[form_qids]]}} + query_dir : str or Path, optional Directory where query files should be saved. If None, uses default language_data_extraction directory. @@ -51,10 +37,9 @@ def generate_query(missing_features, query_dir=None): Notes ----- - - Generates a single query file combining all forms for a given - language and data type combination - - Query files are named incrementally if duplicates exist - - Creates necessary directories if they don't exist + - Generates a single query file combining all forms for a given language and data type combination. + - Query files are named incrementally if duplicates exist. + - Creates necessary directories if they don't exist. 
""" language_qid = next(iter(missing_features.keys())) data_type_qid = next(iter(missing_features[language_qid].keys())) @@ -65,7 +50,7 @@ def generate_query(missing_features, query_dir=None): for name, data in language_metadata.items() if data.get("qid") == language_qid ) - language = language_entry[0] # The language name. + language = language_entry[0] # the language name data_type = next( name for name, qid in data_type_metadata.items() if qid == data_type_qid @@ -79,14 +64,15 @@ def generate_query(missing_features, query_dir=None): for item in category.values(): qid_to_label[item["qid"]] = item["label"] - # Process all forms at once + # Process all forms at once. forms_query = [] all_form_combinations = missing_features[language_qid][data_type_qid] for form_qids in all_form_combinations: # Convert QIDs to labels and join them together. labels = [qid_to_label.get(qid, qid) for qid in form_qids] concatenated_label = "".join(labels) - # Make first letter lowercase + + # Make first letter lowercase. concatenated_label = concatenated_label[0].lower() + concatenated_label[1:] forms_query.append({"label": concatenated_label, "qids": form_qids}) @@ -96,12 +82,12 @@ def generate_query(missing_features, query_dir=None): # Enter this query at https://query.wikidata.org/. SELECT - (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?{data_type} """ + "\n ".join(f'?{form["label"]}' for form in forms_query) where_clause = f""" - WHERE {{ + WHERE {{ ?lexeme dct:language wd:{language_qid} ; wikibase:lexicalCategory wd:{data_type_qid} ; wikibase:lemma ?{data_type} . @@ -149,6 +135,7 @@ def get_available_filename(base_path): base_file_name = ( Path(query_dir) / language / data_type / f"query_{data_type}.sparql" ) + else: base_file_name = f"{language_data_extraction}/{language}/{data_type}/query_{data_type}.sparql" @@ -163,4 +150,5 @@ def get_available_filename(base_path): file.write(final_query) print(f"Query file created: {file_name}") + return file_name diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py index 909abf5e..6b061411 100644 --- a/src/scribe_data/check/check_missing_forms/get_forms.py +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -1,32 +1,19 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Get forms from Wikidata. -.. raw:: html - """ -from scribe_data.wiktionary.parse_dump import LexemeProcessor import re from collections import defaultdict + from scribe_data.utils import ( - language_metadata, - data_type_metadata, LANGUAGE_DATA_EXTRACTION_DIR as language_data_extraction, ) +from scribe_data.utils import ( + data_type_metadata, + language_metadata, +) +from scribe_data.wikidata.parse_dump import LexemeProcessor iso_to_qid = { lang_data["iso"]: lang_data["qid"] @@ -111,9 +98,7 @@ def parse_sparql_query(query_text): # Extract grammatical features. 
features = re.finditer(r"wd:(Q\d+)", block_text) - feature_list = [f.group(1) for f in features] - - if feature_list: + if feature_list := [f.group(1) for f in features]: result[language][lexical_category].append(feature_list) return result @@ -132,8 +117,10 @@ def extract_dump_forms( ---------- languages : list of str, optional List of language ISO codes (e.g., ['en', 'fr']) + data_types : list of str, optional List of lexical categories (e.g., ['nouns', 'verbs']) + file_path : str, optional Path to the lexeme dump file, by default "latest-lexemes.json.bz2" @@ -166,8 +153,7 @@ def extract_dump_forms( for data_type, features in data_types_dict.items(): # Get QID from data_type_metadata. - data_type_qid = data_type_metadata.get(data_type) - if data_type_qid: + if data_type_qid := data_type_metadata.get(data_type): converted_features[lang_qid][data_type_qid] = features return converted_features diff --git a/src/scribe_data/check/check_missing_forms/pr_body.py b/src/scribe_data/check/check_missing_forms/pr_body.py index 11aadbe9..822ef9ec 100644 --- a/src/scribe_data/check/check_missing_forms/pr_body.py +++ b/src/scribe_data/check/check_missing_forms/pr_body.py @@ -1,30 +1,14 @@ +# SPDX-License-Identifier: GPL-3.0-or-later """ Generate a formatted PR body describing missing features for each language. - -.. raw:: html - """ import json import sys + from scribe_data.utils import ( - language_metadata, data_type_metadata, + language_metadata, ) @@ -51,12 +35,13 @@ def pr_body(missing_features): - A table showing languages and their missing feature types - Features are grouped by language for better readability """ - # Initialize PR body with a header. - pr_body_content = "## Automated PR: Missing Features\n\n" - pr_body_content += "This PR was automatically created by a GitHub Action.\n\n" - pr_body_content += "### Missing Features Summary\n" - pr_body_content += "| **Language** | **Feature Type** |\n" - pr_body_content += "|--------------|------------------|\n" + pr_body_content = ( + "## Automated PR: Missing Features\n\n" + + "This PR was automatically created by a GitHub Action.\n\n" + + "### Missing Features Summary\n" + + "| **Language** | **Feature Type** |\n" + + "|--------------|------------------|\n" + ) # Create a dictionary to group features by language. 
grouped_features = {} @@ -69,11 +54,13 @@ def pr_body(missing_features): if data.get("qid") == entity: language_name = name break + if "sub_languages" in data: for sub_name, sub_data in data["sub_languages"].items(): if sub_data.get("qid") == entity: language_name = f"{name} ({sub_name})" break + if language_name: break @@ -99,6 +86,7 @@ def pr_body(missing_features): pr_body_content += "\nPlease review the changes and provide feedback.\n" print(pr_body_content) + return pr_body_content diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 7277d04d..589bc7f4 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -14,10 +14,10 @@ from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, + DEFAULT_DUMP_EXPORT_DIR, DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, - DEFAULT_DUMP_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump @@ -181,7 +181,7 @@ def prompt_user_download_all(): # MARK: Form Dump elif wikidata_dump is not None: - # If wikidata_dump is an empty string, use the default path + # If wikidata_dump is an empty string, use the default path. if wikidata_dump == "": wikidata_dump = DEFAULT_DUMP_EXPORT_DIR parse_wd_lexeme_dump( diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 4a1b09a5..287df047 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -174,8 +174,10 @@ def list_wrapper( ---------- language : str The language to potentially list data types for. + data_type : str The data type to check for. + all_bool : bool Whether all languages and data types should be listed. diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 7a37e5c0..89ec60c2 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -19,12 +19,12 @@ from scribe_data.cli.total import total_wrapper from scribe_data.cli.upgrade import upgrade_cli from scribe_data.cli.version import get_version_message -from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations from scribe_data.utils import ( - DEFAULT_JSON_EXPORT_DIR, DEFAULT_CSV_EXPORT_DIR, DEFAULT_DUMP_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, ) +from scribe_data.wiktionary.parse_mediaWiki import parse_wiktionary_translations LIST_DESCRIPTION = "List languages, data types and combinations of each that Scribe-Data can be used for." GET_DESCRIPTION = ( @@ -355,11 +355,13 @@ def main() -> None: elif args.command in ["get", "g"]: if args.interactive: start_interactive_mode(operation="get") + if args.translation: parse_wiktionary_translations(args.translation, args.output_dir) + else: print( - f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}" + f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..." ) get_data( language=args.language.lower() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index a53f2412..c1699832 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -339,6 +339,7 @@ def total_wrapper( data_type : Union[str, List[str]] The data type(s) to check for. + all_bool : bool Whether all languages and data types should be listed. 
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 6a8c270a..3ef45ac5 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -697,7 +697,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): return None elif user_input == "Download new version": - # Rename existing latest dump if it exists + # Rename existing latest dump if it exists. latest_dump = Path(output_dir) / "latest-lexemes.json.bz2" if latest_dump.exists(): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -706,6 +706,7 @@ def check_lexeme_dump_prompt_download(output_dir: str): rprint( f"[bold green]Renamed existing dump to {backup_name}[/bold green]" ) + return False else: diff --git a/src/scribe_data/wiktionary/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py similarity index 91% rename from src/scribe_data/wiktionary/parse_dump.py rename to src/scribe_data/wikidata/parse_dump.py index 107e5e98..d0e8689c 100644 --- a/src/scribe_data/wiktionary/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -10,6 +10,8 @@ from typing import List, Union import orjson +from tqdm import tqdm + from scribe_data.utils import ( DEFAULT_DUMP_EXPORT_DIR, check_index_exists, @@ -19,7 +21,6 @@ language_metadata, lexeme_form_metadata, ) -from tqdm import tqdm class LexemeProcessor: @@ -77,7 +78,8 @@ def __init__( item_data["label"], ) - # MARK: build iso mapping + # MARK: Build ISO Mapping + def _build_iso_mapping(self) -> dict: """ Build mapping of ISO codes to language names based on language_metadata. @@ -85,7 +87,7 @@ def _build_iso_mapping(self) -> dict: """ iso_mapping = {} for lang_name, data in language_metadata.items(): - # Handle sub-languages if they exist + # Handle sub-languages if they exist. if "sub_languages" in data: for sub_lang, sub_data in data["sub_languages"].items(): if self.target_lang and sub_lang not in self.target_lang: @@ -93,9 +95,9 @@ def _build_iso_mapping(self) -> dict: if iso_code := sub_data.get("iso"): iso_mapping[iso_code] = sub_lang - continue # Skip main language if it only has sub-languages + continue # skip main language if it only has sub-languages - # Handle main languages + # Handle main languages. if self.target_lang and lang_name not in self.target_lang: continue @@ -104,15 +106,14 @@ def _build_iso_mapping(self) -> dict: for language in self.target_lang: if language.lower().startswith("q") and language[1:].isdigit(): - qid_to_lang = check_qid_is_language(language) - if qid_to_lang: + if qid_to_lang := check_qid_is_language(language): iso_code = get_language_iso_code(language.upper()) iso_mapping[iso_code] = qid_to_lang print(f"ISO code for {language} is {iso_code}") return iso_mapping - # MARK: process lines + # MARK: Process Lines def process_lines(self, line: str) -> None: """ Process one line of data with optimized parsing. @@ -125,7 +126,7 @@ def process_lines(self, line: str) -> None: # Combine field checks into single lookup. required_fields = ("lemmas", "lexicalCategory") - if not all(field in lexeme for field in required_fields): + if any(field not in lexeme for field in required_fields): return lexical_category = lexeme["lexicalCategory"] @@ -162,13 +163,13 @@ def process_lines(self, line: str) -> None: def _process_translations(self, lexeme, word, lang_code, category_name): """ - Optimized translations processing + Optimized translations processing. """ translations = {} valid_iso_codes = self.valid_iso_codes lexeme_id = lexeme["id"] - # Pre-fetch senses to avoid repeated lookups + # Pre-fetch senses to avoid repeated lookups. 
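# A minimal standalone sketch (assumed sense/gloss shape): only glosses whose
# language code is in the valid ISO set are kept, as in the loop below.
_senses_example = [
    {"glosses": {"en": {"value": "house"}, "fr": {"value": "maison"}, "zz": {"value": "?"}}}
]
_valid_iso_codes_example = {"en", "fr", "de"}
_translations_example = {}
for _sense in _senses_example:
    if _glosses := _sense.get("glosses"):
        _translations_example.update(
            {
                code: gloss["value"]
                for code, gloss in _glosses.items()
                if code in _valid_iso_codes_example
            }
        )
# _translations_example == {"en": "house", "fr": "maison"}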
for sense in lexeme["senses"]: if glosses := sense.get("glosses"): translations.update( @@ -184,7 +185,7 @@ def _process_translations(self, lexeme, word, lang_code, category_name): def _process_forms(self, lexeme, lang_code, category_name): """ - Optimized forms processing + Optimized forms processing. """ lexeme_id = lexeme["id"] forms_data = {} @@ -202,7 +203,7 @@ def _process_forms(self, lexeme, lang_code, category_name): if form_value := rep_data.get("value"): features = form.get("grammaticalFeatures", []) - # If features are not empty and not already in the list + # If features are not empty and not already in the list. if ( features and features not in self.unique_forms[lang_code][category_name] @@ -212,7 +213,8 @@ def _process_forms(self, lexeme, lang_code, category_name): if features := form.get("grammaticalFeatures"): if form_name := self._get_form_name(features): cat_dict[form_name] = form_value - break # Only process first representation + + break # only process first representation if forms_data: self.forms_index.update(forms_data) @@ -220,7 +222,7 @@ def _process_forms(self, lexeme, lang_code, category_name): def _get_form_name(self, features): """ - Optimized form name generation + Optimized form name generation. """ if not features: return "" @@ -238,6 +240,7 @@ def _get_form_name(self, features): if is_first: form_parts.append(label.lower()) is_first = False + else: form_parts.append(label) @@ -247,24 +250,25 @@ def _process_totals(self, lexeme, lang_code, category_name): """ Process totals for statistical counting. """ - # Skip if we have specific data types and this category isn't in them + # Skip if we have specific data types and this category isn't in them. if self.data_types and category_name.lower() not in [ dt.lower() for dt in self.data_types ]: return - # Increment lexeme count for this language and category + # Increment lexeme count for this language and category. self.lexical_category_counts[lang_code][category_name] += 1 - # Count translations if they exist + # Count translations if they exist. if lexeme.get("senses"): translation_count = sum( - 1 - for sense in lexeme["senses"] - if sense.get("glosses") - and any( - lang in self.valid_iso_codes for lang in sense["glosses"].keys() + bool( + sense.get("glosses") + and any( + lang in self.valid_iso_codes for lang in sense["glosses"].keys() + ) ) + for sense in lexeme["senses"] ) if translation_count > 0: self.translation_counts[lang_code][category_name] += translation_count @@ -356,13 +360,13 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N ) return - # Flatten the category level + # Flatten the category level. filtered = {} for category_data in self.translations_index[language_iso].values(): for lexeme_id, word_data in category_data.items(): filtered[lexeme_id] = word_data - # Check if filtered data is empty before saving + # Check if filtered data is empty before saving. if not filtered: print(f"No translations found for {language_iso}, skipping export...") return @@ -380,8 +384,10 @@ def export_forms_json( ---------- filepath : str Base path where the JSON file will be saved. + language_iso : str, optional ISO code of the language to export. If None, exports all languages. + data_type : str, optional Category of forms to export (e.g., "nouns", "verbs"). If None, exports all types. 
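# Aside: a self-contained illustration (not lines from the diff) of the sense-counting
# idiom used in _process_totals above -- it counts senses that carry at least one gloss
# in an accepted language, not the total number of glosses. The sample lexeme and ISO
# set below are invented for the example.
valid_iso_codes = {"en", "de", "fr"}

sample_lexeme = {
    "senses": [
        {"glosses": {"en": {"value": "cat"}, "ja": {"value": "neko"}}},  # counted once
        {"glosses": {"ja": {"value": "neko"}}},  # no accepted language -> not counted
        {"glosses": {}},  # empty glosses -> not counted
    ]
}

translation_count = sum(
    bool(
        sense.get("glosses")
        and any(lang in valid_iso_codes for lang in sense["glosses"].keys())
    )
    for sense in sample_lexeme["senses"]
)

assert translation_count == 1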
@@ -399,11 +405,11 @@ def export_forms_json( for id, lang_data in self.forms_index.items(): if ( language_iso in lang_data and data_type - ): # Only process if we have a data_type + ): # only process if we have a data_type if ( data_type in lang_data[language_iso] - ): # Check if this data_type exists - # Initialize the nested dictionary for this ID if it doesn't exist + ): # Check if this data_type exists. + # Initialize the nested dictionary for this ID if it doesn't exist. if id not in filtered: filtered[id] = {} @@ -413,13 +419,13 @@ def export_forms_json( lang_name = self.iso_to_name[language_iso] - # Check if filtered data is empty before saving + # Check if filtered data is empty before saving. if not filtered: print(f"No forms found for {lang_name} {data_type}, skipping export...") return - # Create the output directory structure - # Check if this is a sub-language and get its main language + # Create the output directory structure. + # Check if this is a sub-language and get its main language. main_lang = None for lang, data in language_metadata.items(): if "sub_languages" in data: @@ -430,7 +436,7 @@ def export_forms_json( if main_lang: break - # If it's a sub-language, create path like: parent/chinese/mandarin/ + # If it's a sub-language, create path like: parent/chinese/mandarin/. if main_lang: output_path = Path(filepath).parent / main_lang / lang_name else: @@ -438,10 +444,10 @@ def export_forms_json( output_path.mkdir(parents=True, exist_ok=True) - # Create the full output filepath + # Create the full output filepath. output_file = output_path / f"lexeme_{data_type}.json" - # Save the filtered data to JSON file + # Save the filtered data to JSON file. try: with open(output_file, "wb") as f: f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) @@ -459,10 +465,13 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): ---------- filtered : dict Dictionary with form features as keys and words as values. + filepath : Path Base path for saving the file. + language_iso : str ISO code of the language. + data_type : str Type of data being saved (e.g., "nouns", "verbs"). @@ -474,13 +483,13 @@ def _save_by_language(self, filtered, filepath, language_iso, data_type): base_path = Path(filepath) lang_name = self.iso_to_name[language_iso] - # Create language-specific directory + # Create language-specific directory. lang_filepath = base_path.parent / base_path.name lang_filepath.parent.mkdir(parents=True, exist_ok=True) print(f"Saving {lang_name} {data_type} forms to {lang_filepath}...") - # Save the filtered data with pretty printing + # Save the filtered data with pretty printing. with open(lang_filepath, "wb") as f: f.write( orjson.dumps( @@ -579,7 +588,7 @@ def parse_dump( break for data_type in data_types: - # Create appropriate path based on whether it's a sub-language + # Create appropriate path based on whether it's a sub-language. if main_lang: index_path = ( Path(output_dir) @@ -596,7 +605,7 @@ def parse_dump( needs_processing = True data_types_to_process.add(data_type) else: - # Update path display in skip message + # Update path display in skip message. skip_path = ( f"{main_lang}/{lang}/{data_type}.json" if main_lang @@ -611,10 +620,9 @@ def parse_dump( languages = languages_to_process data_types = list(data_types_to_process) - if "translations" not in parse_type: - if not data_types or not languages: - print("No data types or languages provided. 
Nothing to process.") - return + if "translations" not in parse_type and (not data_types or not languages): + print("No data types or languages provided. Nothing to process.") + return if not languages: print("All requested data already exists. Nothing to process.") @@ -628,18 +636,18 @@ def parse_dump( # MARK: Handle JSON exports if "translations" in parse_type: for language in languages: - # Get the ISO code for the language - iso_code = None - for iso, name in processor.iso_to_name.items(): - if name.lower() == language.lower(): - iso_code = iso - break - - if iso_code: + if iso_code := next( + ( + iso + for iso, name in processor.iso_to_name.items() + if name.lower() == language.lower() + ), + None, + ): index_path = Path(output_dir) / language / "lexeme_translations.json" - # Ensure parent directory exists + # Ensure parent directory exists. index_path.parent.mkdir(parents=True, exist_ok=True) - # print(f"Exporting translations for {language} to {index_path}") + # print(f"Exporting translations for {language} to {index_path}"). processor.export_translations_json(str(index_path), iso_code) else: print(f"Warning: Could not find ISO code for {language}") @@ -661,7 +669,7 @@ def parse_dump( # def print_unique_forms(unique_forms): # """ - # Pretty print unique grammatical feature sets + # Pretty print unique grammatical feature sets. # """ # for lang, lang_data in unique_forms.items(): # print(f"\nLanguage: {lang}") diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index b29a7450..36cc6bba 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -12,7 +12,7 @@ from scribe_data.cli.download import wd_lexeme_dump_download_wrapper from scribe_data.utils import data_type_metadata, language_metadata -from scribe_data.wiktionary.parse_dump import parse_dump +from scribe_data.wikidata.parse_dump import parse_dump sparql = SPARQLWrapper("https://query.wikidata.org/sparql") sparql.setReturnFormat(JSON) @@ -72,11 +72,11 @@ def parse_wd_lexeme_dump( overwrite_all : bool, default=False If True, automatically overwrite existing files without prompting """ - # Convert "all" to list of all languages including sub-languages + # Convert "all" to list of all languages including sub-languages. if isinstance(language, str) and language.lower() == "all": languages = [] for main_lang, lang_data in language_metadata.items(): - # Add sub-languages if they exist + # Add sub-languages if they exist. if "sub_languages" in lang_data: for sub_lang in lang_data["sub_languages"]: main_lang = sub_lang @@ -84,7 +84,7 @@ def parse_wd_lexeme_dump( language = languages - # For processing: exclude translations and emoji-keywords + # For processing: exclude translations and emoji-keywords. 
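# Aside: the translations-export loop above replaces a manual search loop with a
# next()-plus-walrus reverse lookup from language name to ISO code. A minimal standalone
# version of that pattern, using an invented iso_to_name mapping:
iso_to_name = {"en": "English", "de": "German", "sv": "Swedish"}

language = "german"
if iso_code := next(
    (iso for iso, name in iso_to_name.items() if name.lower() == language.lower()),
    None,
):
    print(f"ISO code for {language} is {iso_code}")  # ISO code for german is de
else:
    print(f"Warning: Could not find ISO code for {language}")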
if isinstance(data_types, str) and data_types.lower() == "all": data_types = [ dt diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index d6228e93..319fe4da 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -6,7 +6,8 @@ import json import re from pathlib import Path -from scribe_data.utils import get_language_from_iso, DEFAULT_MEDIAWIKI_EXPORT_DIR + +from scribe_data.utils import DEFAULT_MEDIAWIKI_EXPORT_DIR, get_language_from_iso from scribe_data.wikidata.wikidata_utils import mediaWiki_query @@ -115,6 +116,7 @@ def parse_wiktionary_translations(word, output_dir=DEFAULT_MEDIAWIKI_EXPORT_DIR) ---------- word : str The word to fetch translations for. + output_dir : str or Path, optional Directory to save JSON output (default is DEFAULT_MEDIAWIKI_EXPORT_DIR). Will be created if it doesn't exist. diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 089fde97..2d6f56e9 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -258,7 +258,8 @@ def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): total_wrapper() - # MARK: Using wikidata_dump + # MARK: Using Dump + @patch("scribe_data.cli.total.parse_wd_lexeme_dump") def test_total_wrapper_wikidata_dump_flag(self, mock_parse_dump): """Test when wikidata_dump is True (flag without path)""" @@ -307,6 +308,7 @@ def test_total_wrapper_wikidata_dump_with_language_and_type(self, mock_parse_dum ) # MARK: Using QID + @patch("scribe_data.cli.total.check_qid_is_language") @patch("scribe_data.cli.total.print_total_lexemes") def test_total_wrapper_with_qid(self, mock_print_total, mock_check_qid): From 789b178725e7267d7aa8c8552ba5ed1c7615bc8a Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 11:14:39 +0100 Subject: [PATCH 12/13] Adding comments and minor name changes --- src/scribe_data/check/check_missing_forms/get_forms.py | 1 + src/scribe_data/wikidata/wikidata_utils.py | 8 ++++---- src/scribe_data/wiktionary/parse_mediaWiki.py | 9 ++++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/scribe_data/check/check_missing_forms/get_forms.py b/src/scribe_data/check/check_missing_forms/get_forms.py index 6b061411..30208b62 100644 --- a/src/scribe_data/check/check_missing_forms/get_forms.py +++ b/src/scribe_data/check/check_missing_forms/get_forms.py @@ -104,6 +104,7 @@ def parse_sparql_query(query_text): return result +# Debug line to parsed file. parse_sparql_files() diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index 36cc6bba..d33fa096 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -19,13 +19,13 @@ sparql.setMethod(POST) -def mediaWiki_query(query: str) -> dict: +def mediawiki_query(word: str) -> dict: """ Query the Wikidata API using a MediaWiki query. Parameters ---------- - query : str + word : str The MediaWiki query to execute. Returns @@ -34,8 +34,8 @@ def mediaWiki_query(query: str) -> dict: The JSON response from the API. """ url = ( - f"https://en.wiktionary.org/w/api.php?" - f"action=query&format=json&titles={query}/translations&prop=revisions&rvprop=content" + f"https://wikidata.org/w/api.php?" 
+ f"action=query&format=json&titles={word}/translations&prop=revisions&rvprop=content" ) response = requests.get(url) return response.json() diff --git a/src/scribe_data/wiktionary/parse_mediaWiki.py b/src/scribe_data/wiktionary/parse_mediaWiki.py index 319fe4da..e451830d 100644 --- a/src/scribe_data/wiktionary/parse_mediaWiki.py +++ b/src/scribe_data/wiktionary/parse_mediaWiki.py @@ -8,11 +8,14 @@ from pathlib import Path from scribe_data.utils import DEFAULT_MEDIAWIKI_EXPORT_DIR, get_language_from_iso -from scribe_data.wikidata.wikidata_utils import mediaWiki_query +from scribe_data.wikidata.wikidata_utils import mediawiki_query -def fetch_translation_page(word): - data = mediaWiki_query(word) +def fetch_translation_page(word: str): + """ + Fetches the translation for a given word via the Wiktionary MediaWiki API. + """ + data = mediawiki_query(word=word) pages = data.get("query", {}).get("pages", {}) # Extract page object from dictionary. From 06d77b284ab0c4aa1459e4723437dda820ac92bc Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 25 Jan 2025 12:24:04 +0100 Subject: [PATCH 13/13] Fixes to query all user flow and outputs and test changes --- src/scribe_data/cli/get.py | 38 ++++++++++++---------- src/scribe_data/cli/main.py | 3 -- src/scribe_data/wikidata/parse_dump.py | 36 ++++++++++---------- src/scribe_data/wikidata/wikidata_utils.py | 2 +- tests/cli/test_get.py | 10 +++--- 5 files changed, 46 insertions(+), 43 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 589bc7f4..c1f47756 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -3,7 +3,7 @@ Functions for getting languages-data types packs for the Scribe-Data CLI. """ -import os # for removing original JSON files +import os from pathlib import Path from typing import List, Union @@ -94,22 +94,13 @@ def prompt_user_download_all(): Checks with the user if they'd rather use Wikidata lexeme dumps before a download all call. """ return questionary.confirm( - "Do you want to query Wikidata directly? (selecting 'no' will use Wikidata lexeme dumps)", + "Do you want to query Wikidata directly? (selecting 'no' will use a Wikidata lexemes dump locally to avoid large Query Service calls)", default=False, ).ask() if all_bool: if language: if prompt_user_download_all(): - parse_wd_lexeme_dump( - language=language, - wikidata_dump_type=["form"], - data_types="all", - type_output_dir=output_dir, - wikidata_dump_path=wikidata_dump, - overwrite_all=overwrite, - ) - else: language_or_sub_language = language.split(" ")[0] print(f"Updating all data types for language: {language.title()}") query_data( @@ -122,17 +113,18 @@ def prompt_user_download_all(): f"Query completed for all data types for language {language.title()}." ) - elif data_type: - if prompt_user_download_all(): + else: parse_wd_lexeme_dump( - language="all", + language=language, wikidata_dump_type=["form"], - data_types=[data_type], + data_types="all", type_output_dir=output_dir, wikidata_dump_path=wikidata_dump, overwrite_all=overwrite, ) - else: + + elif data_type: + if prompt_user_download_all(): print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( languages=None, @@ -144,6 +136,16 @@ def prompt_user_download_all(): f"Query completed for all languages for data type {data_type.capitalize()}." 
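# Aside: the "--all" flow above follows a confirm-then-branch pattern -- one questionary
# prompt decides between querying Wikidata directly and parsing a local lexemes dump.
# A stripped-down sketch; query_wikidata and parse_local_dump are placeholder callables,
# not functions from the codebase.
import questionary

def query_wikidata() -> None:
    print("Querying the Wikidata Query Service...")

def parse_local_dump() -> None:
    print("Parsing a local Wikidata lexemes dump...")

def run() -> None:
    # Defaults to "no", i.e. prefer the local dump over large Query Service calls.
    if questionary.confirm(
        "Do you want to query Wikidata directly?", default=False
    ).ask():
        query_wikidata()
    else:
        parse_local_dump()

if __name__ == "__main__":
    run()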
) + else: + parse_wd_lexeme_dump( + language="all", + wikidata_dump_type=["form"], + data_types=[data_type], + type_output_dir=output_dir, + wikidata_dump_path=wikidata_dump, + overwrite_all=overwrite, + ) + else: print("Updating all languages and data types...") rprint( @@ -169,6 +171,7 @@ def prompt_user_download_all(): # If no language specified, use "all". if language is None: language = "all" + parse_wd_lexeme_dump( language=language, wikidata_dump_type=["translations"], @@ -182,8 +185,9 @@ def prompt_user_download_all(): elif wikidata_dump is not None: # If wikidata_dump is an empty string, use the default path. - if wikidata_dump == "": + if not wikidata_dump: wikidata_dump = DEFAULT_DUMP_EXPORT_DIR + parse_wd_lexeme_dump( language=language, wikidata_dump_type=["form"], diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 89ec60c2..1c08ca52 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -360,9 +360,6 @@ def main() -> None: parse_wiktionary_translations(args.translation, args.output_dir) else: - print( - f"Parsing Wikidata lexeme dump for {args.language} and {args.data_type}..." - ) get_data( language=args.language.lower() if args.language is not None diff --git a/src/scribe_data/wikidata/parse_dump.py b/src/scribe_data/wikidata/parse_dump.py index d0e8689c..e39b1ec3 100644 --- a/src/scribe_data/wikidata/parse_dump.py +++ b/src/scribe_data/wikidata/parse_dump.py @@ -373,7 +373,8 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> N self._save_by_language(filtered, filepath, language_iso, "translations") - # MARK: export forms + # MARK: Export Forms + def export_forms_json( self, filepath: str, language_iso: str = None, data_type: str = None ) -> None: @@ -393,7 +394,7 @@ def export_forms_json( Notes ----- - Creates a directory structure: //lexeme_.json + Creates a directory structure: //.json Skips export if no forms are found for the specified language and data type. """ if language_iso: @@ -421,7 +422,9 @@ def export_forms_json( # Check if filtered data is empty before saving. if not filtered: - print(f"No forms found for {lang_name} {data_type}, skipping export...") + print( + f"No forms found for {lang_name.capitalize()} {data_type}, skipping export..." + ) return # Create the output directory structure. @@ -445,17 +448,19 @@ def export_forms_json( output_path.mkdir(parents=True, exist_ok=True) # Create the full output filepath. - output_file = output_path / f"lexeme_{data_type}.json" + output_file = output_path / f"{data_type}.json" # Save the filtered data to JSON file. 
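# Aside: a pathlib sketch (not lines from the diff) of the export layout described above --
# sub-languages nest under their main language and each data type gets its own
# <data_type>.json file. language_metadata here is a tiny made-up stand-in, and
# build_output_file is an illustrative helper, not project code.
from pathlib import Path

language_metadata = {"chinese": {"sub_languages": {"mandarin": {"iso": "zh"}}}}

def build_output_file(filepath: str, lang_name: str, data_type: str) -> Path:
    # Find the parent language if lang_name is a sub-language.
    main_lang = next(
        (
            lang
            for lang, data in language_metadata.items()
            if lang_name in data.get("sub_languages", {})
        ),
        None,
    )
    base = Path(filepath).parent
    output_path = base / main_lang / lang_name if main_lang else base / lang_name
    return output_path / f"{data_type}.json"

print(build_output_file("scribe_data_json_export/placeholder", "mandarin", "nouns"))
# -> scribe_data_json_export/chinese/mandarin/nouns.json (POSIX path form)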
try: with open(output_file, "wb") as f: f.write(orjson.dumps(filtered, option=orjson.OPT_INDENT_2)) print( - f"Successfully exported forms for {lang_name} {data_type} to {output_file}" + f"Successfully exported forms for {lang_name.capitalize()} {data_type} to {output_file}" ) except Exception as e: - print(f"Error saving forms for {lang_name} {data_type}: {e}") + print( + f"Error saving forms for {lang_name.capitalize()} {data_type}: {e}" + ) def _save_by_language(self, filtered, filepath, language_iso, data_type): """ @@ -558,7 +563,7 @@ def parse_dump( if "translations" in parse_type: languages_to_process = [] for lang in languages: - index_path = Path(output_dir) / lang / "lexeme_translations.json" + index_path = Path(output_dir) / lang / "translations.json" if not check_index_exists(index_path, overwrite_all): languages_to_process.append(lang) @@ -591,19 +596,16 @@ def parse_dump( # Create appropriate path based on whether it's a sub-language. if main_lang: index_path = ( - Path(output_dir) - / main_lang - / lang - / f"lexeme_{data_type}.json" + Path(output_dir) / main_lang / lang / f"{data_type}.json" ) + else: - index_path = ( - Path(output_dir) / lang / f"lexeme_{data_type}.json" - ) + index_path = Path(output_dir) / lang / f"{data_type}.json" if not check_index_exists(index_path, overwrite_all): needs_processing = True data_types_to_process.add(data_type) + else: # Update path display in skip message. skip_path = ( @@ -644,7 +646,7 @@ def parse_dump( ), None, ): - index_path = Path(output_dir) / language / "lexeme_translations.json" + index_path = Path(output_dir) / language / "translations.json" # Ensure parent directory exists. index_path.parent.mkdir(parents=True, exist_ok=True) # print(f"Exporting translations for {language} to {index_path}"). @@ -654,9 +656,9 @@ def parse_dump( # (b) If "form" in parse_type -> export forms for each data_type in data_types. if "form" in parse_type: - # For each data_type, we create a separate file, e.g. lexeme_nouns.json. + # For each data_type, we create a separate file, e.g. nouns.json. for dt in data_types: - index_path = Path(output_dir) / f"lexeme_{dt}.json" + index_path = Path(output_dir) / f"{dt}.json" iso_codes = set() for word_data in processor.forms_index.values(): iso_codes.update(word_data.keys()) diff --git a/src/scribe_data/wikidata/wikidata_utils.py b/src/scribe_data/wikidata/wikidata_utils.py index d33fa096..7109620e 100644 --- a/src/scribe_data/wikidata/wikidata_utils.py +++ b/src/scribe_data/wikidata/wikidata_utils.py @@ -92,7 +92,7 @@ def parse_wd_lexeme_dump( if dt != "translations" and dt != "emoji-keywords" ] - print(f"Languages to process: {language}") + print(f"Languages to process: {[lang.capitalize() for lang in language]}") if "translations" not in wikidata_dump_type: print(f"Data types to process: {data_types}") diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 63af75a8..2cb52061 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -48,16 +48,16 @@ def test_invalid_arguments(self): @patch("scribe_data.cli.get.query_data") @patch("scribe_data.cli.get.parse_wd_lexeme_dump") @patch("scribe_data.cli.get.questionary.confirm") - def test_get_all_data_types_for_language_user_says_yes( + def test_get_all_data_types_for_language_user_says_no( self, mock_questionary_confirm, mock_parse, mock_query_data ): """ - Test the behavior when the user agrees to query Wikidata directly. + Test the behavior when the user agrees to use Wikidata lexeme dumps. 
This test checks that `parse_wd_lexeme_dump` is called with the correct parameters - when the user confirms they want to query Wikidata. + when the user confirms they don't want to query Wikidata. """ - mock_questionary_confirm.return_value.ask.return_value = True + mock_questionary_confirm.return_value.ask.return_value = False get_data(all_bool=True, language="English") @@ -321,7 +321,7 @@ def test_get_data_with_wikidata_identifier( when a Wikidata identifier is used. """ # Mock the user confirmation to return True (query Wikidata directly). - mock_questionary_confirm.return_value.ask.return_value = True + mock_questionary_confirm.return_value.ask.return_value = False get_data( language="Q9217",