diff --git a/src/load_templates.py b/src/load_dumps.py similarity index 85% rename from src/load_templates.py rename to src/load_dumps.py index 17e132b9bec5e697732118fb5aa3f742efe09608..a9c9ba97c36702355ef46709e00029aff8beb964 100644 --- a/src/load_templates.py +++ b/src/load_dumps.py @@ -1,16 +1,26 @@ -from wiktextract.wiktionary import parse_wiktionary -from wiktextract_context import get_wiktextract_context - import os import re - import threading import time from typing import List +from wiktextract.wiktionary import parse_wiktionary + +from wiktextract_context import get_wiktextract_context DUMPS_DIR = "dumps" +RECOGNIZED_NAMESPACE_NAMES = [ + "Main", + "Category", + "Appendix", + "Project", + "Thesaurus", + "Module", + "Template", + "Reconstruction", +] + def start_progress_indicator(is_done: List[bool], msg: str = ""): is_done[0] = False @@ -64,20 +74,21 @@ def load_templates(wiktlang: str): is_done, msg=f"Loading templates for {wiktlang}..." ) - wxr = get_wiktextract_context(wiktlang, mock_get_page=False) + wxr = get_wiktextract_context(wiktlang) wxr.wtp.db_conn.execute("DELETE FROM pages") wxr.wtp.db_conn.commit() + namespace_ids = { + wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id") + for name in RECOGNIZED_NAMESPACE_NAMES + } parse_wiktionary( wxr, dump_file, num_processes=1, phase1_only=True, - namespace_ids={ - 10, - 828, - }, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file + namespace_ids=namespace_ids, out_f=None, # type: ignore ) wxr.wtp.db_conn.commit()