From a6d161abb2e98463c02b0e6728a7ea300ea8df3d Mon Sep 17 00:00:00 2001 From: Empiriker <till.ueberfries@gmail.com> Date: Wed, 18 Oct 2023 14:24:38 +0300 Subject: [PATCH] Load whole dumps instead of only templates --- src/{load_templates.py => load_dumps.py} | 29 ++++++++++++++++-------- 1 file changed, 20 insertions(+), 9 deletions(-) rename src/{load_templates.py => load_dumps.py} (85%) diff --git a/src/load_templates.py b/src/load_dumps.py similarity index 85% rename from src/load_templates.py rename to src/load_dumps.py index 17e132b..a9c9ba9 100644 --- a/src/load_templates.py +++ b/src/load_dumps.py @@ -1,16 +1,26 @@ -from wiktextract.wiktionary import parse_wiktionary -from wiktextract_context import get_wiktextract_context - import os import re - import threading import time from typing import List +from wiktextract.wiktionary import parse_wiktionary + +from wiktextract_context import get_wiktextract_context DUMPS_DIR = "dumps" +RECOGNIZED_NAMESPACE_NAMES = [ + "Main", + "Category", + "Appendix", + "Project", + "Thesaurus", + "Module", + "Template", + "Reconstruction", +] + def start_progress_indicator(is_done: List[bool], msg: str = ""): is_done[0] = False @@ -64,20 +74,21 @@ def load_templates(wiktlang: str): is_done, msg=f"Loading templates for {wiktlang}..." ) - wxr = get_wiktextract_context(wiktlang, mock_get_page=False) + wxr = get_wiktextract_context(wiktlang) wxr.wtp.db_conn.execute("DELETE FROM pages") wxr.wtp.db_conn.commit() + namespace_ids = { + wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id") + for name in RECOGNIZED_NAMESPACE_NAMES + } parse_wiktionary( wxr, dump_file, num_processes=1, phase1_only=True, - namespace_ids={ - 10, - 828, - }, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file + namespace_ids=namespace_ids, out_f=None, # type: ignore ) wxr.wtp.db_conn.commit() -- GitLab