Skip to content
Snippets Groups Projects
Commit a6d161ab authored by Empiriker
Browse files

Load whole dumps instead of only templates

parent 31b3a443
No related branches found
No related tags found
No related merge requests found
from wiktextract.wiktionary import parse_wiktionary
from wiktextract_context import get_wiktextract_context
import os import os
import re import re
import threading import threading
import time import time
from typing import List from typing import List
from wiktextract.wiktionary import parse_wiktionary
from wiktextract_context import get_wiktextract_context
DUMPS_DIR = "dumps" DUMPS_DIR = "dumps"
RECOGNIZED_NAMESPACE_NAMES = [
"Main",
"Category",
"Appendix",
"Project",
"Thesaurus",
"Module",
"Template",
"Reconstruction",
]
def start_progress_indicator(is_done: List[bool], msg: str = ""): def start_progress_indicator(is_done: List[bool], msg: str = ""):
is_done[0] = False is_done[0] = False
...@@ -64,20 +74,21 @@ def load_templates(wiktlang: str): ...@@ -64,20 +74,21 @@ def load_templates(wiktlang: str):
is_done, msg=f"Loading templates for {wiktlang}..." is_done, msg=f"Loading templates for {wiktlang}..."
) )
wxr = get_wiktextract_context(wiktlang, mock_get_page=False) wxr = get_wiktextract_context(wiktlang)
wxr.wtp.db_conn.execute("DELETE FROM pages") wxr.wtp.db_conn.execute("DELETE FROM pages")
wxr.wtp.db_conn.commit() wxr.wtp.db_conn.commit()
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
}
parse_wiktionary( parse_wiktionary(
wxr, wxr,
dump_file, dump_file,
num_processes=1, num_processes=1,
phase1_only=True, phase1_only=True,
namespace_ids={ namespace_ids=namespace_ids,
10,
828,
}, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file
out_f=None, # type: ignore out_f=None, # type: ignore
) )
wxr.wtp.db_conn.commit() wxr.wtp.db_conn.commit()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment