Skip to content
Snippets Groups Projects
Commit 482bb453 authored by Empiriker's avatar Empiriker
Browse files

load templates from dumps on server startup

parent e000302a
No related merge requests found
......@@ -6,6 +6,9 @@ from flask_cors import CORS
import config
from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract
from load_templates import load_templates
load_templates()
app = Flask(__name__)
CORS(app)
......
from wiktextract.wiktionary import parse_wiktionary
from wiktextract_context import get_wiktextract_context
import os
import re
import threading
import time
DUMPS_DIR = "dumps"
def start_progress_indicator():
    """Start the elapsed-time printer on a background thread.

    Returns:
        The started ``threading.Thread``; hand it to
        ``stop_progress_indicator`` to end the output and wait for the thread.
    """
    # The stop flag lives as an attribute on the worker function itself and
    # has to be cleared before the thread begins polling it.
    time_elapsed_indicator.stop = False
    worker = threading.Thread(target=time_elapsed_indicator)
    worker.start()
    return worker
def stop_progress_indicator(indicator_thread):
    """Ask the elapsed-time printer to finish and wait until it has.

    Args:
        indicator_thread: The thread returned by ``start_progress_indicator``.
    """
    # Raise the flag the worker loop polls, then block until the worker has
    # printed its final line and exited.
    time_elapsed_indicator.stop = True
    indicator_thread.join()
def time_elapsed_indicator():
    """Print a once-per-second "time elapsed" line until told to stop.

    The loop polls the ``stop`` attribute set on this function object (by
    ``start_progress_indicator`` / ``stop_progress_indicator``). While
    running, the line is rewritten in place using a carriage return; once
    stopped, the last measured value is printed followed by a newline.
    """
    start_time = time.time()
    # Initialised up front so the final print is well-defined even when the
    # stop flag is already set before the first loop iteration.
    elapsed_time = 0.0
    while not time_elapsed_indicator.stop:
        elapsed_time = time.time() - start_time
        print(f"\rTime elapsed: {elapsed_time:.2f} seconds", end="")
        time.sleep(1)
    print(
        "\rTime elapsed: {elapsed_time:.2f} seconds".format(
            elapsed_time=elapsed_time
        )
    )
def get_most_recent_file(directory, lang_code):
    """Return the path of the newest Wiktionary dump for ``lang_code``.

    Dump files are expected to be named
    ``<lang_code>wiktionary-<digits>-pages-articles-multistream.xml.bz2``;
    the file with the largest ``<digits>`` stamp wins.

    Args:
        directory: Directory to search (not recursive).
        lang_code: Wiktionary edition code prefixing the file name.

    Returns:
        ``os.path.join(directory, name)`` for the newest matching file, or
        ``None`` when no file in ``directory`` matches.
    """
    # Raw string plus escaping: the dots are literal and ``lang_code`` cannot
    # inject regex syntax.
    pattern = re.compile(
        re.escape(lang_code)
        + r"wiktionary-(\d+)-pages-articles-multistream\.xml\.bz2"
    )

    newest_name = None
    newest_stamp = -1
    for name in os.listdir(directory):
        # fullmatch rejects names with trailing text, e.g. partial downloads
        # such as "...xml.bz2.part".
        found = pattern.fullmatch(name)
        if found is None:
            continue
        stamp = int(found.group(1))
        if stamp > newest_stamp:
            newest_name, newest_stamp = name, stamp

    if newest_name is None:
        return None
    return os.path.join(directory, newest_name)
def load_templates(wiktlang: str):
    """Rebuild the cached pages for ``wiktlang`` from the newest local dump.

    Looks up the most recent dump for ``wiktlang`` in ``DUMPS_DIR``, clears
    the ``pages`` table of the context's database and runs phase 1 of
    ``parse_wiktionary`` restricted to namespaces 10 and 828 (MediaWiki's
    Template and Module namespaces). An elapsed-time indicator is printed
    while the work is in progress.

    Args:
        wiktlang: Language code of the Wiktionary edition to load.

    Raises:
        ValueError: If no dump file for ``wiktlang`` exists in ``DUMPS_DIR``.
    """
    print(f"Loading templates for {wiktlang}...")
    indicator_thread = start_progress_indicator()
    try:
        wxr = get_wiktextract_context(wiktlang)

        # Locate the dump before touching the database, so a missing dump
        # does not leave the previously loaded pages wiped.
        dump_file = get_most_recent_file(DUMPS_DIR, wiktlang)
        if not dump_file:
            raise ValueError(f"Could not find dump file for {wiktlang}.")

        # Start from a clean slate so stale pages from an older dump do not
        # linger next to the freshly loaded ones.
        wxr.wtp.db_conn.execute("DELETE FROM pages")
        wxr.wtp.db_conn.commit()

        parse_wiktionary(
            wxr,
            dump_file,
            num_processes=1,
            phase1_only=True,
            namespace_ids={10, 828},
            out_f=None,  # type: ignore
        )
        wxr.wtp.db_conn.commit()
    finally:
        # Always stop the indicator: its thread is non-daemon, so leaving it
        # running after an error would keep printing and block process exit.
        stop_progress_indicator(indicator_thread)

    print("Done loading templates.")
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wikitextprocessor import Wtp
from typing import Optional
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
    """Build a ``WiktextractContext`` for one Wiktionary edition.

    Args:
        wiktlang: Language code of the Wiktionary edition; also selects the
            SQLite file ``./sqlite-<wiktlang>.db`` handed to ``Wtp``.
        wordlang: Optional language code to capture. When falsy,
            ``capture_language_codes`` is passed as ``None``.

    Returns:
        A ``WiktextractContext`` wrapping a ``Wtp`` bound to the per-edition
        database and a ``WiktionaryConfig`` with every capture flag enabled.
    """
    if wordlang:
        language_codes = [wordlang]
    else:
        language_codes = None

    # Every capture option is switched on; list them once and expand below.
    capture_flags = dict.fromkeys(
        (
            "capture_translations",
            "capture_pronunciation",
            "capture_linkages",
            "capture_compounds",
            "capture_redirects",
            "capture_examples",
            "capture_etymologies",
            "capture_descendants",
            "capture_inflections",
        ),
        True,
    )
    wikt_config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=language_codes,
        **capture_flags,
    )

    processor = Wtp(db_path=f"./sqlite-{wiktlang}.db")
    return WiktextractContext(processor, wikt_config)
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wiktextract.wiktionary import page_handler
from wikitextprocessor import Wtp, Page
from wikitextprocessor import Page
from wiktextract_context import get_wiktextract_context
db_path = "./sqlite.db"
......@@ -18,23 +16,8 @@ class Wiktextract:
self.wiktlang = wiktlang
self.wordlang = wordlang
config = WiktionaryConfig(
dump_file_lang_code=wiktlang,
capture_language_codes=[wordlang],
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
capture_compounds=True,
capture_redirects=True,
capture_examples=True,
capture_etymologies=True,
capture_descendants=True,
capture_inflections=True,
)
wxr = WiktextractContext(Wtp(db_path=db_path), config)
self.page_handler = page_handler
self.page_handler.wxr = wxr
self.page_handler.wxr = get_wiktextract_context(wiktlang, wordlang)
def parse_page(self, title: str, wikicode: str):
# add page to the database (making it accessible to LUA templates)
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment