Skip to content
Snippets Groups Projects
Commit 482bb453 authored by Empiriker's avatar Empiriker
Browse files

load templates from dumps on server startup

parent e000302a
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,9 @@ from flask_cors import CORS
import config
from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract
from load_templates import load_templates
# Templates are loaded at import time, before the Flask app object is created.
# NOTE(review): load_templates is declared in this changeset as
# load_templates(wiktlang: str); this zero-argument call would raise a
# TypeError on startup — confirm which language code(s) should be passed.
load_templates()
app = Flask(__name__)
CORS(app)
......
from wiktextract.wiktionary import parse_wiktionary
from wiktextract_context import get_wiktextract_context
import os
import re
import threading
import time
DUMPS_DIR = "dumps"
def start_progress_indicator():
    """Start the elapsed-time printer on a background thread.

    Returns the started thread so the caller can later hand it to
    ``stop_progress_indicator``.
    """
    # The worker polls this function attribute; clear it before starting.
    time_elapsed_indicator.stop = False
    worker = threading.Thread(target=time_elapsed_indicator)
    worker.start()
    return worker
def stop_progress_indicator(indicator_thread):
    """Signal the elapsed-time printer to finish and wait for its thread.

    ``indicator_thread`` is the thread object returned by
    ``start_progress_indicator``.
    """
    # Raise the flag the worker loop checks on each iteration ...
    time_elapsed_indicator.stop = True
    # ... then block until the worker has printed its final line and exited.
    indicator_thread.join()
def time_elapsed_indicator():
    """Print a once-per-second "time elapsed" line until asked to stop.

    The loop runs until the function attribute ``time_elapsed_indicator.stop``
    becomes truthy (a missing attribute is treated as "not stopped"). Each
    tick rewrites the same terminal line via a leading carriage return; on
    exit the last measured value is printed once more, terminated by a
    newline.
    """
    start_time = time.time()
    # Initialise up front so the final print is valid even when the stop flag
    # was already set before the first iteration.
    elapsed_time = 0.0
    while not getattr(time_elapsed_indicator, "stop", False):
        elapsed_time = time.time() - start_time
        # flush=True: with end="" no newline is written, so a line-buffered
        # stdout would otherwise hold the update back.
        print(f"\rTime elapsed: {elapsed_time:.2f} seconds", end="", flush=True)
        time.sleep(1)
    print(f"\rTime elapsed: {elapsed_time:.2f} seconds")
def get_most_recent_file(directory, lang_code):
    """Return the path of the newest Wiktionary dump for ``lang_code``.

    Looks in ``directory`` for files named exactly
    ``<lang_code>wiktionary-<digits>-pages-articles-multistream.xml.bz2`` and
    picks the one whose ``<digits>`` part is numerically largest.

    Returns the joined path ``directory/<file name>``, or ``None`` when no
    file matches.
    """
    # Raw string plus re.escape so the language code and the dots are matched
    # literally.
    pattern = re.compile(
        rf"{re.escape(lang_code)}wiktionary-(\d+)-pages-articles-multistream\.xml\.bz2"
    )
    candidates = []
    for file_name in os.listdir(directory):
        # fullmatch rejects names with trailing text, such as
        # "...xml.bz2.part" left behind by an unfinished download.
        found = pattern.fullmatch(file_name)
        if found:
            candidates.append((int(found.group(1)), file_name))
    if not candidates:
        return None
    _, newest_name = max(candidates)
    return os.path.join(directory, newest_name)
def load_templates(wiktlang: str):
    """Rebuild the page database for ``wiktlang`` from its newest dump file.

    Finds the most recent dump in ``DUMPS_DIR``, clears the ``pages`` table of
    the wiktextract context's database, and runs phase 1 of
    ``parse_wiktionary`` restricted to namespace ids 10 and 828. A progress
    indicator runs on a background thread for the duration.

    Raises:
        ValueError: if no dump file for ``wiktlang`` exists in ``DUMPS_DIR``.
    """
    print(f"Loading templates for {wiktlang}...")

    # Resolve the dump before touching the database or starting the indicator,
    # so a missing dump fails fast without wiping the existing pages.
    dump_file = get_most_recent_file(DUMPS_DIR, wiktlang)
    if not dump_file:
        raise ValueError(f"Could not find dump file for {wiktlang}.")

    indicator_thread = start_progress_indicator()
    try:
        wxr = get_wiktextract_context(wiktlang)

        # Start from an empty pages table so stale entries do not survive.
        wxr.wtp.db_conn.execute("DELETE FROM pages")
        wxr.wtp.db_conn.commit()

        parse_wiktionary(
            wxr,
            dump_file,
            num_processes=1,
            phase1_only=True,
            # Presumably MediaWiki's Template (10) and Module (828)
            # namespaces — TODO confirm.
            namespace_ids={10, 828},
            out_f=None,  # type: ignore
        )
        wxr.wtp.db_conn.commit()
    finally:
        # Always stop the indicator; otherwise its non-daemon thread would
        # keep printing and keep the process alive after a failure.
        stop_progress_indicator(indicator_thread)
    print("Done loading templates.")
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wikitextprocessor import Wtp
from typing import Optional
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
    """Build a ``WiktextractContext`` for the given Wiktionary edition.

    Args:
        wiktlang: Language code of the Wiktionary edition; it is used as the
            dump file language code and in the SQLite file name.
        wordlang: Optional language code to capture. When omitted or empty,
            ``capture_language_codes`` is passed as ``None``.

    Returns:
        A ``WiktextractContext`` wrapping a ``Wtp`` backed by
        ``./sqlite-<wiktlang>.db`` with every capture option enabled.
    """
    # Only narrow the captured languages when a word language was requested.
    if wordlang:
        language_codes = [wordlang]
    else:
        language_codes = None

    wiktionary_config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=language_codes,
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_descendants=True,
        capture_inflections=True,
    )
    processor = Wtp(db_path=f"./sqlite-{wiktlang}.db")
    return WiktextractContext(processor, wiktionary_config)
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wiktextract.wiktionary import page_handler
from wikitextprocessor import Wtp, Page
from wikitextprocessor import Page
from wiktextract_context import get_wiktextract_context
db_path = "./sqlite.db"
......@@ -18,23 +16,8 @@ class Wiktextract:
self.wiktlang = wiktlang
self.wordlang = wordlang
config = WiktionaryConfig(
dump_file_lang_code=wiktlang,
capture_language_codes=[wordlang],
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
capture_compounds=True,
capture_redirects=True,
capture_examples=True,
capture_etymologies=True,
capture_descendants=True,
capture_inflections=True,
)
wxr = WiktextractContext(Wtp(db_path=db_path), config)
self.page_handler = page_handler
self.page_handler.wxr = wxr
self.page_handler.wxr = get_wiktextract_context(wiktlang, wordlang)
def parse_page(self, title: str, wikicode: str):
# add page to the database (making it accessible to LUA templates)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment