diff --git a/src/app.py b/src/app.py
index d148407946d58cc3477a4110558509599706ff4c..9978ea0667d141ead048e24deebcdbf12a7985b4 100644
--- a/src/app.py
+++ b/src/app.py
@@ -6,6 +6,9 @@ from flask_cors import CORS
 import config
 from get_wikicode import get_wikicode
 from wiktextract_wrapper import Wiktextract
+from load_templates import load_templates
+
+load_templates()
 
 app = Flask(__name__)
 CORS(app)
diff --git a/src/load_templates.py b/src/load_templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f064600c69cc84c172d4f94d4af64f25e630f35
--- /dev/null
+++ b/src/load_templates.py
@@ -0,0 +1,130 @@
+"""Populate the local wikitextprocessor database with templates and modules."""
+
+from wiktextract.wiktionary import parse_wiktionary
+from wiktextract_context import get_wiktextract_context
+
+import os
+import re
+
+import threading
+import time
+
+
+DUMPS_DIR = "dumps"
+
+# Set to ask the progress thread to finish. An Event (rather than a plain flag
+# plus sleep) lets the thread wake up as soon as it is told to stop.
+_stop_indicator = threading.Event()
+
+
+def start_progress_indicator():
+    """Start the elapsed-time display in a background thread and return it."""
+    _stop_indicator.clear()
+    indicator_thread = threading.Thread(target=time_elapsed_indicator)
+    indicator_thread.start()
+
+    return indicator_thread
+
+
+def stop_progress_indicator(indicator_thread):
+    """Signal the elapsed-time display to finish and wait for its thread."""
+    _stop_indicator.set()
+    indicator_thread.join()
+
+
+def time_elapsed_indicator():
+    """Print the elapsed time on one console line every second until stopped."""
+    start_time = time.time()
+    while True:
+        elapsed_time = time.time() - start_time
+        # flush: the line has no trailing newline, so it could otherwise sit
+        # in the stdout buffer instead of reaching the terminal.
+        print(f"\rTime elapsed: {elapsed_time:.2f} seconds", end="", flush=True)
+        # wait() is the one-second tick and returns True once stop is requested.
+        if _stop_indicator.wait(1):
+            break
+    # Final reading, ended with a newline so later output starts on a new line.
+    elapsed_time = time.time() - start_time
+    print(f"\rTime elapsed: {elapsed_time:.2f} seconds")
+
+
+def get_most_recent_file(directory, lang_code):
+    """Return the path of the newest dump for *lang_code* in *directory*.
+
+    Dumps are recognised by the file name
+    ``<lang_code>wiktionary-<digits>-pages-articles-multistream.xml.bz2`` and
+    ranked by the numeric value of ``<digits>``. Returns ``None`` when the
+    directory does not exist or holds no matching file.
+    """
+    # Raw string so "\d" is a regex class rather than an invalid str escape;
+    # the language code and the dots are escaped so they match literally.
+    pattern = re.compile(
+        rf"{re.escape(lang_code)}wiktionary-(\d+)"
+        r"-pages-articles-multistream\.xml\.bz2"
+    )
+
+    try:
+        file_names = os.listdir(directory)
+    except FileNotFoundError:
+        return None
+
+    dated_files = []
+    for file_name in file_names:
+        # fullmatch: skip names with extra suffixes, e.g. partial downloads.
+        match = pattern.fullmatch(file_name)
+        if match:
+            dated_files.append((int(match.group(1)), file_name))
+
+    if not dated_files:
+        return None
+
+    _, most_recent_file = max(dated_files)
+
+    return os.path.join(directory, most_recent_file)
+
+
+def load_templates(wiktlang: str = "en"):
+    """Reload template and module pages for one Wiktionary edition.
+
+    Clears the ``pages`` table of the edition's database and refills it by
+    running the first parsing phase over the newest dump in ``DUMPS_DIR``.
+
+    Args:
+        wiktlang: Language code of the Wiktionary edition. Defaults to "en"
+            because app.py calls this function without arguments.
+
+    Raises:
+        ValueError: If no dump file for *wiktlang* is found.
+    """
+    print(f"Loading templates for {wiktlang}...")
+
+    # Locate the dump before touching the database, so a missing dump does
+    # not leave the pages table emptied.
+    dump_file = get_most_recent_file(DUMPS_DIR, wiktlang)
+
+    if not dump_file:
+        raise ValueError(f"Could not find dump file for {wiktlang}.")
+
+    indicator_thread = start_progress_indicator()
+
+    try:
+        wxr = get_wiktextract_context(wiktlang)
+
+        wxr.wtp.db_conn.execute("DELETE FROM pages")
+        wxr.wtp.db_conn.commit()
+
+        parse_wiktionary(
+            wxr,
+            dump_file,
+            num_processes=1,
+            phase1_only=True,
+            namespace_ids={10, 828},  # MediaWiki namespaces: Template, Module
+            out_f=None,  # type: ignore
+        )
+        wxr.wtp.db_conn.commit()
+    finally:
+        # Always stop the ticker: its non-daemon thread would otherwise keep
+        # printing and keep the process alive after an error.
+        stop_progress_indicator(indicator_thread)
+
+    print("Done loading templates.")
diff --git a/src/wiktextract_context.py b/src/wiktextract_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcf768754cf27b516c60d2fc8f02012c7dad68dc
--- /dev/null
+++ b/src/wiktextract_context.py
@@ -0,0 +1,39 @@
+from wiktextract import (
+    WiktextractContext,
+    WiktionaryConfig,
+)
+from wikitextprocessor import Wtp
+
+from typing import Optional
+
+
+def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
+    """Build a WiktextractContext for one Wiktionary edition.
+
+    Args:
+        wiktlang: Language code of the Wiktionary edition; it sets the dump
+            language and selects the ``./sqlite-<wiktlang>.db`` database file.
+        wordlang: If given, the only language code to capture; otherwise
+            ``capture_language_codes`` is passed as ``None``.
+
+    Returns:
+        A WiktextractContext wrapping a ``Wtp`` opened on that database file,
+        configured with the capture options listed below.
+    """
+    db_path = f"./sqlite-{wiktlang}.db"
+    config = WiktionaryConfig(
+        dump_file_lang_code=wiktlang,
+        capture_language_codes=[wordlang] if wordlang else None,
+        capture_translations=True,
+        capture_pronunciation=True,
+        capture_linkages=True,
+        capture_compounds=True,
+        capture_redirects=True,
+        capture_examples=True,
+        capture_etymologies=True,
+        capture_descendants=True,
+        capture_inflections=True,
+    )
+    wxr = WiktextractContext(Wtp(db_path=db_path), config)
+
+    return wxr
diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
index b3c46b477092eef9ea7728f3151544efb4aa6d89..f748e68baa9125d2e65d891c3f01d024b14d9ef6 100644
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -1,9 +1,7 @@
-from wiktextract import (
-    WiktextractContext,
-    WiktionaryConfig,
-)
 from wiktextract.wiktionary import page_handler
-from wikitextprocessor import Wtp, Page
+from wikitextprocessor import Page
+
+from wiktextract_context import get_wiktextract_context
 
 db_path = "./sqlite.db"
 
@@ -18,23 +16,8 @@ class Wiktextract:
         self.wiktlang = wiktlang
         self.wordlang = wordlang
 
-        config = WiktionaryConfig(
-            dump_file_lang_code=wiktlang,
-            capture_language_codes=[wordlang],
-            capture_translations=True,
-            capture_pronunciation=True,
-            capture_linkages=True,
-            capture_compounds=True,
-            capture_redirects=True,
-            capture_examples=True,
-            capture_etymologies=True,
-            capture_descendants=True,
-            capture_inflections=True,
-        )
-        wxr = WiktextractContext(Wtp(db_path=db_path), config)
-
         self.page_handler = page_handler
-        self.page_handler.wxr = wxr
+        self.page_handler.wxr = get_wiktextract_context(wiktlang, wordlang)
 
     def parse_page(self, title: str, wikicode: str):
         # add page to the database (making it accessible to LUA templates)