diff --git a/Dockerfile b/Dockerfile index b1d319c12215a3ee9ccb43dc058b9267f4b0bf9a..7fc8758fc7130486f3672eaab624718850f54273 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt COPY /src/. ./src/ COPY /dumps/. ./dumps/ -RUN python ./src/load_templates.py +RUN python ./src/load_dumps.py RUN rm -rf ./dumps EXPOSE 80 diff --git a/README.md b/README.md index caa04c77e1e087b736aa7480c84c9ad77ae6b15b..76a58e117e750ef39ff81e4c24b40e6d17884246 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio ### 4. Load templates from dump files -Run the script `src/load_templates.py` to extract module and template pages from the dumpfile into an sqlite database that will be used by `wiktextract`. +Run the script `src/load_dumps.py` to load the dumpfile into an sqlite database that will be used by `wiktextract`. ``` -python src/load_templates.py +python src/load_dumps.py ``` ### 5. Start flask app @@ -50,10 +50,13 @@ flask --app src/app.py run ``` ## Using Docker -Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`. + +Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`. Then perform the following two steps: + ### 2. Build image + ``` docker build -t live-query-wiktextract . 
``` diff --git a/requirements.txt b/requirements.txt index 72601b8a1b5bf6e9a057a6d83a8804d433acc2f2..d049bcce64ea7801ad20e3712c38d01341e531d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -pywikibot==8.3.2 Flask==2.3.3 flask_cors==4.0.0 wiktextract @ git+https://github.com/tatuylonen/wiktextract.git diff --git a/src/app.py b/src/app.py index 4884517cf63f3f79d27d4a77c03d9eb72e49e545..1c50351ed7a99622bd77fbab59ace14317affb36 100644 --- a/src/app.py +++ b/src/app.py @@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request from flask_cors import CORS import config -from get_wikicode import get_wikicode from wiktextract_wrapper import Wiktextract app = Flask(__name__) @@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word): if wiktlang not in config.supported_wiktlangs: return jsonify({"error": f"Language {wiktlang} not supported"}), 400 - wikicode = get_wikicode(word, wiktlang) - if wikicode: - wiktextractor = Wiktextract(wiktlang, wordlang) - try: - resp = wiktextractor.parse_page(word, wikicode) + wiktextractor = Wiktextract(wiktlang, wordlang) + try: + resp = wiktextractor.parse_page(word) + if resp: return jsonify(resp) - - except Exception as e: - print(e) - - return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 - finally: - wiktextractor.wxr.wtp.db_conn.close() - if wiktextractor.wxr.thesaurus_db_conn: - wiktextractor.wxr.thesaurus_db_conn.close() - - else: - return ( - jsonify( - { - "error": f"{word} is unknown in “{wordlang}†in {wiktlang}.wiktionary.org." - } - ), - 404, - ) + else: + return ( + jsonify( + { + "error": f"{word} is unknown in “{wordlang}†in {wiktlang}.wiktionary.org." 
+ } + ), + 404, + ) + + except Exception as e: + print(e) + + return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 + finally: + wiktextractor.wxr.wtp.db_conn.close() + if wiktextractor.wxr.thesaurus_db_conn: + wiktextractor.wxr.thesaurus_db_conn.close() if __name__ == "__main__": diff --git a/src/get_wikicode.py b/src/get_wikicode.py deleted file mode 100644 index 24291977bae17f9bfe128d43f80202aeb7b3eea7..0000000000000000000000000000000000000000 --- a/src/get_wikicode.py +++ /dev/null @@ -1,7 +0,0 @@ -import pywikibot - - -def get_wikicode(title: str, wiktlang: str): - site = pywikibot.Site(f"wiktionary:{wiktlang}") - page = pywikibot.Page(site, title) - return page.text diff --git a/src/load_templates.py b/src/load_dumps.py similarity index 85% rename from src/load_templates.py rename to src/load_dumps.py index 7f6def5e87c28639cbd1431ae734796400c48077..a9c9ba97c36702355ef46709e00029aff8beb964 100644 --- a/src/load_templates.py +++ b/src/load_dumps.py @@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context DUMPS_DIR = "dumps" +RECOGNIZED_NAMESPACE_NAMES = [ + "Main", + "Category", + "Appendix", + "Project", + "Thesaurus", + "Module", + "Template", + "Reconstruction", +] + def start_progress_indicator(is_done: List[bool], msg: str = ""): is_done[0] = False @@ -63,20 +74,21 @@ def load_templates(wiktlang: str): is_done, msg=f"Loading templates for {wiktlang}..." 
) - wxr = get_wiktextract_context(wiktlang, mock_get_page=False) + wxr = get_wiktextract_context(wiktlang) wxr.wtp.db_conn.execute("DELETE FROM pages") wxr.wtp.db_conn.commit() + namespace_ids = { + wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id") + for name in RECOGNIZED_NAMESPACE_NAMES + } parse_wiktionary( wxr, dump_file, num_processes=1, phase1_only=True, - namespace_ids={ - 10, - 828, - }, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file + namespace_ids=namespace_ids, out_f=None, # type: ignore ) wxr.wtp.db_conn.commit() diff --git a/src/wiktextract_context.py b/src/wiktextract_context.py index 2cbabf19989958a292f5a35eaccd72ec34641ab8..ce88f3158ab3fe36aa7dea6db4549f43e7524fd6 100644 --- a/src/wiktextract_context.py +++ b/src/wiktextract_context.py @@ -1,37 +1,11 @@ from typing import Optional -from wikitextprocessor import Page, Wtp +from wikitextprocessor import Wtp from wiktextract import WiktextractContext, WiktionaryConfig -from get_wikicode import get_wikicode - -class CustomWtp(Wtp): - def get_page( - self, - title: str, - namespace_id: Optional[int] = None, - no_redirect: bool = False, - ) -> Optional[Page]: - # Call the original get_page method - original_result = super().get_page(title, namespace_id, no_redirect) - - if original_result == None: - # The db is often called with titles like "tracking/parameters/empty parameter". These seem to return None by design and are not present in Wiktionary. Skip these. - if "/translations" in title and not "tracking" in title: - print(f"Page '{title}' not found in db. 
Fetching from live wiktionary.") - - body = get_wikicode(title, self.lang_code) - - return Page(title, namespace_id, body=body) - - return original_result - - -def get_wiktextract_context( - wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True -): - db_path = f"./sqlite-{wiktlang}.db" +def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None): + db_path = f"./sqlite-{wiktlang}-all.db" config = WiktionaryConfig( dump_file_lang_code=wiktlang, capture_language_codes=[wordlang] if wordlang else None, @@ -46,11 +20,7 @@ def get_wiktextract_context( capture_inflections=True, ) config.load_edition_settings() - wtp = ( - CustomWtp(db_path=db_path, lang_code=wiktlang) - if mock_get_page - else Wtp(db_path=db_path, lang_code=wiktlang) - ) + wtp = Wtp(db_path=db_path, lang_code=wiktlang) wxr = WiktextractContext(wtp, config) return wxr diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py index 79f78b2b28777f81acec6ffe026f0afb304cb23d..c1eab07b27d58c1db1cbaf6576894ea1782ab3fd 100644 --- a/src/wiktextract_wrapper.py +++ b/src/wiktextract_wrapper.py @@ -2,41 +2,18 @@ from wiktextract.page import parse_page from wiktextract_context import get_wiktextract_context -db_path = "./sqlite.db" - -DEFAULT_PAGE_VALUES = { - "namespace_id": 0, - "model": "wikitext", -} - class Wiktextract: def __init__(self, wiktlang: str, wordlang: str): self.wiktlang = wiktlang self.wordlang = wordlang - # self.page_handler = page_handler - # self.page_handler.wxr : WiktextractContext = self.wxr = get_wiktextract_context(wiktlang, wordlang) - def parse_page(self, title: str, wikicode: str): - # add page to the database (making it accessible to LUA templates) - self.wxr.wtp.add_page( - title=title, - namespace_id=DEFAULT_PAGE_VALUES["namespace_id"], - body=wikicode, - model=DEFAULT_PAGE_VALUES["model"], - ) - - self.wxr.wtp.start_page(title) - - result = parse_page(self.wxr, title, wikicode) - - # remove the page from the database - 
self.wxr.wtp.db_conn.execute( - "DELETE FROM pages WHERE title = ? AND model = ?", - (title, DEFAULT_PAGE_VALUES["model"]), - ) - self.wxr.wtp.db_conn.commit() + def parse_page(self, title: str): + page = self.wxr.wtp.get_page(title) + if not page: + return None + result = parse_page(self.wxr, title, page.body) return result