Merge branch 'no-live'

c1ed2da7 · Empiriker · 5feef9f7 · 9908c85c · c1ed2da7 · c1ed2da7
Commit c1ed2da7 authored 1 year ago by Empiriker
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY /src/. ./src/
 COPY /dumps/. ./dumps/
-RUN python ./src/load_templates.py
+RUN python ./src/load_dumps.py
 RUN rm -rf ./dumps
 EXPOSE 80

--- a/README.md
+++ b/README.md
@@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio
 ### 4. Load templates from dump files
-Run the script `src/load_templates.py` to extract module and template pages from the dumpfile into an sqlite database that will be used by `wiktextract`.
+Run the script `src/load_dumps.py` to load the dumpfile into an sqlite database that will be used by `wiktextract`.
 ```
-python src/load_templates.py
+python src/load_dumps.py
 ```
 ### 5. Start flask app
@@ -50,10 +50,13 @@ flask --app src/app.py run
 ```
 ## Using Docker
-Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`. 
+Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.
 Then performs the two steps:
 ### 2. Build image
 ```
 docker build -t live-query-wiktextract .
 ```

--- a/requirements.txt
+++ b/requirements.txt
-pywikibot==8.3.2
 Flask==2.3.3
 flask_cors==4.0.0
 wiktextract @ git+https://github.com/tatuylonen/wiktextract.git
--- a/src/app.py
+++ b/src/app.py
@@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request
 from flask_cors import CORS
 import config
-from get_wikicode import get_wikicode
 from wiktextract_wrapper import Wiktextract
 app = Flask(__name__)
@@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word):
    if wiktlang not in config.supported_wiktlangs:
        return jsonify({"error": f"Language {wiktlang} not supported"}), 400
-    wikicode = get_wikicode(word, wiktlang)
+    wiktextractor = Wiktextract(wiktlang, wordlang)
-    if wikicode:
+    try:
-        wiktextractor = Wiktextract(wiktlang, wordlang)
+        resp = wiktextractor.parse_page(word)
-        try:
+        if resp:
-            resp = wiktextractor.parse_page(word, wikicode)
            return jsonify(resp)
+        else:
-        except Exception as e:
+            return (
-            print(e)
+                jsonify(
+                    {
-            return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
+                        "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
-        finally:
+                    }
-            wiktextractor.wxr.wtp.db_conn.close()
+                ),
-            if wiktextractor.wxr.thesaurus_db_conn:
+                404,
-                wiktextractor.wxr.thesaurus_db_conn.close()
+            )
-    else:
+    except Exception as e:
-        return (
+        print(e)
-            jsonify(
-                {
+        return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
-                    "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
+    finally:
-                }
+        wiktextractor.wxr.wtp.db_conn.close()
-            ),
+        if wiktextractor.wxr.thesaurus_db_conn:
-            404,
+            wiktextractor.wxr.thesaurus_db_conn.close()
-        )
 if __name__ == "__main__":

--- a/src/get_wikicode.py
+++ b/src/get_wikicode.py
-import pywikibot
-def get_wikicode(title: str, wiktlang: str):
-    site = pywikibot.Site(f"wiktionary:{wiktlang}")
-    page = pywikibot.Page(site, title)
-    return page.text
--- a/src/load_templates.py
+++ b/src/load_templates.py
@@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context
 DUMPS_DIR = "dumps"
+RECOGNIZED_NAMESPACE_NAMES = [
+    "Main",
+    "Category",
+    "Appendix",
+    "Project",
+    "Thesaurus",
+    "Module",
+    "Template",
+    "Reconstruction",
+]
 def start_progress_indicator(is_done: List[bool], msg: str = ""):
    is_done[0] = False
@@ -63,20 +74,21 @@ def load_templates(wiktlang: str):
        is_done, msg=f"Loading templates for {wiktlang}..."
    )
-    wxr = get_wiktextract_context(wiktlang, mock_get_page=False)
+    wxr = get_wiktextract_context(wiktlang)
    wxr.wtp.db_conn.execute("DELETE FROM pages")
    wxr.wtp.db_conn.commit()
+    namespace_ids = {
+        wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
+        for name in RECOGNIZED_NAMESPACE_NAMES
+    }
    parse_wiktionary(
        wxr,
        dump_file,
        num_processes=1,
        phase1_only=True,
-        namespace_ids={
+        namespace_ids=namespace_ids,
-            10,
-            828,
-        },  # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file
        out_f=None,  # type: ignore
    )
    wxr.wtp.db_conn.commit()

--- a/src/wiktextract_context.py
+++ b/src/wiktextract_context.py
 from typing import Optional
-from wikitextprocessor import Page, Wtp
+from wikitextprocessor import Wtp
 from wiktextract import WiktextractContext, WiktionaryConfig
-from get_wikicode import get_wikicode
+def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
-class CustomWtp(Wtp):
+    db_path = f"./sqlite-{wiktlang}-all.db"
-    def get_page(
-        self,
-        title: str,
-        namespace_id: Optional[int] = None,
-        no_redirect: bool = False,
-    ) -> Optional[Page]:
-        # Call the original get_page method
-        original_result = super().get_page(title, namespace_id, no_redirect)
-        if original_result == None:
-            # The db is often called with titles like "tracking/parameters/empty parameter". These seem to return None by design and are not present in Wiktionary. Skip these.
-            if "/translations" in title and not "tracking" in title:
-                print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
-                body = get_wikicode(title, self.lang_code)
-                return Page(title, namespace_id, body=body)
-        return original_result
-def get_wiktextract_context(
-    wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True
-):
-    db_path = f"./sqlite-{wiktlang}.db"
    config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=[wordlang] if wordlang else None,
@@ -46,11 +20,7 @@ def get_wiktextract_context(
        capture_inflections=True,
    )
    config.load_edition_settings()
-    wtp = (
+    wtp = Wtp(db_path=db_path, lang_code=wiktlang)
-        CustomWtp(db_path=db_path, lang_code=wiktlang)
-        if mock_get_page
-        else Wtp(db_path=db_path, lang_code=wiktlang)
-    )
    wxr = WiktextractContext(wtp, config)
    return wxr
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -2,41 +2,18 @@ from wiktextract.page import parse_page
 from wiktextract_context import get_wiktextract_context
-db_path = "./sqlite.db"
-DEFAULT_PAGE_VALUES = {
-    "namespace_id": 0,
-    "model": "wikitext",
-}
 class Wiktextract:
    def __init__(self, wiktlang: str, wordlang: str):
        self.wiktlang = wiktlang
        self.wordlang = wordlang
-        # self.page_handler = page_handler
-        # self.page_handler.wxr : WiktextractContext =
        self.wxr = get_wiktextract_context(wiktlang, wordlang)
-    def parse_page(self, title: str, wikicode: str):
+    def parse_page(self, title: str):
-        # add page to the database (making it accessible to LUA templates)
+        page = self.wxr.wtp.get_page(title)
-        self.wxr.wtp.add_page(
+        if not page:
-            title=title,
+            return None
-            namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
+        result = parse_page(self.wxr, title, page.body)
-            body=wikicode,
-            model=DEFAULT_PAGE_VALUES["model"],
-        )
-        self.wxr.wtp.start_page(title)
-        result = parse_page(self.wxr, title, wikicode)
-        # remove the page from the database
-        self.wxr.wtp.db_conn.execute(
-            "DELETE FROM pages WHERE title = ? AND model = ?",
-            (title, DEFAULT_PAGE_VALUES["model"]),
-        )
-        self.wxr.wtp.db_conn.commit()
        return result