Merge branch 'no-live'

c1ed2da7 · Empiriker · 5feef9f7 · 9908c85c · c1ed2da7 · c1ed2da7
Commit c1ed2da7 authored 1 year ago by Empiriker
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY /src/. ./src/

 COPY /dumps/. ./dumps/
-RUN python ./src/load_templates.py
+RUN python ./src/load_dumps.py
 RUN rm -rf ./dumps

 EXPOSE 80

--- a/README.md
+++ b/README.md
@@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio

 ### 4. Load templates from dump files

-Run the script `src/load_templates.py` to extract module and template pages from the dumpfile into an sqlite database that will be used by `wiktextract`.
+Run the script `src/load_dumps.py` to load the pages from the dumpfile into an sqlite database that will be used by `wiktextract`.

 ```
-python src/load_templates.py
+python src/load_dumps.py
 ```

 ### 5. Start flask app
@@ -50,10 +50,13 @@ flask --app src/app.py run
 ```

 ## Using Docker
-Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`. 
+
+Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.

 Then performs the two steps:
+
 ### 2. Build image
+
 ```
 docker build -t live-query-wiktextract .
 ```

--- a/requirements.txt
+++ b/requirements.txt
-pywikibot==8.3.2
 Flask==2.3.3
 flask_cors==4.0.0
 wiktextract @ git+https://github.com/tatuylonen/wiktextract.git
--- a/src/app.py
+++ b/src/app.py
@@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request
 from flask_cors import CORS

 import config
-from get_wikicode import get_wikicode
 from wiktextract_wrapper import Wiktextract

 app = Flask(__name__)
@@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word):
    if wiktlang not in config.supported_wiktlangs:
        return jsonify({"error": f"Language {wiktlang} not supported"}), 400

-    wikicode = get_wikicode(word, wiktlang)
-    if wikicode:
-        wiktextractor = Wiktextract(wiktlang, wordlang)
-        try:
-            resp = wiktextractor.parse_page(word, wikicode)
+    wiktextractor = Wiktextract(wiktlang, wordlang)
+    try:
+        resp = wiktextractor.parse_page(word)
+        if resp:
            return jsonify(resp)
-
-        except Exception as e:
-            print(e)
-
-            return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
-        finally:
-            wiktextractor.wxr.wtp.db_conn.close()
-            if wiktextractor.wxr.thesaurus_db_conn:
-                wiktextractor.wxr.thesaurus_db_conn.close()
-
-    else:
-        return (
-            jsonify(
-                {
-                    "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
-                }
-            ),
-            404,
-        )
+        else:
+            return (
+                jsonify(
+                    {
+                        "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
+                    }
+                ),
+                404,
+            )
+
+    except Exception as e:
+        print(e)
+
+        return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
+    finally:
+        wiktextractor.wxr.wtp.db_conn.close()
+        if wiktextractor.wxr.thesaurus_db_conn:
+            wiktextractor.wxr.thesaurus_db_conn.close()


 if __name__ == "__main__":

--- a/src/get_wikicode.py
+++ b/src/get_wikicode.py
-import pywikibot
-
-
-def get_wikicode(title: str, wiktlang: str):
-    site = pywikibot.Site(f"wiktionary:{wiktlang}")
-    page = pywikibot.Page(site, title)
-    return page.text
--- a/src/load_templates.py
+++ b/src/load_templates.py
@@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context

 DUMPS_DIR = "dumps"

+RECOGNIZED_NAMESPACE_NAMES = [
+    "Main",
+    "Category",
+    "Appendix",
+    "Project",
+    "Thesaurus",
+    "Module",
+    "Template",
+    "Reconstruction",
+]
+

 def start_progress_indicator(is_done: List[bool], msg: str = ""):
    is_done[0] = False
@@ -63,20 +74,21 @@ def load_templates(wiktlang: str):
        is_done, msg=f"Loading templates for {wiktlang}..."
    )

-    wxr = get_wiktextract_context(wiktlang, mock_get_page=False)
+    wxr = get_wiktextract_context(wiktlang)

    wxr.wtp.db_conn.execute("DELETE FROM pages")
    wxr.wtp.db_conn.commit()

+    namespace_ids = {
+        wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
+        for name in RECOGNIZED_NAMESPACE_NAMES
+    }
    parse_wiktionary(
        wxr,
        dump_file,
        num_processes=1,
        phase1_only=True,
-        namespace_ids={
-            10,
-            828,
-        },  # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file
+        namespace_ids=namespace_ids,
        out_f=None,  # type: ignore
    )
    wxr.wtp.db_conn.commit()

--- a/src/wiktextract_context.py
+++ b/src/wiktextract_context.py
 from typing import Optional

-from wikitextprocessor import Page, Wtp
+from wikitextprocessor import Wtp
 from wiktextract import WiktextractContext, WiktionaryConfig

-from get_wikicode import get_wikicode

-
-class CustomWtp(Wtp):
-    def get_page(
-        self,
-        title: str,
-        namespace_id: Optional[int] = None,
-        no_redirect: bool = False,
-    ) -> Optional[Page]:
-        # Call the original get_page method
-        original_result = super().get_page(title, namespace_id, no_redirect)
-
-        if original_result == None:
-            # The db is often called with titles like "tracking/parameters/empty parameter". These seem to return None by design and are not present in Wiktionary. Skip these.
-            if "/translations" in title and not "tracking" in title:
-                print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
-
-                body = get_wikicode(title, self.lang_code)
-
-                return Page(title, namespace_id, body=body)
-
-        return original_result
-
-
-def get_wiktextract_context(
-    wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True
-):
-    db_path = f"./sqlite-{wiktlang}.db"
+def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
+    db_path = f"./sqlite-{wiktlang}-all.db"
    config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=[wordlang] if wordlang else None,
@@ -46,11 +20,7 @@ def get_wiktextract_context(
        capture_inflections=True,
    )
    config.load_edition_settings()
-    wtp = (
-        CustomWtp(db_path=db_path, lang_code=wiktlang)
-        if mock_get_page
-        else Wtp(db_path=db_path, lang_code=wiktlang)
-    )
+    wtp = Wtp(db_path=db_path, lang_code=wiktlang)
    wxr = WiktextractContext(wtp, config)

    return wxr
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -2,41 +2,18 @@ from wiktextract.page import parse_page

 from wiktextract_context import get_wiktextract_context

-db_path = "./sqlite.db"
-
-DEFAULT_PAGE_VALUES = {
-    "namespace_id": 0,
-    "model": "wikitext",
-}
-

 class Wiktextract:
    def __init__(self, wiktlang: str, wordlang: str):
        self.wiktlang = wiktlang
        self.wordlang = wordlang

-        # self.page_handler = page_handler
-        # self.page_handler.wxr : WiktextractContext =
        self.wxr = get_wiktextract_context(wiktlang, wordlang)

-    def parse_page(self, title: str, wikicode: str):
-        # add page to the database (making it accessible to LUA templates)
-        self.wxr.wtp.add_page(
-            title=title,
-            namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
-            body=wikicode,
-            model=DEFAULT_PAGE_VALUES["model"],
-        )
-
-        self.wxr.wtp.start_page(title)
-
-        result = parse_page(self.wxr, title, wikicode)
-
-        # remove the page from the database
-        self.wxr.wtp.db_conn.execute(
-            "DELETE FROM pages WHERE title = ? AND model = ?",
-            (title, DEFAULT_PAGE_VALUES["model"]),
-        )
-        self.wxr.wtp.db_conn.commit()
+    def parse_page(self, title: str):
+        page = self.wxr.wtp.get_page(title)
+        if not page:
+            return None
+        result = parse_page(self.wxr, title, page.body)

        return result