Skip to content
Snippets Groups Projects
Commit c1ed2da7 authored by Empiriker's avatar Empiriker
Browse files

Merge branch 'no-live'

parents 5feef9f7 9908c85c
No related branches found
No related tags found
No related merge requests found
...@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt ...@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY /src/. ./src/ COPY /src/. ./src/
COPY /dumps/. ./dumps/ COPY /dumps/. ./dumps/
RUN python ./src/load_templates.py RUN python ./src/load_dumps.py
RUN rm -rf ./dumps RUN rm -rf ./dumps
EXPOSE 80 EXPOSE 80
......
...@@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio ...@@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio
### 4. Load templates from dump files ### 4. Load templates from dump files
Run the script `src/load_templates.py` to extract module and template pages from the dumpfile into an sqlite database that will be used by `wiktextract`. Run the script `src/load_dumps.py` to load the dumpfile into an sqlite database that will be used by `wiktextract`.
``` ```
python src/load_templates.py python src/load_dumps.py
``` ```
### 5. Start flask app ### 5. Start flask app
...@@ -50,10 +50,13 @@ flask --app src/app.py run ...@@ -50,10 +50,13 @@ flask --app src/app.py run
``` ```
## Using Docker ## Using Docker
Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.
Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.
Then perform the following two steps: Then perform the following two steps:
### 2. Build image ### 2. Build image
``` ```
docker build -t live-query-wiktextract . docker build -t live-query-wiktextract .
``` ```
......
...@@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request ...@@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request
from flask_cors import CORS from flask_cors import CORS
import config import config
from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract from wiktextract_wrapper import Wiktextract
app = Flask(__name__) app = Flask(__name__)
...@@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word): ...@@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word):
if wiktlang not in config.supported_wiktlangs: if wiktlang not in config.supported_wiktlangs:
return jsonify({"error": f"Language {wiktlang} not supported"}), 400 return jsonify({"error": f"Language {wiktlang} not supported"}), 400
wikicode = get_wikicode(word, wiktlang) wiktextractor = Wiktextract(wiktlang, wordlang)
if wikicode: try:
wiktextractor = Wiktextract(wiktlang, wordlang) resp = wiktextractor.parse_page(word)
try: if resp:
resp = wiktextractor.parse_page(word, wikicode)
return jsonify(resp) return jsonify(resp)
else:
except Exception as e: return (
print(e) jsonify(
{
return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
finally: }
wiktextractor.wxr.wtp.db_conn.close() ),
if wiktextractor.wxr.thesaurus_db_conn: 404,
wiktextractor.wxr.thesaurus_db_conn.close() )
else: except Exception as e:
return ( print(e)
jsonify(
{ return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
"error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org." finally:
} wiktextractor.wxr.wtp.db_conn.close()
), if wiktextractor.wxr.thesaurus_db_conn:
404, wiktextractor.wxr.thesaurus_db_conn.close()
)
if __name__ == "__main__": if __name__ == "__main__":
......
import pywikibot
def get_wikicode(title: str, wiktlang: str):
    """Return the text of a Wiktionary page as reported by pywikibot.

    Args:
        title: Title of the page to look up.
        wiktlang: Language code of the Wiktionary edition; it is interpolated
            into the pywikibot site identifier ``wiktionary:<wiktlang>``.

    Returns:
        The ``text`` attribute of the ``pywikibot.Page`` built for ``title``.
    """
    # NOTE(review): pywikibot presumably performs the network request lazily
    # when ``.text`` is read -- confirm against the pywikibot documentation.
    return pywikibot.Page(
        pywikibot.Site(f"wiktionary:{wiktlang}"),
        title,
    ).text
...@@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context ...@@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context
DUMPS_DIR = "dumps" DUMPS_DIR = "dumps"
# Names of the namespaces whose pages are loaded from the dump. The names are
# resolved to numeric ids through ``wxr.wtp.NAMESPACE_DATA`` before being
# handed to ``parse_wiktionary`` as ``namespace_ids``.
RECOGNIZED_NAMESPACE_NAMES = [
    "Main", "Category", "Appendix", "Project",
    "Thesaurus", "Module", "Template", "Reconstruction",
]
def start_progress_indicator(is_done: List[bool], msg: str = ""): def start_progress_indicator(is_done: List[bool], msg: str = ""):
is_done[0] = False is_done[0] = False
...@@ -63,20 +74,21 @@ def load_templates(wiktlang: str): ...@@ -63,20 +74,21 @@ def load_templates(wiktlang: str):
is_done, msg=f"Loading templates for {wiktlang}..." is_done, msg=f"Loading templates for {wiktlang}..."
) )
wxr = get_wiktextract_context(wiktlang, mock_get_page=False) wxr = get_wiktextract_context(wiktlang)
wxr.wtp.db_conn.execute("DELETE FROM pages") wxr.wtp.db_conn.execute("DELETE FROM pages")
wxr.wtp.db_conn.commit() wxr.wtp.db_conn.commit()
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
}
parse_wiktionary( parse_wiktionary(
wxr, wxr,
dump_file, dump_file,
num_processes=1, num_processes=1,
phase1_only=True, phase1_only=True,
namespace_ids={ namespace_ids=namespace_ids,
10,
828,
}, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file
out_f=None, # type: ignore out_f=None, # type: ignore
) )
wxr.wtp.db_conn.commit() wxr.wtp.db_conn.commit()
......
from typing import Optional from typing import Optional
from wikitextprocessor import Page, Wtp from wikitextprocessor import Wtp
from wiktextract import WiktextractContext, WiktionaryConfig from wiktextract import WiktextractContext, WiktionaryConfig
from get_wikicode import get_wikicode
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
class CustomWtp(Wtp): db_path = f"./sqlite-{wiktlang}-all.db"
def get_page(
    self,
    title: str,
    namespace_id: Optional[int] = None,
    no_redirect: bool = False,
) -> Optional[Page]:
    """Look up a page locally, falling back to live Wiktionary for translations.

    The lookup is first delegated to the parent class. Only when that returns
    ``None`` and the title refers to a ``/translations`` subpage (and is not a
    ``tracking`` title) is the wikitext fetched via ``get_wikicode`` and
    wrapped in a new ``Page``.

    Args:
        title: Title of the requested page.
        namespace_id: Optional namespace id, forwarded to the parent lookup
            and to the ``Page`` built for a live result.
        no_redirect: Forwarded unchanged to the parent lookup.

    Returns:
        The page from the parent lookup, a ``Page`` built from live wikitext
        for missing ``/translations`` pages, or ``None`` otherwise.
    """
    page = super().get_page(title, namespace_id, no_redirect)
    # Identity test: ``== None`` would go through a custom ``__eq__`` if the
    # page type defines one.
    if page is not None:
        return page

    # The db is often called with titles like
    # "tracking/parameters/empty parameter". These seem to return None by
    # design and are not present in Wiktionary. Skip these.
    if "/translations" in title and "tracking" not in title:
        print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
        body = get_wikicode(title, self.lang_code)
        return Page(title, namespace_id, body=body)

    return None
def get_wiktextract_context(
wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True
):
db_path = f"./sqlite-{wiktlang}.db"
config = WiktionaryConfig( config = WiktionaryConfig(
dump_file_lang_code=wiktlang, dump_file_lang_code=wiktlang,
capture_language_codes=[wordlang] if wordlang else None, capture_language_codes=[wordlang] if wordlang else None,
...@@ -46,11 +20,7 @@ def get_wiktextract_context( ...@@ -46,11 +20,7 @@ def get_wiktextract_context(
capture_inflections=True, capture_inflections=True,
) )
config.load_edition_settings() config.load_edition_settings()
wtp = ( wtp = Wtp(db_path=db_path, lang_code=wiktlang)
CustomWtp(db_path=db_path, lang_code=wiktlang)
if mock_get_page
else Wtp(db_path=db_path, lang_code=wiktlang)
)
wxr = WiktextractContext(wtp, config) wxr = WiktextractContext(wtp, config)
return wxr return wxr
...@@ -2,41 +2,18 @@ from wiktextract.page import parse_page ...@@ -2,41 +2,18 @@ from wiktextract.page import parse_page
from wiktextract_context import get_wiktextract_context from wiktextract_context import get_wiktextract_context
db_path = "./sqlite.db"
DEFAULT_PAGE_VALUES = {
"namespace_id": 0,
"model": "wikitext",
}
class Wiktextract: class Wiktextract:
def __init__(self, wiktlang: str, wordlang: str): def __init__(self, wiktlang: str, wordlang: str):
self.wiktlang = wiktlang self.wiktlang = wiktlang
self.wordlang = wordlang self.wordlang = wordlang
# self.page_handler = page_handler
# self.page_handler.wxr : WiktextractContext =
self.wxr = get_wiktextract_context(wiktlang, wordlang) self.wxr = get_wiktextract_context(wiktlang, wordlang)
def parse_page(self, title: str, wikicode: str): def parse_page(self, title: str):
# add page to the database (making it accessible to LUA templates) page = self.wxr.wtp.get_page(title)
self.wxr.wtp.add_page( if not page:
title=title, return None
namespace_id=DEFAULT_PAGE_VALUES["namespace_id"], result = parse_page(self.wxr, title, page.body)
body=wikicode,
model=DEFAULT_PAGE_VALUES["model"],
)
self.wxr.wtp.start_page(title)
result = parse_page(self.wxr, title, wikicode)
# remove the page from the database
self.wxr.wtp.db_conn.execute(
"DELETE FROM pages WHERE title = ? AND model = ?",
(title, DEFAULT_PAGE_VALUES["model"]),
)
self.wxr.wtp.db_conn.commit()
return result return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment