Skip to content
Snippets Groups Projects
Commit c1ed2da7 authored by Empiriker's avatar Empiriker
Browse files

Merge branch 'no-live'

parents 5feef9f7 9908c85c
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY /src/. ./src/
COPY /dumps/. ./dumps/
RUN python ./src/load_templates.py
RUN python ./src/load_dumps.py
RUN rm -rf ./dumps
EXPOSE 80
......
......@@ -37,10 +37,10 @@ _Since wiktextract is not regularly published as a Python package, we fix versio
### 4. Load templates from dump files
Run the script `src/load_templates.py` to extract module and template pages from the dumpfile into an sqlite database that will be used by `wiktextract`.
Run the script `src/load_dumps.py` to load the dumpfile into an SQLite database that will be used by `wiktextract`.
```
python src/load_templates.py
python src/load_dumps.py
```
### 5. Start flask app
......@@ -50,10 +50,13 @@ flask --app src/app.py run
```
## Using Docker
Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.
Alternatively the app can also be containerized using Docker. You still have to provide the dump files in `dumps/`.
Then perform the following two steps:
### 2. Build image
```
docker build -t live-query-wiktextract .
```
......
......@@ -2,7 +2,6 @@ from flask import Flask, Response, jsonify, request
from flask_cors import CORS
import config
from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract
app = Flask(__name__)
......@@ -21,31 +20,29 @@ def search(wiktlang, wordlang, word):
if wiktlang not in config.supported_wiktlangs:
return jsonify({"error": f"Language {wiktlang} not supported"}), 400
wikicode = get_wikicode(word, wiktlang)
if wikicode:
wiktextractor = Wiktextract(wiktlang, wordlang)
try:
resp = wiktextractor.parse_page(word, wikicode)
wiktextractor = Wiktextract(wiktlang, wordlang)
try:
resp = wiktextractor.parse_page(word)
if resp:
return jsonify(resp)
except Exception as e:
print(e)
return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
finally:
wiktextractor.wxr.wtp.db_conn.close()
if wiktextractor.wxr.thesaurus_db_conn:
wiktextractor.wxr.thesaurus_db_conn.close()
else:
return (
jsonify(
{
"error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
}
),
404,
)
else:
return (
jsonify(
{
"error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
}
),
404,
)
except Exception as e:
print(e)
return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
finally:
wiktextractor.wxr.wtp.db_conn.close()
if wiktextractor.wxr.thesaurus_db_conn:
wiktextractor.wxr.thesaurus_db_conn.close()
if __name__ == "__main__":
......
import pywikibot
def get_wikicode(title: str, wiktlang: str):
    """Return the text of a Wiktionary page, looked up through pywikibot.

    Args:
        title: Title of the page to look up.
        wiktlang: Code of the Wiktionary edition; it is interpolated into the
            ``wiktionary:<wiktlang>`` site identifier handed to pywikibot.

    Returns:
        The ``text`` attribute of the resulting ``pywikibot.Page``.
    """
    wiktionary = pywikibot.Site(f"wiktionary:{wiktlang}")
    return pywikibot.Page(wiktionary, title).text
......@@ -10,6 +10,17 @@ from wiktextract_context import get_wiktextract_context
DUMPS_DIR = "dumps"
# Namespace names that load_templates resolves to ids through
# wxr.wtp.NAMESPACE_DATA and passes to parse_wiktionary as namespace_ids.
RECOGNIZED_NAMESPACE_NAMES = [
    "Main", "Category", "Appendix", "Project",
    "Thesaurus", "Module", "Template", "Reconstruction",
]
def start_progress_indicator(is_done: List[bool], msg: str = ""):
is_done[0] = False
......@@ -63,20 +74,21 @@ def load_templates(wiktlang: str):
is_done, msg=f"Loading templates for {wiktlang}..."
)
wxr = get_wiktextract_context(wiktlang, mock_get_page=False)
wxr = get_wiktextract_context(wiktlang)
wxr.wtp.db_conn.execute("DELETE FROM pages")
wxr.wtp.db_conn.commit()
namespace_ids = {
wxr.wtp.NAMESPACE_DATA.get(name, {}).get("id")
for name in RECOGNIZED_NAMESPACE_NAMES
}
parse_wiktionary(
wxr,
dump_file,
num_processes=1,
phase1_only=True,
namespace_ids={
10,
828,
}, # Template and Module namespaces; ToDo: Get the namespace IDs from the dump file
namespace_ids=namespace_ids,
out_f=None, # type: ignore
)
wxr.wtp.db_conn.commit()
......
from typing import Optional
from wikitextprocessor import Page, Wtp
from wikitextprocessor import Wtp
from wiktextract import WiktextractContext, WiktionaryConfig
from get_wikicode import get_wikicode
class CustomWtp(Wtp):
    """``Wtp`` subclass whose ``get_page`` can fall back to the live Wiktionary."""

    def get_page(
        self,
        title: str,
        namespace_id: Optional[int] = None,
        no_redirect: bool = False,
    ) -> Optional[Page]:
        """Look up a page in the local db, fetching it live when it is missing.

        Args:
            title: Title of the requested page.
            namespace_id: Forwarded unchanged to ``Wtp.get_page``; also used
                for the ``Page`` built from live data.
            no_redirect: Forwarded unchanged to ``Wtp.get_page``.

        Returns:
            The result of ``Wtp.get_page`` when it is not ``None``. Otherwise,
            for titles that contain "/translations" and do not contain
            "tracking", a new ``Page`` whose body comes from ``get_wikicode``.
            In every other case ``None``.
        """
        # Ask the regular db-backed implementation first.
        original_result = super().get_page(title, namespace_id, no_redirect)

        # The db is often called with titles like
        # "tracking/parameters/empty parameter". These seem to return None by
        # design and are not present in Wiktionary, so they are never fetched
        # live; only "/translations" pages are.
        if (
            original_result is None
            and "/translations" in title
            and "tracking" not in title
        ):
            print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
            body = get_wikicode(title, self.lang_code)
            return Page(title, namespace_id, body=body)

        return original_result
def get_wiktextract_context(
wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True
):
db_path = f"./sqlite-{wiktlang}.db"
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
db_path = f"./sqlite-{wiktlang}-all.db"
config = WiktionaryConfig(
dump_file_lang_code=wiktlang,
capture_language_codes=[wordlang] if wordlang else None,
......@@ -46,11 +20,7 @@ def get_wiktextract_context(
capture_inflections=True,
)
config.load_edition_settings()
wtp = (
CustomWtp(db_path=db_path, lang_code=wiktlang)
if mock_get_page
else Wtp(db_path=db_path, lang_code=wiktlang)
)
wtp = Wtp(db_path=db_path, lang_code=wiktlang)
wxr = WiktextractContext(wtp, config)
return wxr
......@@ -2,41 +2,18 @@ from wiktextract.page import parse_page
from wiktextract_context import get_wiktextract_context
db_path = "./sqlite.db"
DEFAULT_PAGE_VALUES = {
"namespace_id": 0,
"model": "wikitext",
}
class Wiktextract:
def __init__(self, wiktlang: str, wordlang: str):
self.wiktlang = wiktlang
self.wordlang = wordlang
# self.page_handler = page_handler
# self.page_handler.wxr : WiktextractContext =
self.wxr = get_wiktextract_context(wiktlang, wordlang)
def parse_page(self, title: str, wikicode: str):
# add page to the database (making it accessible to LUA templates)
self.wxr.wtp.add_page(
title=title,
namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
body=wikicode,
model=DEFAULT_PAGE_VALUES["model"],
)
self.wxr.wtp.start_page(title)
result = parse_page(self.wxr, title, wikicode)
# remove the page from the database
self.wxr.wtp.db_conn.execute(
"DELETE FROM pages WHERE title = ? AND model = ?",
(title, DEFAULT_PAGE_VALUES["model"]),
)
self.wxr.wtp.db_conn.commit()
def parse_page(self, title: str):
page = self.wxr.wtp.get_page(title)
if not page:
return None
result = parse_page(self.wxr, title, page.body)
return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment