Skip to content
Snippets Groups Projects
Commit eee9cd3b authored by Empiriker's avatar Empiriker
Browse files

Use local page data instead of live query

parent a6d161ab
No related branches found
No related tags found
No related merge requests found
from flask import Flask, jsonify from flask import Flask, Response, jsonify, request
from flask import request
from flask import Response
from flask_cors import CORS from flask_cors import CORS
import config import config
from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract from wiktextract_wrapper import Wiktextract
app = Flask(__name__) app = Flask(__name__)
...@@ -23,31 +20,29 @@ def search(wiktlang, wordlang, word): ...@@ -23,31 +20,29 @@ def search(wiktlang, wordlang, word):
if wiktlang not in config.supported_wiktlangs: if wiktlang not in config.supported_wiktlangs:
return jsonify({"error": f"Language {wiktlang} not supported"}), 400 return jsonify({"error": f"Language {wiktlang} not supported"}), 400
wikicode = get_wikicode(word, wiktlang) wiktextractor = Wiktextract(wiktlang, wordlang)
if wikicode: try:
wiktextractor = Wiktextract(wiktlang, wordlang) resp = wiktextractor.parse_page(word)
try: if resp:
resp = wiktextractor.parse_page(word, wikicode)
return jsonify(resp) return jsonify(resp)
else:
except Exception as e: return (
print(e) jsonify(
{
return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
finally: }
wiktextractor.wxr.wtp.db_conn.close() ),
if wiktextractor.wxr.thesaurus_db_conn: 404,
wiktextractor.wxr.thesaurus_db_conn.close() )
else: except Exception as e:
return ( print(e)
jsonify(
{ return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500
"error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org." finally:
} wiktextractor.wxr.wtp.db_conn.close()
), if wiktextractor.wxr.thesaurus_db_conn:
404, wiktextractor.wxr.thesaurus_db_conn.close()
)
if __name__ == "__main__": if __name__ == "__main__":
......
import pywikibot
def get_wikicode(title: str, wiktlang: str):
    """Return the text of the page `title` from the `wiktlang` Wiktionary.

    The site is addressed through pywikibot as ``wiktionary:<wiktlang>``;
    the value returned is the ``text`` attribute of the resulting page object.
    """
    return pywikibot.Page(pywikibot.Site(f"wiktionary:{wiktlang}"), title).text
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wikitextprocessor import Wtp, Page
from typing import Optional from typing import Optional
from get_wikicode import get_wikicode from wikitextprocessor import Wtp
from wiktextract import WiktextractContext, WiktionaryConfig
class CustomWtp(Wtp):
    """`Wtp` subclass whose page lookup can fall back to the live Wiktionary."""

    def get_page(
        self,
        title: str,
        namespace_id: Optional[int] = None,
        no_redirect: bool = False,
    ) -> Optional[Page]:
        """Look up `title` in the local db, fetching selected misses live.

        The lookup is first delegated to `Wtp.get_page`. If that yields
        nothing and the title contains "/translations" (but not "tracking"),
        the page text is fetched with `get_wikicode` and wrapped in a new
        `Page`. Every other miss is returned unchanged as `None`.
        """
        page = super().get_page(title, namespace_id, no_redirect)
        # The db is often queried with titles such as
        # "tracking/parameters/empty parameter". Those seem to return None by
        # design and are not present in Wiktionary, so they are never fetched.
        if page is None and "/translations" in title and "tracking" not in title:
            print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
            body = get_wikicode(title, self.lang_code)
            return Page(title, namespace_id, body=body)
        return page
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True): def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
db_path = f"./sqlite-{wiktlang}.db" db_path = f"./sqlite-{wiktlang}-all.db"
config = WiktionaryConfig( config = WiktionaryConfig(
dump_file_lang_code=wiktlang, dump_file_lang_code=wiktlang,
capture_language_codes=[wordlang] if wordlang else None, capture_language_codes=[wordlang] if wordlang else None,
...@@ -47,7 +20,7 @@ def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_ ...@@ -47,7 +20,7 @@ def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_
capture_inflections=True, capture_inflections=True,
) )
config.load_edition_settings() config.load_edition_settings()
wtp = CustomWtp(db_path=db_path, lang_code=wiktlang) if mock_get_page else Wtp(db_path=db_path, lang_code=wiktlang) wtp = Wtp(db_path=db_path, lang_code=wiktlang)
wxr = WiktextractContext(wtp, config) wxr = WiktextractContext(wtp, config)
return wxr return wxr
from wiktextract.page import parse_page from wiktextract.page import parse_page
from wiktextract_context import get_wiktextract_context from wiktextract_context import get_wiktextract_context
db_path = "./sqlite.db"
DEFAULT_PAGE_VALUES = {
"namespace_id": 0,
"model": "wikitext",
}
class Wiktextract: class Wiktextract:
...@@ -15,28 +8,12 @@ class Wiktextract: ...@@ -15,28 +8,12 @@ class Wiktextract:
self.wiktlang = wiktlang self.wiktlang = wiktlang
self.wordlang = wordlang self.wordlang = wordlang
# self.page_handler = page_handler
# self.page_handler.wxr : WiktextractContext =
self.wxr = get_wiktextract_context(wiktlang, wordlang) self.wxr = get_wiktextract_context(wiktlang, wordlang)
def parse_page(self, title: str, wikicode: str): def parse_page(self, title: str):
# add page to the database (making it accessible to LUA templates) page = self.wxr.wtp.get_page(title)
self.wxr.wtp.add_page( if not page:
title=title, return None
namespace_id=DEFAULT_PAGE_VALUES["namespace_id"], result = parse_page(self.wxr, title, page.body)
body=wikicode,
model=DEFAULT_PAGE_VALUES["model"],
)
self.wxr.wtp.start_page(title)
result = parse_page(self.wxr, title, wikicode)
# remove the page from the database
self.wxr.wtp.db_conn.execute(
"DELETE FROM pages WHERE title = ? AND model = ?",
(title, DEFAULT_PAGE_VALUES["model"]),
)
self.wxr.wtp.db_conn.commit()
return result return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment