diff --git a/src/app.py b/src/app.py index f8564335b8f08c2acdd3edfa68c95ad69f1f06d9..1c50351ed7a99622bd77fbab59ace14317affb36 100644 --- a/src/app.py +++ b/src/app.py @@ -1,10 +1,7 @@ -from flask import Flask, jsonify -from flask import request -from flask import Response +from flask import Flask, Response, jsonify, request from flask_cors import CORS import config -from get_wikicode import get_wikicode from wiktextract_wrapper import Wiktextract app = Flask(__name__) @@ -23,31 +20,29 @@ def search(wiktlang, wordlang, word): if wiktlang not in config.supported_wiktlangs: return jsonify({"error": f"Language {wiktlang} not supported"}), 400 - wikicode = get_wikicode(word, wiktlang) - if wikicode: - wiktextractor = Wiktextract(wiktlang, wordlang) - try: - resp = wiktextractor.parse_page(word, wikicode) + wiktextractor = Wiktextract(wiktlang, wordlang) + try: + resp = wiktextractor.parse_page(word) + if resp: return jsonify(resp) - - except Exception as e: - print(e) - - return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 - finally: - wiktextractor.wxr.wtp.db_conn.close() - if wiktextractor.wxr.thesaurus_db_conn: - wiktextractor.wxr.thesaurus_db_conn.close() - - else: - return ( - jsonify( - { - "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org." - } - ), - 404, - ) + else: + return ( + jsonify( + { + "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
+ } + ), + 404, + ) + + except Exception as e: + print(e) + + return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 + finally: + wiktextractor.wxr.wtp.db_conn.close() + if wiktextractor.wxr.thesaurus_db_conn: + wiktextractor.wxr.thesaurus_db_conn.close() if __name__ == "__main__": diff --git a/src/get_wikicode.py b/src/get_wikicode.py deleted file mode 100644 index 24291977bae17f9bfe128d43f80202aeb7b3eea7..0000000000000000000000000000000000000000 --- a/src/get_wikicode.py +++ /dev/null @@ -1,7 +0,0 @@ -import pywikibot - - -def get_wikicode(title: str, wiktlang: str): - site = pywikibot.Site(f"wiktionary:{wiktlang}") - page = pywikibot.Page(site, title) - return page.text diff --git a/src/wiktextract_context.py b/src/wiktextract_context.py index 79d08e0a0c8f2f56eb80e9ac5bfec60d07f7bfff..ce88f3158ab3fe36aa7dea6db4549f43e7524fd6 100644 --- a/src/wiktextract_context.py +++ b/src/wiktextract_context.py @@ -1,38 +1,11 @@ -from wiktextract import ( - WiktextractContext, - WiktionaryConfig, -) -from wikitextprocessor import Wtp, Page - from typing import Optional -from get_wikicode import get_wikicode - - -class CustomWtp(Wtp): - def get_page( - self, - title: str, - namespace_id: Optional[int] = None, - no_redirect: bool = False, - ) -> Optional[Page]: - # Call the original get_page method - original_result = super().get_page(title, namespace_id, no_redirect) - - if original_result == None: - # The db is often called with titles like "tracking/parameters/empty parameter". These seem to return None by design and are not present in Wiktionary. Skip these. - if "/translations" in title and not "tracking" in title: - print(f"Page '{title}' not found in db. 
Fetching from live wiktionary.") - - body = get_wikicode(title, self.lang_code) - - return Page(title, namespace_id, body=body) - - return original_result +from wikitextprocessor import Wtp +from wiktextract import WiktextractContext, WiktionaryConfig -def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True): - db_path = f"./sqlite-{wiktlang}.db" +def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None): + db_path = f"./sqlite-{wiktlang}-all.db" config = WiktionaryConfig( dump_file_lang_code=wiktlang, capture_language_codes=[wordlang] if wordlang else None, @@ -47,7 +20,7 @@ def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_ capture_inflections=True, ) config.load_edition_settings() - wtp = CustomWtp(db_path=db_path, lang_code=wiktlang) if mock_get_page else Wtp(db_path=db_path, lang_code=wiktlang) + wtp = Wtp(db_path=db_path, lang_code=wiktlang) wxr = WiktextractContext(wtp, config) return wxr diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py index 0f072ec1d0a51548829adba9ccdd30f1915fddd0..c1eab07b27d58c1db1cbaf6576894ea1782ab3fd 100644 --- a/src/wiktextract_wrapper.py +++ b/src/wiktextract_wrapper.py @@ -1,13 +1,6 @@ from wiktextract.page import parse_page from wiktextract_context import get_wiktextract_context - -db_path = "./sqlite.db" - -DEFAULT_PAGE_VALUES = { - "namespace_id": 0, - "model": "wikitext", -} class Wiktextract: @@ -15,28 +8,12 @@ class Wiktextract: self.wiktlang = wiktlang self.wordlang = wordlang - # self.page_handler = page_handler - # self.page_handler.wxr : WiktextractContext = self.wxr = get_wiktextract_context(wiktlang, wordlang) - def parse_page(self, title: str, wikicode: str): - # add page to the database (making it accessible to LUA templates) - self.wxr.wtp.add_page( - title=title, - namespace_id=DEFAULT_PAGE_VALUES["namespace_id"], - body=wikicode, - model=DEFAULT_PAGE_VALUES["model"], - ) - - 
self.wxr.wtp.start_page(title) - - result = parse_page(self.wxr, title, wikicode) - - # remove the page from the database - self.wxr.wtp.db_conn.execute( - "DELETE FROM pages WHERE title = ? AND model = ?", - (title, DEFAULT_PAGE_VALUES["model"]), - ) - self.wxr.wtp.db_conn.commit() + def parse_page(self, title: str): + page = self.wxr.wtp.get_page(title) + if not page: + return None + result = parse_page(self.wxr, title, page.body) return result