diff --git a/src/wiktextract_context.py b/src/wiktextract_context.py
index fcf768754cf27b516c60d2fc8f02012c7dad68dc..e1abb42a70df4eaa23ff72b66c42557d2e29c0f2 100644
--- a/src/wiktextract_context.py
+++ b/src/wiktextract_context.py
@@ -2,10 +2,58 @@ from wiktextract import (
     WiktextractContext,
     WiktionaryConfig,
 )
-from wikitextprocessor import Wtp
+from wikitextprocessor import Wtp, Page
 from typing import Optional
 
 
+from get_wikicode import get_wikicode
+
+
+class CustomWtp(Wtp):
+    """Wtp whose page lookup can fall back to the live Wiktionary.
+
+    Only ``get_page`` is overridden; everything else is inherited from Wtp.
+    """
+
+    def get_page(
+        self,
+        title: str,
+        namespace_id: Optional[int] = None,
+        no_redirect: bool = False,
+    ) -> Optional[Page]:
+        """Look up *title* in the db, fetching translation subpages if absent.
+
+        The lookup is delegated to ``Wtp.get_page`` first. When that returns
+        ``None`` and the title contains "/translations" but not "tracking",
+        the wikicode is requested via ``get_wikicode(title, self.lang_code)``
+        and wrapped in a new ``Page``. Any other miss stays ``None``.
+
+        Args:
+            title: Page title to look up.
+            namespace_id: Namespace id, passed through to the db lookup and
+                to the ``Page`` built for a fetched title.
+            no_redirect: Passed through unchanged to ``Wtp.get_page``.
+
+        Returns:
+            The stored page, a ``Page`` built from the fetched wikicode, or
+            ``None``.
+        """
+        stored_page = super().get_page(title, namespace_id, no_redirect)
+        if stored_page is not None:
+            return stored_page
+
+        # The db is often queried with titles such as
+        # "tracking/parameters/empty parameter". These seem to return None
+        # by design and are not present in Wiktionary, so skip them.
+        if "/translations" not in title or "tracking" in title:
+            return None
+
+        print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
+        # NOTE(review): get_wikicode's failure behaviour is not visible here;
+        # confirm what it returns or raises for a title that does not exist.
+        body = get_wikicode(title, self.lang_code)
+        return Page(title, namespace_id, body=body)
+
 
 def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
     db_path = f"./sqlite-{wiktlang}.db"
@@ -22,6 +70,6 @@ def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None):
         capture_descendants=True,
         capture_inflections=True,
     )
-    wxr = WiktextractContext(Wtp(db_path=db_path), config)
+    wxr = WiktextractContext(CustomWtp(db_path=db_path), config)
 
     return wxr