from typing import Optional

from wikitextprocessor import Page, Wtp
from wiktextract import (
    WiktextractContext,
    WiktionaryConfig,
)

from get_wikicode import get_wikicode


class CustomWtp(Wtp):
    """``Wtp`` variant that falls back to ``get_wikicode`` for missing pages.

    Only titles containing ``"/translations"`` (and not containing
    ``"tracking"``) trigger the fallback; every other lookup behaves exactly
    like ``Wtp.get_page``.
    """

    def get_page(
        self,
        title: str,
        namespace_id: Optional[int] = None,
        no_redirect: bool = False,
    ) -> Optional[Page]:
        """Return the page for *title*, fetching it remotely if the db lacks it.

        Args:
            title: Page title to look up.
            namespace_id: Namespace id forwarded to ``Wtp.get_page`` and
                stored on a fallback ``Page``; may be ``None``.
            no_redirect: Forwarded unchanged to ``Wtp.get_page``.

        Returns:
            The page returned by ``Wtp.get_page`` when it is not ``None``.
            Otherwise, for translation subpages, a new ``Page`` whose body
            comes from ``get_wikicode``; for all other titles, ``None``.
        """
        db_page = super().get_page(title, namespace_id, no_redirect)
        if db_page is not None:
            return db_page

        # The db is often queried with titles like
        # "tracking/parameters/empty parameter". These seem to return None by
        # design and are not present in Wiktionary, so they are skipped and
        # only translation subpages are fetched.
        if "/translations" in title and "tracking" not in title:
            print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
            # NOTE(review): the result of get_wikicode is used as the page
            # body without validation -- confirm what it returns when the
            # page does not exist remotely either.
            body = get_wikicode(title, self.lang_code)
            return Page(title, namespace_id, body=body)

        return None


def get_wiktextract_context(
    wiktlang: str,
    wordlang: Optional[str] = None,
    mock_get_page: bool = True,
) -> WiktextractContext:
    """Build a ``WiktextractContext`` backed by ``./sqlite-<wiktlang>.db``.

    Args:
        wiktlang: Language code of the Wiktionary edition. It selects the
            sqlite file path and is passed as ``dump_file_lang_code`` and as
            the processor's ``lang_code``.
        wordlang: When given, ``capture_language_codes`` is set to a
            one-element list holding this code; otherwise it is ``None``.
        mock_get_page: When true (the default) the context uses
            ``CustomWtp``; when false it uses the plain ``Wtp``.

    Returns:
        A ``WiktextractContext`` wrapping the chosen processor and a
        ``WiktionaryConfig`` with all capture flags below enabled.
    """
    db_path = f"./sqlite-{wiktlang}.db"

    config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=[wordlang] if wordlang else None,
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_descendants=True,
        capture_inflections=True,
    )
    config.load_edition_settings()

    # Pick the processor class first so the constructor arguments are
    # written only once.
    wtp_class = CustomWtp if mock_get_page else Wtp
    wtp = wtp_class(db_path=db_path, lang_code=wiktlang)

    return WiktextractContext(wtp, config)