-
Empiriker authored1069396e
wiktextract_context.py 1.76 KiB
from wiktextract import (
WiktextractContext,
WiktionaryConfig,
)
from wikitextprocessor import Wtp, Page
from typing import Optional
from get_wikicode import get_wikicode
class CustomWtp(Wtp):
    """``Wtp`` subclass whose ``get_page`` can fall back to ``get_wikicode``.

    Lookups are first delegated to the parent class. Only when that yields
    ``None`` for a translation subpage is the wikicode obtained through
    ``get_wikicode`` and wrapped in a fresh ``Page``.
    """

    def get_page(
        self,
        title: str,
        namespace_id: Optional[int] = None,
        no_redirect: bool = False,
    ) -> Optional[Page]:
        """Return the page for *title*, fetching missing translation subpages.

        Args:
            title: Title of the page to look up.
            namespace_id: Namespace id forwarded to the parent lookup and to
                the ``Page`` built for a fetched page.
            no_redirect: Forwarded unchanged to the parent lookup.

        Returns:
            The parent's result when it is not ``None``. Otherwise a new
            ``Page`` built from ``get_wikicode(title, self.lang_code)`` if
            *title* contains ``"/translations"`` and does not contain
            ``"tracking"``; ``None`` in every other case.
        """
        page = super().get_page(title, namespace_id, no_redirect)
        # Identity check: ``None`` is a singleton, and ``==`` could be
        # intercepted by a custom ``__eq__`` on the returned object.
        if page is not None:
            return page

        # The db is often queried with titles such as
        # "tracking/parameters/empty parameter". These seem to return None by
        # design and are not present in Wiktionary, so they are skipped.
        if "/translations" in title and "tracking" not in title:
            print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
            # NOTE(review): get_wikicode presumably returns the page's
            # wikitext as a string -- confirm what it yields on failure.
            body = get_wikicode(title, self.lang_code)
            return Page(title, namespace_id, body=body)

        return page
def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True):
    """Build a ``WiktextractContext`` for the given Wiktionary edition.

    Args:
        wiktlang: Language code of the Wiktionary edition. It is used as the
            dump file language code, as the processor's ``lang_code``, and to
            derive the database path ``./sqlite-<wiktlang>.db``.
        wordlang: When truthy, capture is restricted to this single language
            code; otherwise ``capture_language_codes`` is passed as ``None``.
        mock_get_page: When true the processor is a ``CustomWtp``; otherwise
            a plain ``Wtp``.

    Returns:
        A ``WiktextractContext`` wrapping the processor and the config.
    """
    database_file = f"./sqlite-{wiktlang}.db"

    if wordlang:
        language_filter = [wordlang]
    else:
        language_filter = None

    # Every capture switch set here is enabled.
    capture_flags = {
        "capture_translations": True,
        "capture_pronunciation": True,
        "capture_linkages": True,
        "capture_compounds": True,
        "capture_redirects": True,
        "capture_examples": True,
        "capture_etymologies": True,
        "capture_descendants": True,
        "capture_inflections": True,
    }
    settings = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=language_filter,
        **capture_flags,
    )
    settings.load_edition_settings()

    # Choose the processor class first, then construct it exactly once.
    processor_cls = CustomWtp if mock_get_page else Wtp
    processor = processor_cls(db_path=database_file, lang_code=wiktlang)
    return WiktextractContext(processor, settings)