# wiktextract_context.py
from wiktextract import (
    WiktextractContext,
    WiktionaryConfig,
)
from wikitextprocessor import Wtp, Page

from typing import Optional

from get_wikicode import get_wikicode


class CustomWtp(Wtp):
    """`Wtp` subclass whose `get_page` can fall back to `get_wikicode`.

    When the parent lookup finds nothing for a title containing
    "/translations", the page body is obtained from `get_wikicode` and
    wrapped in a new `Page`.
    """

    def get_page(
        self,
        title: str,
        namespace_id: Optional[int] = None,
        no_redirect: bool = False,
    ) -> Optional[Page]:
        """Return the page for *title*, fetching missing translation subpages.

        Args:
            title: Title of the page to look up.
            namespace_id: Forwarded unchanged to the parent lookup and, for
                a fetched page, to the `Page` constructor.
            no_redirect: Forwarded unchanged to the parent lookup.

        Returns:
            The parent's result when it is not None. Otherwise a new `Page`
            built from `get_wikicode(title, self.lang_code)` when the title
            contains "/translations" and does not contain "tracking";
            None in every other case.
        """
        page = super().get_page(title, namespace_id, no_redirect)
        if page is not None:
            return page

        # The db is often called with titles like
        # "tracking/parameters/empty parameter". These seem to return None
        # by design and are not present in Wiktionary, so they are skipped;
        # only translation subpages are worth fetching.
        if "/translations" not in title or "tracking" in title:
            return None

        print(f"Page '{title}' not found in db. Fetching from live wiktionary.")
        body = get_wikicode(title, self.lang_code)
        return Page(title, namespace_id, body=body)


def get_wiktextract_context(wiktlang: str, wordlang: Optional[str] = None, mock_get_page: bool = True):
    """Build a `WiktextractContext` backed by the local sqlite database.

    Args:
        wiktlang: Language code used as the dump file language code, as the
            `Wtp` language code, and in the database path
            ``./sqlite-<wiktlang>.db``.
        wordlang: When truthy, passed as the single entry of
            `capture_language_codes`; otherwise None is passed instead.
        mock_get_page: When true, the processor is a `CustomWtp`; otherwise
            a plain `Wtp` is used.

    Returns:
        The `WiktextractContext` wrapping the processor and the config, with
        edition settings already loaded on the config.
    """
    # Every capture option is switched on.
    capture_flags = {
        "capture_translations": True,
        "capture_pronunciation": True,
        "capture_linkages": True,
        "capture_compounds": True,
        "capture_redirects": True,
        "capture_examples": True,
        "capture_etymologies": True,
        "capture_descendants": True,
        "capture_inflections": True,
    }
    language_filter = [wordlang] if wordlang else None

    config = WiktionaryConfig(
        dump_file_lang_code=wiktlang,
        capture_language_codes=language_filter,
        **capture_flags,
    )
    config.load_edition_settings()

    # Both processor classes take the same constructor arguments, so pick
    # the class first and instantiate once.
    if mock_get_page:
        processor_cls = CustomWtp
    else:
        processor_cls = Wtp
    processor = processor_cls(db_path=f"./sqlite-{wiktlang}.db", lang_code=wiktlang)

    return WiktextractContext(processor, config)