#!/usr/bin/env python3
import importlib
import json

import pywikibot
import wikitextparser

#######
# Oral
#######
class Sound:
    def __init__(self, url, accent):
        self.url = url
        self.accent = accent

    def __eq__(self, other):
        return self.url == other.url and self.accent == other.accent

    def serializable(self):
        if self.accent is None:
            res = {"url": self.url}
        else:
            res = {"accent": self.accent, "url": self.url}
        return res


class Pronunciation:
    def __init__(self):
        self.ipa = None
        self.sounds = []
        self.accent = None

    def set_transcription(self, tscpt):
        self.ipa = tscpt

    def set_accent(self, accent):
        self.accent = accent

    def add_sound(self, url, accent=None):
        self.sounds.append(Sound(url, accent))

    def serializable(self):
        snds = [s.serializable() for s in self.sounds]
        if self.accent is None:
            res = {"transcript": self.ipa, "sounds": snds}
        else:
            res = {"accent": self.accent, "transcript": self.ipa, "sounds": snds}
        return res

    def __str__(self):
        return f"{self.serializable()}"

    def __eq__(self, other):
        res = self.ipa == other.ipa and self.accent == other.accent and len(self.sounds) == len(other.sounds)
        i = 0
        while res and i < len(self.sounds):
            res = self.sounds[i] == other.sounds[i]
            i += 1
        return res


#######
# Metadata
## TODO:
# * POS: create a POS class with its dependent features (e.g. masc for fr)
#######

#######
# Senses
# TODO: create a Translations class
#######
class Definition:
    def __init__(self, lang, text):
        self.lang = lang
        self.text = text

    def __eq__(self, other):
        return self.lang == other.lang and self.text == other.text

    def serializable(self):
        return {"lang": self.lang, "definition": self.text}


class Translation(Definition):
    def serializable(self):
        return {"lang": self.lang, "translation": self.text}


class Example:
    def __init__(self, transcript, source=None, url=None):
        self.text = transcript
        self.source = source
        self.url = url

    def __eq__(self, other):
        return self.text == other.text and self.source == other.source and self.url == other.url

    def serializable(self):
        res = {"example": self.text}
        if self.source is not None:
            res["source"] = self.source
        if self.url is not None:
            res["url"] = self.url
        return res


class Sense:
    def __init__(self, label):
        self.label = label         # the sense identifier
        self.definitions = []      # list of definitions (each with a language and a text)
        self.examples = []         # list of examples (text is mandatory; source and url are optional)
        self.translations = []     # list of translations into other languages
        self.domain = None         # usage domain of the word in this sense

    def set_domain(self, d):
        self.domain = d

    def add_def(self, lang, definition):
        theDef = Definition(lang, definition)
        if theDef not in self.definitions:
            self.definitions.append(theDef)

    def add_example(self, transcript, src=None, url=None):
        theEx = Example(transcript, src, url)
        if theEx not in self.examples:
            self.examples.append(theEx)

    def add_translation(self, lang, translation):
        theTranslation = Translation(lang, translation)
        if theTranslation not in self.translations:
            self.translations.append(theTranslation)

    def __eq__(self, other):
        res = self.label == other.label \
            and len(self.definitions) == len(other.definitions) \
            and len(self.examples) == len(other.examples) \
            and len(self.translations) == len(other.translations) \
            and self.domain == other.domain
        i = 0
        while res and i < len(self.examples):
            res = self.examples[i] in other.examples
            i += 1
        i = 0
        while res and i < len(self.translations):
            res = self.translations[i] in other.translations
            i += 1
        i = 0
        while res and i < len(self.definitions):
            res = self.definitions[i] in other.definitions
            i += 1
        return res

    def serializable(self):
        res = {self.label: {}}
        if self.domain is not None:
            res[self.label]["domain"] = self.domain
        res[self.label]["defs"] = [d.serializable() for d in self.definitions]
        res[self.label]["exs"] = [e.serializable() for e in self.examples]
        res[self.label]["trad"] = [t.serializable() for t in self.translations]
        return res


class Entry:
    def __init__(self, lemma):
        self.lemma = lemma
        self.pronunciations = []
        self.pos = None
        self.senses = []

    def set_pronunciations(self, pron):
        if isinstance(pron, Pronunciation):
            self.pronunciations.append(pron)
        elif isinstance(pron, list):
            for p in pron:
                if isinstance(p, Pronunciation):
                    self.pronunciations.append(p)
                else:
                    raise ValueError(f"Entry.set_pronunciations: {p} is not a Pronunciation object ({p.__class__.__name__}).")
        else:
            raise ValueError(f"Entry.set_pronunciations: {pron} is not a Pronunciation object ({pron.__class__.__name__}).")

    def set_pos(self, pos):
        self.pos = pos

    def set_senses(self, senses):
        self.senses = senses

    def is_valid(self):
        return self.lemma is not None and len(self.pronunciations) > 0 and self.pos is not None and len(self.senses) > 0

    def __eq__(self, other):
        res = self.lemma == other.lemma and self.pos == other.pos \
            and len(self.pronunciations) == len(other.pronunciations) \
            and len(self.senses) == len(other.senses)
        i = 0
        while res and i < len(self.senses):
            res = self.senses[i] == other.senses[i]
            i += 1
        i = 0
        while res and i < len(self.pronunciations):
            res = self.pronunciations[i] == other.pronunciations[i]
            i += 1
        return res

    def serializable(self):
        res = {self.lemma: {"pos": self.pos}}
        res[self.lemma]["pronunciations"] = [p.serializable() for p in self.pronunciations]
        res[self.lemma]["senses"] = [s.serializable() for s in self.senses]
        return res

    def __str__(self):
        res = f"{self.lemma} ({self.pos})\n"
        for p in self.pronunciations:
            res += f"{str(p)}\n"
        for s in self.senses:
            res += f"{str(s)}\n"
        return res


class ParserContext:
    def __init__(self, entry):
        self.lemma = entry
        self.context = []   # stack of {'wiki': section, ...entry info} dicts
        self.entries = []

    def get_level(self):
        if len(self.context) == 0:
            res = -1
        else:
            res = self.context[-1]["wiki"].level
        return res

    def push(self, wiki_context):
        self.context.append({"wiki": wiki_context})

    def pop(self, testNewEntry=True):
        if testNewEntry:
            self.create_entry()
        return self.context.pop()

    def set_top_wiki(self, wiki_context):
        if len(self.context) == 0:
            self.push(wiki_context)
        else:
            self.context[-1]['wiki'] = wiki_context

    def set_top_entry_info(self, key, entry_context, testNewEntry=True):
        if len(self.context) == 0:
            raise ValueError(f"Trying to set up entry info ({entry_context}) in an empty ParserContext.")
        else:
            self.context[-1][key] = entry_context
            if testNewEntry:
                self.create_entry()

    def create_entry(self):
        # The key dictionaries never contain senses or POS at the same level
        res = Entry(self.lemma)
        for l in self.context:
            print(l.keys())  # debug output
            if "pro" in l.keys():
                res.set_pronunciations(l['pro'])
            if "ety" in l.keys():
                pass  # etymology is ignored for now
            if "POS" in l.keys():
                res.set_pos(l['POS'])
            if "senses" in l.keys():
                res.set_senses(l['senses'])
        # TODO: add the other kinds of information
        if res.is_valid() and res not in self.entries:
            self.entries.append(res)
        else:
            res = None
        return res

    def debug_top(self):
        res = "Context: "
        if len(self.context) == 0:
            res += "0"
        else:
            info = ""
            for k, v in self.context[-1].items():
                if k != 'wiki':
                    if info != "":
                        info += "\n\t\t\t"
                    info += f"{k} → {str(v)}"
            res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}"
        return res
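# A minimal usage sketch of the sense model above (hypothetical data, kept as
# a comment so it does not run on import): add_def/add_example/add_translation
# deduplicate on insertion via the __eq__ methods, so adding the same
# definition twice is a no-op.
#
#   s = Sense("blue_1")
#   s.add_def("en", "of the colour of a clear sky")
#   s.add_translation("fr", "bleu")
#   s.add_def("en", "of the colour of a clear sky")   # duplicate, ignored
#   s.serializable()
#   # → {'blue_1': {'defs': [{'lang': 'en', 'definition': '...'}],
#   #               'exs': [], 'trad': [{'lang': 'fr', 'translation': 'bleu'}]}}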
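# Illustration (hypothetical values, as a comment) of how ParserContext
# mirrors the section hierarchy of the wikitext: push() on deeper headings,
# pop() on shallower ones, and create_entry() assembles an Entry from the
# 'pro', 'POS' and 'senses' information spread across the current stack.
#
#   ctx = ParserContext("blue")
#   ctx.push(pron_section)                        # a wikitextparser Section
#   ctx.set_top_entry_info('pro', [a_pron])       # too early: no POS/senses yet
#   ctx.push(pos_section)
#   ctx.set_top_entry_info('POS', 'noun', False)
#   ctx.set_top_entry_info('senses', [a_sense])   # now create_entry() succeeds
#   ctx.entries                                   # → [Entry('blue')]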
"\n\t\t\t" info += f"{k} → {str(v)}" res += f"{len(self.context)*'='} {self.context[-1]['wiki'].level*'#'} {self.context[-1]['wiki'].title} / {info}" return res class Wikstraktor: @classmethod def get_instance(cls, wiki_language, entry_language): try: m_name = f"{wiki_language}_{entry_language}".capitalize() instance = getattr(importlib.import_module(f"parsers.{m_name.lower()}"), f"{m_name}_straktor")() except ModuleNotFoundError: print(f"parsers.{m_name.lower()} module not found or {m_name}_straktor not found in module") instance = None return instance def __init__(self): self.entries = [] self.pwb = pywikibot self.wtp = wikitextparser self.parserContext = None def get_file_url(self, file_page_name): res = None try: f = self.pwb.FilePage(self.site, file_page_name) res = f.get_file_url() except pywikibot.exceptions.NoPageError: print(f"{file_page_name} does not exist in {self.site}.") return res #retrieves the content of a page and processes it (adding the entries to the list of entries) #returns the number of entries added def fetch(self, graphy): nb_entries_added = 0 page = self.pwb.Page(self.site, graphy) to_parse = [] if page.text != "": sections = self.wtp.parse(page.text).sections found = False i = 0 ### find language while i < len(sections) and not found: found = sections[i].title != None and sections[i].title.capitalize() == self.constants[self.entry_language] if not found: i += 1 if found: nb_entries_added = self.parse(page.title(), sections[i].sections)#self.wtp.parse(s.contents).sections) return nb_entries_added def parse(self, entry, sections): self.parserContext = ParserContext(entry) for s in sections: if s.title != None : #handle wiki context if self.parserContext.get_level() < s.level: self.parserContext.push(s) else: while self.parserContext.get_level() > s.level: self.parserContext.pop() self.parserContext.set_top_wiki(s) stitle = self.wtp.parse(s.title).templates if stitle == []: stitle = s.title else: stitle = stitle[0].arguments[0].value if self.isPro(stitle): self.parserContext.set_top_entry_info('pro', self.process_pronunciation(self.wtp.parse(s.contents))) elif self.isEty(stitle): self.parserContext.set_top_entry_info('ety', self.process_etymology(self.wtp.parse(s.contents))) # elif stitle in self.constants['POS'].keys(): else: pos = self.process_POS(stitle) if pos != None : self.parserContext.set_top_entry_info('POS', pos, False) self.parserContext.set_top_entry_info('senses', self.process_senses(entry, pos+str(len(self.parserContext.entries)), self.wtp.parse(s.contents))) res = len(self.parserContext.entries) if res > 0: for e in self.parserContext.entries: self.entries.append(e) return res def isPro(self, title): if type(self.constants['pro']) == str: res = title == self.constants['pro'] else: res = title in self.constants['pro'] print(title, res) return res def isEty(self, title): if type(self.constants['ety']) == str: res = title == self.constants['ety'] else: res = title in self.constants['ety'] return res def process_POS(self, parsedwikitext): pass#in subclass def process_pronunciation(self, parsedwikitext): pass#in subclass def process_etymology(self, parsedwikitext): pass#in subclass def process_senses(self, entry, pos, parsedwikitext): pass#in subclass def __str__(self): res = [] for e in self.entries: res.append(e.serializable()) return json.dumps(res, ensure_ascii=False, indent=2) if __name__ == "__main__": #e = Wikstraktor.get_instance('en', "en") f = Wikstraktor.get_instance('fr', 'en') # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à 
if __name__ == "__main__":
    #e = Wikstraktor.get_instance('en', 'en')
    f = Wikstraktor.get_instance('fr', 'en')
    # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat----parent.wav"))
    # print(e.get_file_url("File:LL-Q1860 (eng)-Nattes à chat-parent.wav"))
    #e.fetch("water")
    f.fetch("blue")
    # print(e.fetch("test"), "entries added")
    #print(e)
    file_path = 'test.json'
    with open(file_path, "w") as fichier:
        fichier.write(str(f))
    # site = pywikibot.Site(f'wiktionary:en')
    # p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat----parent.wav")
    # print(p)
    # if not p.exists():
    #     site = pywikibot.Site('commons')
    #     p = pywikibot.FilePage(site, "File:LL-Q1860 (eng)-Nattes à chat-parent.wav")
    # print(p.get_file_url())
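# For reference, str(f) serializes self.entries with json.dumps, so test.json
# holds a list shaped like the following (illustrative values only):
#
# [
#   {
#     "blue": {
#       "pos": "noun",
#       "pronunciations": [
#         {"transcript": "/bluː/", "sounds": [{"url": "https://..."}]}
#       ],
#       "senses": [
#         {"noun0": {"defs": [...], "exs": [...], "trad": [...]}}
#       ]
#     }
#   }
# ]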