diff --git a/scripts/filter_namespaces.py b/scripts/filter_namespaces.py
index 7175900702e284d489f3476af985850757088663..d009701e233d2b114f19d0d4fa47353d2e666b04 100644
--- a/scripts/filter_namespaces.py
+++ b/scripts/filter_namespaces.py
@@ -18,60 +18,54 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
-
     dump_path = args.dump_path
     out_path = args.out_path
     xml_declaration = ""
     root_tag = ""
-
     # Preserves the xml declaration and root tag
-    with bz2.open(dump_path, 'rt') as f:
-        i = 0
-        for line in f:
-            if "<page" in line:
-                break
-            if not root_tag:
-                match = re.search(r"<(\w+)", line)
-                if match:
-                    root_tag = match.group(1)
-            xml_declaration += line
+    with bz2.open(dump_path, "rt") as f:
+        i = 0
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
 
     NAMESPACE_IDS = set([])
 
     # Get the namespace ids of the namespaces we want to keep
-    root = etree.fromstring(xml_declaration+ f"</{root_tag}>", etree.XMLParser())
+    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
     namespaces = root.nsmap
     for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
-        if namespace.text in KEEP_NAMESPACES:
-            NAMESPACE_IDS.add(int(namespace.get("key")))
+        if namespace.text in KEEP_NAMESPACES:
+            NAMESPACE_IDS.add(int(namespace.get("key")))
 
     f.close()
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
+        output_file.write(xml_declaration)
+        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+        page_nums = 0
 
-
-    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
-        output_file.write(xml_declaration)
-        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
-        page_nums = 0
-
-        # namespaces = {None: namespace_str}
-        for _, page_element in etree.iterparse(
-            p.stdout, tag=f"{{{namespace_str}}}page"
-        ):
-            page_nums += 1
-
-            namespace_id = int(page_element.findtext("ns", "0", namespaces))
-            if (
-                namespace_id not in NAMESPACE_IDS
+        # namespaces = {None: namespace_str}
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
         ):
-                page_element.clear(keep_tail=True)
-                continue
+            page_nums += 1
+
+            namespace_id = int(page_element.findtext("ns", "0", namespaces))
+            if namespace_id not in NAMESPACE_IDS:
+                page_element.clear(keep_tail=True)
+                continue
 
-            output_file.write(etree.tostring(page_element).decode('utf-8'))
+            output_file.write(etree.tostring(page_element).decode("utf-8"))
 
-            if page_nums % 10:
-                print(f"Processed {page_nums} pages")
+            if page_nums % 10:
+                print(f"Processed {page_nums} pages")
 
-        output_file.write(f"</{root_tag}>")
+        output_file.write(f"</{root_tag}>")
diff --git a/scripts/filter_page.py b/scripts/filter_page.py
index 95a0b5880bf044d82d2d1cc0df4ab72a140a859c..01ec07d24c899f8564414dfc357398ab745c5e33 100644
--- a/scripts/filter_page.py
+++ b/scripts/filter_page.py
@@ -18,55 +18,50 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
-
     dump_path = args.dump_path
     out_path = args.out_path
     xml_declaration = ""
     root_tag = ""
-
     # Preserves the xml declaration and root tag
-    with bz2.open(dump_path, 'rt') as f:
-        i = 0
-        for line in f:
-            if "<page" in line:
-                break
-            if not root_tag:
-                match = re.search(r"<(\w+)", line)
-                if match:
-                    root_tag = match.group(1)
-            xml_declaration += line
+    with bz2.open(dump_path, "rt") as f:
+        i = 0
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
 
     NAMESPACE_IDS = set([])
-
     namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
     namespaces = {None: namespace_str}
-
-    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
-        output_file.write(xml_declaration)
-        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
-        page_nums = 0
-        # namespaces = {None: namespace_str}
-        for _, page_element in etree.iterparse(
-            p.stdout, tag=f"{{{namespace_str}}}page"
-        ):
-            page_nums += 1
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
+        output_file.write(xml_declaration)
+        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+        page_nums = 0
 
-            title = page_element.findtext("title", "", namespaces)
+        # namespaces = {None: namespace_str}
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
+        ):
+            page_nums += 1
 
-            if title in KEEP_PAGES:
-                output_file.write(etree.tostring(page_element).decode('utf-8'))
-                KEEP_PAGES.remove(title)
-
-                if not KEEP_PAGES:
-                    break
+            title = page_element.findtext("title", "", namespaces)
 
-            if page_nums % 10:
-                print(f"Processed {page_nums} pages")
+            if title in KEEP_PAGES:
+                output_file.write(etree.tostring(page_element).decode("utf-8"))
+                KEEP_PAGES.remove(title)
 
-        output_file.write(f"</{root_tag}>")
+                if not KEEP_PAGES:
+                    break
 
+            if page_nums % 10:
+                print(f"Processed {page_nums} pages")
 
+        output_file.write(f"</{root_tag}>")
diff --git a/src/app.py b/src/app.py
index c1e203479a9560971b82c98653160ac0ce7594c1..d148407946d58cc3477a4110558509599706ff4c 100644
--- a/src/app.py
+++ b/src/app.py
@@ -1,7 +1,7 @@
 from flask import Flask, jsonify
 from flask import request
-from flask import Response
-from flask_cors import CORS
+from flask import Response
+from flask_cors import CORS
 
 import config
 from get_wikicode import get_wikicode
@@ -10,25 +10,26 @@ from wiktextract_wrapper import Wiktextract
 
 app = Flask(__name__)
 CORS(app)
 
-@app.route('/', methods=['GET'])
+
+@app.route("/", methods=["GET"])
 def index():
     c = request.remote_addr
     response = f"<p>Server is running, your ip is {c}</p>"
     return Response(response, 200)
 
-@app.route('/search/<wiktlang>/<wordlang>/<word>', methods=['GET'])
+@app.route("/search/<wiktlang>/<wordlang>/<word>", methods=["GET"])
 def search(wiktlang, wordlang, word):
     wikicode = get_wikicode(word, wiktlang)
     if wikicode:
-        en_wiktextract = Wiktextract("en", wordlang)
-        try:
-            resp = en_wiktextract.parse_page(word, wikicode)
-            return jsonify(resp)
-
-        except Exception as e:
-            print(e)
-            resp = f"""<!doctype html>
+        en_wiktextract = Wiktextract("en", wordlang)
+        try:
+            resp = en_wiktextract.parse_page(word, wikicode)
+            return jsonify(resp)
+
+        except Exception as e:
+            print(e)
+            resp = f"""<!doctype html>
 <html>
 <head>
   <title>Error</title>
@@ -38,13 +39,13 @@ def search(wiktlang, wordlang, word):
   <p>{e}</p>
 </body>
 </html>"""
-            status = 404
-            mimetype='text/html'
-        finally:
-            en_wiktextract.page_handler.wxr.wtp.db_conn.close()
+            status = 404
+            mimetype = "text/html"
+        finally:
+            en_wiktextract.page_handler.wxr.wtp.db_conn.close()
 
     else:
-        resp = f"""<!doctype html>
+        resp = f"""<!doctype html>
 <html>
 <head>
   <title>Error</title>
@@ -54,9 +55,10 @@ def search(wiktlang, wordlang, word):
   <p>{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org.</p>
 </body>
 </html>"""
-        status = 404
-        mimetype='text/html'
+        status = 404
+        mimetype = "text/html"
     return Response(resp, status=status, mimetype=mimetype)
 
+
 if __name__ == "__main__":
     app.run(host=config.host, port=config.port, debug=config.debugging)
diff --git a/src/config.py b/src/config.py
index b592f306c6011a9e7c7e36be652fdcc0f737e4da..c7d354923ac874bc7da5c2c0849a2ade714e5a79 100644
--- a/src/config.py
+++ b/src/config.py
@@ -1,3 +1,3 @@
 host = "0.0.0.0"
 port = 80
-debugging = True
\ No newline at end of file
+debugging = True
diff --git a/src/get_wikicode.py b/src/get_wikicode.py
index 91b6ecedf16394cc61f583db48f5d5e2473a63e2..24291977bae17f9bfe128d43f80202aeb7b3eea7 100644
--- a/src/get_wikicode.py
+++ b/src/get_wikicode.py
@@ -1,6 +1,7 @@
 import pywikibot
 
-def get_wikicode(title:str, wiktlang:str):
-    site = pywikibot.Site(f"wiktionary:{wiktlang}")
-    page = pywikibot.Page(site, title)
-    return page.text
\ No newline at end of file
+
+def get_wikicode(title: str, wiktlang: str):
+    site = pywikibot.Site(f"wiktionary:{wiktlang}")
+    page = pywikibot.Page(site, title)
+    return page.text
diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
index e9b9c223ea788d3553f9432fa04da0c072593bed..b3c46b477092eef9ea7728f3151544efb4aa6d89 100644
--- a/src/wiktextract_wrapper.py
+++ b/src/wiktextract_wrapper.py
@@ -8,49 +8,58 @@ from wikitextprocessor import Wtp, Page
 
 db_path = "./sqlite.db"
 DEFAULT_PAGE_VALUES = {
-    "namespace_id": 0,
-    "model": 'wikitext',
+    "namespace_id": 0,
+    "model": "wikitext",
 }
+
+
 class Wiktextract:
-    def __init__(self, wiktlang:str, wordlang:str):
-        self.wiktlang = wiktlang
-        self.wordlang = wordlang
-
-        config = WiktionaryConfig(
-            dump_file_lang_code=wiktlang,
-            capture_language_codes=[wordlang],
-            capture_translations=True,
-            capture_pronunciation=True,
-            capture_linkages=True,
-            capture_compounds=True,
-            capture_redirects=True,
-            capture_examples=True,
-            capture_etymologies=True,
-            capture_descendants=True,
-            capture_inflections=True,)
-        wxr = WiktextractContext(Wtp(db_path=db_path), config)
-
-        self.page_handler = page_handler
-        self.page_handler.wxr = wxr
-
-    def parse_page(self, title:str, wikicode:str):
-        # add page to the database (making it accessible to LUA templates)
-        self.page_handler.wxr.wtp.add_page(title=title, namespace_id=DEFAULT_PAGE_VALUES["namespace_id"], body=wikicode, model=DEFAULT_PAGE_VALUES["model"])
-
-        # create a page object (for parsing)
-        page = Page(title, 0, None, True, wikicode, 'wikitext')
-
-        # parse the page
-        success, ret, err = self.page_handler(page)
-        result, parsing_errors = ret
-
-        # remove the page from the database
-        self.page_handler.wxr.wtp.db_conn.execute("DELETE FROM pages WHERE title = ? AND model = ?", (title, DEFAULT_PAGE_VALUES["model"]))
-        self.page_handler.wxr.wtp.db_conn.commit()
-
-        if success:
-            return result
-        else:
-            raise Exception(err)
+    def __init__(self, wiktlang: str, wordlang: str):
+        self.wiktlang = wiktlang
+        self.wordlang = wordlang
+
+        config = WiktionaryConfig(
+            dump_file_lang_code=wiktlang,
+            capture_language_codes=[wordlang],
+            capture_translations=True,
+            capture_pronunciation=True,
+            capture_linkages=True,
+            capture_compounds=True,
+            capture_redirects=True,
+            capture_examples=True,
+            capture_etymologies=True,
+            capture_descendants=True,
+            capture_inflections=True,
+        )
+        wxr = WiktextractContext(Wtp(db_path=db_path), config)
+
+        self.page_handler = page_handler
+        self.page_handler.wxr = wxr
+
+    def parse_page(self, title: str, wikicode: str):
+        # add page to the database (making it accessible to LUA templates)
+        self.page_handler.wxr.wtp.add_page(
+            title=title,
+            namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
+            body=wikicode,
+            model=DEFAULT_PAGE_VALUES["model"],
+        )
+
+        # create a page object (for parsing)
+        page = Page(title, 0, None, True, wikicode, "wikitext")
+
+        # parse the page
+        success, ret, err = self.page_handler(page)
+        result, parsing_errors = ret
+        # remove the page from the database
+        self.page_handler.wxr.wtp.db_conn.execute(
+            "DELETE FROM pages WHERE title = ? AND model = ?",
+            (title, DEFAULT_PAGE_VALUES["model"]),
+        )
+        self.page_handler.wxr.wtp.db_conn.commit()
+        if success:
+            return result
+        else:
+            raise Exception(err)
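Both filter scripts keep memory flat the same way: lxml's iterparse yields each <page> element as soon as its closing tag is parsed, the ns (or title) child is inspected, and clear(keep_tail=True) frees the subtree when the page is not kept. A minimal, self-contained sketch of that pattern, using an invented two-page sample instead of a real bz2 dump (the XML, the namespace set, and all names below are illustrative only):

import io

from lxml import etree

# Illustrative stand-in for a MediaWiki export dump; real dumps are bz2-compressed
# and streamed through decompress_dump_file, but the element structure is the same.
NS = "http://www.mediawiki.org/xml/export-0.10/"
SAMPLE = f"""<mediawiki xmlns="{NS}">
  <page><title>keep me</title><ns>0</ns></page>
  <page><title>drop me</title><ns>2</ns></page>
</mediawiki>""".encode("utf-8")

KEEP_NS_IDS = {0}  # hypothetical: keep only the main namespace
namespaces = {None: NS}

for _, page in etree.iterparse(io.BytesIO(SAMPLE), tag=f"{{{NS}}}page"):
    ns_id = int(page.findtext("ns", "0", namespaces))
    if ns_id not in KEEP_NS_IDS:
        page.clear(keep_tail=True)  # release the subtree of pages we drop
        continue
    print(etree.tostring(page).decode("utf-8"))  # kept pages are written out verbatim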
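Once src/app.py is up, the /search route can be exercised with any HTTP client. A sketch using requests, assuming a local instance on the defaults from src/config.py (0.0.0.0:80); the word and language codes are placeholders:

import requests

BASE_URL = "http://localhost:80"  # assumed local deployment

# Route shape from app.py: /search/<wiktlang>/<wordlang>/<word>
resp = requests.get(f"{BASE_URL}/search/en/fr/chat", timeout=60)
if resp.ok:
    print(resp.json())  # wiktextract output serialized by jsonify
else:
    print(resp.status_code)  # failures come back as a 404 HTML error page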
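The wrapper can also be driven without Flask, mirroring what the /search handler does. A sketch assuming it is run from src/ with the project's dependencies and sqlite.db in place; the word and language codes are examples:

from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract

word, wiktlang, wordlang = "chat", "en", "fr"  # example inputs

wikicode = get_wikicode(word, wiktlang)
if wikicode:
    extractor = Wiktextract("en", wordlang)  # app.py always parses with the English extractor
    try:
        print(extractor.parse_page(word, wikicode))
    finally:
        # same cleanup as the route's finally block
        extractor.page_handler.wxr.wtp.db_conn.close()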