From 84b58dcc631905bc5fb6aa7e83f66a369d6d500c Mon Sep 17 00:00:00 2001
From: Empiriker <till.ueberfries@gmail.com>
Date: Fri, 8 Sep 2023 14:10:13 +0200
Subject: [PATCH] add flask app with wiktextract wrapper

---
 .gitignore                 |  6 ++++
 README.md                  | 21 +++++++++++++
 requirements.txt           |  3 ++
 src/app.py                 | 61 ++++++++++++++++++++++++++++++++++++++
 src/config.py              |  3 ++
 src/get_wikicode.py        |  6 ++++
 src/wiktextract_wrapper.py | 47 +++++++++++++++++++++++++++++
 7 files changed, 147 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 src/app.py
 create mode 100644 src/config.py
 create mode 100644 src/get_wikicode.py
 create mode 100644 src/wiktextract_wrapper.py

diff --git a/.gitignore b/.gitignore
index 68bc17f..f84499c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
+/wikstraktor
+/wiktextract
+
+throttle.ctrl
+apicache-py3
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index b4d2f94..10310d1 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,22 @@
 # live-query-wiktextract
+
+## Installation
+
+1. Install wikitextprocessor from source:
+
+```
+git clone https://github.com/tatuylonen/wikitextprocessor.git
+cd wikitextprocessor
+python -m pip install -U pip
+python -m pip install --use-pep517 .
+```
+
+- Commit `e5296c16f2d715e62121f23cb5057374da48cda3` was used during development.
+
+2. Clone wiktextract
+
+```
+git clone https://github.com/tatuylonen/wiktextract.git
+```
+
+- Commit `205c4a2d88c27113f0117e0095f466605976af81` was used during development.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c799dfc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pywikibot==8.3.2
+Flask==2.3.3
+flask_cors==4.0.0
\ No newline at end of file
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000..7a4d2d3
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,71 @@
+"""Flask front end that runs wiktextract on live Wiktionary pages."""
+import html
+
+from flask import Flask, jsonify
+from flask import request
+from flask import Response
+from flask_cors import CORS
+
+import config
+from get_wikicode import get_wikicode
+from wiktextract_wrapper import Wiktextract
+
+app = Flask(__name__)
+CORS(app)
+
+
+def _error_page(heading, message, status):
+    """Return an HTML error ``Response`` with the given status code.
+
+    ``heading`` and ``message`` are HTML-escaped because they carry
+    request-derived text (URL path segments, exception messages).
+    """
+    body = f"""<!doctype html>
+<html>
+  <head>
+    <title>Error</title>
+  </head>
+  <body>
+    <h1>{html.escape(str(heading))}</h1>
+    <p>{html.escape(str(message))}</p>
+  </body>
+</html>"""
+    return Response(body, status=status, mimetype='text/html')
+
+
+@app.route('/', methods=['GET'])
+def index():
+    """Report that the server is running and echo the caller's address."""
+    client_ip = request.remote_addr
+    return Response(f"<p>Server is running, your ip is {client_ip}</p>", 200)
+
+
+@app.route('/search/<wiktlang>/<wordlang>/<word>', methods=['GET'])
+def search(wiktlang, wordlang, word):
+    """Parse the page ``word`` of ``wiktlang``.wiktionary.org.
+
+    Returns the parse result as JSON, a 404 HTML page when no wikitext is
+    found for the title, or a 500 HTML page when parsing fails.
+    """
+    wikicode = get_wikicode(word, wiktlang)
+    if not wikicode:
+        message = (
+            f"{word} is unknown in \u201c{wordlang}\u201d "
+            f"in {wiktlang}.wiktionary.org."
+        )
+        return _error_page(word, message, 404)
+
+    # NOTE(review): the extractor is pinned to English whatever languages
+    # were requested -- confirm this is intended.
+    en_wiktextract = Wiktextract("en", "en")
+    try:
+        return jsonify(en_wiktextract.parse_page(word, wikicode))
+    except Exception as e:
+        # Request boundary: log the traceback and answer with an error
+        # page. A parse failure is a server-side fault, hence 500.
+        app.logger.exception("Failed to parse %r", word)
+        return _error_page(word, e, 500)
+
+
+if __name__ == "__main__":
+    app.run(host=config.host, port=config.port, debug=config.debugging)
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..b592f30
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,6 @@
+# Settings passed to app.run() in app.py.
+host = "0.0.0.0"
+port = 80
+# Keep the debugger off by default: with host 0.0.0.0 it would expose an
+# interactive console to every client that can reach the server.
+debugging = False
diff --git a/src/get_wikicode.py b/src/get_wikicode.py
new file mode 100644
index
0000000..91b6ece
--- /dev/null
+++ b/src/get_wikicode.py
@@ -0,0 +1,8 @@
+"""Fetch raw wikitext from Wiktionary through pywikibot."""
+import pywikibot
+
+
+def get_wikicode(title: str, wiktlang: str):
+    """Return the ``text`` of page ``title`` on the ``wiktlang`` Wiktionary."""
+    wiktionary = pywikibot.Site(f"wiktionary:{wiktlang}")
+    return pywikibot.Page(wiktionary, title).text
diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
new file mode 100644
index 0000000..ee2ef78
--- /dev/null
+++ b/src/wiktextract_wrapper.py
@@ -0,0 +1,60 @@
+"""Thin wrapper that runs wiktextract's page handler on a single page."""
+from wiktextract import (
+    WiktextractContext,
+    WiktionaryConfig,
+)
+from wiktextract.wiktionary import page_handler
+from wikitextprocessor import Wtp, Page
+
+db_path = "./sqlite.db"
+
+
+class Wiktextract:
+    """Parse individual Wiktionary pages with wiktextract."""
+
+    def __init__(self, wiktlang: str, wordlang: str):
+        """Configure wiktextract for one edition and one captured language.
+
+        ``wiktlang`` is passed as ``dump_file_lang_code`` and ``wordlang``
+        as the only entry of ``capture_language_codes``.
+        """
+        self.wiktlang = wiktlang
+        self.wordlang = wordlang
+
+        extract_config = WiktionaryConfig(
+            dump_file_lang_code=wiktlang,
+            capture_language_codes=[wordlang],
+            capture_translations=True,
+            capture_pronunciation=True,
+            capture_linkages=True,
+            capture_compounds=True,
+            capture_redirects=True,
+            capture_examples=True,
+            capture_etymologies=True,
+            capture_descendants=True,
+            capture_inflections=True,
+        )
+        processor = Wtp(db_path=db_path)
+
+        # The context is attached to the imported page_handler function
+        # itself, so it is shared by every Wiktextract instance.
+        page_handler.wxr = WiktextractContext(processor, extract_config)
+        self.page_handler = page_handler
+
+    def parse_page(self, title: str, wikicode: str):
+        """Store ``wikicode`` under ``title`` and return its parse result.
+
+        Raises ``Exception`` carrying the handler's error value when the
+        handler reports failure.
+        """
+        handler = self.page_handler
+        # Register the page in the processor's database before parsing.
+        handler.wxr.wtp.add_page(
+            title=title, namespace_id=0, body=wikicode, model='wikitext'
+        )
+
+        page = Page(title, 0, None, True, wikicode, 'wikitext')
+        success, ret, err = handler(page)
+        if not success:
+            raise Exception(err)
+        return ret
-- 
GitLab