diff --git a/.gitignore b/.gitignore
index 68bc17f9ff2104a9d7b6777058bb4c343ca72609..f84499c19d2e46edf840fbe1278841784897397b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,9 @@
+/wikstraktor
+/wiktextract
+
+throttle.ctrl
+apicache-py3
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index b4d2f94a7cb8519716993ec367e775bc0d47baf5..10310d14985ffc13721cb9c9d45b75e1b3dd880e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,22 @@
 # live-query-wiktextract
+
+## Installation
+
+1. Install wikitextprocessor from source:
+
+```
+git clone https://github.com/tatuylonen/wikitextprocessor.git
+cd wikitextprocessor
+python -m pip install -U pip
+python -m pip install --use-pep517 .
+```
+
+- Commit `e5296c16f2d715e62121f23cb5057374da48cda3` was used during development.
+
+2. Clone wiktextract
+
+```
+git clone https://github.com/tatuylonen/wiktextract.git
+```
+
+- Commit `205c4a2d88c27113f0117e0095f466605976af81` was used during development.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c799dfccd312b832aa129d6881ef8fa33f727c0a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pywikibot==8.3.2
+Flask==2.3.3
+flask_cors==4.0.0
\ No newline at end of file
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a4d2d381e11728fe25823d04c3e3526a4a623eb
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,71 @@
+from html import escape
+
+from flask import Flask, jsonify
+from flask import request
+from flask import Response
+from flask_cors import CORS
+
+import config
+from get_wikicode import get_wikicode
+from wiktextract_wrapper import Wiktextract
+
+app = Flask(__name__)
+CORS(app)
+
+
+def _error_page(heading, message, status):
+    """Build a minimal HTML error response.
+
+    ``heading`` and ``message`` are HTML-escaped because they can carry text
+    taken straight from the request URL.
+    """
+    body = f"""<!doctype html>
+<html>
+    <head>
+        <title>Error</title>
+    </head>
+    <body>
+        <h1>{escape(str(heading))}</h1>
+        <p>{escape(str(message))}</p>
+    </body>
+</html>"""
+    return Response(body, status=status, mimetype='text/html')
+
+
+@app.route('/', methods=['GET'])
+def index():
+    """Report that the server is up and echo the caller's IP address."""
+    c = request.remote_addr
+    response = f"<p>Server is running, your ip is {c}</p>"
+    return Response(response, 200)
+
+
+@app.route('/search/<wiktlang>/<wordlang>/<word>', methods=['GET'])
+def search(wiktlang, wordlang, word):
+    """Fetch ``word`` from ``wiktlang``.wiktionary.org and extract its data.
+
+    Responds with the extraction result as JSON, a 404 HTML page when no
+    wikitext is found for the title, or a 500 HTML page when extraction fails.
+    """
+    # NOTE(review): the extractor is pinned to English and does not use
+    # wiktlang/wordlang -- confirm this is intended before generalizing.
+    en_wiktextract = Wiktextract("en", "en")
+
+    wikicode = get_wikicode(word, wiktlang)
+    if not wikicode:
+        message = (
+            f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org."
+        )
+        return _error_page(word, message, 404)
+
+    try:
+        return jsonify(en_wiktextract.parse_page(word, wikicode))
+    except Exception as e:
+        # The page exists but could not be processed: that is a server-side
+        # failure, not a missing resource, so answer 500 rather than 404.
+        print(e)
+        return _error_page(word, e, 500)
+
+
+if __name__ == "__main__":
+    app.run(host=config.host, port=config.port, debug=config.debugging)
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..b592f306c6011a9e7c7e36be652fdcc0f737e4da
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,5 @@
+host = "0.0.0.0"
+port = 80
+# Keep the Werkzeug debugger off by default: with host 0.0.0.0 it would be
+# reachable from the network, and the debugger can execute arbitrary code.
+debugging = False
diff --git a/src/get_wikicode.py b/src/get_wikicode.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b6ecedf16394cc61f583db48f5d5e2473a63e2
--- /dev/null
+++ b/src/get_wikicode.py
@@ -0,0 +1,13 @@
+import pywikibot
+
+
+def get_wikicode(title: str, wiktlang: str) -> str:
+    """Return the wikitext of page ``title`` on the ``wiktlang`` Wiktionary.
+
+    NOTE(review): callers treat a falsy result as "page not found"; this
+    presumably relies on pywikibot yielding empty text for missing pages --
+    confirm.
+    """
+    site = pywikibot.Site(f"wiktionary:{wiktlang}")
+    page = pywikibot.Page(site, title)
+    return page.text
diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee2ef7844c3c440f3977a662f1b6743c2a37d911
--- /dev/null
+++ b/src/wiktextract_wrapper.py
@@ -0,0 +1,62 @@
+from wiktextract import (
+    WiktextractContext,
+    WiktionaryConfig,
+)
+from wiktextract.wiktionary import page_handler
+from wikitextprocessor import Wtp, Page
+
+db_path = "./sqlite.db"
+
+
+class Wiktextract:
+    """Thin wrapper that runs wiktextract's page handler on a single page."""
+
+    def __init__(self, wiktlang: str, wordlang: str):
+        """Build the extraction context.
+
+        ``wiktlang`` is passed as the dump language code and ``wordlang`` as
+        the only language code to capture.
+        """
+        self.wiktlang = wiktlang
+        self.wordlang = wordlang
+
+        config = WiktionaryConfig(
+            dump_file_lang_code=wiktlang,
+            capture_language_codes=[wordlang],
+            capture_translations=True,
+            capture_pronunciation=True,
+            capture_linkages=True,
+            capture_compounds=True,
+            capture_redirects=True,
+            capture_examples=True,
+            capture_etymologies=True,
+            capture_descendants=True,
+            capture_inflections=True,
+        )
+        wxr = WiktextractContext(Wtp(db_path=db_path), config)
+
+        # page_handler is a single object imported from wiktextract, so the
+        # context attached here is shared by every instance in the process.
+        self.page_handler = page_handler
+        self.page_handler.wxr = wxr
+
+    def parse_page(self, title: str, wikicode: str):
+        """Store ``wikicode`` under ``title`` and return the handler's result.
+
+        Raises ``Exception`` carrying the handler's error value when the
+        handler reports failure.
+        """
+        # add page to the database
+        self.page_handler.wxr.wtp.add_page(
+            title=title, namespace_id=0, body=wikicode, model='wikitext'
+        )
+
+        # create a page object
+        page = Page(title, 0, None, True, wikicode, 'wikitext')
+
+        # parse the page
+        success, ret, err = self.page_handler(page)
+
+        if success:
+            return ret
+        raise Exception(err)