diff --git a/README.md b/README.md index e405f20412489f1603ee9ca4d733593a9642f0bc..ac827d51cc629e883d19e63403450d4122f17735 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,20 @@ This project provides a light-weight wrapper to the [wiktextract](https://github The FLASK app accepts GET request at the url ``` -localhost:5000/search/<wiktlang>/<wordlang>/<word> +localhost:5000/simplesearch/<lang>/<word> +localhost:5000/search/<wiktlang>/<wordlang>/<word>/<format> ``` - -where `<wiktlang>` specifies the language of the desired Wiktionary edition, `<wordlang>` the language of the word, and `<word>` the word itself to be queried. The route returns the extracted JSON object for the given query. +* `simplesearch` returns a non-ASCII wikstraktor-formatted JSON entry + * `<lang>`: the language of both the Wiktionary edition and the word, + * `<word>`: the word form to be queried. +* `search` returns a JSON-formatted entry + * `<wiktlang>`: the language of the desired Wiktionary edition, + * `<wordlang>`: the language of the word, + * `<word>`: the word itself to be queried. 
+ * `<format>`: the format of the output + * `wiktextract` or `xtr`: wiktextract native format + * `wikstraktor` or `strkt`: conversion to wikstraktor format + * the prefix `a_` (e.g. `a_strkt`) can be added to force ASCII-only output ## Local installation diff --git a/src/app.py b/src/app.py index 1c50351ed7a99622bd77fbab59ace14317affb36..618efdf79d329a463ed77e73de1917f9e2abc3fa 100644 --- a/src/app.py +++ b/src/app.py @@ -1,4 +1,4 @@ -from flask import Flask, Response, jsonify, request +from flask import Flask, Response, json, jsonify, request from flask_cors import CORS import config @@ -15,12 +15,12 @@ def index(): return Response(response, 200) -@app.route("/search/<wiktlang>/<wordlang>/<word>", methods=["GET"]) -def search(wiktlang, wordlang, word): - if wiktlang not in config.supported_wiktlangs: - return jsonify({"error": f"Language {wiktlang} not supported"}), 400 +@app.route("/simplesearch/<lang>/<word>", methods=["GET"]) +def search(lang, word): + if lang not in config.supported_wiktlangs: + return jsonify({"error": f"Language {lang} not supported"}), 400 - wiktextractor = Wiktextract(wiktlang, wordlang) + wiktextractor = Wiktextract(lang, lang) try: resp = wiktextractor.parse_page(word) if resp: @@ -29,7 +29,7 @@ def search(wiktlang, wordlang, word): return ( jsonify( { - "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org." + "error": f"{word} is unknown in “{lang}” in {lang}.wiktionary.org." 
 } ), 404, @@ -44,6 +44,47 @@ def search(wiktlang, wordlang, word): if wiktextractor.wxr.thesaurus_db_conn: wiktextractor.wxr.thesaurus_db_conn.close() +@app.route("/search/<wiktlang>/<wordlang>/<word>/<format>", methods=["GET"]) +def search_and_format(wiktlang, wordlang, word, format): + if wiktlang not in config.supported_wiktlangs: + return jsonify({"error": f"Language {wiktlang} not supported"}), 400 + wiktextractor = Wiktextract(wiktlang, wordlang) + if len(format)>2 and format[0:2] in ("a_", "A_"): + ascii = True + format = format[2:] + print(ascii, format) + else: + ascii = False + try: + if format in ("wiktextract", "Wiktextract", "xtr"): + resp = wiktextractor.parse_page(word, False) + elif format in ("wikstraktor", "Wikstraktor", "strkt"): + resp = wiktextractor.parse_page(word, True) + else: + return jsonify({"error": f"{format} is not expected"}), 400 + if resp: + if not ascii: + return jsonify(resp) + else: + return Response(json.dumps(resp, ensure_ascii=True), mimetype="application/json") + else: + return ( + jsonify( + { + "error": f"{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org." 
+ } + ), + 404, + ) + + except Exception as e: + print(e) + + return jsonify({"error": "Parsing page resulted in error: " + str(e)}), 500 + finally: + wiktextractor.wxr.wtp.db_conn.close() + if wiktextractor.wxr.thesaurus_db_conn: + wiktextractor.wxr.thesaurus_db_conn.close() if __name__ == "__main__": app.run(host=config.host, port=config.port, debug=config.debugging) diff --git a/src/wiktextract_wrapper.py b/src/wiktextract_wrapper.py index 42532d4d73383dc0a8ae38c8b12e1a7e110a765b..c458dc46c7d8974e66a0b87c38c519368c76b29e 100644 --- a/src/wiktextract_wrapper.py +++ b/src/wiktextract_wrapper.py @@ -33,20 +33,18 @@ class Wiktextract: self.wxr = get_wiktextract_context(wiktlang, wordlang) - def parse_page(self, title: str): + def parse_page(self, title: str, wikstraktor_format: bool = True): page = self.wxr.wtp.get_page(title) if not page: return None result = parse_page(self.wxr, title, page.body) + if wikstraktor_format and result: + result = self.wikstraktor_format(result) + return result - converted_result = self.convert(result) - - return converted_result - #return result - - def convert(self, data_format1): + def wikstraktor_format(self, data_format1): transformed_data = [] for index, pos in enumerate(data_format1):