Commit e000302a authored by Empiriker

autoformat with black

parent f0012230
@@ -18,60 +18,54 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""
    # Preserves the xml declaration and root tag
    with bz2.open(dump_path, "rt") as f:
        i = 0
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    NAMESPACE_IDS = set([])
    # Get the namespace ids of the namespaces we want to keep
    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
    namespaces = root.nsmap
    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
        if namespace.text in KEEP_NAMESPACES:
            NAMESPACE_IDS.add(int(namespace.get("key")))
    f.close()

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0
        # namespaces = {None: namespace_str}
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            namespace_id = int(page_element.findtext("ns", "0", namespaces))
            if namespace_id not in NAMESPACE_IDS:
                page_element.clear(keep_tail=True)
                continue
            output_file.write(etree.tostring(page_element).decode("utf-8"))
            if page_nums % 10:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
@@ -18,55 +18,50 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""
    # Preserves the xml declaration and root tag
    with bz2.open(dump_path, "rt") as f:
        i = 0
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    NAMESPACE_IDS = set([])
    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
    namespaces = {None: namespace_str}

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0

        # namespaces = {None: namespace_str}
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            title = page_element.findtext("title", "", namespaces)

            if title in KEEP_PAGES:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
                KEEP_PAGES.remove(title)
                if not KEEP_PAGES:
                    break
            if page_nums % 10:
                print(f"Processed {page_nums} pages")

        output_file.write(f"</{root_tag}>")
from flask import Flask, jsonify
from flask import request
from flask import Response
from flask_cors import CORS

import config
from get_wikicode import get_wikicode
@@ -10,25 +10,26 @@ from wiktextract_wrapper import Wiktextract

app = Flask(__name__)
CORS(app)


@app.route("/", methods=["GET"])
def index():
    c = request.remote_addr
    response = f"<p>Server is running, your ip is {c}</p>"
    return Response(response, 200)


@app.route("/search/<wiktlang>/<wordlang>/<word>", methods=["GET"])
def search(wiktlang, wordlang, word):
    wikicode = get_wikicode(word, wiktlang)
    if wikicode:
        en_wiktextract = Wiktextract("en", wordlang)
        try:
            resp = en_wiktextract.parse_page(word, wikicode)
            return jsonify(resp)
        except Exception as e:
            print(e)
            resp = f"""<!doctype html>
            <html>
            <head>
                <title>Error</title>
@@ -38,13 +39,13 @@ def search(wiktlang, wordlang, word):
                <p>{e}</p>
            </body>
            </html>"""
            status = 404
            mimetype = "text/html"
        finally:
            en_wiktextract.page_handler.wxr.wtp.db_conn.close()
    else:
        resp = f"""<!doctype html>
        <html>
        <head>
            <title>Error</title>
@@ -54,9 +55,10 @@ def search(wiktlang, wordlang, word):
            <p>{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org.</p>
        </body>
        </html>"""
        status = 404
        mimetype = "text/html"

    return Response(resp, status=status, mimetype=mimetype)


if __name__ == "__main__":
    app.run(host=config.host, port=config.port, debug=config.debugging)
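
For reference, a hypothetical client call against a running instance of this server. The host and port come from the config module shown next; "en", "de" and "Haus" are placeholder values for wiktlang, wordlang and word, and the sketch assumes the lookup succeeds and returns JSON rather than the HTML error page.

import json
from urllib.request import urlopen

# Placeholder request: English Wiktionary ("en"), German entries ("de"), word "Haus".
with urlopen("http://localhost:80/search/en/de/Haus") as response:
    entries = json.load(response)
print(entries)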
host = "0.0.0.0" host = "0.0.0.0"
port = 80 port = 80
debugging = True debugging = True
\ No newline at end of file
import pywikibot


def get_wikicode(title: str, wiktlang: str):
    site = pywikibot.Site(f"wiktionary:{wiktlang}")
    page = pywikibot.Page(site, title)
    return page.text
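
A hypothetical call, just to show the signature: it would fetch the raw wikitext of the "Haus" entry from en.wiktionary.org. The title and language code are placeholders, and pywikibot generally needs network access plus its usual user configuration to resolve the site.

wikitext = get_wikicode("Haus", "en")  # placeholder arguments
print(wikitext[:200])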
@@ -8,49 +8,58 @@ from wikitextprocessor import Wtp, Page

db_path = "./sqlite.db"

DEFAULT_PAGE_VALUES = {
    "namespace_id": 0,
    "model": "wikitext",
}


class Wiktextract:
    def __init__(self, wiktlang: str, wordlang: str):
        self.wiktlang = wiktlang
        self.wordlang = wordlang

        config = WiktionaryConfig(
            dump_file_lang_code=wiktlang,
            capture_language_codes=[wordlang],
            capture_translations=True,
            capture_pronunciation=True,
            capture_linkages=True,
            capture_compounds=True,
            capture_redirects=True,
            capture_examples=True,
            capture_etymologies=True,
            capture_descendants=True,
            capture_inflections=True,
        )
        wxr = WiktextractContext(Wtp(db_path=db_path), config)

        self.page_handler = page_handler
        self.page_handler.wxr = wxr

    def parse_page(self, title: str, wikicode: str):
        # add page to the database (making it accessible to LUA templates)
        self.page_handler.wxr.wtp.add_page(
            title=title,
            namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
            body=wikicode,
            model=DEFAULT_PAGE_VALUES["model"],
        )

        # create a page object (for parsing)
        page = Page(title, 0, None, True, wikicode, "wikitext")

        # parse the page
        success, ret, err = self.page_handler(page)
        result, parsing_errors = ret

        # remove the page from the database
        self.page_handler.wxr.wtp.db_conn.execute(
            "DELETE FROM pages WHERE title = ? AND model = ?",
            (title, DEFAULT_PAGE_VALUES["model"]),
        )
        self.page_handler.wxr.wtp.db_conn.commit()

        if success:
            return result
        else:
            raise Exception(err)
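
Tying the pieces together, a hypothetical script that mirrors what the /search route does: fetch wikitext with get_wikicode, parse it through the Wiktextract wrapper, and close the shared sqlite connection afterwards. "en", "de" and "Haus" are placeholder values.

from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract

word = "Haus"  # placeholder word
wikicode = get_wikicode(word, "en")

extractor = Wiktextract("en", "de")
try:
    entries = extractor.parse_page(word, wikicode)
    print(entries)
finally:
    # same cleanup the Flask route performs in its finally block
    extractor.page_handler.wxr.wtp.db_conn.close()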