Commit e000302a authored by Empiriker

autoformat with black

parent f0012230
@@ -18,60 +18,54 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""
    # Preserves the xml declaration and root tag
    with bz2.open(dump_path, "rt") as f:
        i = 0
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    NAMESPACE_IDS = set([])
    # Get the namespace ids of the namespaces we want to keep
    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
    namespaces = root.nsmap
    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
        if namespace.text in KEEP_NAMESPACES:
            NAMESPACE_IDS.add(int(namespace.get("key")))
    f.close()

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0
        # namespaces = {None: namespace_str}
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            namespace_id = int(page_element.findtext("ns", "0", namespaces))
            if namespace_id not in NAMESPACE_IDS:
                page_element.clear(keep_tail=True)
                continue
            output_file.write(etree.tostring(page_element).decode("utf-8"))
            if page_nums % 10:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
@@ -18,55 +18,50 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""
    # Preserves the xml declaration and root tag
    with bz2.open(dump_path, "rt") as f:
        i = 0
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    NAMESPACE_IDS = set([])
    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
    namespaces = {None: namespace_str}

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0

        # namespaces = {None: namespace_str}
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            title = page_element.findtext("title", "", namespaces)

            if title in KEEP_PAGES:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
                KEEP_PAGES.remove(title)
                if not KEEP_PAGES:
                    break
            if page_nums % 10:
                print(f"Processed {page_nums} pages")

        output_file.write(f"</{root_tag}>")
from flask import Flask, jsonify
from flask import request
from flask import Response
from flask_cors import CORS

import config
from get_wikicode import get_wikicode
@@ -10,25 +10,26 @@ from wiktextract_wrapper import Wiktextract

app = Flask(__name__)
CORS(app)


@app.route("/", methods=["GET"])
def index():
    c = request.remote_addr
    response = f"<p>Server is running, your ip is {c}</p>"
    return Response(response, 200)


@app.route("/search/<wiktlang>/<wordlang>/<word>", methods=["GET"])
def search(wiktlang, wordlang, word):
    wikicode = get_wikicode(word, wiktlang)
    if wikicode:
        en_wiktextract = Wiktextract("en", wordlang)
        try:
            resp = en_wiktextract.parse_page(word, wikicode)
            return jsonify(resp)
        except Exception as e:
            print(e)
            resp = f"""<!doctype html>
            <html>
            <head>
                <title>Error</title>
@@ -38,13 +39,13 @@ def search(wiktlang, wordlang, word):
                <p>{e}</p>
            </body>
            </html>"""
            status = 404
            mimetype = "text/html"
        finally:
            en_wiktextract.page_handler.wxr.wtp.db_conn.close()
    else:
        resp = f"""<!doctype html>
        <html>
        <head>
            <title>Error</title>
@@ -54,9 +55,10 @@ def search(wiktlang, wordlang, word):
            <p>{word} is unknown in “{wordlang}” in {wiktlang}.wiktionary.org.</p>
        </body>
        </html>"""
        status = 404
        mimetype = "text/html"

    return Response(resp, status=status, mimetype=mimetype)


if __name__ == "__main__":
    app.run(host=config.host, port=config.port, debug=config.debugging)
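
For reference, a hypothetical client call against a running instance of this server. The host and port come from the config module shown next; "en", "de" and "Haus" are placeholder values for wiktlang, wordlang and word, and the sketch assumes the lookup succeeds and returns JSON rather than the HTML error page.

import json
from urllib.request import urlopen

# Placeholder request: English Wiktionary ("en"), German entries ("de"), word "Haus".
with urlopen("http://localhost:80/search/en/de/Haus") as response:
    entries = json.load(response)
print(entries)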
host = "0.0.0.0" host = "0.0.0.0"
port = 80 port = 80
debugging = True debugging = True
\ No newline at end of file
import pywikibot


def get_wikicode(title: str, wiktlang: str):
    site = pywikibot.Site(f"wiktionary:{wiktlang}")
    page = pywikibot.Page(site, title)
    return page.text
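
A hypothetical call, just to show the signature: it would fetch the raw wikitext of the "Haus" entry from en.wiktionary.org. The title and language code are placeholders, and pywikibot generally needs network access plus its usual user configuration to resolve the site.

wikitext = get_wikicode("Haus", "en")  # placeholder arguments
print(wikitext[:200])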
@@ -8,49 +8,58 @@ from wikitextprocessor import Wtp, Page

db_path = "./sqlite.db"

DEFAULT_PAGE_VALUES = {
    "namespace_id": 0,
    "model": "wikitext",
}


class Wiktextract:
    def __init__(self, wiktlang: str, wordlang: str):
        self.wiktlang = wiktlang
        self.wordlang = wordlang

        config = WiktionaryConfig(
            dump_file_lang_code=wiktlang,
            capture_language_codes=[wordlang],
            capture_translations=True,
            capture_pronunciation=True,
            capture_linkages=True,
            capture_compounds=True,
            capture_redirects=True,
            capture_examples=True,
            capture_etymologies=True,
            capture_descendants=True,
            capture_inflections=True,
        )
        wxr = WiktextractContext(Wtp(db_path=db_path), config)

        self.page_handler = page_handler
        self.page_handler.wxr = wxr

    def parse_page(self, title: str, wikicode: str):
        # add page to the database (making it accessible to LUA templates)
        self.page_handler.wxr.wtp.add_page(
            title=title,
            namespace_id=DEFAULT_PAGE_VALUES["namespace_id"],
            body=wikicode,
            model=DEFAULT_PAGE_VALUES["model"],
        )

        # create a page object (for parsing)
        page = Page(title, 0, None, True, wikicode, "wikitext")

        # parse the page
        success, ret, err = self.page_handler(page)
        result, parsing_errors = ret

        # remove the page from the database
        self.page_handler.wxr.wtp.db_conn.execute(
            "DELETE FROM pages WHERE title = ? AND model = ?",
            (title, DEFAULT_PAGE_VALUES["model"]),
        )
        self.page_handler.wxr.wtp.db_conn.commit()

        if success:
            return result
        else:
            raise Exception(err)
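
Tying the pieces together, a hypothetical script that mirrors what the /search route does: fetch wikitext with get_wikicode, parse it through the Wiktextract wrapper, and close the shared sqlite connection afterwards. "en", "de" and "Haus" are placeholder values.

from get_wikicode import get_wikicode
from wiktextract_wrapper import Wiktextract

word = "Haus"  # placeholder word
wikicode = get_wikicode(word, "en")

extractor = Wiktextract("en", "de")
try:
    entries = extractor.parse_page(word, wikicode)
    print(entries)
finally:
    # same cleanup the Flask route performs in its finally block
    extractor.page_handler.wxr.wtp.db_conn.close()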