Commit a087f053 authored by Empiriker's avatar Empiriker

Remove obsolete scripts

parent 31b3a443

import argparse
import bz2
import re

from lxml import etree

from wikitextprocessor.dumpparser import decompress_dump_file

# MediaWiki namespaces whose pages are kept in the filtered dump.
KEEP_NAMESPACES = ["Module", "Template"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the XML declaration and header by copying every line that
    # precedes the first <page> element; also remember the root tag name.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    # Close the captured header into a parseable document and read the
    # numeric ids of the namespaces we want to keep.
    NAMESPACE_IDS = set()
    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
    namespaces = root.nsmap
    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
        if namespace.text in KEEP_NAMESPACES:
            NAMESPACE_IDS.add(int(namespace.get("key")))
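
    # For reference, the header parsed above contains a namespace table of
    # roughly this shape (standard MediaWiki export schema; keys vary by wiki):
    #   <siteinfo>
    #     <namespaces>
    #       <namespace key="10" case="first-letter">Template</namespace>
    #       <namespace key="828" case="first-letter">Module</namespace>
    #     </namespaces>
    #   </siteinfo>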

    # Stream the decompressed dump, copying only pages whose namespace id was
    # selected above; every element is cleared afterwards to bound memory use.
    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            namespace_id = int(page_element.findtext("ns", "0", namespaces))
            if namespace_id in NAMESPACE_IDS:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
            page_element.clear(keep_tail=True)
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
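
# Example invocation (the script and dump file names are hypothetical):
#   python filter_namespaces.py \
#       --dump_path dewiktionary-latest-pages-articles.xml.bz2 \
#       --out_path modules-and-templates.xml.bz2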


import argparse
import bz2
import re

from lxml import etree

from wikitextprocessor.dumpparser import decompress_dump_file

# Titles of the pages to extract from the dump.
KEEP_PAGES = ["example"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the XML declaration and header by copying every line that
    # precedes the first <page> element; also remember the root tag name.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    # Default namespace of the MediaWiki XML export schema.
    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
    namespaces = {None: namespace_str}

    # Stream the decompressed dump, copying only the pages listed in
    # KEEP_PAGES and stopping early once all of them have been found.
    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        page_nums = 0
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            title = page_element.findtext("title", "", namespaces)
            if title in KEEP_PAGES:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
                KEEP_PAGES.remove(title)
                if not KEEP_PAGES:
                    break
            page_element.clear(keep_tail=True)
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
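
# Example invocation (the script and dump file names are hypothetical):
#   python extract_pages.py \
#       --dump_path dewiktionary-latest-pages-articles.xml.bz2 \
#       --out_path example-pages.xml.bz2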