diff --git a/scripts/filter_namespaces.py b/scripts/filter_namespaces.py
deleted file mode 100644
index d009701e233d2b114f19d0d4fa47353d2e666b04..0000000000000000000000000000000000000000
--- a/scripts/filter_namespaces.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import argparse
-import re
-from lxml import etree
-import bz2
-import re
-
-from wikitextprocessor.dumpparser import decompress_dump_file
-
-KEEP_NAMESPACES = ["Module", "Template"]
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter pages from dump")
-    parser.add_argument(
-        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
-    )
-    parser.add_argument(
-        "--out_path", type=str, nargs="?", default=None, help="Out file"
-    )
-    args = parser.parse_args()
-
-    dump_path = args.dump_path
-    out_path = args.out_path
-
-    xml_declaration = ""
-    root_tag = ""
-
-    # Preserves the xml declaration and root tag
-    with bz2.open(dump_path, "rt") as f:
-        i = 0
-        for line in f:
-            if "<page" in line:
-                break
-            if not root_tag:
-                match = re.search(r"<(\w+)", line)
-                if match:
-                    root_tag = match.group(1)
-            xml_declaration += line
-
-    NAMESPACE_IDS = set([])
-
-    # Get the namespace ids of the namespaces we want to keep
-    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
-    namespaces = root.nsmap
-    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
-        if namespace.text in KEEP_NAMESPACES:
-            NAMESPACE_IDS.add(int(namespace.get("key")))
-
-    f.close()
-
-    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
-        output_file.write(xml_declaration)
-        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
-        page_nums = 0
-
-        # namespaces = {None: namespace_str}
-        for _, page_element in etree.iterparse(
-            p.stdout, tag=f"{{{namespace_str}}}page"
-        ):
-            page_nums += 1
-
-            namespace_id = int(page_element.findtext("ns", "0", namespaces))
-            if namespace_id not in NAMESPACE_IDS:
-                page_element.clear(keep_tail=True)
-                continue
-
-            output_file.write(etree.tostring(page_element).decode("utf-8"))
-
-            if page_nums % 10:
-                print(f"Processed {page_nums} pages")
-
-        output_file.write(f"</{root_tag}>")
diff --git a/scripts/filter_page.py b/scripts/filter_page.py
deleted file mode 100644
index 01ec07d24c899f8564414dfc357398ab745c5e33..0000000000000000000000000000000000000000
--- a/scripts/filter_page.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import argparse
-import re
-from lxml import etree
-import bz2
-import re
-
-from wikitextprocessor.dumpparser import decompress_dump_file
-
-KEEP_PAGES = ["example"]
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter pages from dump")
-    parser.add_argument(
-        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
-    )
-    parser.add_argument(
-        "--out_path", type=str, nargs="?", default=None, help="Out file"
-    )
-    args = parser.parse_args()
-
-    dump_path = args.dump_path
-    out_path = args.out_path
-
-    xml_declaration = ""
-    root_tag = ""
-
-    # Preserves the xml declaration and root tag
-    with bz2.open(dump_path, "rt") as f:
-        i = 0
-        for line in f:
-            if "<page" in line:
-                break
-            if not root_tag:
-                match = re.search(r"<(\w+)", line)
-                if match:
-                    root_tag = match.group(1)
-            xml_declaration += line
-
-    NAMESPACE_IDS = set([])
-
-    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
-    namespaces = {None: namespace_str}
-
-    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
-        output_file.write(xml_declaration)
-        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
-        page_nums = 0
-
-        # namespaces = {None: namespace_str}
-        for _, page_element in etree.iterparse(
-            p.stdout, tag=f"{{{namespace_str}}}page"
-        ):
-            page_nums += 1
-
-            title = page_element.findtext("title", "", namespaces)
-
-            if title in KEEP_PAGES:
-                output_file.write(etree.tostring(page_element).decode("utf-8"))
-                KEEP_PAGES.remove(title)
-
-            if not KEEP_PAGES:
-                break
-
-            if page_nums % 10:
-                print(f"Processed {page_nums} pages")
-
-        output_file.write(f"</{root_tag}>")