diff --git a/scripts/filter_namespaces.py b/scripts/filter_namespaces.py
new file mode 100644
index 0000000000000000000000000000000000000000..7175900702e284d489f3476af985850757088663
--- /dev/null
+++ b/scripts/filter_namespaces.py
@@ -0,0 +1,69 @@
+import argparse
+import bz2
+import re
+
+from lxml import etree
+
+from wikitextprocessor.dumpparser import decompress_dump_file
+
+# Pages in these namespaces are copied to the output dump; everything else is dropped.
+KEEP_NAMESPACES = ["Module", "Template"]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter pages from dump")
+    parser.add_argument(
+        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
+    )
+    parser.add_argument(
+        "--out_path", type=str, nargs="?", default=None, help="Out file"
+    )
+    args = parser.parse_args()
+
+    dump_path = args.dump_path
+    out_path = args.out_path
+
+    xml_declaration = ""
+    root_tag = ""
+
+    # Preserve the dump header (everything before the first <page>) and the root tag name.
+    with bz2.open(dump_path, 'rt') as f:
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
+
+    NAMESPACE_IDS = set()
+
+    # Parse the header to get the numeric ids of the namespaces we want to keep.
+    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
+    namespaces = root.nsmap
+    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
+        if namespace.text in KEEP_NAMESPACES:
+            NAMESPACE_IDS.add(int(namespace.get("key")))
+
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
+        output_file.write(xml_declaration)
+        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+        page_nums = 0
+
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
+        ):
+            page_nums += 1
+            if page_nums % 10 == 0:
+                print(f"Processed {page_nums} pages")
+
+            namespace_id = int(page_element.findtext("ns", "0", namespaces))
+            if namespace_id not in NAMESPACE_IDS:
+                page_element.clear(keep_tail=True)
+                continue
+
+            output_file.write(etree.tostring(page_element).decode('utf-8'))
+            # Free the element after writing it to keep memory usage flat.
+            page_element.clear(keep_tail=True)
+
+        output_file.write(f"</{root_tag}>")
diff --git a/scripts/filter_page.py b/scripts/filter_page.py
new file mode 100644
index 0000000000000000000000000000000000000000..95a0b5880bf044d82d2d1cc0df4ab72a140a859c
--- /dev/null
+++ b/scripts/filter_page.py
@@ -0,0 +1,65 @@
+import argparse
+import bz2
+import re
+
+from lxml import etree
+
+from wikitextprocessor.dumpparser import decompress_dump_file
+
+# Titles of the pages to copy to the output dump.
+KEEP_PAGES = ["example"]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter pages from dump")
+    parser.add_argument(
+        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
+    )
+    parser.add_argument(
+        "--out_path", type=str, nargs="?", default=None, help="Out file"
+    )
+    args = parser.parse_args()
+
+    dump_path = args.dump_path
+    out_path = args.out_path
+
+    xml_declaration = ""
+    root_tag = ""
+
+    # Preserve the dump header (everything before the first <page>) and the root tag name.
+    with bz2.open(dump_path, 'rt') as f:
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
+
+    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+    namespaces = {None: namespace_str}
+
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
+        output_file.write(xml_declaration)
+        page_nums = 0
+
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
+        ):
+            page_nums += 1
+            if page_nums % 10 == 0:
+                print(f"Processed {page_nums} pages")
+
+            title = page_element.findtext("title", "", namespaces)
+            if title in KEEP_PAGES:
+                output_file.write(etree.tostring(page_element).decode('utf-8'))
+                KEEP_PAGES.remove(title)
+
+            # Free the element to keep memory usage flat.
+            page_element.clear(keep_tail=True)
+
+            # Stop early once every requested page has been written.
+            if not KEEP_PAGES:
+                break
+
+        output_file.write(f"</{root_tag}>")