Commit a087f053 authored by Empiriker's avatar Empiriker

Remove obsolete scripts

parent 31b3a443

import argparse
import bz2
import re

from lxml import etree

from wikitextprocessor.dumpparser import decompress_dump_file

# MediaWiki namespaces whose pages are kept in the filtered dump.
KEEP_NAMESPACES = ["Module", "Template"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the XML declaration and header by copying every line that
    # precedes the first <page> element; also remember the root tag name.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    # Close the captured header into a parseable document and read the
    # numeric ids of the namespaces we want to keep.
    NAMESPACE_IDS = set()
    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
    namespaces = root.nsmap
    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
        if namespace.text in KEEP_NAMESPACES:
            NAMESPACE_IDS.add(int(namespace.get("key")))
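
    # For reference, the header parsed above contains a namespace table of
    # roughly this shape (standard MediaWiki export schema; keys vary by wiki):
    #   <siteinfo>
    #     <namespaces>
    #       <namespace key="10" case="first-letter">Template</namespace>
    #       <namespace key="828" case="first-letter">Module</namespace>
    #     </namespaces>
    #   </siteinfo>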

    # Stream the decompressed dump, copying only pages whose namespace id was
    # selected above; every element is cleared afterwards to bound memory use.
    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            namespace_id = int(page_element.findtext("ns", "0", namespaces))
            if namespace_id in NAMESPACE_IDS:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
            page_element.clear(keep_tail=True)
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
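
# Example invocation (the script and dump file names are hypothetical):
#   python filter_namespaces.py \
#       --dump_path dewiktionary-latest-pages-articles.xml.bz2 \
#       --out_path modules-and-templates.xml.bz2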


import argparse
import bz2
import re

from lxml import etree

from wikitextprocessor.dumpparser import decompress_dump_file

# Titles of the pages to extract from the dump.
KEEP_PAGES = ["example"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()
    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the XML declaration and header by copying every line that
    # precedes the first <page> element; also remember the root tag name.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    # Default namespace of the MediaWiki XML export schema.
    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
    namespaces = {None: namespace_str}

    # Stream the decompressed dump, copying only the pages listed in
    # KEEP_PAGES and stopping early once all of them have been found.
    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        page_nums = 0
        for _, page_element in etree.iterparse(
            p.stdout, tag=f"{{{namespace_str}}}page"
        ):
            page_nums += 1
            title = page_element.findtext("title", "", namespaces)
            if title in KEEP_PAGES:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
                KEEP_PAGES.remove(title)
                if not KEEP_PAGES:
                    break
            page_element.clear(keep_tail=True)
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
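
# Example invocation (the script and dump file names are hypothetical):
#   python extract_pages.py \
#       --dump_path dewiktionary-latest-pages-articles.xml.bz2 \
#       --out_path example-pages.xml.bz2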