diff --git a/scripts/filter_namespaces.py b/scripts/filter_namespaces.py
new file mode 100644
index 0000000000000000000000000000000000000000..7175900702e284d489f3476af985850757088663
--- /dev/null
+++ b/scripts/filter_namespaces.py
@@ -0,0 +1,69 @@
+import argparse
+import bz2
+import re
+
+from lxml import etree
+
+from wikitextprocessor.dumpparser import decompress_dump_file
+
+# Pages in these namespaces are copied to the output dump; everything else is dropped.
+KEEP_NAMESPACES = ["Module", "Template"]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter pages from dump")
+    parser.add_argument(
+        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
+    )
+    parser.add_argument(
+        "--out_path", type=str, nargs="?", default=None, help="Out file"
+    )
+    args = parser.parse_args()
+
+    dump_path = args.dump_path
+    out_path = args.out_path
+
+    xml_declaration = ""
+    root_tag = ""
+
+    # Preserve the dump header (everything before the first <page>) and the root tag name.
+    with bz2.open(dump_path, 'rt') as f:
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
+
+    NAMESPACE_IDS = set()
+
+    # Parse the header to get the numeric ids of the namespaces we want to keep.
+    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
+    namespaces = root.nsmap
+    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
+        if namespace.text in KEEP_NAMESPACES:
+            NAMESPACE_IDS.add(int(namespace.get("key")))
+
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
+        output_file.write(xml_declaration)
+        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+        page_nums = 0
+
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
+        ):
+            page_nums += 1
+            if page_nums % 10 == 0:
+                print(f"Processed {page_nums} pages")
+
+            namespace_id = int(page_element.findtext("ns", "0", namespaces))
+            if namespace_id not in NAMESPACE_IDS:
+                page_element.clear(keep_tail=True)
+                continue
+
+            output_file.write(etree.tostring(page_element).decode('utf-8'))
+            # Free the element after writing it to keep memory usage flat.
+            page_element.clear(keep_tail=True)
+
+        output_file.write(f"</{root_tag}>")
diff --git a/scripts/filter_page.py b/scripts/filter_page.py
new file mode 100644
index 0000000000000000000000000000000000000000..95a0b5880bf044d82d2d1cc0df4ab72a140a859c
--- /dev/null
+++ b/scripts/filter_page.py
@@ -0,0 +1,65 @@
+import argparse
+import bz2
+import re
+
+from lxml import etree
+
+from wikitextprocessor.dumpparser import decompress_dump_file
+
+# Titles of the pages to copy to the output dump.
+KEEP_PAGES = ["example"]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter pages from dump")
+    parser.add_argument(
+        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
+    )
+    parser.add_argument(
+        "--out_path", type=str, nargs="?", default=None, help="Out file"
+    )
+    args = parser.parse_args()
+
+    dump_path = args.dump_path
+    out_path = args.out_path
+
+    xml_declaration = ""
+    root_tag = ""
+
+    # Preserve the dump header (everything before the first <page>) and the root tag name.
+    with bz2.open(dump_path, 'rt') as f:
+        for line in f:
+            if "<page" in line:
+                break
+            if not root_tag:
+                match = re.search(r"<(\w+)", line)
+                if match:
+                    root_tag = match.group(1)
+            xml_declaration += line
+
+    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
+    namespaces = {None: namespace_str}
+
+    with decompress_dump_file(dump_path) as p, bz2.open(out_path, 'wt') as output_file:
+        output_file.write(xml_declaration)
+        page_nums = 0
+
+        for _, page_element in etree.iterparse(
+            p.stdout, tag=f"{{{namespace_str}}}page"
+        ):
+            page_nums += 1
+            if page_nums % 10 == 0:
+                print(f"Processed {page_nums} pages")
+
+            title = page_element.findtext("title", "", namespaces)
+            if title in KEEP_PAGES:
+                output_file.write(etree.tostring(page_element).decode('utf-8'))
+                KEEP_PAGES.remove(title)
+
+            # Free the element to keep memory usage flat.
+            page_element.clear(keep_tail=True)
+
+            # Stop early once every requested page has been written.
+            if not KEEP_PAGES:
+                break
+
+        output_file.write(f"</{root_tag}>")