EWOK / Commits

Commit a087f053, authored 1 year ago by Empiriker

Remove obsolete scripts

parent 31b3a443
Showing 2 changed files with 0 additions and 138 deletions:

    scripts/filter_namespaces.py   +0 −71
    scripts/filter_page.py         +0 −67
scripts/filter_namespaces.py  deleted 100644 → 0  (+0 −71)
import argparse
import bz2
import re

from lxml import etree
from wikitextprocessor.dumpparser import decompress_dump_file

KEEP_NAMESPACES = ["Module", "Template"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()

    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the dump header (everything before the first <page>) and
    # remember the root tag so the output can be closed again at the end.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    # Get the namespace ids of the namespaces we want to keep by parsing
    # the header, closed with a synthetic root end tag, as a tiny document.
    NAMESPACE_IDS = set()
    root = etree.fromstring(xml_declaration + f"</{root_tag}>", etree.XMLParser())
    namespaces = root.nsmap
    for namespace in root.findall(".//namespaces/namespace", namespaces=namespaces):
        if namespace.text in KEEP_NAMESPACES:
            NAMESPACE_IDS.add(int(namespace.get("key")))

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
        page_nums = 0
        for _, page_element in etree.iterparse(p.stdout, tag=f"{{{namespace_str}}}page"):
            page_nums += 1
            namespace_id = int(page_element.findtext("ns", "0", namespaces))
            if namespace_id not in NAMESPACE_IDS:
                # Free pages from unwanted namespaces so memory stays bounded.
                page_element.clear(keep_tail=True)
                continue
            output_file.write(etree.tostring(page_element).decode("utf-8"))
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
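
The core pattern here, also used by filter_page.py below, is streaming the dump with lxml's etree.iterparse, selecting <page> elements by Clark notation ("{namespace-uri}tag") and clearing each element once handled so memory stays bounded on multi-gigabyte dumps. A minimal, self-contained sketch of that pattern, using a toy in-memory document rather than a real dump (all names and data below are illustrative assumptions):

import io

from lxml import etree

NS = "http://www.mediawiki.org/xml/export-0.10/"
# Illustrative stand-in for a real MediaWiki export, not part of the repo.
toy_dump = f"""<mediawiki xmlns="{NS}">
  <page><title>example</title><ns>0</ns></page>
  <page><title>Template:foo</title><ns>10</ns></page>
</mediawiki>""".encode("utf-8")

# Stream <page> elements without building the whole tree; lxml's iterparse
# accepts a tag filter in Clark notation ("{uri}tag").
for _, page in etree.iterparse(io.BytesIO(toy_dump), tag=f"{{{NS}}}page"):
    title = page.findtext(f"{{{NS}}}title", "")
    ns_id = int(page.findtext(f"{{{NS}}}ns", "0"))
    print(title, ns_id)  # prints "example 0", then "Template:foo 10"
    page.clear(keep_tail=True)  # free the subtree once it has been handled

Assuming the usual argparse conventions, the script would have been invoked along the lines of python scripts/filter_namespaces.py --dump_path dump.xml.bz2 --out_path filtered.xml.bz2 (file names hypothetical).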
scripts/filter_page.py  deleted 100644 → 0  (+0 −67)
import argparse
import bz2
import re

from lxml import etree
from wikitextprocessor.dumpparser import decompress_dump_file

KEEP_PAGES = ["example"]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter pages from dump")
    parser.add_argument(
        "--dump_path", type=str, nargs="?", default=None, help="Wiki dump file"
    )
    parser.add_argument(
        "--out_path", type=str, nargs="?", default=None, help="Out file"
    )
    args = parser.parse_args()

    dump_path = args.dump_path
    out_path = args.out_path

    xml_declaration = ""
    root_tag = ""

    # Preserve the dump header (everything before the first <page>) and
    # remember the root tag so the output can be closed again at the end.
    with bz2.open(dump_path, "rt") as f:
        for line in f:
            if "<page" in line:
                break
            if not root_tag:
                match = re.search(r"<(\w+)", line)
                if match:
                    root_tag = match.group(1)
            xml_declaration += line

    namespace_str = "http://www.mediawiki.org/xml/export-0.10/"
    namespaces = {None: namespace_str}

    with decompress_dump_file(dump_path) as p, bz2.open(out_path, "wt") as output_file:
        output_file.write(xml_declaration)
        page_nums = 0
        for _, page_element in etree.iterparse(p.stdout, tag=f"{{{namespace_str}}}page"):
            page_nums += 1
            title = page_element.findtext("title", "", namespaces)
            if title in KEEP_PAGES:
                output_file.write(etree.tostring(page_element).decode("utf-8"))
                KEEP_PAGES.remove(title)
                if not KEEP_PAGES:
                    break  # every requested page has been written
            else:
                # Free skipped pages so memory stays bounded on large dumps.
                page_element.clear(keep_tail=True)
            if page_nums % 10 == 0:
                print(f"Processed {page_nums} pages")
        output_file.write(f"</{root_tag}>")
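
For completeness, filter_page.py's distinguishing trick is the early exit: each found title is removed from the keep-list and the scan stops as soon as the list is empty. A sketch of that behavior on the same kind of toy input (again, names and data are illustrative assumptions):

import io

from lxml import etree

NS = "http://www.mediawiki.org/xml/export-0.10/"
toy_dump = f"""<mediawiki xmlns="{NS}">
  <page><title>example</title><ns>0</ns></page>
  <page><title>other</title><ns>0</ns></page>
</mediawiki>""".encode("utf-8")

wanted = ["example"]  # stands in for KEEP_PAGES
for _, page in etree.iterparse(io.BytesIO(toy_dump), tag=f"{{{NS}}}page"):
    title = page.findtext(f"{{{NS}}}title", "")
    if title in wanted:
        print(etree.tostring(page).decode("utf-8"))
        wanted.remove(title)
        if not wanted:
            break  # all requested pages found; skip the rest of the dump
    page.clear(keep_tail=True)  # free each subtree once it has been handled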