Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
wikstraktor
Manage
Activity
Members
Labels
Plan
Issues
3
Issue boards
Milestones
Wiki
External wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lex gaMe
wikstraktor
Commits
80d83b85
Commit
80d83b85
authored
2 years ago
by
Mathieu Loiseau
Browse files
Options
Downloads
Patches
Plain Diff
On se rapproche
parent
e1bf5ca3
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
wikstraktor.py
+139
-55
139 additions, 55 deletions
wikstraktor.py
with
139 additions
and
55 deletions
wikstraktor.py
+
139
−
55
View file @
80d83b85
...
...
@@ -5,6 +5,7 @@ import importlib
import
json
from
wikstraktor_version
import
version
as
the_version
from
wikstraklog
import
Wikstraklog
import
re
def
get_list_string_level
(
wikitext
):
list_chars
=
{
"
*
"
,
"
#
"
,
"
:
"
}
...
...
@@ -35,10 +36,26 @@ class SubInfo:
self
.
__class__
.
inc_n_id
()
return
self
.
id
def replace_src_in_id(self, former_src, new_src):
    """Replace the source index embedded in ``self.id``.

    The id is expected to start with a run of word characters and dots,
    followed by ``-`` and the source index. Only an index that is exactly
    ``former_src`` is rewritten (``1`` no longer matches the start of ``12``).

    Args:
        former_src: source index currently present in the id.
        new_src: source index to write instead.

    Returns:
        The id after the attempted substitution (unchanged when the pattern
        does not match), or None when the id or either source is None.
    """
    # Caution: if ids are ever cleaned up so that a source appears
    # everywhere, this will have to change.
    if self.id is None or former_src is None or new_src is None:
        return None
    # re.escape keeps the source literal; (?!\d) stops a shorter number from
    # matching the leading digits of a longer one.
    pattern = r'^([\w\.]+)-' + re.escape(str(former_src)) + r'(?!\d)'
    # A callable replacement keeps new_src literal (no backslash escapes).
    self.id = re.sub(pattern, lambda m: f"{m.group(1)}-{new_src}", self.id)
    return self.id
def get_src_from_id(self):
    """Extract the source index embedded in ``self.id``.

    The id is expected to start with a run of word characters and dots,
    followed by ``-`` and the decimal source index.

    Returns:
        The source index as an int, or None when there is no id or the id
        does not follow that layout.
    """
    if self.id is None:
        return None
    # \d+ rather than \d{1,2}: an index of three or more digits must not be
    # silently truncated to its first two digits.
    found = re.match(r'^[\w\.]+-(\d+)', self.id)
    return int(found.group(1)) if found else None
def
serializable
(
self
,
prefix
=
None
):
res
=
{}
if
self
.
set_id
(
prefix
)
!=
None
:
res
[
"
id
"
]
=
self
.
id
if
prefix
!=
None
:
res
[
"
id
"
]
=
self
.
set_id
(
prefix
)
return
res
...
...
@@ -83,10 +100,10 @@ class Pronunciation(SubInfo):
self
.
sounds
.
append
(
Sound
(
url
,
accent
))
def
serializable
(
self
,
prefix
=
None
):
res
=
super
().
serializable
(
prefix
)
snds
=
[]
for
s
in
self
.
sounds
:
snds
.
append
(
s
.
serializable
())
res
=
super
().
serializable
(
prefix
)
res
[
'
transcript
'
]
=
self
.
ipa
if
self
.
has_accents
():
res
[
'
accents
'
]
=
list
(
self
.
accents
)
...
...
@@ -166,12 +183,12 @@ class Example(SubInfo):
return
res
class
Sense
(
SubInfo
):
prfx
=
""
prfx
=
"
s
"
def
__init__
(
self
,
lang
=
None
,
definition
=
None
,
wiki_lang
=
None
,
prefix
=
None
):
print
(
prefix
)
##
super
().
__init__
(
prefix
)
self
.
lang
=
lang
self
.
label
=
None
self
.
set_id
(
prefix
)
#On réinitialise les identifiants des sous-éléments
if
not
isinstance
(
self
,
SubSense
):
Definition
.
reset
()
...
...
@@ -217,14 +234,8 @@ class Sense(SubInfo):
def metadata_exists(self, key):
    """Return True when ``key`` is one of the keys of ``self.metadata``."""
    # Membership on the mapping itself is equivalent to going through .keys().
    return key in self.metadata
def
set_id
(
self
,
prefix
=
None
):
if
prefix
!=
None
and
self
.
label
==
None
:
self
.
label
=
f
"
{
prefix
}
_
{
self
.
__class__
.
next_id
}
"
#l'identifiant du sens
self
.
__class__
.
inc_n_id
()
return
self
.
label
def
get_id
(
self
):
return
f
"
{
self
.
lang
}
.
{
self
.
label
}
"
return
self
.
id
def
set_domain
(
self
,
d
):
self
.
domain
=
d
...
...
@@ -235,14 +246,15 @@ class Sense(SubInfo):
else
:
theDef
=
Definition
(
lang
,
definition
)
if
theDef
!=
None
and
theDef
not
in
self
.
definitions
:
theDef
.
set_id
(
self
.
set_id
())
print
(
"
def set id
"
,
self
.
get_id
())
##
theDef
.
set_id
(
self
.
get_id
())
self
.
definitions
.
append
(
theDef
)
def
add_example
(
self
,
transcript
,
src
=
None
,
url
=
None
,
prefix
=
None
):
try
:
theEx
=
Example
(
transcript
,
src
,
url
,
prefix
)
if
theEx
!=
None
and
theEx
not
in
self
.
examples
:
theEx
.
set_id
(
self
.
s
et_id
())
theEx
.
set_id
(
self
.
g
et_id
())
self
.
examples
.
append
(
theEx
)
except
ValueError
as
e
:
print
(
f
"
Skipped empty example
"
)
...
...
@@ -250,17 +262,17 @@ class Sense(SubInfo):
def
add_translation
(
self
,
lang
=
None
,
translation
=
None
):
theTranslation
=
Translation
(
lang
,
translation
)
if
theTranslation
!=
None
and
theTranslation
not
in
self
.
translations
:
theTranslation
.
set_id
(
self
.
s
et_id
())
theTranslation
.
set_id
(
self
.
g
et_id
())
self
.
translations
.
append
(
theTranslation
)
def
add_subsense
(
self
,
subsense
):
if
self
.
label
!=
None
:
if
self
.
id
!=
None
:
subsense
.
set_id
(
self
.
set_id
())
if
subsense
not
in
self
.
subsenses
:
self
.
subsenses
.
append
(
subsense
)
def
__eq__
(
self
,
other
):
res
=
isinstance
(
other
,
self
.
__class__
)
and
self
.
label
==
other
.
label
and
len
(
self
.
definitions
)
==
len
(
other
.
definitions
)
and
len
(
self
.
examples
)
==
len
(
other
.
examples
)
and
len
(
self
.
translations
)
==
len
(
other
.
translations
)
and
self
.
domain
==
other
.
domain
and
len
(
other
.
metadata
)
==
len
(
self
.
metadata
)
and
other
.
regions
==
self
.
regions
res
=
isinstance
(
other
,
self
.
__class__
)
and
self
.
id
==
other
.
id
and
len
(
self
.
definitions
)
==
len
(
other
.
definitions
)
and
len
(
self
.
examples
)
==
len
(
other
.
examples
)
and
len
(
self
.
translations
)
==
len
(
other
.
translations
)
and
self
.
domain
==
other
.
domain
and
len
(
other
.
metadata
)
==
len
(
self
.
metadata
)
and
other
.
regions
==
self
.
regions
i
=
0
while
res
and
i
<
len
(
self
.
examples
):
res
=
self
.
examples
[
i
]
in
other
.
examples
...
...
@@ -289,8 +301,10 @@ class Sense(SubInfo):
i
+=
1
return
res
def
serializable
(
self
,
prefix
=
Non
e
):
def
serializable
(
self
,
id
=
Tru
e
):
res
=
{}
if
id
:
prefix
=
self
.
get_id
()
if
self
.
domain
!=
None
:
res
[
"
Domain
"
]
=
self
.
domain
if
len
(
self
.
regions
)
>
0
:
...
...
@@ -312,7 +326,7 @@ class Sense(SubInfo):
if
len
(
self
.
subsenses
)
>
0
:
res
[
"
Subsenses
"
]
=
{}
for
t
in
self
.
subsenses
:
res
[
"
Subsenses
"
][
t
.
set_id
(
self
.
label
)]
=
t
.
serializable
(
prefix
)
res
[
"
Subsenses
"
][
t
.
set_id
(
self
.
id
)]
=
t
.
serializable
(
prefix
)
return
res
def
__str__
(
self
):
...
...
@@ -320,10 +334,10 @@ class Sense(SubInfo):
class
SubSense
(
Sense
):
def
set_id
(
self
,
prefix
=
None
):
if
prefix
!=
None
and
self
.
label
==
None
:
self
.
label
=
f
"
{
prefix
}
.
{
self
.
__class__
.
next_id
}
"
#l'identifiant du sens
if
prefix
!=
None
and
self
.
id
==
None
:
self
.
id
=
f
"
{
prefix
}
.
{
self
.
__class__
.
next_id
}
"
#l'identifiant du sens
self
.
__class__
.
inc_n_id
()
return
self
.
label
return
self
.
id
class
Entry
:
#version_id : l'identifiant unique de la version de la page du wiktionnaire (pywikibot.Page.latest_revision_id)
...
...
@@ -332,8 +346,7 @@ class Entry:
self
.
lang
=
lang
#Si un jour on mixe +ieurs données de plusieurs wiktionnaires, ce sera utile
self
.
sources
=
[]
self
.
sources
.
append
({
"
wiktionary_language
"
:
wiki_lang
,
"
permanentId
"
:
version_id
,
"
wikstraktor_version
"
:
wkskt_version
})
self
.
current_source
=
0
self
.
add_source
(
wiki_lang
,
version_id
,
wkskt_version
)
self
.
pronunciations
=
[]
self
.
pos
=
None
self
.
senses
=
[]
...
...
@@ -343,13 +356,24 @@ class Entry:
def set_pos(self, pos):
    """Store ``pos`` as this entry's part of speech."""
    self.pos = pos
def
get_id
(
self
,
source_id
=
0
):
#TODO: remplacer un jour le source id par la bonne source
def add_source(self, wiki_lang, version_id, wkskt_version):
    """Append a source record and make it the current source.

    Args:
        wiki_lang: stored under the "wiktionary_language" key.
        version_id: stored under the "permanentId" key.
        wkskt_version: stored under the "wikstraktor_version" key.
    """
    record = {
        "wiktionary_language": wiki_lang,
        "permanentId": version_id,
        "wikstraktor_version": wkskt_version,
    }
    self.sources.append(record)
    # The record was appended last, so its index is the last position.
    self.current_source = len(self.sources) - 1
def set_current_source(self, src):
    """Store ``src`` as the current source of this entry."""
    self.current_source = src
def
get_prefix
(
self
,
source_id
=-
1
):
if
self
.
pos
!=
None
:
pos
=
self
.
pos
pos
=
"
.
"
+
self
.
pos
else
:
pos
=
""
return
f
"
{
self
.
lang
}
-
{
source_id
}
.
{
self
.
lemma
}{
pos
}
"
if
source_id
==
-
1
:
source_id
=
self
.
current_source
return
f
"
{
self
.
lang
}
.
{
self
.
lemma
}{
pos
}
-
{
source_id
}
"
def get_id(self):
    """Return the entry identifier built from language, lemma and part of speech."""
    lang, lemma, pos = self.lang, self.lemma, self.pos
    return f"{lang}.{lemma}.{pos}"
def
set_pronunciations
(
self
,
pron
):
if
isinstance
(
pron
,
Pronunciation
):
...
...
@@ -365,7 +389,7 @@ class Entry:
def
add_pronunciation
(
self
,
p
):
if
p
not
in
self
.
pronunciations
:
p
.
set_id
(
self
.
get_
id
())
p
.
set_id
(
self
.
get_
prefix
())
self
.
pronunciations
.
append
(
p
)
def
set_senses
(
self
,
senses
):
...
...
@@ -377,15 +401,62 @@ class Entry:
def
add_sense
(
self
,
s
):
if
s
not
in
self
.
senses
:
s
.
set_id
(
self
.
get_
id
())
s
.
set_id
(
self
.
get_
prefix
())
self
.
senses
.
append
(
s
)
def is_valid(self):
    """Return True when the entry has a lemma, a part of speech and at least one sense."""
    # Pronunciations are deliberately not required: an entry must be usable
    # without pronunciations.
    return self.lemma is not None and self.pos is not None and len(self.senses) > 0
def same(self, other):
    """Tell whether ``other`` denotes the same entry as this one.

    Two entries are the same when ``other`` is an instance of this object's
    class and both share lemma, language and part of speech.
    """
    if not isinstance(other, self.__class__):
        return False
    return (
        self.lemma == other.lemma
        and self.lang == other.lang
        and self.pos == other.pos
    )
def merge(self, other):
    """Merge the sources, pronunciations and senses of ``other`` into this entry.

    Sources of ``other`` that this entry does not know yet are added with
    ``add_source``. Ids of the merged pronunciations and senses (and of the
    senses' sub-senses, definitions, examples and translations) are then
    rewritten so that their source index points into ``self.sources``.

    Merging an entry that compares equal to this one is a no-op.

    Raises:
        TypeError: ``other`` is not an instance of this entry's class.
        ValueError: ``other`` is not the same entry (see ``same``).
    """
    if not isinstance(other, self.__class__):
        raise TypeError(f"Entry.merge() error : {other.__class__} object cannot be merged with Entry")
    if not self.same(other):
        raise ValueError(f"Entry.merge() error : {self.get_id()} cannot be merged with {other.get_id()}")
    if self == other:
        # Nothing new to bring in.
        return

    def remap_all(items, old_src, new_src):
        # Rewrite the source index in the id of every item.
        for item in items:
            item.replace_src_in_id(old_src, new_src)

    # Highest source index this entry had before the merge started.
    max_id = len(self.sources) - 1
    # src_map[i] is the index in self.sources of other.sources[i].
    src_map = []
    for source in other.sources:
        if source in self.sources:
            src_map.append(self.sources.index(source))
        else:
            self.add_source(
                source["wiktionary_language"],
                source["permanentId"],
                source["wikstraktor_version"],
            )
            src_map.append(self.current_source)

    for p in other.pronunciations:
        src = p.get_src_from_id()
        # The max_id test is there because the same object can be referenced
        # from several places and may already have been rewritten.
        if src is not None and src <= max_id and src_map[src] != src:
            p.replace_src_in_id(src, src_map[src])
        self.add_pronunciation(p)

    for s in other.senses:
        src = s.get_src_from_id()
        # NOTE(review): unlike pronunciations, src is not bounded here, so an
        # index outside src_map raises IndexError. Confirm whether intended.
        if src is not None and src_map[src] != src:
            new_src = src_map[src]
            s.replace_src_in_id(src, new_src)
            for ss in s.subsenses:
                ss.replace_src_in_id(src, new_src)
                remap_all(ss.definitions, src, new_src)
                remap_all(ss.examples, src, new_src)
                remap_all(ss.translations, src, new_src)
            remap_all(s.definitions, src, new_src)
            remap_all(s.examples, src, new_src)
            remap_all(s.translations, src, new_src)
        self.add_sense(s)
def
__eq__
(
self
,
other
):
res
=
isinstance
(
other
,
self
.
__class__
)
and
self
.
lemma
==
other
.
lemma
and
self
.
lang
==
other
.
lang
and
self
.
pos
==
other
.
pos
and
len
(
self
.
pronunciations
)
==
len
(
other
.
pronunciations
)
and
len
(
self
.
senses
)
==
len
(
other
.
senses
)
res
=
self
.
same
(
other
)
and
len
(
self
.
pronunciations
)
==
len
(
other
.
pronunciations
)
and
len
(
self
.
senses
)
==
len
(
other
.
senses
)
i
=
0
while
res
and
i
<
len
(
self
.
senses
):
res
=
self
.
senses
[
i
]
==
other
.
senses
[
i
]
...
...
@@ -400,14 +471,14 @@ class Entry:
res
=
{}
res
[
'
sources
'
]
=
self
.
sources
if
id
:
id
=
self
.
get_
id
()
res
[
'
id
'
]
=
id
prefix
=
self
.
get_
prefix
()
res
[
'
id
'
]
=
self
.
get_id
()
else
:
id
==
None
prefix
==
None
res
[
self
.
lemma
]
=
{
"
pos
"
:
self
.
pos
}
res
[
self
.
lemma
][
"
pronunciations
"
]
=
[]
for
p
in
self
.
pronunciations
:
res
[
self
.
lemma
][
"
pronunciations
"
].
append
(
p
.
serializable
(
id
))
res
[
self
.
lemma
][
"
pronunciations
"
].
append
(
p
.
serializable
(
prefix
))
res
[
self
.
lemma
][
"
senses
"
]
=
{}
for
s
in
self
.
senses
:
res
[
self
.
lemma
][
"
senses
"
][
s
.
get_id
()]
=
s
.
serializable
(
id
)
...
...
@@ -522,10 +593,10 @@ class ParserContext:
class
Wikstraktor
:
@classmethod
def
get_instance
(
cls
,
wiki_language
,
entry_language
,
existing_entries
=
None
):
def
get_instance
(
cls
,
wiki_language
,
entry_language
):
try
:
m_name
=
f
"
{
wiki_language
}
_
{
entry_language
}
"
.
capitalize
()
instance
=
getattr
(
importlib
.
import_module
(
f
"
parsers.
{
m_name
.
lower
()
}
"
),
f
"
{
m_name
}
_straktor
"
)(
existing_entries
)
instance
=
getattr
(
importlib
.
import_module
(
f
"
parsers.
{
m_name
.
lower
()
}
"
),
f
"
{
m_name
}
_straktor
"
)()
instance
.
version
=
the_version
instance
.
log
=
Wikstraklog
(
the_version
,
entry_language
,
wiki_language
)
except
ModuleNotFoundError
:
...
...
@@ -533,11 +604,8 @@ class Wikstraktor:
instance
=
None
return
instance
def
__init__
(
self
,
existing_entries
=
None
):
if
existing_entries
=
None
:
self
.
entries
=
[]
else
:
self
.
entries
=
existing_entries
def
__init__
(
self
):
self
.
entries
=
{}
self
.
pwb
=
pywikibot
self
.
wtp
=
wikitextparser
self
.
parserContext
=
None
...
...
@@ -551,6 +619,13 @@ class Wikstraktor:
print
(
f
"
{
file_page_name
}
does not exist in
{
self
.
site
}
.
"
)
return
res
def add_entry(self, e):
    """Register entry ``e`` under its id, merging with an existing entry if needed.

    When no entry is stored under ``e.get_id()``, ``e`` is stored as is.
    Otherwise ``e`` is merged into the stored entry, unless both compare
    equal, in which case nothing happens.
    """
    key = e.get_id()  # computed once instead of on every lookup
    if key not in self.entries:
        self.entries[key] = e
        return
    known = self.entries[key]
    if e != known:
        known.merge(e)
#retrieves the content of a page and processes it (adding the entries to the list of entries)
#returns the number of entries added
def
fetch
(
self
,
graphy
):
...
...
@@ -603,7 +678,7 @@ class Wikstraktor:
res
=
len
(
self
.
parserContext
.
entries
)
if
res
>
0
:
for
e
in
self
.
parserContext
.
entries
:
self
.
entries
.
append
(
e
)
self
.
add_entry
(
e
)
return
res
def
isPro
(
self
,
title
):
...
...
@@ -722,12 +797,23 @@ class Wikstraktor:
print
(
"
Skipped empty definition
"
)
return
senses
def __add__(self, other):
    """Fold the entries of another Wikstraktor into this one and return ``self``.

    Entries whose key is unknown here are stored as is. Entries whose key
    already exists are merged into the stored entry, unless both compare
    equal: Entry.merge() raises for equal entries, so they are skipped, as
    add_entry() does.

    Note that this modifies ``self`` in place rather than building a new object.

    Raises:
        TypeError: ``other`` is not a Wikstraktor.
    """
    if not isinstance(other, Wikstraktor):
        raise TypeError(f"Wikstraktor '+' : {other.__class__} object cannot be added to {self.__class__}")
    for k, e in other.entries.items():
        if k not in self.entries:
            self.entries[k] = e
        elif e != self.entries[k]:
            self.entries[k].merge(e)
    return self
def
__str__
(
self
):
return
self
.
export
()
def
serialize
(
self
,
id
=
True
):
res
=
[]
for
e
in
self
.
entries
:
for
e
in
self
.
entries
.
values
()
:
res
.
append
(
e
.
serializable
(
id
))
return
res
...
...
@@ -737,12 +823,6 @@ class Wikstraktor:
else
:
return
json
.
dumps
(
self
.
serialize
(
id
),
ensure_ascii
=
ascii
,
indent
=
4
)
def
export_multi_wikt
(
serialized
,
ascii
=
False
,
compact
=
False
):
if
compact
:
return
json
.
dumps
(
serialized
,
ensure_ascii
=
ascii
)
else
:
return
json
.
dumps
(
serialized
,
ensure_ascii
=
ascii
,
indent
=
4
)
if
__name__
==
"
__main__
"
:
import
argparse
from
argparse
import
RawTextHelpFormatter
#pour le formattage de l'aide
...
...
@@ -762,14 +842,18 @@ if __name__ == "__main__":
wiki_languages
=
args
.
wiki_language
.
split
(
"
+
"
)
languages
=
args
.
language
.
split
(
"
+
"
)
if
args
.
mot
!=
None
:
resp
=
[]
resp
=
None
for
w_l
in
wiki_languages
:
for
l
in
languages
:
w
=
Wikstraktor
.
get_instance
(
w_l
,
l
)
if
w
.
fetch
(
args
.
mot
)
>
0
:
resp
+=
w
.
serialize
(
not
args
.
no_id
)
if
len
(
resp
)
>
0
:
resp
=
export_multi_wikt
(
resp
,
args
.
force_ascii
,
args
.
compact
)
if
resp
==
None
:
resp
=
w
else
:
resp
+=
w
print
(
resp
!=
None
)
if
resp
!=
None
:
resp
=
resp
.
export
(
not
args
.
no_id
,
args
.
force_ascii
,
args
.
compact
)
if
args
.
destination_file
!=
None
:
f
=
open
(
args
.
destination_file
,
"
w
"
)
f
.
write
(
resp
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment