Commit 58f04ebd authored by Alice Brenon

Import a collection of existing scripts

Makefile 0 → 100644
CORPUS_ROOT=/home/alice/Dehors/Corpus
EDDA=$(CORPUS_ROOT)/EDdA
LGE=$(CORPUS_ROOT)/LGE

EDDA_META=$(EDDA)/metadata.csv
METADATA=$(EDDA_META)

RAW_LGE=$(LGE)/Text
TEI_LGE=$(LGE)/TEI

PARALLEL_LGE=$(LGE)/Parallel
LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza

all: $(METADATA) $(STANZA_PARALLEL_LGE)

$(EDDA_META): $(EDDA)/TEI/
	./scripts/EDdA-metadata.py $< > $@

$(LGE_META_FROM_EDDA): $(EDDA_META)
	mkdir -p $(dir $@)
	./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@

$(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
	./scripts/extract_classified_LGE.sh $^ $@

$(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
	./scripts/extract_classified_LGE.sh $^ $@

$(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
	./scripts/stanza-annotator.py $< $@
(use-modules ((gnu packages base) #:select (coreutils))
             ((geode packages annotation) #:select (python-stanza))
             ((geode packages models) #:select (stanza-fr))
             ((gnu packages python) #:select (python))
             ((gnu packages python-xyz) #:select (python-beautifulsoup4))
             ((gnu packages xml) #:select (python-lxml)))

(packages->manifest
  (list
    coreutils             ; mktemp for atomic processing, strip CSV headers, general scripting
    python                ; scripts
    python-beautifulsoup4 ; extract EDdA metadata from TEI files
    python-lxml           ; merge articles into tomes for TXM
    python-stanza         ; annotation
    stanza-fr             ; annotation
    ))
#!/usr/bin/env python3
import csv
from bs4 import BeautifulSoup
import os
import sys

header = ["T", "article", "head", "domain"]

def getAttribute(article, attribute):
    result = article.find(type=attribute)
    return result.get('value') if result else ''

def main(rootDirectory):
    output = csv.writer(sys.stdout)
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        for rank in range(1, len(os.listdir(path))+1):
            root = BeautifulSoup(open(f"{path}/article{rank}.tei"))
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                normclass if normclass != 'unclassified' else getAttribute(root, "generatedclass")
            ])

if __name__ == '__main__':
    main(sys.argv[1])
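
For reference, a minimal sketch of what getAttribute extracts from an article: it looks for an element carrying a given type attribute and returns its value attribute. The one-element fragment below is hypothetical, shaped after the queries the script performs.

from bs4 import BeautifulSoup

# Hypothetical TEI-like fragment with the attribute layout the script expects.
sample = BeautifulSoup('<index type="normclass" value="Géographie"/>', features="xml")
print(sample.find(type="normclass").get('value'))  # -> Géographie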
#!/usr/bin/env python3
import csv
import sys

def EDdARow(columns):
    return {
        'tome': columns[0],
        'article': columns[1],
        'head': columns[2],
        'domain': columns[3]
    }

def LGERow(tome):
    return lambda columns: {
        'id': columns[0],
        'tome': tome,
        'rank': columns[1],
        'head': columns[2]
    }

def unserializeMetadata(path, rowReader):
    with open(path) as f:
        inputFile = csv.reader(f)
        header = True
        for row in inputFile:
            if header:
                header = False
            else:
                yield rowReader(row)

def concat(generators):
    for g in generators:
        for x in g:
            yield x
def naiveIndexBy(field, elements):
    # Index elements by a field; ambiguous keys are mapped to None.
    d = {}
    for e in elements:
        key = e[field]
        if key in d:
            d[key] = None
        else:
            d[key] = e
    return d

def growPrefixes(d, keys, maxLength):
    # Add the prefixes of the existing keys to the index, longest first;
    # prefixes that become ambiguous are recorded as None and not shortened further.
    for length in range(maxLength, 1, -1):
        newGeneration = {}
        for key in keys:
            if len(key) == length:
                newKey = key[:-1]
                if newKey in newGeneration or newKey in d:
                    newGeneration[newKey] = None
                else:
                    newGeneration[newKey] = d[key]
        for key, value in newGeneration.items():
            if value is not None:
                d[key] = value
                keys.add(key)
            elif key not in d:
                d[key] = value

def indexBy(field, elements, prefix=True):
    d = {}
    for e in elements:
        key = e[field]
        if key in d:
            d[key] = None
        else:
            d[key] = e
    if prefix:
        keys = set(d.keys())
        growPrefixes(d, keys, max(map(len, keys)))
    return d
def headWords(head):
    # Keep only the upper-case words of a (possibly multi-word) headword.
    words = head.split()
    if len(words) == 1:
        return words
    else:
        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]

def identify(head, haystack):
    # Look a headword up in the index, falling back to ever shorter prefixes.
    if head in haystack:
        if haystack[head] is not None:
            return {'type': 'exact', 'match': head, 'found': haystack[head]}
        else:
            return None
    else:
        prefix = head[:-1]
        while len(prefix) > 0 and prefix not in haystack:
            prefix = prefix[:-1]
        if prefix in haystack and haystack[prefix] is not None:
            return {
                'type': 'prefix',
                'match': head,
                'found': haystack[prefix],
                'precision': len(prefix) / len(head)
            }
        else:
            return None

def naiveGetArrows(source, target):
    # Pair up articles whose headwords match exactly and unambiguously.
    indexedSource = naiveIndexBy('head', source)
    indexedTarget = naiveIndexBy('head', target)
    for head, article in indexedSource.items():
        if article is not None and head in indexedTarget and indexedTarget[head] is not None:
            yield {
                'source': article,
                'target': indexedTarget[head]
            }

def getArrows(source, target):
    for article in source:
        heads = headWords(article['head'])
        identified = map(lambda w: identify(w, target), heads)
        entries = [e for e in identified if e is not None]
        if len(entries) == 1:
            yield {
                'type': 'match',
                'source': article,
                'target': entries[0]
            }
        elif len(entries) > 1:
            yield {
                'type': 'ambiguity',
                'source': article,
                'target': entries
            }

def interesting(arrow):
    if arrow['type'] == 'match':
        target = arrow['target']
        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
#gold = [a for a in arrows if interesting(a)]
def getMetadata(arrows, path=None):
    output = sys.stdout if path is None else open(path, 'w')
    toCsv = csv.writer(output)
    toCsv.writerow(['id', 'tome', 'rank', 'head', 'domain'])
    for arrow in arrows:
        toCsv.writerow([
            arrow['target']['id'],
            arrow['target']['tome'],
            arrow['target']['rank'],
            arrow['target']['head'],
            arrow['source']['domain']
        ])

if __name__ == '__main__':
    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T)) for T in range(1, 32)
    ]))
    getMetadata(list(naiveGetArrows(edda, lge)), sys.argv[3])
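
A minimal, self-contained sketch of the prefix fallback performed by identify above, reduced to the lookup step only; the haystack and headwords here are hypothetical.

def prefix_lookup(head, haystack):
    # Walk shorter and shorter prefixes of the head until one is indexed.
    prefix = head
    while prefix and prefix not in haystack:
        prefix = prefix[:-1]
    return haystack.get(prefix) if prefix else None

haystack = {'ABEILLE': 'article 1-42'}
print(prefix_lookup('ABEILLES', haystack))  # -> 'article 1-42', matched on the prefix 'ABEILLE'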
#!/bin/sh
INPUT_DIR="${1%/}"
TOME_NUMBER="${INPUT_DIR##*/T}"
TOME="T${TOME_NUMBER}"
OUTPUT="${2%/}/${TOME}.xml"
cat <<EOF > "${OUTPUT}"
<teiCorpus version="3.3.0" xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>
          La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres et
          des arts par une société de savants et de gens de lettres. Tome ${TOME_NUMBER}.
        </title>
        <respStmt>
          <resp>Digitized by</resp>
          <orgName>
            Bibliothèque Nationale de France
          </orgName>
          <resp>Encoded by</resp>
          <orgName>
            ICAR
          </orgName>
        </respStmt>
      </titleStmt>
      <publicationStmt>
        <distributor>
          <orgName>
            Project GÉODE
          </orgName>
          <address>
            <addrLine>
              ICAR UMR 5191
            </addrLine>
            <addrLine>
              ENS de Lyon
            </addrLine>
          </address>
        </distributor>
      </publicationStmt>
      <sourceDesc>
        <bibl>
          <title>
            La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres et
            des arts par une société de savants et de gens de lettres. Tome ${TOME_NUMBER}.
          </title>
          <author>
            Collective
          </author>
          <creation>
            <date>
              1885
            </date>
          </creation>
          <imprint>
            <date>
              1885
            </date>
            <publisher>
              H. Lamirault et Cie,
            </publisher>
            <pubPlace>
              Paris
            </pubPlace>
          </imprint>
          <biblScope unit="volume">${TOME_NUMBER}</biblScope>
        </bibl>
      </sourceDesc>
    </fileDesc>
  </teiHeader>
EOF
find "${INPUT_DIR}" -type f -name "*.tei" | python3 "${0%/*}/reimport.py" >> "${OUTPUT}"
cat <<EOF >> "${OUTPUT}"
</teiCorpus>
EOF
#!/bin/bash
INPUT_METADATA="${1}"
SOURCE_TEXT_ARTICLES="${2}"
OUTPUT="${3}"

# Move any previous output aside under a numbered suffix instead of overwriting it.
if [ -d "${OUTPUT}" ]
then
    N=1
    while [ -d "${OUTPUT}.${N}" ]
    do
        N=$((N+1))
    done
    mv "${OUTPUT}" "${OUTPUT}.${N}"
fi

# Work in a temporary directory so the output directory appears atomically at the end.
WORKDIR=$(mktemp -d /tmp/classified-LGE.XXX)
for T in {1..31}
do
    mkdir -p "${WORKDIR}/T${T}"
done

# Copy every article listed in the metadata (columns: id, tome, …) from its tome.
while read -r LINE
do
    ID="${LINE%%,*}"
    LINE="${LINE#*,}"
    T="${LINE%%,*}"
    cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID}."* "${WORKDIR}/T${T}"
done < <(tail -n +2 "${INPUT_METADATA}")

mv "${WORKDIR}" "${OUTPUT}"
from lxml import etree
from sys import stdin
from os.path import basename
import re

def dom(name, attributes, inside):
    # Build an element whose content is either a text string or a list of children.
    elem = etree.Element(name, attributes)
    if isinstance(inside, str):
        elem.text = inside
    else:
        for child in inside:
            elem.append(child)
    return elem

def analyze(inputArticle):
    # Split an article into its head (first node) and its contents,
    # providing defaults for the metadata fields that are missing.
    article = {"contents": []}
    for node in inputArticle.xpath('//text/*'):
        if 'head' not in article:
            article['head'] = node.text
        else:
            article['contents'].append(node)
    if 'author' not in article:
        article['author'] = "anonyme"
    for c in ['norm', 'generated']:
        if c + 'Class' not in article:
            article[c + 'Class'] = ""
    return article

def teiHeader(article):
    return dom('teiHeader', {}, [
        dom('fileDesc', {}, [
            dom('titleStmt', {}, [
                dom('title', {}, article['head'])
            ]),
            dom('publicationStmt', {}, [
                dom('p', {}, "Annotated with TreeTagger for project GEODE")
            ]),
            dom('sourceDesc', {}, [
                dom('bibl', {}, [
                    dom('title', {}, article['head']),
                    dom('author', {}, article['author'])
                ])
            ])
        ])
    ])

def contents(article):
    # Remove the xml:id attributes from the <w> elements of the article body.
    xmlId = etree.QName('{http://www.w3.org/XML/1998/namespace}id')
    for c in article['contents']:
        for w in c.xpath('.//w'):
            if xmlId in w.attrib:
                del w.attrib[xmlId]
    return article['contents']

def buildDocument(docId, article):
    textAttributes = {
        'id': docId,
        'title': article['head'],
        'normClass': article['normClass'],
        'generatedClass': article['generatedClass'],
        'author': article['author']
    }
    return dom('TEI', {}, [
        teiHeader(article),
        dom('text', textAttributes, [
            dom('body', {}, contents(article))
        ])
    ])

if __name__ == '__main__':
    for filepath in stdin.read().splitlines():
        m = re.match(r'T(\d+)article_(\d+)\.xml', basename(filepath))
        if m is not None:
            docId = '-'.join(m.groups())
            with open(filepath) as f:
                inputArticle = etree.parse(f)
            print(etree.tounicode(buildDocument(docId, analyze(inputArticle))))
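
For reference, the document id is derived from the article file name; a quick sketch with a hypothetical name following the T<tome>article_<rank> pattern matched above:

import re

m = re.match(r'T(\d+)article_(\d+)\.xml', 'T1article_42.xml')
print('-'.join(m.groups()))  # -> 1-42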
#!/usr/bin/env python3
import os
import os.path
import stanza
import sys
from xml.sax.saxutils import escape, quoteattr

class Annotator:
    def __init__(self, source, target):
        self.source = source
        self.target = target
        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')

    def run(self):
        for name in os.listdir(self.source):
            if os.path.isdir(f'{self.source}/{name}'):
                self.annotate_dir(name)

    def annotate_dir(self, directory):
        # Mirror the source tree under the target directory, annotating every file.
        source_path = f'{self.source}/{directory}'
        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
        for name in os.listdir(source_path):
            path = f'{source_path}/{name}'
            relpath = f'{directory}/{name}'
            if os.path.isdir(path):
                self.annotate_dir(relpath)
            elif os.path.isfile(path):
                self.annotate_file(relpath)

    def annotate_file(self, file):
        with open(f'{self.source}/{file}', 'r') as source:
            self.encode(f'{file[:-4]}.tei', self.model(source.read()))

    def encode(self, file, annotation):
        with open(f'{self.target}/{file}', 'w') as target:
            for sentence in annotation.sentences:
                self.encode_sentence(sentence, target)

    def encode_sentence(self, sentence, target):
        print('<s>', file=target)
        for token in sentence.tokens:
            self.encode_token(token, target)
        print('</s>', file=target)

    def encode_token(self, token, target):
        # Multi-word tokens get their lemmas and POS tags joined with '+';
        # values are escaped so the output stays well-formed XML.
        form = token.text
        lemma = '+'.join(map(lambda w: w.lemma, token.words))
        upos = '+'.join(map(lambda w: w.upos, token.words))
        print(f'\t<w lemma={quoteattr(lemma)} pos={quoteattr(upos)}>{escape(form)}</w>', file=target)

if __name__ == '__main__':
    Annotator(sys.argv[1], sys.argv[2]).run()
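
A minimal sketch of the stanza objects this annotator walks, assuming the French models are available (provided here by the stanza-fr package in the manifest, otherwise downloadable with stanza.download('fr')):

import stanza

nlp = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')
doc = nlp("Les abeilles produisent du miel.")
for sentence in doc.sentences:
    for token in sentence.tokens:
        print(token.text, [(word.lemma, word.upos) for word in token.words])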