Commit 58f04ebd authored by Alice Brenon

Import a collection of existing scripts

Makefile 0 → 100644
CORPUS_ROOT=/home/alice/Dehors/Corpus
EDDA=$(CORPUS_ROOT)/EDdA
LGE=$(CORPUS_ROOT)/LGE

EDDA_META=$(EDDA)/metadata.csv
METADATA=$(EDDA_META)

RAW_LGE=$(LGE)/Text
TEI_LGE=$(LGE)/TEI

PARALLEL_LGE=$(LGE)/Parallel
LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza

all: $(METADATA) $(STANZA_PARALLEL_LGE)

$(EDDA_META): $(EDDA)/TEI/
	./scripts/EDdA-metadata.py $< > $@

$(LGE_META_FROM_EDDA): $(EDDA_META)
	mkdir -p $(dir $@)
	./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@

$(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
	./scripts/extract_classified_LGE.sh $^ $@

$(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
	./scripts/extract_classified_LGE.sh $^ $@

$(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
	./scripts/stanza-annotator.py $< $@
(use-modules ((gnu packages base) #:select (coreutils))
             ((geode packages annotation) #:select (python-stanza))
             ((geode packages models) #:select (stanza-fr))
             ((gnu packages python) #:select (python))
             ((gnu packages python-xyz) #:select (python-beautifulsoup4))
             ((gnu packages xml) #:select (python-lxml)))

(packages->manifest
  (list
    coreutils             ; mktemp for atomic processing, strip CSV headers, general scripting
    python                ; scripts
    python-beautifulsoup4 ; extract EDdA metadata from TEI files
    python-lxml           ; merge articles into tomes for TXM
    python-stanza         ; annotation
    stanza-fr             ; annotation
    ))
#!/usr/bin/env python3
import csv
from bs4 import BeautifulSoup
import os
import sys

header = ["T", "article", "head", "domain"]

def getAttribute(article, attribute):
    result = article.find(type=attribute)
    return result.get('value') if result else ''

def main(rootDirectory):
    output = csv.writer(sys.stdout)
    output.writerow(header)
    for t in range(1, 18):
        path = f"{rootDirectory}/T{t}"
        for rank in range(1, len(os.listdir(path))+1):
            root = BeautifulSoup(open(f"{path}/article{rank}.tei"))
            normclass = getAttribute(root, "normclass")
            output.writerow([
                t,
                rank,
                getAttribute(root, "head"),
                normclass if normclass != 'unclassified' else getAttribute(root, "generatedclass")
            ])

if __name__ == '__main__':
    main(sys.argv[1])
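
For reference, a minimal sketch of what getAttribute extracts from an article: it looks for an element carrying a given type attribute and returns its value attribute. The one-element fragment below is hypothetical, shaped after the queries the script performs.

from bs4 import BeautifulSoup

# Hypothetical TEI-like fragment with the attribute layout the script expects.
sample = BeautifulSoup('<index type="normclass" value="Géographie"/>', features="xml")
print(sample.find(type="normclass").get('value'))  # -> Géographie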
#!/usr/bin/env python3
import csv
import sys

def EDdARow(columns):
    return {
        'tome': columns[0],
        'article': columns[1],
        'head': columns[2],
        'domain': columns[3]
    }

def LGERow(tome):
    return lambda columns: {
        'id': columns[0],
        'tome': tome,
        'rank': columns[1],
        'head': columns[2]
    }

def unserializeMetadata(path, rowReader):
    with open(path) as f:
        inputFile = csv.reader(f)
        header = True
        for row in inputFile:
            if header:
                header = False
            else:
                yield rowReader(row)

def concat(generators):
    for g in generators:
        for x in g:
            yield x
def naiveIndexBy(field, elements):
    # Index elements by a field; ambiguous keys are mapped to None.
    d = {}
    for e in elements:
        key = e[field]
        if key in d:
            d[key] = None
        else:
            d[key] = e
    return d

def growPrefixes(d, keys, maxLength):
    # Add the prefixes of the existing keys to the index, longest first;
    # prefixes that become ambiguous are recorded as None and not shortened further.
    for length in range(maxLength, 1, -1):
        newGeneration = {}
        for key in keys:
            if len(key) == length:
                newKey = key[:-1]
                if newKey in newGeneration or newKey in d:
                    newGeneration[newKey] = None
                else:
                    newGeneration[newKey] = d[key]
        for key, value in newGeneration.items():
            if value is not None:
                d[key] = value
                keys.add(key)
            elif key not in d:
                d[key] = value

def indexBy(field, elements, prefix=True):
    d = {}
    for e in elements:
        key = e[field]
        if key in d:
            d[key] = None
        else:
            d[key] = e
    if prefix:
        keys = set(d.keys())
        growPrefixes(d, keys, max(map(len, keys)))
    return d
def headWords(head):
    # Keep only the upper-case words of a (possibly multi-word) headword.
    words = head.split()
    if len(words) == 1:
        return words
    else:
        return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()]

def identify(head, haystack):
    # Look a headword up in the index, falling back to ever shorter prefixes.
    if head in haystack:
        if haystack[head] is not None:
            return {'type': 'exact', 'match': head, 'found': haystack[head]}
        else:
            return None
    else:
        prefix = head[:-1]
        while len(prefix) > 0 and prefix not in haystack:
            prefix = prefix[:-1]
        if prefix in haystack and haystack[prefix] is not None:
            return {
                'type': 'prefix',
                'match': head,
                'found': haystack[prefix],
                'precision': len(prefix) / len(head)
            }
        else:
            return None

def naiveGetArrows(source, target):
    # Pair up articles whose headwords match exactly and unambiguously.
    indexedSource = naiveIndexBy('head', source)
    indexedTarget = naiveIndexBy('head', target)
    for head, article in indexedSource.items():
        if article is not None and head in indexedTarget and indexedTarget[head] is not None:
            yield {
                'source': article,
                'target': indexedTarget[head]
            }

def getArrows(source, target):
    for article in source:
        heads = headWords(article['head'])
        identified = map(lambda w: identify(w, target), heads)
        entries = [e for e in identified if e is not None]
        if len(entries) == 1:
            yield {
                'type': 'match',
                'source': article,
                'target': entries[0]
            }
        elif len(entries) > 1:
            yield {
                'type': 'ambiguity',
                'source': article,
                'target': entries
            }

def interesting(arrow):
    if arrow['type'] == 'match':
        target = arrow['target']
        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
#gold = [a for a in arrows if interesting(a)]
def getMetadata(arrows, path=None):
    output = sys.stdout if path is None else open(path, 'w')
    toCsv = csv.writer(output)
    toCsv.writerow(['id', 'tome', 'rank', 'head', 'domain'])
    for arrow in arrows:
        toCsv.writerow([
            arrow['target']['id'],
            arrow['target']['tome'],
            arrow['target']['rank'],
            arrow['target']['head'],
            arrow['source']['domain']
        ])

if __name__ == '__main__':
    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
    lge = list(concat([
        unserializeMetadata(f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T)) for T in range(1, 32)
    ]))
    getMetadata(list(naiveGetArrows(edda, lge)), sys.argv[3])
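
A minimal, self-contained sketch of the prefix fallback performed by identify above, reduced to the lookup step only; the haystack and headwords here are hypothetical.

def prefix_lookup(head, haystack):
    # Walk shorter and shorter prefixes of the head until one is indexed.
    prefix = head
    while prefix and prefix not in haystack:
        prefix = prefix[:-1]
    return haystack.get(prefix) if prefix else None

haystack = {'ABEILLE': 'article 1-42'}
print(prefix_lookup('ABEILLES', haystack))  # -> 'article 1-42', matched on the prefix 'ABEILLE'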
#!/bin/sh
INPUT_DIR="${1%/}"
TOME_NUMBER="${INPUT_DIR##*/T}"
TOME="T${TOME_NUMBER}"
OUTPUT="${2%/}/${TOME}.xml"
cat <<EOF > "${OUTPUT}"
<teiCorpus version="3.3.0" xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>
          La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres et
          des arts par une société de savants et de gens de lettres. Tome ${TOME_NUMBER}.
        </title>
        <respStmt>
          <resp>Digitized by</resp>
          <orgName>
            Bibliothèque Nationale de France
          </orgName>
          <resp>Encoded by</resp>
          <orgName>
            ICAR
          </orgName>
        </respStmt>
      </titleStmt>
      <publicationStmt>
        <distributor>
          <orgName>
            Project GÉODE
          </orgName>
          <address>
            <addrLine>
              ICAR UMR 5191
            </addrLine>
            <addrLine>
              ENS de Lyon
            </addrLine>
          </address>
        </distributor>
      </publicationStmt>
      <sourceDesc>
        <bibl>
          <title>
            La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres et
            des arts par une société de savants et de gens de lettres. Tome ${TOME_NUMBER}.
          </title>
          <author>
            Collective
          </author>
          <creation>
            <date>
              1885
            </date>
          </creation>
          <imprint>
            <date>
              1885
            </date>
            <publisher>
              H. Lamirault et Cie,
            </publisher>
            <pubPlace>
              Paris
            </pubPlace>
          </imprint>
          <biblScope unit="volume">${TOME_NUMBER}</biblScope>
        </bibl>
      </sourceDesc>
    </fileDesc>
  </teiHeader>
EOF
find "${INPUT_DIR}" -type f -name "*.tei" | python3 "${0%/*}/reimport.py" >> "${OUTPUT}"
cat <<EOF >> "${OUTPUT}"
</teiCorpus>
EOF
#!/bin/bash
INPUT_METADATA="${1}"
SOURCE_TEXT_ARTICLES="${2}"
OUTPUT="${3}"

# Move any previous output aside under a numbered suffix instead of overwriting it.
if [ -d "${OUTPUT}" ]
then
    N=1
    while [ -d "${OUTPUT}.${N}" ]
    do
        N=$((N+1))
    done
    mv "${OUTPUT}" "${OUTPUT}.${N}"
fi

# Work in a temporary directory so the output directory appears atomically at the end.
WORKDIR=$(mktemp -d /tmp/classified-LGE.XXX)
for T in {1..31}
do
    mkdir -p "${WORKDIR}/T${T}"
done

# Copy every article listed in the metadata (columns: id, tome, …) from its tome.
while read -r LINE
do
    ID="${LINE%%,*}"
    LINE="${LINE#*,}"
    T="${LINE%%,*}"
    cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID}."* "${WORKDIR}/T${T}"
done < <(tail -n +2 "${INPUT_METADATA}")

mv "${WORKDIR}" "${OUTPUT}"
from lxml import etree
from sys import stdin
from os.path import basename
import re

def dom(name, attributes, inside):
    # Build an element whose content is either a text string or a list of children.
    elem = etree.Element(name, attributes)
    if isinstance(inside, str):
        elem.text = inside
    else:
        for child in inside:
            elem.append(child)
    return elem

def analyze(inputArticle):
    # Split an article into its head (first node) and its contents,
    # providing defaults for the metadata fields that are missing.
    article = {"contents": []}
    for node in inputArticle.xpath('//text/*'):
        if 'head' not in article:
            article['head'] = node.text
        else:
            article['contents'].append(node)
    if 'author' not in article:
        article['author'] = "anonyme"
    for c in ['norm', 'generated']:
        if c + 'Class' not in article:
            article[c + 'Class'] = ""
    return article

def teiHeader(article):
    return dom('teiHeader', {}, [
        dom('fileDesc', {}, [
            dom('titleStmt', {}, [
                dom('title', {}, article['head'])
            ]),
            dom('publicationStmt', {}, [
                dom('p', {}, "Annotated with TreeTagger for project GEODE")
            ]),
            dom('sourceDesc', {}, [
                dom('bibl', {}, [
                    dom('title', {}, article['head']),
                    dom('author', {}, article['author'])
                ])
            ])
        ])
    ])

def contents(article):
    # Remove the xml:id attributes from the <w> elements of the article body.
    xmlId = etree.QName('{http://www.w3.org/XML/1998/namespace}id')
    for c in article['contents']:
        for w in c.xpath('.//w'):
            if xmlId in w.attrib:
                del w.attrib[xmlId]
    return article['contents']

def buildDocument(docId, article):
    textAttributes = {
        'id': docId,
        'title': article['head'],
        'normClass': article['normClass'],
        'generatedClass': article['generatedClass'],
        'author': article['author']
    }
    return dom('TEI', {}, [
        teiHeader(article),
        dom('text', textAttributes, [
            dom('body', {}, contents(article))
        ])
    ])

if __name__ == '__main__':
    for filepath in stdin.read().splitlines():
        m = re.match(r'T(\d+)article_(\d+)\.xml', basename(filepath))
        if m is not None:
            docId = '-'.join(m.groups())
            with open(filepath) as f:
                inputArticle = etree.parse(f)
            print(etree.tounicode(buildDocument(docId, analyze(inputArticle))))
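
For reference, the document id is derived from the article file name; a quick sketch with a hypothetical name following the T<tome>article_<rank> pattern matched above:

import re

m = re.match(r'T(\d+)article_(\d+)\.xml', 'T1article_42.xml')
print('-'.join(m.groups()))  # -> 1-42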
#!/usr/bin/env python3
import os
import os.path
import stanza
import sys
from xml.sax.saxutils import escape, quoteattr

class Annotator:
    def __init__(self, source, target):
        self.source = source
        self.target = target
        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')

    def run(self):
        for name in os.listdir(self.source):
            if os.path.isdir(f'{self.source}/{name}'):
                self.annotate_dir(name)

    def annotate_dir(self, directory):
        # Mirror the source tree under the target directory, annotating every file.
        source_path = f'{self.source}/{directory}'
        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
        for name in os.listdir(source_path):
            path = f'{source_path}/{name}'
            relpath = f'{directory}/{name}'
            if os.path.isdir(path):
                self.annotate_dir(relpath)
            elif os.path.isfile(path):
                self.annotate_file(relpath)

    def annotate_file(self, file):
        with open(f'{self.source}/{file}', 'r') as source:
            self.encode(f'{file[:-4]}.tei', self.model(source.read()))

    def encode(self, file, annotation):
        with open(f'{self.target}/{file}', 'w') as target:
            for sentence in annotation.sentences:
                self.encode_sentence(sentence, target)

    def encode_sentence(self, sentence, target):
        print('<s>', file=target)
        for token in sentence.tokens:
            self.encode_token(token, target)
        print('</s>', file=target)

    def encode_token(self, token, target):
        # Multi-word tokens get their lemmas and POS tags joined with '+';
        # values are escaped so the output stays well-formed XML.
        form = token.text
        lemma = '+'.join(map(lambda w: w.lemma, token.words))
        upos = '+'.join(map(lambda w: w.upos, token.words))
        print(f'\t<w lemma={quoteattr(lemma)} pos={quoteattr(upos)}>{escape(form)}</w>', file=target)

if __name__ == '__main__':
    Annotator(sys.argv[1], sys.argv[2]).run()
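
A minimal sketch of the stanza objects this annotator walks, assuming the French models are available (provided here by the stanza-fr package in the manifest, otherwise downloadable with stanza.download('fr')):

import stanza

nlp = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')
doc = nlp("Les abeilles produisent du miel.")
for sentence in doc.sentences:
    for token in sentence.tokens:
        print(token.text, [(word.lemma, word.upos) for word in token.words])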