Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • abrenon/outillage
1 result
Show changes
Commits on Source (4)
Showing
with 157 additions and 14 deletions
File moved
from BERT.Base import BERT from BERT.Base import BERT
import datetime import datetime
from loaders import set_random import numpy
import random
import time import time
import torch import torch
from torch.optim import AdamW from torch.optim import AdamW
...@@ -31,9 +32,13 @@ class Trainer(BERT): ...@@ -31,9 +32,13 @@ class Trainer(BERT):
num_warmup_steps = 0, # Default value in run_glue.py num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = self.epochs * len(data_loader)) num_training_steps = self.epochs * len(data_loader))
def __call__(self): def __call__(self, seed_value=42):
set_random() random.seed(seed_value)
losses = [self.epoch(e) for e in range(self.epochs)] numpy.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
for e in range(self.epochs):
self.epoch(e)
self.save() self.save()
print("\nTraining complete!") print("\nTraining complete!")
......
from BERT.Base import BERT from BERT.Base import BERT
from BERT.Classifier import Classifier from BERT.Classifier import Classifier
from BERT.LabeledData import LabeledData
from BERT.Trainer import Trainer from BERT.Trainer import Trainer
from GEODE.Classification.discursive import functions as discursiveFunctions
# Names of the knowledge domains, as a list of French labels.
knowledgeDomains = [ 'Agriculture',
'Beaux-arts',
'Belles-lettres',
'Chasse',
'Commerce',
'Droit Jurisprudence',
'Géographie',
'Histoire',
'Histoire naturelle',
'Médecine',
'Métiers',
'Militaire',
'Musique',
'Philosophie',
'Physique',
'Politique',
'Religion' ]
functions = {'Historical narrative', functions = ['Historical narrative',
'People narrative', 'People narrative',
'Critical', 'Critical',
'Description', 'Description',
...@@ -6,4 +6,4 @@ functions = {'Historical narrative', ...@@ -6,4 +6,4 @@ functions = {'Historical narrative',
'Example', 'Example',
'Reasoning', 'Reasoning',
'Quotation', 'Quotation',
'Prescriptive'} 'Prescriptive']
from GEODE.store import prepare
import matplotlib.pyplot as plot
import seaborn
def heatmap(matrix, filePath, labels, **kwargs):
    """Draw `matrix` as a seaborn heatmap and save the picture to `filePath`.

    `labels` is used for the tick labels of both axes. Any extra keyword
    argument is handed over to `seaborn.heatmap`; the colormap defaults to
    'Blues' when the caller does not pick one. `filePath` goes through
    `prepare` before being given to `savefig`.
    """
    kwargs.setdefault('cmap', 'Blues')
    figure = plot.figure(figsize=(16,13))
    try:
        seaborn.heatmap(
            matrix, xticklabels=labels, yticklabels=labels, **kwargs
        )
        plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
    finally:
        # pyplot keeps every figure it created alive until it is closed
        # explicitly, so release this one even when drawing or saving fails.
        plot.close(figure)
from GEODE.Classification import discursiveFunctions
from GEODE.Metadata import article, paragraph, fromKey, relativePath, toKey, uid
from GEODE.store import corpus, Directory, SelfContained, toTSV
from GEODE.Visualisation import heatmap
import math
def curry(f):
    """Split the first argument of `f` from the remaining ones.

    `curry(f)(x)(*args, **kwargs)` is the same as `f(x, *args, **kwargs)`.
    Usable as a decorator: the decorated name then takes its first argument
    alone and returns a callable waiting for the rest.
    """
    def takeFirst(x):
        def takeRest(*args, **kwargs):
            # keyword arguments are forwarded too, so the curried form does
            # not restrict how the remaining parameters can be passed
            return f(x, *args, **kwargs)
        return takeRest
    return takeFirst
def gate(n, size, offset=0):
    """Return a list of `size` flags numbered from `offset`.

    The flag whose number equals `n` is 1, every other one is 0 (so the
    list is all zeros when `n` falls outside of the numbered range).
    """
    flags = []
    for position in range(offset, offset + size):
        flags.append(1 if position == n else 0)
    return flags
@curry
def orientedIntersection(l, sNew, sOld):
    """Length of the overlap between segment `sNew` and segment `sOld`.

    `l` holds two widths: `l[0]` for the segments numbered by `sNew` and
    `l[1]` for those numbered by `sOld`; segment `k` of width `w` spans
    from `k*w` to `(k+1)*w`. Returns 0 when the segments do not overlap.
    """
    newWidth = l[0]
    oldWidth = l[1]
    start = max(sNew*newWidth, sOld*oldWidth)
    end = min((sNew+1)*newWidth, (sOld+1)*oldWidth)
    overlap = end - start
    return max(overlap, 0)
@curry
def resample(newSize, distribution):
    """Yield `distribution` redistributed over `newSize` equal segments.

    Both the old and the new segmentation are laid on a common grid of
    `lcm(newSize, len(distribution))` units; each new value is the sum of
    the old values weighted by the share of the old segment that the new
    segment covers.
    """
    oldSize = len(distribution)
    lcm = math.lcm(newSize, oldSize)
    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
    for i in range(newSize):
        # Exact integer floor/ceil of i*oldSize/newSize and
        # (i+1)*oldSize/newSize: going through a float ratio can land just
        # above the exact bound and make the range reach past the last index.
        first = (i*oldSize) // newSize
        last = -((-(i+1)*oldSize) // newSize)
        yield oldSize/lcm*sum(distribution[j]*intersection(i, j)
                              for j in range(first, last))
from GEODE import fromKey, relativePath from GEODE.Metadata import fromKey, relativePath
from GEODE.store.TSV import toTSV
import pandas import pandas
from os import makedirs from os import makedirs
from os.path import dirname, isdir from os.path import dirname, isdir
...@@ -54,7 +55,7 @@ class TSVIndexed(Corpus): ...@@ -54,7 +55,7 @@ class TSVIndexed(Corpus):
def full(self, key, row): def full(self, key, row):
d = self.key(key, row) d = self.key(key, row)
d[self.column_name] = self.content(key, row).strip() + '\n' d[self.column_name] = self.content(key, row).strip()
return d return d
def get_all(self, projector=None, where=None): def get_all(self, projector=None, where=None):
...@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed): ...@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed):
def save(self, iterator): def save(self, iterator):
self.data = pandas.DataFrame(iterator) self.data = pandas.DataFrame(iterator)
self.detect_keys() self.detect_keys()
self.data.to_csv(self.tsv_path, sep='\t', index=False) toTSV(self.tsv_path, self.data)
class Directory(TSVIndexed): class Directory(TSVIndexed):
""" """
...@@ -144,7 +145,7 @@ class Directory(TSVIndexed): ...@@ -144,7 +145,7 @@ class Directory(TSVIndexed):
self.detect_keys() self.detect_keys()
for _, row in self.data.iterrows(): for _, row in self.data.iterrows():
self.write_text(row, row[self.column_name]) self.write_text(row, row[self.column_name])
self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False) toTSV(self.tsv_path, self.data[self.keys])
def corpus(path, **kwargs): def corpus(path, **kwargs):
if path[-1:] == '/' or isdir(path): if path[-1:] == '/' or isdir(path):
......
from GEODE.signal import curry
from numpy import vectorize
import pandas
@curry
def toStrKey(areParagraphs, row):
    """Format `row` as a 'work_volume_article' string key.

    Volume is zero-padded to 2 digits and article to 4; when
    `areParagraphs` is true a 4-digit paragraph number is appended.
    `row` is unpacked as keyword arguments to `str.format`.
    """
    template = "{work}_{volume:02d}_{article:04d}"
    suffix = "_{paragraph:04d}" if areParagraphs else ""
    return (template + suffix).format(**row)
def forPanda(data, f):
    """Return a vectorized function mapping positions in `data` through `f`.

    Each input value `i` is turned into `f(data.iloc[i])`.
    """
    def atPosition(i):
        return f(data.iloc[i])
    return vectorize(atPosition)
def toTSV(filePath, data, sortBy='toStrKey'):
    """Write `data` to `filePath` as tab-separated values, without the index.

    `data` is wrapped into a `pandas.DataFrame` when it is not one already.
    `sortBy` selects the ordering of the rows:
      - 'toStrKey' (default): order by the key built by `toStrKey`, with the
        paragraph part included when `data` has a 'paragraph' column;
      - None: keep the rows in their current order;
      - any other value: a function of a row returning its sort key.
    """
    if not isinstance(data, pandas.DataFrame):
        data = pandas.DataFrame(data)
    if sortBy is None or len(data.index) == 0:
        # Nothing to reorder. The explicit check on empty tables matters:
        # the key function is built with numpy.vectorize, which refuses
        # size-0 inputs.
        sortedData = data
    else:
        if sortBy == 'toStrKey':
            sortBy = toStrKey('paragraph' in data)
        sortedData = data.sort_index(key=forPanda(data, sortBy))
    sortedData.to_csv(filePath, sep='\t', index=False)
from GEODE.store.Corpus import corpus, Directory, SelfContained
from GEODE.store.TSV import toTSV
import os
import os.path
def prepare(path):
    """Make sure the directory part of `path` exists, then return `path`.

    Paths without any '/' are returned untouched; otherwise the missing
    directories leading to `path` are created.
    """
    if '/' not in path:
        return path
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    return path
import json import json
import sys
def load(file_path): def load(file_path):
if type(file_path) == str: if type(file_path) == str:
...@@ -9,7 +10,22 @@ def load(file_path): ...@@ -9,7 +10,22 @@ def load(file_path):
for line in file_path.readlines(): for line in file_path.readlines():
yield json.loads(line) yield json.loads(line)
"""
def load(file_path):
if type(file_path) == str:
with open(file_path, 'r') as input_file:
return list(loadObjects(input_file))
else:
return loadObjects(file_path)
def loadObjects(input_file):
for line in input_file.readlines():
yield json.loads(line)
"""
def save(file_path, objects): def save(file_path, objects):
if file_path == '-':
file_path = sys.stdin
if type(file_path) == str: if type(file_path) == str:
with open(file_path, 'w') as output_file: with open(file_path, 'w') as output_file:
saveObjects(output_file, objects) saveObjects(output_file, objects)
...@@ -18,5 +34,5 @@ def save(file_path, objects): ...@@ -18,5 +34,5 @@ def save(file_path, objects):
def saveObjects(output_file, objects): def saveObjects(output_file, objects):
for obj in objects: for obj in objects:
json.dump(obj, output_file) json.dump(obj, output_file, separators=(',', ':'))
print(file=output_file) print(file=output_file)
from GEODE import uid
def getUID(annotation):
    """Return the `uid` of the metadata found under the 'meta' key."""
    metadata = annotation['meta']
    return uid(metadata)
def UnknownAnswer(annotation, answer):
    """Print a message about an answer value that is not supported."""
    message = f"Unsupported answer '{answer}' for annotation {getUID(annotation)}"
    print(message)
def TwoAnnotations(annotation, first, second):
    """Print a message about an annotation carrying two different values."""
    message = (
        f"Found two annotations for {getUID(annotation)}: "
        f"'{first}' and '{second}'"
    )
    print(message)
def Contradiction(annotation, label):
    """Print a message about a label both accepted and rejected."""
    message = (
        f"Contradiction found for {getUID(annotation)}: "
        f"function {label} should be both accepted and rejected"
    )
    print(message)
def NoLabelLeft(text):
    """Print a message about a text for which no function remains possible."""
    message = f"No possible function left for {uid(text)}"
    print(message)
...@@ -12,3 +12,7 @@ def checkBound(f): ...@@ -12,3 +12,7 @@ def checkBound(f):
def parseRatio(s): def parseRatio(s):
return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s)) return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s))
def toIterator(*args):
    """Yield the elements of every iterable in `args`, one iterable after
    the other, in the order they were given."""
    for iterable in args:
        yield from iterable
...@@ -7,10 +7,12 @@ ...@@ -7,10 +7,12 @@
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt)) ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava ((gnu packages haskell-xyz) #:select (ghc-cassava
ghc-hs-conllu ghc-hs-conllu
ghc-random)) ghc-random
ghc-regex-tdfa))
((gnu packages machine-learning) #:select (python-scikit-learn python-spacy))
((gnu packages python) #:select (python)) ((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas)) ((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-beautifulsoup4)) ((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn))
((gnu packages xml) #:select (python-lxml))) ((gnu packages xml) #:select (python-lxml)))
;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm")) ;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
...@@ -32,12 +34,16 @@ ...@@ -32,12 +34,16 @@
ghc-hs-conllu ; working on syntax-annotated documents ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents ghc-hxt ; working on xml documents
ghc-random ; sampling data at random ghc-random ; sampling data at random
ghc-regex-tdfa ; working with regexps in haskell
processing-lge ; extracting articles from the BnF files processing-lge ; extracting articles from the BnF files
python ; scripts python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files python-beautifulsoup4 ; extract EDdA metadata from TEI files
;python-edda ; TODO ;python-edda ; TODO
python-lxml ; fusion articles into tomes for TXM python-lxml ; fusion articles into tomes for TXM
python-pandas ; working with CSV in python python-pandas ; working with CSV in python
python-scikit-learn ; evaluating models
python-seaborn ; draw figures
python-spacy ; working with prodigy's custom formats
python-stanza ; annotation python-stanza ; annotation
sed ; select files from listing sed ; select files from listing
stanza-fr ; annotation stanza-fr ; annotation
......
#!/bin/sh #!/bin/sh
source ${0%/*}/../lib.sh source ${0%%/*}/lib/bash.sh
if [ "$#" != 2 ] if [ "$#" != 2 ]
then then
......