Compare revisions
Target project: abrenon/outillage
Commits on Source (4)
Showing with 157 additions and 14 deletions
File moved
from BERT.Base import BERT
import datetime
from loaders import set_random
import numpy
import random
import time
import torch
from torch.optim import AdamW
@@ -31,9 +32,13 @@ class Trainer(BERT):
            num_warmup_steps = 0, # Default value in run_glue.py
            num_training_steps = self.epochs * len(data_loader))
    def __call__(self):
        set_random()
        losses = [self.epoch(e) for e in range(self.epochs)]
    def __call__(self, seed_value=42):
        random.seed(seed_value)
        numpy.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        for e in range(self.epochs):
            self.epoch(e)
        self.save()
        print("\nTraining complete!")
......
from BERT.Base import BERT
from BERT.Classifier import Classifier
from BERT.LabeledData import LabeledData
from BERT.Trainer import Trainer
from GEODE.Classification.discursive import functions as discursiveFunctions
knowledgeDomains = [ 'Agriculture',
                     'Beaux-arts',
                     'Belles-lettres',
                     'Chasse',
                     'Commerce',
                     'Droit Jurisprudence',
                     'Géographie',
                     'Histoire',
                     'Histoire naturelle',
                     'Médecine',
                     'Métiers',
                     'Militaire',
                     'Musique',
                     'Philosophie',
                     'Physique',
                     'Politique',
                     'Religion' ]
functions = {'Historical narrative',
functions = ['Historical narrative',
             'People narrative',
             'Critical',
             'Description',
@@ -6,4 +6,4 @@ functions = {'Historical narrative',
             'Example',
             'Reasoning',
             'Quotation',
             'Prescriptive'}
             'Prescriptive']
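Switching from a set to a list presumably gives the discursive functions a stable order, which matters as soon as the labels are mapped to numeric indices; a minimal illustration (the dictionary names are ours):

from GEODE.Classification.discursive import functions

# With a list, each label keeps the same index from one run to the next;
# iterating over a set would not guarantee that.
label2id = {label: i for i, label in enumerate(functions)}
id2label = dict(enumerate(functions))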
from GEODE.store import prepare
import matplotlib.pyplot as plot
import seaborn
def heatmap(matrix, filePath, labels, **kwargs):
    plot.figure(figsize=(16,13))
    if 'cmap' not in kwargs:
        kwargs['cmap'] = 'Blues'
    ax = seaborn.heatmap(
        matrix, xticklabels=labels, yticklabels=labels, **kwargs
    )
    plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
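A possible call site, to show how the extra keyword arguments flow through to seaborn (the matrix, labels and output path are made up for the example):

confusion = [[12, 3],
             [1, 20]]
heatmap(confusion, 'figures/confusion.png', ['accepted', 'rejected'],
        annot=True, fmt='d')
# 'cmap' is not passed, so the function falls back to 'Blues';
# prepare() creates the figures/ directory if it does not exist yet.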
from GEODE.Classification import discursiveFunctions
from GEODE.Metadata import article, paragraph, fromKey, relativePath, toKey, uid
from GEODE.store import corpus, Directory, SelfContained, toTSV
from GEODE.Visualisation import heatmap
import math
def curry(f):
    return lambda x: (lambda *args: f(x, *args))
def gate(n, size, offset=0):
    return [1 if i == n else 0 for i in range(offset, offset+size)]
@curry
def orientedIntersection(l, sNew, sOld):
    left = max(sNew*l[0], sOld*l[1])
    right = min((sNew+1)*l[0], (sOld+1)*l[1])
    return max(right-left, 0)
@curry
def resample(newSize, distribution):
    oldSize = len(distribution)
    lcm = math.lcm(newSize, oldSize)
    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
    ratio = oldSize / newSize
    for i in range(newSize):
        yield oldSize/lcm*sum([distribution[j]*intersection(i, j)
                               for j in range(math.floor(i*ratio),
                                              math.ceil((i+1)*ratio))])
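To make the resampling concrete, here is what the generator yields on two toy distributions, worked out from the definitions above; note that the total mass of the distribution is preserved in both directions:

# Downsampling: 4 bins -> 2 bins, each new bin absorbing two old ones.
list(resample(2)([1, 2, 3, 4]))   # [3.0, 7.0]
# Upsampling: 2 bins -> 3 bins, the middle bin straddling both old ones.
list(resample(3)([1, 2]))         # [0.666..., 1.0, 1.333...]
# gate() simply builds a one-hot vector of the requested size.
gate(1, 4)                        # [0, 1, 0, 0]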
from GEODE import fromKey, relativePath
from GEODE.Metadata import fromKey, relativePath
from GEODE.store.TSV import toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir
@@ -54,7 +55,7 @@ class TSVIndexed(Corpus):
    def full(self, key, row):
        d = self.key(key, row)
        d[self.column_name] = self.content(key, row).strip() + '\n'
        d[self.column_name] = self.content(key, row).strip()
        return d
    def get_all(self, projector=None, where=None):
@@ -98,7 +99,7 @@ class SelfContained(TSVIndexed):
    def save(self, iterator):
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        self.data.to_csv(self.tsv_path, sep='\t', index=False)
        toTSV(self.tsv_path, self.data)
class Directory(TSVIndexed):
    """
@@ -144,7 +145,7 @@ class Directory(TSVIndexed):
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        self.data[self.keys].to_csv(self.tsv_path, sep='\t', index=False)
        toTSV(self.tsv_path, self.data[self.keys])
def corpus(path, **kwargs):
    if path[-1:] == '/' or isdir(path):
......
from GEODE.signal import curry
from numpy import vectorize
import pandas
@curry
def toStrKey(areParagraphs, row):
    key = "{work}_{volume:02d}_{article:04d}"
    if areParagraphs:
        key += "_{paragraph:04d}"
    return key.format(**row)
def forPanda(data, f):
    return vectorize(lambda i: f(data.iloc[i]))
def toTSV(filePath, data, sortBy='toStrKey'):
    if type(data) != pandas.DataFrame:
        data = pandas.DataFrame(data)
    if sortBy == 'toStrKey':
        sortBy = toStrKey('paragraph' in data)
    if sortBy is None:
        sortedData = data
    else:
        sortedData = data.sort_index(key=forPanda(data, sortBy))
    sortedData.to_csv(filePath, sep='\t', index=False)
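For illustration, toTSV can be fed plain dictionaries as long as they carry the key columns expected by toStrKey (the file name and row contents below are invented):

rows = [{'work': 'EDdA', 'volume': 1, 'article': 12, 'content': 'text of one article'},
        {'work': 'EDdA', 'volume': 1, 'article': 3, 'content': 'text of another article'}]
# No 'paragraph' column, so rows are sorted on "{work}_{volume:02d}_{article:04d}":
# "EDdA_01_0003" ends up before "EDdA_01_0012" in the output file.
toTSV('articles.tsv', rows)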
from GEODE.store.Corpus import corpus, Directory, SelfContained
from GEODE.store.TSV import toTSV
import os
import os.path
def prepare(path):
    if '/' in path:
        os.makedirs(os.path.dirname(path), exist_ok=True)
    return path
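In other words, callers can write to a nested path without checking for the directory first; for instance (the path is illustrative):

with open(prepare('output/figures/scores.tsv'), 'w') as file:
    # 'output/figures/' is created on the fly if it does not exist yet.
    file.write("label\tscore\n")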
import json
import sys
def load(file_path):
    if type(file_path) == str:
@@ -9,7 +10,22 @@ def load(file_path):
    for line in file_path.readlines():
        yield json.loads(line)
"""
def load(file_path):
    if type(file_path) == str:
        with open(file_path, 'r') as input_file:
            return list(loadObjects(input_file))
    else:
        return loadObjects(file_path)
def loadObjects(input_file):
    for line in input_file.readlines():
        yield json.loads(line)
"""
def save(file_path, objects):
    if file_path == '-':
        file_path = sys.stdout
    if type(file_path) == str:
        with open(file_path, 'w') as output_file:
            saveObjects(output_file, objects)
@@ -18,5 +34,5 @@ def save(file_path, objects):
def saveObjects(output_file, objects):
    for obj in objects:
        json.dump(obj, output_file)
        json.dump(obj, output_file, separators=(',', ':'))
        print(file=output_file)
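Taken together, save and saveObjects produce one compact JSON object per line (the JSON Lines convention); a quick illustration with made-up records:

records = [{'uid': 'EDdA_01_0001', 'answer': 'accept'},
           {'uid': 'EDdA_01_0002', 'answer': 'reject'}]
save('annotations.jsonl', records)
# annotations.jsonl now contains, thanks to separators=(',', ':'):
# {"uid":"EDdA_01_0001","answer":"accept"}
# {"uid":"EDdA_01_0002","answer":"reject"}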
from GEODE import uid
def getUID(annotation):
    return uid(annotation['meta'])
def UnknownAnswer(annotation, answer):
    print(f"Unsupported answer '{answer}' for annotation {getUID(annotation)}")
def TwoAnnotations(annotation, first, second):
    print(f"Found two annotations for {getUID(annotation)}: " +
          f"'{first}' and '{second}'")
def Contradiction(annotation, label):
    print(f"Contradiction found for {getUID(annotation)}: " +
          f"function {label} should be both accepted and rejected")
def NoLabelLeft(text):
    print(f"No possible function left for {uid(text)}")
@@ -12,3 +12,7 @@ def checkBound(f):
def parseRatio(s):
    return checkBound(int(s[:-1]) / 100 if s[-1] == '%' else float(s))
def toIterator(*args):
    for arg in args:
        for elem in arg:
            yield elem
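Concretely, parseRatio accepts either a percentage or a bare float (the actual validation happens in checkBound, collapsed above, which is assumed here to return the value it checks), and toIterator chains its arguments much like itertools.chain:

parseRatio('80%')                             # 0.8, via int('80') / 100
parseRatio('0.15')                            # 0.15, via float('0.15')
list(toIterator([1, 2], (3,), range(4, 6)))   # [1, 2, 3, 4, 5]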
@@ -7,10 +7,12 @@
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava
                                      ghc-hs-conllu
                                      ghc-random))
                                      ghc-random
                                      ghc-regex-tdfa))
((gnu packages machine-learning) #:select (python-scikit-learn python-spacy))
((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-beautifulsoup4))
((gnu packages python-xyz) #:select (python-beautifulsoup4 python-seaborn))
((gnu packages xml) #:select (python-lxml)))
;(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm"))
@@ -32,12 +34,16 @@
ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents
ghc-random ; sampling data at random
ghc-regex-tdfa ; working with regexps in haskell
processing-lge ; extracting articles from the BnF files
python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files
;python-edda ; TODO
python-lxml ; fusion articles into tomes for TXM
python-pandas ; working with CSV in python
python-scikit-learn ; evaluating models
python-seaborn ; draw figures
python-spacy ; working with prodigy's custom formats
python-stanza ; annotation
sed ; select files from listing
stanza-fr ; annotation
......
#!/bin/sh
source ${0%/*}/../lib.sh
source ${0%%/*}/lib/bash.sh
if [ "$#" != 2 ]
then
......