Skip to content
Snippets Groups Projects
Commit 9d197406 authored by Alice Brenon's avatar Alice Brenon
Browse files

Solidifying some useful code used here and there in the project

parents
No related branches found
No related tags found
No related merge requests found
*.pyc
def curry(f):
    """Split the first argument of `f` from the remaining ones.

    `curry(f)(x)(*args)` evaluates `f(x, *args)`; only the first argument is
    taken separately, all the others are passed together in the second call.
    """
    def takeFirst(x):
        def takeRest(*args):
            return f(x, *args)
        return takeRest
    return takeFirst
# Names of the super-domains, listed in alphabetical order.
superdomains = [
    "Agriculture",
    "Beaux-arts",
    "Belles-lettres",
    "Chasse",
    "Commerce",
    "Droit Jurisprudence",
    "Géographie",
    "Histoire",
    "Histoire naturelle",
    "Militaire",
    "Musique",
    "Médecine",
    "Métiers",
    "Philosophie",
    "Physique",
    "Politique",
    "Religion",
]
# Ordered names of the fields identifying an article...
articleKey = ['work', 'volume', 'article']
# ... and a paragraph, which adds one more field to its article's key.
paragraphKey = [*articleKey, 'paragraph']
def article(work, volume, article_):
    """Build the dict identifying an article.

    `volume` and `article_` are converted with `int`; `work` is kept as is.
    The keys of the result are the names listed in `articleKey`.
    """
    values = (work, int(volume), int(article_))
    return {field: value for field, value in zip(articleKey, values)}
def paragraph(work, volume, article_, paragraph_):
    """Build the dict identifying a paragraph.

    `volume`, `article_` and `paragraph_` are converted with `int`; `work` is
    kept as is. The keys of the result are the names listed in `paragraphKey`.
    """
    values = (work, int(volume), int(article_), int(paragraph_))
    return {field: value for field, value in zip(paragraphKey, values)}
def uid(text):
    """Return the underscore-separated identifier of a text.

    The identifier is made of the 'work', 'volume' and 'article' fields of
    `text`, followed by its 'paragraph' field when it has one.
    """
    pattern = "{work}_{volume}_{article}"
    if 'paragraph' in text:
        pattern += "_{paragraph}"
    return pattern.format(**text)
def fromUID(uid_):
    """Parse an underscore-separated identifier back into a text dict.

    :param uid_: a string with 3 components (an article) or 4 components
        (a paragraph) separated by '_'
    :return: the dict built by `article` or `paragraph`; when the number of
        components is neither 3 nor 4, a message is printed and None is
        returned
    """
    components = uid_.split('_')
    if len(components) == 3:
        return article(*components)
    elif len(components) == 4:
        return paragraph(*components)
    else:
        # Interpolate the argument `uid_`, not the module-level `uid` function
        print(f"'{uid_}' doesn't represent a valid text UID")
def relativePath(text, extension):
    """Return the relative path of the file holding a text.

    The path has the shape `work/Tvolume/article.extension`, with an extra
    `/paragraph` level before the extension when `text` has a 'paragraph'
    field.
    """
    layout = "{work}/T{volume}/{article}"
    if 'paragraph' in text:
        layout += "/{paragraph}"
    return layout.format(**text) + f".{extension}"
def toKey(text):
    """Return the tuple (work, volume, article[, paragraph]) of a text.

    The paragraph component is only present when `text` has a 'paragraph'
    field.
    """
    fields = ['work', 'volume', 'article']
    if 'paragraph' in text:
        fields.append('paragraph')
    return tuple(text[field] for field in fields)
def fromKey(key):
    """Turn a key sequence back into a text dict.

    A key as long as `articleKey` is handed to `article`, one as long as
    `paragraphKey` to `paragraph`. For any other length a message is printed
    and None is returned.
    """
    size = len(key)
    if size == len(articleKey):
        return article(*key)
    if size == len(paragraphKey):
        return paragraph(*key)
    print(f"{key} isn't a valid text key")
from GEODE.Functional import curry
import math
@curry
def orientedIntersection(l, sNew, sOld):
    """Length of the overlap between a new segment and an old one.

    `l[0]` and `l[1]` are the widths of the new and old segments; segment
    number `s` of width `w` spans from `s*w` to `(s+1)*w`. The result is 0
    when the two segments don't overlap.
    """
    newWidth, oldWidth = l[0], l[1]
    start = max(sNew*newWidth, sOld*oldWidth)
    end = min((sNew+1)*newWidth, (sOld+1)*oldWidth)
    overlap = end - start
    return max(overlap, 0)
@curry
def resample(newSize, distribution):
    """Redistribute the values of `distribution` over `newSize` bins.

    Both the old and the new bins are laid out on a common grid whose length
    is the least common multiple of the two sizes. Each new bin receives, from
    every old bin it overlaps, a share of that bin's value proportional to
    the length of the overlap, so the sum of the values is preserved.

    :param newSize: the number of bins to generate
    :param distribution: a sequence (supporting `len` and indexing) of numbers
    :return: a generator of the `newSize` resampled values
    """
    oldSize = len(distribution)
    lcm = math.lcm(newSize, oldSize)
    intersection = orientedIntersection((lcm/newSize, lcm/oldSize))
    for i in range(newSize):
        # New bin i spans [i*oldSize/newSize, (i+1)*oldSize/newSize) in units
        # of old bins. The last old bin to consider is found by rounding the
        # upper bound *up*: rounding it to the nearest integer would skip a
        # bin that is only partially covered (e.g. 5 bins resampled to 4).
        # Integer arithmetic keeps both bounds exact.
        first = (i * oldSize) // newSize
        stop = -(-((i + 1) * oldSize) // newSize)
        yield oldSize/lcm*sum(distribution[j]*intersection(i, j)
                              for j in range(first, stop))
from GEODE.Metadata import fromKey, relativePath
from GEODE.Store.Tabular import tabular, toTSV
import pandas
from os import makedirs
from os.path import dirname, isdir
def abstract(f):
    """Decorator marking `f` as not implemented.

    The function returned ignores its arguments and always raises
    NotImplementedError carrying the name of `f`; the body of `f` is never
    run.
    """
    def wrapped(*_args, **_kwargs):
        missing = f.__name__
        raise NotImplementedError(missing)
    return wrapped
class Corpus:
    """Interface shared by the corpus storage classes.

    Every method is wrapped by `abstract`, so calling one that a subclass
    didn't override raises NotImplementedError with the method's name.
    """

    @abstract
    def __init__(self):
        pass

    @abstract
    def get_text(self, primary_key):
        pass

    @abstract
    def get_all(self):
        pass

    @abstract
    def save(self, iterator):
        pass
class TSVIndexed(Corpus):
    """A corpus whose records are listed in a tabular file.

    The file is read lazily with `tabular` on first access and indexed by the
    columns 'work', 'volume', 'article' (plus 'paragraph' when the file has
    such a column). Subclasses must provide `content`.
    """
    default_keys = ['work', 'volume', 'article']
    # Names accepted as the `projector` argument of `get_all`
    projectors = ['key', 'content', 'full']

    def __init__(self, tsv_path, column_name):
        """
        Positional arguments
        :param tsv_path: the path to the tabular file listing the records
        :param column_name: the name under which the text content is exposed
        """
        self.tsv_path = tsv_path
        self.column_name = column_name
        self.data = None

    def load(self):
        """Read and index the tabular file unless data is already there."""
        if self.data is None:
            self.data = tabular(self.tsv_path)
            self.detect_keys()
            # drop=False keeps the key columns readable from the rows too
            self.data = self.data.set_index(self.keys, drop=False)

    def detect_keys(self):
        """Set `self.keys` according to the columns found in `self.data`."""
        self.keys = self.default_keys.copy()
        if 'paragraph' in self.data:
            self.keys.append('paragraph')

    @abstract
    def content(self, key, row):
        pass

    def key(self, _, row):
        """Project a row on its key columns, as a dict."""
        return row[self.keys].to_dict()

    def full(self, key, row):
        """Project a row on its key columns plus its stripped content."""
        return {**self.key(key, row),
                self.column_name: self.content(key, row).strip()}

    def get_all(self, projector=None, where=None):
        """Generate a projection of every record of the corpus.

        Keyword arguments
        :param projector: a function of (index, row), or one of the names
            listed in `projectors`; defaults to `full`
        :param where: an optional predicate of (index, row) selecting the
            records to keep
        :raises ValueError: (on first iteration) when `projector` is a string
            which isn't a known projector name
        """
        if projector is None:
            projector = self.full
        elif isinstance(projector, str):
            if projector not in self.projectors:
                raise ValueError(
                    f"Unknown projector '{projector}', "
                    f"expected one of {self.projectors}")
            projector = getattr(self, projector)
        self.load()
        for row in self.data.iterrows():
            if where is None or where(*row):
                yield projector(*row)
class SelfContained(TSVIndexed):
    """
    A corpus entirely stored in a single TSV file: each line holds the primary
    key of a text along with its content, in the column `column_name`.
    """
    def __init__(self, tsv_path, column_name='content'):
        """
        Positional arguments
        :param tsv_path: the path to a TSV dataset containing a primary key and
        a text content on every line

        Keyword arguments
        :param column_name: the name of the column where the text content is
        stored
        """
        super().__init__(tsv_path, column_name)

    def get_text(self, primary_key):
        """Return the content stored for `primary_key`.

        :param primary_key: a dict (only the fields named in `self.keys` are
            used, in that order), a tuple, or any other iterable of key
            components
        """
        self.load()
        if isinstance(primary_key, dict):
            primary_key = [primary_key[k] for k in self.keys if k in primary_key]
        if type(primary_key) != tuple:
            primary_key = tuple(primary_key)
        return self.data.xs(primary_key)[self.column_name]

    def content(self, _, row):
        """Read the content directly from the row."""
        return row[self.column_name]

    def save(self, iterator):
        """Replace the corpus by the records of `iterator` and write the TSV.

        :param iterator: anything `pandas.DataFrame` accepts, providing the
            key columns and the content column
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        toTSV(self.tsv_path, self.data)
        # Index the data as `load` does, so that `get_text` (which looks keys
        # up in the index) keeps working after a save. This must happen after
        # `toTSV`, which is handed the data as built from `iterator`.
        self.data = self.data.set_index(self.keys, drop=False)
class Directory(TSVIndexed):
    """
    A corpus stored as a folder: a TSV file lists the primary keys of the
    texts, and the content of each text is kept in its own file under the
    `Text` sub-folder, at the path given by `relativePath`. Contents are only
    read from disk when they are requested.
    """
    def __init__(self, root_path, tsv_filename="files", column_name='content'):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus

        Keyword arguments
        :param tsv_filename: the name (without extension) of the TSV file
        listing the texts, directly under `root_path`
        :param column_name: the name under which the text content is exposed
        """
        self.text_path = f"{root_path}/Text"
        super().__init__(f"{root_path}/{tsv_filename}.tsv", column_name)

    def path_to(self, primary_key):
        """Return the path of the text file associated to `primary_key`."""
        record = self.dict_primary_key(primary_key)
        return f"{self.text_path}/{relativePath(record, 'txt')}"

    def dict_primary_key(self, primary_key):
        """Convert a row, a dict or a key sequence into a dict of fields."""
        if isinstance(primary_key, pandas.Series):
            return dict(primary_key)
        elif isinstance(primary_key, dict):
            return primary_key
        else:
            return fromKey(primary_key)

    def get_text(self, primary_key):
        """Read the content of a text from its file."""
        with open(self.path_to(primary_key), 'r') as file:
            return file.read()

    def content(self, key, _):
        """Read the content from disk, using the index of the row as key."""
        return self.get_text(key)

    def write_text(self, primary_key, content):
        """Write `content` to the file of a text, creating folders as needed."""
        path = self.path_to(primary_key)
        makedirs(dirname(path), exist_ok=True)
        with open(path, 'w') as file:
            file.write(content)

    def save(self, iterator):
        """Replace the corpus by the records of `iterator`.

        Each record's content is written to its own text file, then the key
        columns alone are written to the TSV file.

        :param iterator: anything `pandas.DataFrame` accepts, providing the
            key columns and the content column
        """
        self.data = pandas.DataFrame(iterator)
        self.detect_keys()
        for _, row in self.data.iterrows():
            self.write_text(row, row[self.column_name])
        toTSV(self.tsv_path, self.data[self.keys])
        # Index the data as `load` does: `content` receives the index of each
        # row as key, which must be a key tuple for `fromKey` to handle it.
        self.data = self.data.set_index(self.keys, drop=False)
def corpus(path, **kwargs):
    """Open the corpus found at `path` with the matching class.

    A path ending with '/' or pointing to an existing directory gives a
    `Directory`, a path ending with '.tsv' gives a `SelfContained`; `kwargs`
    are forwarded to the constructor. Anything else raises FileNotFoundError.
    """
    isFolder = path.endswith('/') or isdir(path)
    if isFolder:
        return Directory(path, **kwargs)
    if path.endswith('.tsv'):
        return SelfContained(path, **kwargs)
    raise FileNotFoundError(path)
from GEODE.Functional import curry
from numpy import vectorize
import pandas
@curry
def toStrKey(areParagraphs, row):
    """Format the key of a row as a sortable string.

    The volume is zero-padded to 2 digits, the article (and the paragraph,
    included only when `areParagraphs` is true) to 4 digits.
    """
    fields = ["{work}", "{volume:02d}", "{article:04d}"]
    if areParagraphs:
        fields.append("{paragraph:04d}")
    return "_".join(fields).format(**row)
def forPanda(data, f):
    """Lift `f` from rows of `data` to arrays of index labels.

    The vectorized function returned applies `f` to `data.loc[i]` for each
    label `i` it receives.
    """
    def onLabel(i):
        row = data.loc[i]
        return f(row)
    return vectorize(onLabel)
def toTSV(filePath, data, sortBy='toStrKey'):
    """Write `data` to `filePath` as tab-separated values, without the index.

    `data` is wrapped in a DataFrame unless it already is one. Rows are
    sorted by applying `sortBy` to each of them; the default, 'toStrKey',
    stands for the `toStrKey` function (including paragraphs when `data` has
    a 'paragraph' column). Pass None to keep the rows in their order.
    """
    frame = data if type(data) is pandas.DataFrame else pandas.DataFrame(data)
    if sortBy == 'toStrKey':
        sortBy = toStrKey('paragraph' in frame)
    if sortBy is not None:
        frame = frame.sort_index(key=forPanda(frame, sortBy))
    frame.to_csv(filePath, sep='\t', index=False)
def tabular(filePath, **kwargs):
    """Read a tabular file into a DataFrame.

    Files whose name ends with '.csv' are read as comma-separated, all the
    others as tab-separated; `kwargs` are forwarded to `pandas.read_csv`.
    """
    separator = '\t'
    if filePath.endswith('.csv'):
        separator = ','
    return pandas.read_csv(filePath, sep=separator, **kwargs)
from GEODE.Store.Corpus import corpus, Directory, SelfContained
from GEODE.Store.Tabular import tabular, toTSV
import os
import os.path
def prepare(path):
    """Make sure the folder meant to contain `path` exists.

    Nothing is created for a path without any '/'. The path itself is
    returned so the call can be used in place of the path.
    """
    if '/' not in path:
        return path
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    return path
import argparse
from GEODE.Store import prepare, tabular
import matplotlib.pyplot as plot
import pandas
import seaborn
from sklearn.metrics import confusion_matrix
def trim(name, maxSize):
    """Shorten a name longer than `maxSize` characters.

    Such a name is replaced by its first space-separated word followed by
    ' […]'; shorter names are returned unchanged.
    """
    if len(name) <= maxSize:
        return name
    firstWord = name.split(' ')[0]
    return firstWord + ' […]'
def trimLabels(labels, maxWidth):
    """Apply `trim` to every label, unless `maxWidth` is None.

    Without a maximum width the labels are returned as they were received.
    """
    if maxWidth is None:
        return labels
    return [trim(label, maxWidth) for label in labels]
def heatmap(matrix, filePath, labels, **kwargs):
    """Draw `matrix` as a heatmap and save it to `filePath`.

    :param matrix: the values to draw, handed to `seaborn.heatmap`
    :param filePath: where to save the picture; the folder containing it is
        created by `prepare` if needed
    :param labels: the tick labels, used on both axes
    :param kwargs: forwarded to `seaborn.heatmap`
    """
    plot.figure(figsize=(16,13))
    # seaborn draws on the figure just created, its return value isn't needed
    seaborn.heatmap(
        matrix, xticklabels=labels, yticklabels=labels, **kwargs
    )
    plot.savefig(prepare(filePath), dpi=300, bbox_inches='tight')
def fromList(data, labels):
    """Extract (truth, answers, labels) from a list of records.

    Each record provides a 'truth' and an 'answer' field. When `labels` is
    None, the sorted distinct values of 'truth' are used instead.
    """
    truth = []
    for record in data:
        truth.append(record['truth'])
    if labels is None:
        labels = sorted(set(truth))
    answers = []
    for record in data:
        answers.append(record['answer'])
    return truth, answers, labels
def fromDataFrame(data, labels):
    """Extract (truth, answers, labels) from a DataFrame.

    The 'truth' and 'answer' columns are returned as they are. When `labels`
    is None, the sorted distinct values of 'truth' are used instead.
    """
    truth, answers = data['truth'], data['answer']
    if labels is None:
        labels = sorted(truth.unique())
    return truth, answers, labels
def prepareData(data, labels=None):
    """Extract (truth, answers, labels) from the supported data formats.

    Lists are handled by `fromList` and DataFrames by `fromDataFrame`; any
    other type raises an Exception naming the unsupported type.
    """
    kind = type(data)
    if kind == list:
        return fromList(data, labels)
    if kind == pandas.DataFrame:
        return fromDataFrame(data, labels)
    msg = "Unsupported data format {f} to represent a confusion matrix"
    raise Exception(msg.format(f=kind))
def drawConfusionMatrix(data, outputFile, labels=None, maxWidth=None, **kwargs):
    """Compute a confusion matrix from `data` and save it as a heatmap.

    The matrix is normalized over the true labels. `labels` restricts and
    orders the classes (see `prepareData` for the default), `maxWidth` is the
    size from which the labels displayed are shortened, and `kwargs` are
    forwarded to `heatmap`.
    """
    truth, answers, labels = prepareData(data, labels=labels)
    tickLabels = trimLabels(labels, maxWidth)
    matrix = confusion_matrix(truth, answers, labels=labels, normalize='true')
    heatmap(matrix, outputFile, tickLabels, **kwargs)
def getArgs(arguments):
    """Parse the command-line arguments of the `confusionMatrix` command.

    Two positional arguments (`inputTSV`, `outputPNG`) are expected, with the
    optional `--labels`, `--maxWidth` (an integer) and `--cmap`.
    """
    cli = argparse.ArgumentParser(
        prog='confusionMatrix',
        description="Draw a confusion matrix from the result of a prediction")
    for positional in ('inputTSV', 'outputPNG'):
        cli.add_argument(positional)
    options = [
        (('-l', '--labels'),
         {'help': "path to a file containing one label per line"}),
        (('-w', '--maxWidth'),
         {'type': int, 'help': "length from which labels will be truncated"}),
        (('-c', '--cmap'),
         {'help': "color map to use"}),
    ]
    for flags, settings in options:
        cli.add_argument(*flags, **settings)
    return cli.parse_args(arguments)
def drawConfusionMatrixCLI(arguments):
    """Entry point of the `confusionMatrix` command.

    Reads the predictions from the input file, the labels (one per line,
    stripped) from the labels file if one was given, and draws the confusion
    matrix to the output file.
    """
    args = getArgs(arguments)
    data = tabular(args.inputTSV)
    labels = None
    if args.labels is not None:
        with open(args.labels, 'r') as labelsFile:
            labels = [line.strip() for line in labelsFile]
    drawConfusionMatrix(data,
                        args.outputPNG,
                        labels=labels,
                        maxWidth=args.maxWidth,
                        cmap=args.cmap)
import argparse
from GEODE.Signal import resample
from GEODE.Store import prepare, tabular
import matplotlib.pyplot as plot
import seaborn
def gate(measure):
    """Turn a measure into a list of 0s and 1s of length `totalSize`.

    Positions are counted from 1; those from `position` (included) to
    `position + size` (excluded) are set to 1, all the others to 0.
    """
    first = measure['position']
    last = first + measure['size']
    profile = []
    for i in range(1, 1 + measure['totalSize']):
        profile.append(1 if first <= i < last else 0)
    return profile
def plotDensity(profile, outputPath):
    """Draw a density profile as a filled curve and save it to `outputPath`.

    :param profile: the sequence of densities to plot; its values are spread
        evenly from 0 to 100 on the horizontal axis
    :param outputPath: where to save the picture; the folder containing it is
        created by `prepare` if needed
    """
    plot.figure(figsize=(16,13))
    l = len(profile)
    # A profile reduced to a single point has no interval to spread over:
    # avoid dividing by zero and simply place that point at 0
    span = max(l - 1, 1)
    ax = seaborn.lineplot(x=[100*i/span for i in range(l)], y=profile)
    ax.set_xlabel("Position")
    ax.set_xlim(0, 100)
    ax.set_ylim(0)
    ax.xaxis.set_major_formatter('{x}%')
    ax.set_ylabel("Density")
    ax.yaxis.set_major_formatter('{x}%')
    # Reuse the coordinates of the curve just drawn to fill the area under it
    curve = ax.lines[0]
    x, y = curve.get_xydata()[:,0], curve.get_xydata()[:,1]
    ax.fill_between(x, y, alpha=0.3)
    plot.savefig(prepare(outputPath), dpi=300, bbox_inches='tight')
def sumProfiles(sameSizeProfiles):
    """Add several profiles together, position by position.

    The result is as long as the shortest profile received, and empty when
    there is no profile at all.
    """
    return [sum(column) for column in zip(*sameSizeProfiles)]
def densityProfile(measures, resolution):
    """Compute the density profile of a set of measures.

    Each measure is turned into a 0/1 profile by `gate`; profiles of the same
    length are added together, each sum is resampled to `resolution` values,
    and the resampled profiles are added in turn. The result is scaled by
    `resolution` over the total 'size' of the measures.
    """
    bySize, count = {}, 0
    for measure in measures:
        distribution = gate(measure)
        count += measure['size']
        bySize.setdefault(len(distribution), []).append(distribution)
    toResolution = resample(resolution)
    resampled = [toResolution(sumProfiles(group)) for group in bySize.values()]
    total = sumProfiles(resampled)
    return [resolution*x/count for x in total]
def drawDensityProfile(measures, outputFile, resolution):
    """Compute the density profile of `measures` and plot it to `outputFile`."""
    profile = densityProfile(measures, resolution)
    plotDensity(profile, outputFile)
def drawDensityProfileCLI(arguments):
    """Entry point of the `densityProfile` command.

    Reads one measure per row of the input file and draws their density
    profile to the output file, with a resolution of 100 unless another one
    is given with `--resolution`.
    """
    cli = argparse.ArgumentParser(
        prog='densityProfile',
        description="Draw a density profile from a set of occurrences")
    for positional in ('inputTSV', 'outputPNG'):
        cli.add_argument(positional)
    cli.add_argument('-r', '--resolution', type=int)
    args = cli.parse_args(arguments)
    table = tabular(args.inputTSV)
    measures = [row for _, row in table.iterrows()]
    resolution = args.resolution or 100
    drawDensityProfile(measures, args.outputPNG, resolution)
from GEODE.Visualisation.ConfusionMatrix import drawConfusionMatrix, heatmap
from GEODE.Visualisation.DensityProfile import densityProfile, drawDensityProfile, plotDensity
"""
geopyck is a set of python tools developed for project GEODE
<https://geode-project.github.io/>
Copyright (C) 2024 Alice BRENON <alice.brenon@ens-lyon.fr>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from argparse import ArgumentParser
from GEODE.Metadata import article, articleKey, paragraph, paragraphKey, \
fromKey, relativePath, toKey, uid
from GEODE.Metadata.Domains import superdomains
from GEODE.Store import corpus, Directory, SelfContained, tabular, toTSV
from GEODE.Visualisation import densityProfile, heatmap
from GEODE.Visualisation.ConfusionMatrix import drawConfusionMatrixCLI
from GEODE.Visualisation.DensityProfile import drawDensityProfileCLI
# Sub-commands of the command-line interface: each name is associated to the
# function receiving the arguments left after the command name.
commands = {
    'confusionMatrix': drawConfusionMatrixCLI,
    'densityProfile': drawDensityProfileCLI,
}
def geopyckCLI():
    """Entry point of the `geopyck` command.

    The first argument selects one of the `commands`; the arguments it
    doesn't recognise are handed over to the function implementing that
    command.
    """
    parser = ArgumentParser(
        prog='geopyck',
        description='A corpus linguistics tool for project GEODE')
    parser.add_argument('command', choices=commands.keys())
    selected, remaining = parser.parse_known_args()
    run = commands[selected.command]
    run(remaining)
This diff is collapsed.
# Geopyck
guix.scm 0 → 100644
;; Guix package definition for geopyck, built from the directory containing
;; this file.
;; NOTE(review): python-pytorch and python-nltk are selected below but are not
;; referenced anywhere in this definition — confirm whether they are needed.
(use-modules ((gnu packages machine-learning) #:select (python-pytorch python-scikit-learn))
((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-matplotlib
python-nltk
python-seaborn))
(guix gexp)
(guix git-download)
((guix licenses) #:select (lgpl3+))
(guix packages)
(guix build-system pyproject))
(let
;; The sources are taken from the folder where this file is located
((%source-dir (dirname (current-filename))))
(package
(name "python-geopyck")
(version "0.1.0")
(source
;; Copy the folder recursively, filtering its files with git-predicate
(local-file %source-dir
#:recursive? #t
#:select? (git-predicate %source-dir)))
(build-system pyproject-build-system)
;; Libraries made available along with the package
(propagated-inputs
(list python-matplotlib
python-pandas
python-scikit-learn
python-seaborn))
;; Tests are disabled for this package
(arguments
(list #:tests? #f))
(home-page "https://gitlab.liris.cnrs.fr/geode/geopyck")
(synopsis "Python tools developed for project GEODE")
(description
"Geopyck is a python library to handle a corpus of encyclopedic texts and
generate visualisations.
It was developed as part of project GEODE
@url{https://geode-project.github.io/}")
;; NOTE(review): the header of the Python package states the GNU General
;; Public License, version 3 or later, whereas lgpl3+ is declared here —
;; confirm which one is intended.
(license lgpl3+)))
setup.py 0 → 100644
#!/usr/bin/env python3
from setuptools import find_packages, setup
# Package every module found and expose the `geopyck` command, implemented by
# the `geopyckCLI` function of the GEODE package.
setup(
    name='geopyck',
    version='0.1',
    packages=find_packages(),
    entry_points={
        'console_scripts': ['geopyck = GEODE:geopyckCLI'],
    },
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment