From 45f79db57610a3bd2d8dcefc1f6222fa56dfdc7c Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Wed, 23 Nov 2022 16:15:35 +0100 Subject: [PATCH] Add a (naive) script to put paragraphs on one line to please stanza + some lib to factorize common bases for future haskell scripts --- Makefile | 25 ++++++--- lib/Data/Metadata.hs | 38 ++++++++++++++ lib/Data/Metadata/Article.hs | 30 +++++++++++ lib/System/Script.hs | 12 +++++ lib/Text/Editor.hs | 52 +++++++++++++++++++ manifest.scm | 8 +++ ...ssified_LGE.sh => extract-parallel-LGE.sh} | 2 +- scripts/linearize.hs | 22 ++++++++ 8 files changed, 182 insertions(+), 7 deletions(-) create mode 100644 lib/Data/Metadata.hs create mode 100644 lib/Data/Metadata/Article.hs create mode 100644 lib/System/Script.hs create mode 100644 lib/Text/Editor.hs rename scripts/{extract_classified_LGE.sh => extract-parallel-LGE.sh} (91%) create mode 100755 scripts/linearize.hs diff --git a/Makefile b/Makefile index ac5dd19..c181e28 100644 --- a/Makefile +++ b/Makefile @@ -3,30 +3,43 @@ EDDA=$(CORPUS_ROOT)/EDdA LGE=$(CORPUS_ROOT)/LGE EDDA_META=$(EDDA)/metadata.csv -METADATA=$(EDDA_META) RAW_LGE=$(LGE)/Text +TOME_DIRS=$(wildcard $(RAW_LGE)/T*) +TOMES=$(TOME_DIRS:$(RAW_LGE)/T%=%) + TEI_LGE=$(LGE)/TEI PARALLEL_LGE=$(LGE)/Parallel LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text +LINEARIZED_PARALLEL_LGE_ROOT=$(PARALLEL_LGE)/Linearized TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI + +LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%) STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza -all: $(METADATA) $(STANZA_PARALLEL_LGE) +METADATA=$(EDDA_META) $(LGE_META_FROM_EDDA) + +all: $(METADATA) $(LINEARIZED_PARALLEL_LGE) $(EDDA_META): $(EDDA)/TEI/ ./scripts/EDdA-metadata.py $< > $@ -$(LGE_META_FROM_EDDA): $(EDDA_META) - mkdir -p $(dir $@) +$(LGE_META_FROM_EDDA): $(EDDA_META) $(PARALLEL_LGE)/ ./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@ $(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE) - ./scripts/extract_classified_LGE.sh $^ $@ + ./scripts/extract-parallel-LGE.sh $^ $@ $(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE) - ./scripts/extract_classified_LGE.sh $^ $@ + ./scripts/extract-parallel-LGE.sh $^ $@ $(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE) ./scripts/stanza-annotator.py $< $@ + +%/: + mkdir -p $@ + +$(LINEARIZED_PARALLEL_LGE_ROOT)/T%: $(RAW_PARALLEL_LGE)/T% + mkdir -p $@ + find $< -type f -name '*.txt' | ./scripts/linearize.hs $@ diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs new file mode 100644 index 0000000..582eaff --- /dev/null +++ b/lib/Data/Metadata.hs @@ -0,0 +1,38 @@ +{-# LANGUAGE NamedFieldPuns #-} +module Data.Metadata ( + Metadata + , byTome + , metadata + ) where + +import Data.Attoparsec.Text (Parser, endOfLine, sepBy1, takeTill) +import Data.List as List (foldl') +import Data.Map as Map (Map, adjust, empty, foldl', insert, member) +import Data.Metadata.Article (Article(..), article) +import Data.Text (Text) + +--type Metadata = Map Text Article +type Metadata = [Article] +type Tomes = Map Int Metadata + +metadata :: Parser Metadata +--metadata = List.foldl' indexByUid Map.empty <$> +-- (skipLine *> article `sepBy1` endOfLine) +metadata = skipLine *> article `sepBy1` endOfLine + where + skipLine = takeTill (\c -> c == '\r' || c == '\n') *> endOfLine + indexByUid temp a@(Article {uid}) = Map.insert uid a temp + +byTome :: Metadata -> Tomes +--byTome = Map.foldl' sortByTome Map.empty +byTome = List.foldl' sortByTome Map.empty + where + sortByTome temp a@(Article {uid, tome}) = +-- Map.adjust (Map.insert uid a) tome . + Map.adjust (a:) tome +-- . (if tome `member` temp then id else Map.insert tome Map.empty) $ temp + . (if tome `member` temp then id else Map.insert tome []) $ temp +{- + . (if tome `member` temp then id else Map.insert tome Map.empty) + $ tome + -} diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs new file mode 100644 index 0000000..300c3d8 --- /dev/null +++ b/lib/Data/Metadata/Article.hs @@ -0,0 +1,30 @@ +{-# LANGUAGE OverloadedStrings #-} +module Data.Metadata.Article ( + Article(..) + , article + ) where + +import Data.Attoparsec.Text (Parser, char, decimal, sepBy1, string, takeTill) +import Data.Text (Text, stripEnd) + +data Article = Article { + uid :: Text + , tome :: Int + , rank :: Int + , headWord :: Text + , domains :: [Text] + } deriving Show + +article :: Parser Article +article = Article + <$> cell + <*> (decimal <* char ',') + <*> (decimal <* char ',') + <*> cell + <*> domains_ + where + cell = takeTill (== ',') <* char ',' + +domains_ :: Parser [Text] +domains_ = fmap stripEnd <$> + (takeTill (`elem` ['|', '\r', '\n'])) `sepBy1` (string "| ") diff --git a/lib/System/Script.hs b/lib/System/Script.hs new file mode 100644 index 0000000..2455009 --- /dev/null +++ b/lib/System/Script.hs @@ -0,0 +1,12 @@ +module System.Script ( + syntax + ) where + +import System.Exit (die) +import System.Environment (getProgName) +import Text.Printf (printf) + +syntax :: String -> IO () +syntax s = do + this <- getProgName + die $ printf "Syntax: %s %s" this s diff --git a/lib/Text/Editor.hs b/lib/Text/Editor.hs new file mode 100644 index 0000000..b5c4148 --- /dev/null +++ b/lib/Text/Editor.hs @@ -0,0 +1,52 @@ +{-# LANGUAGE FlexibleInstances, OverloadedStrings #-} +module Text.Editor ( + Editor + , edit + , editAll + , editM + , editAllM + ) where + +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Text as Text (Text, intercalate, lines, pack, unpack) +import Data.Text.IO as Text (readFile, writeFile) +import System.FilePath (replaceDirectory) + +type Editor a = a -> a +type EditorM m a = a -> m a + +class Editable a where + enter :: Text -> a + leave :: a -> Text + +instance Editable [Text] where + enter = Text.lines + leave = Text.intercalate "\n" + +instance Editable String where + enter = Text.unpack + leave = Text.pack + +-- | Apply an 'Editor' to the content of a file which path is passed as third +-- argument. The second argument is the path to the target folder where the +-- edited version will be created. +editM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> FilePath -> m () +editM editor target input = + liftIO (Text.readFile input) + >>= editor . enter + >>= liftIO . Text.writeFile output . leave + where + output = replaceDirectory input target + +-- | A convenient shortcut of 'editM' for pure editors +edit :: Editable a => Editor a -> FilePath -> FilePath -> IO () +edit editor = editM (pure . editor) + +-- | Apply an 'Editor' like 'edit' on all the files which paths are expected to +-- be read from the input 'String', one per line. +editAllM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> String -> m () +editAllM editor target = mapM_ (editM editor target) . Prelude.lines + +-- | A convenient shortcut of 'editAllM' for pure editors +editAll :: Editable a => Editor a -> FilePath -> String -> IO () +editAll editor = editAllM (pure . editor) diff --git a/manifest.scm b/manifest.scm index 824a72d..19fcf85 100644 --- a/manifest.scm +++ b/manifest.scm @@ -1,5 +1,9 @@ (use-modules ((geode packages annotation) #:select (python-stanza)) ((geode packages models) #:select (stanza-fr)) + ((gnu packages commencement) #:select (gcc-toolchain)) + ((gnu packages haskell) #:select (ghc)) + ((gnu packages haskell-web) #:select (ghc-hxt)) + ((gnu packages haskell-xyz) #:select (ghc-attoparsec)) ((gnu packages python) #:select (python)) ((gnu packages python-xyz) #:select (python-beautifulsoup4)) ((gnu packages xml) #:select (python-lxml))) @@ -7,6 +11,10 @@ (packages->manifest (list coreutils ; mktemp for atomic processing, strip CSV headers, general scripting + gcc-toolchain ; running haskell + ghc ; running haskell + ghc-attoparsec ; parsing metadata + ghc-hxt ; working on xml documents python ; scripts python-beautifulsoup4 ; extract EDdA metadata from TEI files python-lxml ; fusion articles into tomes for TXM diff --git a/scripts/extract_classified_LGE.sh b/scripts/extract-parallel-LGE.sh similarity index 91% rename from scripts/extract_classified_LGE.sh rename to scripts/extract-parallel-LGE.sh index c60ca0a..1411ce2 100755 --- a/scripts/extract_classified_LGE.sh +++ b/scripts/extract-parallel-LGE.sh @@ -13,7 +13,7 @@ then mv "${OUTPUT}" "${OUTPUT}.${N}" fi -WORKDIR=$(mktemp -d /tmp/classified-LGE.XXX) +WORKDIR=$(mktemp -d /tmp/parallel-LGE.XXX) for T in {1..31} do diff --git a/scripts/linearize.hs b/scripts/linearize.hs new file mode 100755 index 0000000..125d393 --- /dev/null +++ b/scripts/linearize.hs @@ -0,0 +1,22 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-i lib" + +import Data.Char (isUpper) +import System.Environment (getArgs) +import System.FilePath ((</>)) +import System.Script (syntax) +import Text.Editor (editAll) + +linearize :: String -> String +linearize "" = "" +linearize ('¬':'\n':s) = linearize s +linearize ('\n':'\n':s) = "\n\n" ++ linearize s +linearize ('\n':s) = ' ' : linearize s +linearize ('-':'\n':c:s) + | isUpper c = '-' : c : linearize s +linearize (c:s) = c : linearize s + +main :: IO () +main = getArgs >>= cli + where + cli [target] = getContents >>= editAll linearize target + cli _ = syntax "TARGET_DIR" -- GitLab