From 45f79db57610a3bd2d8dcefc1f6222fa56dfdc7c Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 23 Nov 2022 16:15:35 +0100
Subject: [PATCH] Add a (naive) script to put paragraphs on one line to please
 stanza + some lib to factorize common bases for future haskell scripts

---
 Makefile                                      | 25 ++++++---
 lib/Data/Metadata.hs                          | 38 ++++++++++++++
 lib/Data/Metadata/Article.hs                  | 30 +++++++++++
 lib/System/Script.hs                          | 12 +++++
 lib/Text/Editor.hs                            | 52 +++++++++++++++++++
 manifest.scm                                  |  8 +++
 ...ssified_LGE.sh => extract-parallel-LGE.sh} |  2 +-
 scripts/linearize.hs                          | 22 ++++++++
 8 files changed, 182 insertions(+), 7 deletions(-)
 create mode 100644 lib/Data/Metadata.hs
 create mode 100644 lib/Data/Metadata/Article.hs
 create mode 100644 lib/System/Script.hs
 create mode 100644 lib/Text/Editor.hs
 rename scripts/{extract_classified_LGE.sh => extract-parallel-LGE.sh} (91%)
 create mode 100755 scripts/linearize.hs

diff --git a/Makefile b/Makefile
index ac5dd19..c181e28 100644
--- a/Makefile
+++ b/Makefile
@@ -3,30 +3,43 @@ EDDA=$(CORPUS_ROOT)/EDdA
 LGE=$(CORPUS_ROOT)/LGE
 
 EDDA_META=$(EDDA)/metadata.csv
-METADATA=$(EDDA_META)
 
 RAW_LGE=$(LGE)/Text
+TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
+TOMES=$(TOME_DIRS:$(RAW_LGE)/T%=%)
+
 TEI_LGE=$(LGE)/TEI
 PARALLEL_LGE=$(LGE)/Parallel
 LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
 RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
+LINEARIZED_PARALLEL_LGE_ROOT=$(PARALLEL_LGE)/Linearized
 TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
+
+LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
 STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza
 
-all: $(METADATA) $(STANZA_PARALLEL_LGE)
+METADATA=$(EDDA_META) $(LGE_META_FROM_EDDA)
+
+all: $(METADATA) $(LINEARIZED_PARALLEL_LGE)
 
 $(EDDA_META): $(EDDA)/TEI/
 	./scripts/EDdA-metadata.py $< > $@
 
-$(LGE_META_FROM_EDDA): $(EDDA_META)
-	mkdir -p $(dir $@)
+$(LGE_META_FROM_EDDA): $(EDDA_META) | $(PARALLEL_LGE)/ # order-only: the directory must exist, its mtime is irrelevant
 	./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@
 
 $(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
-	./scripts/extract_classified_LGE.sh $^ $@
+	./scripts/extract-parallel-LGE.sh $^ $@
 
 $(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
-	./scripts/extract_classified_LGE.sh $^ $@
+	./scripts/extract-parallel-LGE.sh $^ $@
 
 $(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
 	./scripts/stanza-annotator.py $< $@
+
+%/: # any target ending with a slash is a directory, created on demand
+	mkdir -p $@
+
+$(LINEARIZED_PARALLEL_LGE_ROOT)/T%: $(RAW_PARALLEL_LGE)/T% # one output directory per tome directory
+	mkdir -p $@
+	find $< -type f -name '*.txt' | ./scripts/linearize.hs $@
diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs
new file mode 100644
index 0000000..582eaff
--- /dev/null
+++ b/lib/Data/Metadata.hs
@@ -0,0 +1,38 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module Data.Metadata (
+      Metadata
+    , byTome
+    , metadata
+  ) where
+
+import Data.Attoparsec.Text (Parser, endOfLine, sepBy1, takeTill)
+import Data.List as List (foldl')
+import Data.Map as Map (Map, adjust, empty, foldl', insert, member)
+import Data.Metadata.Article (Article(..), article)
+import Data.Text (Text)
+
+-- | All the articles read from a metadata file (a list, not a 'Map' by uid).
+type Metadata = [Article]
+type Tomes = Map Int Metadata -- the articles of each tome, keyed by tome number
+
+-- | Parser for a whole metadata file: the first line is skipped (presumably
+-- the CSV header), then at least one 'Article' is expected, one per line.
+metadata :: Parser Metadata
+metadata = skipLine *> article `sepBy1` endOfLine
+  where
+    -- consume everything up to and including the first line break
+    skipLine = takeTill (\c -> c == '\r' || c == '\n') *> endOfLine
+
+-- | Group the articles by the tome they belong to.
+--
+-- Each article is prepended to the list of its tome, so within a tome the
+-- articles come out in the reverse of their order in the input. Tomes without
+-- any article have no entry in the resulting map.
+byTome :: Metadata -> Tomes
+byTome = List.foldl' sortByTome Map.empty
+  where
+    -- 'Map.adjust' does nothing on a missing key, hence the explicit
+    -- insertion when the first article of a tome is met
+    sortByTome temp a@(Article {tome})
+      | tome `member` temp = Map.adjust (a:) tome temp
+      | otherwise = Map.insert tome [a] temp
diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs
new file mode 100644
index 0000000..300c3d8
--- /dev/null
+++ b/lib/Data/Metadata/Article.hs
@@ -0,0 +1,30 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Metadata.Article (
+      Article(..)
+    , article
+  ) where
+
+import Data.Attoparsec.Text (Parser, char, decimal, sepBy1, string, takeTill)
+import Data.Text (Text, stripEnd)
+
+data Article = Article { -- one line of the metadata CSV, fields in column order
+      uid :: Text -- first column; presumably a unique identifier (TODO confirm)
+    , tome :: Int -- number of the tome, used as key by 'Data.Metadata.byTome'
+    , rank :: Int -- presumably the position of the article in its tome (TODO confirm)
+    , headWord :: Text
+    , domains :: [Text] -- labels read from the last column, separated by '|'
+  } deriving Show
+
+-- | Parse one CSV line into an 'Article'. Columns are read in the order of the
+-- record fields: uid, tome, rank, head word and finally the domains.
+article :: Parser Article
+article = Article <$> field <*> number <*> number <*> field <*> domains_
+  where
+    -- every column but the last one is closed by a comma
+    closed p = p <* char ','
+    field = closed (takeTill (== ','))
+    number = closed decimal
+
+-- | One or more labels separated by @"| "@; trailing whitespace is stripped.
+domains_ :: Parser [Text]
+domains_ = map stripEnd <$> takeTill (`elem` ['|', '\r', '\n']) `sepBy1` string "| "
diff --git a/lib/System/Script.hs b/lib/System/Script.hs
new file mode 100644
index 0000000..2455009
--- /dev/null
+++ b/lib/System/Script.hs
@@ -0,0 +1,12 @@
+module System.Script (
+    syntax
+  ) where
+
+import System.Exit (die)
+import System.Environment (getProgName)
+import Text.Printf (printf)
+
+-- | Abort (see 'die') with a usage message made of the name of the running
+-- program followed by the given description of its expected arguments.
+syntax :: String -> IO ()
+syntax s = getProgName >>= \this -> die (printf "Syntax: %s %s" this s)
diff --git a/lib/Text/Editor.hs b/lib/Text/Editor.hs
new file mode 100644
index 0000000..b5c4148
--- /dev/null
+++ b/lib/Text/Editor.hs
@@ -0,0 +1,52 @@
+{-# LANGUAGE FlexibleInstances, OverloadedStrings #-}
+module Text.Editor (
+      Editor
+    , edit
+    , editAll
+    , editM
+    , editAllM
+  ) where
+
+import Control.Monad.IO.Class (MonadIO(..))
+import Data.Text as Text (Text, intercalate, lines, pack, unpack)
+import Data.Text.IO as Text (readFile, writeFile)
+import System.FilePath (replaceDirectory)
+
+type Editor a = a -> a -- a pure transformation of some content
+type EditorM m a = a -> m a -- a transformation running in a monad m
+
+class Editable a where -- types the 'Text' content of a file can be edited as
+  enter :: Text -> a -- convert the content read from a file before editing it
+  leave :: a -> Text -- convert the edited value back before writing it
+
+instance Editable [Text] where -- edit a file as the list of its lines
+  enter = Text.lines
+  leave = Text.intercalate "\n" -- no line break is added after the last line
+
+instance Editable String where -- edit a file as one plain 'String'
+  enter = Text.unpack
+  leave = Text.pack
+
+-- | Apply an 'EditorM' to the content of the file whose path is passed as
+-- third argument. The second argument is the path to the target folder: the
+-- edited version is written there, under the same file name as the input.
+editM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> FilePath -> m ()
+editM editor target input = do
+  original <- liftIO (Text.readFile input)
+  edited <- editor (enter original)
+  -- keep the file name, only swap its directory for the target one
+  let output = replaceDirectory input target
+  liftIO (Text.writeFile output (leave edited))
+
+-- | Same as 'editM', for an 'Editor' that needs no effect of its own
+edit :: Editable a => Editor a -> FilePath -> FilePath -> IO ()
+edit editor = editM (\content -> pure (editor content))
+
+-- | Run 'editM' with the same editor and target folder on every file whose
+-- path appears, one per line, in the input 'String'.
+editAllM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> String -> m ()
+editAllM editor target paths = mapM_ (editM editor target) (Prelude.lines paths)
+
+-- | Same as 'editAllM', for an 'Editor' that needs no effect of its own
+editAll :: Editable a => Editor a -> FilePath -> String -> IO ()
+editAll editor = editAllM (\content -> pure (editor content))
diff --git a/manifest.scm b/manifest.scm
index 824a72d..19fcf85 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -1,5 +1,9 @@
 (use-modules ((geode packages annotation) #:select (python-stanza))
              ((geode packages models) #:select (stanza-fr))
+             ((gnu packages commencement) #:select (gcc-toolchain))
+             ((gnu packages haskell) #:select (ghc))
+             ((gnu packages haskell-web) #:select (ghc-hxt))
+             ((gnu packages haskell-xyz) #:select (ghc-attoparsec))
              ((gnu packages python) #:select (python))
              ((gnu packages python-xyz) #:select (python-beautifulsoup4))
              ((gnu packages xml) #:select (python-lxml)))
@@ -7,6 +11,10 @@
 (packages->manifest
   (list
     coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
+    gcc-toolchain ; running haskell
+    ghc ; running haskell
+    ghc-attoparsec ; parsing metadata
+    ghc-hxt ; working on xml documents
     python ; scripts
     python-beautifulsoup4 ; extract EDdA metadata from TEI files
     python-lxml ; fusion articles into tomes for TXM
diff --git a/scripts/extract_classified_LGE.sh b/scripts/extract-parallel-LGE.sh
similarity index 91%
rename from scripts/extract_classified_LGE.sh
rename to scripts/extract-parallel-LGE.sh
index c60ca0a..1411ce2 100755
--- a/scripts/extract_classified_LGE.sh
+++ b/scripts/extract-parallel-LGE.sh
@@ -13,7 +13,7 @@ then
 	mv "${OUTPUT}" "${OUTPUT}.${N}"
 fi
 
-WORKDIR=$(mktemp -d /tmp/classified-LGE.XXX)
+WORKDIR=$(mktemp -d /tmp/parallel-LGE.XXX)
 
 for T in {1..31}
 do
diff --git a/scripts/linearize.hs b/scripts/linearize.hs
new file mode 100755
index 0000000..125d393
--- /dev/null
+++ b/scripts/linearize.hs
@@ -0,0 +1,22 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+
+import Data.Char (isUpper)
+import System.Environment (getArgs)
+import System.FilePath ((</>))
+import System.Script (syntax)
+import Text.Editor (editAll)
+
+-- | Join the lines of each paragraph; blank lines between paragraphs are kept.
+linearize :: String -> String
+linearize "" = ""
+linearize ('\172':'\n':s) = linearize s -- U+00AC (NOT SIGN) before a line break: glue both halves
+linearize ('\n':'\n':s) = "\n\n" ++ linearize s
+linearize ('\n':s) = ' ' : linearize s
+linearize ('-':'\n':c:s) | isUpper c = '-' : c : linearize s -- hyphen kept, break dropped
+linearize (c:s) = c : linearize s
+
+-- | Linearize into TARGET_DIR every file whose path is read on standard input.
+main :: IO ()
+main = getArgs >>= \args -> case args of
+  [target] -> getContents >>= editAll linearize target
+  _ -> syntax "TARGET_DIR"
-- 
GitLab