From cb2344573c81e1fa019ed3dec6eda85cbbf0cc18 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Thu, 7 Sep 2023 18:50:36 +0200 Subject: [PATCH] Add a script to explode (Text) articles into paragraphs, outputing a files.tsv with a primary key for each of them in the process --- lib/GEODE/Metadata/File.hs | 6 +++ lib/GEODE/Metadata/PrimaryKey/Paragraph.hs | 52 ++++++++++++++++++++++ manifest.scm | 3 +- scripts/paragraphs.hs | 39 ++++++++++++++++ 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 lib/GEODE/Metadata/File.hs create mode 100644 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs create mode 100755 scripts/paragraphs.hs diff --git a/lib/GEODE/Metadata/File.hs b/lib/GEODE/Metadata/File.hs new file mode 100644 index 0000000..9565b0d --- /dev/null +++ b/lib/GEODE/Metadata/File.hs @@ -0,0 +1,6 @@ +module GEODE.Metadata.File + ( File(..) ) where + +class File a where + uid :: a -> String + relativePath :: a -> String -> FilePath diff --git a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs new file mode 100644 index 0000000..e0d906f --- /dev/null +++ b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs @@ -0,0 +1,52 @@ +{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} +module GEODE.Metadata.PrimaryKey.Paragraph + ( ParagraphPK(..) ) where + +import Data.Aeson ((.=), ToJSON(..), object, pairs) +import Data.Csv + ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord ) +import GEODE.Metadata.File (File(..)) +import GEODE.Metadata as Article + (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath) +import qualified GEODE.Metadata as Article (PrimaryKey(rank)) +import System.FilePath ((</>), (<.>), dropExtension) +import Text.Printf (printf) + +data ParagraphPK = ParagraphPK + { article :: PrimaryKey + , rank :: Int } deriving (Eq, Ord, Show) + +instance FromNamedRecord ParagraphPK where + parseNamedRecord nr = ParagraphPK + <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#") + <*> nr .: "paragraph#" + +instance ToNamedRecord ParagraphPK where + toNamedRecord (ParagraphPK {article, rank}) = namedRecord + [ namedField "book" (book article) + , namedField "tome" (tome article) + , namedField "article#" (Article.rank article) + , namedField "paragraph#" rank ] + +instance File ParagraphPK where + uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank + + relativePath (ParagraphPK {article, rank}) extension = + articleDirectory </> (show rank) <.> extension + where + articleDirectory = dropExtension (Article.relativePath article "") + +instance HasDefaultHeader ParagraphPK where + defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"]) + +instance ToJSON ParagraphPK where + toJSON (ParagraphPK {article, rank}) = object + [ "book" .= book article + , "tome" .= tome article + , "article#" .= Article.rank article + , "paragraph#" .= rank ] + toEncoding (ParagraphPK {article, rank}) = pairs + ( "book" .= book article + <> "tome" .= tome article + <> "article#" .= Article.rank article + <> "paragraph#" .= rank ) diff --git a/manifest.scm b/manifest.scm index a8ead9b..bf7afd5 100644 --- a/manifest.scm +++ b/manifest.scm @@ -2,7 +2,7 @@ ((geode packages models) #:select (stanza-fr)) ((gnu packages commencement) #:select (gcc-toolchain)) ((gnu packages haskell) #:select (ghc)) - ((gnu packages haskell-web) #:select (ghc-hxt)) + ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt)) ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu)) ((gnu packages python) #:select (python)) ((gnu packages python-science) #:select (python-pandas)) @@ -19,6 +19,7 @@ ;edda-clinic ; fix and cut the EDdA gcc-toolchain ; running haskell ghc ; running haskell + ghc-aeson ; working with JSON in haskell ghc-cassava ; working with CSV in haskell ;ghc-geode ; handling corpus files ghc-hs-conllu ; working on syntax-annotated documents diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs new file mode 100755 index 0000000..453b943 --- /dev/null +++ b/scripts/paragraphs.hs @@ -0,0 +1,39 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE OverloadedStrings #-} +import Data.List (dropWhileEnd) +import Data.Text (Text, splitOn) +import Data.Text.IO as Text (readFile, writeFile) +import GEODE.Metadata.File as File (File(..)) +import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile) +import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..)) +import System.Directory (createDirectoryIfMissing) +import System.Environment (getArgs) +import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory) +import System.Script (syntax, try) +import Text.Filter (Editable(..)) +import Text.Filter.Linearize (linearize) + +articleParagraphs :: FilePath -> IO [Text] +articleParagraphs = + fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile + +to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK] +to source target article = do + createDirectoryIfMissing True (target </> Article.relativePath article "") + articleParagraphs articlePath >>= mapM create . number + where + articlePath = source </> Article.relativePath article "txt" + number = zip [1..] + create (rank, paragraphText) = + let paragraphPK = ParagraphPK {article, rank} + outputPath = target </> File.relativePath paragraphPK "txt" in + paragraphPK <$ Text.writeFile outputPath paragraphText + +main :: IO () +main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run + where + run [inputMeta, source, target] = + try (readNamedTsv inputMeta) + >>= mapM (source `to` target) + >>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat + run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY" -- GitLab