Add a script to explode (Text) articles into paragraphs, outputing a files.tsv...

Add a script to explode (Text) articles into paragraphs, outputing a files.tsv with a primary key for each of them in the process

Add a script to explode (Text) articles into paragraphs, outputing a files.tsv...
Add a script to explode (Text) articles into paragraphs, outputing a files.tsv with a primary key for each of them in the process
cb234457 · Alice Brenon · d0eb4ce6 · cb234457 · cb234457 · cb234457
Commit cb234457 authored 1 year ago by Alice Brenon
--- a/lib/GEODE/Metadata/File.hs
+++ b/lib/GEODE/Metadata/File.hs
+module GEODE.Metadata.File
+  ( File(..) ) where
+class File a where
+  uid :: a -> String
+  relativePath :: a -> String -> FilePath
--- a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
+++ b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+module GEODE.Metadata.PrimaryKey.Paragraph
+  ( ParagraphPK(..) ) where
+import Data.Aeson ((.=), ToJSON(..), object, pairs)
+import Data.Csv
+  ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
+import GEODE.Metadata.File (File(..))
+import GEODE.Metadata as Article
+  (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
+import qualified GEODE.Metadata as Article (PrimaryKey(rank))
+import System.FilePath ((</>), (<.>), dropExtension)
+import Text.Printf (printf)
+data ParagraphPK = ParagraphPK
+  { article :: PrimaryKey
+  , rank :: Int } deriving (Eq, Ord, Show)
+instance FromNamedRecord ParagraphPK where
+  parseNamedRecord nr = ParagraphPK
+    <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
+    <*> nr .: "paragraph#"
+instance ToNamedRecord ParagraphPK where
+  toNamedRecord (ParagraphPK {article, rank}) = namedRecord
+    [ namedField "book" (book article)
+    , namedField "tome" (tome article)
+    , namedField "article#" (Article.rank article)
+    , namedField "paragraph#" rank ]
+instance File ParagraphPK where
+  uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
+  relativePath (ParagraphPK {article, rank}) extension =
+    articleDirectory </> (show rank) <.> extension
+    where
+      articleDirectory = dropExtension (Article.relativePath article "")
+instance HasDefaultHeader ParagraphPK where
+  defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
+instance ToJSON ParagraphPK where
+  toJSON (ParagraphPK {article, rank}) = object
+    [ "book" .= book article
+    , "tome" .= tome article
+    , "article#" .= Article.rank article
+    , "paragraph#" .= rank ]
+  toEncoding (ParagraphPK {article, rank}) = pairs
+    ( "book" .= book article
+    <> "tome" .= tome article
+    <> "article#" .= Article.rank article
+    <> "paragraph#" .= rank )
--- a/manifest.scm
+++ b/manifest.scm
@@ -2,7 +2,7 @@
             ((geode packages models) #:select (stanza-fr))
             ((gnu packages commencement) #:select (gcc-toolchain))
             ((gnu packages haskell) #:select (ghc))
-             ((gnu packages haskell-web) #:select (ghc-hxt))
+             ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
             ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
             ((gnu packages python) #:select (python))
             ((gnu packages python-science) #:select (python-pandas))
@@ -19,6 +19,7 @@
    ;edda-clinic ; fix and cut the EDdA
    gcc-toolchain ; running haskell
    ghc ; running haskell
+    ghc-aeson ; working with JSON in haskell
    ghc-cassava ; working with CSV in haskell
    ;ghc-geode ; handling corpus files
    ghc-hs-conllu ; working on syntax-annotated documents

--- a/scripts/paragraphs.hs
+++ b/scripts/paragraphs.hs
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+import Data.List (dropWhileEnd)
+import Data.Text (Text, splitOn)
+import Data.Text.IO as Text (readFile, writeFile)
+import GEODE.Metadata.File as File (File(..))
+import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile)
+import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..))
+import System.Directory (createDirectoryIfMissing)
+import System.Environment (getArgs)
+import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory)
+import System.Script (syntax, try)
+import Text.Filter (Editable(..))
+import Text.Filter.Linearize (linearize)
+articleParagraphs :: FilePath -> IO [Text]
+articleParagraphs =
+  fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile
+to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK]
+to source target article = do
+  createDirectoryIfMissing True (target </> Article.relativePath article "")
+  articleParagraphs articlePath >>= mapM create . number
+  where
+    articlePath = source </> Article.relativePath article "txt"
+    number = zip [1..]
+    create (rank, paragraphText) =
+      let paragraphPK = ParagraphPK {article, rank}
+          outputPath = target </> File.relativePath paragraphPK "txt" in
+      paragraphPK <$ Text.writeFile outputPath paragraphText
+main :: IO ()
+main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run
+  where
+    run [inputMeta, source, target] =
+        try (readNamedTsv inputMeta)
+      >>= mapM (source `to` target)
+      >>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY"