From 4cf05a96569d7765ec7c89a7af6b762fc04d60f4 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 7 Sep 2023 18:59:13 +0200
Subject: [PATCH] Add a script to make a JSONL corpus for prodigy out of
 paragraphs

---
 scripts/ML/prodigy-corpus.hs | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100755 scripts/ML/prodigy-corpus.hs

diff --git a/scripts/ML/prodigy-corpus.hs b/scripts/ML/prodigy-corpus.hs
new file mode 100755
index 0000000..360e67a
--- /dev/null
+++ b/scripts/ML/prodigy-corpus.hs
@@ -0,0 +1,34 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE DeriveGeneric #-}
+import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
+import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
+import Data.Text (Text)
+import Data.Text.IO as Text (readFile)
+import GEODE.Metadata (readNamedTsv)
+import GEODE.Metadata.File (relativePath)
+import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK)
+import GHC.Generics (Generic)
+import System.Environment (getArgs)
+import System.FilePath ((</>))
+import System.Script (syntax, try)
+
+-- | A paragraph's text with its primary key, the record emitted for prodigy.
+data Paragraph = Paragraph { text :: Text, meta :: ParagraphPK }
+  deriving Generic
+
+-- | Only 'toEncoding' is overridden: generic encoding with 'defaultOptions'.
+instance ToJSON Paragraph where toEncoding = genericToEncoding defaultOptions
+
+-- | Read the text at @relativePath pK "txt"@ under @source@ and attach its key.
+loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph
+loadParagraph source pK = (`Paragraph` pK) <$> Text.readFile path -- no puns,
+  where path = source </> relativePath pK "txt" -- no shadowed field selectors
+
+-- | Print one JSON line per paragraph key read from the metadata TSV.
+main :: IO ()
+main = do
+  args <- getArgs
+  case args of
+    [inputMeta, source] -> try (readNamedTsv inputMeta) >>= mapM_ (emit source)
+    _ -> syntax "INPUT_METADATA SOURCE_DIRECTORY"
+  where emit dir pK = ByteString.putStrLn . encode =<< loadParagraph dir pK
-- 
GitLab