From 4cf05a96569d7765ec7c89a7af6b762fc04d60f4 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 7 Sep 2023 18:59:13 +0200
Subject: [PATCH] Add a script to make a JSONL corpus for prodigy out of paragraphs

---
 scripts/ML/prodigy-corpus.hs | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 scripts/ML/prodigy-corpus.hs

diff --git a/scripts/ML/prodigy-corpus.hs b/scripts/ML/prodigy-corpus.hs
new file mode 100755
index 0000000..360e67a
--- /dev/null
+++ b/scripts/ML/prodigy-corpus.hs
@@ -0,0 +1,43 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE DeriveGeneric #-}
+import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
+import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
+import Data.Text (Text)
+import Data.Text.IO as Text (readFile)
+import GEODE.Metadata (readNamedTsv)
+import GEODE.Metadata.File (relativePath)
+import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK)
+import GHC.Generics (Generic)
+import System.Environment (getArgs)
+import System.FilePath ((</>))
+import System.Script (syntax, try)
+
+-- | One line of the JSONL corpus: the content of a paragraph along with the
+-- key it was loaded from. The field names are the keys of the JSON object.
+data Paragraph = Paragraph
+  { text :: Text
+  , meta :: ParagraphPK } deriving Generic
+
+instance ToJSON Paragraph where
+  toEncoding = genericToEncoding defaultOptions
+
+-- | Read the text file found for a paragraph key under the source directory
+-- and pair its content with that key.
+loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph
+loadParagraph source paragraphPK = do
+  -- Locals are named apart from the 'text' and 'meta' record fields so that
+  -- -Wall stays quiet (no name shadowing) and no NamedFieldPuns is needed.
+  contents <- Text.readFile (source </> relativePath paragraphPK "txt")
+  pure $ Paragraph {text = contents, meta = paragraphPK}
+
+-- | Expect a metadata TSV file and a source directory on the command line and
+-- print one JSON object per line on stdout, one for each entry read from it.
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [inputMeta, source] =
+      try (readNamedTsv inputMeta) >>= mapM_ (prodigyText source)
+    -- Any other number of arguments: hand the usage string to 'syntax'.
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
+    prodigyText source pK =
+      loadParagraph source pK >>= ByteString.putStrLn . encode
-- 
GitLab