#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE DeriveGeneric #-}
-- Print, for every paragraph key listed in a metadata TSV file, one JSON
-- object on standard output holding the paragraph's text and its key.
--
-- Usage: prodigy-corpus.hs INPUT_METADATA SOURCE_DIRECTORY
import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
import Data.Text (Text)
import Data.Text.IO as Text (readFile)
import GEODE.Metadata (readNamedTsv)
import GEODE.Metadata.File (relativePath)
import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.Script (syntax, try)

-- | One record of the generated corpus.
--
-- The field names are significant: the generic 'ToJSON' instance below is
-- derived with 'defaultOptions', so they become the JSON keys @text@ and
-- @meta@ of every emitted object.
data Paragraph = Paragraph
  { text :: Text        -- ^ contents read from the paragraph's file
  , meta :: ParagraphPK -- ^ key the paragraph was looked up with
  } deriving Generic

-- | Generic encoding; only 'toEncoding' is overridden, 'toJSON' keeps its
-- class default.
instance ToJSON Paragraph where
  toEncoding = genericToEncoding defaultOptions

-- | Read the text file associated with a paragraph key and pair its contents
-- with that key.
--
-- The file is looked up under @source@, at the location computed by
-- @relativePath paragraphPK "txt"@ (presumably the key's relative path with a
-- @txt@ extension; see "GEODE.Metadata.File" to confirm).
--
-- NOTE(review): 'Text.readFile' decodes using the current locale's encoding;
-- confirm the corpus files match the environment the script runs in.
loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph
loadParagraph source paragraphPK = do
  -- Local names deliberately differ from the record fields: reusing @text@ and
  -- @meta@ here shadows the field selectors, which the @-Wall@ requested in
  -- the shebang reports, and building the record with puns would rely on
  -- NamedFieldPuns, an extension this file does not enable.
  contents <- Text.readFile (source </> relativePath paragraphPK "txt")
  pure $ Paragraph {text = contents, meta = paragraphPK}

-- | Entry point: expects exactly two command-line arguments, the metadata TSV
-- file and the directory containing the paragraph files.  Any other argument
-- list is handed to 'syntax' together with the usage string.
main :: IO ()
main = getArgs >>= run
  where
    run [inputMeta, source] =
      -- 'try' comes from "System.Script" (not Control.Exception); presumably
      -- it stops the script with a message when the TSV cannot be read.
      -- TODO confirm against that module.
      try (readNamedTsv inputMeta) >>= mapM_ (prodigyText source)
    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
    -- Load one paragraph and print its JSON encoding followed by a newline.
    prodigyText source pK =
      loadParagraph source pK >>= ByteString.putStrLn . encode