Skip to content
Snippets Groups Projects
Commit 4cf05a96 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a script to make a JSONL corpus for prodigy out of paragraphs

parent cb234457
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE DeriveGeneric #-}
import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
import Data.Text (Text)
import Data.Text.IO as Text (readFile)
import GEODE.Metadata (readNamedTsv)
import GEODE.Metadata.File (relativePath)
import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.Script (syntax, try)
-- | One entry of the JSONL corpus: the text of a paragraph together with
-- the primary key that identifies it in the metadata.
data Paragraph = Paragraph
  { -- | Content read from the paragraph's text file.
    text :: Text
    -- | Primary key of the paragraph, emitted alongside the text.
  , meta :: ParagraphPK
  }
  deriving (Generic)
-- | JSON serialisation derived generically from the record, so the output
-- object's keys follow the field names of 'Paragraph'.
instance ToJSON Paragraph where
  -- Only the direct-to-encoding path is overridden; default options are used.
  toEncoding = genericToEncoding defaultOptions
-- | Build the 'Paragraph' for a primary key by reading its text file.
--
-- The file is located at @source '</>' relativePath paragraphPK "txt"@,
-- i.e. under the given source directory (the @"txt"@ argument is presumably
-- the file extension — confirm against 'relativePath').
--
-- Any exception raised by 'Text.readFile' (missing file, decoding error)
-- is not caught here and propagates to the caller.
loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph
loadParagraph source paragraphPK = do
  -- Locals are deliberately not named after the record fields: doing so
  -- shadows the field selectors (reported under -Wall) and invites record
  -- punning, which needs NamedFieldPuns — an extension this file does not
  -- enable.
  content <- Text.readFile (source </> relativePath paragraphPK "txt")
  pure $ Paragraph {text = content, meta = paragraphPK}
-- | Entry point: expects the path to a metadata TSV and the directory
-- holding the paragraph text files, then prints one JSON-encoded
-- 'Paragraph' per line on standard output for each row of the TSV.
-- With any other number of arguments, the usage message is reported
-- through 'syntax'.
main :: IO ()
main = do
  arguments <- getArgs
  case arguments of
    [inputMeta, source] -> do
      paragraphPKs <- try (readNamedTsv inputMeta)
      mapM_ (printAsJSON source) paragraphPKs
    _ -> syntax "INPUT_METADATA SOURCE_DIRECTORY"
  where
    -- Load a single paragraph and emit it as one line of JSON.
    printAsJSON sourceDirectory paragraphPK = do
      paragraph <- loadParagraph sourceDirectory paragraphPK
      ByteString.putStrLn (encode paragraph)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment