diff --git a/scripts/ML/prodigy-corpus.hs b/scripts/ML/prodigy-corpus.hs index 360e67a6f9759e5d585380dde2ef839d55286fd8..c2a6e69c940251b36c97014d85c99435c214836d 100755 --- a/scripts/ML/prodigy-corpus.hs +++ b/scripts/ML/prodigy-corpus.hs @@ -1,12 +1,11 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding) import Data.ByteString.Lazy.Char8 as ByteString (putStrLn) import Data.Text (Text) import Data.Text.IO as Text (readFile) -import GEODE.Metadata (readNamedTsv) -import GEODE.Metadata.File (relativePath) -import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK) +import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv) +import GEODE.Metadata.ProdigyMeta (ParagraphMeta) import GHC.Generics (Generic) import System.Environment (getArgs) import System.FilePath ((</>)) @@ -14,21 +13,21 @@ import System.Script (syntax, try) data Paragraph = Paragraph { text :: Text - , meta :: ParagraphPK } deriving Generic + , meta :: ParagraphMeta } deriving Generic instance ToJSON Paragraph where toEncoding = genericToEncoding defaultOptions -loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph -loadParagraph source meta = do - text <- Text.readFile (source </> relativePath meta "txt") +loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph +loadParagraph source meta@(paragraphRecord :@: _) = do + text <- Text.readFile (source </> relativePath paragraphRecord "txt") pure $ Paragraph {text, meta} main :: IO () main = getArgs >>= run where run [inputMeta, source] = - try (readNamedTsv inputMeta) >>= mapM_ (prodigyText source) + try (readNamedTsv inputMeta) >>= mapM_ (toJSON source) run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY" - prodigyText source pK = - loadParagraph source pK >>= ByteString.putStrLn . encode + toJSON source parMeta = + loadParagraph source parMeta >>= ByteString.putStrLn . encode diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs new file mode 100755 index 0000000000000000000000000000000000000000..f37451c18b504818c4ff025b28af36546ba174f8 --- /dev/null +++ b/scripts/ML/prodigy-jsonl-to-tsv.hs @@ -0,0 +1,57 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances" +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} +import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode) +import Data.Aeson.Types (prependFailure, typeMismatch) +import Data.ByteString.Lazy as BS (null, readFile, split) +import Data.ByteString.Lazy.Char8 as BS (unpack) +import Data.Csv (ToNamedRecord(..)) +import Data.Text (Text) +import Data.Vector as Vector (head) +import GEODE.Metadata + (type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile) +import GEODE.Metadata.ProdigyMeta (ParagraphMeta) +import GHC.Generics (Generic) +import System.Environment (getArgs) +import System.Script (try, syntax, warn) + +newtype Classification = Classification { paragraphFunction :: Text } deriving Generic + +instance ToNamedRecord Classification +instance HasDefaultHeader Classification where + defaultFields = DefaultFields ["paragraphFunction"] + +type ClassifiedParagraph = ParagraphMeta @ Classification + +instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where + parseJSON o@(Object v) = do + paragraphMeta <- v .: "meta" >>= parseJSON + (fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification) + where + parseClassification = withArray "Classification" singleValue + singleValue a + | not $ Prelude.null a = + withText "domain" (pure . Right . Classification) (Vector.head a) + singleValue _ = pure $ Left + ("Looks like " ++ debug ++ " was not classified, ignoring for now") + debug = BS.unpack $ encode o + + parseJSON invalid = + prependFailure "parsing ClassifiedParagraph failed, " + (typeMismatch "Object" invalid) + +logIgnored :: [Either String a] -> IO [a] +logIgnored = foldr keepRight (pure []) + where + keepRight (Left message) acc = warn message *> acc + keepRight (Right a) acc = (a:) <$> acc + +main :: IO () +main = getArgs >>= run + where + run [inputJSONL, outputTSV] = + try (jsonl <$> BS.readFile inputJSONL) + >>= logIgnored + >>= (tsvFile outputTSV :: [ClassifiedParagraph] -> IO ()) + run _ = syntax "INPUT_JSONL OUTPUT_TSV" + newline = 10 + jsonl = mapM eitherDecode . filter (not . BS.null) . BS.split newline diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs index 453b943a8848d3a76939bb9fbfbe8656bed0b0e3..3251da9da3e4d7e310d4a877d96496dce67a8405 100755 --- a/scripts/paragraphs.hs +++ b/scripts/paragraphs.hs @@ -1,11 +1,12 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} import Data.List (dropWhileEnd) import Data.Text (Text, splitOn) import Data.Text.IO as Text (readFile, writeFile) -import GEODE.Metadata.File as File (File(..)) -import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile) -import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..)) +--import GEODE.Metadata.File as File (File(..)) +import GEODE.Metadata as Article (type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, relativePath, tsvFile) +import GEODE.Metadata.ParagraphRecord (Paragraph(..)) +import GEODE.Metadata.ProdigyMeta as Prodigy (ParagraphMeta, ProdigyMeta(..)) import System.Directory (createDirectoryIfMissing) import System.Environment (getArgs) import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory) @@ -17,17 +18,24 @@ articleParagraphs :: FilePath -> IO [Text] articleParagraphs = fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile -to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK] -to source target article = do - createDirectoryIfMissing True (target </> Article.relativePath article "") - articleParagraphs articlePath >>= mapM create . number +withMeta :: ArticleRecord @ Entry -> [Text] -> [(ParagraphMeta, Text)] +withMeta (articleRecord :@: entry) paragraphs = zipWith f [1..] paragraphs where - articlePath = source </> Article.relativePath article "txt" - number = zip [1..] - create (rank, paragraphText) = - let paragraphPK = ParagraphPK {article, rank} - outputPath = target </> File.relativePath paragraphPK "txt" in - paragraphPK <$ Text.writeFile outputPath paragraphText + prodigyMeta = ProdigyMeta + { totalParagraphs = length paragraphs + , Prodigy.headword = Article.headword entry } + f paragraph paragraphText = + (articleRecord :@: Paragraph {paragraph} :@: prodigyMeta, paragraphText) + +to :: FilePath -> FilePath -> ArticleRecord @ Entry -> IO [ParagraphMeta] +to source target meta@(articleRecord :@: _) = do + createDirectoryIfMissing True (target </> relativePath articleRecord "") + articleParagraphs articlePath >>= mapM create . withMeta meta + where + articlePath = source </> relativePath articleRecord "txt" + create (paragraphMeta@(paragraphRecord :@: _), paragraphText) = + let outputPath = target </> relativePath paragraphRecord "txt" in + paragraphMeta <$ Text.writeFile outputPath paragraphText main :: IO () main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run