From 88e411335d07b462ef7f331934b86dd6a93d7efb Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Fri, 29 Sep 2023 12:28:08 +0200 Subject: [PATCH] Add support for labeled data in input tsv and make the paralallel between conversions to and from prodigy's jsonl more obvious --- lib/GEODE/Metadata/ProdigyMeta.hs | 18 ++++++++++- scripts/ML/prodigy-jsonl-to-tsv.hs | 32 +++++++------------ ...digy-corpus.hs => prodigy-tsv-to-jsonl.hs} | 12 ++++--- 3 files changed, 35 insertions(+), 27 deletions(-) rename scripts/ML/{prodigy-corpus.hs => prodigy-tsv-to-jsonl.hs} (73%) diff --git a/lib/GEODE/Metadata/ProdigyMeta.hs b/lib/GEODE/Metadata/ProdigyMeta.hs index c63ec93..edb3fdd 100644 --- a/lib/GEODE/Metadata/ProdigyMeta.hs +++ b/lib/GEODE/Metadata/ProdigyMeta.hs @@ -1,6 +1,8 @@ {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} module GEODE.Metadata.ProdigyMeta - ( ParagraphMeta + ( Classification(..) + , ClassifiedParagraph + , ParagraphMeta , ProdigyMeta(..) ) where import Data.Aeson ((.=), FromJSON(..), ToJSON(..)) @@ -30,4 +32,18 @@ instance FromNamedRecord ProdigyMeta instance HasDefaultHeader ProdigyMeta where defaultFields = DefaultFields ["totalParagraphs", "headword"] +newtype Classification = Classification { paragraphFunction :: Text } deriving Generic + +instance ToNamedRecord Classification +instance FromNamedRecord Classification +instance HasDefaultHeader Classification where + defaultFields = DefaultFields ["paragraphFunction"] + +instance ToJSONObject Classification where + toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList + [ ("paragraphFunction", toJSON paragraphFunction) ] + toJSONPairs (Classification {paragraphFunction}) = + "paragraphFunction" .= toJSON paragraphFunction + type ParagraphMeta = ParagraphRecord @ ProdigyMeta +type ClassifiedParagraph = ParagraphMeta @ Classification diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs index f37451c..d26e850 100755 --- a/scripts/ML/prodigy-jsonl-to-tsv.hs +++ b/scripts/ML/prodigy-jsonl-to-tsv.hs @@ -1,31 +1,21 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances" -{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} +{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode) import Data.Aeson.Types (prependFailure, typeMismatch) import Data.ByteString.Lazy as BS (null, readFile, split) import Data.ByteString.Lazy.Char8 as BS (unpack) -import Data.Csv (ToNamedRecord(..)) -import Data.Text (Text) import Data.Vector as Vector (head) -import GEODE.Metadata - (type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile) -import GEODE.Metadata.ProdigyMeta (ParagraphMeta) -import GHC.Generics (Generic) +import GEODE.Metadata (type (@)(..), tsvFile) +import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph) import System.Environment (getArgs) import System.Script (try, syntax, warn) -newtype Classification = Classification { paragraphFunction :: Text } deriving Generic - -instance ToNamedRecord Classification -instance HasDefaultHeader Classification where - defaultFields = DefaultFields ["paragraphFunction"] - -type ClassifiedParagraph = ParagraphMeta @ Classification - -instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where +data Row = Unclassified String | Full ClassifiedParagraph +instance {-# OVERLAPS #-} FromJSON Row where parseJSON o@(Object v) = do paragraphMeta <- v .: "meta" >>= parseJSON - (fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification) + classified <- v .: "accept" >>= parseClassification + pure $ either Unclassified (Full . (paragraphMeta :@:)) classified where parseClassification = withArray "Classification" singleValue singleValue a @@ -39,11 +29,11 @@ instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where prependFailure "parsing ClassifiedParagraph failed, " (typeMismatch "Object" invalid) -logIgnored :: [Either String a] -> IO [a] -logIgnored = foldr keepRight (pure []) +logIgnored :: [Row] -> IO [ClassifiedParagraph] +logIgnored = foldr keepFull (pure []) where - keepRight (Left message) acc = warn message *> acc - keepRight (Right a) acc = (a:) <$> acc + keepFull (Unclassified message) acc = warn message *> acc + keepFull (Full a) acc = (a:) <$> acc main :: IO () main = getArgs >>= run diff --git a/scripts/ML/prodigy-corpus.hs b/scripts/ML/prodigy-tsv-to-jsonl.hs similarity index 73% rename from scripts/ML/prodigy-corpus.hs rename to scripts/ML/prodigy-tsv-to-jsonl.hs index c192883..51cf7a6 100755 --- a/scripts/ML/prodigy-corpus.hs +++ b/scripts/ML/prodigy-tsv-to-jsonl.hs @@ -5,7 +5,8 @@ import Data.ByteString.Lazy.Char8 as ByteString (putStrLn) import Data.Text (Text) import Data.Text.IO as Text (readFile) import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv) -import GEODE.Metadata.ProdigyMeta (ParagraphMeta) +import GEODE.Metadata.ProdigyMeta + (Classification(..), ClassifiedParagraph, ParagraphMeta) import GHC.Generics (Generic) import System.Environment (getArgs) import System.FilePath ((</>)) @@ -13,15 +14,16 @@ import System.Script (syntax, try) data Paragraph = Paragraph { text :: Text - , meta :: ParagraphMeta } deriving Generic + , meta :: ParagraphMeta + , accept :: [Text] } deriving Generic instance ToJSON Paragraph where toEncoding = genericToEncoding defaultOptions -loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph -loadParagraph source meta@(paragraphRecord :@: _) = do +loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph +loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do text <- Text.readFile (source </> relativePath paragraphRecord "txt") - pure $ Paragraph {text, meta} + pure $ Paragraph {text, meta, accept = [paragraphFunction classification]} main :: IO () main = getArgs >>= run -- GitLab