From 88e411335d07b462ef7f331934b86dd6a93d7efb Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Fri, 29 Sep 2023 12:28:08 +0200
Subject: [PATCH] Add support for labeled data in input tsv and make the
 paralallel between conversions to and from prodigy's jsonl more obvious

---
 lib/GEODE/Metadata/ProdigyMeta.hs             | 18 ++++++++++-
 scripts/ML/prodigy-jsonl-to-tsv.hs            | 32 +++++++------------
 ...digy-corpus.hs => prodigy-tsv-to-jsonl.hs} | 12 ++++---
 3 files changed, 35 insertions(+), 27 deletions(-)
 rename scripts/ML/{prodigy-corpus.hs => prodigy-tsv-to-jsonl.hs} (73%)

diff --git a/lib/GEODE/Metadata/ProdigyMeta.hs b/lib/GEODE/Metadata/ProdigyMeta.hs
index c63ec93..edb3fdd 100644
--- a/lib/GEODE/Metadata/ProdigyMeta.hs
+++ b/lib/GEODE/Metadata/ProdigyMeta.hs
@@ -1,6 +1,8 @@
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
 module GEODE.Metadata.ProdigyMeta
-  ( ParagraphMeta
+  ( Classification(..)
+  , ClassifiedParagraph
+  , ParagraphMeta
   , ProdigyMeta(..) ) where
 
 import Data.Aeson ((.=), FromJSON(..), ToJSON(..))
@@ -30,4 +32,18 @@ instance FromNamedRecord ProdigyMeta
 instance HasDefaultHeader ProdigyMeta where
   defaultFields = DefaultFields ["totalParagraphs", "headword"]
 
+newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
+
+instance ToNamedRecord Classification
+instance FromNamedRecord Classification
+instance HasDefaultHeader Classification where
+  defaultFields = DefaultFields ["paragraphFunction"]
+
+instance ToJSONObject Classification where
+  toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList
+    [ ("paragraphFunction", toJSON paragraphFunction) ]
+  toJSONPairs (Classification {paragraphFunction}) =
+    "paragraphFunction" .= toJSON paragraphFunction
+
 type ParagraphMeta = ParagraphRecord @ ProdigyMeta
+type ClassifiedParagraph = ParagraphMeta @ Classification
diff --git a/scripts/ML/prodigy-jsonl-to-tsv.hs b/scripts/ML/prodigy-jsonl-to-tsv.hs
index f37451c..d26e850 100755
--- a/scripts/ML/prodigy-jsonl-to-tsv.hs
+++ b/scripts/ML/prodigy-jsonl-to-tsv.hs
@@ -1,31 +1,21 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
-{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
+{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
 import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
 import Data.Aeson.Types (prependFailure, typeMismatch)
 import Data.ByteString.Lazy as BS (null, readFile, split)
 import Data.ByteString.Lazy.Char8 as BS (unpack)
-import Data.Csv (ToNamedRecord(..))
-import Data.Text (Text)
 import Data.Vector as Vector (head)
-import GEODE.Metadata
-  (type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile)
-import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
-import GHC.Generics (Generic)
+import GEODE.Metadata (type (@)(..), tsvFile)
+import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
 import System.Environment (getArgs)
 import System.Script (try, syntax, warn)
 
-newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
-
-instance ToNamedRecord Classification
-instance HasDefaultHeader Classification where
-  defaultFields = DefaultFields ["paragraphFunction"]
-
-type ClassifiedParagraph = ParagraphMeta @ Classification
-
-instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
+data Row = Unclassified String | Full ClassifiedParagraph
+instance {-# OVERLAPS #-} FromJSON Row where
   parseJSON o@(Object v) = do
     paragraphMeta <- v .: "meta" >>= parseJSON
-    (fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification)
+    classified <- v .: "accept" >>= parseClassification
+    pure $ either Unclassified (Full . (paragraphMeta :@:)) classified
     where
       parseClassification = withArray "Classification" singleValue
       singleValue a
@@ -39,11 +29,11 @@ instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
     prependFailure "parsing ClassifiedParagraph failed, "
     (typeMismatch "Object" invalid)
 
-logIgnored :: [Either String a] -> IO [a]
-logIgnored = foldr keepRight (pure [])
+logIgnored :: [Row] -> IO [ClassifiedParagraph]
+logIgnored = foldr keepFull (pure [])
   where
-    keepRight (Left message) acc = warn message *> acc
-    keepRight (Right a) acc = (a:) <$> acc
+    keepFull (Unclassified message) acc = warn message *> acc
+    keepFull (Full a) acc = (a:) <$> acc
 
 main :: IO ()
 main = getArgs >>= run
diff --git a/scripts/ML/prodigy-corpus.hs b/scripts/ML/prodigy-tsv-to-jsonl.hs
similarity index 73%
rename from scripts/ML/prodigy-corpus.hs
rename to scripts/ML/prodigy-tsv-to-jsonl.hs
index c192883..51cf7a6 100755
--- a/scripts/ML/prodigy-corpus.hs
+++ b/scripts/ML/prodigy-tsv-to-jsonl.hs
@@ -5,7 +5,8 @@ import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
 import Data.Text (Text)
 import Data.Text.IO as Text (readFile)
 import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
-import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
+import GEODE.Metadata.ProdigyMeta
+  (Classification(..), ClassifiedParagraph, ParagraphMeta)
 import GHC.Generics (Generic)
 import System.Environment (getArgs)
 import System.FilePath ((</>))
@@ -13,15 +14,16 @@ import System.Script (syntax, try)
 
 data Paragraph = Paragraph
   { text :: Text
-  , meta :: ParagraphMeta } deriving Generic
+  , meta :: ParagraphMeta
+  , accept :: [Text] } deriving Generic
 
 instance ToJSON Paragraph where
   toEncoding = genericToEncoding defaultOptions
 
-loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph
-loadParagraph source meta@(paragraphRecord :@: _) = do
+loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
+loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
   text <- Text.readFile (source </> relativePath paragraphRecord "txt")
-  pure $ Paragraph {text, meta}
+  pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
 
 main :: IO ()
 main = getArgs >>= run
-- 
GitLab