Skip to content
Snippets Groups Projects
Commit 88e41133 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add support for labeled data in input tsv and make the paralallel between...

Add support for labeled data in input tsv and make the paralallel between conversions to and from prodigy's jsonl more obvious
parent b9572a96
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
module GEODE.Metadata.ProdigyMeta
( ParagraphMeta
( Classification(..)
, ClassifiedParagraph
, ParagraphMeta
, ProdigyMeta(..) ) where
import Data.Aeson ((.=), FromJSON(..), ToJSON(..))
......@@ -30,4 +32,18 @@ instance FromNamedRecord ProdigyMeta
instance HasDefaultHeader ProdigyMeta where
defaultFields = DefaultFields ["totalParagraphs", "headword"]
newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
instance ToNamedRecord Classification
instance FromNamedRecord Classification
instance HasDefaultHeader Classification where
defaultFields = DefaultFields ["paragraphFunction"]
instance ToJSONObject Classification where
toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList
[ ("paragraphFunction", toJSON paragraphFunction) ]
toJSONPairs (Classification {paragraphFunction}) =
"paragraphFunction" .= toJSON paragraphFunction
type ParagraphMeta = ParagraphRecord @ ProdigyMeta
type ClassifiedParagraph = ParagraphMeta @ Classification
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack)
import Data.Csv (ToNamedRecord(..))
import Data.Text (Text)
import Data.Vector as Vector (head)
import GEODE.Metadata
(type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
import GHC.Generics (Generic)
import GEODE.Metadata (type (@)(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
import System.Environment (getArgs)
import System.Script (try, syntax, warn)
newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
instance ToNamedRecord Classification
instance HasDefaultHeader Classification where
defaultFields = DefaultFields ["paragraphFunction"]
type ClassifiedParagraph = ParagraphMeta @ Classification
instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
data Row = Unclassified String | Full ClassifiedParagraph
instance {-# OVERLAPS #-} FromJSON Row where
parseJSON o@(Object v) = do
paragraphMeta <- v .: "meta" >>= parseJSON
(fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification)
classified <- v .: "accept" >>= parseClassification
pure $ either Unclassified (Full . (paragraphMeta :@:)) classified
where
parseClassification = withArray "Classification" singleValue
singleValue a
......@@ -39,11 +29,11 @@ instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
prependFailure "parsing ClassifiedParagraph failed, "
(typeMismatch "Object" invalid)
logIgnored :: [Either String a] -> IO [a]
logIgnored = foldr keepRight (pure [])
logIgnored :: [Row] -> IO [ClassifiedParagraph]
logIgnored = foldr keepFull (pure [])
where
keepRight (Left message) acc = warn message *> acc
keepRight (Right a) acc = (a:) <$> acc
keepFull (Unclassified message) acc = warn message *> acc
keepFull (Full a) acc = (a:) <$> acc
main :: IO ()
main = getArgs >>= run
......
......@@ -5,7 +5,8 @@ import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
import Data.Text (Text)
import Data.Text.IO as Text (readFile)
import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
import GEODE.Metadata.ProdigyMeta
(Classification(..), ClassifiedParagraph, ParagraphMeta)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
......@@ -13,15 +14,16 @@ import System.Script (syntax, try)
data Paragraph = Paragraph
{ text :: Text
, meta :: ParagraphMeta } deriving Generic
, meta :: ParagraphMeta
, accept :: [Text] } deriving Generic
instance ToJSON Paragraph where
toEncoding = genericToEncoding defaultOptions
loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph
loadParagraph source meta@(paragraphRecord :@: _) = do
loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
text <- Text.readFile (source </> relativePath paragraphRecord "txt")
pure $ Paragraph {text, meta}
pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
main :: IO ()
main = getArgs >>= run
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment