Skip to content
Snippets Groups Projects
Commit 88e41133 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add support for labeled data in input tsv and make the paralallel between...

Add support for labeled data in input tsv and make the paralallel between conversions to and from prodigy's jsonl more obvious
parent b9572a96
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
module GEODE.Metadata.ProdigyMeta module GEODE.Metadata.ProdigyMeta
( ParagraphMeta ( Classification(..)
, ClassifiedParagraph
, ParagraphMeta
, ProdigyMeta(..) ) where , ProdigyMeta(..) ) where
import Data.Aeson ((.=), FromJSON(..), ToJSON(..)) import Data.Aeson ((.=), FromJSON(..), ToJSON(..))
...@@ -30,4 +32,18 @@ instance FromNamedRecord ProdigyMeta ...@@ -30,4 +32,18 @@ instance FromNamedRecord ProdigyMeta
instance HasDefaultHeader ProdigyMeta where instance HasDefaultHeader ProdigyMeta where
defaultFields = DefaultFields ["totalParagraphs", "headword"] defaultFields = DefaultFields ["totalParagraphs", "headword"]
newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
instance ToNamedRecord Classification
instance FromNamedRecord Classification
instance HasDefaultHeader Classification where
defaultFields = DefaultFields ["paragraphFunction"]
instance ToJSONObject Classification where
toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList
[ ("paragraphFunction", toJSON paragraphFunction) ]
toJSONPairs (Classification {paragraphFunction}) =
"paragraphFunction" .= toJSON paragraphFunction
type ParagraphMeta = ParagraphRecord @ ProdigyMeta type ParagraphMeta = ParagraphRecord @ ProdigyMeta
type ClassifiedParagraph = ParagraphMeta @ Classification
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances" #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode) import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
import Data.Aeson.Types (prependFailure, typeMismatch) import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split) import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack) import Data.ByteString.Lazy.Char8 as BS (unpack)
import Data.Csv (ToNamedRecord(..))
import Data.Text (Text)
import Data.Vector as Vector (head) import Data.Vector as Vector (head)
import GEODE.Metadata import GEODE.Metadata (type (@)(..), tsvFile)
(type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile) import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
import GHC.Generics (Generic)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Script (try, syntax, warn) import System.Script (try, syntax, warn)
newtype Classification = Classification { paragraphFunction :: Text } deriving Generic data Row = Unclassified String | Full ClassifiedParagraph
instance {-# OVERLAPS #-} FromJSON Row where
instance ToNamedRecord Classification
instance HasDefaultHeader Classification where
defaultFields = DefaultFields ["paragraphFunction"]
type ClassifiedParagraph = ParagraphMeta @ Classification
instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
parseJSON o@(Object v) = do parseJSON o@(Object v) = do
paragraphMeta <- v .: "meta" >>= parseJSON paragraphMeta <- v .: "meta" >>= parseJSON
(fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification) classified <- v .: "accept" >>= parseClassification
pure $ either Unclassified (Full . (paragraphMeta :@:)) classified
where where
parseClassification = withArray "Classification" singleValue parseClassification = withArray "Classification" singleValue
singleValue a singleValue a
...@@ -39,11 +29,11 @@ instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where ...@@ -39,11 +29,11 @@ instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
prependFailure "parsing ClassifiedParagraph failed, " prependFailure "parsing ClassifiedParagraph failed, "
(typeMismatch "Object" invalid) (typeMismatch "Object" invalid)
logIgnored :: [Either String a] -> IO [a] logIgnored :: [Row] -> IO [ClassifiedParagraph]
logIgnored = foldr keepRight (pure []) logIgnored = foldr keepFull (pure [])
where where
keepRight (Left message) acc = warn message *> acc keepFull (Unclassified message) acc = warn message *> acc
keepRight (Right a) acc = (a:) <$> acc keepFull (Full a) acc = (a:) <$> acc
main :: IO () main :: IO ()
main = getArgs >>= run main = getArgs >>= run
......
...@@ -5,7 +5,8 @@ import Data.ByteString.Lazy.Char8 as ByteString (putStrLn) ...@@ -5,7 +5,8 @@ import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
import Data.Text (Text) import Data.Text (Text)
import Data.Text.IO as Text (readFile) import Data.Text.IO as Text (readFile)
import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv) import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta) import GEODE.Metadata.ProdigyMeta
(Classification(..), ClassifiedParagraph, ParagraphMeta)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.FilePath ((</>)) import System.FilePath ((</>))
...@@ -13,15 +14,16 @@ import System.Script (syntax, try) ...@@ -13,15 +14,16 @@ import System.Script (syntax, try)
data Paragraph = Paragraph data Paragraph = Paragraph
{ text :: Text { text :: Text
, meta :: ParagraphMeta } deriving Generic , meta :: ParagraphMeta
, accept :: [Text] } deriving Generic
instance ToJSON Paragraph where instance ToJSON Paragraph where
toEncoding = genericToEncoding defaultOptions toEncoding = genericToEncoding defaultOptions
loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph loadParagraph :: FilePath -> ClassifiedParagraph -> IO Paragraph
loadParagraph source meta@(paragraphRecord :@: _) = do loadParagraph source (meta@(paragraphRecord :@: _) :@: classification) = do
text <- Text.readFile (source </> relativePath paragraphRecord "txt") text <- Text.readFile (source </> relativePath paragraphRecord "txt")
pure $ Paragraph {text, meta} pure $ Paragraph {text, meta, accept = [paragraphFunction classification]}
main :: IO () main :: IO ()
main = getArgs >>= run main = getArgs >>= run
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment