-
Alice Brenon authored2fbe96cd
prodigy-jsonl-to-tsv.hs 4.28 KiB
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT)
import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode)
import Data.Aeson.Types (Parser, prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack)
import Data.String (IsString(..))
import Data.Text as Text (Text)
import Data.Vector as Vector (head)
import GEODE.Metadata (type (@)(..), Record(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
import System.Environment (getArgs)
import System.Script (try, syntax, warn)
import Text.Printf (printf)
data Row = Unclassified String | Full ClassifiedParagraph
instance {-# OVERLAPS #-} FromJSON Row where
{-
parseJSON o@(Object v) = do
paragraphMeta@(paragraphRecord :@: _) <- v .: "meta"
--classified <- v .: "accept" >>= parseClassification
classified <- v .: "label"
pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified
where
parseClassification = withArray "Classification" singleValue
singleValue a
| not $ Prelude.null a =
withText "domain" (pure . Right . Classification) (Vector.head a)
singleValue _ = pure $ Left
("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now")
debug record =
"Looks like " ++ uid record ++ " was not classified, ignoring for now"
parseJSON o@(Object v) = do
paragraphMeta <- v .: "meta" >>= parseJSON
classified <- v .: "accept" >>= parseClassification
pure $ either Unclassified (Full . (paragraphMeta :@:)) classified
where
parseClassification = withArray "Classification" singleValue
singleValue a
| not $ Prelude.null a =
withText "domain" (pure . Right . Classification) (Vector.head a)
singleValue _ = pure $ Left
("Looks like " ++ debug ++ " was not classified, ignoring for now")
debug = BS.unpack $ encode o
-}
parseJSON = withObject "Row" parseRow
where
parseRow o = do
paragraphMeta <- o .: "meta"
getRow paragraphMeta
<$> runExceptT (classification o)
getRow paragraphMeta@(paragraphRecord :@: _) = either
(Unclassified . debug paragraphRecord)
(Full . (paragraphMeta :@:) . Classification)
classification :: Object -> ExceptT String Parser Text
classification o = do
getTextField "answer" o >>= isAccept
getTextField "label"
--o .: "label" >>= withText "label" pure
--checkAnswer o = ExceptT
-- ((o .: "answer" >>= withText "answer" (pure . isAccept))
-- <|> pure (Left "answer field is missing"))
isAccept "accept" = pure ()
isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s
--isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s
debug record = printf "Ignoring %s: %s" (uid record)
getTextField :: String -> Object -> ExceptT String Parser Text
getTextField name o = getField >>= ensureIsText
where
getField :: ExceptT String Parser Value
getField = ExceptT $
(Right <$> (o .: fromString name)) <|> catch "is missing"
ensureIsText :: Value -> ExceptT String Parser Text
ensureIsText v = ExceptT $
withText name (pure . Right) v <|> catch "is not text"
catch :: String -> Parser (Either String a)
catch = pure . Left . printf "%s field %s" name
{-
parseJSON invalid =
prependFailure "parsing Row failed, "
(typeMismatch "Object" invalid)
-}
logIgnored :: [Row] -> IO [ClassifiedParagraph]
logIgnored = foldr keepFull (pure [])
where
keepFull (Unclassified message) acc = warn message *> acc
keepFull (Full a) acc = (a:) <$> acc
main :: IO ()
main = getArgs >>= run
where
run [inputJSONL, outputTSV] =
try (jsonl <$> BS.readFile inputJSONL)
>>= logIgnored
>>= (tsvFile outputTSV :: [ClassifiedParagraph] -> IO ())
run _ = syntax "INPUT_JSONL OUTPUT_TSV"
newline = 10
jsonl = mapM eitherDecode . filter (not . BS.null) . BS.split newline