Skip to content
Snippets Groups Projects
prodigy-jsonl-to-tsv.hs 4.28 KiB
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Control.Applicative ((<|>))
import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT)
import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode)
import Data.Aeson.Types (Parser, prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack)
import Data.String (IsString(..))
import Data.Text as Text (Text)
import Data.Vector as Vector (head)
import GEODE.Metadata (type (@)(..), Record(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph)
import System.Environment (getArgs)
import System.Script (try, syntax, warn)
import Text.Printf (printf)

data Row = Unclassified String | Full ClassifiedParagraph
instance {-# OVERLAPS #-} FromJSON Row where
{-
  parseJSON o@(Object v) = do
    paragraphMeta@(paragraphRecord :@: _) <- v .: "meta"
    --classified <- v .: "accept" >>= parseClassification
    classified <- v .: "label"
    pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified
    where
      parseClassification = withArray "Classification" singleValue
      singleValue a
        | not $ Prelude.null a =
          withText "domain" (pure . Right . Classification) (Vector.head a)
      singleValue _ = pure $ Left
        ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now")
      debug record =
        "Looks like " ++ uid record ++ " was not classified, ignoring for now"

  parseJSON o@(Object v) = do
    paragraphMeta <- v .: "meta" >>= parseJSON
    classified <- v .: "accept" >>= parseClassification
    pure $ either Unclassified (Full . (paragraphMeta :@:)) classified
    where
      parseClassification = withArray "Classification" singleValue
      singleValue a
        | not $ Prelude.null a =
          withText "domain" (pure . Right . Classification) (Vector.head a)
      singleValue _ = pure $ Left
        ("Looks like " ++ debug ++ " was not classified, ignoring for now")
      debug = BS.unpack $ encode o
-}

  parseJSON = withObject "Row" parseRow
    where
      parseRow o = do
        paragraphMeta <- o .: "meta"
        getRow paragraphMeta
          <$> runExceptT (classification o)
      getRow paragraphMeta@(paragraphRecord :@: _) = either
        (Unclassified . debug paragraphRecord)
        (Full . (paragraphMeta :@:) . Classification)
      classification :: Object -> ExceptT String Parser Text
      classification o = do
        getTextField "answer" o >>= isAccept
        getTextField "label"
        --o .: "label" >>= withText "label" pure
      --checkAnswer o = ExceptT
      --  ((o .: "answer" >>= withText "answer" (pure . isAccept))
      --    <|> pure (Left "answer field is missing"))
      isAccept "accept" = pure ()
      isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s
      --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s
      debug record = printf "Ignoring %s: %s" (uid record)

getTextField :: String -> Object -> ExceptT String Parser Text
getTextField name o = getField >>= ensureIsText
  where
    getField :: ExceptT String Parser Value
    getField = ExceptT $
      (Right <$> (o .: fromString name)) <|> catch "is missing"
    ensureIsText :: Value -> ExceptT String Parser Text
    ensureIsText v = ExceptT $
      withText name (pure . Right) v <|> catch "is not text"
    catch :: String -> Parser (Either String a)
    catch = pure . Left . printf "%s field %s" name

{-
  parseJSON invalid =
    prependFailure "parsing Row failed, "
    (typeMismatch "Object" invalid)
-}

logIgnored :: [Row] -> IO [ClassifiedParagraph]
logIgnored = foldr keepFull (pure [])
  where
    keepFull (Unclassified message) acc = warn message *> acc
    keepFull (Full a) acc = (a:) <$> acc

main :: IO ()
main = getArgs >>= run
  where
    run [inputJSONL, outputTSV] =
        try (jsonl <$> BS.readFile inputJSONL)
      >>= logIgnored
      >>= (tsvFile outputTSV :: [ClassifiedParagraph] -> IO ())
    run _ = syntax "INPUT_JSONL OUTPUT_TSV"
    newline = 10
    jsonl = mapM eitherDecode . filter (not . BS.null) . BS.split newline