#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances" {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} import Control.Applicative ((<|>)) import Control.Monad.Except (MonadError(..), ExceptT(..), runExceptT) import Data.Aeson ((.:), FromJSON(..), Object, Value(..), encode, withArray, withObject, withText, eitherDecode) import Data.Aeson.Types (Parser, prependFailure, typeMismatch) import Data.ByteString.Lazy as BS (null, readFile, split) import Data.ByteString.Lazy.Char8 as BS (unpack) import Data.String (IsString(..)) import Data.Text as Text (Text) import Data.Vector as Vector (head) import GEODE.Metadata (type (@)(..), Record(..), tsvFile) import GEODE.Metadata.ProdigyMeta (Classification(..), ClassifiedParagraph) import System.Environment (getArgs) import System.Script (try, syntax, warn) import Text.Printf (printf) data Row = Unclassified String | Full ClassifiedParagraph instance {-# OVERLAPS #-} FromJSON Row where {- parseJSON o@(Object v) = do paragraphMeta@(paragraphRecord :@: _) <- v .: "meta" --classified <- v .: "accept" >>= parseClassification classified <- v .: "label" pure $ either (\_ -> Unclassified $ debug paragraphRecord) (Full . (paragraphMeta :@:)) classified where parseClassification = withArray "Classification" singleValue singleValue a | not $ Prelude.null a = withText "domain" (pure . Right . Classification) (Vector.head a) singleValue _ = pure $ Left ("Looks like " ++ BS.unpack (encode o) ++ " was not classified, ignoring for now") debug record = "Looks like " ++ uid record ++ " was not classified, ignoring for now" parseJSON o@(Object v) = do paragraphMeta <- v .: "meta" >>= parseJSON classified <- v .: "accept" >>= parseClassification pure $ either Unclassified (Full . (paragraphMeta :@:)) classified where parseClassification = withArray "Classification" singleValue singleValue a | not $ Prelude.null a = withText "domain" (pure . Right . Classification) (Vector.head a) singleValue _ = pure $ Left ("Looks like " ++ debug ++ " was not classified, ignoring for now") debug = BS.unpack $ encode o -} parseJSON = withObject "Row" parseRow where parseRow o = do paragraphMeta <- o .: "meta" getRow paragraphMeta <$> runExceptT (classification o) getRow paragraphMeta@(paragraphRecord :@: _) = either (Unclassified . debug paragraphRecord) (Full . (paragraphMeta :@:) . Classification) classification :: Object -> ExceptT String Parser Text classification o = do getTextField "answer" o >>= isAccept getTextField "label" --o .: "label" >>= withText "label" pure --checkAnswer o = ExceptT -- ((o .: "answer" >>= withText "answer" (pure . isAccept)) -- <|> pure (Left "answer field is missing")) isAccept "accept" = pure () isAccept s = throwError $ printf "answer was \"%s\" and not \"accept\"" s --isAccept s = Left $ printf "answer was \"%s\" and not \"accept\"" s debug record = printf "Ignoring %s: %s" (uid record) getTextField :: String -> Object -> ExceptT String Parser Text getTextField name o = getField >>= ensureIsText where getField :: ExceptT String Parser Value getField = ExceptT $ (Right <$> (o .: fromString name)) <|> catch "is missing" ensureIsText :: Value -> ExceptT String Parser Text ensureIsText v = ExceptT $ withText name (pure . Right) v <|> catch "is not text" catch :: String -> Parser (Either String a) catch = pure . Left . printf "%s field %s" name {- parseJSON invalid = prependFailure "parsing Row failed, " (typeMismatch "Object" invalid) -} logIgnored :: [Row] -> IO [ClassifiedParagraph] logIgnored = foldr keepFull (pure []) where keepFull (Unclassified message) acc = warn message *> acc keepFull (Full a) acc = (a:) <$> acc main :: IO () main = getArgs >>= run where run [inputJSONL, outputTSV] = try (jsonl <$> BS.readFile inputJSONL) >>= logIgnored >>= (tsvFile outputTSV :: [ClassifiedParagraph] -> IO ()) run _ = syntax "INPUT_JSONL OUTPUT_TSV" newline = 10 jsonl = mapM eitherDecode . filter (not . BS.null) . BS.split newline