Skip to content
Snippets Groups Projects
Commit 33d70476 authored by Alice Brenon's avatar Alice Brenon
Browse files

Improving existing scripts for prodigy + add a new one to get TSV from the trained output

parent 4cf05a96
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
import Data.Aeson (ToJSON(..), defaultOptions, encode, genericToEncoding)
import Data.ByteString.Lazy.Char8 as ByteString (putStrLn)
import Data.Text (Text)
import Data.Text.IO as Text (readFile)
import GEODE.Metadata (readNamedTsv)
import GEODE.Metadata.File (relativePath)
import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK)
import GEODE.Metadata (type (@)(..), Record(..), readNamedTsv)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
......@@ -14,21 +13,21 @@ import System.Script (syntax, try)
data Paragraph = Paragraph
{ text :: Text
, meta :: ParagraphPK } deriving Generic
, meta :: ParagraphMeta } deriving Generic
instance ToJSON Paragraph where
toEncoding = genericToEncoding defaultOptions
loadParagraph :: FilePath -> ParagraphPK -> IO Paragraph
loadParagraph source meta = do
text <- Text.readFile (source </> relativePath meta "txt")
loadParagraph :: FilePath -> ParagraphMeta -> IO Paragraph
loadParagraph source meta@(paragraphRecord :@: _) = do
text <- Text.readFile (source </> relativePath paragraphRecord "txt")
pure $ Paragraph {text, meta}
main :: IO ()
main = getArgs >>= run
where
run [inputMeta, source] =
try (readNamedTsv inputMeta) >>= mapM_ (prodigyText source)
try (readNamedTsv inputMeta) >>= mapM_ (toJSON source)
run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY"
prodigyText source pK =
loadParagraph source pK >>= ByteString.putStrLn . encode
toJSON source parMeta =
loadParagraph source parMeta >>= ByteString.putStrLn . encode
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
import Data.Aeson ((.:), FromJSON(..), Value(..), encode, withArray, withText, eitherDecode)
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.ByteString.Lazy as BS (null, readFile, split)
import Data.ByteString.Lazy.Char8 as BS (unpack)
import Data.Csv (ToNamedRecord(..))
import Data.Text (Text)
import Data.Vector as Vector (head)
import GEODE.Metadata
(type (@)(..), DefaultFields(..), HasDefaultHeader(..), tsvFile)
import GEODE.Metadata.ProdigyMeta (ParagraphMeta)
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.Script (try, syntax, warn)
newtype Classification = Classification { paragraphFunction :: Text } deriving Generic
instance ToNamedRecord Classification
instance HasDefaultHeader Classification where
defaultFields = DefaultFields ["paragraphFunction"]
type ClassifiedParagraph = ParagraphMeta @ Classification
instance {-# OVERLAPS #-} FromJSON (Either String ClassifiedParagraph) where
parseJSON o@(Object v) = do
paragraphMeta <- v .: "meta" >>= parseJSON
(fmap (paragraphMeta :@:)) <$> (v .: "accept" >>= parseClassification)
where
parseClassification = withArray "Classification" singleValue
singleValue a
| not $ Prelude.null a =
withText "domain" (pure . Right . Classification) (Vector.head a)
singleValue _ = pure $ Left
("Looks like " ++ debug ++ " was not classified, ignoring for now")
debug = BS.unpack $ encode o
parseJSON invalid =
prependFailure "parsing ClassifiedParagraph failed, "
(typeMismatch "Object" invalid)
logIgnored :: [Either String a] -> IO [a]
logIgnored = foldr keepRight (pure [])
where
keepRight (Left message) acc = warn message *> acc
keepRight (Right a) acc = (a:) <$> acc
main :: IO ()
main = getArgs >>= run
where
run [inputJSONL, outputTSV] =
try (jsonl <$> BS.readFile inputJSONL)
>>= logIgnored
>>= (tsvFile outputTSV :: [ClassifiedParagraph] -> IO ())
run _ = syntax "INPUT_JSONL OUTPUT_TSV"
newline = 10
jsonl = mapM eitherDecode . filter (not . BS.null) . BS.split newline
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-}
import Data.List (dropWhileEnd)
import Data.Text (Text, splitOn)
import Data.Text.IO as Text (readFile, writeFile)
import GEODE.Metadata.File as File (File(..))
import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile)
import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..))
--import GEODE.Metadata.File as File (File(..))
import GEODE.Metadata as Article (type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, relativePath, tsvFile)
import GEODE.Metadata.ParagraphRecord (Paragraph(..))
import GEODE.Metadata.ProdigyMeta as Prodigy (ParagraphMeta, ProdigyMeta(..))
import System.Directory (createDirectoryIfMissing)
import System.Environment (getArgs)
import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory)
......@@ -17,17 +18,24 @@ articleParagraphs :: FilePath -> IO [Text]
articleParagraphs =
fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile
to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK]
to source target article = do
createDirectoryIfMissing True (target </> Article.relativePath article "")
articleParagraphs articlePath >>= mapM create . number
withMeta :: ArticleRecord @ Entry -> [Text] -> [(ParagraphMeta, Text)]
withMeta (articleRecord :@: entry) paragraphs = zipWith f [1..] paragraphs
where
articlePath = source </> Article.relativePath article "txt"
number = zip [1..]
create (rank, paragraphText) =
let paragraphPK = ParagraphPK {article, rank}
outputPath = target </> File.relativePath paragraphPK "txt" in
paragraphPK <$ Text.writeFile outputPath paragraphText
prodigyMeta = ProdigyMeta
{ totalParagraphs = length paragraphs
, Prodigy.headword = Article.headword entry }
f paragraph paragraphText =
(articleRecord :@: Paragraph {paragraph} :@: prodigyMeta, paragraphText)
to :: FilePath -> FilePath -> ArticleRecord @ Entry -> IO [ParagraphMeta]
to source target meta@(articleRecord :@: _) = do
createDirectoryIfMissing True (target </> relativePath articleRecord "")
articleParagraphs articlePath >>= mapM create . withMeta meta
where
articlePath = source </> relativePath articleRecord "txt"
create (paragraphMeta@(paragraphRecord :@: _), paragraphText) =
let outputPath = target </> relativePath paragraphRecord "txt" in
paragraphMeta <$ Text.writeFile outputPath paragraphText
main :: IO ()
main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment