#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib"
{-# LANGUAGE ExplicitNamespaces #-}

-- Measure the articles of a corpus: for every article listed in the input
-- TSV, emit one row combining the article's metadata, the size of its text
-- file and the total count stored in its serialized tree file.
--
-- Usage: TEXT_ROOT TREE_ROOT
-- NOTE(review): 'readTSV ()' / 'writeTSV ()' presumably use stdin / stdout;
-- confirm against GEODE.Metadata.

import Conllu.Tree (IndexedDocument(..))
import Conllu.Tree.Count (Count(..))
import Data.ByteString as ByteString (readFile)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Serialize (decode)
import Data.Text as Text (length)
import Data.Text.IO as Text (hGetContents)
import GEODE.Metadata
  ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..)
  , WithDefaultHeader(..), WriteTSV(..), for, getHeader )
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.IO (IOMode(ReadMode), hFileSize, withFile)
import System.Script (syntax, warn)

-- | Size of one article's text file, measured two ways.
data Sizes = Sizes
  { bytes :: Int       -- ^ file size as reported by 'hFileSize'
  , characters :: Int  -- ^ 'Text.length' of the decoded file contents
  } deriving (Eq, Generic, Ord, Show)

-- Both instances rely on the Generic-based default implementations.
instance DefaultOrdered Sizes
instance ToNamedRecord Sizes

-- | Shape of one output row; also used to derive the header line in 'main'.
type Result = ArticleRecord @ Sizes @ Count

-- | Measure a single article and write the corresponding output row.
--
-- The text file is looked up under @textRoot@ and the serialized tree under
-- @treeRoot@, both at the article's 'relativePath' (extensions @txt@ and
-- @tree@ respectively). If the tree file cannot be decoded, the decoder's
-- message is passed to 'warn' and no row is written for this article.
measureIn :: FilePath -> FilePath -> ArticleRecord -> IO ()
measureIn textRoot treeRoot article = do
  sizes <- withFile textFile ReadMode measure
  decoded <- decode <$> ByteString.readFile treeFile
  case decoded of
    Left problem -> warn problem
    Right tree -> writeTSV () [glue3 article sizes (_total tree)]
  where
    textFile = textRoot </> relativePath article "txt"
    treeFile = treeRoot </> relativePath article "tree"
    -- 'hFileSize' runs first (applicative order), before the contents of the
    -- handle are consumed by 'hGetContents'.
    measure h =
      Sizes
        <$> (fromInteger <$> hFileSize h)
        <*> (Text.length <$> hGetContents h)
    glue3 a b c = WithDefaultHeader (a :@: b :@: c)

-- | Entry point: expects exactly two arguments, @TEXT_ROOT@ and @TREE_ROOT@;
-- any other argument list is handed to 'syntax' with the usage string.
main :: IO ()
main = getArgs >>= run
  where
    run [textRoot, treeRoot] = do
      -- Explicit field pattern rather than a record pun, so that no
      -- NamedFieldPuns extension is required.
      Document {rows = articles} <- readTSV ()
      -- The header is written once, before any data row.
      writeTSV () [getHeader (for :: Result)]
      mapM_ (measureIn textRoot treeRoot) articles
    run _ = syntax "TEXT_ROOT TREE_ROOT"