Skip to content
Snippets Groups Projects
measure.hs 1.75 KiB
Newer Older
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib"

{-# LANGUAGE ExplicitNamespaces #-}
import Conllu.Tree (IndexedDocument(..))
import Conllu.Tree.Count (Count(..))
import Data.ByteString as ByteString (readFile)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Serialize (decode)
import Data.Text as Text (length)
import Data.Text.IO as Text (hGetContents)
import GEODE.Metadata
  ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..)
  , WithDefaultHeader(..), WriteTSV(..), for, getHeader )
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.IO (IOMode(ReadMode), hFileSize, withFile)
import System.Script (syntax, warn)

-- | Two size measurements taken on an article's text file.
data Sizes = Sizes
  { bytes      :: Int  -- ^ size of the file, as reported by 'hFileSize'
  , characters :: Int  -- ^ 'Text.length' of the file's decoded contents
  }
  deriving (Eq, Ord, Show, Generic)

-- Both instances are left empty and therefore use the class defaults
-- (presumably the Generic-based ones, so the column names would follow the
-- field names above — confirm against Data.Csv).
instance DefaultOrdered Sizes

instance ToNamedRecord Sizes

-- | Row type whose TSV header is emitted by 'main': an 'ArticleRecord', a
-- 'Sizes' and a 'Count' combined with the '@' operator from GEODE.Metadata.
type Result = ArticleRecord @ Sizes @ Count
--type Result = ArticleRecord @ Sizes @ Count

-- | Measure a single article and write the outcome as one TSV row.
--
-- The article's text file is looked up under @textRoot@ (extension @txt@) and
-- its serialized tree under @treeRoot@ (extension @tree@). The text file is
-- measured first; the tree file is then read and decoded. When decoding
-- fails the message is handed to 'warn' and no row is written.
measureIn :: FilePath -> FilePath -> ArticleRecord -> IO ()
measureIn textRoot treeRoot article = do
  sizes <- withFile textPath ReadMode sizesOf
  serialized <- ByteString.readFile treePath
  case decode serialized of
    Left problem -> warn problem
    Right indexed ->
      writeTSV () [WithDefaultHeader (article :@: sizes :@: _total indexed)]
  where
    textPath = textRoot </> relativePath article "txt"
    treePath = treeRoot </> relativePath article "tree"
    -- The byte size is queried before the contents are consumed; the
    -- character count is the length of the text decoded from the handle.
    sizesOf hdl = do
      byteCount <- hFileSize hdl
      charCount <- Text.length <$> hGetContents hdl
      pure (Sizes (fromInteger byteCount) charCount)

-- | Entry point. Expects exactly two arguments, @TEXT_ROOT@ and @TREE_ROOT@.
--
-- Reads the article records with 'readTSV', writes the header of 'Result'
-- once, then measures every article in turn. Any other argument list is
-- reported through 'syntax'.
main :: IO ()
main = do
  arguments <- getArgs
  case arguments of
    [textRoot, treeRoot] -> do
      Document {rows = articles} <- readTSV ()
      -- The header is written a single time, ahead of the per-article rows.
      writeTSV () [getHeader (for :: Result)]
      mapM_ (measureIn textRoot treeRoot) articles
    _ -> syntax "TEXT_ROOT TREE_ROOT"