Newer
Older
Alice Brenon
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib"
{-# LANGUAGE ExplicitNamespaces #-}
import Conllu.Tree (IndexedDocument(..))
import Conllu.Tree.Count (Count(..))
import Data.ByteString as ByteString (readFile)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Serialize (decode)
import Data.Text as Text (length)
import Data.Text.IO as Text (hGetContents)
import GEODE.Metadata
( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..)
, WithDefaultHeader(..), WriteTSV(..), for, getHeader )
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.IO (IOMode(ReadMode), hFileSize, withFile)
import System.Script (syntax, warn)
-- | Two measurements of a single article's text file.
data Sizes =
  Sizes
    { -- | Size of the text file in bytes (filled from 'hFileSize').
      bytes :: Int
    , -- | Number of characters in the file's contents once read as 'Text'.
      characters :: Int
    }
  deriving (Eq, Generic, Ord, Show)
-- CSV/TSV column support for 'Sizes'. Both instance bodies are empty, so the
-- class default methods apply (presumably cassava's Generic-based defaults,
-- which would name the columns after the record fields @bytes@ and
-- @characters@ -- TODO confirm against the Data.Csv documentation).
instance DefaultOrdered Sizes
instance ToNamedRecord Sizes
-- | Shape of one output row: an 'ArticleRecord' combined with its 'Sizes' and
-- a 'Count' through the '@' type operator. 'main' derives the output header
-- from this type.
type Result = ArticleRecord @ Sizes @ Count
-- | Measure one article and emit its row.
--
-- Reads the article's @txt@ file under @textRoot@ to obtain its 'Sizes', then
-- decodes the article's serialized @tree@ file under @treeRoot@. When decoding
-- succeeds, a single row (article, sizes, and the tree's '_total') is handed
-- to 'writeTSV'; when it fails, the decoder's message is passed to 'warn' and
-- no row is written.
measureIn :: FilePath -> FilePath -> ArticleRecord -> IO ()
measureIn textRoot treeRoot article = do
  -- The text file is measured first, before the tree file is even opened.
  sizes <- withFile textPath ReadMode sizesOf
  serialized <- ByteString.readFile treePath
  case decode serialized of
    Left problem -> warn problem
    Right tree ->
      writeTSV () [WithDefaultHeader (article :@: sizes :@: _total tree)]
  where
    textPath = textRoot </> relativePath article "txt"
    treePath = treeRoot </> relativePath article "tree"
    -- Query the byte size before consuming the handle's contents.
    sizesOf handle = do
      byteCount <- fromInteger <$> hFileSize handle
      charCount <- Text.length <$> hGetContents handle
      pure (Sizes byteCount charCount)
-- | Entry point.
--
-- Expects exactly two command-line arguments, @TEXT_ROOT@ and @TREE_ROOT@.
-- With them, it reads the article records via 'readTSV', writes the header
-- for 'Result' once, then runs 'measureIn' on every row in order. Any other
-- argument list is handed to 'syntax' with the usage string.
main :: IO ()
main = do
  arguments <- getArgs
  case arguments of
    [textRoot, treeRoot] -> do
      Document {rows = articles} <- readTSV ()
      writeTSV () [getHeader (for :: Result)]
      mapM_ (measureIn textRoot treeRoot) articles
    _ -> syntax "TEXT_ROOT TREE_ROOT"