Skip to content
Snippets Groups Projects
Commit 922f5f43 authored by Alice Brenon's avatar Alice Brenon
Browse files

More comprehensive measure script taking both sign- and lexical units-levels into account

parent 0124d259
No related branches found
No related tags found
No related merge requests found
......@@ -2,37 +2,48 @@
{-# LANGUAGE ExplicitNamespaces #-}
import Conllu.Tree (IndexedDocument(..))
import Conllu.Tree.Count (Count(..))
import Data.ByteString as ByteString (readFile)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Serialize (decode)
import Data.Text as Text (length)
import Data.Text.IO as Text (hGetContents)
import GEODE.Metadata
( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..)
, WriteTSV(..), for, getHeader, glue )
, WithDefaultHeader(..), WriteTSV(..), for, getHeader )
import GHC.Generics (Generic)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.IO (IOMode(ReadMode), hFileSize, withFile)
import System.Script (syntax, warn)
newtype Size = Size { size :: Int } deriving (Eq, Generic, Ord, Show)
data Sizes = Sizes
{ bytes :: Int
, characters :: Int } deriving (Eq, Generic, Ord, Show)
instance DefaultOrdered Size
instance ToNamedRecord Size
instance DefaultOrdered Sizes
instance ToNamedRecord Sizes
type Result = ArticleRecord @ Size
type Result = ArticleRecord @ Sizes @ Count
--type Result = ArticleRecord @ Sizes @ Count
measureIn :: FilePath -> ArticleRecord -> IO ()
measureIn inputRoot article =
ByteString.readFile path >>= either warn measure . decode
measureIn :: FilePath -> FilePath -> ArticleRecord -> IO ()
measureIn textRoot treeRoot article = do
sizes <- withFile (textRoot </> relativePath article "txt") ReadMode measure
ByteString.readFile treeFile
>>= either warn (writeTSV () . (:[]) . glue3 article sizes . _total) . decode
where
path = inputRoot </> relativePath article "tree"
--skipAndWarn msg = Nothing <$ warn msg
measure (IndexedDocument {_total}) = writeTSV () [glue article $ Size _total]
treeFile = treeRoot </> relativePath article "tree"
measure h = Sizes
<$> (fromInteger <$> hFileSize h)
<*> (Text.length <$> hGetContents h)
glue3 a b c = WithDefaultHeader (a :@: b :@: c)
main :: IO ()
main = getArgs >>= run
where
run [inputRoot] = do
run [textRoot, treeRoot] = do
Document {rows} <- readTSV ()
writeTSV () [getHeader (for :: Result)]
mapM_ (measureIn inputRoot) rows
run _ = syntax "INPUT_ROOT"
mapM_ (measureIn textRoot treeRoot) rows
run _ = syntax "TEXT_ROOT TREE_ROOT"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment