Skip to content
Snippets Groups Projects
Commit 922f5f43 authored by Alice Brenon's avatar Alice Brenon
Browse files

More comprehensive measure script taking both sign- and lexical units-levels into account

parent 0124d259
No related branches found
No related tags found
No related merge requests found
...@@ -2,37 +2,48 @@ ...@@ -2,37 +2,48 @@
{-# LANGUAGE ExplicitNamespaces #-} {-# LANGUAGE ExplicitNamespaces #-}
import Conllu.Tree (IndexedDocument(..)) import Conllu.Tree (IndexedDocument(..))
import Conllu.Tree.Count (Count(..))
import Data.ByteString as ByteString (readFile) import Data.ByteString as ByteString (readFile)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) import Data.Csv (DefaultOrdered(..), ToNamedRecord(..))
import Data.Serialize (decode) import Data.Serialize (decode)
import Data.Text as Text (length)
import Data.Text.IO as Text (hGetContents)
import GEODE.Metadata import GEODE.Metadata
( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..) ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..)
, WriteTSV(..), for, getHeader, glue ) , WithDefaultHeader(..), WriteTSV(..), for, getHeader )
import GHC.Generics (Generic) import GHC.Generics (Generic)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.FilePath ((</>)) import System.FilePath ((</>))
import System.IO (IOMode(ReadMode), hFileSize, withFile)
import System.Script (syntax, warn) import System.Script (syntax, warn)
newtype Size = Size { size :: Int } deriving (Eq, Generic, Ord, Show) data Sizes = Sizes
{ bytes :: Int
, characters :: Int } deriving (Eq, Generic, Ord, Show)
instance DefaultOrdered Size instance DefaultOrdered Sizes
instance ToNamedRecord Size instance ToNamedRecord Sizes
type Result = ArticleRecord @ Size type Result = ArticleRecord @ Sizes @ Count
--type Result = ArticleRecord @ Sizes @ Count
measureIn :: FilePath -> ArticleRecord -> IO () measureIn :: FilePath -> FilePath -> ArticleRecord -> IO ()
measureIn inputRoot article = measureIn textRoot treeRoot article = do
ByteString.readFile path >>= either warn measure . decode sizes <- withFile (textRoot </> relativePath article "txt") ReadMode measure
ByteString.readFile treeFile
>>= either warn (writeTSV () . (:[]) . glue3 article sizes . _total) . decode
where where
path = inputRoot </> relativePath article "tree" treeFile = treeRoot </> relativePath article "tree"
--skipAndWarn msg = Nothing <$ warn msg measure h = Sizes
measure (IndexedDocument {_total}) = writeTSV () [glue article $ Size _total] <$> (fromInteger <$> hFileSize h)
<*> (Text.length <$> hGetContents h)
glue3 a b c = WithDefaultHeader (a :@: b :@: c)
main :: IO () main :: IO ()
main = getArgs >>= run main = getArgs >>= run
where where
run [inputRoot] = do run [textRoot, treeRoot] = do
Document {rows} <- readTSV () Document {rows} <- readTSV ()
writeTSV () [getHeader (for :: Result)] writeTSV () [getHeader (for :: Result)]
mapM_ (measureIn inputRoot) rows mapM_ (measureIn textRoot treeRoot) rows
run _ = syntax "INPUT_ROOT" run _ = syntax "TEXT_ROOT TREE_ROOT"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment