From 922f5f43d027d7f73c0a73a043f4f83aa317cf91 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Mon, 29 Jan 2024 16:56:32 +0100 Subject: [PATCH] More comprehensive measure script taking both sign- and lexical units-levels into account --- scripts/textometry/measure.hs | 49 +++++++++++++++++++++++++++++++++++ scripts/textometry/size.hs | 38 --------------------------- 2 files changed, 49 insertions(+), 38 deletions(-) create mode 100755 scripts/textometry/measure.hs delete mode 100755 scripts/textometry/size.hs diff --git a/scripts/textometry/measure.hs b/scripts/textometry/measure.hs new file mode 100755 index 0000000..5f751d4 --- /dev/null +++ b/scripts/textometry/measure.hs @@ -0,0 +1,49 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib" + +{-# LANGUAGE ExplicitNamespaces #-} +import Conllu.Tree (IndexedDocument(..)) +import Conllu.Tree.Count (Count(..)) +import Data.ByteString as ByteString (readFile) +import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) +import Data.Serialize (decode) +import Data.Text as Text (length) +import Data.Text.IO as Text (hGetContents) +import GEODE.Metadata + ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..) + , WithDefaultHeader(..), WriteTSV(..), for, getHeader ) +import GHC.Generics (Generic) +import System.Environment (getArgs) +import System.FilePath ((</>)) +import System.IO (IOMode(ReadMode), hFileSize, withFile) +import System.Script (syntax, warn) + +data Sizes = Sizes + { bytes :: Int + , characters :: Int } deriving (Eq, Generic, Ord, Show) + +instance DefaultOrdered Sizes +instance ToNamedRecord Sizes + +type Result = ArticleRecord @ Sizes @ Count +--type Result = ArticleRecord @ Sizes @ Count + +measureIn :: FilePath -> FilePath -> ArticleRecord -> IO () +measureIn textRoot treeRoot article = do + sizes <- withFile (textRoot </> relativePath article "txt") ReadMode measure + ByteString.readFile treeFile + >>= either warn (writeTSV () . (:[]) . glue3 article sizes . _total) . decode + where + treeFile = treeRoot </> relativePath article "tree" + measure h = Sizes + <$> (fromInteger <$> hFileSize h) + <*> (Text.length <$> hGetContents h) + glue3 a b c = WithDefaultHeader (a :@: b :@: c) + +main :: IO () +main = getArgs >>= run + where + run [textRoot, treeRoot] = do + Document {rows} <- readTSV () + writeTSV () [getHeader (for :: Result)] + mapM_ (measureIn textRoot treeRoot) rows + run _ = syntax "TEXT_ROOT TREE_ROOT" diff --git a/scripts/textometry/size.hs b/scripts/textometry/size.hs deleted file mode 100755 index d67703c..0000000 --- a/scripts/textometry/size.hs +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib" - -{-# LANGUAGE ExplicitNamespaces #-} -import Conllu.Tree (IndexedDocument(..)) -import Data.ByteString as ByteString (readFile) -import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) -import Data.Serialize (decode) -import GEODE.Metadata - ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..) - , WriteTSV(..), for, getHeader, glue ) -import GHC.Generics (Generic) -import System.Environment (getArgs) -import System.FilePath ((</>)) -import System.Script (syntax, warn) - -newtype Size = Size { size :: Int } deriving (Eq, Generic, Ord, Show) - -instance DefaultOrdered Size -instance ToNamedRecord Size - -type Result = ArticleRecord @ Size - -measureIn :: FilePath -> ArticleRecord -> IO () -measureIn inputRoot article = - ByteString.readFile path >>= either warn measure . decode - where - path = inputRoot </> relativePath article "tree" - --skipAndWarn msg = Nothing <$ warn msg - measure (IndexedDocument {_total}) = writeTSV () [glue article $ Size _total] - -main :: IO () -main = getArgs >>= run - where - run [inputRoot] = do - Document {rows} <- readTSV () - writeTSV () [getHeader (for :: Result)] - mapM_ (measureIn inputRoot) rows - run _ = syntax "INPUT_ROOT" -- GitLab