diff --git a/scripts/textometry/indexSyntaxTrees.hs b/scripts/textometry/index-syntax-trees.hs similarity index 92% rename from scripts/textometry/indexSyntaxTrees.hs rename to scripts/textometry/index-syntax-trees.hs index 2e86d5e1a13787b885dfa67d0c242fc4aa5f5ac6..3e1731affa23ddcaadd4c07a632dcca2bbdeaf3e 100755 --- a/scripts/textometry/indexSyntaxTrees.hs +++ b/scripts/textometry/index-syntax-trees.hs @@ -8,13 +8,13 @@ import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.Reader (MonadReader, asks, runReaderT) import Data.ByteString as ByteString (writeFile) import Data.Serialize (encode) -import GEODE.Metadata ( ArticleRecord, Record(..) , readNamedTsv ) +import GEODE.Metadata (ArticleRecord, Document(..), Record(..), ReadTSV(..)) import Options.Applicative ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc , short, strOption ) import System.Directory (createDirectoryIfMissing) import System.FilePath ((</>), takeDirectory) -import System.Script (try, warn) +import System.Script (warn) data Config = Config { inputRoot :: FilePath @@ -55,4 +55,4 @@ toTree articleRecord = do main :: IO () main = getConfig >>= runReaderT chain where - chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= mapM_ toTree + chain = asks inputTsv >>= liftIO . readTSV >>= mapM_ toTree . rows diff --git a/scripts/textometry/size.hs b/scripts/textometry/size.hs new file mode 100755 index 0000000000000000000000000000000000000000..d67703c519c3f168500f3572cb14dad1c945344e --- /dev/null +++ b/scripts/textometry/size.hs @@ -0,0 +1,38 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" --ghc-arg="-i ../ghc-geode/lib" + +{-# LANGUAGE ExplicitNamespaces #-} +import Conllu.Tree (IndexedDocument(..)) +import Data.ByteString as ByteString (readFile) +import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) +import Data.Serialize (decode) +import GEODE.Metadata + ( type(@)(..), ArticleRecord(..), Document(..), ReadTSV(..), Record(..) + , WriteTSV(..), for, getHeader, glue ) +import GHC.Generics (Generic) +import System.Environment (getArgs) +import System.FilePath ((</>)) +import System.Script (syntax, warn) + +newtype Size = Size { size :: Int } deriving (Eq, Generic, Ord, Show) + +instance DefaultOrdered Size +instance ToNamedRecord Size + +type Result = ArticleRecord @ Size + +measureIn :: FilePath -> ArticleRecord -> IO () +measureIn inputRoot article = + ByteString.readFile path >>= either warn measure . decode + where + path = inputRoot </> relativePath article "tree" + --skipAndWarn msg = Nothing <$ warn msg + measure (IndexedDocument {_total}) = writeTSV () [glue article $ Size _total] + +main :: IO () +main = getArgs >>= run + where + run [inputRoot] = do + Document {rows} <- readTSV () + writeTSV () [getHeader (for :: Result)] + mapM_ (measureIn inputRoot) rows + run _ = syntax "INPUT_ROOT" diff --git a/scripts/textometry/txm-results.hs b/scripts/textometry/txm-results.hs new file mode 100755 index 0000000000000000000000000000000000000000..0168291b4a68c343b8cdb11818c8b96b8d1c567a --- /dev/null +++ b/scripts/textometry/txm-results.hs @@ -0,0 +1,47 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-ilib/haskell" --ghc-arg="-i/home/alice/Logiciel/ghc-geode/lib/GEODE/Metadata" + +{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} +import Control.Monad (foldM) +import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) +import Data.Text as Text (Text, split, unpack) +import Data.Text.IO as Text (readFile) +import Data.Vector as Vector (fromList) +import GEODE.Metadata (type(@)(..), ArticleRecord(..), Document(..), Record(..), WriteTSV(..), for, getHeader) +import GHC.Generics (Generic) +import System.Environment (getArgs) +import System.Script (syntax, warn) +import Text.Filter (Editable(..)) +import Text.Printf (printf) + +newtype Result = Result { result :: Text } deriving (Eq, Generic, Ord, Show) + +instance DefaultOrdered Result +instance ToNamedRecord Result + +type Row = ArticleRecord @ Result +getResults :: [Text] -> IO (Document Row) +getResults = fmap buildDoc . foldM parseRow [] . drop 1 + where + buildDoc stack = Document + { header = getHeader (for :: Row) + , rows = Vector.fromList $ reverse stack } + +parseRow :: [Row] -> Text -> IO [Row] +parseRow rows row = pickColumns (Text.split (== '\t') row) + where + pickColumns (uid: _: pivot: _) = + buildRow (onlyTarget pivot) . fromUID $ Text.unpack uid + pickColumns _ = + rows <$ warn (printf "Found less than 3 columns in row: %s" row) + buildRow _ (Left errorMessage) = rows <$ warn errorMessage + buildRow result (Right article) = pure $ (article :@: Result {result}):rows + onlyTarget t = + let l = Text.split (`elem` ['[', ']']) t in + if length l > 2 then l !! 1 else t + +main :: IO () +main = getArgs >>= run + where + run [input] = + Text.readFile input >>= getResults . enter >>= writeTSV () + run _ = syntax "INPUT_TSV"