Newer
Older
Alice Brenon
committed
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces #-}
import Conllu.Parse (parseConllu)
import Conllu.Type (AW, CW(..), Doc, Feat(..), ID(..), Rel(..), Sent(..))
import Control.Applicative ((<**>))
Alice Brenon
committed
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
Alice Brenon
committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import Data.ByteString.Lazy as ByteString (putStr)
import Data.Csv (ToNamedRecord(..), encodeWith)
import Data.List (find, foldl', partition, sort)
import Data.Tree (Forest, Tree(..))
import GEODE.Metadata
( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..)
, WithDefaultHeader(..), glue, readNamedTsv, toTsv, tsvLines )
import GEODE.Metadata.TSV.Header (for, getHeader)
import GHC.Generics (Generic)
import Options.Applicative
( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
, short, strOption )
import System.FilePath ((</>))
import System.Script (try, warn)
data Config = Config
{ inputRoot :: FilePath
, inputTsv :: FilePath }
configParser :: Parser Config
configParser = Config
<$> strOption root
<*> strOption inputTsvArg
where
root = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
<> help "path to the base directory containing the syntax annotations"
inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
<> help "TSV file containing the article records to process"
getConfig :: IO Config
getConfig = execParser
(info
(configParser <**> helper)
(fullDesc
<> progDesc "A textometric tool to draw discursive profiles"))
data Position = Position
{ totalSize :: Int
, position :: Int } deriving Generic
instance ToNamedRecord Position
instance HasDefaultHeader Position where
defaultFields = DefaultFields ["position", "totalSize"]
type Measure = ArticleRecord @ Position
Alice Brenon
committed
profile :: ArticleRecord -> ReaderT Config IO [WithDefaultHeader Measure]
profile articleRecord = do
path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
liftIO $ readFile path >>= either skipAndWarn analyse . parseConllu path
Alice Brenon
committed
where
skipAndWarn msg = [] <$ warn msg
analyse = pure . fmap (glue articleRecord) . findOccurrences
findOccurrences :: Doc -> [Position]
findOccurrences doc =
Position total <$> (zip (reverse offsets) doc >>= imperativeVerb)
where
(offsets, total) = foldl' next ([], 0) $ _words <$> doc
Alice Brenon
committed
next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
Alice Brenon
committed
getSID :: ID -> Maybe Int
getSID (SID n) = Just n
getSID _ = Nothing
imperativeVerb :: (Int, Sent) -> [Int]
imperativeVerb (offset, sentence) = sort (syntax sentence >>= findSpans False)
where
findSpans isImperative (Node {rootLabel, subForest}) =
case getMoods $ _feats rootLabel of
Just moods -> let nowImperative = moods == ["Imp"] in
consIf rootLabel nowImperative (findSpans nowImperative =<< subForest)
Nothing ->
consIf rootLabel isImperative (findSpans isImperative =<< subForest)
consIf a b = if b then (maybe id ((:) . (offset+)) $ getSID (_id a)) else id
getMoods = fmap _featValues . find (("Mood" ==) . _feat)
type Syntax = Forest (CW AW)
syntax :: Sent -> Syntax
syntax = build 0 . _words
where
build n cws =
let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing
recurseOn cws cw = Node
{ rootLabel = cw, subForest = maybe [] (`build` cws) $ getSID (_id cw) }
pointingTo n cw = maybe False (n ==) (getSID . _head =<< _rel cw)
main :: IO ()
Alice Brenon
committed
main = getConfig >>= runReaderT chain
Alice Brenon
committed
where
Alice Brenon
committed
chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
searchAndDisplay rows = do
liftIO . ByteString.putStr $ encodeWith toTsv [getHeader (for :: Measure)]
mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows