Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE ExplicitNamespaces #-}
import qualified Conllu.UposTagset as Conllu (POS(NOUN))
import qualified Conllu.DeprelTagset as Conllu (EP(APPOS))
import Conllu.Tree
( EP(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..), POS(..)
, Rel(..) )
import Control.Applicative ((<**>), (<|>))
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Data.ByteString as ByteString (readFile)
import Data.Maybe (listToMaybe)
import Data.Serialize (decode)
import Data.Tree (Tree(..))
import GEODE.Metadata
( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue
, readNamedTsv, tsvLines )
import GEODE.Metadata.TSV.Header (for, getHeader)
import Options.Applicative
( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
, short, strOption )
import System.FilePath ((</>))
import System.Script (try, warn)
data Config = Config
{ inputRoot :: FilePath
, inputTsv :: FilePath }
configParser :: Parser Config
configParser = Config
<$> strOption root
<*> strOption inputTsvArg
where
root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
<> help "path to the base directory containing the syntax annotations"
inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
<> help "TSV file containing the article records to process"
getConfig :: IO Config
getConfig = execParser
(info
(configParser <**> helper)
(fullDesc
<> progDesc "A textometric tool to extract nouns at the top of the first sentence"))
type Result = ArticleRecord @ IndexedWord
profile :: ArticleRecord -> ReaderT Config IO (Maybe (WithDefaultHeader Result))
profile articleRecord = do
path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
where
skipAndWarn msg = Nothing <$ warn msg
analyse = pure . fmap (glue articleRecord) . searchDocument
searchDocument :: IndexedDocument -> Maybe IndexedWord
searchDocument (IndexedDocument {_sentences}) =
listToMaybe _sentences >>= firstTopNOUN
firstTopNOUN :: IndexedSentence -> Maybe IndexedWord
firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop
where
fromTop (Node {rootLabel, subForest})
| isNoun (_upos rootLabel) = Just rootLabel
| otherwise = foldl (<|>) Nothing (apposNoun <$> subForest)
apposNoun (Node {rootLabel})
| isNoun (_upos rootLabel) && isAppos (_deprel <$> _rel rootLabel) =
Just rootLabel
| otherwise = Nothing
isNoun (Just (POS Conllu.NOUN)) = True
isNoun _ = False
isAppos (Just (EP Conllu.APPOS)) = True
isAppos _ = False
main :: IO ()
main = getConfig >>= runReaderT chain
where
chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
searchAndDisplay rows = do
liftIO $ tsvLines [getHeader (for :: Result)]
mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows