diff --git a/scripts/textometry/topNOUN.hs b/scripts/textometry/topNOUN.hs new file mode 100755 index 0000000000000000000000000000000000000000..f2134f96a1c08b9ef62cae288446847e5087a97d --- /dev/null +++ b/scripts/textometry/topNOUN.hs @@ -0,0 +1,82 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" + +{-# LANGUAGE ExplicitNamespaces #-} +import qualified Conllu.UposTagset as Conllu (POS(NOUN)) +import qualified Conllu.DeprelTagset as Conllu (EP(APPOS)) +import Conllu.Tree + ( EP(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..), POS(..) + , Rel(..) ) +import Control.Applicative ((<**>), (<|>)) +import Control.Monad.IO.Class (liftIO) +import Control.Monad.Reader (ReaderT, asks, runReaderT) +import Data.ByteString as ByteString (readFile) +import Data.Maybe (listToMaybe) +import Data.Serialize (decode) +import Data.Tree (Tree(..)) +import GEODE.Metadata + ( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue + , readNamedTsv, tsvLines ) +import GEODE.Metadata.TSV.Header (for, getHeader) +import Options.Applicative + ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc + , short, strOption ) +import System.FilePath ((</>)) +import System.Script (try, warn) + +data Config = Config + { inputRoot :: FilePath + , inputTsv :: FilePath } + +configParser :: Parser Config +configParser = Config + <$> strOption root + <*> strOption inputTsvArg + where + root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY" + <> help "path to the base directory containing the syntax annotations" + inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV" + <> help "TSV file containing the article records to process" + +getConfig :: IO Config +getConfig = execParser + (info + (configParser <**> helper) + (fullDesc + <> progDesc "A textometric tool to extract nouns at the top of the first sentence")) + +type Result = ArticleRecord @ IndexedWord + +profile :: ArticleRecord -> ReaderT Config IO (Maybe (WithDefaultHeader Result)) +profile articleRecord = do + path <- asks ((</> relativePath articleRecord "tree") . inputRoot) + liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode + where + skipAndWarn msg = Nothing <$ warn msg + analyse = pure . fmap (glue articleRecord) . searchDocument + +searchDocument :: IndexedDocument -> Maybe IndexedWord +searchDocument (IndexedDocument {_sentences}) = + listToMaybe _sentences >>= firstTopNOUN + +firstTopNOUN :: IndexedSentence -> Maybe IndexedWord +firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop + where + fromTop (Node {rootLabel, subForest}) + | isNoun (_upos rootLabel) = Just rootLabel + | otherwise = foldl (<|>) Nothing (apposNoun <$> subForest) + apposNoun (Node {rootLabel}) + | isNoun (_upos rootLabel) && isAppos (_deprel <$> _rel rootLabel) = + Just rootLabel + | otherwise = Nothing + isNoun (Just (POS Conllu.NOUN)) = True + isNoun _ = False + isAppos (Just (EP Conllu.APPOS)) = True + isAppos _ = False + +main :: IO () +main = getConfig >>= runReaderT chain + where + chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay + searchAndDisplay rows = do + liftIO $ tsvLines [getHeader (for :: Result)] + mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows