#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"

{-# LANGUAGE ExplicitNamespaces #-}
import qualified Conllu.UposTagset as Conllu (POS(NOUN))
import qualified Conllu.DeprelTagset as Conllu (EP(APPOS))
import Conllu.Tree
  ( EP(..), IndexedDocument(..), IndexedSentence(..), IndexedWord(..), POS(..)
  , Rel(..) )
import Control.Applicative ((<**>), (<|>))
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (ReaderT, asks, runReaderT)
import Data.ByteString as ByteString (readFile)
import Data.Maybe (listToMaybe)
import Data.Serialize (decode)
import Data.Tree (Tree(..))
import GEODE.Metadata
  ( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue
  , readNamedTsv, tsvLines )
import GEODE.Metadata.TSV.Header (for, getHeader)
import Options.Applicative
  ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
  , short, strOption )
import System.FilePath ((</>))
import System.Script (try, warn)

data Config = Config
  { inputRoot :: FilePath
  , inputTsv :: FilePath }

configParser :: Parser Config
configParser = Config
  <$> strOption root
  <*> strOption inputTsvArg
  where
    root = short 'r' <> long "root" <> metavar "TREE_DIRECTORY"
          <> help "path to the base directory containing the syntax annotations"
    inputTsvArg = short 'i' <> long "input" <> metavar "INPUT_TSV"
          <> help "TSV file containing the article records to process"

getConfig :: IO Config
getConfig = execParser
  (info
    (configParser <**> helper)
    (fullDesc
    <> progDesc "A textometric tool to extract nouns at the top of the first sentence"))

type Result = ArticleRecord @ IndexedWord

profile :: ArticleRecord -> ReaderT Config IO (Maybe (WithDefaultHeader Result))
profile articleRecord = do
  path <- asks ((</> relativePath articleRecord "tree") . inputRoot)
  liftIO $ ByteString.readFile path >>= either skipAndWarn analyse . decode
  where
    skipAndWarn msg = Nothing <$ warn msg
    analyse = pure . fmap (glue articleRecord) . searchDocument

searchDocument :: IndexedDocument -> Maybe IndexedWord
searchDocument (IndexedDocument {_sentences}) =
  listToMaybe _sentences >>= firstTopNOUN

firstTopNOUN :: IndexedSentence -> Maybe IndexedWord
firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop
  where
    fromTop (Node {rootLabel, subForest})
      | isNoun (_upos rootLabel) = Just rootLabel
      | otherwise = foldl (<|>) Nothing (apposNoun <$> subForest)
    apposNoun (Node {rootLabel})
      | isNoun (_upos rootLabel) && isAppos (_deprel <$> _rel rootLabel) =
        Just rootLabel
      | otherwise = Nothing
    isNoun (Just (POS Conllu.NOUN)) = True
    isNoun _ = False
    isAppos (Just (EP Conllu.APPOS)) = True
    isAppos _ = False

main :: IO ()
main = getConfig >>= runReaderT chain
  where
    chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay
    searchAndDisplay rows = do
      liftIO $ tsvLines [getHeader (for :: Result)]
      mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows