Skip to content
Snippets Groups Projects
Commit 46df1768 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a script to serialize syntax trees computed from the .conllu files,...

Add a script to serialize syntax trees computed from the .conllu files, copying part of the Conllu.Type types in the process to get a Generic instance for Serialization and to slightly improve structure by indexing features by name with a Map
parent 0981af9f
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
module Conllu.Tree
( Syntax
, syntax ) where
import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP)
import Data.List (partition)
import Data.Map as Map (Map, empty, insert)
import Data.Serialize (Serialize(..))
import Data.Tree (Forest, Tree(..))
import GHC.Generics (Generic(..), K1(..), Rec0)
data ID =
SID Conllu.Index
| MID Conllu.Index Conllu.Index
| EID Conllu.Index Conllu.Index
deriving (Show, Generic)
instance Serialize ID
newtype POS = POS Conllu.POS deriving Show
instance Generic POS where
type Rep POS = Rec0 Int
from (POS c) = K1 (fromEnum c)
to (K1 i) = POS (toEnum i)
instance Serialize POS
data Feat = Feat
{ _values :: [String]
, _type :: Maybe String } deriving (Show, Generic)
instance Serialize Feat
newtype EP = EP Conllu.EP deriving Show
instance Generic EP where
type Rep EP = Rec0 Int
from (EP c) = K1 (fromEnum c)
to (K1 i) = EP (toEnum i)
instance Serialize EP
data Rel = Rel
{ _head :: ID
, _deprel :: EP
, _subdep :: Maybe String
, _rest :: Maybe [String] } deriving (Show, Generic)
instance Serialize Rel
type FEATS = Map String Feat
data IndexedWord = IndexedWord
{ _id :: ID
, _form :: Conllu.FORM
, _lemma :: Conllu.LEMMA
, _upos :: Maybe POS
, _xpos :: Conllu.XPOS
, _feats :: FEATS
, _rel :: Maybe Rel
, _deps :: [Rel]
, _misc :: Conllu.MISC } deriving (Show, Generic)
instance Serialize IndexedWord
type Syntax = Forest IndexedWord
getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n
getSID _ = Nothing
idOfConllu :: Conllu.ID -> ID
idOfConllu (Conllu.SID i) = SID i
idOfConllu (Conllu.MID i j) = MID i j
idOfConllu (Conllu.EID i j) = EID i j
featsOfConllu :: [Conllu.Feat] -> FEATS
featsOfConllu = foldr indexFeat Map.empty
where
indexFeat feat =
Map.insert
(Conllu._feat feat)
(Feat {_values = Conllu._featValues feat, _type = Conllu._featType feat})
relOfConllu :: Conllu.Rel -> Rel
relOfConllu rel = Rel
{ _head = idOfConllu $ Conllu._head rel
, _deprel = EP $ Conllu._deprel rel
, _subdep = Conllu._subdep rel
, _rest = Conllu._rest rel }
indexWord :: Conllu.CW Conllu.AW -> IndexedWord
indexWord cw = IndexedWord
{ _id = idOfConllu $ Conllu._id cw
, _form = Conllu._form cw
, _lemma = Conllu._lemma cw
, _upos = POS <$> Conllu._upos cw
, _xpos = Conllu._xpos cw
, _feats = featsOfConllu $ Conllu._feats cw
, _rel = relOfConllu <$> Conllu._rel cw
, _deps = relOfConllu <$> Conllu._deps cw
, _misc = Conllu._misc cw }
syntax :: Conllu.Sent -> Syntax
syntax = build 0 . Conllu._words
where
build n cws =
let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing
recurseOn cws cw = Node
{ rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
......@@ -17,5 +17,5 @@ syntax s = do
try :: MonadIO m => m (Either String a) -> m a
try = (>>= either (liftIO . die) pure)
warn :: String -> IO ()
warn = hPutStrLn stderr
warn :: MonadIO m => String -> m ()
warn = liftIO . hPutStrLn stderr
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
import Conllu.Parse (parseConllu)
import Conllu.Tree (syntax)
import Control.Applicative ((<**>))
import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.Reader (MonadReader, asks, runReaderT)
import Data.ByteString as ByteString (writeFile)
import Data.Serialize (encode)
import GEODE.Metadata ( ArticleRecord, Record(..) , readNamedTsv )
import Options.Applicative
( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc
, short, strOption )
import System.Directory (createDirectoryIfMissing)
import System.FilePath ((</>), takeDirectory)
import System.Script (try, warn)
data Config = Config
{ inputRoot :: FilePath
, inputTsv :: FilePath
, outputRoot :: FilePath }
configParser :: Parser Config
configParser = Config
<$> strOption rootOption
<*> strOption tsvOption
<*> strOption outputOption
where
rootOption = short 'r' <> long "root" <> metavar "CONLLU_DIRECTORY"
<> help "path to the base directory containing the syntax annotations"
tsvOption = short 'i' <> long "input" <> metavar "INPUT_TSV"
<> help "TSV file containing the article records to process"
outputOption = short 'o' <> long "output" <> metavar "OUTPUT_DIRECTORY"
<> help "Root directory where to create the trees"
getConfig :: IO Config
getConfig = execParser
(info
(configParser <**> helper)
(fullDesc
<> progDesc "A textometric tool to draw discursive profiles"))
toTree :: (MonadReader Config m, MonadIO m) => ArticleRecord -> m ()
toTree articleRecord = do
path <- asks ((</> relativePath articleRecord "conllu") . inputRoot)
liftIO (readFile path) >>= either warn convert . parseConllu path
where
convert doc = do
path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
liftIO $ do
createDirectoryIfMissing True (takeDirectory path)
ByteString.writeFile path . encode $ syntax =<< doc
main :: IO ()
main = getConfig >>= runReaderT chain
where
chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= mapM_ toTree
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment