From 5bec61bff6a6bf2b7350755f05f229b30d8b1a61 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Mon, 1 Jan 2024 18:56:45 +0100 Subject: [PATCH] Add types to represent indexed sentences and documents, effectively absorbing part of the work that was done during the search --- lib/haskell/Conllu/Tree.hs | 72 ++++++++++++++++++++------- scripts/textometry/serialiseSyntax.hs | 4 +- 2 files changed, 56 insertions(+), 20 deletions(-) diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs index 9a67b37..84ef895 100644 --- a/lib/haskell/Conllu/Tree.hs +++ b/lib/haskell/Conllu/Tree.hs @@ -1,12 +1,20 @@ {-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} module Conllu.Tree - ( Syntax - , syntax ) where - -import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) + ( Feat(..) + , ID(..) + , IndexedDocument(..) + , IndexedSentence(..) + , IndexedWord(..) + , indexDocument + , indexSentence + , indexWord + , positions ) where + +import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.DeprelTagset as Conllu (EP) -import Data.List (partition) +import Data.Int (Int8) +import Data.List (foldl', partition) import Data.Map as Map (Map, empty, insert) import Data.Serialize (Serialize(..)) import Data.Tree (Forest, Tree(..)) @@ -20,13 +28,16 @@ data ID = instance Serialize ID +enumCast :: (Enum a, Enum b) => a -> b +enumCast = toEnum . fromEnum + newtype POS = POS Conllu.POS deriving Show instance Generic POS where - type Rep POS = Rec0 Int + type Rep POS = Rec0 Int8 - from (POS c) = K1 (fromEnum c) + from (POS c) = K1 (enumCast c) - to (K1 i) = POS (toEnum i) + to (K1 i) = POS (enumCast i) instance Serialize POS @@ -38,11 +49,11 @@ instance Serialize Feat newtype EP = EP Conllu.EP deriving Show instance Generic EP where - type Rep EP = Rec0 Int + type Rep EP = Rec0 Int8 - from (EP c) = K1 (fromEnum c) + from (EP c) = K1 (enumCast c) - to (K1 i) = EP (toEnum i) + to (K1 i) = EP (enumCast i) instance Serialize EP @@ -69,17 +80,34 @@ data IndexedWord = IndexedWord instance Serialize IndexedWord -type Syntax = Forest IndexedWord +data IndexedSentence = IndexedSentence + { _offset :: Int + , _syntax :: Forest IndexedWord } deriving (Show, Generic) -getSID :: Conllu.ID -> Maybe Int -getSID (Conllu.SID n) = Just n -getSID _ = Nothing +instance Serialize IndexedSentence + +data IndexedDocument = IndexedDocument + { _total :: Int + , _sentences :: [IndexedSentence] } deriving (Show, Generic) + +instance Serialize IndexedDocument idOfConllu :: Conllu.ID -> ID idOfConllu (Conllu.SID i) = SID i idOfConllu (Conllu.MID i j) = MID i j idOfConllu (Conllu.EID i j) = EID i j +getSID :: Conllu.ID -> Maybe Int +getSID (Conllu.SID n) = Just n +getSID _ = Nothing + +positions :: Int -> IndexedWord -> [Int] +positions offset = fmap (offset +) . getIDs . _id + where + getIDs (SID i) = [i] + getIDs (MID i j) = [i..j] + getIDs _ = [] + featsOfConllu :: [Conllu.Feat] -> FEATS featsOfConllu = foldr indexFeat Map.empty where @@ -107,13 +135,21 @@ indexWord cw = IndexedWord , _deps = relOfConllu <$> Conllu._deps cw , _misc = Conllu._misc cw } -syntax :: Conllu.Sent -> Syntax -syntax = build 0 . Conllu._words +indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence +indexSentence offset = IndexedSentence offset . build 0 where build n cws = let (pointing, others) = partition (pointingTo n) cws in recurseOn others <$> pointing recurseOn cws cw = Node - { rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) } + { rootLabel = indexWord cw + , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) } pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw) +indexDocument :: Conllu.Doc -> IndexedDocument +indexDocument doc = + IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords} + where + sentWords = Conllu._words <$> doc + (offsets, _total) = foldl' next ([], 0) sentWords + next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s) diff --git a/scripts/textometry/serialiseSyntax.hs b/scripts/textometry/serialiseSyntax.hs index 2e0175e..2e86d5e 100755 --- a/scripts/textometry/serialiseSyntax.hs +++ b/scripts/textometry/serialiseSyntax.hs @@ -2,7 +2,7 @@ {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} import Conllu.Parse (parseConllu) -import Conllu.Tree (syntax) +import Conllu.Tree (indexDocument) import Control.Applicative ((<**>)) import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.Reader (MonadReader, asks, runReaderT) @@ -50,7 +50,7 @@ toTree articleRecord = do path <- asks ((</> relativePath articleRecord "tree") . outputRoot) liftIO $ do createDirectoryIfMissing True (takeDirectory path) - ByteString.writeFile path . encode $ syntax =<< doc + ByteString.writeFile path . encode $ indexDocument doc main :: IO () main = getConfig >>= runReaderT chain -- GitLab