From 5bec61bff6a6bf2b7350755f05f229b30d8b1a61 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Mon, 1 Jan 2024 18:56:45 +0100
Subject: [PATCH] Add types to represent indexed sentences and documents,
 effectively absorbing part of the work that was done during the search

---
 lib/haskell/Conllu/Tree.hs            | 72 ++++++++++++++++++++-------
 scripts/textometry/serialiseSyntax.hs |  4 +-
 2 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs
index 9a67b37..84ef895 100644
--- a/lib/haskell/Conllu/Tree.hs
+++ b/lib/haskell/Conllu/Tree.hs
@@ -1,12 +1,20 @@
 {-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
 module Conllu.Tree
-  ( Syntax
-  , syntax ) where
-
-import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
+  ( Feat(..)
+  , ID(..)
+  , IndexedDocument(..)
+  , IndexedSentence(..)
+  , IndexedWord(..)
+  , indexDocument
+  , indexSentence
+  , indexWord
+  , positions ) where
+
+import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
 import qualified Conllu.UposTagset as Conllu (POS)
 import qualified Conllu.DeprelTagset as Conllu (EP)
-import Data.List (partition)
+import Data.Int (Int8)
+import Data.List (foldl', partition)
 import Data.Map as Map (Map, empty, insert)
 import Data.Serialize (Serialize(..))
 import Data.Tree (Forest, Tree(..))
@@ -20,13 +28,16 @@ data ID =
 
 instance Serialize ID
 
+enumCast :: (Enum a, Enum b) => a -> b
+enumCast = toEnum . fromEnum
+
 newtype POS = POS Conllu.POS deriving Show
 instance Generic POS where
-  type Rep POS = Rec0 Int
+  type Rep POS = Rec0 Int8
 
-  from (POS c) = K1 (fromEnum c)
+  from (POS c) = K1 (enumCast c)
 
-  to (K1 i) = POS (toEnum i)
+  to (K1 i) = POS (enumCast i)
 
 instance Serialize POS
 
@@ -38,11 +49,11 @@ instance Serialize Feat
 
 newtype EP = EP Conllu.EP deriving Show
 instance Generic EP where
-  type Rep EP = Rec0 Int
+  type Rep EP = Rec0 Int8
 
-  from (EP c) = K1 (fromEnum c)
+  from (EP c) = K1 (enumCast c)
 
-  to (K1 i) = EP (toEnum i)
+  to (K1 i) = EP (enumCast i)
 
 instance Serialize EP
 
@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord
 
 instance Serialize IndexedWord
 
-type Syntax = Forest IndexedWord
+data IndexedSentence = IndexedSentence
+  { _offset :: Int
+  , _syntax :: Forest IndexedWord } deriving (Show, Generic)
 
-getSID :: Conllu.ID -> Maybe Int
-getSID (Conllu.SID n) = Just n
-getSID _ = Nothing
+instance Serialize IndexedSentence
+
+data IndexedDocument = IndexedDocument
+  { _total :: Int
+  , _sentences :: [IndexedSentence] } deriving (Show, Generic)
+
+instance Serialize IndexedDocument
 
 idOfConllu :: Conllu.ID -> ID
 idOfConllu (Conllu.SID i) = SID i
 idOfConllu (Conllu.MID i j) = MID i j
 idOfConllu (Conllu.EID i j) = EID i j
 
+getSID :: Conllu.ID -> Maybe Int
+getSID (Conllu.SID n) = Just n
+getSID _ = Nothing
+
+positions :: Int -> IndexedWord -> [Int]
+positions offset = fmap (offset +) . getIDs . _id
+  where
+    getIDs (SID i) = [i]
+    getIDs (MID i j) = [i..j]
+    getIDs _ = []
+
 featsOfConllu :: [Conllu.Feat] -> FEATS
 featsOfConllu = foldr indexFeat Map.empty
   where
@@ -107,13 +135,21 @@ indexWord cw = IndexedWord
   , _deps = relOfConllu <$> Conllu._deps cw
   , _misc = Conllu._misc cw }
 
-syntax :: Conllu.Sent -> Syntax
-syntax = build 0 . Conllu._words
+indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
+indexSentence offset = IndexedSentence offset . build 0
   where
     build n cws =
       let (pointing, others) = partition (pointingTo n) cws in
       recurseOn others <$> pointing
     recurseOn cws cw = Node
-      { rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
+      { rootLabel = indexWord cw
+      , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
     pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
 
+indexDocument :: Conllu.Doc -> IndexedDocument
+indexDocument doc =
+  IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords}
+  where
+    sentWords = Conllu._words <$> doc
+    (offsets, _total) = foldl' next ([], 0) sentWords
+    next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
diff --git a/scripts/textometry/serialiseSyntax.hs b/scripts/textometry/serialiseSyntax.hs
index 2e0175e..2e86d5e 100755
--- a/scripts/textometry/serialiseSyntax.hs
+++ b/scripts/textometry/serialiseSyntax.hs
@@ -2,7 +2,7 @@
 
 {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
 import Conllu.Parse (parseConllu)
-import Conllu.Tree (syntax)
+import Conllu.Tree (indexDocument)
 import Control.Applicative ((<**>))
 import Control.Monad.IO.Class (MonadIO(..))
 import Control.Monad.Reader (MonadReader, asks, runReaderT)
@@ -50,7 +50,7 @@ toTree articleRecord = do
       path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
       liftIO $ do
         createDirectoryIfMissing True (takeDirectory path)
-        ByteString.writeFile path . encode $ syntax =<< doc
+        ByteString.writeFile path . encode $ indexDocument doc
 
 main :: IO ()
 main = getConfig >>= runReaderT chain
-- 
GitLab