Skip to content
Snippets Groups Projects
Commit 5bec61bf authored by Alice Brenon's avatar Alice Brenon
Browse files

Add types to represent indexed sentences and documents, effectively absorbing...

Add types to represent indexed sentences and documents, effectively absorbing part of the work that was done during the search
parent 46df1768
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} {-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
module Conllu.Tree module Conllu.Tree
( Syntax ( Feat(..)
, syntax ) where , ID(..)
, IndexedDocument(..)
import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) , IndexedSentence(..)
, IndexedWord(..)
, indexDocument
, indexSentence
, indexWord
, positions ) where
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP) import qualified Conllu.DeprelTagset as Conllu (EP)
import Data.List (partition) import Data.Int (Int8)
import Data.List (foldl', partition)
import Data.Map as Map (Map, empty, insert) import Data.Map as Map (Map, empty, insert)
import Data.Serialize (Serialize(..)) import Data.Serialize (Serialize(..))
import Data.Tree (Forest, Tree(..)) import Data.Tree (Forest, Tree(..))
...@@ -20,13 +28,16 @@ data ID = ...@@ -20,13 +28,16 @@ data ID =
instance Serialize ID instance Serialize ID
enumCast :: (Enum a, Enum b) => a -> b
enumCast = toEnum . fromEnum
newtype POS = POS Conllu.POS deriving Show newtype POS = POS Conllu.POS deriving Show
instance Generic POS where instance Generic POS where
type Rep POS = Rec0 Int type Rep POS = Rec0 Int8
from (POS c) = K1 (fromEnum c) from (POS c) = K1 (enumCast c)
to (K1 i) = POS (toEnum i) to (K1 i) = POS (enumCast i)
instance Serialize POS instance Serialize POS
...@@ -38,11 +49,11 @@ instance Serialize Feat ...@@ -38,11 +49,11 @@ instance Serialize Feat
newtype EP = EP Conllu.EP deriving Show newtype EP = EP Conllu.EP deriving Show
instance Generic EP where instance Generic EP where
type Rep EP = Rec0 Int type Rep EP = Rec0 Int8
from (EP c) = K1 (fromEnum c) from (EP c) = K1 (enumCast c)
to (K1 i) = EP (toEnum i) to (K1 i) = EP (enumCast i)
instance Serialize EP instance Serialize EP
...@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord ...@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord
instance Serialize IndexedWord instance Serialize IndexedWord
type Syntax = Forest IndexedWord data IndexedSentence = IndexedSentence
{ _offset :: Int
, _syntax :: Forest IndexedWord } deriving (Show, Generic)
getSID :: Conllu.ID -> Maybe Int instance Serialize IndexedSentence
getSID (Conllu.SID n) = Just n
getSID _ = Nothing data IndexedDocument = IndexedDocument
{ _total :: Int
, _sentences :: [IndexedSentence] } deriving (Show, Generic)
instance Serialize IndexedDocument
idOfConllu :: Conllu.ID -> ID idOfConllu :: Conllu.ID -> ID
idOfConllu (Conllu.SID i) = SID i idOfConllu (Conllu.SID i) = SID i
idOfConllu (Conllu.MID i j) = MID i j idOfConllu (Conllu.MID i j) = MID i j
idOfConllu (Conllu.EID i j) = EID i j idOfConllu (Conllu.EID i j) = EID i j
getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n
getSID _ = Nothing
positions :: Int -> IndexedWord -> [Int]
positions offset = fmap (offset +) . getIDs . _id
where
getIDs (SID i) = [i]
getIDs (MID i j) = [i..j]
getIDs _ = []
featsOfConllu :: [Conllu.Feat] -> FEATS featsOfConllu :: [Conllu.Feat] -> FEATS
featsOfConllu = foldr indexFeat Map.empty featsOfConllu = foldr indexFeat Map.empty
where where
...@@ -107,13 +135,21 @@ indexWord cw = IndexedWord ...@@ -107,13 +135,21 @@ indexWord cw = IndexedWord
, _deps = relOfConllu <$> Conllu._deps cw , _deps = relOfConllu <$> Conllu._deps cw
, _misc = Conllu._misc cw } , _misc = Conllu._misc cw }
syntax :: Conllu.Sent -> Syntax indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
syntax = build 0 . Conllu._words indexSentence offset = IndexedSentence offset . build 0
where where
build n cws = build n cws =
let (pointing, others) = partition (pointingTo n) cws in let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing recurseOn others <$> pointing
recurseOn cws cw = Node recurseOn cws cw = Node
{ rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) } { rootLabel = indexWord cw
, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw) pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
indexDocument :: Conllu.Doc -> IndexedDocument
indexDocument doc =
IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords}
where
sentWords = Conllu._words <$> doc
(offsets, _total) = foldl' next ([], 0) sentWords
next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
import Conllu.Parse (parseConllu) import Conllu.Parse (parseConllu)
import Conllu.Tree (syntax) import Conllu.Tree (indexDocument)
import Control.Applicative ((<**>)) import Control.Applicative ((<**>))
import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.Reader (MonadReader, asks, runReaderT) import Control.Monad.Reader (MonadReader, asks, runReaderT)
...@@ -50,7 +50,7 @@ toTree articleRecord = do ...@@ -50,7 +50,7 @@ toTree articleRecord = do
path <- asks ((</> relativePath articleRecord "tree") . outputRoot) path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
liftIO $ do liftIO $ do
createDirectoryIfMissing True (takeDirectory path) createDirectoryIfMissing True (takeDirectory path)
ByteString.writeFile path . encode $ syntax =<< doc ByteString.writeFile path . encode $ indexDocument doc
main :: IO () main :: IO ()
main = getConfig >>= runReaderT chain main = getConfig >>= runReaderT chain
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment