Skip to content
Snippets Groups Projects
Commit 5bec61bf authored by Alice Brenon's avatar Alice Brenon
Browse files

Add types to represent indexed sentences and documents, effectively absorbing...

Add types to represent indexed sentences and documents, effectively absorbing part of the work that was done during the search
parent 46df1768
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
module Conllu.Tree
( Syntax
, syntax ) where
import qualified Conllu.Type as Conllu (AW, CW(..), ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
( Feat(..)
, ID(..)
, IndexedDocument(..)
, IndexedSentence(..)
, IndexedWord(..)
, indexDocument
, indexSentence
, indexWord
, positions ) where
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP)
import Data.List (partition)
import Data.Int (Int8)
import Data.List (foldl', partition)
import Data.Map as Map (Map, empty, insert)
import Data.Serialize (Serialize(..))
import Data.Tree (Forest, Tree(..))
......@@ -20,13 +28,16 @@ data ID =
instance Serialize ID
enumCast :: (Enum a, Enum b) => a -> b
enumCast = toEnum . fromEnum
newtype POS = POS Conllu.POS deriving Show
instance Generic POS where
type Rep POS = Rec0 Int
type Rep POS = Rec0 Int8
from (POS c) = K1 (fromEnum c)
from (POS c) = K1 (enumCast c)
to (K1 i) = POS (toEnum i)
to (K1 i) = POS (enumCast i)
instance Serialize POS
......@@ -38,11 +49,11 @@ instance Serialize Feat
newtype EP = EP Conllu.EP deriving Show
instance Generic EP where
type Rep EP = Rec0 Int
type Rep EP = Rec0 Int8
from (EP c) = K1 (fromEnum c)
from (EP c) = K1 (enumCast c)
to (K1 i) = EP (toEnum i)
to (K1 i) = EP (enumCast i)
instance Serialize EP
......@@ -69,17 +80,34 @@ data IndexedWord = IndexedWord
instance Serialize IndexedWord
type Syntax = Forest IndexedWord
data IndexedSentence = IndexedSentence
{ _offset :: Int
, _syntax :: Forest IndexedWord } deriving (Show, Generic)
getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n
getSID _ = Nothing
instance Serialize IndexedSentence
data IndexedDocument = IndexedDocument
{ _total :: Int
, _sentences :: [IndexedSentence] } deriving (Show, Generic)
instance Serialize IndexedDocument
idOfConllu :: Conllu.ID -> ID
idOfConllu (Conllu.SID i) = SID i
idOfConllu (Conllu.MID i j) = MID i j
idOfConllu (Conllu.EID i j) = EID i j
getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n
getSID _ = Nothing
positions :: Int -> IndexedWord -> [Int]
positions offset = fmap (offset +) . getIDs . _id
where
getIDs (SID i) = [i]
getIDs (MID i j) = [i..j]
getIDs _ = []
featsOfConllu :: [Conllu.Feat] -> FEATS
featsOfConllu = foldr indexFeat Map.empty
where
......@@ -107,13 +135,21 @@ indexWord cw = IndexedWord
, _deps = relOfConllu <$> Conllu._deps cw
, _misc = Conllu._misc cw }
syntax :: Conllu.Sent -> Syntax
syntax = build 0 . Conllu._words
indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
indexSentence offset = IndexedSentence offset . build 0
where
build n cws =
let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing
recurseOn cws cw = Node
{ rootLabel = indexWord cw, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
{ rootLabel = indexWord cw
, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
indexDocument :: Conllu.Doc -> IndexedDocument
indexDocument doc =
IndexedDocument {_total, _sentences = zipWith indexSentence offsets sentWords}
where
sentWords = Conllu._words <$> doc
(offsets, _total) = foldl' next ([], 0) sentWords
next (tmpOffsets, partial) s = (partial:tmpOffsets, partial + length s)
......@@ -2,7 +2,7 @@
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, TypeFamilies #-}
import Conllu.Parse (parseConllu)
import Conllu.Tree (syntax)
import Conllu.Tree (indexDocument)
import Control.Applicative ((<**>))
import Control.Monad.IO.Class (MonadIO(..))
import Control.Monad.Reader (MonadReader, asks, runReaderT)
......@@ -50,7 +50,7 @@ toTree articleRecord = do
path <- asks ((</> relativePath articleRecord "tree") . outputRoot)
liftIO $ do
createDirectoryIfMissing True (takeDirectory path)
ByteString.writeFile path . encode $ syntax =<< doc
ByteString.writeFile path . encode $ indexDocument doc
main :: IO ()
main = getConfig >>= runReaderT chain
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment