Skip to content
Snippets Groups Projects
Commit ea0e05e5 authored by Alice Brenon's avatar Alice Brenon
Browse files

Refine the notion of size in the Tree representation of Conllu annotations as...

Refine the notion of size in the Tree representation of Conllu annotations as used in measures.hs from 922f5f43
parent 49c4cb60
No related branches found
No related tags found
No related merge requests found
...@@ -5,14 +5,15 @@ module Conllu.Tree ...@@ -5,14 +5,15 @@ module Conllu.Tree
, ID(..) , ID(..)
, IndexedDocument(..) , IndexedDocument(..)
, IndexedSentence(..) , IndexedSentence(..)
, IndexedWord(..) , IndexedToken(..)
, POS(..) , POS(..)
, Rel(..) , Rel(..)
, indexDocument , indexDocument
, indexSentence , indexSentence
, indexWord , indexToken
, positions ) where , positions ) where
import Conllu.Tree.Count (Count(..), count)
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP) import qualified Conllu.DeprelTagset as Conllu (EP)
...@@ -90,7 +91,7 @@ instance ToField FEATS where ...@@ -90,7 +91,7 @@ instance ToField FEATS where
k ++ '=':(intercalate "," _values ++ maybe "" showType _type) k ++ '=':(intercalate "," _values ++ maybe "" showType _type)
showType t = '[': t ++ "]" showType t = '[': t ++ "]"
data IndexedWord = IndexedWord data IndexedToken = IndexedToken
{ _id :: ID { _id :: ID
, _form :: Conllu.FORM , _form :: Conllu.FORM
, _lemma :: Conllu.LEMMA , _lemma :: Conllu.LEMMA
...@@ -101,39 +102,40 @@ data IndexedWord = IndexedWord ...@@ -101,39 +102,40 @@ data IndexedWord = IndexedWord
, _deps :: [Rel] , _deps :: [Rel]
, _misc :: Conllu.MISC } deriving (Show, Generic) , _misc :: Conllu.MISC } deriving (Show, Generic)
instance ToNamedRecord IndexedWord where instance ToNamedRecord IndexedToken where
toNamedRecord indexedWord = namedRecord toNamedRecord indexedToken = namedRecord
[ "id" .= _id indexedWord [ "id" .= _id indexedToken
, "form" .= _form indexedWord , "form" .= _form indexedToken
, "lemma" .= _lemma indexedWord , "lemma" .= _lemma indexedToken
, "upos" .= _upos indexedWord , "upos" .= _upos indexedToken
, "xpos" .= _xpos indexedWord , "xpos" .= _xpos indexedToken
, "feats" .= _feats indexedWord , "feats" .= _feats indexedToken
, "head" .= (_head <$> _rel indexedWord) , "head" .= (_head <$> _rel indexedToken)
, "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord) , "deprel" .= ((sep ":" . deprel) <$> _rel indexedToken)
, "deps" .= ("|" `sep` (showDep <$> _deps indexedWord)) , "deps" .= ("|" `sep` (showDep <$> _deps indexedToken))
, "misc" .= _misc indexedWord ] , "misc" .= _misc indexedToken ]
where where
sep = intercalate sep = intercalate
deprel (Rel {_deprel = EP ep, _subdep, _rest}) = deprel (Rel {_deprel = EP ep, _subdep, _rest}) =
show ep : maybe [] id ((:) <$> _subdep <*> _rest) show ep : maybe [] id ((:) <$> _subdep <*> _rest)
showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel) showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel)
instance DefaultOrdered IndexedWord where instance DefaultOrdered IndexedToken where
headerOrder _ = headerOrder _ =
[ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" [ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
, "misc" ] , "misc" ]
instance Serialize IndexedWord instance Serialize IndexedToken
data IndexedSentence = IndexedSentence data IndexedSentence = IndexedSentence
{ _offset :: Int { _offset :: Int
, _syntax :: Forest IndexedWord } deriving (Show, Generic) , _count :: Count
, _syntax :: Forest IndexedToken } deriving (Show, Generic)
instance Serialize IndexedSentence instance Serialize IndexedSentence
data IndexedDocument = IndexedDocument data IndexedDocument = IndexedDocument
{ _total :: Int { _total :: Count
, _sentences :: [IndexedSentence] } deriving (Show, Generic) , _sentences :: [IndexedSentence] } deriving (Show, Generic)
instance Serialize IndexedDocument instance Serialize IndexedDocument
...@@ -147,7 +149,7 @@ getSID :: Conllu.ID -> Maybe Int ...@@ -147,7 +149,7 @@ getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n getSID (Conllu.SID n) = Just n
getSID _ = Nothing getSID _ = Nothing
positions :: Int -> IndexedWord -> [Int] positions :: Int -> IndexedToken -> [Int]
positions offset = fmap (offset +) . getIDs . _id positions offset = fmap (offset +) . getIDs . _id
where where
getIDs (SID i) = [i] getIDs (SID i) = [i]
...@@ -169,8 +171,8 @@ relOfConllu rel = Rel ...@@ -169,8 +171,8 @@ relOfConllu rel = Rel
, _subdep = Conllu._subdep rel , _subdep = Conllu._subdep rel
, _rest = Conllu._rest rel } , _rest = Conllu._rest rel }
indexWord :: Conllu.CW Conllu.AW -> IndexedWord indexToken :: Conllu.CW Conllu.AW -> IndexedToken
indexWord cw = IndexedWord indexToken cw = IndexedToken
{ _id = idOfConllu $ Conllu._id cw { _id = idOfConllu $ Conllu._id cw
, _form = Conllu._form cw , _form = Conllu._form cw
, _lemma = Conllu._lemma cw , _lemma = Conllu._lemma cw
...@@ -182,21 +184,22 @@ indexWord cw = IndexedWord ...@@ -182,21 +184,22 @@ indexWord cw = IndexedWord
, _misc = Conllu._misc cw } , _misc = Conllu._misc cw }
indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
indexSentence offset = IndexedSentence offset . build 0 indexSentence _offset sent = IndexedSentence
{ _offset, _count = count sent, _syntax = build 0 sent }
where where
build n cws = build n cws =
let (pointing, others) = partition (pointingTo n) cws in let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing recurseOn others <$> pointing
recurseOn cws cw = Node recurseOn cws cw = Node
{ rootLabel = indexWord cw { rootLabel = indexToken cw
, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) } , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw) pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
indexDocument :: Conllu.Doc -> IndexedDocument indexDocument :: Conllu.Doc -> IndexedDocument
indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc indexDocument doc = addSentences (mempty, []) $ Conllu._words <$> doc
where where
addSentences (_total, sentencesStack) [] = addSentences (_total, sentencesStack) [] =
IndexedDocument {_total, _sentences = reverse sentencesStack} IndexedDocument {_total, _sentences = reverse sentencesStack}
addSentences (partial, sentencesStack) (sentence:others) = addSentences (partial, sentencesStack) (sentence:others) =
let indexed = indexSentence partial sentence in let indexed = indexSentence (tokens partial) sentence in
addSentences (partial + length sentence, indexed:sentencesStack) others addSentences (partial <> _count indexed, indexed:sentencesStack) others
module Conllu.Tree.Count
( Count(..)
, count ) where
import qualified Conllu.Type as Conllu (CW(..), ID(..))
import Data.Csv (DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..))
import Data.Serialize (Serialize(..))
import GHC.Generics (Generic)
import Prelude hiding (words)
data Count = Count
{ words :: Int
, tokens :: Int }
deriving (Eq, Show, Generic)
instance DefaultOrdered Count
instance FromNamedRecord Count
instance ToNamedRecord Count
instance Serialize Count
instance Semigroup Count where
c1 <> c2 = Count
{ words = words c1 + words c2, tokens = tokens c1 + tokens c2 }
instance Monoid Count where
mempty = Count 0 0
count :: [Conllu.CW a] -> Count
count sent = Count
{ words = tokens + length multiTokens - sum multiTokens, tokens }
where
ids = Conllu._id <$> sent
tokens = length [i | Conllu.SID i <- ids ]
multiTokens = [to - from + 1 | Conllu.MID from to <- ids ]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment