Skip to content
Snippets Groups Projects
Commit ea0e05e5 authored by Alice Brenon's avatar Alice Brenon
Browse files

Refine the notion of size in the Tree representation of Conllu annotations as...

Refine the notion of size in the Tree representation of Conllu annotations as used in measures.hs from 922f5f43
parent 49c4cb60
No related branches found
No related tags found
No related merge requests found
......@@ -5,14 +5,15 @@ module Conllu.Tree
, ID(..)
, IndexedDocument(..)
, IndexedSentence(..)
, IndexedWord(..)
, IndexedToken(..)
, POS(..)
, Rel(..)
, indexDocument
, indexSentence
, indexWord
, indexToken
, positions ) where
import Conllu.Tree.Count (Count(..), count)
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP)
......@@ -90,7 +91,7 @@ instance ToField FEATS where
k ++ '=':(intercalate "," _values ++ maybe "" showType _type)
showType t = '[': t ++ "]"
data IndexedWord = IndexedWord
data IndexedToken = IndexedToken
{ _id :: ID
, _form :: Conllu.FORM
, _lemma :: Conllu.LEMMA
......@@ -101,39 +102,40 @@ data IndexedWord = IndexedWord
, _deps :: [Rel]
, _misc :: Conllu.MISC } deriving (Show, Generic)
instance ToNamedRecord IndexedWord where
toNamedRecord indexedWord = namedRecord
[ "id" .= _id indexedWord
, "form" .= _form indexedWord
, "lemma" .= _lemma indexedWord
, "upos" .= _upos indexedWord
, "xpos" .= _xpos indexedWord
, "feats" .= _feats indexedWord
, "head" .= (_head <$> _rel indexedWord)
, "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord)
, "deps" .= ("|" `sep` (showDep <$> _deps indexedWord))
, "misc" .= _misc indexedWord ]
instance ToNamedRecord IndexedToken where
toNamedRecord indexedToken = namedRecord
[ "id" .= _id indexedToken
, "form" .= _form indexedToken
, "lemma" .= _lemma indexedToken
, "upos" .= _upos indexedToken
, "xpos" .= _xpos indexedToken
, "feats" .= _feats indexedToken
, "head" .= (_head <$> _rel indexedToken)
, "deprel" .= ((sep ":" . deprel) <$> _rel indexedToken)
, "deps" .= ("|" `sep` (showDep <$> _deps indexedToken))
, "misc" .= _misc indexedToken ]
where
sep = intercalate
deprel (Rel {_deprel = EP ep, _subdep, _rest}) =
show ep : maybe [] id ((:) <$> _subdep <*> _rest)
showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel)
instance DefaultOrdered IndexedWord where
instance DefaultOrdered IndexedToken where
headerOrder _ =
[ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
, "misc" ]
instance Serialize IndexedWord
instance Serialize IndexedToken
data IndexedSentence = IndexedSentence
{ _offset :: Int
, _syntax :: Forest IndexedWord } deriving (Show, Generic)
, _count :: Count
, _syntax :: Forest IndexedToken } deriving (Show, Generic)
instance Serialize IndexedSentence
data IndexedDocument = IndexedDocument
{ _total :: Int
{ _total :: Count
, _sentences :: [IndexedSentence] } deriving (Show, Generic)
instance Serialize IndexedDocument
......@@ -147,7 +149,7 @@ getSID :: Conllu.ID -> Maybe Int
getSID (Conllu.SID n) = Just n
getSID _ = Nothing
positions :: Int -> IndexedWord -> [Int]
positions :: Int -> IndexedToken -> [Int]
positions offset = fmap (offset +) . getIDs . _id
where
getIDs (SID i) = [i]
......@@ -169,8 +171,8 @@ relOfConllu rel = Rel
, _subdep = Conllu._subdep rel
, _rest = Conllu._rest rel }
indexWord :: Conllu.CW Conllu.AW -> IndexedWord
indexWord cw = IndexedWord
indexToken :: Conllu.CW Conllu.AW -> IndexedToken
indexToken cw = IndexedToken
{ _id = idOfConllu $ Conllu._id cw
, _form = Conllu._form cw
, _lemma = Conllu._lemma cw
......@@ -182,21 +184,22 @@ indexWord cw = IndexedWord
, _misc = Conllu._misc cw }
indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence
indexSentence offset = IndexedSentence offset . build 0
indexSentence _offset sent = IndexedSentence
{ _offset, _count = count sent, _syntax = build 0 sent }
where
build n cws =
let (pointing, others) = partition (pointingTo n) cws in
recurseOn others <$> pointing
recurseOn cws cw = Node
{ rootLabel = indexWord cw
{ rootLabel = indexToken cw
, subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) }
pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw)
indexDocument :: Conllu.Doc -> IndexedDocument
indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc
indexDocument doc = addSentences (mempty, []) $ Conllu._words <$> doc
where
addSentences (_total, sentencesStack) [] =
IndexedDocument {_total, _sentences = reverse sentencesStack}
addSentences (partial, sentencesStack) (sentence:others) =
let indexed = indexSentence partial sentence in
addSentences (partial + length sentence, indexed:sentencesStack) others
let indexed = indexSentence (tokens partial) sentence in
addSentences (partial <> _count indexed, indexed:sentencesStack) others
module Conllu.Tree.Count
( Count(..)
, count ) where
import qualified Conllu.Type as Conllu (CW(..), ID(..))
import Data.Csv (DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..))
import Data.Serialize (Serialize(..))
import GHC.Generics (Generic)
import Prelude hiding (words)
data Count = Count
{ words :: Int
, tokens :: Int }
deriving (Eq, Show, Generic)
instance DefaultOrdered Count
instance FromNamedRecord Count
instance ToNamedRecord Count
instance Serialize Count
instance Semigroup Count where
c1 <> c2 = Count
{ words = words c1 + words c2, tokens = tokens c1 + tokens c2 }
instance Monoid Count where
mempty = Count 0 0
count :: [Conllu.CW a] -> Count
count sent = Count
{ words = tokens + length multiTokens - sum multiTokens, tokens }
where
ids = Conllu._id <$> sent
tokens = length [i | Conllu.SID i <- ids ]
multiTokens = [to - from + 1 | Conllu.MID from to <- ids ]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment