diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs index 8d5967bb3a3a069fb51c5283b9c52e80740bfd1f..14fa80e7d9d4057cb7fd025cab2166ed4f3fa0c4 100644 --- a/lib/haskell/Conllu/Tree.hs +++ b/lib/haskell/Conllu/Tree.hs @@ -5,14 +5,15 @@ module Conllu.Tree , ID(..) , IndexedDocument(..) , IndexedSentence(..) - , IndexedWord(..) + , IndexedToken(..) , POS(..) , Rel(..) , indexDocument , indexSentence - , indexWord + , indexToken , positions ) where +import Conllu.Tree.Count (Count(..), count) import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.DeprelTagset as Conllu (EP) @@ -90,7 +91,7 @@ instance ToField FEATS where k ++ '=':(intercalate "," _values ++ maybe "" showType _type) showType t = '[': t ++ "]" -data IndexedWord = IndexedWord +data IndexedToken = IndexedToken { _id :: ID , _form :: Conllu.FORM , _lemma :: Conllu.LEMMA @@ -101,39 +102,40 @@ data IndexedWord = IndexedWord , _deps :: [Rel] , _misc :: Conllu.MISC } deriving (Show, Generic) -instance ToNamedRecord IndexedWord where - toNamedRecord indexedWord = namedRecord - [ "id" .= _id indexedWord - , "form" .= _form indexedWord - , "lemma" .= _lemma indexedWord - , "upos" .= _upos indexedWord - , "xpos" .= _xpos indexedWord - , "feats" .= _feats indexedWord - , "head" .= (_head <$> _rel indexedWord) - , "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord) - , "deps" .= ("|" `sep` (showDep <$> _deps indexedWord)) - , "misc" .= _misc indexedWord ] +instance ToNamedRecord IndexedToken where + toNamedRecord indexedToken = namedRecord + [ "id" .= _id indexedToken + , "form" .= _form indexedToken + , "lemma" .= _lemma indexedToken + , "upos" .= _upos indexedToken + , "xpos" .= _xpos indexedToken + , "feats" .= _feats indexedToken + , "head" .= (_head <$> _rel indexedToken) + , "deprel" .= ((sep ":" . deprel) <$> _rel indexedToken) + , "deps" .= ("|" `sep` (showDep <$> _deps indexedToken)) + , "misc" .= _misc indexedToken ] where sep = intercalate deprel (Rel {_deprel = EP ep, _subdep, _rest}) = show ep : maybe [] id ((:) <$> _subdep <*> _rest) showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel) -instance DefaultOrdered IndexedWord where +instance DefaultOrdered IndexedToken where headerOrder _ = [ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" , "misc" ] -instance Serialize IndexedWord +instance Serialize IndexedToken data IndexedSentence = IndexedSentence { _offset :: Int - , _syntax :: Forest IndexedWord } deriving (Show, Generic) + , _count :: Count + , _syntax :: Forest IndexedToken } deriving (Show, Generic) instance Serialize IndexedSentence data IndexedDocument = IndexedDocument - { _total :: Int + { _total :: Count , _sentences :: [IndexedSentence] } deriving (Show, Generic) instance Serialize IndexedDocument @@ -147,7 +149,7 @@ getSID :: Conllu.ID -> Maybe Int getSID (Conllu.SID n) = Just n getSID _ = Nothing -positions :: Int -> IndexedWord -> [Int] +positions :: Int -> IndexedToken -> [Int] positions offset = fmap (offset +) . getIDs . _id where getIDs (SID i) = [i] @@ -169,8 +171,8 @@ relOfConllu rel = Rel , _subdep = Conllu._subdep rel , _rest = Conllu._rest rel } -indexWord :: Conllu.CW Conllu.AW -> IndexedWord -indexWord cw = IndexedWord +indexToken :: Conllu.CW Conllu.AW -> IndexedToken +indexToken cw = IndexedToken { _id = idOfConllu $ Conllu._id cw , _form = Conllu._form cw , _lemma = Conllu._lemma cw @@ -182,21 +184,22 @@ indexWord cw = IndexedWord , _misc = Conllu._misc cw } indexSentence :: Int -> [Conllu.CW Conllu.AW] -> IndexedSentence -indexSentence offset = IndexedSentence offset . build 0 +indexSentence _offset sent = IndexedSentence + { _offset, _count = count sent, _syntax = build 0 sent } where build n cws = let (pointing, others) = partition (pointingTo n) cws in recurseOn others <$> pointing recurseOn cws cw = Node - { rootLabel = indexWord cw + { rootLabel = indexToken cw , subForest = maybe [] (`build` cws) $ getSID (Conllu._id cw) } pointingTo n cw = maybe False (n ==) (getSID . Conllu._head =<< Conllu._rel cw) indexDocument :: Conllu.Doc -> IndexedDocument -indexDocument doc = addSentences (0, []) $ Conllu._words <$> doc +indexDocument doc = addSentences (mempty, []) $ Conllu._words <$> doc where addSentences (_total, sentencesStack) [] = IndexedDocument {_total, _sentences = reverse sentencesStack} addSentences (partial, sentencesStack) (sentence:others) = - let indexed = indexSentence partial sentence in - addSentences (partial + length sentence, indexed:sentencesStack) others + let indexed = indexSentence (tokens partial) sentence in + addSentences (partial <> _count indexed, indexed:sentencesStack) others diff --git a/lib/haskell/Conllu/Tree/Count.hs b/lib/haskell/Conllu/Tree/Count.hs new file mode 100644 index 0000000000000000000000000000000000000000..9d3fb940e335236a9b5e93ae51c83d5e088d1f77 --- /dev/null +++ b/lib/haskell/Conllu/Tree/Count.hs @@ -0,0 +1,34 @@ +module Conllu.Tree.Count + ( Count(..) + , count ) where + +import qualified Conllu.Type as Conllu (CW(..), ID(..)) +import Data.Csv (DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..)) +import Data.Serialize (Serialize(..)) +import GHC.Generics (Generic) +import Prelude hiding (words) + +data Count = Count + { words :: Int + , tokens :: Int } + deriving (Eq, Show, Generic) + +instance DefaultOrdered Count +instance FromNamedRecord Count +instance ToNamedRecord Count +instance Serialize Count + +instance Semigroup Count where + c1 <> c2 = Count + { words = words c1 + words c2, tokens = tokens c1 + tokens c2 } + +instance Monoid Count where + mempty = Count 0 0 + +count :: [Conllu.CW a] -> Count +count sent = Count + { words = tokens + length multiTokens - sum multiTokens, tokens } + where + ids = Conllu._id <$> sent + tokens = length [i | Conllu.SID i <- ids ] + multiTokens = [to - from + 1 | Conllu.MID from to <- ids ]