Skip to content
Snippets Groups Projects
Commit 344466bc authored by Alice Brenon's avatar Alice Brenon
Browse files

Describe how to represent a UD token as a tabular data structure to allow...

Describe how to represent a UD token as a tabular data structure to allow outputing matched words in a TSV
parent 6cb7457d
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} {-# LANGUAGE DeriveGeneric, OverloadedStrings, TypeFamilies #-}
module Conllu.Tree module Conllu.Tree
( Feat(..) ( EP(..)
, Feat(..)
, ID(..) , ID(..)
, IndexedDocument(..) , IndexedDocument(..)
, IndexedSentence(..) , IndexedSentence(..)
, IndexedWord(..) , IndexedWord(..)
, POS(..)
, Rel(..)
, indexDocument , indexDocument
, indexSentence , indexSentence
, indexWord , indexWord
...@@ -13,21 +16,32 @@ module Conllu.Tree ...@@ -13,21 +16,32 @@ module Conllu.Tree
import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..))
import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.UposTagset as Conllu (POS)
import qualified Conllu.DeprelTagset as Conllu (EP) import qualified Conllu.DeprelTagset as Conllu (EP)
import Data.ByteString.Char8 as ByteString (pack)
import Data.Csv ((.=), ToField(..), ToNamedRecord(..), namedRecord)
import Data.Int (Int8) import Data.Int (Int8)
import Data.List (partition) import Data.List (intercalate, partition)
import Data.Map as Map (Map, empty, insert) import Data.Map as Map (Map, empty, insert, toList)
import Data.Serialize (Serialize(..)) import Data.Serialize (Serialize(..))
import Data.Tree (Forest, Tree(..)) import Data.Tree (Forest, Tree(..))
import GEODE.Metadata (DefaultFields(..), HasDefaultHeader(..))
import GHC.Generics (Generic(..), K1(..), Rec0) import GHC.Generics (Generic(..), K1(..), Rec0)
data ID = data ID =
SID Conllu.Index SID Conllu.Index
| MID Conllu.Index Conllu.Index | MID Conllu.Index Conllu.Index
| EID Conllu.Index Conllu.Index | EID Conllu.Index Conllu.Index
deriving (Show, Generic) deriving Generic
instance Show ID where
show (SID i) = show i
show (MID i j) = show i ++ '-': show j
show (EID i j) = show i ++ '.': show j
instance Serialize ID instance Serialize ID
instance ToField ID where
toField = ByteString.pack . show
enumCast :: (Enum a, Enum b) => a -> b enumCast :: (Enum a, Enum b) => a -> b
enumCast = toEnum . fromEnum enumCast = toEnum . fromEnum
...@@ -41,6 +55,9 @@ instance Generic POS where ...@@ -41,6 +55,9 @@ instance Generic POS where
instance Serialize POS instance Serialize POS
instance ToField POS where
toField (POS p) = ByteString.pack $ show p
data Feat = Feat data Feat = Feat
{ _values :: [String] { _values :: [String]
, _type :: Maybe String } deriving (Show, Generic) , _type :: Maybe String } deriving (Show, Generic)
...@@ -67,6 +84,13 @@ instance Serialize Rel ...@@ -67,6 +84,13 @@ instance Serialize Rel
type FEATS = Map String Feat type FEATS = Map String Feat
instance ToField FEATS where
toField = ByteString.pack . intercalate "|" . fmap showFeat . toList
where
showFeat (k, Feat {_values, _type}) =
k ++ '=':(intercalate "," _values ++ maybe "" showType _type)
showType t = '[': t ++ "]"
data IndexedWord = IndexedWord data IndexedWord = IndexedWord
{ _id :: ID { _id :: ID
, _form :: Conllu.FORM , _form :: Conllu.FORM
...@@ -78,6 +102,29 @@ data IndexedWord = IndexedWord ...@@ -78,6 +102,29 @@ data IndexedWord = IndexedWord
, _deps :: [Rel] , _deps :: [Rel]
, _misc :: Conllu.MISC } deriving (Show, Generic) , _misc :: Conllu.MISC } deriving (Show, Generic)
instance ToNamedRecord IndexedWord where
toNamedRecord indexedWord = namedRecord
[ "id" .= _id indexedWord
, "form" .= _form indexedWord
, "lemma" .= _lemma indexedWord
, "upos" .= _upos indexedWord
, "xpos" .= _xpos indexedWord
, "feats" .= _feats indexedWord
, "head" .= (_head <$> _rel indexedWord)
, "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord)
, "deps" .= ("|" `sep` (showDep <$> _deps indexedWord))
, "misc" .= _misc indexedWord ]
where
sep = intercalate
deprel (Rel {_deprel = EP ep, _subdep, _rest}) =
show ep : maybe [] id ((:) <$> _subdep <*> _rest)
showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel)
instance HasDefaultHeader IndexedWord where
defaultFields = DefaultFields
[ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps"
, "misc" ]
instance Serialize IndexedWord instance Serialize IndexedWord
data IndexedSentence = IndexedSentence data IndexedSentence = IndexedSentence
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment