From 344466bc787ab178d5e0d7e668764eff156342b5 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Sat, 6 Jan 2024 23:14:54 +0100 Subject: [PATCH] Describe how to represent a UD token as a tabular data structure to allow outputting matched words in a TSV --- lib/haskell/Conllu/Tree.hs | 57 ++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs index 559183d..9969117 100644 --- a/lib/haskell/Conllu/Tree.hs +++ b/lib/haskell/Conllu/Tree.hs @@ -1,10 +1,13 @@ -{-# LANGUAGE DataKinds, DeriveGeneric, ExplicitNamespaces, TypeFamilies #-} +{-# LANGUAGE DeriveGeneric, OverloadedStrings, TypeFamilies #-} module Conllu.Tree - ( Feat(..) + ( EP(..) + , Feat(..) , ID(..) , IndexedDocument(..) , IndexedSentence(..) , IndexedWord(..) + , POS(..) + , Rel(..) , indexDocument , indexSentence , indexWord @@ -13,21 +16,32 @@ module Conllu.Tree import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XPOS, MISC, Rel(..), Sent(..), Index, Feat(..)) import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.DeprelTagset as Conllu (EP) +import Data.ByteString.Char8 as ByteString (pack) +import Data.Csv ((.=), ToField(..), ToNamedRecord(..), namedRecord) import Data.Int (Int8) -import Data.List (partition) -import Data.Map as Map (Map, empty, insert) +import Data.List (intercalate, partition) +import Data.Map as Map (Map, empty, insert, toList) import Data.Serialize (Serialize(..)) import Data.Tree (Forest, Tree(..)) +import GEODE.Metadata (DefaultFields(..), HasDefaultHeader(..)) import GHC.Generics (Generic(..), K1(..), Rec0) data ID = SID Conllu.Index | MID Conllu.Index Conllu.Index | EID Conllu.Index Conllu.Index - deriving (Show, Generic) + deriving Generic + +instance Show ID where + show (SID i) = show i + show (MID i j) = show i ++ '-': show j + show (EID i j) = show i ++ '.': show j instance Serialize ID +instance ToField 
ID where + toField = ByteString.pack . show + enumCast :: (Enum a, Enum b) => a -> b enumCast = toEnum . fromEnum @@ -41,6 +55,9 @@ instance Generic POS where instance Serialize POS +instance ToField POS where + toField (POS p) = ByteString.pack $ show p + data Feat = Feat { _values :: [String] , _type :: Maybe String } deriving (Show, Generic) @@ -67,6 +84,13 @@ instance Serialize Rel type FEATS = Map String Feat +instance ToField FEATS where + toField = ByteString.pack . intercalate "|" . fmap showFeat . toList + where + showFeat (k, Feat {_values, _type}) = + k ++ '=':(intercalate "," _values ++ maybe "" showType _type) + showType t = '[': t ++ "]" + data IndexedWord = IndexedWord { _id :: ID , _form :: Conllu.FORM @@ -78,6 +102,29 @@ data IndexedWord = IndexedWord , _deps :: [Rel] , _misc :: Conllu.MISC } deriving (Show, Generic) +instance ToNamedRecord IndexedWord where + toNamedRecord indexedWord = namedRecord + [ "id" .= _id indexedWord + , "form" .= _form indexedWord + , "lemma" .= _lemma indexedWord + , "upos" .= _upos indexedWord + , "xpos" .= _xpos indexedWord + , "feats" .= _feats indexedWord + , "head" .= (_head <$> _rel indexedWord) + , "deprel" .= ((sep ":" . deprel) <$> _rel indexedWord) + , "deps" .= ("|" `sep` (showDep <$> _deps indexedWord)) + , "misc" .= _misc indexedWord ] + where + sep = intercalate + deprel (Rel {_deprel = EP ep, _subdep, _rest}) = + show ep : maybe [] id ((:) <$> _subdep <*> _rest) + showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel) + +instance HasDefaultHeader IndexedWord where + defaultFields = DefaultFields + [ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" + , "misc" ] + instance Serialize IndexedWord data IndexedSentence = IndexedSentence -- GitLab