From b1cf10a1415c6b7f08898ebb813d03e95d89411d Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Wed, 21 Jun 2023 22:22:56 +0200 Subject: [PATCH] Implement new features to merge script --- scripts/merge.hs | 51 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/scripts/merge.hs b/scripts/merge.hs index b0843d3..99b022f 100755 --- a/scripts/merge.hs +++ b/scripts/merge.hs @@ -1,27 +1,58 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-} import Control.Monad.Except (ExceptT(..), runExceptT) -import Data.Map ((!?)) +import Data.Foldable (toList) +import Data.Map (Map, (!?)) import Data.Maybe (catMaybes) -import Data.Vector (Vector, toList) -import GEODE.Metadata (Contrastive, Entry, Has(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile) +import Data.Text (Text, pack) +import GEODE.Metadata (Contrastive(..), Entry, Has(..), MultiText(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile) import System.Environment (getArgs) import System.Exit (die) import System.Script (syntax) -merge :: Vector (PrimaryKey @ Entry) -> Vector (PrimaryKey @ Contrastive) -> [PrimaryKey @ Entry @ Contrastive] -merge left right = catMaybes (mergeRow <$> toList right) +leftJoin :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) => + (a -> Maybe b -> c) -> t a -> t b -> t c +leftJoin f left right = outputRow <$> left where - indexed = get <$> indexBy get left - mergeRow (pK :@: contrastive) = - (\entry -> (pK :@: entry :@: contrastive)) <$> (indexed !? pK) + indexed :: Map PrimaryKey b + indexed = indexBy get right + outputRow leftLine = + f leftLine (indexed !? get leftLine) + +rightJoin :: (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) => + (Maybe a -> b -> c) -> t a -> t b -> t c +rightJoin f left right = leftJoin (flip f) right left + +join :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) => + (a -> b -> c) -> t a -> t b -> [c] +join f left right = catMaybes . toList $ outputRow <$> left + where + indexed :: Map PrimaryKey b + indexed = indexBy get right + outputRow leftLine = + f leftLine <$> (indexed !? get leftLine) + +merge :: (PrimaryKey @ Entry) -> (PrimaryKey @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive) +merge (pK :@: e) (_ :@: c) = pK :@: e :@: c + +tag :: Text -> Maybe PrimaryKey -> (PrimaryKey @ Entry @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive) +tag _ Nothing l = l +tag name (Just _) (pK :@: e :@: contr) = + pK :@: e :@: contr { subCorpus = MultiText (name : subCorpora) } + where + MultiText subCorpora = subCorpus contr main :: IO () main = getArgs >>= run where run [left, right, output] = - runExceptT (merge + runExceptT ((join merge) + <$> ExceptT (readNamedTsv left) + <*> ExceptT (readNamedTsv right) ) + >>= either die (tsvFile output) + run [name, left, right, output] = + runExceptT ((rightJoin (tag $ pack name)) <$> ExceptT (readNamedTsv left) <*> ExceptT (readNamedTsv right) ) >>= either die (tsvFile output) -- GitLab