From b1cf10a1415c6b7f08898ebb813d03e95d89411d Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 21 Jun 2023 22:22:56 +0200
Subject: [PATCH] Implement new features to merge script

---
 scripts/merge.hs | 51 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/scripts/merge.hs b/scripts/merge.hs
index b0843d3..99b022f 100755
--- a/scripts/merge.hs
+++ b/scripts/merge.hs
@@ -1,27 +1,58 @@
 #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
-{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
+{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-}
 
 import Control.Monad.Except (ExceptT(..), runExceptT)
-import Data.Map ((!?))
+import Data.Foldable (toList)
+import Data.Map (Map, (!?))
 import Data.Maybe (catMaybes)
-import Data.Vector (Vector, toList)
-import GEODE.Metadata (Contrastive, Entry, Has(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile)
+import Data.Text (Text, pack)
+import GEODE.Metadata (Contrastive(..), Entry, Has(..), MultiText(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.Script (syntax)
 
-merge :: Vector (PrimaryKey @ Entry) -> Vector (PrimaryKey @ Contrastive) -> [PrimaryKey @ Entry @ Contrastive]
-merge left right = catMaybes (mergeRow <$> toList right)
+leftJoin :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
+  (a -> Maybe b -> c) -> t a -> t b -> t c
+leftJoin f left right = outputRow <$> left
   where
-    indexed = get <$> indexBy get left
-    mergeRow (pK :@: contrastive) =
-      (\entry -> (pK :@: entry :@: contrastive)) <$> (indexed !? pK)
+    indexed :: Map PrimaryKey b
+    indexed = indexBy get right
+    outputRow leftLine =
+      f leftLine (indexed !? get leftLine)
+
+rightJoin :: (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
+  (Maybe a -> b -> c) -> t a -> t b -> t c
+rightJoin f left right = leftJoin (flip f) right left
+
+join :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
+  (a -> b -> c) -> t a -> t b -> [c]
+join f left right = catMaybes . toList $ outputRow <$> left
+  where
+    indexed :: Map PrimaryKey b
+    indexed = indexBy get right
+    outputRow leftLine =
+      f leftLine <$> (indexed !? get leftLine)
+
+merge :: (PrimaryKey @ Entry) -> (PrimaryKey @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive)
+merge (pK :@: e) (_ :@: c) = pK :@: e :@: c
+
+tag :: Text -> Maybe PrimaryKey -> (PrimaryKey @ Entry @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive)
+tag _ Nothing l = l
+tag name (Just _) (pK :@: e :@: contr) =
+  pK :@: e :@: contr { subCorpus = MultiText (name : subCorpora) }
+  where
+    MultiText subCorpora = subCorpus contr
 
 main :: IO ()
 main = getArgs >>= run
   where
     run [left, right, output] =
-      runExceptT (merge
+      runExceptT ((join merge)
+                    <$> ExceptT (readNamedTsv left)
+                    <*> ExceptT (readNamedTsv right) )
+      >>= either die (tsvFile output)
+    run [name, left, right, output] =
+      runExceptT ((rightJoin (tag $ pack name))
                     <$> ExceptT (readNamedTsv left)
                     <*> ExceptT (readNamedTsv right) )
       >>= either die (tsvFile output)
-- 
GitLab