Skip to content
Snippets Groups Projects
merge.hs 2.31 KiB
Newer Older
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-}

import Control.Monad.Except (ExceptT(..), runExceptT)
import Data.Foldable (toList)
import Data.Map (Map, (!?))
import Data.Maybe (catMaybes)
import Data.Text (Text, pack)
import GEODE.Metadata (Contrastive(..), Entry, Has(..), MultiText(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile)
import System.Environment (getArgs)
import System.Exit (die)
import System.Script (syntax)

leftJoin :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
  (a -> Maybe b -> c) -> t a -> t b -> t c
leftJoin f left right = outputRow <$> left
    indexed :: Map PrimaryKey b
    indexed = indexBy get right
    outputRow leftLine =
      f leftLine (indexed !? get leftLine)

rightJoin :: (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
  (Maybe a -> b -> c) -> t a -> t b -> t c
rightJoin f left right = leftJoin (flip f) right left

join :: forall a b c t. (Foldable t, Functor t, Has PrimaryKey a, Has PrimaryKey b) =>
  (a -> b -> c) -> t a -> t b -> [c]
join f left right = catMaybes . toList $ outputRow <$> left
  where
    indexed :: Map PrimaryKey b
    indexed = indexBy get right
    outputRow leftLine =
      f leftLine <$> (indexed !? get leftLine)

merge :: (PrimaryKey @ Entry) -> (PrimaryKey @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive)
merge (pK :@: e) (_ :@: c) = pK :@: e :@: c

tag :: Text -> Maybe PrimaryKey -> (PrimaryKey @ Entry @ Contrastive) -> (PrimaryKey @ Entry @ Contrastive)
tag _ Nothing l = l
tag name (Just _) (pK :@: e :@: contr) =
  pK :@: e :@: contr { subCorpus = MultiText (name : subCorpora) }
  where
    MultiText subCorpora = subCorpus contr

main :: IO ()
main = getArgs >>= run
  where
    run [left, right, output] =
      runExceptT ((join merge)
                    <$> ExceptT (readNamedTsv left)
                    <*> ExceptT (readNamedTsv right) )
      >>= either die (tsvFile output)
    run [name, left, right, output] =
      runExceptT ((rightJoin (tag $ pack name))
                    <$> ExceptT (readNamedTsv left)
                    <*> ExceptT (readNamedTsv right) )
      >>= either die (tsvFile output)
    run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv"