diff --git a/app/Join.hs b/app/Join.hs new file mode 100644 index 0000000000000000000000000000000000000000..a3118c19e03d700813bfe47677a58d5a680dea27 --- /dev/null +++ b/app/Join.hs @@ -0,0 +1,72 @@ +{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} +module Join + ( Config + , config + , run ) where + +import Control.Applicative ((<|>), optional) +import Data.Attoparsec.Text (char, parseOnly, sepBy, takeWhile1) +import Data.Char (toLower) +import Data.Text as Text (Text, pack) +import Data.Text.Encoding (decodeUtf8) +import Data.Vector as Vector (toList) +import GEODE.Metadata + ( ArticleRecord, ColumnsMapping, JoinContext(..), JoinMethod, ReadTSV(..) + , Renamers, WriteTSV(..), for, getHeader, leftJoin, rightJoin ) +import Options.Applicative + ( Parser, long, metavar, option, short, str, strArgument, strOption, value ) +import System.Exit (die) +import Text.Printf (printf) + +data Config = Config + { leftTSV :: FilePath + , rightTSV :: FilePath + , outputTSV :: Maybe FilePath + , on :: Text + , how :: Maybe String + , renamePatterns :: Maybe String } + +config :: Parser Config +config = Config + <$> strArgument (metavar "LEFT_TSV") + <*> strArgument (metavar "RIGHT_TSV") + <*> option (optional str) (long "output" <> short 'o' <> value Nothing) + <*> strOption (long "on" <> short 'O') + <*> (option (optional str) (long "how" <> short 'H' <> value Nothing)) + <*> (option (optional str) (long "rename" <> short 'r' <> value Nothing)) + +dup :: Applicative f => a -> f (a, a) +dup a = pure (a, a) + +parseColumns :: Text -> IO ColumnsMapping +parseColumns "ARTICLES" = pure (articleHeader, articleHeader) + where + articleHeader = decodeUtf8 <$> + Vector.toList (getHeader (for :: ArticleRecord)) +parseColumns s = either die (pure . unzip) $ parseOnly columns s + where + columns = (mapping <|> (columnName >>= dup)) `sepBy` char ',' + mapping = (,) <$> columnName <* char '=' <*> columnName + columnName = takeWhile1 (not . (`elem` ['=', ','])) + +parseRenamers :: Maybe String -> IO Renamers +parseRenamers Nothing = dup id +parseRenamers (Just s) = + case break (== ',') s of + (left, ',':right) -> pure (Text.pack . printf left, Text.pack . printf right) + _ -> die "--rename option expects to comma-separated printf pattern (with only one '%s' in each)" + +parseMethod :: Maybe String -> IO JoinMethod +parseMethod Nothing = pure leftJoin +parseMethod (Just s) = readMethod $ toLower <$> s + where + readMethod "left" = pure leftJoin + readMethod "right" = pure rightJoin + readMethod e = die ("Unknown join method '" ++ e ++ "'") + +run :: Config -> IO () +run (Config {leftTSV, rightTSV, outputTSV, how, on, renamePatterns}) = do + method <- parseMethod how + context <- JoinContext <$> parseColumns on <*> parseRenamers renamePatterns + (method context <$> (readTSV leftTSV) <*> (readTSV rightTSV)) + >>= (maybe (writeTSV ()) writeTSV outputTSV) diff --git a/app/Main.hs b/app/Main.hs new file mode 100644 index 0000000000000000000000000000000000000000..8a97800d4b67b0406fe32a92688b456f9446b2bc --- /dev/null +++ b/app/Main.hs @@ -0,0 +1,15 @@ +{-# LANGUAGE ExplicitNamespaces #-} +import Control.Applicative ((<**>)) +import qualified Join (Config, config, run) +import Options.Applicative + ( Parser, command, execParser, fullDesc, helper, info, progDesc, subparser ) + +data Hammer = Join Join.Config + +hammer :: Parser Hammer +hammer = subparser + (command "join" (info (Join <$> Join.config) $ progDesc "join two TSV files")) + +main :: IO () +main = execParser (info (hammer <**> helper) fullDesc) >>= run + where run (Join c) = Join.run c diff --git a/geode.cabal b/geode.cabal index ba9c94fa6bfb192adae95bb7b239de9d80ca7e65..1330af0fb0aa3df5e7765d12cdbfce181ad01097 100644 --- a/geode.cabal +++ b/geode.cabal @@ -1,6 +1,6 @@ cabal-version: 2.4 name: geode -version: 0.1.0.0 +version: 0.2.1.0 synopsis: Data structures and tooling used in project GEODE @@ -32,6 +32,7 @@ library , GEODE.Metadata.Record , GEODE.Metadata.TSV , GEODE.Metadata.TSV.Header + , GEODE.Metadata.TSV.Join , GEODE.Metadata.Types , GEODE.Metadata.Work @@ -51,6 +52,23 @@ library hs-source-dirs: lib default-language: Haskell2010 +executable hammer + main-is: Main.hs + other-modules: Join + -- other-extensions: + build-depends: attoparsec >= 0.14.4 && <0.15 + , base + , cassava + , containers + , geode + , mtl + , optparse-applicative + , text + , vector + hs-source-dirs: app + ghc-options: -Wall + default-language: Haskell2010 + test-suite ghc-geode-test default-language: Haskell2010 type: exitcode-stdio-1.0 diff --git a/guix.scm b/guix.scm index e3b1f959ba6d061a4163f6bc3571e7f20021d038..da56fb5d3743189af272e3d4b43b23e2629998ec 100644 --- a/guix.scm +++ b/guix.scm @@ -1,7 +1,8 @@ -(use-modules ((gnu packages haskell-xyz) #:select (ghc-cassava - ghc-optparse-applicative)) - ((gnu packages haskell-check) #:select (ghc-hunit)) +(use-modules ((gnu packages haskell-check) #:select (ghc-hunit)) ((gnu packages haskell-web) #:select (ghc-aeson)) + ((gnu packages haskell-xyz) #:select (ghc-attoparsec + ghc-cassava + ghc-optparse-applicative)) ((guix build-system haskell) #:select (haskell-build-system)) ((guix git-download) #:select (git-predicate)) ((guix gexp) #:select (local-file)) @@ -18,7 +19,7 @@ #:recursive? #t #:select? (git-predicate %source-dir))) (build-system haskell-build-system) - (inputs (list ghc-aeson ghc-cassava ghc-optparse-applicative ghc-hunit)) + (inputs (list ghc-aeson ghc-attoparsec ghc-cassava ghc-optparse-applicative ghc-hunit)) (home-page "https://gitlab.liris.cnrs.fr/geode/ghc-geode") (synopsis "Data structures and tooling used in project GEODE") (description diff --git a/lib/GEODE/Metadata.hs b/lib/GEODE/Metadata.hs index 2d0dc5dc4300f3c47785543c9f5088b2cef8ae68..7f04717d26625f893837adc4a04c28be6c5f6381 100644 --- a/lib/GEODE/Metadata.hs +++ b/lib/GEODE/Metadata.hs @@ -7,15 +7,15 @@ module GEODE.Metadata , module SplitContext , module TSV , module TSV_Header + , module TSV_Join , module Types , module Work , groupBy - , indexBy , sortBy ) where import Data.Foldable as Foldable (toList) import Data.List (sortOn) -import Data.Map.Strict as Map (Map, alter, empty, insert, toList) +import Data.Map.Strict as Map (alter, empty, toList) import GEODE.Metadata.Contrastive as Contrastive import GEODE.Metadata.Entry as Entry ( Entry(headword, name, page), newEntry, normalize ) @@ -24,6 +24,7 @@ import GEODE.Metadata.Record as Record import GEODE.Metadata.SplitContext as SplitContext hiding (get, page, rank) import GEODE.Metadata.TSV as TSV import GEODE.Metadata.TSV.Header as TSV_Header +import GEODE.Metadata.TSV.Join as TSV_Join import GEODE.Metadata.Types as Types import GEODE.Metadata.Work as Work @@ -34,6 +35,3 @@ groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])] groupBy field = Map.toList . foldr group Map.empty where group article = Map.alter (Just . maybe [article] (article:)) (field article) - -indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a -indexBy f = foldr (\a -> Map.insert (f a) a) Map.empty diff --git a/lib/GEODE/Metadata/TSV/Join.hs b/lib/GEODE/Metadata/TSV/Join.hs new file mode 100644 index 0000000000000000000000000000000000000000..16ea69e883c9f5cd176b7239142d03bc57e6f6f7 --- /dev/null +++ b/lib/GEODE/Metadata/TSV/Join.hs @@ -0,0 +1,87 @@ +{-# LANGUAGE OverloadedStrings #-} +module GEODE.Metadata.TSV.Join + ( ColumnsMapping + , JoinContext(..) + , JoinMethod + , Renamers + , indexBy + , leftJoin + , rightJoin ) where + +import Data.Map as Map (Map, insert, lookup) +import Data.Set as Set (Set, intersection, member, singleton) +import Data.Text as Text (Text) +import Data.Vector as Vector + (Vector, cons, indexed, partitionWith, uncons, unsafeIndex, zipWith ) + +type Row = Vector Text +type Rows = Vector Row +type Document = (Row, Rows) +type ColumnsMapping = ([Text], [Text]) +type Renamers = (Text -> Text, Text -> Text) + +data JoinContext = JoinContext + { columns :: ColumnsMapping + , renamers :: Renamers } + +type JoinMethod = JoinContext -> Rows -> Rows -> Rows + +asSet :: (Foldable t, Ord a) => t a -> Set a +asSet = foldMap singleton + +indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a +indexBy f = foldr (\a -> Map.insert (f a) a) mempty + +select :: Vector Int -> Vector a -> Vector a +select mask values = unsafeIndex values <$> mask + +indexMasks :: [Text] -> Vector Text -> (Vector Int, Vector Int) +indexMasks columns = partitionWith inKeys . indexed + where + keys = asSet columns + inKeys (i, a) = (if a `member` keys then Left else Right) i + +renameHeader :: (Foldable t, Functor t) => Renamers -> t Text -> t Text -> (t Text, t Text) +renameHeader (leftRenamer, rightRenamer) left right = + (apply leftRenamer <$> left, apply rightRenamer <$> right) + where + common = intersection (asSet left) (asSet right) + apply r c = if c `member` common then r c else c + +filterRows :: Vector Int -> (Vector Int, Vector Int) -> Rows -> Rows -> Rows +filterRows mainKeyMask (sideKeyMask, sideValueMask) main side = + (maybe emptyCells (select sideValueMask) . match) <$> mainKeys + where + mainKeys = select mainKeyMask <$> main + optimize keys key | length main >= length side || key `member` keys = key + | otherwise = mempty + indexedSide = indexBy (optimize (asSet mainKeys) . select sideKeyMask) side + match = flip Map.lookup indexedSide + emptyCells = (\_ -> "") <$> sideValueMask + +joinInto :: JoinContext -> Maybe Document -> Document -> ((Row, Row), (Rows, Rows)) +joinInto _ Nothing (header, rows) = ((header, mempty), (rows, mempty)) +joinInto context (Just (sideHead, sideRows)) (mainHead, mainRows) = + (newHead, (mainRows, filterRows mainKeyMask sideMasks mainRows sideRows)) + where + (onLeft, onRight) = columns context + mainKeyMask = fst (indexMasks onLeft mainHead) + sideMasks@(_, valueMask) = indexMasks onRight sideHead + newHead = + renameHeader (renamers context) mainHead (select valueMask sideHead) + +leftJoin :: JoinMethod +leftJoin context left right = + build (joinInto context (Vector.uncons right) <$> Vector.uncons left) + where + build Nothing = mempty + build (Just ((mainHead, sideHead), (mainRows, sideRows))) = + Vector.cons (mainHead <> sideHead) (Vector.zipWith (<>) mainRows sideRows) + +rightJoin :: JoinMethod +rightJoin context left right = + build (joinInto context (Vector.uncons left) <$> Vector.uncons right) + where + build Nothing = mempty + build (Just ((mainHead, sideHead), (mainRows, sideRows))) = + Vector.cons (sideHead <> mainHead) (Vector.zipWith (<>) sideRows mainRows)