Skip to content
Snippets Groups Projects
Commit 022f78a7 authored by Alice Brenon's avatar Alice Brenon
Browse files

Adding executable tool (join) to the library

parent 60eb2c02
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module Join
( Config
, config
, run ) where
import Control.Applicative ((<|>), optional)
import Data.Attoparsec.Text (char, parseOnly, sepBy, takeWhile1)
import Data.Char (toLower)
import Data.Text as Text (Text, pack)
import Data.Text.Encoding (decodeUtf8)
import Data.Vector as Vector (toList)
import GEODE.Metadata
( ArticleRecord, ColumnsMapping, JoinContext(..), JoinMethod, ReadTSV(..)
, Renamers, WriteTSV(..), for, getHeader, leftJoin, rightJoin )
import Options.Applicative
( Parser, long, metavar, option, short, str, strArgument, strOption, value )
import System.Exit (die)
import Text.Printf (printf)
data Config = Config
{ leftTSV :: FilePath
, rightTSV :: FilePath
, outputTSV :: Maybe FilePath
, on :: Text
, how :: Maybe String
, renamePatterns :: Maybe String }
config :: Parser Config
config = Config
<$> strArgument (metavar "LEFT_TSV")
<*> strArgument (metavar "RIGHT_TSV")
<*> option (optional str) (long "output" <> short 'o' <> value Nothing)
<*> strOption (long "on" <> short 'O')
<*> (option (optional str) (long "how" <> short 'H' <> value Nothing))
<*> (option (optional str) (long "rename" <> short 'r' <> value Nothing))
dup :: Applicative f => a -> f (a, a)
dup a = pure (a, a)
parseColumns :: Text -> IO ColumnsMapping
parseColumns "ARTICLES" = pure (articleHeader, articleHeader)
where
articleHeader = decodeUtf8 <$>
Vector.toList (getHeader (for :: ArticleRecord))
parseColumns s = either die (pure . unzip) $ parseOnly columns s
where
columns = (mapping <|> (columnName >>= dup)) `sepBy` char ','
mapping = (,) <$> columnName <* char '=' <*> columnName
columnName = takeWhile1 (not . (`elem` ['=', ',']))
parseRenamers :: Maybe String -> IO Renamers
parseRenamers Nothing = dup id
parseRenamers (Just s) =
case break (== ',') s of
(left, ',':right) -> pure (Text.pack . printf left, Text.pack . printf right)
_ -> die "--rename option expects to comma-separated printf pattern (with only one '%s' in each)"
parseMethod :: Maybe String -> IO JoinMethod
parseMethod Nothing = pure leftJoin
parseMethod (Just s) = readMethod $ toLower <$> s
where
readMethod "left" = pure leftJoin
readMethod "right" = pure rightJoin
readMethod e = die ("Unknown join method '" ++ e ++ "'")
run :: Config -> IO ()
run (Config {leftTSV, rightTSV, outputTSV, how, on, renamePatterns}) = do
method <- parseMethod how
context <- JoinContext <$> parseColumns on <*> parseRenamers renamePatterns
(method context <$> (readTSV leftTSV) <*> (readTSV rightTSV))
>>= (maybe (writeTSV ()) writeTSV outputTSV)
{-# LANGUAGE ExplicitNamespaces #-}
import Control.Applicative ((<**>))
import qualified Join (Config, config, run)
import Options.Applicative
( Parser, command, execParser, fullDesc, helper, info, progDesc, subparser )
data Hammer = Join Join.Config
hammer :: Parser Hammer
hammer = subparser
(command "join" (info (Join <$> Join.config) $ progDesc "join two TSV files"))
main :: IO ()
main = execParser (info (hammer <**> helper) fullDesc) >>= run
where run (Join c) = Join.run c
cabal-version: 2.4
name: geode
version: 0.1.0.0
version: 0.2.1.0
synopsis:
Data structures and tooling used in project GEODE
......@@ -32,6 +32,7 @@ library
, GEODE.Metadata.Record
, GEODE.Metadata.TSV
, GEODE.Metadata.TSV.Header
, GEODE.Metadata.TSV.Join
, GEODE.Metadata.Types
, GEODE.Metadata.Work
......@@ -51,6 +52,23 @@ library
hs-source-dirs: lib
default-language: Haskell2010
executable hammer
main-is: Main.hs
other-modules: Join
-- other-extensions:
build-depends: attoparsec >= 0.14.4 && <0.15
, base
, cassava
, containers
, geode
, mtl
, optparse-applicative
, text
, vector
hs-source-dirs: app
ghc-options: -Wall
default-language: Haskell2010
test-suite ghc-geode-test
default-language: Haskell2010
type: exitcode-stdio-1.0
......
(use-modules ((gnu packages haskell-xyz) #:select (ghc-cassava
ghc-optparse-applicative))
((gnu packages haskell-check) #:select (ghc-hunit))
(use-modules ((gnu packages haskell-check) #:select (ghc-hunit))
((gnu packages haskell-web) #:select (ghc-aeson))
((gnu packages haskell-xyz) #:select (ghc-attoparsec
ghc-cassava
ghc-optparse-applicative))
((guix build-system haskell) #:select (haskell-build-system))
((guix git-download) #:select (git-predicate))
((guix gexp) #:select (local-file))
......@@ -18,7 +19,7 @@
#:recursive? #t
#:select? (git-predicate %source-dir)))
(build-system haskell-build-system)
(inputs (list ghc-aeson ghc-cassava ghc-optparse-applicative ghc-hunit))
(inputs (list ghc-aeson ghc-attoparsec ghc-cassava ghc-optparse-applicative ghc-hunit))
(home-page "https://gitlab.liris.cnrs.fr/geode/ghc-geode")
(synopsis "Data structures and tooling used in project GEODE")
(description
......
......@@ -7,15 +7,15 @@ module GEODE.Metadata
, module SplitContext
, module TSV
, module TSV_Header
, module TSV_Join
, module Types
, module Work
, groupBy
, indexBy
, sortBy ) where
import Data.Foldable as Foldable (toList)
import Data.List (sortOn)
import Data.Map.Strict as Map (Map, alter, empty, insert, toList)
import Data.Map.Strict as Map (alter, empty, toList)
import GEODE.Metadata.Contrastive as Contrastive
import GEODE.Metadata.Entry as Entry
( Entry(headword, name, page), newEntry, normalize )
......@@ -24,6 +24,7 @@ import GEODE.Metadata.Record as Record
import GEODE.Metadata.SplitContext as SplitContext hiding (get, page, rank)
import GEODE.Metadata.TSV as TSV
import GEODE.Metadata.TSV.Header as TSV_Header
import GEODE.Metadata.TSV.Join as TSV_Join
import GEODE.Metadata.Types as Types
import GEODE.Metadata.Work as Work
......@@ -34,6 +35,3 @@ groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])]
groupBy field = Map.toList . foldr group Map.empty
where
group article = Map.alter (Just . maybe [article] (article:)) (field article)
indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a
indexBy f = foldr (\a -> Map.insert (f a) a) Map.empty
{-# LANGUAGE OverloadedStrings #-}
module GEODE.Metadata.TSV.Join
( ColumnsMapping
, JoinContext(..)
, JoinMethod
, Renamers
, indexBy
, leftJoin
, rightJoin ) where
import Data.Map as Map (Map, insert, lookup)
import Data.Set as Set (Set, intersection, member, singleton)
import Data.Text as Text (Text)
import Data.Vector as Vector
(Vector, cons, indexed, partitionWith, uncons, unsafeIndex, zipWith )
type Row = Vector Text
type Rows = Vector Row
type Document = (Row, Rows)
type ColumnsMapping = ([Text], [Text])
type Renamers = (Text -> Text, Text -> Text)
data JoinContext = JoinContext
{ columns :: ColumnsMapping
, renamers :: Renamers }
type JoinMethod = JoinContext -> Rows -> Rows -> Rows
asSet :: (Foldable t, Ord a) => t a -> Set a
asSet = foldMap singleton
indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a
indexBy f = foldr (\a -> Map.insert (f a) a) mempty
select :: Vector Int -> Vector a -> Vector a
select mask values = unsafeIndex values <$> mask
indexMasks :: [Text] -> Vector Text -> (Vector Int, Vector Int)
indexMasks columns = partitionWith inKeys . indexed
where
keys = asSet columns
inKeys (i, a) = (if a `member` keys then Left else Right) i
renameHeader :: (Foldable t, Functor t) => Renamers -> t Text -> t Text -> (t Text, t Text)
renameHeader (leftRenamer, rightRenamer) left right =
(apply leftRenamer <$> left, apply rightRenamer <$> right)
where
common = intersection (asSet left) (asSet right)
apply r c = if c `member` common then r c else c
filterRows :: Vector Int -> (Vector Int, Vector Int) -> Rows -> Rows -> Rows
filterRows mainKeyMask (sideKeyMask, sideValueMask) main side =
(maybe emptyCells (select sideValueMask) . match) <$> mainKeys
where
mainKeys = select mainKeyMask <$> main
optimize keys key | length main >= length side || key `member` keys = key
| otherwise = mempty
indexedSide = indexBy (optimize (asSet mainKeys) . select sideKeyMask) side
match = flip Map.lookup indexedSide
emptyCells = (\_ -> "") <$> sideValueMask
joinInto :: JoinContext -> Maybe Document -> Document -> ((Row, Row), (Rows, Rows))
joinInto _ Nothing (header, rows) = ((header, mempty), (rows, mempty))
joinInto context (Just (sideHead, sideRows)) (mainHead, mainRows) =
(newHead, (mainRows, filterRows mainKeyMask sideMasks mainRows sideRows))
where
(onLeft, onRight) = columns context
mainKeyMask = fst (indexMasks onLeft mainHead)
sideMasks@(_, valueMask) = indexMasks onRight sideHead
newHead =
renameHeader (renamers context) mainHead (select valueMask sideHead)
leftJoin :: JoinMethod
leftJoin context left right =
build (joinInto context (Vector.uncons right) <$> Vector.uncons left)
where
build Nothing = mempty
build (Just ((mainHead, sideHead), (mainRows, sideRows))) =
Vector.cons (mainHead <> sideHead) (Vector.zipWith (<>) mainRows sideRows)
rightJoin :: JoinMethod
rightJoin context left right =
build (joinInto context (Vector.uncons left) <$> Vector.uncons right)
where
build Nothing = mempty
build (Just ((mainHead, sideHead), (mainRows, sideRows))) =
Vector.cons (sideHead <> mainHead) (Vector.zipWith (<>) sideRows mainRows)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment