diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..283f8c996ab97f8f5a9085826ca7647e4499c714 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.o +*.hi +Main diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f94602a1bff63568dd70ff6cd59f2ad350eace5..230f79a6f3e1c2e4081c9241ad2eb854d42f5153 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Revision history for ghc-geode +## 0.2.0.0 -- 2023-07-17 + +* Add unit tests +* Add optparse-applicative parsers for input / output parameters +* Rework metadata types and the way they compose (splitting Article into + PrimaryKey and Entry) + ## 0.1.0.0 -- 2023-05-23 * Exposing some basic stuff required by recent changes in [EDdA-Clinic](https://gitlab.huma-num.fr/alicebrenon/EDdAClinic) diff --git a/geode.cabal b/geode.cabal new file mode 100644 index 0000000000000000000000000000000000000000..030f41e311130398eb2a706963a5e30bb94ebe4d --- /dev/null +++ b/geode.cabal @@ -0,0 +1,67 @@ +cabal-version: 2.4 +name: geode +version: 0.1.0.0 +synopsis: + Data structures and tooling used in project GEODE + +-- A longer description of the package. +-- description: +homepage: https://geode-project.github.io/ + +-- A URL where users can report bugs. +-- bug-reports: +license: GPL-3.0-or-later +license-file: LICENSE +author: Alice BRENON +maintainer: alice.brenon@ens-lyon.fr + +-- A copyright notice. +-- copyright: +-- category: +extra-source-files: CHANGELOG.md + +library + exposed-modules: GEODE.Metadata + , GEODE.Metadata.Entry + , GEODE.Metadata.SplitContext + , GEODE.Metadata.TSV.Header + , GEODE.Options + + -- Modules included in this library but not exported. + other-modules: GEODE.Metadata.Contrastive + , GEODE.Metadata.PrimaryKey + , GEODE.Metadata.TSV + , GEODE.Metadata.Types + + -- LANGUAGE extensions used by modules in this package. + -- other-extensions: + build-depends: base >=4.16 && <4.17 + , bytestring >= 0.11.3 && <0.12 + , containers >= 0.6.5.1 && <0.7 + , cassava >= 0.5.3 && <0.6 + , filepath >= 1.4.2.2 && <1.5 + , mtl >= 2.2.2 && <2.3 + , optparse-applicative >= 0.13.2 && <0.18 + , text >= 1.2.5 && <1.3 + , unordered-containers >= 0.2.19.1 && <0.3 + , vector >= 0.12.3.1 && <0.13 + hs-source-dirs: lib + default-language: Haskell2010 + +test-suite ghc-geode-test + default-language: Haskell2010 + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: Main.hs + other-modules: GEODE.Metadata.TestEntry + , GEODE.Metadata.TestPrimaryKey + , Test.HUnit.Extra + build-depends: base + , bytestring + , containers + , cassava + , geode + , mtl + , HUnit >= 1.6.2.0 && <1.7 + , text + , unordered-containers diff --git a/ghc-geode.cabal b/ghc-geode.cabal deleted file mode 100644 index 40f21705208347f5296f22862759d98b94f849f5..0000000000000000000000000000000000000000 --- a/ghc-geode.cabal +++ /dev/null @@ -1,40 +0,0 @@ -cabal-version: 2.4 -name: ghc-geode -version: 0.1.0.0 -synopsis: - Data structures and tooling used in project GEODE - --- A longer description of the package. --- description: -homepage: https://geode-project.github.io/ - --- A URL where users can report bugs. --- bug-reports: -license: GPL-3.0-or-later -license-file: LICENSE -author: Alice BRENON -maintainer: alice.brenon@ens-lyon.fr - --- A copyright notice. --- copyright: --- category: -extra-source-files: CHANGELOG.md - -library - exposed-modules: GEODE.Metadata - - -- Modules included in this library but not exported. - other-modules: GEODE.Metadata.Article - , GEODE.Metadata.Projector - , GEODE.Metadata.Types - - -- LANGUAGE extensions used by modules in this package. - -- other-extensions: - build-depends: base >=4.16 && <4.17 - , bytestring >= 0.11.3 && <0.12 - , containers >= 0.6.5.1 && <0.7 - , cassava >= 0.5.3 && <0.6 - , text >= 1.2.5 && <1.3 - , vector >= 0.12.3.1 && <0.13 - hs-source-dirs: lib - default-language: Haskell2010 diff --git a/guix.scm b/guix.scm index 7ff7bd19d027993fc4aed76a7015a4102796fb6f..26f2204f9f30b349f1881f295fd32589d0e5288e 100644 --- a/guix.scm +++ b/guix.scm @@ -1,4 +1,6 @@ -(use-modules ((gnu packages haskell-xyz) #:select (ghc-cassava)) +(use-modules ((gnu packages haskell-xyz) #:select (ghc-cassava + ghc-optparse-applicative)) + ((gnu packages haskell-check) #:select (ghc-hunit)) ((guix build-system haskell) #:select (haskell-build-system)) ((guix git-download) #:select (git-predicate)) ((guix gexp) #:select (local-file)) @@ -9,16 +11,16 @@ ((%source-dir (dirname (current-filename)))) (package (name "ghc-geode") - (version "0.1.0.0") + (version "devel") (source (local-file %source-dir #:recursive? #t #:select? (git-predicate %source-dir))) (build-system haskell-build-system) - (inputs (list ghc-cassava)) + (inputs (list ghc-cassava ghc-optparse-applicative ghc-hunit)) (home-page "https://gitlab.liris.cnrs.fr/geode/ghc-geode") (synopsis "Data structures and tooling used in project GEODE") (description - "Reads the files as streams of markup events to allow altering the file - content on the fly and fixing broken files which aren't proper XML") + "Library providing a representation for corpus metadata and primitives to +define command-line tools to process them.") (license gpl3+))) diff --git a/lib/GEODE/Metadata.hs b/lib/GEODE/Metadata.hs index f71e9704c7d1538f6447400151999f76b04166ec..d07145b197b50f89ae72d6822e94893324ff23d5 100644 --- a/lib/GEODE/Metadata.hs +++ b/lib/GEODE/Metadata.hs @@ -1,59 +1,28 @@ -{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} -module GEODE.Metadata ( - Article(..) - , Authors(..) - , Book(..) - , Domains(..) - , FromBook(..) - , HasAuthors(..) - , HasDomains(..) - , InFile(..) - , TXMText - , Unique(..) - , groupBy - , list - , readTsv - , sortBy - , tsvFile - , tsvLines - ) where +{-# LANGUAGE OverloadedStrings #-} +module GEODE.Metadata + ( module Contrastive + , module Entry + , module PrimaryKey + , module SplitContext + , module TSV + , module TSV_Header + , module Types + , groupBy + , indexBy + , sortBy ) where -import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile) -import Data.ByteString.Char8 as StrictByteString (pack) -import Data.Csv - ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..) - , ToNamedRecord, ToRecord, decodeWith, defaultEncodeOptions, encodeByNameWith - , encodeWith, header ) import Data.Foldable as Foldable (toList) import Data.List (sortOn) -import Data.Map.Strict as Map (alter, empty, toList) -import Data.Text as Text (Text, intercalate, unpack) -import Data.Vector as Vector (Vector) -import GEODE.Metadata.Article -import GEODE.Metadata.Projector - (FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), TXMText, Unique(..)) -import GEODE.Metadata.Types (Authors(..), Book(..), Domains(..)) - -list :: [Text] -> String -list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":" - -readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a)) -readTsv source = decodeWith fromTsv HasHeader <$> ByteString.readFile source - where - fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} - -toTsv :: EncodeOptions -toTsv = defaultEncodeOptions - { encDelimiter = fromIntegral (fromEnum '\t') - , encUseCrLf = False } - -tsvFile :: ToNamedRecord a => FilePath -> [String] -> [a] -> IO () -tsvFile target fields = - ByteString.writeFile target - . encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields) - -tsvLines :: ToRecord a => [a] -> IO () -tsvLines = ByteString.putStr . encodeWith toTsv +import Data.Map.Strict as Map (Map, alter, empty, insert, toList) +import GEODE.Metadata.Contrastive as Contrastive +import GEODE.Metadata.Entry as Entry + ( Entry(headWord, name, page), newEntry, normalize ) +import GEODE.Metadata.PrimaryKey as PrimaryKey +import GEODE.Metadata.SplitContext as SplitContext hiding (get, page, rank) +import GEODE.Metadata.TSV as TSV +import GEODE.Metadata.TSV.Header as TSV_Header + ( DefaultFields(..), HasDefaultHeader(..), WithDefaultHeader(..), glue ) +import GEODE.Metadata.Types as Types sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a] sortBy field = sortOn field . Foldable.toList @@ -62,3 +31,6 @@ groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])] groupBy field = Map.toList . foldr group Map.empty where group article = Map.alter (Just . maybe [article] (article:)) (field article) + +indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a +indexBy f = foldr (\a -> Map.insert (f a) a) Map.empty diff --git a/lib/GEODE/Metadata/Contrastive.hs b/lib/GEODE/Metadata/Contrastive.hs new file mode 100644 index 0000000000000000000000000000000000000000..69d9c48fe23701bb84d55dd60662d1141d3f3c74 --- /dev/null +++ b/lib/GEODE/Metadata/Contrastive.hs @@ -0,0 +1,46 @@ +{-# LANGUAGE DeriveGeneric, OverloadedStrings #-} +module GEODE.Metadata.Contrastive + ( Contrastive(..) + , MultiText(..) + , formatList ) where + +import Data.Csv + ( FromField(..), FromNamedRecord(..), ToNamedRecord(..), ToField(..) ) +import Data.Text (Text, intercalate, splitOn, uncons, unsnoc) +import GEODE.Metadata.TSV.Header (DefaultFields(..), HasDefaultHeader(..)) +import GHC.Generics (Generic) + +newtype MultiText = MultiText + { getList :: [Text] } deriving (Show) + +formatList :: MultiText -> Text +formatList = colonFormat . getList + where + colonFormat ts = ":" <> intercalate ":" ts <> ":" + +instance FromField MultiText where + parseField f = parseField f >>= checkAndSplit + where + popBoundaries t0 = do + (firstChar,t1) <- uncons t0 + (middle,lastChar) <- unsnoc t1 + pure (firstChar,middle,lastChar) + checkAndSplit t = + case popBoundaries t of + Just (':',"",':') -> pure $ MultiText [] + Just (':',fields,':') -> pure.MultiText $ splitOn ":" fields + _ -> mempty + +instance ToField MultiText where + toField = toField . formatList + +data Contrastive = Contrastive + { authors :: MultiText + , domains :: MultiText + , subCorpus :: MultiText } deriving (Generic, Show) + +instance FromNamedRecord Contrastive +instance ToNamedRecord Contrastive + +instance HasDefaultHeader Contrastive where + defaultFields = DefaultFields [ "authors", "domains", "subCorpus" ] diff --git a/lib/GEODE/Metadata/Entry.hs b/lib/GEODE/Metadata/Entry.hs new file mode 100644 index 0000000000000000000000000000000000000000..c67ea4b946d492e99ab6f75c42ce7fcf80e64671 --- /dev/null +++ b/lib/GEODE/Metadata/Entry.hs @@ -0,0 +1,40 @@ +{-# LANGUAGE DeriveGeneric, FlexibleContexts, OverloadedStrings #-} +module GEODE.Metadata.Entry + ( Entry(..) + , newEntry + , normalize ) where + +import Data.Char (isAlphaNum, isSpace, isUpper, toLower) +import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) +import Data.Text as Text (Text, concat, foldl', pack, snoc) +import GEODE.Metadata.SplitContext (Field(..), SplitContext(..), next) +import GEODE.Metadata.TSV.Header + (DefaultFields(..), HasDefaultHeader(..), HasDefaultHeader(..)) +import GHC.Generics (Generic) + +data Entry = Entry + { headWord :: Text + , name :: Text + , page :: Int } deriving (Generic, Show) + +instance FromNamedRecord Entry +instance ToNamedRecord Entry + +instance HasDefaultHeader Entry where + defaultFields = DefaultFields [ "headWord", "name", "page" ] + +normalize :: Text -> Text +normalize = Text.foldl' appendIf mempty + where + appendIf tmpText newChar + | isSpace newChar = tmpText + | isUpper newChar = tmpText `snoc` toLower newChar + | isAlphaNum newChar = tmpText `snoc` newChar + | otherwise = tmpText `snoc` '-' + +newEntry :: SplitContext m => Text -> m Entry +newEntry headWord = do + count <- Text.pack . show <$> next (HeadWord prefix) + Entry headWord (Text.concat [prefix, "-", count]) <$> get Page + where + prefix = normalize headWord diff --git a/lib/GEODE/Metadata/PrimaryKey.hs b/lib/GEODE/Metadata/PrimaryKey.hs new file mode 100644 index 0000000000000000000000000000000000000000..037c4a5d126149179f058a87a14bbcd6102d2961 --- /dev/null +++ b/lib/GEODE/Metadata/PrimaryKey.hs @@ -0,0 +1,46 @@ +{-# LANGUAGE DeriveGeneric, FlexibleContexts, FlexibleInstances, NamedFieldPuns, OverloadedStrings #-} +module GEODE.Metadata.PrimaryKey + ( Book(..) + , PrimaryKey(..) + , relativePath + , uid ) where + +import Data.Csv (FromField(..), FromNamedRecord(..), ToNamedRecord(..), ToField(..)) +import GEODE.Metadata.Types (Has(..)) +import GEODE.Metadata.TSV.Header (DefaultFields(..), HasDefaultHeader(..)) +import GHC.Generics (Generic) +import System.FilePath ((</>), (<.>)) +import Text.Printf (printf) + +data Book = EDdA | LGE | Wikipedia deriving (Eq, Ord, Read, Show) + +instance FromField Book where + parseField "EDdA" = pure EDdA + parseField "LGE" = pure LGE + parseField "Wikipedia" = pure Wikipedia + parseField _ = mempty + +instance ToField Book where + toField = toField . show + +data PrimaryKey = PrimaryKey + { book :: Book + , tome :: Int + , rank :: Int } deriving (Eq, Ord, Generic, Show) + +instance FromNamedRecord PrimaryKey +instance ToNamedRecord PrimaryKey + +uid :: Has PrimaryKey a => a -> String +uid a = printf "%s_%d_%d" (show $ book) tome rank + where + PrimaryKey {book, tome, rank} = get a + +relativePath :: Has PrimaryKey a => a -> String -> FilePath +relativePath a extension = + (show book) </> ("T" <> show tome) </> (show rank) <.> extension + where + PrimaryKey {book, tome, rank} = get a + +instance HasDefaultHeader PrimaryKey where + defaultFields = DefaultFields [ "book", "tome", "rank" ] diff --git a/lib/GEODE/Metadata/Projector.hs b/lib/GEODE/Metadata/Projector.hs index f70547c137461ba60fd0d338d8ab81732c9f2daf..3ea0b5ceb12fa1d26616ac086ecaa99252c93b8a 100644 --- a/lib/GEODE/Metadata/Projector.hs +++ b/lib/GEODE/Metadata/Projector.hs @@ -1,21 +1,32 @@ {-# LANGUAGE ConstraintKinds #-} module GEODE.Metadata.Projector - ( FromBook(..) - , HasAuthors(..) - , HasDomains(..) - , InFile(..) - , TXMText - , Unique(..) ) where - -import GEODE.Metadata.Types (Authors(..), Book, Domains(..)) + --( FromBook(..) + --, FromTome(..) + ( HasAuthors(..) + , HasDomains(..) ) where + --, InFile(..) + --, Named(..) + --, TXMText + --, Unique(..) ) where + +import GEODE.Metadata.Types (Authors(..), Domains(..)) +--import GEODE.Metadata.Types (Authors(..), Book, Domains(..)) import Data.Text (Text) +{- class Unique a where uid :: a -> String class FromBook a where book :: a -> Book +class FromTome a where + tome :: a -> Int + +class Named a where + name :: a -> Text +-} + class HasAuthors a where authors_ :: a -> Authors @@ -28,7 +39,9 @@ class HasDomains a where domains :: a -> [Text] domains = getDomains . domains_ +{- class InFile a where - relativePath :: a -> FilePath + relativePath :: a -> String -> FilePath +-} -type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a) +--type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a) diff --git a/lib/GEODE/Metadata/SplitContext.hs b/lib/GEODE/Metadata/SplitContext.hs new file mode 100644 index 0000000000000000000000000000000000000000..432357889d3db549abbcdbc59c12ed6ebe30090e --- /dev/null +++ b/lib/GEODE/Metadata/SplitContext.hs @@ -0,0 +1,95 @@ +{-# LANGUAGE DeriveFunctor, GADTs #-} +module GEODE.Metadata.SplitContext + ( Current(..) + , Field(..) + , HeadWords + , SplitContext(..) + , SplitContextT(..) + , evalSplit + , evalSplitT + , next + , runSplit + , runSplitT ) where + +import Control.Monad.Identity (Identity(..)) +import Control.Monad.IO.Class (MonadIO(..)) +import Control.Monad.Trans (MonadTrans(..)) +import Data.Map as Map (Map, (!?), delete, empty, insert) +import Data.Text (Text) + +type HeadWords = Map Text Int +data Current = Current { page :: Int, rank :: Int, headWords :: HeadWords } +data Field a where + Page :: Field Int + Rank :: Field Int + HeadWord :: Text -> Field (Maybe Int) + +class Monad m => SplitContext m where + context :: m Current + update :: (Current -> Current) -> m () + + get :: Field a -> m a + get Page = page <$> context + get Rank = rank <$> context + get (HeadWord h) = ((!? h) . headWords) <$> context + + set :: Field a -> a -> m () + set Page p = update $ \c -> c { page = p } + set Rank r = update $ \c -> c { rank = r } + set (HeadWord h) Nothing = update $ \c -> c { headWords = delete h $ headWords c } + set (HeadWord h) (Just n) = update $ \c -> c { headWords = insert h n $ headWords c } + + editLookup :: Field a -> (a -> a) -> m a + editLookup field f = get field >>= (\a -> set field a *> pure a) . f + + lookupEdit :: Field a -> (a -> a) -> m a + lookupEdit field f = get field >>= \a -> set field (f a) *> pure a + + edit :: Field a -> (a -> a) -> m () + edit field = (() <$) . editLookup field + + {-# MINIMAL context, update #-} + +newtype SplitContextT m a = + SplitContextT { runWithContext :: Current -> m (a, Current) } + deriving (Functor) + +instance Monad m => Applicative (SplitContextT m) where + pure a = SplitContextT $ \current -> pure (a, current) + sCF <*> sCA = SplitContextT $ \current0 -> do + (f, current1) <- runWithContext sCF current0 + (a, current2) <- runWithContext sCA current1 + pure (f a, current2) + +instance Monad m => Monad (SplitContextT m) where + sCA >>= f = SplitContextT $ \current0 -> do + (a, current1) <- runWithContext sCA current0 + runWithContext (f a) current1 + +instance Monad m => SplitContext (SplitContextT m) where + context = SplitContextT $ \current -> pure (current, current) + update f = SplitContextT $ \current -> pure ((), f current) + +instance MonadTrans SplitContextT where + lift m = SplitContextT $ \current -> m >>= \a -> pure (a, current) + +instance MonadIO m => MonadIO (SplitContextT m) where + liftIO = lift . liftIO + +next :: SplitContext m => Field a -> m Int +next f@(HeadWord _) = lookupEdit f (Just . maybe 1 (+1)) >>= pure . maybe 0 id +next Page = lookupEdit Page (+1) +next Rank = lookupEdit Rank (+1) + +runSplitT :: SplitContextT m a -> m (a, Current) +runSplitT = flip runWithContext $ + Current { page = 1, rank = 1, headWords = Map.empty } + +evalSplitT :: Functor m => SplitContextT m a -> m a +evalSplitT = fmap fst . runSplitT + +runSplit :: SplitContextT Identity a -> (a, Current) +runSplit = runIdentity . runSplitT + +evalSplit :: SplitContextT Identity a -> a +evalSplit = fst . runSplit diff --git a/lib/GEODE/Metadata/TSV.hs b/lib/GEODE/Metadata/TSV.hs new file mode 100644 index 0000000000000000000000000000000000000000..0b5c6719be8252a9c5def25060b80d57607a08ee --- /dev/null +++ b/lib/GEODE/Metadata/TSV.hs @@ -0,0 +1,41 @@ +{-# LANGUAGE ScopedTypeVariables #-} +module GEODE.Metadata.TSV + ( readNamedTsv + , readTsv + , toTsv + , tsvFile + , tsvLines ) where + +import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile) +import Data.Csv + ( DecodeOptions(..), EncodeOptions(..), FromNamedRecord(..), FromRecord + , HasHeader(..), ToNamedRecord(..), ToRecord(..), decodeByNameWith + , decodeWith, defaultEncodeOptions, encodeByNameWith, encodeWith ) +import Data.Foldable (toList) +import Data.Vector (Vector) +import GEODE.Metadata.TSV.Header (HasDefaultHeader, getHeader, for) + +readNamedTsv :: FromNamedRecord a => FilePath -> IO (Either String (Vector a)) +readNamedTsv source = + (fmap snd . decodeByNameWith fromTsv) <$> ByteString.readFile source + where + fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} + +readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a)) +readTsv source = decodeWith fromTsv NoHeader <$> ByteString.readFile source + where + fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} + +toTsv :: EncodeOptions +toTsv = defaultEncodeOptions + { encDelimiter = fromIntegral (fromEnum '\t') + , encUseCrLf = False } + +tsvFile :: forall a t. (Foldable t, HasDefaultHeader a, ToNamedRecord a) => + FilePath -> t a -> IO () +tsvFile target = ByteString.writeFile target . encode . toList + where + encode = encodeByNameWith toTsv (getHeader (for :: a)) + +tsvLines :: (Foldable t, ToRecord a) => t a -> IO () +tsvLines = ByteString.putStr . encodeWith toTsv . toList diff --git a/lib/GEODE/Metadata/TSV/Header.hs b/lib/GEODE/Metadata/TSV/Header.hs new file mode 100644 index 0000000000000000000000000000000000000000..6e084e628230f23e007fe6d9d711ab2f0a7d2b04 --- /dev/null +++ b/lib/GEODE/Metadata/TSV/Header.hs @@ -0,0 +1,39 @@ +{-# LANGUAGE ExplicitNamespaces, ScopedTypeVariables, TypeOperators #-} +module GEODE.Metadata.TSV.Header + ( DefaultFields(..) + , HasDefaultHeader(..) + , WithDefaultHeader(..) + , for + , getHeader + , glue ) where + +import Data.ByteString.Char8 as StrictByteString (pack) +import Data.Csv (Header, ToNamedRecord(..), ToRecord(..)) +import Data.HashMap.Strict ((!)) +import Data.Vector (fromList) +import GEODE.Metadata.Types (type (@)(..)) + +newtype WithDefaultHeader a = WithDefaultHeader a +newtype DefaultFields a = DefaultFields [String] +class HasDefaultHeader a where + defaultFields :: DefaultFields a + +instance (HasDefaultHeader a, HasDefaultHeader b) => HasDefaultHeader (a @ b) where + defaultFields = DefaultFields (a ++ b) + where + DefaultFields a = (defaultFields :: DefaultFields a) + DefaultFields b = (defaultFields :: DefaultFields b) + +getHeader :: forall a. HasDefaultHeader a => a -> Header +getHeader _ = StrictByteString.pack <$> fromList fields + where + DefaultFields fields = (defaultFields :: DefaultFields a) + +for :: HasDefaultHeader a => a +for = undefined + +instance (HasDefaultHeader a, ToNamedRecord a) => ToRecord (WithDefaultHeader a) where + toRecord (WithDefaultHeader a) = (toNamedRecord a !) <$> getHeader a + +glue :: a -> b -> WithDefaultHeader (a @ b) +glue a b = WithDefaultHeader (a :@: b) diff --git a/lib/GEODE/Metadata/Types.hs b/lib/GEODE/Metadata/Types.hs index 5e43d627af505e4947b5bf7f3111ce523f3d0dee..362be18e9626830368a66fe8879a155d5f825747 100644 --- a/lib/GEODE/Metadata/Types.hs +++ b/lib/GEODE/Metadata/Types.hs @@ -1,38 +1,29 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExplicitNamespaces, FlexibleInstances, MultiParamTypeClasses, TypeOperators #-} module GEODE.Metadata.Types - ( Authors(..) - , Book(..) - , Domains(..) ) where - -import Control.Applicative (empty) -import Data.Csv (Field, FromField(..), Parser, ToField(..)) -import Data.Text (Text, intercalate, splitOn) - -data Book = EDdA | LGE | Wikipedia deriving (Eq, Ord, Read, Show) - -newtype Authors = Authors - { getAuthors :: [Text] } deriving Show -newtype Domains = Domains - { getDomains :: [Text] } deriving Show - -instance FromField Book where - parseField "EDdA" = pure EDdA - parseField "LGE" = pure LGE - parseField "Wikipedia" = pure Wikipedia - parseField _ = empty -instance ToField Book where - toField = toField . show - -sepBy :: Text -> Field -> Parser [Text] -sepBy s = fmap (splitOn s) . parseField - -instance FromField Authors where - parseField = fmap Authors . sepBy " & " -instance ToField Authors where - toField = toField . intercalate " & ". getAuthors - -instance FromField Domains where - parseField = fmap Domains . sepBy " | " -instance ToField Domains where - toField = toField . intercalate " | ". getDomains + ( Has(..) + , type (@)(..) ) where +import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) +import Data.HashMap.Strict (union) + +infixr 9 @ +infixr 9 :@: +data a @ b = a :@: b + +class Has a b where + get :: b -> a + +instance Has a a where + get = id + +instance Has a c => Has a (b @ c) where + get (_ :@: c) = get c + +instance {-# OVERLAPS #-} Has a (a @ b) where + get (a :@: _) = a + +instance (ToNamedRecord a, ToNamedRecord b) => ToNamedRecord (a @ b) where + toNamedRecord (a :@: b) = union (toNamedRecord a) (toNamedRecord b) + +instance (FromNamedRecord a, FromNamedRecord b) => FromNamedRecord (a @ b) where + parseNamedRecord nr = (:@:) <$> parseNamedRecord nr <*> parseNamedRecord nr diff --git a/lib/GEODE/Options.hs b/lib/GEODE/Options.hs new file mode 100644 index 0000000000000000000000000000000000000000..66d79499d3291889a3faae37aaa693d9520bd6d2 --- /dev/null +++ b/lib/GEODE/Options.hs @@ -0,0 +1,55 @@ +{-# LANGUAGE NamedFieldPuns #-} +module GEODE.Options + ( Input(..) + , IOConfig(..) + , Output(..) + , input + , ioConfig + , output + , xmlOutput ) where + +import Control.Applicative ((<|>)) +import Data.List.NonEmpty (NonEmpty(..)) +import Options.Applicative + ( Parser, argument, flag', help, long, metavar, short, str, strOption, value ) +import Options.Applicative.NonEmpty (some1) +import Text.Printf (printf) + +data Input = StdIn | File FilePath + +input :: String -> Parser Input +input stdinSemantics = + argument + (File <$> str) + ( value StdIn + <> metavar "INPUT_FILE" + <> help (printf "path of the file to process (%s)" stdinSemantics) ) + +data Output = Metadata | TextRoot FilePath | XMLRoot FilePath + +xmlOutput :: Parser FilePath +xmlOutput = + strOption + ( long "xml-root" <> metavar "DIRECTORY" <> short 'x' + <> help "Root path where to output XML files" ) + +output :: Parser Output +output = + flag' Metadata ( long "metadata" + <> short 'm' + <> help "Print metadata for splitted files on stdout" ) + <|> (TextRoot <$> strOption + ( long "text-root" + <> metavar "DIRECTORY" + <> short 't' + <> help "Root path where to output text files" )) + <|> (XMLRoot <$> xmlOutput) + +data IOConfig = IOConfig + { from :: Input + , to :: NonEmpty Output } + +ioConfig :: String -> Parser IOConfig +ioConfig stdinSemantics = IOConfig + <$> input stdinSemantics + <*> some1 output diff --git a/test/GEODE/Metadata/TestEntry.hs b/test/GEODE/Metadata/TestEntry.hs new file mode 100644 index 0000000000000000000000000000000000000000..bc85c1a676c0d1de59bd5076886678a0fc543850 --- /dev/null +++ b/test/GEODE/Metadata/TestEntry.hs @@ -0,0 +1,47 @@ +{-# LANGUAGE OverloadedStrings #-} +module GEODE.Metadata.TestEntry (testEntry) where + +import Data.Csv (ToNamedRecord(..)) +import Data.Foldable (toList) +import Data.HashMap.Strict ((!?)) +import Data.Text (dropEnd) +import GEODE.Metadata (evalSplit, newEntry, normalize) +import GEODE.Metadata.Entry (Entry(..)) +import GEODE.Metadata.TSV.Header (getHeader, for) +import Test.HUnit (Test(..), (~?=)) +import Test.HUnit.Extra (isJust) + +testEntry :: Test +testEntry = TestLabel "Testing the Entry data type" $ + TestList [ testToNamedRecord, testNormalize, testEntryConstructor ] + +testToNamedRecord :: Test +testToNamedRecord = TestLabel "Testing ToNamedRecord instance" $ + TestList [ has3Keys, validDefaultHeader ] + where + has3Keys = length aNamedRecord ~?= 3 + validDefaultHeader = TestList . toList $ + (isJust . (aNamedRecord !?)) <$> getHeader (for :: Entry) + aNamedRecord = toNamedRecord anEntry + +testNormalize :: Test +testNormalize = TestLabel "Testing function normalize" . TestList $ + check <$> [ ("", "") + , ("é", "é") + , (headWord anEntry, dropEnd 2 $ name anEntry) ] + where + check (a, b) = normalize a ~?= b + +testEntryConstructor :: Test +testEntryConstructor = TestLabel "Testing the entry constructor" . TestList $ + [ a0 ~?= "a-0" + , a1 ~?= "a-1" + , b0 ~?= "b-0" ] + where + (a0, a1, b0) = evalSplit $ (,,) + <$> (name <$> newEntry "A") + <*> (name <$> newEntry "A") + <*> (name <$> newEntry "B") + +anEntry :: Entry +anEntry = Entry "ALCALA DE HÉNARÈS" "alcaladehénarès-0" 1212 diff --git a/test/GEODE/Metadata/TestPrimaryKey.hs b/test/GEODE/Metadata/TestPrimaryKey.hs new file mode 100644 index 0000000000000000000000000000000000000000..62c9662db9ead2e23e8f4fc9075d63c254c97af5 --- /dev/null +++ b/test/GEODE/Metadata/TestPrimaryKey.hs @@ -0,0 +1,26 @@ +{-# LANGUAGE OverloadedStrings #-} +module GEODE.Metadata.TestPrimaryKey (testPrimaryKey) where + +import Data.Csv (ToNamedRecord(..)) +import Data.Foldable (toList) +import Data.HashMap.Strict ((!?)) +import GEODE.Metadata (Book(..), PrimaryKey(..)) +import GEODE.Metadata.TSV.Header (getHeader, for) +import Test.HUnit (Test(..), (~?=)) +import Test.HUnit.Extra (isJust) + +testPrimaryKey :: Test +testPrimaryKey = TestLabel "Testing the PrimaryKey data type" $ + TestList [ testToNamedRecord ] + +testToNamedRecord :: Test +testToNamedRecord = TestLabel "Testing ToNamedRecord instance" $ + TestList [ has3Keys, validDefaultHeader ] + where + has3Keys = length aNamedRecord ~?= 3 + validDefaultHeader = TestList . toList $ + (isJust . (aNamedRecord !?)) <$> getHeader (for :: PrimaryKey) + aNamedRecord = toNamedRecord aPrimaryKey + +aPrimaryKey :: PrimaryKey +aPrimaryKey = PrimaryKey LGE 1 1212 -- ALCALA DE HÉNARÈS diff --git a/test/Main.hs b/test/Main.hs new file mode 100644 index 0000000000000000000000000000000000000000..5aa6b4dc2c1d8aaefce5c99a6c346ebf47ef6e1c --- /dev/null +++ b/test/Main.hs @@ -0,0 +1,12 @@ +module Main (main) where + +import Test.HUnit (Test(..), runTestTTAndExit) +import GEODE.Metadata.TestPrimaryKey (testPrimaryKey) +import GEODE.Metadata.TestEntry (testEntry) + +testMetadata :: Test +testMetadata = TestLabel "Metadata suite" $ + TestList [ testPrimaryKey, testEntry ] + +main :: IO () +main = runTestTTAndExit testMetadata diff --git a/test/Test/HUnit/Extra.hs b/test/Test/HUnit/Extra.hs new file mode 100644 index 0000000000000000000000000000000000000000..046c3faf0b34ddb2b4d5ebd3d0b4d2ac1864b35a --- /dev/null +++ b/test/Test/HUnit/Extra.hs @@ -0,0 +1,12 @@ +module Test.HUnit.Extra + ( isJust + , isNothing ) where + +import Test.HUnit (Test(..), assertBool) +import qualified Data.Maybe as Maybe (isJust, isNothing) + +isJust :: Maybe a -> Test +isJust = TestCase . assertBool "Expected a Just" . Maybe.isJust + +isNothing :: Maybe a -> Test +isNothing = TestCase . assertBool "Expected a Nothing" . Maybe.isNothing