From 7fca2bd55f883e2faa144feec5a9e0639195bdf9 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Wed, 7 Jun 2023 18:25:36 +0200 Subject: [PATCH] Trim Metadata a bit to make it only an inclusion shortcut, put TSV-specific stuff out in separate module and cut Article in two --- lib/GEODE/Metadata.hs | 67 ++++++++++----------------------- lib/GEODE/Metadata/Entry.hs | 19 ++++++++++ lib/GEODE/Metadata/ID.hs | 64 +++++++++++++++++++++++++++++++ lib/GEODE/Metadata/Projector.hs | 33 +++++++++++----- lib/GEODE/Metadata/TSV.hs | 63 +++++++++++++++++++++++++++++++ 5 files changed, 188 insertions(+), 58 deletions(-) create mode 100644 lib/GEODE/Metadata/Entry.hs create mode 100644 lib/GEODE/Metadata/ID.hs create mode 100644 lib/GEODE/Metadata/TSV.hs diff --git a/lib/GEODE/Metadata.hs b/lib/GEODE/Metadata.hs index f71e970..04b6417 100644 --- a/lib/GEODE/Metadata.hs +++ b/lib/GEODE/Metadata.hs @@ -1,60 +1,28 @@ {-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} -module GEODE.Metadata ( - Article(..) - , Authors(..) - , Book(..) - , Domains(..) - , FromBook(..) - , HasAuthors(..) - , HasDomains(..) - , InFile(..) - , TXMText - , Unique(..) - , groupBy - , list - , readTsv - , sortBy - , tsvFile - , tsvLines - ) where +module GEODE.Metadata + ( module ID + , module Entry + , module TSV + , module Projector + , module Types + , groupBy + , indexBy + , list + , sortBy ) where -import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile) -import Data.ByteString.Char8 as StrictByteString (pack) -import Data.Csv - ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..) - , ToNamedRecord, ToRecord, decodeWith, defaultEncodeOptions, encodeByNameWith - , encodeWith, header ) import Data.Foldable as Foldable (toList) import Data.List (sortOn) -import Data.Map.Strict as Map (alter, empty, toList) +import Data.Map.Strict as Map (Map, alter, empty, insert, toList) import Data.Text as Text (Text, intercalate, unpack) -import Data.Vector as Vector (Vector) -import GEODE.Metadata.Article -import GEODE.Metadata.Projector - (FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), TXMText, Unique(..)) -import GEODE.Metadata.Types (Authors(..), Book(..), Domains(..)) +import GEODE.Metadata.ID as ID +import GEODE.Metadata.Entry as Entry +import GEODE.Metadata.Projector as Projector +import GEODE.Metadata.TSV as TSV +import GEODE.Metadata.Types as Types list :: [Text] -> String list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":" -readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a)) -readTsv source = decodeWith fromTsv HasHeader <$> ByteString.readFile source - where - fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} - -toTsv :: EncodeOptions -toTsv = defaultEncodeOptions - { encDelimiter = fromIntegral (fromEnum '\t') - , encUseCrLf = False } - -tsvFile :: ToNamedRecord a => FilePath -> [String] -> [a] -> IO () -tsvFile target fields = - ByteString.writeFile target - . encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields) - -tsvLines :: ToRecord a => [a] -> IO () -tsvLines = ByteString.putStr . encodeWith toTsv - sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a] sortBy field = sortOn field . Foldable.toList @@ -62,3 +30,6 @@ groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])] groupBy field = Map.toList . foldr group Map.empty where group article = Map.alter (Just . maybe [article] (article:)) (field article) + +indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a +indexBy f = foldr (\a -> Map.insert (f a) a) Map.empty diff --git a/lib/GEODE/Metadata/Entry.hs b/lib/GEODE/Metadata/Entry.hs new file mode 100644 index 0000000..6e19a21 --- /dev/null +++ b/lib/GEODE/Metadata/Entry.hs @@ -0,0 +1,19 @@ +module GEODE.Metadata.Entry + ( Entry(..) + , headerSection ) where + +import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) +import Data.Text (Text) +import GEODE.Metadata.TSV (Default(..), DefaultHeader(..)) +import GHC.Generics (Generic) + +data Entry = Entry + { headWord :: Text + , rank :: Int + , page :: Int } deriving (Generic, Show) + +instance FromNamedRecord Entry +instance ToNamedRecord Entry + +instance DefaultHeader Entry where + headerSection = Default [ "headWord", "rank", "page" ] diff --git a/lib/GEODE/Metadata/ID.hs b/lib/GEODE/Metadata/ID.hs new file mode 100644 index 0000000..2d9f1f8 --- /dev/null +++ b/lib/GEODE/Metadata/ID.hs @@ -0,0 +1,64 @@ +module GEODE.Metadata.ID + ( HasID(..) + , ID + , book + , tome + , name + , headerSection + , relativePath + , uid ) where + +import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) +import Data.Text (Text, unpack) +import GEODE.Metadata.Types (Book) +--import GEODE.Metadata.Projector (InFile(..), Unique(..)) +import GEODE.Metadata.TSV (Default(..), DefaultHeader(..)) +import GHC.Generics (Generic) +import System.FilePath ((</>), (<.>)) +import Text.Printf (printf) + +data ID = ID + { book_ :: Book + , tome_ :: Int + , name_ :: Text } deriving (Eq, Ord, Generic, Show) + +class HasID t where + iD :: t -> ID + +instance HasID ID where + iD = id + +instance HasID (ID, a) where + iD = fst + +book :: HasID a => a -> Book +book = book_ . iD + +tome :: HasID a => a -> Int +tome = tome_ . iD + +name :: HasID a => a -> Text +name = name_ . iD + +instance FromNamedRecord ID +instance ToNamedRecord ID + +uid :: HasID a => a -> String +uid a = printf "%s_%d_%s" (show $ book a) (tome a) (unpack $ name a) + +relativePath :: HasID a => a -> String -> FilePath +relativePath a extension = + (show $ book a) </> (show $ tome a) </> (unpack $ name a) <.> extension + +{- +instance Unique ID where + uid (ID {book, tome, name}) = + printf "%s_%d_%s" (show book) tome (unpack name) + +instance InFile ID where + relativePath (ID {book, tome, name}) extension = + show book </> show tome </> unpack name <.> extension +-} + +instance DefaultHeader ID where + headerSection = Default [ "book", "tome", "name" ] diff --git a/lib/GEODE/Metadata/Projector.hs b/lib/GEODE/Metadata/Projector.hs index f70547c..3ea0b5c 100644 --- a/lib/GEODE/Metadata/Projector.hs +++ b/lib/GEODE/Metadata/Projector.hs @@ -1,21 +1,32 @@ {-# LANGUAGE ConstraintKinds #-} module GEODE.Metadata.Projector - ( FromBook(..) - , HasAuthors(..) - , HasDomains(..) - , InFile(..) - , TXMText - , Unique(..) ) where - -import GEODE.Metadata.Types (Authors(..), Book, Domains(..)) + --( FromBook(..) + --, FromTome(..) + ( HasAuthors(..) + , HasDomains(..) ) where + --, InFile(..) + --, Named(..) + --, TXMText + --, Unique(..) ) where + +import GEODE.Metadata.Types (Authors(..), Domains(..)) +--import GEODE.Metadata.Types (Authors(..), Book, Domains(..)) import Data.Text (Text) +{- class Unique a where uid :: a -> String class FromBook a where book :: a -> Book +class FromTome a where + tome :: a -> Int + +class Named a where + name :: a -> Text +-} + class HasAuthors a where authors_ :: a -> Authors @@ -28,7 +39,9 @@ class HasDomains a where domains :: a -> [Text] domains = getDomains . domains_ +{- class InFile a where - relativePath :: a -> FilePath + relativePath :: a -> String -> FilePath +-} -type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a) +--type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a) diff --git a/lib/GEODE/Metadata/TSV.hs b/lib/GEODE/Metadata/TSV.hs new file mode 100644 index 0000000..cd06047 --- /dev/null +++ b/lib/GEODE/Metadata/TSV.hs @@ -0,0 +1,63 @@ +{-# LANGUAGE ScopedTypeVariables, UndecidableInstances #-} +module GEODE.Metadata.TSV + ( Default(..) + , DefaultHeader(..) + , readNamedTsv + , readTsv + , toTsv + , tsvFile + , tsvLines ) where + +import Data.ByteString.Char8 as StrictByteString (pack) +import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile) +import Data.Csv + ( DecodeOptions(..), EncodeOptions(..), FromNamedRecord, FromRecord + , HasHeader(..), ToNamedRecord(..), ToRecord(..), decodeByNameWith, decodeWith + , defaultEncodeOptions, encodeByNameWith, encodeWith, header ) +import Data.Foldable (toList) +import Data.HashMap.Strict ((!)) +import Data.Vector (Vector, fromList) + +readNamedTsv :: FromNamedRecord a => FilePath -> IO (Either String (Vector a)) +readNamedTsv source = + (fmap snd . decodeByNameWith fromTsv) <$> ByteString.readFile source + where + fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} + +readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a)) +readTsv source = decodeWith fromTsv NoHeader <$> ByteString.readFile source + where + fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} + +newtype Default a = Default { defaultHeader :: [String] } + +class DefaultHeader a where + headerSection :: Default a + +instance (DefaultHeader a, DefaultHeader b) => DefaultHeader (a, b) where + headerSection = Default (a ++ b) + where + Default a = (headerSection :: Default a) + Default b = (headerSection :: Default b) + +instance (DefaultHeader a, ToNamedRecord a) => ToRecord a where + toRecord = fromList . prepare . toNamedRecord + where + Default fields = (headerSection :: Default a) + prepare namedRecord = (namedRecord !) . StrictByteString.pack <$> fields + +toTsv :: EncodeOptions +toTsv = defaultEncodeOptions + { encDelimiter = fromIntegral (fromEnum '\t') + , encUseCrLf = False } + +tsvFile :: forall a. (DefaultHeader a, ToNamedRecord a) => FilePath -> [a] -> IO () +tsvFile target = + ByteString.writeFile target + . encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields) + where + Default fields = (headerSection :: Default a) + +tsvLines :: (Foldable t, ToRecord a) => t a -> IO () +tsvLines = ByteString.putStr . encodeWith toTsv . toList + -- GitLab