Skip to content
Snippets Groups Projects
Commit 7fca2bd5 authored by Alice Brenon's avatar Alice Brenon
Browse files

Trim Metadata a bit to make it only an inclusion shortcut, put TSV-specific...

Trim Metadata a bit to make it only an inclusion shortcut, put TSV-specific stuff out in separate module and cut Article in two
parent 80fd8c29
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module GEODE.Metadata (
Article(..)
, Authors(..)
, Book(..)
, Domains(..)
, FromBook(..)
, HasAuthors(..)
, HasDomains(..)
, InFile(..)
, TXMText
, Unique(..)
, groupBy
, list
, readTsv
, sortBy
, tsvFile
, tsvLines
) where
module GEODE.Metadata
( module ID
, module Entry
, module TSV
, module Projector
, module Types
, groupBy
, indexBy
, list
, sortBy ) where
import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile)
import Data.ByteString.Char8 as StrictByteString (pack)
import Data.Csv
( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..)
, ToNamedRecord, ToRecord, decodeWith, defaultEncodeOptions, encodeByNameWith
, encodeWith, header )
import Data.Foldable as Foldable (toList)
import Data.List (sortOn)
import Data.Map.Strict as Map (alter, empty, toList)
import Data.Map.Strict as Map (Map, alter, empty, insert, toList)
import Data.Text as Text (Text, intercalate, unpack)
import Data.Vector as Vector (Vector)
import GEODE.Metadata.Article
import GEODE.Metadata.Projector
(FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), TXMText, Unique(..))
import GEODE.Metadata.Types (Authors(..), Book(..), Domains(..))
import GEODE.Metadata.ID as ID
import GEODE.Metadata.Entry as Entry
import GEODE.Metadata.Projector as Projector
import GEODE.Metadata.TSV as TSV
import GEODE.Metadata.Types as Types
list :: [Text] -> String
list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":"
readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a))
readTsv source = decodeWith fromTsv HasHeader <$> ByteString.readFile source
where
fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
toTsv :: EncodeOptions
toTsv = defaultEncodeOptions
{ encDelimiter = fromIntegral (fromEnum '\t')
, encUseCrLf = False }
tsvFile :: ToNamedRecord a => FilePath -> [String] -> [a] -> IO ()
tsvFile target fields =
ByteString.writeFile target
. encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields)
tsvLines :: ToRecord a => [a] -> IO ()
tsvLines = ByteString.putStr . encodeWith toTsv
sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a]
sortBy field = sortOn field . Foldable.toList
......@@ -62,3 +30,6 @@ groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])]
groupBy field = Map.toList . foldr group Map.empty
where
group article = Map.alter (Just . maybe [article] (article:)) (field article)
indexBy :: (Foldable t, Ord k) => (a -> k) -> t a -> Map k a
indexBy f = foldr (\a -> Map.insert (f a) a) Map.empty
module GEODE.Metadata.Entry
( Entry(..)
, headerSection ) where
import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
import Data.Text (Text)
import GEODE.Metadata.TSV (Default(..), DefaultHeader(..))
import GHC.Generics (Generic)
data Entry = Entry
{ headWord :: Text
, rank :: Int
, page :: Int } deriving (Generic, Show)
instance FromNamedRecord Entry
instance ToNamedRecord Entry
instance DefaultHeader Entry where
headerSection = Default [ "headWord", "rank", "page" ]
module GEODE.Metadata.ID
( HasID(..)
, ID
, book
, tome
, name
, headerSection
, relativePath
, uid ) where
import Data.Csv (FromNamedRecord(..), ToNamedRecord(..))
import Data.Text (Text, unpack)
import GEODE.Metadata.Types (Book)
--import GEODE.Metadata.Projector (InFile(..), Unique(..))
import GEODE.Metadata.TSV (Default(..), DefaultHeader(..))
import GHC.Generics (Generic)
import System.FilePath ((</>), (<.>))
import Text.Printf (printf)
data ID = ID
{ book_ :: Book
, tome_ :: Int
, name_ :: Text } deriving (Eq, Ord, Generic, Show)
class HasID t where
iD :: t -> ID
instance HasID ID where
iD = id
instance HasID (ID, a) where
iD = fst
book :: HasID a => a -> Book
book = book_ . iD
tome :: HasID a => a -> Int
tome = tome_ . iD
name :: HasID a => a -> Text
name = name_ . iD
instance FromNamedRecord ID
instance ToNamedRecord ID
uid :: HasID a => a -> String
uid a = printf "%s_%d_%s" (show $ book a) (tome a) (unpack $ name a)
relativePath :: HasID a => a -> String -> FilePath
relativePath a extension =
(show $ book a) </> (show $ tome a) </> (unpack $ name a) <.> extension
{-
instance Unique ID where
uid (ID {book, tome, name}) =
printf "%s_%d_%s" (show book) tome (unpack name)
instance InFile ID where
relativePath (ID {book, tome, name}) extension =
show book </> show tome </> unpack name <.> extension
-}
instance DefaultHeader ID where
headerSection = Default [ "book", "tome", "name" ]
{-# LANGUAGE ConstraintKinds #-}
module GEODE.Metadata.Projector
( FromBook(..)
, HasAuthors(..)
, HasDomains(..)
, InFile(..)
, TXMText
, Unique(..) ) where
import GEODE.Metadata.Types (Authors(..), Book, Domains(..))
--( FromBook(..)
--, FromTome(..)
( HasAuthors(..)
, HasDomains(..) ) where
--, InFile(..)
--, Named(..)
--, TXMText
--, Unique(..) ) where
import GEODE.Metadata.Types (Authors(..), Domains(..))
--import GEODE.Metadata.Types (Authors(..), Book, Domains(..))
import Data.Text (Text)
{-
class Unique a where
uid :: a -> String
class FromBook a where
book :: a -> Book
class FromTome a where
tome :: a -> Int
class Named a where
name :: a -> Text
-}
class HasAuthors a where
authors_ :: a -> Authors
......@@ -28,7 +39,9 @@ class HasDomains a where
domains :: a -> [Text]
domains = getDomains . domains_
{-
class InFile a where
relativePath :: a -> FilePath
relativePath :: a -> String -> FilePath
-}
type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a)
--type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a)
{-# LANGUAGE ScopedTypeVariables, UndecidableInstances #-}
module GEODE.Metadata.TSV
( Default(..)
, DefaultHeader(..)
, readNamedTsv
, readTsv
, toTsv
, tsvFile
, tsvLines ) where
import Data.ByteString.Char8 as StrictByteString (pack)
import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile)
import Data.Csv
( DecodeOptions(..), EncodeOptions(..), FromNamedRecord, FromRecord
, HasHeader(..), ToNamedRecord(..), ToRecord(..), decodeByNameWith, decodeWith
, defaultEncodeOptions, encodeByNameWith, encodeWith, header )
import Data.Foldable (toList)
import Data.HashMap.Strict ((!))
import Data.Vector (Vector, fromList)
readNamedTsv :: FromNamedRecord a => FilePath -> IO (Either String (Vector a))
readNamedTsv source =
(fmap snd . decodeByNameWith fromTsv) <$> ByteString.readFile source
where
fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a))
readTsv source = decodeWith fromTsv NoHeader <$> ByteString.readFile source
where
fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
newtype Default a = Default { defaultHeader :: [String] }
class DefaultHeader a where
headerSection :: Default a
instance (DefaultHeader a, DefaultHeader b) => DefaultHeader (a, b) where
headerSection = Default (a ++ b)
where
Default a = (headerSection :: Default a)
Default b = (headerSection :: Default b)
instance (DefaultHeader a, ToNamedRecord a) => ToRecord a where
toRecord = fromList . prepare . toNamedRecord
where
Default fields = (headerSection :: Default a)
prepare namedRecord = (namedRecord !) . StrictByteString.pack <$> fields
toTsv :: EncodeOptions
toTsv = defaultEncodeOptions
{ encDelimiter = fromIntegral (fromEnum '\t')
, encUseCrLf = False }
tsvFile :: forall a. (DefaultHeader a, ToNamedRecord a) => FilePath -> [a] -> IO ()
tsvFile target =
ByteString.writeFile target
. encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields)
where
Default fields = (headerSection :: Default a)
tsvLines :: (Foldable t, ToRecord a) => t a -> IO ()
tsvLines = ByteString.putStr . encodeWith toTsv . toList
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment