Skip to content
Snippets Groups Projects
Commit cb234457 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a script to explode (Text) articles into paragraphs, outputing a files.tsv...

Add a script to explode (Text) articles into paragraphs, outputing a files.tsv with a primary key for each of them in the process
parent d0eb4ce6
No related branches found
No related tags found
No related merge requests found
module GEODE.Metadata.File
( File(..) ) where
class File a where
uid :: a -> String
relativePath :: a -> String -> FilePath
{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
module GEODE.Metadata.PrimaryKey.Paragraph
( ParagraphPK(..) ) where
import Data.Aeson ((.=), ToJSON(..), object, pairs)
import Data.Csv
( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
import GEODE.Metadata.File (File(..))
import GEODE.Metadata as Article
(DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
import qualified GEODE.Metadata as Article (PrimaryKey(rank))
import System.FilePath ((</>), (<.>), dropExtension)
import Text.Printf (printf)
data ParagraphPK = ParagraphPK
{ article :: PrimaryKey
, rank :: Int } deriving (Eq, Ord, Show)
instance FromNamedRecord ParagraphPK where
parseNamedRecord nr = ParagraphPK
<$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
<*> nr .: "paragraph#"
instance ToNamedRecord ParagraphPK where
toNamedRecord (ParagraphPK {article, rank}) = namedRecord
[ namedField "book" (book article)
, namedField "tome" (tome article)
, namedField "article#" (Article.rank article)
, namedField "paragraph#" rank ]
instance File ParagraphPK where
uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
relativePath (ParagraphPK {article, rank}) extension =
articleDirectory </> (show rank) <.> extension
where
articleDirectory = dropExtension (Article.relativePath article "")
instance HasDefaultHeader ParagraphPK where
defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
instance ToJSON ParagraphPK where
toJSON (ParagraphPK {article, rank}) = object
[ "book" .= book article
, "tome" .= tome article
, "article#" .= Article.rank article
, "paragraph#" .= rank ]
toEncoding (ParagraphPK {article, rank}) = pairs
( "book" .= book article
<> "tome" .= tome article
<> "article#" .= Article.rank article
<> "paragraph#" .= rank )
......@@ -2,7 +2,7 @@
((geode packages models) #:select (stanza-fr))
((gnu packages commencement) #:select (gcc-toolchain))
((gnu packages haskell) #:select (ghc))
((gnu packages haskell-web) #:select (ghc-hxt))
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas))
......@@ -19,6 +19,7 @@
;edda-clinic ; fix and cut the EDdA
gcc-toolchain ; running haskell
ghc ; running haskell
ghc-aeson ; working with JSON in haskell
ghc-cassava ; working with CSV in haskell
;ghc-geode ; handling corpus files
ghc-hs-conllu ; working on syntax-annotated documents
......
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE OverloadedStrings #-}
import Data.List (dropWhileEnd)
import Data.Text (Text, splitOn)
import Data.Text.IO as Text (readFile, writeFile)
import GEODE.Metadata.File as File (File(..))
import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile)
import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..))
import System.Directory (createDirectoryIfMissing)
import System.Environment (getArgs)
import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory)
import System.Script (syntax, try)
import Text.Filter (Editable(..))
import Text.Filter.Linearize (linearize)
articleParagraphs :: FilePath -> IO [Text]
articleParagraphs =
fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile
to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK]
to source target article = do
createDirectoryIfMissing True (target </> Article.relativePath article "")
articleParagraphs articlePath >>= mapM create . number
where
articlePath = source </> Article.relativePath article "txt"
number = zip [1..]
create (rank, paragraphText) =
let paragraphPK = ParagraphPK {article, rank}
outputPath = target </> File.relativePath paragraphPK "txt" in
paragraphPK <$ Text.writeFile outputPath paragraphText
main :: IO ()
main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run
where
run [inputMeta, source, target] =
try (readNamedTsv inputMeta)
>>= mapM (source `to` target)
>>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat
run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment