diff --git a/manifest.scm b/manifest.scm index 6d28e6e119bc13dac0b476df174fbbe612587b4f..1234356fd45d5299b68bf303fb95718f234bdf34 100644 --- a/manifest.scm +++ b/manifest.scm @@ -7,7 +7,8 @@ ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt)) ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu - ghc-random)) + ghc-random + ghc-regex-tdfa)) ((gnu packages python) #:select (python)) ((gnu packages python-science) #:select (python-pandas)) ((gnu packages python-xyz) #:select (python-beautifulsoup4)) @@ -32,6 +33,7 @@ ghc-hs-conllu ; working on syntax-annotated documents ghc-hxt ; working on xml documents ghc-random ; sampling data at random + ghc-regex-tdfa ; working with regexps in haskell processing-lge ; extracting articles from the BnF files python ; scripts python-beautifulsoup4 ; extract EDdA metadata from TEI files diff --git a/scripts/split.hs b/scripts/split.hs new file mode 100755 index 0000000000000000000000000000000000000000..5e3e58ab27205751565def7d1233c314fc92ffd9 --- /dev/null +++ b/scripts/split.hs @@ -0,0 +1,68 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" + +import Control.Applicative ((<**>), (<|>)) +import Data.List (foldl') +import Data.Text (Text) +import Data.Text.IO as Text (getContents, writeFile) +import Options.Applicative + ( Parser, execParser, flag', fullDesc, help, helper, info, long, metavar + , progDesc, short, strArgument, strOption) +import System.Directory (createDirectoryIfMissing) +import System.FilePath (takeDirectory) +import Text.Filter (Editable(..)) +import Text.Printf (printf) +import Text.Regex.TDFA ((=~)) + +data Mode = Discard | StartWith | EndWith + +data Config = Config + { splitPattern :: String + , mode :: Mode + , outputPattern :: String } + +configParser :: Parser Config +configParser = Config + <$> strOption on + <*> (flag' StartWith startWith <|> flag' EndWith endWith <|> pure Discard) + <*> strArgument outputPattern + where + on = short 'o' <> long "on" <> metavar "REGEX" + <> help "pattern of the lines on which to split" + outputPattern = + metavar "OUTPUT_PATTERN" <> help "pattern of the output files" + startWith = + short 's' <> long "start" <> help "a part begins with the pattern" + endWith = short 'e' <> long "end" <> help "a part ends with the pattern" + +getConfig :: IO Config +getConfig = execParser + (info + (configParser <**> helper) + (fullDesc + <> progDesc "A tool to split a textual flow on a predefined line or prefix")) + +split :: Config -> [Text] -> [[Text]] +split (Config {splitPattern, mode}) = reverse . close . foldl' aggregate ([], []) + where + close (currentPart, previousParts) = reverse currentPart:previousParts + aggregate tmp@(currentPart, previousParts) line + | line =~ splitPattern = + case mode of + Discard -> ([], close tmp) + StartWith -> ([line], close tmp) + EndWith -> ([], close (line:currentPart, previousParts)) + | otherwise = (line:currentPart, previousParts) + +create :: Editable a => Config -> [a] -> IO () +create (Config {outputPattern}) = mapM_ createFile . zip [1..] . fmap leave + where + createFile :: (Int, Text) -> IO () + createFile (i, content) = + let path = printf outputPattern i in do + createDirectoryIfMissing True (takeDirectory path) + Text.writeFile path content + +main :: IO () +main = do + config <- getConfig + Text.getContents >>= create config . split config . enter