Skip to content
Snippets Groups Projects
Commit ccd9f81d authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a script to split files on lines with a certain pattern

parent 3972fb64
No related branches found
No related tags found
No related merge requests found
...@@ -7,7 +7,8 @@ ...@@ -7,7 +7,8 @@
((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt)) ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-cassava ((gnu packages haskell-xyz) #:select (ghc-cassava
ghc-hs-conllu ghc-hs-conllu
ghc-random)) ghc-random
ghc-regex-tdfa))
((gnu packages python) #:select (python)) ((gnu packages python) #:select (python))
((gnu packages python-science) #:select (python-pandas)) ((gnu packages python-science) #:select (python-pandas))
((gnu packages python-xyz) #:select (python-beautifulsoup4)) ((gnu packages python-xyz) #:select (python-beautifulsoup4))
...@@ -32,6 +33,7 @@ ...@@ -32,6 +33,7 @@
ghc-hs-conllu ; working on syntax-annotated documents ghc-hs-conllu ; working on syntax-annotated documents
ghc-hxt ; working on xml documents ghc-hxt ; working on xml documents
ghc-random ; sampling data at random ghc-random ; sampling data at random
ghc-regex-tdfa ; working with regexps in haskell
processing-lge ; extracting articles from the BnF files processing-lge ; extracting articles from the BnF files
python ; scripts python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files python-beautifulsoup4 ; extract EDdA metadata from TEI files
......
#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell"
import Control.Applicative ((<**>), (<|>))
import Data.List (foldl')
import Data.Text (Text)
import Data.Text.IO as Text (getContents, writeFile)
import Options.Applicative
( Parser, execParser, flag', fullDesc, help, helper, info, long, metavar
, progDesc, short, strArgument, strOption)
import System.Directory (createDirectoryIfMissing)
import System.FilePath (takeDirectory)
import Text.Filter (Editable(..))
import Text.Printf (printf)
import Text.Regex.TDFA ((=~))
data Mode = Discard | StartWith | EndWith
data Config = Config
{ splitPattern :: String
, mode :: Mode
, outputPattern :: String }
configParser :: Parser Config
configParser = Config
<$> strOption on
<*> (flag' StartWith startWith <|> flag' EndWith endWith <|> pure Discard)
<*> strArgument outputPattern
where
on = short 'o' <> long "on" <> metavar "REGEX"
<> help "pattern of the lines on which to split"
outputPattern =
metavar "OUTPUT_PATTERN" <> help "pattern of the output files"
startWith =
short 's' <> long "start" <> help "a part begins with the pattern"
endWith = short 'e' <> long "end" <> help "a part ends with the pattern"
getConfig :: IO Config
getConfig = execParser
(info
(configParser <**> helper)
(fullDesc
<> progDesc "A tool to split a textual flow on a predefined line or prefix"))
split :: Config -> [Text] -> [[Text]]
split (Config {splitPattern, mode}) = reverse . close . foldl' aggregate ([], [])
where
close (currentPart, previousParts) = reverse currentPart:previousParts
aggregate tmp@(currentPart, previousParts) line
| line =~ splitPattern =
case mode of
Discard -> ([], close tmp)
StartWith -> ([line], close tmp)
EndWith -> ([], close (line:currentPart, previousParts))
| otherwise = (line:currentPart, previousParts)
create :: Editable a => Config -> [a] -> IO ()
create (Config {outputPattern}) = mapM_ createFile . zip [1..] . fmap leave
where
createFile :: (Int, Text) -> IO ()
createFile (i, content) =
let path = printf outputPattern i in do
createDirectoryIfMissing True (takeDirectory path)
Text.writeFile path content
main :: IO ()
main = do
config <- getConfig
Text.getContents >>= create config . split config . enter
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment