Skip to content
Snippets Groups Projects
Commit 45f79db5 authored by Alice Brenon's avatar Alice Brenon
Browse files

Add a (naive) script to put paragraphs on one line to please stanza + some lib...

Add a (naive) script to put paragraphs on one line to please stanza + some lib to factorize common bases for future haskell scripts
parent 58f04ebd
No related branches found
No related tags found
No related merge requests found
......@@ -3,30 +3,43 @@ EDDA=$(CORPUS_ROOT)/EDdA
LGE=$(CORPUS_ROOT)/LGE
EDDA_META=$(EDDA)/metadata.csv
METADATA=$(EDDA_META)
RAW_LGE=$(LGE)/Text
TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
TOMES=$(TOME_DIRS:$(RAW_LGE)/T%=%)
TEI_LGE=$(LGE)/TEI
PARALLEL_LGE=$(LGE)/Parallel
LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
LINEARIZED_PARALLEL_LGE_ROOT=$(PARALLEL_LGE)/Linearized
TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza
all: $(METADATA) $(STANZA_PARALLEL_LGE)
METADATA=$(EDDA_META) $(LGE_META_FROM_EDDA)
all: $(METADATA) $(LINEARIZED_PARALLEL_LGE)
$(EDDA_META): $(EDDA)/TEI/
./scripts/EDdA-metadata.py $< > $@
$(LGE_META_FROM_EDDA): $(EDDA_META)
mkdir -p $(dir $@)
$(LGE_META_FROM_EDDA): $(EDDA_META) $(PARALLEL_LGE)/
./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@
$(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
./scripts/extract_classified_LGE.sh $^ $@
./scripts/extract-parallel-LGE.sh $^ $@
$(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
./scripts/extract_classified_LGE.sh $^ $@
./scripts/extract-parallel-LGE.sh $^ $@
$(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
./scripts/stanza-annotator.py $< $@
%/:
mkdir -p $@
$(LINEARIZED_PARALLEL_LGE_ROOT)/T%: $(RAW_PARALLEL_LGE)/T%
mkdir -p $@
find $< -type f -name '*.txt' | ./scripts/linearize.hs $@
{-# LANGUAGE NamedFieldPuns #-}
module Data.Metadata (
Metadata
, byTome
, metadata
) where
import Data.Attoparsec.Text (Parser, endOfLine, sepBy1, takeTill)
import Data.List as List (foldl')
import Data.Map as Map (Map, adjust, empty, foldl', insert, member)
import Data.Metadata.Article (Article(..), article)
import Data.Text (Text)
--type Metadata = Map Text Article
type Metadata = [Article]
type Tomes = Map Int Metadata
metadata :: Parser Metadata
--metadata = List.foldl' indexByUid Map.empty <$>
-- (skipLine *> article `sepBy1` endOfLine)
metadata = skipLine *> article `sepBy1` endOfLine
where
skipLine = takeTill (\c -> c == '\r' || c == '\n') *> endOfLine
indexByUid temp a@(Article {uid}) = Map.insert uid a temp
byTome :: Metadata -> Tomes
--byTome = Map.foldl' sortByTome Map.empty
byTome = List.foldl' sortByTome Map.empty
where
sortByTome temp a@(Article {uid, tome}) =
-- Map.adjust (Map.insert uid a) tome .
Map.adjust (a:) tome
-- . (if tome `member` temp then id else Map.insert tome Map.empty) $ temp
. (if tome `member` temp then id else Map.insert tome []) $ temp
{-
. (if tome `member` temp then id else Map.insert tome Map.empty)
$ tome
-}
{-# LANGUAGE OverloadedStrings #-}
module Data.Metadata.Article (
Article(..)
, article
) where
import Data.Attoparsec.Text (Parser, char, decimal, sepBy1, string, takeTill)
import Data.Text (Text, stripEnd)
data Article = Article {
uid :: Text
, tome :: Int
, rank :: Int
, headWord :: Text
, domains :: [Text]
} deriving Show
article :: Parser Article
article = Article
<$> cell
<*> (decimal <* char ',')
<*> (decimal <* char ',')
<*> cell
<*> domains_
where
cell = takeTill (== ',') <* char ','
domains_ :: Parser [Text]
domains_ = fmap stripEnd <$>
(takeTill (`elem` ['|', '\r', '\n'])) `sepBy1` (string "| ")
module System.Script (
syntax
) where
import System.Exit (die)
import System.Environment (getProgName)
import Text.Printf (printf)
syntax :: String -> IO ()
syntax s = do
this <- getProgName
die $ printf "Syntax: %s %s" this s
{-# LANGUAGE FlexibleInstances, OverloadedStrings #-}
module Text.Editor (
Editor
, edit
, editAll
, editM
, editAllM
) where
import Control.Monad.IO.Class (MonadIO(..))
import Data.Text as Text (Text, intercalate, lines, pack, unpack)
import Data.Text.IO as Text (readFile, writeFile)
import System.FilePath (replaceDirectory)
type Editor a = a -> a
type EditorM m a = a -> m a
class Editable a where
enter :: Text -> a
leave :: a -> Text
instance Editable [Text] where
enter = Text.lines
leave = Text.intercalate "\n"
instance Editable String where
enter = Text.unpack
leave = Text.pack
-- | Apply an 'Editor' to the content of a file which path is passed as third
-- argument. The second argument is the path to the target folder where the
-- edited version will be created.
editM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> FilePath -> m ()
editM editor target input =
liftIO (Text.readFile input)
>>= editor . enter
>>= liftIO . Text.writeFile output . leave
where
output = replaceDirectory input target
-- | A convenient shortcut of 'editM' for pure editors
edit :: Editable a => Editor a -> FilePath -> FilePath -> IO ()
edit editor = editM (pure . editor)
-- | Apply an 'Editor' like 'edit' on all the files which paths are expected to
-- be read from the input 'String', one per line.
editAllM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> String -> m ()
editAllM editor target = mapM_ (editM editor target) . Prelude.lines
-- | A convenient shortcut of 'editAllM' for pure editors
editAll :: Editable a => Editor a -> FilePath -> String -> IO ()
editAll editor = editAllM (pure . editor)
(use-modules ((geode packages annotation) #:select (python-stanza))
((geode packages models) #:select (stanza-fr))
((gnu packages commencement) #:select (gcc-toolchain))
((gnu packages haskell) #:select (ghc))
((gnu packages haskell-web) #:select (ghc-hxt))
((gnu packages haskell-xyz) #:select (ghc-attoparsec))
((gnu packages python) #:select (python))
((gnu packages python-xyz) #:select (python-beautifulsoup4))
((gnu packages xml) #:select (python-lxml)))
......@@ -7,6 +11,10 @@
(packages->manifest
(list
coreutils ; mktemp for atomic processing, strip CSV headers, general scripting
gcc-toolchain ; running haskell
ghc ; running haskell
ghc-attoparsec ; parsing metadata
ghc-hxt ; working on xml documents
python ; scripts
python-beautifulsoup4 ; extract EDdA metadata from TEI files
python-lxml ; fusion articles into tomes for TXM
......
......@@ -13,7 +13,7 @@ then
mv "${OUTPUT}" "${OUTPUT}.${N}"
fi
WORKDIR=$(mktemp -d /tmp/classified-LGE.XXX)
WORKDIR=$(mktemp -d /tmp/parallel-LGE.XXX)
for T in {1..31}
do
......
#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
import Data.Char (isUpper)
import System.Environment (getArgs)
import System.FilePath ((</>))
import System.Script (syntax)
import Text.Editor (editAll)
linearize :: String -> String
linearize "" = ""
linearize ('¬':'\n':s) = linearize s
linearize ('\n':'\n':s) = "\n\n" ++ linearize s
linearize ('\n':s) = ' ' : linearize s
linearize ('-':'\n':c:s)
| isUpper c = '-' : c : linearize s
linearize (c:s) = c : linearize s
main :: IO ()
main = getArgs >>= cli
where
cli [target] = getContents >>= editAll linearize target
cli _ = syntax "TARGET_DIR"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment