From 1e28d5936765d96f668daacef3f2c7e84f408001 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Fri, 16 Jun 2023 15:39:19 +0200 Subject: [PATCH] Save everything that can be saved after disk crash --- lib/Data/Metadata.hs | 38 +++- lib/Data/Metadata/Article.hs | 1 + lib/Data/Metadata/Trie.hs | 17 +- lib/System/Script.hs | 14 +- lib/Text/TEI.hs | 16 +- manifest.scm | 4 + scripts/EDdA/extract-from-source.sh | 25 +++ scripts/conllu-to-lexicoscope.hs | 45 ++--- scripts/conllu-to-txm.hs | 54 +++--- scripts/extract-corpus.sh | 2 +- scripts/extract-first-word.hs | 175 ++++++++++++++++++ scripts/extract-first-word_lge-mode-backup.hs | 76 ++++++++ scripts/extract-from-source.sh | 20 ++ scripts/lib.sh | 4 + scripts/merge.hs | 28 +++ scripts/parallel-links.hs | 36 ++++ scripts/parallel-links.py | 170 ++++++++--------- scripts/select.hs | 27 +++ scripts/subcorpus/getFiles.hs | 29 +++ scripts/subcorpus/getTXMQuery.hs | 21 +++ 20 files changed, 648 insertions(+), 154 deletions(-) create mode 100755 scripts/EDdA/extract-from-source.sh create mode 100644 scripts/extract-first-word.hs create mode 100644 scripts/extract-first-word_lge-mode-backup.hs create mode 100755 scripts/extract-from-source.sh create mode 100644 scripts/lib.sh create mode 100755 scripts/merge.hs create mode 100755 scripts/parallel-links.hs create mode 100755 scripts/select.hs create mode 100755 scripts/subcorpus/getFiles.hs create mode 100755 scripts/subcorpus/getTXMQuery.hs diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs index 8af810b..d8ef60a 100644 --- a/lib/Data/Metadata.hs +++ b/lib/Data/Metadata.hs @@ -13,13 +13,16 @@ module Data.Metadata ( , list , readTsv , sortBy + , tsvFile + , tsvLines ) where -import Data.ByteString.Lazy as ByteString (readFile, writeFile) +import Data.ByteString.Lazy as ByteString (putStr, readFile, writeFile) import Data.ByteString.Char8 as StrictByteString (pack) import Data.Csv - ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..), ToRecord - , decodeWith, defaultEncodeOptions, encodeWith, header, namedField ) + ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..) + , ToNamedRecord, ToRecord, decodeWith, defaultEncodeOptions, encodeByNameWith + , encodeWith, header ) import Data.Foldable as Foldable (toList) import Data.List (sortOn) import Data.Map.Strict as Map (alter, empty, toList) @@ -33,10 +36,33 @@ list :: [Text] -> String list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":" readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a)) -readTsv source = decodeWith tsv HasHeader <$> ByteString.readFile source +readTsv source = decodeWith fromTsv HasHeader <$> ByteString.readFile source where - tsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} + fromTsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')} +{- +toTsv :: (Foldable t, ToNamedRecord a) => Maybe [String] -> t a -> ByteString +toTsv fields content = + encodeByNameWith tsv (header bsFields) + $ zipWith namedField bsFields (Foldable.toList content) + where + bsFields = StrictByteString.pack <$> fields +-} + +toTsv :: EncodeOptions +toTsv = defaultEncodeOptions + { encDelimiter = fromIntegral (fromEnum '\t') + , encUseCrLf = False } + +tsvFile :: ToNamedRecord a => FilePath -> [String] -> [a] -> IO () +tsvFile target fields = + ByteString.writeFile target + . encodeByNameWith toTsv (header $ StrictByteString.pack <$> fields) + +tsvLines :: ToRecord a => [a] -> IO () +tsvLines = ByteString.putStr . encodeWith toTsv + +{- writeTsv :: (Foldable t, ToRecord a) => [String] -> t a -> FilePath -> IO () writeTsv fields content target = ByteString.writeFile target @@ -47,6 +73,8 @@ writeTsv fields content target = tsv = defaultEncodeOptions { encDelimiter = fromIntegral (fromEnum '\t') , encUseCrLf = False } +-} + sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a] sortBy field = sortOn field . Foldable.toList diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs index ba8c694..505bc10 100644 --- a/lib/Data/Metadata/Article.hs +++ b/lib/Data/Metadata/Article.hs @@ -22,6 +22,7 @@ data Article = Article instance FromRecord Article instance ToRecord Article +--instance ToNamedRecord Article instance Unique Article where uid (Article {articleBook, tome, name}) = diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs index 2383ad5..3ad51a6 100644 --- a/lib/Data/Metadata/Trie.hs +++ b/lib/Data/Metadata/Trie.hs @@ -4,7 +4,7 @@ module Data.Metadata.Trie import Data.Foldable (toList) import Data.Map (Map) -import qualified Data.Map as Map (empty, insert, lookup, singleton) +import qualified Data.Map as Map (delete, empty, insert, lookup, singleton) data Trie e n = Trie @@ -43,12 +43,19 @@ at trie = at_ trie . toList at_ (Trie {store}) [] = store at_ (Trie {edges}) (e:es) = Map.lookup e edges >>= (`at_` es) -insert :: Foldable t => t e -> n -> Trie e n -> Trie e n -insert path n trie = zipUp (edit (trie, Top)) +insert :: (Ord e, Foldable t) => t e -> n -> Trie e n -> Trie e n +insert path n trie = zipUp (edit (toList path) (trie, Top)) where - edit + edit [] (trie, zipper) = (trie {store = Just n}, zipper) + edit (e:es) (Trie {store, edges}, above) = + let (subTree, otherEdges) = partition e edges in + edit es (subTree, Zip {atValue = store, otherEdges, byEdge = e, above}) + partition e edges = + case Map.lookup e edges of + Just subTree -> (subTree, Map.delete e edges) + _ -> (empty, Map.empty) type Indexed e n = ([e], n) -index :: (Foldable t, Functor t) => (a -> Indexed e n) -> t a -> Trie e n +index :: (Ord e, Foldable t, Functor t) => (a -> Indexed e n) -> t a -> Trie e n index by = foldr (uncurry insert) empty . fmap by diff --git a/lib/System/Script.hs b/lib/System/Script.hs index 2455009..5f54942 100644 --- a/lib/System/Script.hs +++ b/lib/System/Script.hs @@ -1,12 +1,20 @@ -module System.Script ( - syntax - ) where +module System.Script + ( syntax + , try + , warn ) where import System.Exit (die) import System.Environment (getProgName) +import System.IO (hPutStrLn, stderr) import Text.Printf (printf) syntax :: String -> IO () syntax s = do this <- getProgName die $ printf "Syntax: %s %s" this s + +try :: IO (Either String a) -> IO a +try = (>>= either die pure) + +warn :: String -> IO () +warn = hPutStrLn stderr diff --git a/lib/Text/TEI.hs b/lib/Text/TEI.hs index 7514df9..ba66a30 100644 --- a/lib/Text/TEI.hs +++ b/lib/Text/TEI.hs @@ -7,10 +7,10 @@ module Text.TEI , text , title ) where -import Data.Metadata - ( Book(..), FromBook(..), HasAuthors(..), HasDomains(..), TXMText, Unique(..) - , list ) -import Text.XML.HXT.Core ( ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt ) +import Data.Text (unpack) +import GEODE.Metadata + (Book(..), Contrastive(..), Has(..), PrimaryKey(..), formatList, uid) +import Text.XML.HXT.Core (ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt) corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree corpusHeader EDdA = eddaHeader @@ -85,15 +85,17 @@ by :: ArrowXml a => String -> String -> [a b XmlTree] by resp orgName = [ selem "resp" [ txt (resp <> " by") ] , selem "orgName" [ txt orgName ] ] +type TXMText a = (Has Contrastive a, Has PrimaryKey a) + text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree text unitName txmText body = mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ] where metadata = [ "uid" .= uid txmText - , "book" .= show (book txmText) - , "author" .= list (authors txmText) - , "domains" .= list (domains txmText) ] + , "book" .= (show . book $ get txmText) + , "author" .= (unpack . formatList . authors $ get txmText) + , "domains" .= (unpack . formatList . domains $ get txmText) ] (.=) :: ArrowXml a => String -> String -> a b XmlTree key .= value = attr key (txt value) diff --git a/manifest.scm b/manifest.scm index af89124..6c52a47 100644 --- a/manifest.scm +++ b/manifest.scm @@ -10,13 +10,17 @@ ((gnu packages xml) #:select (python-lxml))) (define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm")) +(define edda-clinic (load "/home/alice/Logiciel/EDdAClinic/guix.scm")) +(define ghc-geode (load "/home/alice/Logiciel/ghc-geode/guix.scm")) (packages->manifest (list coreutils ; mktemp for atomic processing, strip CSV headers, general scripting + edda-clinic ; fix and cut the EDdA gcc-toolchain ; running haskell ghc ; running haskell ghc-cassava ; working with CSV in haskell + ghc-geode ; handling corpus files ghc-hs-conllu ; working on syntax-annotated documents ghc-hxt ; working on xml documents python ; scripts diff --git a/scripts/EDdA/extract-from-source.sh b/scripts/EDdA/extract-from-source.sh new file mode 100755 index 0000000..d3bd4f6 --- /dev/null +++ b/scripts/EDdA/extract-from-source.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +source ${0%/*}/../lib.sh + +if [ "$#" != 2 ] +then + die "${0##*/} SOURCE_DIRECTORY TARGET_DIRECTORY" +else + SOURCE="${1}" + TARGET="${2}" + [ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing 1 .tei file per tome)" + [ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)" +fi + +for T in {1..17} +do + cleanEDdA < "${SOURCE}/volume$(printf "%02d" $T).tei" \ + | fixEDdA \ + | splitArticles \ + --tome ${T} \ + --xpath "/TEI/text/body/div1" \ + --xml-root "${TARGET}/TEI" \ + --text-root "${TARGET}/Text" \ + --metadata +done diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs index 705f39a..e4bdfea 100755 --- a/scripts/conllu-to-lexicoscope.hs +++ b/scripts/conllu-to-lexicoscope.hs @@ -1,5 +1,5 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} +{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} import Conllu.Parse (parseConllu) import Conllu.Parse.Paragraph (Paragraph(..), byParagraph) @@ -8,8 +8,8 @@ import Conllu.Type (Doc, Sent(..)) import Control.Monad.IO.Class (MonadIO(..)) import Control.Monad.State (StateT(..), evalStateT) import Data.List as List (intercalate) -import Data.Map (toList) -import Data.Metadata (Article(..), Authors(..), Domains(..), Tome, byTome, list) +--import Data.Map (toList) +import GEODE.Metadata (Contrastive(..), Entry(..), Has(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid) import Data.Text as Text (unpack) import System.Environment (getArgs) import System.Exit (die) @@ -21,6 +21,8 @@ import Text.XML.HXT.Core import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState) import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState) +type Article = PrimaryKey @ Entry @ Contrastive + type Serial = (Int, Int) next :: Selector Serial Int -> IOStateArrow Serial a String @@ -35,19 +37,18 @@ pId = S {getS = fst, setS = \p (_, s) -> (p, s)} sId :: Selector Serial Int sId = S {getS = snd, setS = \s (p, _) -> (p, s)} -to :: FilePath -> FilePath -> (Int, Tome) -> StateT Serial IO () -to source target (tomeNumber, tome) = - mapM_ (processArticle (source </> tomeDir) (target </> tomeDir)) tome - where - tomeDir = "T" <> show tomeNumber +to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO () +to source target (_, tome) = + mapM_ (processArticle source target) tome processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO () -processArticle source target a@(Article {uid, rank}) = +processArticle source target article = +--processArticle source target a@(Article {rank}) = liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input) - >>= either (liftIO . die) (format a output) . parseConllu input + >>= either (liftIO . die) (format article output) . parseConllu input where - input = source </> Text.unpack uid <.> "conllu" - output = target </> Text.unpack uid <.> "xml" + input = source </> relativePath article "conllu" + output = target </> uid article <.> ".fr.xml" --input = source </> "article" ++ show rank <.> "conllu" --output = target </> "article" ++ show rank <.> "xml" @@ -64,16 +65,17 @@ format article outputPath dom = StateT $ \s -> do ,selem "text" (formatParagraph <$> byParagraph dom)]]] metaFrom :: ArrowXml a => Article -> a n XmlTree -metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $ +metaFrom article = txt $ +--metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $ concatMap (++ "\n") (List.intercalate "\t" <$> [[] - ,["fileName", Text.unpack uid] - ,["tome", show tome] - ,["rank", show rank] - ,["head", Text.unpack headWord] - ,["author", Text.unpack . list $ getAuthors authors] - ,["domain", Text.unpack . list $ getDomains domains]]) + ,["fileName", uid article] + ,["tome", show . tome $ get article] + ,["rank", show . rank $ get article] + ,["head", Text.unpack . headWord $ get article] + ,["author", Text.unpack . formatList . authors $ get article] + ,["domain", Text.unpack . formatList . domains $ get article]]) formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree formatParagraph (Paragraph sents) = @@ -89,7 +91,8 @@ main :: IO () main = getArgs >>= cli where withSerial = (`evalStateT` (0, 0)) + byTome = groupBy (tome.get) . sortBy uid cli [sourceMeta, sourceRoot, targetRoot] = - byTome sourceMeta - >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . toList) + readNamedTsv sourceMeta + >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . byTome) cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/conllu-to-txm.hs b/scripts/conllu-to-txm.hs index 8487819..69266da 100755 --- a/scripts/conllu-to-txm.hs +++ b/scripts/conllu-to-txm.hs @@ -1,18 +1,17 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} +{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} import Conllu.Parse (parseConllu) import Conllu.Type (AW, CW(..), ID(..), Sent(..)) import Control.Applicative ((<|>), liftA2) -import Data.Metadata - ( HasAuthors(..), Book, FromBook(..), relativePath, uid, groupBy, readTsv - , sortBy ) -import Data.Metadata.Article (Article(..)) -import Data.Text as Text (unpack) +import GEODE.Metadata + ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..) + , type(@), relativePath, groupBy, readNamedTsv, sortBy ) +import Data.Text as Text (breakOn, drop, pack, unpack, splitOn) import System.Environment (getArgs) import System.Exit (die) import System.FilePath ((</>), (<.>)) -import System.Script (syntax) +import System.Script (syntax, try) import Text.TEI ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title ) import Text.XML.HXT.Core @@ -20,9 +19,11 @@ import Text.XML.HXT.Core , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent , writeDocument ) +type Article = PrimaryKey @ Entry @ Contrastive + to :: FilePath -> FilePath -> (Book, [Article]) -> IO () to source target (theBook, metadata) = - mapM_ runArrow . groupBy tome $ sortBy rank metadata + mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata where runArrow (tomeNumber, tomeArticles) = runX $ compileTome source target (theBook, tomeNumber, tomeArticles) @@ -33,42 +34,35 @@ compileTome source target (theBook, tomeNumber, metadata) = tomeXml >>> writeDocument [withIndent True] output where strTome = show tomeNumber - output = target </> (show theBook) </> "T" <> strTome <.> "xml" + output = target </> (show theBook) <> "_T" <> strTome <.> "xml" tomeXml = selem "/" [ selem "teiCorpus" [ corpusHeader theBook strTome , (articleFrom source $< constL metadata) ] ] articleFrom :: FilePath -> Article -> IOSArrow b XmlTree -articleFrom source article@(Article {headWord}) = +articleFrom source article = selem "TEI" [ teiHeader - (selem "titleStmt" [title $ Text.unpack headWord]) + (selem "titleStmt" [title . Text.unpack . headWord.get $ article]) (publicationStmt [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ]) (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ]) , text "article" article (loadConllu input) ] where - input = source </> relativePath article <.> "conllu" - authorArrow = constL (Text.unpack <$> authors article) - -{- -metadataFrom :: Article -> [ IOSArrow b XmlTree ] -metadataFrom article@(Article {book, authors, domains}) = - [ "id" .= uid article - , "book" .= show book - , "author" .= list (getAuthors authors) - , "domains" .= list (getDomains domains) ] --} + input = source </> relativePath article "conllu" + authorArrow = constL (Text.unpack <$> (getList . authors $ get article)) loadConllu :: FilePath -> IOSArrow b XmlTree -loadConllu input = arrIO0 (parseConllu input <$> readFile input) - >>> ((arrIO die) ||| format) +loadConllu input = + arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format) where + debug msg = die $ "In file " <> input <> "\n" <> msg format = selem "body" [ unlistA >>> formatSentence ] formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ] formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma) - , attr "pos" (txt $< getMaybe pos) ] + , attr "pos" (txt $< getMaybe pos) + , attr "type" (txt $< getMaybe misc) ] [ txt $< getMaybe _form ] pos = liftA2 (<|>) spos _xpos @@ -87,11 +81,15 @@ render (w:ws) = w:(render ws) spos :: CW a -> Maybe String spos = fmap show . _upos +misc :: CW a -> Maybe String +misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack) + where + getNer = lookup "ner" . fmap (fmap (Text.drop 1) . breakOn "=") . splitOn "|" + main :: IO () main = getArgs >>= cli where - books = groupBy book . sortBy uid + books = groupBy (book.get) cli [inputMeta, sourceRoot, targetRoot] = - readTsv inputMeta - >>= either die (mapM_ (sourceRoot `to` targetRoot) . books) + try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/extract-corpus.sh b/scripts/extract-corpus.sh index 4806dd1..5cc0f48 100755 --- a/scripts/extract-corpus.sh +++ b/scripts/extract-corpus.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh INPUT_PATH="${1}" SOURCE_TEXT_ARTICLES="${2}" diff --git a/scripts/extract-first-word.hs b/scripts/extract-first-word.hs new file mode 100644 index 0000000..d41da32 --- /dev/null +++ b/scripts/extract-first-word.hs @@ -0,0 +1,175 @@ +{-# LANGUAGE OverloadedStrings #-} +import Control.Applicative ((<|>), many, optional) +import Data.Attoparsec.Text as Atto + ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string + , takeWhile ) +import Data.ByteString.Lazy as BS (getContents) +import Data.Char (isUpper, toLower, toUpper) +import Data.Csv (FromRecord(..), HasHeader(..), decode) +--import Data.Csv (HasHeader(..), decode) +import Data.Text as Text (Text, pack) +import Data.Text.IO as Text (readFile) +--import Data.Text.IO as Text (getContents, readFile) +import System.Script (syntax, warn) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath ((</>)) +import Text.Printf (printf) + +parenthesis :: Parser Text +parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')' + +qualification :: Parser Text +qualification = + qualifier <* optional (char 'e') <* optional (char 's') <* many1 space + where + qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary)) + vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"] + ucFirst (c:cs) = toUpper c : cs + ucFirst s = s + +eddaFirstWord :: String -> Parser String +eddaFirstWord headWord = + diderotStar *> many space *> form *> meta *> optional qualification *> many1 letter + where + diderotStar = optional (char '*') + pulp = Atto.takeWhile (inClass ",. \t\n\r") + form = string (pack headWord) <|> Atto.takeWhile isUpper + meta = pulp `sepBy` (abbreviation <|> parenthesis) + abbreviation = pack <$> many1 letter <* char '.' + --gram = choice [noun, adj] <* char '.' + --noun = string "s." <* optional space <* choice [char 'm', char 'f'] + --adj = string "adj" + +lgeFirstWord :: String -> Parser String +lgeFirstWord headWord = form *> pulp *> meta *> optional qualification *> pulp *> many1 letter + where + form = string (pack headWord) + pulp = Atto.takeWhile (inClass ",. \t\n\r") + meta = many (parenthesis *> pulp) + +type Triple = (Int, Int, String) +newtype EDdA = EDdA { eddaTriple :: Triple } +newtype LGE = LGE { lgeTriple :: Triple } + +eddaLine :: (Int, Int, String, String, String, String) -> EDdA +eddaLine (tome, name, headWord, _, _, _) = EDdA (tome, name, headWord) + +lgeLine :: (String, Int, Int, String, String) -> LGE +lgeLine (_,tome, name, headWord, _) = LGE (tome, name, headWord) + +instance FromRecord EDdA where + parseRecord = fmap eddaLine . parseRecord + +instance FromRecord LGE where + parseRecord = fmap lgeLine . parseRecord + +data BookModule a = + BookModule + { getTriple :: a -> Triple + , relativePath :: Int -> Int -> FilePath + , firstWordParser :: String -> Atto.Parser String } + +edda :: BookModule EDdA +edda = BookModule + { getTriple = eddaTriple + , relativePath = printf "T%d/article%d.txt" + , firstWordParser = eddaFirstWord } + +lge :: BookModule LGE +lge = BookModule + { getTriple = lgeTriple + , relativePath = printf "T%d/ByRank/%d.txt" + , firstWordParser = lgeFirstWord } + +extractFrom :: FromRecord a => FilePath -> BookModule a -> IO () +extractFrom rootDirectory book = + decode HasHeader <$> BS.getContents + >>= either die (mapM_ (wordFromTriple . getTriple book)) + where + wordFromTriple (tome, name, headWord) = + Text.readFile (rootDirectory </> relativePath book tome name) + >>= pure . parseOnly (firstWordParser book headWord) + >>= either + (\_ -> warn $ printf "%d\t%d" tome name) + (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) + +main :: IO () +main = getArgs >>= run + where + normalize = fmap toLower + run [book, rootDirectory] + | normalize book == "edda" = extractFrom rootDirectory edda + | normalize book == "edda" = extractFrom rootDirectory edda + | normalize book == "lge" = extractFrom rootDirectory lge + run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)" + +{- + +parseEDdAArticle :: FilePath -> (Int, Int, String, String, String, String) -> IO () +parseEDdAArticle rootDirectory (tome, name, headWord, _, _, _) = + Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) + >>= pure . parseOnly (eddaFirstWord headWord) + >>= either + (\_ -> warn $ printf "%d\t%d" tome name) + (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) + +parseLGEArticle :: FilePath -> (String, Int, Int, String, String) -> IO () +parseLGEArticle rootDirectory (_,tome, name, headWord, _) = + Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) + >>= pure . parseOnly (lgeFirstWord headWord) + >>= either + (\_ -> warn $ printf "%d\t%d" tome name) + (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) + +extractFirstWord :: FilePath -> (String -> Parser String) + -> (String, Int, Int, String, String) -> IO () + -- -> (Int, Int, String, String, String, String) -> IO () +extractFirstWord rootDirectory parser (_,tome, name, headWord, _) = +--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) = + Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) + --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) + >>= pure . parseOnly (parser headWord) + >>= either + (\_ -> warn $ printf "%d\t%d" tome name) + (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) + +data Book = EDdA | LGE + +data Triple a where + EDdA :: (Int, String, String, String, String, String) -> Triple (Int, String, String) + LGE :: (String, Int, String, String, String) -> Triple (Int, String, String) + +class Triple a where + getTriple :: a -> (Int, String, String) + +instance Triple EDdA (Int, String, String, String, String, String) where + getTriple + +getTriple :: Triple a -> a +getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) +getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) + +data BookModule = + BookModule + { getRecord :: Triple (Int, String, String) + , relativePath :: (Int, String) -> FilePath + , firstWordParser :: Atto.Parser Text } + +eddaLine :: (Int, Int, String, String, String, String) -> (Int, String, String) +eddaLine (tome, name, headWord, _, _, _) = (tome, name, headWord) + +eddaPath + +lgeLine :: (String, Int, String, String, String) -> (Int, String, String) +lgeLine (_,tome, name, headWord, _) = (tome, name, headWord) + +data Format = + EDdA (Int, String, String, String, String, String) + | LGE (String, Int, String, String, String) + +getTriple :: Format -> Triple +getTriple (EDdA (tome, name, headWord, _, _, _)) = (tome, name, headWord) +getTriple (LGE (_,tome, name, headWord, _)) = (tome, name, headWord) + +-} diff --git a/scripts/extract-first-word_lge-mode-backup.hs b/scripts/extract-first-word_lge-mode-backup.hs new file mode 100644 index 0000000..76d3c79 --- /dev/null +++ b/scripts/extract-first-word_lge-mode-backup.hs @@ -0,0 +1,76 @@ +{-# LANGUAGE OverloadedStrings #-} +import Control.Applicative ((<|>), many, optional) +import Data.Attoparsec.Text as Atto + ( Parser, char, choice, inClass, letter, many1, parseOnly, sepBy, space, string + , takeWhile ) +import Data.ByteString.Lazy as BS (getContents) +import Data.Char (isUpper, toLower, toUpper) +import Data.Csv (HasHeader(..), decode) +import Data.Text as Text (Text, pack) +import Data.Text.IO as Text (getContents, readFile) +import System.Script (syntax, warn) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath ((</>)) +import Text.Printf (printf) + +eddaFirstWord :: String -> Parser String +eddaFirstWord headWord = + diderotStar *> many space *> head *> meta *> optional qualification *> many1 letter + where + diderotStar = optional (char '*') + pulp = Atto.takeWhile (inClass ",. \t\n\r") + head = string (pack headWord) <|> Atto.takeWhile isUpper --choice [Atto.takeWhile isUpper, string (pack headWord)] + meta = pulp `sepBy` (abbreviation <|> parenthesis) + --meta = pulp `sepBy` (choice [abbreviation, parenthesis]) + --head = string (pack headWord) <* pulp `sepBy` (choice [abbreviation, parenthesis]) + abbreviation = pack <$> many1 letter <* char '.' + --head = string (pack headWord) <* pulp `sepBy` (choice [gram, parenthesis]) + gram = choice [noun, adj] <* char '.' + noun = string "s." <* optional space <* choice [char 'm', char 'f'] + adj = string "adj" + +parenthesis :: Parser Text +parenthesis = char '(' *> Atto.takeWhile (/= ')') <* char ')' + +qualification :: Parser Text +qualification = + qualifier <* optional (char 'e') <* optional (char 's') <* many space + where + qualifier = choice $ string . pack <$> (vocabulary ++ (ucFirst <$> vocabulary)) + vocabulary = ["petit", "grand", "vaste", "belle", "fameux", "fameuse"] + ucFirst (c:cs) = toUpper c : cs + +lgeFirstWord :: String -> Parser String +lgeFirstWord headWord = head *> pulp *> meta *> optional qualification *> pulp *> many1 letter + where + head = string (pack headWord) + pulp = Atto.takeWhile (inClass ",. \t\n\r") + meta = many (parenthesis *> pulp) + +extractFirstWord :: FilePath -> (String -> Parser String) + -> (String, Int, Int, String, String) -> IO () + -- -> (Int, Int, String, String, String, String) -> IO () +extractFirstWord rootDirectory parser (_,tome, name, headWord, _) = +--extractFirstWord rootDirectory parser (tome, name, headWord, _, _, _) = + Text.readFile (rootDirectory </> (printf "T%d/ByRank/%d.txt" tome name)) + --Text.readFile (rootDirectory </> (printf "T%d/article%d.txt" tome name)) + >>= pure . parseOnly (parser headWord) + >>= either + (\_ -> warn $ printf "%d\t%d" tome name) + (\word -> putStrLn $ printf "%d\t%d\t%s" tome name word) + +run :: [String] -> IO () +run [book, rootDirectory] + | normalizedBook == "edda" = apply eddaFirstWord + | normalizedBook == "lge" = apply lgeFirstWord + where + normalizedBook = toLower <$> book + apply parser = + BS.getContents + >>= either die (mapM_ $ extractFirstWord rootDirectory parser) . decode HasHeader + +run _ = syntax "BOOK[either EDdA or LGE] SOURCE_DIRECTORY path to the root directory containing the articles (which CSV metadata are read from stdin)" + +main :: IO () +main = getArgs >>= run diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh new file mode 100755 index 0000000..c99cc55 --- /dev/null +++ b/scripts/extract-from-source.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +BASE_DIR="${0%/*}" + +source ${BASE_DIR}/lib.sh + +if [ "$#" != 2 ] +then + die "${0##*/} SOURCE_DIRECTORY TARGET_DIRECTORY" +else + SOURCE="${1}" + TARGET="${2}" + [ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing 1 .tei file per tome)" + [ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)" +fi + +FILES_TSV="${TARGET}/files.tsv" +printf "book tome rank headWord name page\n" > "${FILES_TSV}" +${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}" +#${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}" diff --git a/scripts/lib.sh b/scripts/lib.sh new file mode 100644 index 0000000..94a0897 --- /dev/null +++ b/scripts/lib.sh @@ -0,0 +1,4 @@ +function die() { + printf "${1}\n" + exit 1 +} diff --git a/scripts/merge.hs b/scripts/merge.hs new file mode 100755 index 0000000..b0843d3 --- /dev/null +++ b/scripts/merge.hs @@ -0,0 +1,28 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-} + +import Control.Monad.Except (ExceptT(..), runExceptT) +import Data.Map ((!?)) +import Data.Maybe (catMaybes) +import Data.Vector (Vector, toList) +import GEODE.Metadata (Contrastive, Entry, Has(..), PrimaryKey, type(@)(..), indexBy, readNamedTsv, tsvFile) +import System.Environment (getArgs) +import System.Exit (die) +import System.Script (syntax) + +merge :: Vector (PrimaryKey @ Entry) -> Vector (PrimaryKey @ Contrastive) -> [PrimaryKey @ Entry @ Contrastive] +merge left right = catMaybes (mergeRow <$> toList right) + where + indexed = get <$> indexBy get left + mergeRow (pK :@: contrastive) = + (\entry -> (pK :@: entry :@: contrastive)) <$> (indexed !? pK) + +main :: IO () +main = getArgs >>= run + where + run [left, right, output] = + runExceptT (merge + <$> ExceptT (readNamedTsv left) + <*> ExceptT (readNamedTsv right) ) + >>= either die (tsvFile output) + run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv" diff --git a/scripts/parallel-links.hs b/scripts/parallel-links.hs new file mode 100755 index 0000000..3243c5a --- /dev/null +++ b/scripts/parallel-links.hs @@ -0,0 +1,36 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE ExplicitNamespaces #-} +module Main where + +import Data.Char as Text (toLower) +import Data.Text as Text (map) +import GEODE.Metadata + ( Entry(..), Has(..), PrimaryKey(..), type(@), groupBy, readNamedTsv, tsvFile ) +import System.Environment (getArgs) +import System.Exit (die) +import System.Script (syntax) + +type Line = PrimaryKey @ Entry + +findPairs :: Foldable t => Bool -> t Line -> [Line] +findPairs caseInsensitive = + concatMap snd . filter isPair . groupBy (normalize . headWord . get) + where + isPair = oneInEach . groupBy (book.get) . snd + oneInEach = (&&) <$> bothBooks <*> oneByBook + bothBooks = ((2 ==) . length) + oneByBook = all ((1 ==) . length . snd) + normalize + | caseInsensitive = Text.map toLower + | otherwise = id + +main :: IO () +main = getArgs >>= popCaseInsensitive run + where + run caseInsensitive [input, output] = readNamedTsv input + >>= either + die + (tsvFile output . findPairs caseInsensitive) + run _ _ = syntax "[-i] SOURCE_METADATA.tsv TARGET_METADATA.tsv" + popCaseInsensitive f ("-i":args) = f True args + popCaseInsensitive f args = f False args diff --git a/scripts/parallel-links.py b/scripts/parallel-links.py index 284dc5d..605b12b 100755 --- a/scripts/parallel-links.py +++ b/scripts/parallel-links.py @@ -27,12 +27,14 @@ def unserializeMetadata(path, rowReader): if header: header = False else: - yield rowReader(row) + yield { + 'book': row[0] + } -def concat(generators): - for g in generators: - for x in g: - yield x +#def concat(generators): +# for g in generators: +# for x in g: +# yield x def naiveIndexBy(field, elements): d = {} @@ -44,63 +46,6 @@ def naiveIndexBy(field, elements): d[key] = e return d -def growPrefixes(d, keys, maxLength): - for length in range(maxLength, 1, -1): - newGeneration = {} - for key in keys: - if len(key) == length: - newKey = key[:-1] - if newKey in newGeneration or newKey in d: - newGeneration[newKey] = None - else: - newGeneration[newKey] = d[key] - for key, value in newGeneration.items(): - if value is not None: - d[key] = value - keys.add(key) - elif key not in d: - d[key] = value - -def indexBy(field, elements, prefix=True): - d = {} - for e in elements: - key = e[field] - if key in d: - d[key] = None - else: - d[key] = e - if prefix: - keys = set(d.keys()) - growPrefixes(d, keys, max(map(len, keys))) - return d - -def headWords(head): - words = head.split() - if len(words) == 1: - return words - else: - return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()] - -def identify(head, haystack): - if head in haystack: - if haystack[head] is not None: - return {'type': 'exact', 'match': head, 'found': haystack[head]} - else: - return None - else: - prefix = head[:-1] - while len(prefix) > 0 and prefix not in haystack: - prefix = prefix[:-1] - if prefix in haystack and haystack[prefix] is not None: - return { - 'type': 'prefix', - 'match': head, - 'found': haystack[prefix], - 'precision': len(prefix) / len(head) - } - else: - return None - def naiveGetArrows(source, target): indexedSource = naiveIndexBy('head', source) indexedTarget = naiveIndexBy('head', target) @@ -111,28 +56,85 @@ def naiveGetArrows(source, target): 'target': indexedTarget[head] } -def getArrows(source, target): - for article in source: - heads = headWords(article['head']) - identified = map(lambda w: identify(w, target), heads) - entries = [e for e in identified if e is not None] - if len(entries) == 1: - yield { - 'type': 'match', - 'source': article, - 'target': entries[0] - } - elif len(entries) > 1: - yield { - 'type': 'ambiguity', - 'source': article, - 'target': entries - } - -def interesting(arrow): - if arrow['type'] == 'match': - target = arrow['target'] - return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8) +#def growPrefixes(d, keys, maxLength): +# for length in range(maxLength, 1, -1): +# newGeneration = {} +# for key in keys: +# if len(key) == length: +# newKey = key[:-1] +# if newKey in newGeneration or newKey in d: +# newGeneration[newKey] = None +# else: +# newGeneration[newKey] = d[key] +# for key, value in newGeneration.items(): +# if value is not None: +# d[key] = value +# keys.add(key) +# elif key not in d: +# d[key] = value + +#def indexBy(field, elements, prefix=True): +# d = {} +# for e in elements: +# key = e[field] +# if key in d: +# d[key] = None +# else: +# d[key] = e +# if prefix: +# keys = set(d.keys()) +# growPrefixes(d, keys, max(map(len, keys))) +# return d + +#def headWords(head): +# words = head.split() +# if len(words) == 1: +# return words +# else: +# return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()] + +#def identify(head, haystack): +# if head in haystack: +# if haystack[head] is not None: +# return {'type': 'exact', 'match': head, 'found': haystack[head]} +# else: +# return None +# else: +# prefix = head[:-1] +# while len(prefix) > 0 and prefix not in haystack: +# prefix = prefix[:-1] +# if prefix in haystack and haystack[prefix] is not None: +# return { +# 'type': 'prefix', +# 'match': head, +# 'found': haystack[prefix], +# 'precision': len(prefix) / len(head) +# } +# else: +# return None +# +#def getArrows(source, target): +# for article in source: +# heads = headWords(article['head']) +# identified = map(lambda w: identify(w, target), heads) +# entries = [e for e in identified if e is not None] +# if len(entries) == 1: +# yield { +# 'type': 'match', +# 'source': article, +# 'target': entries[0] +# } +# elif len(entries) > 1: +# yield { +# 'type': 'ambiguity', +# 'source': article, +# 'target': entries +# } + +#def interesting(arrow): +# if arrow['type'] == 'match': +# target = arrow['target'] +# return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8) #gold = [a for a in arrows if interesting(a)] diff --git a/scripts/select.hs b/scripts/select.hs new file mode 100755 index 0000000..ba1c5f3 --- /dev/null +++ b/scripts/select.hs @@ -0,0 +1,27 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} +module Main where + +import Control.Monad.Except (ExceptT(..), runExceptT) +import Data.Map ((!?)) +import Data.Maybe (catMaybes) +import Data.Vector (Vector, toList) +import GEODE.Metadata (Entry, Has(..), PrimaryKey, type(@), indexBy, readNamedTsv, tsvFile) +import System.Environment (getArgs) +import System.Exit (die) +import System.Script (syntax) + +getMeta :: Vector PrimaryKey -> Vector (PrimaryKey @ Entry) -> [(PrimaryKey @ Entry)] +getMeta coords meta = catMaybes ((indexed !?) <$> toList coords) + where + indexed = indexBy get meta + +main :: IO () +main = getArgs >>= run + where + run [keys, input, output] = + runExceptT ( getMeta + <$> ExceptT (readNamedTsv keys) + <*> ExceptT (readNamedTsv input) ) + >>= either die (tsvFile output) + run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv" diff --git a/scripts/subcorpus/getFiles.hs b/scripts/subcorpus/getFiles.hs new file mode 100755 index 0000000..ca99a95 --- /dev/null +++ b/scripts/subcorpus/getFiles.hs @@ -0,0 +1,29 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" + +import GEODE.Metadata (PrimaryKey(..), readNamedTsv, relativePath) +import System.Directory (createDirectoryIfMissing, doesFileExist) +import System.Environment (getArgs) +import System.FilePath ((</>), takeDirectory) +import System.Posix.Files (createLink) +import System.Script (syntax, try) + +move :: String -> FilePath -> FilePath -> PrimaryKey -> IO () +move extension sourceRoot targetRoot article = do + fileExists <- doesFileExist sourceFile + if fileExists + then do + createDirectoryIfMissing True (takeDirectory targetFile) + createLink sourceFile targetFile + else + putStrLn sourceFile + where + fileName = relativePath article extension + sourceFile = sourceRoot </> fileName + targetFile = targetRoot </> fileName + +main :: IO () +main = getArgs >>= run + where + run [files, extension, source, target] = + try (readNamedTsv files) >>= mapM_ (move extension source target) + run _ = syntax "FILES.tsv EXTENSION SOURCE_DIRECTORY TARGET_DIRECTORY" diff --git a/scripts/subcorpus/getTXMQuery.hs b/scripts/subcorpus/getTXMQuery.hs new file mode 100755 index 0000000..854bf6a --- /dev/null +++ b/scripts/subcorpus/getTXMQuery.hs @@ -0,0 +1,21 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE OverloadedStrings #-} + +import Data.Text as Text (Text, pack, intercalate) +import Data.Text.IO as Text (putStr) +import GEODE.Metadata (PrimaryKey, uid, readNamedTsv) +import System.Environment (getArgs) +import System.Script (syntax, try) +import Text.Printf (printf) + +txmQuery :: Foldable t => t PrimaryKey -> Text +txmQuery = ("/region[text,a]:: " <>) . Text.intercalate "|" . groupBy (book.get) + where + select = Text.pack . printf "a.text_uid=\"%s\"" . uid + +main :: IO () +main = getArgs >>= run + where + run [files] = + try (readNamedTsv files) >>= Text.putStr . txmQuery + run _ = syntax "FILES.tsv" -- GitLab