From f8b70bc46a7171ea50377a55e73c7f8b23f4fe70 Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Thu, 4 May 2023 16:04:03 +0200 Subject: [PATCH] Increase the entropy of the universe --- Makefile | 74 ++++++-- lib/Conllu/Parse/Paragraph.hs | 28 +++ lib/Data/BIO.hs | 45 +++++ lib/Data/Csv/DB.hs | 9 + lib/Data/Metadata.hs | 79 +++++---- lib/Data/Metadata/Article.hs | 71 ++++---- lib/Data/Metadata/Paragraph.hs | 40 +++++ lib/Data/Metadata/Projector.hs | 33 ++++ lib/Data/Metadata/Trie.hs | 54 ++++++ lib/Data/Metadata/Types.hs | 38 +++++ lib/Data/SpaCy.hs | 16 ++ lib/Text/Editor.bak.hs | 101 +++++++++++ lib/Text/Editor.hs | 49 +++--- lib/Text/Filter.hs | 50 ++++++ lib/Text/TEI.hs | 99 +++++++++++ manifest.scm | 10 +- scripts/EDdA-metadata.py | 34 +++- scripts/LGE-metadata-from-EDdA.py | 12 +- scripts/add-domain-metadata.hs | 50 ++++++ ...iclesToVolume.sh => articles-to-volume.sh} | 0 scripts/conllu-to-lexicoscope.hs | 95 +++++++++++ scripts/conllu-to-txm.hs | 97 +++++++++++ scripts/detokenizer.hs | 22 +++ scripts/extract-corpus.sh | 27 +++ scripts/extract-parallel-EDdA.sh | 3 +- scripts/extract-parallel-LGE.sh | 4 +- scripts/fix-tsv.hs | 26 +++ scripts/inject-metadata-to-lexicoscope.hs | 48 ++++++ scripts/inject-metadata.sh | 58 +++++++ scripts/lge-articles-to-volumes.hs | 153 +++++++++++++++++ scripts/linearize.hs | 7 +- scripts/parallel-links.py | 160 ++++++++++++++++++ scripts/parallel-metadata.hs | 15 ++ scripts/parallel-metadata.py | 46 +++++ scripts/reroot.hs | 11 +- scripts/spacy-to-txm.hs | 65 +++++++ scripts/stanza-annotator.py | 55 ------ scripts/stanza-txt-annotator.py | 79 +++++++++ scripts/stanza-xml-annotator.py | 69 ++++++++ scripts/xml-annotator.py.bak | 70 ++++++++ scripts/xml-annotator.py.remote | 67 ++++++++ 41 files changed, 1895 insertions(+), 174 deletions(-) create mode 100644 lib/Conllu/Parse/Paragraph.hs create mode 100644 lib/Data/BIO.hs create mode 100644 lib/Data/Csv/DB.hs create mode 100644 lib/Data/Metadata/Paragraph.hs create mode 100644 lib/Data/Metadata/Projector.hs create mode 100644 lib/Data/Metadata/Trie.hs create mode 100644 lib/Data/Metadata/Types.hs create mode 100644 lib/Data/SpaCy.hs create mode 100644 lib/Text/Editor.bak.hs create mode 100644 lib/Text/Filter.hs create mode 100644 lib/Text/TEI.hs create mode 100755 scripts/add-domain-metadata.hs rename scripts/{articlesToVolume.sh => articles-to-volume.sh} (100%) create mode 100755 scripts/conllu-to-lexicoscope.hs create mode 100755 scripts/conllu-to-txm.hs create mode 100755 scripts/detokenizer.hs create mode 100755 scripts/extract-corpus.sh create mode 100755 scripts/fix-tsv.hs create mode 100755 scripts/inject-metadata-to-lexicoscope.hs create mode 100755 scripts/inject-metadata.sh create mode 100755 scripts/lge-articles-to-volumes.hs create mode 100755 scripts/parallel-links.py create mode 100644 scripts/parallel-metadata.hs create mode 100755 scripts/parallel-metadata.py create mode 100755 scripts/spacy-to-txm.hs delete mode 100755 scripts/stanza-annotator.py create mode 100755 scripts/stanza-txt-annotator.py create mode 100755 scripts/stanza-xml-annotator.py create mode 100755 scripts/xml-annotator.py.bak create mode 100755 scripts/xml-annotator.py.remote diff --git a/Makefile b/Makefile index c181e28..2614959 100644 --- a/Makefile +++ b/Makefile @@ -1,44 +1,96 @@ CORPUS_ROOT=/home/alice/Dehors/Corpus EDDA=$(CORPUS_ROOT)/EDdA LGE=$(CORPUS_ROOT)/LGE +PARALLEL=$(CORPUS_ROOT)/Parallel + +PARALLEL_LINKS=$(PARALLEL)/edda-lge.csv EDDA_META=$(EDDA)/metadata.csv 
+EDDA_API=$(EDDA)/API
+EDDA_CORRESPONDANCES=$(EDDA)/ARTFL-ENCCRE.csv
+PARALLEL_EDDA=$(EDDA)/Parallel
+PARALLEL_EDDA_META=$(PARALLEL_EDDA)/metadata.csv
+RAW_PARALLEL_EDDA=$(PARALLEL)/Text/EDdA
+#RAW_PARALLEL_EDDA=$(PARALLEL_EDDA)/Text
+TEI_PARALLEL_EDDA=$(PARALLEL_EDDA)/TEI
+LINEARIZED_PARALLEL_EDDA_ROOT=$(PARALLEL_EDDA)/Linearized
+RAW_EDDA=$(EDDA)/Text
+TEI_EDDA=$(EDDA)/TEI
 RAW_LGE=$(LGE)/Text
-TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
-TOMES=$(TOME_DIRS:$(RAW_LGE)/T%=%)
+EDDA_TOME_DIRS=$(wildcard $(RAW_EDDA)/T*)
+EDDA_TOMES=$(EDDA_TOME_DIRS:$(RAW_EDDA)/T%=%)
+LGE_TOME_DIRS=$(wildcard $(RAW_LGE)/T*)
+LGE_TOMES=$(LGE_TOME_DIRS:$(RAW_LGE)/T%=%)
 TEI_LGE=$(LGE)/TEI
 PARALLEL_LGE=$(LGE)/Parallel
 LGE_META_FROM_EDDA=$(PARALLEL_LGE)/metadata.csv
-RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
+RAW_PARALLEL_LGE=$(PARALLEL)/Text/LGE
+#RAW_PARALLEL_LGE=$(PARALLEL_LGE)/Text
 LINEARIZED_PARALLEL_LGE_ROOT=$(PARALLEL_LGE)/Linearized
 TEI_PARALLEL_LGE=$(PARALLEL_LGE)/TEI
-LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
+LINEARIZED_PARALLEL_EDDA=$(LINEARIZED_PARALLEL_EDDA_ROOT)/ $(EDDA_TOMES:%=$(LINEARIZED_PARALLEL_EDDA_ROOT)/T%)
+LINEARIZED_PARALLEL_LGE=$(LINEARIZED_PARALLEL_LGE_ROOT)/ $(LGE_TOMES:%=$(LINEARIZED_PARALLEL_LGE_ROOT)/T%)
+LINEARIZED_PARALLEL=$(LINEARIZED_PARALLEL_EDDA) $(LINEARIZED_PARALLEL_LGE)
+
+STANZA_PARALLEL_EDDA=$(PARALLEL_EDDA)/stanza
 STANZA_PARALLEL_LGE=$(PARALLEL_LGE)/stanza
+STANZA_PARALLEL=$(STANZA_PARALLEL_EDDA) $(STANZA_PARALLEL_LGE)
+
+TXM_PARALLEL_EDDA=$(PARALLEL_EDDA)/TXM
+TXM_PARALLEL_LGE=$(PARALLEL_LGE)/TXM
+TXM_PARALLEL=$(TXM_PARALLEL_EDDA) $(TXM_PARALLEL_LGE)
+
+METADATA=$(EDDA_META) $(PARALLEL_EDDA_META) $(LGE_META_FROM_EDDA)
-METADATA=$(EDDA_META) $(LGE_META_FROM_EDDA)
+PARALLEL_DATA=$(PARALLEL_LINKS)
-all: $(METADATA) $(LINEARIZED_PARALLEL_LGE)
+#all: $(PARALLEL_DATA) $(METADATA) $(RAW_PARALLEL_EDDA) $(LINEARIZED_PARALLEL_EDDA) $(STANZA_PARALLEL) $(TXM_PARALLEL)
+all: $(PARALLEL_DATA) $(EDDA_META) $(PARALLEL_EDDA_META) $(TEI_PARALLEL_EDDA)
 
 $(EDDA_META): $(EDDA)/TEI/
-	./scripts/EDdA-metadata.py $< > $@
+	EDDA_API="$(EDDA_API)" ./scripts/EDdA-metadata.py $< $(EDDA_CORRESPONDANCES) > $@
+
+$(PARALLEL_LINKS): $(EDDA_META) $(RAW_LGE)
+	mkdir -p $(PARALLEL)
+	./scripts/parallel-links.py $^ $@
+
+$(PARALLEL_EDDA_META): $(EDDA_META)
+	mkdir -p $(PARALLEL_EDDA)
+	./scripts/LGE-metadata-from-EDdA.py $^ $(RAW_LGE) $@
 
 $(LGE_META_FROM_EDDA): $(EDDA_META) $(PARALLEL_LGE)/
 	./scripts/LGE-metadata-from-EDdA.py $< $(RAW_LGE) $@
 
+$(RAW_PARALLEL_EDDA): $(PARALLEL_EDDA_META) $(RAW_EDDA)
+	./scripts/extract-parallel-EDdA.sh $^ $@
+
+$(TEI_PARALLEL_EDDA): $(PARALLEL_EDDA_META) $(TEI_EDDA)
+	./scripts/extract-parallel-EDdA.sh $^ $@
+
 $(RAW_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(RAW_LGE)
-	./scripts/extract-parallel-LGE.sh $^ $@
+	bash ./scripts/extract-parallel-LGE.sh $^ $@
 
 $(TEI_PARALLEL_LGE): $(LGE_META_FROM_EDDA) $(TEI_LGE)
 	./scripts/extract-parallel-LGE.sh $^ $@
 
-$(STANZA_PARALLEL_LGE): $(RAW_PARALLEL_LGE)
-	./scripts/stanza-annotator.py $< $@
+$(STANZA_PARALLEL_EDDA): $(LINEARIZED_PARALLEL_EDDA)
+	./scripts/stanza-txt-annotator.py $< $@
+
+$(STANZA_PARALLEL_LGE): $(LINEARIZED_PARALLEL_LGE)
+	./scripts/stanza-txt-annotator.py $< $@
 
-%/:
+$(TXM_PARALLEL_EDDA): $(STANZA_PARALLEL_EDDA)
+	./scripts/articles-to-volumes.hs $< $@
+
+$(TXM_PARALLEL_LGE): $(STANZA_PARALLEL_LGE)
+	./scripts/articles-to-volumes.hs $< $@
+
+$(LINEARIZED_PARALLEL_EDDA_ROOT)/T%: $(RAW_PARALLEL_EDDA)/T%
 	mkdir -p $@
+	find $< -type f -name '*.txt' | ./scripts/linearize.hs $@
 $(LINEARIZED_PARALLEL_LGE_ROOT)/T%: $(RAW_PARALLEL_LGE)/T%
 	mkdir -p $@
diff --git a/lib/Conllu/Parse/Paragraph.hs b/lib/Conllu/Parse/Paragraph.hs
new file mode 100644
index 0000000..8d76fb9
--- /dev/null
+++ b/lib/Conllu/Parse/Paragraph.hs
@@ -0,0 +1,28 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module Conllu.Parse.Paragraph (
+    Paragraph(..)
+  , byParagraph
+  ) where
+
+import Conllu.Type (Doc, Sent(..))
+import Data.List (find, isPrefixOf)
+
+newtype Paragraph = Paragraph {
+    sentences :: [Sent]
+  }
+
+-- | Assumes the document starts with a newpar (otherwise the first sentences
+-- don't have a paragraph id to be associated to — this is always the case in
+-- the output of our stanza annotator)
+byParagraph :: Doc -> [Paragraph]
+byParagraph = walk []
+  where
+    walk stack (s:ss) =
+      case (getParagraph s, stack) of
+        (Just _, []) -> walk [s] ss
+        (Just _, _) -> Paragraph (reverse stack):(walk [s] ss)
+        _ -> walk (s:stack) ss
+    walk stack [] = [Paragraph $ reverse stack]
+
+getParagraph :: Sent -> Maybe String
+getParagraph = fmap snd . find (("newpar " `isPrefixOf`) . fst) . _meta
diff --git a/lib/Data/BIO.hs b/lib/Data/BIO.hs
new file mode 100644
index 0000000..f75c5ad
--- /dev/null
+++ b/lib/Data/BIO.hs
@@ -0,0 +1,45 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Data.BIO
+  ( BIO(..)
+  , Tree(..)
+  , bio ) where
+
+import Control.Applicative (empty)
+import Data.ByteString as BS (splitAt)
+import Data.Csv (FromField(..), ToField(..))
+import Data.Foldable (toList)
+
+data BIO a = Begin a | Inside a | Outside deriving (Show)
+
+instance FromField a => FromField (BIO a) where
+  parseField "O" = pure Outside
+  parseField f =
+    case BS.splitAt 2 f of
+      ("B-", tag) -> Begin <$> parseField tag
+      ("I-", tag) -> Inside <$> parseField tag
+      _ -> empty
+
+instance ToField a => ToField (BIO a) where
+  toField Outside = "O"
+  toField (Begin a) = "B-" <> toField a
+  toField (Inside a) = "I-" <> toField a
+
+data Tree a b = Leaf a | Node b [Tree a b] | Root [Tree a b] deriving (Show)
+
+bio :: (Eq b, Foldable t) => (a -> BIO b) -> t a -> Tree a b
+bio field = Root . atRoot . toList
+  where
+    atRoot [] = []
+    atRoot (a:as) =
+      case field a of
+        Outside -> (Leaf a):(atRoot as)
+        Begin b -> enter b a as
+        Inside b -> enter b a as
+    enter b a as =
+      let (nodes, otherTrees) = inside [] b as in
+        (Node b (Leaf a:nodes)):(atRoot otherTrees)
+    inside stack b0 l =
+      case field <$> take 1 l of
+        [Inside b1]
+          | b0 == b1 -> inside (Leaf (head l):stack) b0 (tail l)
+        _ -> (reverse stack, l)
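The grouping in `bio` is easy to misread, so here is a minimal, self-contained
sketch of what it produces (the token list is invented for the example; only
lib/Data/BIO.hs is assumed, e.g. via runhaskell --ghc-arg="-i lib"):

    import Data.BIO (BIO(..), bio)

    tokens :: [(String, BIO String)]
    tokens = [ ("Rome", Begin "LOC"), ("antique", Inside "LOC")
             , ("est", Outside), ("belle", Outside) ]

    -- Consecutive Inside tags with the same label are folded under the Node
    -- opened by the preceding Begin; this prints:
    -- Root [Node "LOC" [Leaf ("Rome",Begin "LOC"),Leaf ("antique",Inside "LOC")]
    --      ,Leaf ("est",Outside),Leaf ("belle",Outside)]
    main :: IO ()
    main = print (bio snd tokens)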
diff --git a/lib/Data/Csv/DB.hs b/lib/Data/Csv/DB.hs
new file mode 100644
index 0000000..5119872
--- /dev/null
+++ b/lib/Data/Csv/DB.hs
@@ -0,0 +1,9 @@
+module Data.Csv.DB
+  (join) where
+
+import Data.List (find)
+import Data.Maybe (mapMaybe)
+import Data.Vector as Vector (Vector, fromList, toList)
+-- Naive nested-loop join keyed by the two projections; a Nothing from the selector drops the row.
+join :: Eq k => (a -> k) -> (b -> k) -> (a -> Maybe b -> Maybe c) -> Vector a -> Vector b -> Vector c
+join ka kb selector as bs = Vector.fromList $ mapMaybe (\a -> selector a (find ((ka a ==) . kb) (Vector.toList bs))) (Vector.toList as)
diff --git a/lib/Data/Metadata.hs b/lib/Data/Metadata.hs
index 582eaff..8af810b 100644
--- a/lib/Data/Metadata.hs
+++ b/lib/Data/Metadata.hs
@@ -1,38 +1,57 @@
-{-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
 module Data.Metadata (
-    Metadata
-  , byTome
-  , metadata
+    Authors(..)
+  , Book(..)
+  , Domains(..)
+  , FromBook(..)
+  , HasAuthors(..)
+  , HasDomains(..)
+  , InFile(..)
+  , TXMText
+  , Unique(..)
+  , groupBy
+  , list
+  , readTsv
+  , sortBy
   ) where
 
-import Data.Attoparsec.Text (Parser, endOfLine, sepBy1, takeTill)
-import Data.List as List (foldl')
-import Data.Map as Map (Map, adjust, empty, foldl', insert, member)
-import Data.Metadata.Article (Article(..), article)
-import Data.Text (Text)
+import Data.ByteString.Lazy as ByteString (readFile, writeFile)
+import Data.ByteString.Char8 as StrictByteString (pack)
+import Data.Csv
+  ( DecodeOptions(..), EncodeOptions(..), FromRecord, HasHeader(..), ToRecord
+  , decodeWith, defaultEncodeOptions, encodeWith )
+import Data.Foldable as Foldable (toList)
+import Data.List (sortOn)
+import Data.Map.Strict as Map (alter, empty, toList)
+import Data.Metadata.Projector
+  (FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), TXMText, Unique(..))
+import Data.Metadata.Types (Authors(..), Book(..), Domains(..))
+import Data.Text as Text (Text, intercalate, unpack)
+import Data.Vector as Vector (Vector)
 
---type Metadata = Map Text Article
-type Metadata = [Article]
-type Tomes = Map Int Metadata
+list :: [Text] -> String
+list ts = Text.unpack $ ":" <> intercalate ":" ts <> ":"
 
-metadata :: Parser Metadata
---metadata = List.foldl' indexByUid Map.empty <$>
---  (skipLine *> article `sepBy1` endOfLine)
-metadata = skipLine *> article `sepBy1` endOfLine
+readTsv :: FromRecord a => FilePath -> IO (Either String (Vector a))
+readTsv source = decodeWith tsv HasHeader <$> ByteString.readFile source
   where
-    skipLine = takeTill (\c -> c == '\r' || c == '\n') *> endOfLine
-    indexByUid temp a@(Article {uid}) = Map.insert uid a temp
+    tsv = DecodeOptions {decDelimiter = fromIntegral (fromEnum '\t')}
 
-byTome :: Metadata -> Tomes
---byTome = Map.foldl' sortByTome Map.empty
-byTome = List.foldl' sortByTome Map.empty
+writeTsv :: (Foldable t, ToRecord a) => [String] -> t a -> FilePath -> IO ()
+writeTsv fields content target =
+  ByteString.writeFile target
+    $ encodeWith tsv [bsFields]
+    <> encodeWith tsv (Foldable.toList content)
   where
-    sortByTome temp a@(Article {uid, tome}) =
---      Map.adjust (Map.insert uid a) tome .
-      Map.adjust (a:) tome
---      . (if tome `member` temp then id else Map.insert tome Map.empty) $ temp
-      . (if tome `member` temp then id else Map.insert tome []) $ temp
-{-
-      . (if tome `member` temp then id else Map.insert tome Map.empty)
-      $ tome
- -}
+    bsFields = StrictByteString.pack <$> fields
+    tsv = defaultEncodeOptions
+      { encDelimiter = fromIntegral (fromEnum '\t')
+      , encUseCrLf = False }
+
+sortBy :: (Foldable t, Ord k) => (a -> k) -> t a -> [a]
+sortBy field = sortOn field . Foldable.toList
+
+groupBy :: (Foldable t, Ord k) => (v -> k) -> t v -> [(k, [v])]
+groupBy field = Map.toList . foldr group Map.empty
+  where
+    group article = Map.alter (Just . maybe [article] (article:)) (field article)
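A usage sketch of the new helpers (the record type and the file name are
invented; cassava derives the decoder through Generic):

    {-# LANGUAGE DeriveGeneric #-}
    import Data.Csv (FromRecord)
    import Data.Metadata (groupBy, readTsv, sortBy)
    import GHC.Generics (Generic)

    data Row = Row { tome :: Int, headWord :: String } deriving (Generic, Show)
    instance FromRecord Row

    main :: IO ()
    main = do
      rows <- readTsv "metadata.tsv"  -- hypothetical headered TSV
      either putStrLn (print . groupBy tome . sortBy headWord) rows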
diff --git a/lib/Data/Metadata/Article.hs b/lib/Data/Metadata/Article.hs
index 300c3d8..ba8c694 100644
--- a/lib/Data/Metadata/Article.hs
+++ b/lib/Data/Metadata/Article.hs
@@ -1,30 +1,41 @@
-{-# LANGUAGE OverloadedStrings #-}
-module Data.Metadata.Article (
-    Article(..)
-  , article
-  ) where
-
-import Data.Attoparsec.Text (Parser, char, decimal, sepBy1, string, takeTill)
-import Data.Text (Text, stripEnd)
-
-data Article = Article {
-    uid :: Text
-  , tome :: Int
-  , rank :: Int
-  , headWord :: Text
-  , domains :: [Text]
-  } deriving Show
-
-article :: Parser Article
-article = Article
-  <$> cell
-  <*> (decimal <* char ',')
-  <*> (decimal <* char ',')
-  <*> cell
-  <*> domains_
-  where
-    cell = takeTill (== ',') <* char ','
-
-domains_ :: Parser [Text]
-domains_ = fmap stripEnd <$>
-  (takeTill (`elem` ['|', '\r', '\n'])) `sepBy1` (string "| ")
+{-# LANGUAGE DeriveGeneric, NamedFieldPuns, OverloadedStrings #-}
+module Data.Metadata.Article
+  ( Article(..) ) where
+
+import Data.Csv (FromRecord(..), ToRecord(..))
+import Data.Metadata
+  ( Authors, Book, Domains
+  , FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), Unique(..) )
+import Data.Text (Text, unpack)
+import GHC.Generics (Generic)
+import Text.Printf (printf)
+
+data Article = Article
+  { articleBook :: Book
+  , tome :: Int
+  , name :: Text
+  , headWord :: Text
+  , rank :: Int
+  , page :: Int
+  , articleAuthors :: Authors
+  , articleDomains :: Domains } deriving (Generic, Show)
+
+instance FromRecord Article
+instance ToRecord Article
+
+instance Unique Article where
+  uid (Article {articleBook, tome, name}) =
+    printf "%s_%d_%s" (show articleBook) tome (unpack name)
+
+instance FromBook Article where
+  book = articleBook
+
+instance HasAuthors Article where
+  authors_ = articleAuthors
+
+instance HasDomains Article where
+  domains_ = articleDomains
+
+instance InFile Article where
+  relativePath (Article {articleBook, tome, name}) =
+    printf "%s/T%d/%s" (show articleBook) tome (unpack name)
diff --git a/lib/Data/Metadata/Paragraph.hs b/lib/Data/Metadata/Paragraph.hs
new file mode 100644
index 0000000..2f59105
--- /dev/null
+++ b/lib/Data/Metadata/Paragraph.hs
@@ -0,0 +1,41 @@
+{-# LANGUAGE DeriveGeneric, NamedFieldPuns #-}
+module Data.Metadata.Paragraph
+  ( Paragraph(..) ) where
+
+import Data.Csv (FromRecord(..), ToRecord(..))
+import Data.Metadata
+  ( Authors, Book(..), Domains
+  , FromBook(..), HasAuthors(..), HasDomains(..), InFile(..), Unique(..) )
+import GHC.Generics (Generic)
+import Text.Printf (printf)
+
+data Paragraph = Paragraph
+  { tome :: Int
+  , name :: String
+  , headWord :: String
+  , paragraphAuthors :: Authors
+  , paragraphDomains :: Domains
+  , paragraphId :: Int
+  , content :: String
+  , nbPlaceSpacy :: Int
+  , nbPersonSpacy :: Int } deriving (Generic, Show)
+
+instance FromRecord Paragraph
+instance ToRecord Paragraph
+
+instance Unique Paragraph where
+  uid (Paragraph {tome, name, paragraphId}) =
+    printf "EDdA_%d_%s_%d" tome name paragraphId
+
+instance FromBook Paragraph where
+  book _ = EDdA
+
+instance HasAuthors Paragraph where
+  authors_ = paragraphAuthors
+
+instance HasDomains Paragraph where
+  domains_ = paragraphDomains
+
+instance InFile Paragraph where
+  relativePath (Paragraph {tome, name, paragraphId}) =
+    printf "T%d/article%s-%d" tome name paragraphId
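For concreteness, the projector instances on a sample record (all field values
are invented; run with runhaskell --ghc-arg="-i lib"):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Metadata (Authors(..), Book(..), Domains(..), InFile(..), Unique(..))
    import Data.Metadata.Article (Article(..))

    sample :: Article
    sample = Article
      { articleBook = LGE, tome = 3, name = "belle", headWord = "BELLE"
      , rank = 27, page = 112
      , articleAuthors = Authors ["anonymous"]
      , articleDomains = Domains ["Géographie"] }

    -- uid sample          == "LGE_3_belle"
    -- relativePath sample == "LGE/T3/belle"
    main :: IO ()
    main = putStrLn (uid sample) >> putStrLn (relativePath sample)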
diff --git a/lib/Data/Metadata/Projector.hs b/lib/Data/Metadata/Projector.hs
new file mode 100644
index 0000000..f37d5f5
--- /dev/null
+++ b/lib/Data/Metadata/Projector.hs
@@ -0,0 +1,34 @@
+{-# LANGUAGE ConstraintKinds #-}
+module Data.Metadata.Projector
+  ( FromBook(..)
+  , HasAuthors(..)
+  , HasDomains(..)
+  , InFile(..)
+  , TXMText
+  , Unique(..)
+  ) where
+
+import Data.Metadata.Types (Authors(..), Book, Domains(..))
+import Data.Text (Text)
+
+class Unique a where
+  uid :: a -> String
+
+class FromBook a where
+  book :: a -> Book
+
+class HasAuthors a where
+  authors_ :: a -> Authors
+
+  authors :: a -> [Text]
+  authors = getAuthors . authors_
+
+class HasDomains a where
+  domains_ :: a -> Domains
+
+  domains :: a -> [Text]
+  domains = getDomains . domains_
+
+class InFile a where
+  relativePath :: a -> FilePath
+
+type TXMText a = (Unique a, FromBook a, HasAuthors a, HasDomains a)
diff --git a/lib/Data/Metadata/Trie.hs b/lib/Data/Metadata/Trie.hs
new file mode 100644
index 0000000..2383ad5
--- /dev/null
+++ b/lib/Data/Metadata/Trie.hs
@@ -0,0 +1,54 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module Data.Metadata.Trie
+  () where
+
+import Data.Foldable (toList)
+import Data.Map (Map)
+import qualified Data.Map as Map (delete, empty, insert, lookup, singleton)
+
+data Trie e n =
+  Trie
+    { store :: Maybe n
+    , edges :: Edges e n }
+type Edges e n = Map e (Trie e n)
+
+data Zipper e n =
+    Top
+  | Zip
+    { atValue :: Maybe n
+    , otherEdges :: Edges e n
+    , byEdge :: e
+    , above :: Zipper e n }
+
+type EditState e n = (Trie e n, Zipper e n)
+
+empty :: Trie e n
+empty = Trie { store = Nothing, edges = Map.empty }
+
+zipUp :: Ord e => EditState e n -> Trie e n
+zipUp (t, Top) = t
+zipUp (t, Zip {atValue, otherEdges, byEdge, above}) =
+  zipUp
+    ( Trie { store = atValue, edges = Map.insert byEdge t otherEdges }
+    , above )
+
+trieOf :: Foldable t => t e -> n -> Trie e n
+trieOf path n = foldr insertAt (Trie { store = Just n, edges = Map.empty }) path
+  where
+    insertAt e t = Trie { store = Nothing, edges = Map.singleton e t }
+
+at :: (Foldable t, Ord e) => Trie e n -> t e -> Maybe n
+at trie = at_ trie . toList
+  where
+    at_ (Trie {store}) [] = store
+    at_ (Trie {edges}) (e:es) = Map.lookup e edges >>= (`at_` es)
+
+insert :: (Foldable t, Ord e) => t e -> n -> Trie e n -> Trie e n
+insert path n trie = zipUp (edit (toList path) (trie, Top))
+  where edit [] (t, z) = (t { store = Just n }, z)
+        edit (e:es) (t, z) = edit es (maybe empty id (Map.lookup e (edges t)), Zip (store t) (Map.delete e (edges t)) e z)
+
+type Indexed e n = ([e], n)
+
+index :: (Foldable t, Functor t, Ord e) => (a -> Indexed e n) -> t a -> Trie e n
+index by = foldr (uncurry insert) empty . fmap by
diff --git a/lib/Data/Metadata/Types.hs b/lib/Data/Metadata/Types.hs
new file mode 100644
index 0000000..608bad0
--- /dev/null
+++ b/lib/Data/Metadata/Types.hs
@@ -0,0 +1,38 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Metadata.Types
+  ( Authors(..)
+  , Book(..)
+  , Domains(..) ) where
+
+import Control.Applicative (empty)
+import Data.Csv (Field, FromField(..), Parser, ToField(..))
+import Data.Text (Text, intercalate, splitOn)
+
+data Book = EDdA | LGE | Wikipedia deriving (Eq, Ord, Read, Show)
+
+newtype Authors = Authors
+  { getAuthors :: [Text] } deriving Show
+newtype Domains = Domains
+  { getDomains :: [Text] } deriving Show
+
+instance FromField Book where
+  parseField "EDdA" = pure EDdA
+  parseField "LGE" = pure LGE
+  parseField "Wikipedia" = pure Wikipedia
+  parseField _ = empty
+instance ToField Book where
+  toField = toField . show
+
+sepBy :: Text -> Field -> Parser [Text]
+sepBy s = fmap (splitOn s) . parseField
+
+instance FromField Authors where
+  parseField = fmap Authors . sepBy " & "
+instance ToField Authors where
+  toField = toField . intercalate " & ". getAuthors
+
+instance FromField Domains where
+  parseField = fmap Domains . sepBy " | "
+instance ToField Domains where
+  toField = toField . intercalate " | ". 
getDomains + diff --git a/lib/Data/SpaCy.hs b/lib/Data/SpaCy.hs new file mode 100644 index 0000000..79a52a8 --- /dev/null +++ b/lib/Data/SpaCy.hs @@ -0,0 +1,16 @@ +module Data.SpaCy + ( Annotation(..) ) where + +import Data.BIO (BIO) +import Data.Csv (FromRecord(..), ToRecord(..)) +import GHC.Generics (Generic) + +data Annotation = Annotation + { tokenNumber :: Int + , form :: String + , lemma :: String + , pos :: String + , namedEntity :: BIO String } deriving (Generic, Show) + +instance FromRecord Annotation +instance ToRecord Annotation diff --git a/lib/Text/Editor.bak.hs b/lib/Text/Editor.bak.hs new file mode 100644 index 0000000..5f7e00b --- /dev/null +++ b/lib/Text/Editor.bak.hs @@ -0,0 +1,101 @@ +{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-} +module Text.Editor ( + -- Editor + --, Filter(..) + Editable(..) + --, (|->) + --, (|=>) + , (<>.) + , editor + --, pureEditor + , fileFilter + --, runAll + , stdinList + ) where + +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Text as Text (Text, intercalate, lines, pack, unpack) +import Data.Text.IO as Text (readFile, writeFile) +import System.FilePath (replaceDirectory, replaceExtension) + +data Filter m a b = Filter { + transform :: a -> m b + , outputFormat :: Maybe String + } + +type Editor m a = Filter m a a + +{- +data Editor m a = Editor { + transform :: a -> m a + , outputFormat :: Maybe String + } +-} + +{- +editor :: (a -> m a) -> Editor m a +editor f = Filter { transform = f, outputFormat = Nothing } +--editor f = Editor { transform = f, outputFormat = Nothing } + +pureEditor :: Applicative m => (a -> a) -> Editor m a +pureEditor = editor . (pure .) +-} + +class Editable a where + enter :: Text -> a + leave :: a -> Text + +instance Editable [Text] where + enter = Text.lines + leave = Text.intercalate "\n" + +instance Editable String where + enter = Text.unpack + leave = Text.pack + +type EditionProcess m = FilePath -> FilePath -> m () + +fileFilter :: (Editable a, Editable b, MonadIO m) => + (FilePath -> FilePath) -> (a -> m b) -> EditionProcess m +fileFilter fixFormat transform target input = +--(|->) (Editor {transform, outputFormat}) target input = + liftIO (Text.readFile input) + >>= transform . enter + >>= liftIO . Text.writeFile output . leave + where + output = fixFormat $ replaceDirectory input target + +(<>.) :: (Editable a, Editable b, MonadIO m) => + (a -> m b) -> String -> EditionProcess m +(<>.) transform extension = fileFilter (`replaceExtension` extension) transform + +editor :: (Editable a, Editable b, MonadIO m) => + (a -> m b) -> EditionProcess m +editor = fileFilter id + +{- +run :: (Editable a, Editable b, MonadIO m) => Filter m a b -> FilePath -> FilePath -> m () +run = (|->) + +(|->) :: (Editable a, Editable b, MonadIO m) => Filter m a b -> FilePath -> FilePath -> m () +--(|->) :: (Editable a, MonadIO m) => Editor m a -> FilePath -> FilePath -> m () +(|->) (Filter {transform, outputFormat}) target input = +--(|->) (Editor {transform, outputFormat}) target input = + liftIO (Text.readFile input) + >>= transform . enter + >>= liftIO . Text.writeFile output . 
leave + where + fixFormat = maybe id (flip replaceExtension) outputFormat + output = fixFormat $ replaceDirectory input target + +runAll :: (Editable a, Foldable t, MonadIO m) => Editor m a -> FilePath -> t FilePath -> m () +runAll = (|=>) + +-- (|=>) :: (Editable a, MonadIO m) => Editor m a -> FilePath -> String -> m () +(|=>) :: (Editable a, Foldable t, MonadIO m) => Editor m a -> FilePath -> t FilePath -> m () +--editor |=> target = mapM_ (editor |-> target) . Prelude.lines +editor |=> target = mapM_ (editor |-> target) +-} + +stdinList :: MonadIO m => m [FilePath] +stdinList = Prelude.lines <$> liftIO getContents diff --git a/lib/Text/Editor.hs b/lib/Text/Editor.hs index b5c4148..2d7a68b 100644 --- a/lib/Text/Editor.hs +++ b/lib/Text/Editor.hs @@ -1,19 +1,17 @@ -{-# LANGUAGE FlexibleInstances, OverloadedStrings #-} +{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-} module Text.Editor ( - Editor - , edit - , editAll - , editM - , editAllM + Editable(..) + , (<>.) + , apply + , fileFilter + , xargs ) where import Control.Monad.IO.Class (MonadIO(..)) +import Data.ByteString.Lazy as BS (ByteString, pack, unpack) import Data.Text as Text (Text, intercalate, lines, pack, unpack) import Data.Text.IO as Text (readFile, writeFile) -import System.FilePath (replaceDirectory) - -type Editor a = a -> a -type EditorM m a = a -> m a +import System.FilePath (replaceDirectory, replaceExtension) class Editable a where enter :: Text -> a @@ -27,26 +25,23 @@ instance Editable String where enter = Text.unpack leave = Text.pack --- | Apply an 'Editor' to the content of a file which path is passed as third --- argument. The second argument is the path to the target folder where the --- edited version will be created. -editM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> FilePath -> m () -editM editor target input = +type Editor m = FilePath -> FilePath -> m () + +fileFilter :: (Editable a, Editable b, MonadIO m) => + (FilePath -> FilePath) -> (a -> m b) -> Editor m +fileFilter fixFormat transform target input = liftIO (Text.readFile input) - >>= editor . enter + >>= transform . enter >>= liftIO . Text.writeFile output . leave where - output = replaceDirectory input target + output = fixFormat $ replaceDirectory input target --- | A convenient shortcut of 'editM' for pure editors -edit :: Editable a => Editor a -> FilePath -> FilePath -> IO () -edit editor = editM (pure . editor) +(<>.) :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> String -> Editor m +(<>.) transform extension = fileFilter (`replaceExtension` extension) transform --- | Apply an 'Editor' like 'edit' on all the files which paths are expected to --- be read from the input 'String', one per line. -editAllM :: (Editable a, MonadIO m) => EditorM m a -> FilePath -> String -> m () -editAllM editor target = mapM_ (editM editor target) . Prelude.lines +apply :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> Editor m +apply = fileFilter id --- | A convenient shortcut of 'editAllM' for pure editors -editAll :: Editable a => Editor a -> FilePath -> String -> IO () -editAll editor = editAllM (pure . 
editor) +xargs :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> FilePath -> m () +xargs transform target = + Prelude.lines <$> liftIO getContents >>= mapM_ (apply transform target) diff --git a/lib/Text/Filter.hs b/lib/Text/Filter.hs new file mode 100644 index 0000000..d55000e --- /dev/null +++ b/lib/Text/Filter.hs @@ -0,0 +1,50 @@ +{-# LANGUAGE FlexibleInstances, NamedFieldPuns, OverloadedStrings #-} +module Text.Filter ( + Editable(..) + , (<>.) + , apply + , fileFilter + , xargs + ) where + +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Text as Text (Text, intercalate, lines, pack, unpack) +import Data.Text.IO as Text (readFile, writeFile) +import System.FilePath (replaceDirectory, replaceExtension) + +class Editable a where + enter :: Text -> a + leave :: a -> Text + +instance Editable Text where + enter = id + leave = id + +instance Editable [Text] where + enter = Text.lines + leave = Text.intercalate "\n" + +instance Editable String where + enter = Text.unpack + leave = Text.pack + +type Filter m = FilePath -> FilePath -> m () + +fileFilter :: (Editable a, Editable b, MonadIO m) => + (FilePath -> FilePath) -> (a -> m b) -> Filter m +fileFilter fixFormat transform target input = + liftIO (Text.readFile input) + >>= transform . enter + >>= liftIO . Text.writeFile output . leave + where + output = fixFormat $ replaceDirectory input target + +(<>.) :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> String -> Filter m +(<>.) transform extension = fileFilter (`replaceExtension` extension) transform + +apply :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> Filter m +apply = fileFilter id + +xargs :: (Editable a, Editable b, MonadIO m) => (a -> m b) -> FilePath -> m () +xargs transform target = + Prelude.lines <$> liftIO getContents >>= mapM_ (apply transform target) diff --git a/lib/Text/TEI.hs b/lib/Text/TEI.hs new file mode 100644 index 0000000..7514df9 --- /dev/null +++ b/lib/Text/TEI.hs @@ -0,0 +1,99 @@ +module Text.TEI + ( (.=) + , corpusHeader + , publicationStmt + , sourceDesc + , teiHeader + , text + , title ) where + +import Data.Metadata + ( Book(..), FromBook(..), HasAuthors(..), HasDomains(..), TXMText, Unique(..) + , list ) +import Text.XML.HXT.Core ( ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt ) + +corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree +corpusHeader EDdA = eddaHeader +corpusHeader LGE = lgeHeader +corpusHeader Wikipedia = wikiHeader + +eddaHeader :: ArrowXml a => String -> a b XmlTree +eddaHeader strTome = + teiHeader + (titleStmt strTome ("Digitized" `by` "University of Chicago Library" + ++ "Published" `by` "ARTFL")) + (corpusPublicationStmt) + (sourceDesc + [ title ("L'Encyclopédie T" <> strTome) + , selem "author" [ txt "Collective" ] + , selem "creation" [ selem "date" [ txt "1752" ] ] ]) + +lgeHeader :: ArrowXml a => String -> a b XmlTree +lgeHeader strTome = + teiHeader + (titleStmt strTome ("Digitized" `by` "Bibliothèque Nationale de France")) + (corpusPublicationStmt) + (sourceDesc + [ title ("La Grande Encyclopédie T" <> strTome) + , selem "author" [ txt "Collective" ] + , selem "creation" [ selem "date" [ txt "1885" ] ] + , selem "imprint" [ selem "date" [ txt "1885" ] + , selem "publisher" [ txt "H. 
Lamirault et Cie," ] + , selem "pubplace" [ txt "Paris" ] ] + , mkelem "biblScope" [ attr "unit" (txt "volume") ] [ txt strTome ] ]) + +wikiHeader :: ArrowXml a => String -> a b XmlTree +wikiHeader _ = + teiHeader + (titleStmt "Wikipédia, L'encyclopédie libre" []) + (corpusPublicationStmt) + (sourceDesc + [ selem "author" [ selem "orgName" [ txt "Wikimedia Foundation" ] ] ]) + +teiHeader :: ArrowXml a => + a b XmlTree -> a b XmlTree -> a b XmlTree -> a b XmlTree +teiHeader aTitle aPublication aSource = + selem "teiHeader" + [ selem "fileDesc" [ aTitle, aPublication, aSource ] ] + +titleStmt :: ArrowXml a => String -> [a b XmlTree] -> a b XmlTree +titleStmt tome resps = + selem "titleStmt" + [ title tome + , selem "respStmt" (resps ++ "Annotated and encoded" `by` "ICAR") ] + +title :: ArrowXml a => String -> a b XmlTree +title titleContent = selem "title" [txt titleContent] + +corpusPublicationStmt :: ArrowXml a => a b XmlTree +corpusPublicationStmt = + (publicationStmt + [selem "distributor" + [selem "orgName" [txt "Project GÉODE"] + ,selem "address" + [selem "addrline" [txt "ICAR UMR 5191"] + ,selem "addrline" [txt "ENS de Lyon"]]]]) + +publicationStmt :: ArrowXml a => [a b XmlTree] -> a b XmlTree +publicationStmt = selem "publicationStmt" + +sourceDesc :: ArrowXml a => [a b XmlTree] -> a b XmlTree +sourceDesc biblContent = + selem "sourceDesc" [selem "bibl" biblContent] + +by :: ArrowXml a => String -> String -> [a b XmlTree] +by resp orgName = [ selem "resp" [ txt (resp <> " by") ] + , selem "orgName" [ txt orgName ] ] + +text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree +text unitName txmText body = + mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ] + where + metadata = + [ "uid" .= uid txmText + , "book" .= show (book txmText) + , "author" .= list (authors txmText) + , "domains" .= list (domains txmText) ] + +(.=) :: ArrowXml a => String -> String -> a b XmlTree +key .= value = attr key (txt value) diff --git a/manifest.scm b/manifest.scm index 19fcf85..af89124 100644 --- a/manifest.scm +++ b/manifest.scm @@ -3,21 +3,27 @@ ((gnu packages commencement) #:select (gcc-toolchain)) ((gnu packages haskell) #:select (ghc)) ((gnu packages haskell-web) #:select (ghc-hxt)) - ((gnu packages haskell-xyz) #:select (ghc-attoparsec)) + ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu)) ((gnu packages python) #:select (python)) + ((gnu packages python-science) #:select (python-pandas)) ((gnu packages python-xyz) #:select (python-beautifulsoup4)) ((gnu packages xml) #:select (python-lxml))) +(define python-edda (load "/home/alice/Logiciel/python-edda/guix.scm")) + (packages->manifest (list coreutils ; mktemp for atomic processing, strip CSV headers, general scripting gcc-toolchain ; running haskell ghc ; running haskell - ghc-attoparsec ; parsing metadata + ghc-cassava ; working with CSV in haskell + ghc-hs-conllu ; working on syntax-annotated documents ghc-hxt ; working on xml documents python ; scripts python-beautifulsoup4 ; extract EDdA metadata from TEI files + python-edda ; TODO python-lxml ; fusion articles into tomes for TXM + python-pandas ; working with CSV in python python-stanza ; annotation stanza-fr ; annotation )) diff --git a/scripts/EDdA-metadata.py b/scripts/EDdA-metadata.py index 05c4ae2..e804019 100755 --- a/scripts/EDdA-metadata.py +++ b/scripts/EDdA-metadata.py @@ -2,17 +2,39 @@ import csv from bs4 import BeautifulSoup +from EDdA import ENCCRE +import pandas import os import sys -header = ["T", "article", "head", 
"domain"] +header = ["T", "article", "head", "author", "ARTFL_domain", "ENCCRE_domain"] def getAttribute(article, attribute): result = article.find(type=attribute) return result.get('value') if result else '' -def main(rootDirectory): - output = csv.writer(sys.stdout, lineterminator='\n') +def idARTFLToENCCRE(correspondances, tome, article): + try : + row = correspondances.loc[(correspondances['tome']==tome) & + (correspondances['article']==article)].reset_index(drop=True) + return row['entreeid'][0] + except KeyError: + return None + +ENCCREDomains = ENCCRE.query('domaines') + +def getENCCREDomain(enccreId): + if enccreId: + article = ENCCRE.query('article/%s' % enccreId) + annotation = article['annotations'] + if 'constit' in annotation and 'domgen' in annotation['constit'][0]: + domains = annotation['constit'][0]['domgen'] + return ' | '.join( + set(map(lambda d: ENCCREDomains[d]['dgrid'], domains)) + ) + +def main(rootDirectory, correspondances): + output = csv.writer(sys.stdout, lineterminator='\n', delimiter='\t') output.writerow(header) for t in range(1,18): path = f"{rootDirectory}/T{t}" @@ -23,8 +45,10 @@ def main(rootDirectory): t, rank, getAttribute(root, "head"), - normclass if normclass != 'unclassified' else getAttribute(root, "generatedclass") + getAttribute(root, "author"), + normclass, + getENCCREDomain(idARTFLToENCCRE(correspondances, t, rank)) ]) if __name__ == '__main__': - main(sys.argv[1]) + main(sys.argv[1], pandas.read_csv(sys.argv[2])) diff --git a/scripts/LGE-metadata-from-EDdA.py b/scripts/LGE-metadata-from-EDdA.py index 1fabdee..ab1119a 100755 --- a/scripts/LGE-metadata-from-EDdA.py +++ b/scripts/LGE-metadata-from-EDdA.py @@ -8,7 +8,8 @@ def EDdARow(columns): 'tome': columns[0], 'article': columns[1], 'head': columns[2], - 'domain': columns[3] + 'ARTFL_domain': columns[3], + 'ENCCRE_domain': columns[4] } def LGERow(tome): @@ -140,14 +141,19 @@ def interesting(arrow): def getMetadata(arrows, path=None): output = sys.stdout if path is None else open(path, 'w') toCsv = csv.writer(output, lineterminator='\n') - toCsv.writerow(['id', 'tome', 'rank', 'head', 'domain']) + toCsv.writerow( + ['id', 'tomeLGE', 'rankLGE', 'head', 'tomeEDdA', 'rankEDdA', 'ARTFL_domain', 'ENCCRE_domain'] + ) for arrow in arrows: toCsv.writerow([ arrow['target']['id'], arrow['target']['tome'], arrow['target']['rank'], arrow['target']['head'], - arrow['source']['domain'] + arrow['source']['tome'], + arrow['source']['article'], + arrow['source']['ARTFL_domain'], + arrow['source']['ENCCRE_domain'] ]) if __name__ == '__main__': diff --git a/scripts/add-domain-metadata.hs b/scripts/add-domain-metadata.hs new file mode 100755 index 0000000..43ec1e1 --- /dev/null +++ b/scripts/add-domain-metadata.hs @@ -0,0 +1,50 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE DeriveGeneric, NamedFieldPuns, OverloadedStrings #-} + +--import Data.ByteString.Lazy as ByteString (readFile) +import Data.Csv (FromRecord) +import Data.Metadata (Domains(..), readCsv) +--import Data.Metadata (Article, articles) +import Data.Text as Text (Text, intercalate, isPrefixOf) +import GHC.Generics (Generic) +import System.Environment (getArgs) +import System.FilePath ((</>), (<.>)) +import System.Exit (die) +import System.Script (syntax) +import Text.Editor (apply) + +data Line = Line { + tome :: String + , rank :: String + , domains :: Domains + } deriving (Generic) + +instance FromRecord Line + +to :: FilePath -> FilePath -> Line -> IO () +to source target (Line {tome, rank, domains}) = + apply 
(pure . addMeta domains) target input + where + input = source </> "T" <> tome <> "_article" <> rank <.> "xml" + +addMeta :: Domains -> [Text] -> [Text] +addMeta (Domains domains) textLines = + take 6 textLines + ++ [("domain\t" <> intercalate " | " domains) + ,("class\t" <> getClass domains)] + ++ drop 6 textLines + +getClass :: [Text] -> Text +getClass domainsList + | all hasGeo domainsList = "geography" + | any hasGeo domainsList = "some_geography" + | otherwise = "other" + where + hasGeo = ("Géographie" `isPrefixOf`) + +main :: IO () +main = getArgs >>= run + where + run [inputCSV, source, target] = + readCsv inputCSV >>= either die (mapM_ (source `to` target)) + run _ = syntax "METADATA_CSV_FILE SOURCE TARGET" diff --git a/scripts/articlesToVolume.sh b/scripts/articles-to-volume.sh similarity index 100% rename from scripts/articlesToVolume.sh rename to scripts/articles-to-volume.sh diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs new file mode 100755 index 0000000..705f39a --- /dev/null +++ b/scripts/conllu-to-lexicoscope.hs @@ -0,0 +1,95 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} + +import Conllu.Parse (parseConllu) +import Conllu.Parse.Paragraph (Paragraph(..), byParagraph) +import Conllu.Print (printSent) +import Conllu.Type (Doc, Sent(..)) +import Control.Monad.IO.Class (MonadIO(..)) +import Control.Monad.State (StateT(..), evalStateT) +import Data.List as List (intercalate) +import Data.Map (toList) +import Data.Metadata (Article(..), Authors(..), Domains(..), Tome, byTome, list) +import Data.Text as Text (unpack) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath ((</>), (<.>)) +import System.Script (syntax) +import Text.XML.HXT.Core + ((>>>), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, attr, mkelem, selem + , txt, writeDocument) +import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState) +import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState) + +type Serial = (Int, Int) + +next :: Selector Serial Int -> IOStateArrow Serial a String +next intSelector = IOSLA getUpdate + where + getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state]) + field = theUserState >>> intSelector + +pId :: Selector Serial Int +pId = S {getS = fst, setS = \p (_, s) -> (p, s)} + +sId :: Selector Serial Int +sId = S {getS = snd, setS = \s (p, _) -> (p, s)} + +to :: FilePath -> FilePath -> (Int, Tome) -> StateT Serial IO () +to source target (tomeNumber, tome) = + mapM_ (processArticle (source </> tomeDir) (target </> tomeDir)) tome + where + tomeDir = "T" <> show tomeNumber + +processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO () +processArticle source target a@(Article {uid, rank}) = + liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input) + >>= either (liftIO . die) (format a output) . 
parseConllu input + where + input = source </> Text.unpack uid <.> "conllu" + output = target </> Text.unpack uid <.> "xml" + --input = source </> "article" ++ show rank <.> "conllu" + --output = target </> "article" ++ show rank <.> "xml" + +format :: Article -> FilePath -> Doc -> StateT Serial IO () +format _ _ [] = pure () +format article outputPath dom = StateT $ \s -> do + (newState, _) <- runIOSLA (xml >>> writeDocument [] outputPath) (initialState s) () + pure ((), getS theUserState newState) + where + xml = selem "/" + [selem "corpus" + [selem "doc" + [selem "meta" [metaFrom article] + ,selem "text" (formatParagraph <$> byParagraph dom)]]] + +metaFrom :: ArrowXml a => Article -> a n XmlTree +metaFrom (Article {uid, tome, rank, headWord, authors, domains}) = txt $ + concatMap (++ "\n") + (List.intercalate "\t" <$> + [[] + ,["fileName", Text.unpack uid] + ,["tome", show tome] + ,["rank", show rank] + ,["head", Text.unpack headWord] + ,["author", Text.unpack . list $ getAuthors authors] + ,["domain", Text.unpack . list $ getDomains domains]]) + +formatParagraph :: Paragraph -> IOStateArrow Serial n XmlTree +formatParagraph (Paragraph sents) = + mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents) + +formatSentence :: Sent -> IOStateArrow Serial n XmlTree +formatSentence s = + mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences] + where + sentences = dropWhile (== '\n') $ printSent (s {_meta = []}) + +main :: IO () +main = getArgs >>= cli + where + withSerial = (`evalStateT` (0, 0)) + cli [sourceMeta, sourceRoot, targetRoot] = + byTome sourceMeta + >>= either die (withSerial . mapM_ (sourceRoot `to` targetRoot) . toList) + cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/conllu-to-txm.hs b/scripts/conllu-to-txm.hs new file mode 100755 index 0000000..8487819 --- /dev/null +++ b/scripts/conllu-to-txm.hs @@ -0,0 +1,97 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-} + +import Conllu.Parse (parseConllu) +import Conllu.Type (AW, CW(..), ID(..), Sent(..)) +import Control.Applicative ((<|>), liftA2) +import Data.Metadata + ( HasAuthors(..), Book, FromBook(..), relativePath, uid, groupBy, readTsv + , sortBy ) +import Data.Metadata.Article (Article(..)) +import Data.Text as Text (unpack) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath ((</>), (<.>)) +import System.Script (syntax) +import Text.TEI + ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title ) +import Text.XML.HXT.Core + ( (>>>), (|||), ($<), IOSLA(..), IOSArrow, XmlTree, arr, arrIO, arrIO0, arrL + , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent + , writeDocument ) + +to :: FilePath -> FilePath -> (Book, [Article]) -> IO () +to source target (theBook, metadata) = + mapM_ runArrow . 
groupBy tome $ sortBy rank metadata + where + runArrow (tomeNumber, tomeArticles) = + runX $ compileTome source target (theBook, tomeNumber, tomeArticles) + +compileTome :: + FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree +compileTome source target (theBook, tomeNumber, metadata) = + tomeXml >>> writeDocument [withIndent True] output + where + strTome = show tomeNumber + output = target </> (show theBook) </> "T" <> strTome <.> "xml" + tomeXml = selem "/" + [ selem "teiCorpus" + [ corpusHeader theBook strTome + , (articleFrom source $< constL metadata) ] ] + +articleFrom :: FilePath -> Article -> IOSArrow b XmlTree +articleFrom source article@(Article {headWord}) = + selem "TEI" + [ teiHeader + (selem "titleStmt" [title $ Text.unpack headWord]) + (publicationStmt + [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ]) + (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ]) + , text "article" article (loadConllu input) ] + where + input = source </> relativePath article <.> "conllu" + authorArrow = constL (Text.unpack <$> authors article) + +{- +metadataFrom :: Article -> [ IOSArrow b XmlTree ] +metadataFrom article@(Article {book, authors, domains}) = + [ "id" .= uid article + , "book" .= show book + , "author" .= list (getAuthors authors) + , "domains" .= list (getDomains domains) ] +-} + +loadConllu :: FilePath -> IOSArrow b XmlTree +loadConllu input = arrIO0 (parseConllu input <$> readFile input) + >>> ((arrIO die) ||| format) + where + format = selem "body" [ unlistA >>> formatSentence ] + formatSentence = selem "s" [ arr _words >>> arrL render >>> formatWord ] + formatWord = mkelem "w" [ attr "lemma" (txt $< getMaybe _lemma) + , attr "pos" (txt $< getMaybe pos) ] + [ txt $< getMaybe _form ] + pos = liftA2 (<|>) spos _xpos + +getMaybe :: (a -> Maybe b) -> IOSArrow a b +getMaybe projector = IOSLA $ \s a -> pure (s, maybe [] (:[]) $ projector a) + +render :: [CW AW] -> [CW AW] +render [] = [] +render (w0@(CW {_id = MID a0 b0}):w1@(CW {_id = SID a1}):w2@(CW {_id = SID b2}):ws) + | a0 == a1 && b0 == b2 = + w0 {_lemma = glue _lemma, _upos = Nothing, _xpos = glue spos}:(render ws) + where + glue f = (\a b -> a ++ "+" ++ b) <$> (f w1) <*> (f w2) +render (w:ws) = w:(render ws) + +spos :: CW a -> Maybe String +spos = fmap show . _upos + +main :: IO () +main = getArgs >>= cli + where + books = groupBy book . sortBy uid + cli [inputMeta, sourceRoot, targetRoot] = + readTsv inputMeta + >>= either die (mapM_ (sourceRoot `to` targetRoot) . books) + cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/detokenizer.hs b/scripts/detokenizer.hs new file mode 100755 index 0000000..124cbb1 --- /dev/null +++ b/scripts/detokenizer.hs @@ -0,0 +1,22 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" +{-# LANGUAGE OverloadedStrings #-} +module Main where + +import Data.Text (Text, lines, unsnoc) +import Data.Text.IO (interact) +import Prelude hiding (interact, lines) + +detokenize :: [Text] -> Text +detokenize [] = "\n" +detokenize [w] = w <> "\n" +detokenize (w:c:ws) + | c `elem` [",", "."] = w <> c <> " " <> detokenize ws +detokenize (w:ws) = finish $ unsnoc w + where + finish Nothing = detokenize ws + finish (Just (_, c)) + | c == '\'' = w <> detokenize ws + | otherwise = w <> " " <> detokenize ws + +main :: IO () +main = interact $ detokenize . 
lines
diff --git a/scripts/extract-corpus.sh b/scripts/extract-corpus.sh
new file mode 100755
index 0000000..4806dd1
--- /dev/null
+++ b/scripts/extract-corpus.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+INPUT_METADATA="${1}"
+SOURCE_TEXT_ARTICLES="${2}"
+OUTPUT="${3}"
+if [ -d "${OUTPUT}" ]
+then
+	N=1
+	while [ -d "${OUTPUT}.${N}" ]
+	do
+		N=$((N+1))
+	done
+	mv "${OUTPUT}" "${OUTPUT}.${N}"
+fi
+
+WORKDIR=$(mktemp -d /tmp/parallel-corpus.XXX)
+
+while read LINE
+do
+	ID="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	RELATIVE_PATH="${LINE%%,*}"
+	mkdir -p "${WORKDIR}/$(dirname "${RELATIVE_PATH}")"
+	cp "${SOURCE_TEXT_ARTICLES}/${RELATIVE_PATH#*/}."* "${WORKDIR}/${RELATIVE_PATH}"
+done < <(tail -n +2 ${INPUT_METADATA})
+
+mv ${WORKDIR} ${OUTPUT}
diff --git a/scripts/extract-parallel-EDdA.sh b/scripts/extract-parallel-EDdA.sh
index b18305e..62dcf7c 100755
--- a/scripts/extract-parallel-EDdA.sh
+++ b/scripts/extract-parallel-EDdA.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 INPUT_METADATA="${1}"
 SOURCE_TEXT_ARTICLES="${2}"
@@ -25,7 +25,6 @@ do
 	LINE="${LINE#*,}"
 	LINE="${LINE#*,}"
 	LINE="${LINE#*,}"
-	LINE="${LINE#*,}"
 	T="${LINE%%,*}"
 	LINE="${LINE#*,}"
 	RANK="${LINE%%,*}"
diff --git a/scripts/extract-parallel-LGE.sh b/scripts/extract-parallel-LGE.sh
index b1c0c3a..eef78b6 100755
--- a/scripts/extract-parallel-LGE.sh
+++ b/scripts/extract-parallel-LGE.sh
@@ -24,8 +24,10 @@ while read LINE
 do
 	ID="${LINE%%,*}"
 	LINE="${LINE#*,}"
+	LINE="${LINE#*,}"
+	LINE="${LINE#*,}"
 	T="${LINE%%,*}"
-	cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID}."* "${WORKDIR}/T${T}"
+	cp "${SOURCE_TEXT_ARTICLES}/T${T}/ById/${ID##*_}."* "${WORKDIR}/T${T}"
 done < <(tail -n +2 ${INPUT_METADATA})
 
 mv ${WORKDIR} ${OUTPUT}
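fix-tsv.hs below drives its transform through Text.Filter.xargs, which reads
one source path per line on stdin and writes each transformed copy under the
target directory. A minimal sketch of the same mechanism (the uppercasing
transform and the out/ directory are invented for the example):

    import Data.Char (toUpper)
    import Text.Filter (xargs)

    -- Invoked as: find src -name '*.txt' | ./upper.hs  (target fixed to out/ here)
    main :: IO ()
    main = xargs (pure . map toUpper) "out"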
diff --git a/scripts/fix-tsv.hs b/scripts/fix-tsv.hs
new file mode 100755
index 0000000..749dbbd
--- /dev/null
+++ b/scripts/fix-tsv.hs
@@ -0,0 +1,26 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+
+import Data.ByteString.Lazy as BS (toStrict)
+import Data.Csv (EncodeOptions(..), encodeWith, defaultEncodeOptions)
+import Data.Text as Text (Text, splitOn)
+import Data.Text.Encoding as Text (decodeUtf8)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.Script (syntax)
+import Text.Filter (xargs)
+
+fixTsv :: [Text] -> IO Text
+fixTsv = fmap toTsv . mapM fixLine
+  where
+    toTsv = decodeUtf8 . BS.toStrict . encodeWith
+      (defaultEncodeOptions { encDelimiter = toEnum $ fromEnum '\t' })
+    fixLine = escapeFormLemma . Text.splitOn "\t"
+    escapeFormLemma [n, form, lemma, pos, ene] = pure (n, form, lemma, pos, ene)
+    escapeFormLemma l = die $ show l
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [target] = xargs fixTsv target
+    run _ = syntax "TARGET_DIRECTORY"
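The point of round-tripping each line through cassava here is quoting: a field
that contains the delimiter or a quote gets escaped instead of silently
corrupting the column layout. A sketch of the encoding step alone (the record
values are invented):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.ByteString.Lazy.Char8 as BS (putStr)
    import Data.Csv (EncodeOptions(..), defaultEncodeOptions, encodeWith)
    import Data.Text (Text)

    -- Emits one tab-separated record; problematic fields come out quoted.
    main :: IO ()
    main = BS.putStr $ encodeWith
      (defaultEncodeOptions { encDelimiter = toEnum (fromEnum '\t') })
      [("1", "l'", "le", "DET", "O") :: (Text, Text, Text, Text, Text)]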
diff --git a/scripts/inject-metadata-to-lexicoscope.hs b/scripts/inject-metadata-to-lexicoscope.hs
new file mode 100755
index 0000000..b69ccbf
--- /dev/null
+++ b/scripts/inject-metadata-to-lexicoscope.hs
@@ -0,0 +1,48 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+import Data.Attoparsec.Text (parseOnly)
+import Data.Metadata.Article (Article(..))
+--import Data.Metadata (Metadata, metadata)
+import Data.Metadata (articles)
+import Data.Text as Text (Text, intercalate, isPrefixOf, pack, unpack)
+import Data.Text.IO as Text (readFile)
+import System.Environment (getArgs)
+import System.FilePath ((</>), (<.>))
+import System.Exit (die)
+import System.Script (syntax)
+import Text.Editor (apply)
+
+to :: FilePath -> FilePath -> [Article] -> IO ()
+to source target = mapM_ inject
+  where
+    inject a@(Article {uid}) =
+      apply (addMeta a) target (source </> Text.unpack uid <.> "xml")
+
+addMeta :: Article -> [Text] -> IO [Text]
+addMeta (Article {uid, tome, rank, headWord, domains}) (firstLine:_:others) = pure $
+  firstLine
+  : "<corpus><doc><meta>"
+  : ("fileName\t" <> uid <> ".txt")
+  : ("tome\t" <> Text.pack (show tome))
+  : ("rank\t" <> Text.pack (show rank))
+  : ("head\t" <> headWord)
+  : ("domain\t" <> intercalate " | " domains)
+  : ("class\t" <> getClass domains)
+  : others
+addMeta _ _ = die "invalid input file"
+
+getClass :: [Text] -> Text
+getClass domainsList
+  | all hasGeo domainsList = "geography"
+  | any hasGeo domainsList = "some_geography"
+  | otherwise = "other"
+  where
+    hasGeo = ("Géographie" `isPrefixOf`)
+
+main :: IO ()
+main = getArgs >>= run
+  where
+    run [inputCSV, source, target] = Text.readFile inputCSV
+      >>= either die (source `to` target) . parseOnly articles
+    run _ = syntax "METADATA_CSV_FILE SOURCE TARGET"
diff --git a/scripts/inject-metadata.sh b/scripts/inject-metadata.sh
new file mode 100755
index 0000000..b08f651
--- /dev/null
+++ b/scripts/inject-metadata.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+METADATA="${1}"
+SOURCE="${2%/}"
+TARGET="${3%/}"
+
+getClass()
+{
+	DOMAIN="${1}"
+	if [ "${DOMAIN%%|*}" == "${DOMAIN}" ]
+	then
+		case "${DOMAIN}" in
+			Géographie*) printf "geography";;
+			*) printf "other";;
+		esac
+	else
+		while true
+		do
+			case "${DOMAIN%%|*}" in
+				Géographie*)
+					if [ "${DOMAIN%%|*}" != "${DOMAIN}" ]
+					then DOMAIN="${DOMAIN#*| }"
+					else printf "geography"; return
+					fi;;
+				*) printf "some_geography"; return;;
+			esac
+		done
+	fi
+}
+
+while read LINE
+do
+	FILE="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	TOME="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	RANK="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	HEAD="${LINE%%,*}"
+	LINE="${LINE#*,}"
+	DOMAIN="${LINE%%,*}"
+	CLASS="$(getClass "${DOMAIN}")"
+	INPUT="${SOURCE}/${FILE}.xml"
+	OUTPUT="${TARGET}/${FILE}.xml"
+	head -n 1 "${INPUT}" > "${OUTPUT}"
+
+	cat >> "${OUTPUT}" << EOF
+<corpus><doc><meta>
+fileName	${FILE}.txt
+tome	${TOME}
+rank	${RANK}
+head	${HEAD}
+domain	${DOMAIN}
+class	${CLASS}
+EOF
+
+	tail -n +3 "${INPUT}" >> "${OUTPUT}"
+done < <(tail -n +2 ${METADATA})
diff --git a/scripts/lge-articles-to-volumes.hs b/scripts/lge-articles-to-volumes.hs
new file mode 100755
index 0000000..7e47255
--- /dev/null
+++ b/scripts/lge-articles-to-volumes.hs
@@ -0,0 +1,153 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+
+--import Control.Map
+import Data.Attoparsec.Text (parseOnly)
+import Data.List as List (foldl')
+import Data.Map as Map (Map, elems, empty, insertWith, toList)
+import Data.Metadata (Article(..), Tome, Tomes, byTome)
+import Data.Set as Set (Set, singleton, union)
+import Data.Text as Text (Text, concat, intercalate, splitOn, unpack)
+import Data.Text.IO as Text (readFile)
+import System.Directory (listDirectory)
+import System.Environment (getArgs, getProgName)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>), dropTrailingPathSeparator, takeBaseName)
+import System.Script (syntax)
+import Text.XML.HXT.Core (
+  (>>>), ($<), Arrow, ArrowIO, ArrowXml, IOStateArrow, XmlTree, arr, arrIO0, attr, constL, getChildren, mkelem, readDocument, runX, selem, txt, unlistA, withValidate, writeDocument
+  )
+
+{-
+getTome :: FilePath -> String
+getTome = drop 1 . takeBaseName . 
dropTrailingPathSeparator +-} + +type IOArrow b x = IOStateArrow () b x + +compileTome :: FilePath -> FilePath -> (Int, Tome) -> IOArrow b XmlTree +compileTome source target (tomeNumber, metadata) = + tomeXml >>> writeDocument [] (target </> "T" <> strTome <.> "xml") + where + strTome = show tomeNumber + tomeXml = selem "/" + [selem "teiCorpus" + [teiHeader + (titleStmt strTome) + (corpusPublicationStmt) + (corpusSourceDesc strTome) + ,(articleFrom source $< constL (Map.elems metadata))]] + + --, constL (articleFrom source <$> metadata) >>> undefined]] + +articleFrom :: FilePath -> Article -> IOArrow b XmlTree +articleFrom source (Article {tome, uid, headWord, author, artflDomains, enccreDomains}) = + selem "TEI" + [teiHeader + (selem "titleStmt" [title $ Text.unpack headWord]) + (publicationStmt + [selem "p" [txt "Annotated with Stanza by project GEODE"]]) + (sourceDesc [selem "author" [txt $ Text.unpack author]]) + , mkelem "text" metadata [body]] + where + path = source </> "T" <> show tome </> Text.unpack uid <.> "tei" + body = readDocument [withValidate False] path >>> getChildren + key .= value = attr key (txt $ Text.unpack value) + aDomainsAttr = Text.concat [":", Text.intercalate ":" artflDomains, ":"] + eDomainsAttr = Text.concat [":", Text.intercalate ":" enccreDomains, ":"] + metadata = + ["uid" .= uid, "author" .= author, "artflDomains" .= aDomainsAttr, "enccreDomains" .= eDomainsAttr, "book" .= "LGE"] + +-- arrIO0 (listDirectory source) +-- >>> unlistA +-- >>> arr (source </>) +-- >>> readFromDocument [withValidate no] + +{- +getTEICorpus :: ArrowXml a => String -> a b XmlTree -> a b XmlTree +getTEICorpus tome corpus = selem "/" [selem "teiCorpus" [teiHeader tome, corpus]] +-} + +teiHeader :: ArrowXml a => + a b XmlTree -> a b XmlTree -> a b XmlTree -> a b XmlTree +teiHeader aTitle aPublication aSource = + selem "teiHeader" + [selem "fileDesc" [aTitle, aPublication, aSource]] + --selem "fileDesc" [titleStmt tome, publicationStmt, sourceDesc tome] + --] + +titleStmt :: ArrowXml a => String -> a b XmlTree +titleStmt tome = selem "titleStmt" [ + title tome + , selem "respStmt" [ + selem "resp" [txt "Digitized by"] + , selem "orgName" [txt "Bibliothèque Nationale de France"] + , selem "resp" [txt "Encoded by"] + , selem "orgName" [txt "ICAR"] + ] + ] + +publicationStmt :: ArrowXml a => [a b XmlTree] -> a b XmlTree +publicationStmt content = selem "publicationStmt" content + +corpusPublicationStmt :: ArrowXml a => a b XmlTree +corpusPublicationStmt = + (publicationStmt + [selem "distributor" + [selem "orgName" [txt "Project GÉODE"] + ,selem "address" + [selem "addrline" [txt "ICAR UMR 5191"] + ,selem "addrline" [txt "ENS de Lyon"]]]]) + +sourceDesc :: ArrowXml a => [a b XmlTree] -> a b XmlTree +sourceDesc biblContent = + selem "sourceDesc" [selem "bibl" biblContent] + +corpusSourceDesc :: ArrowXml a => String -> a b XmlTree +corpusSourceDesc tome = + sourceDesc + [title tome + ,selem "author" [txt "Collective"] + ,selem "creation" + [selem "date" [txt "1885"]] + ,selem "imprint" + [selem "date" [txt "1885"] + ,selem "publisher" [txt "H. Lamirault et Cie,"] + ,selem "pubplace" [txt "Paris"]] + ,mkelem "biblScope" [attr "unit" (txt "volume")] [txt tome]] + +title :: ArrowXml a => String -> a b XmlTree +title titleContent = selem "title" [txt titleContent] + +{- +$ unlines [ + "La Grande Encyclopédie, Inventaire raisonné des sciences, des lettres et" + , "des arts par une société de savants et de gens de lettres. 
Tome " ++ tome + ]] +-} + +{- +getTomes :: ArrowIO a => FilePath -> a b Tome +getTomes metadata = constA undefined + +getTomes :: FilePath -> IO Tomes +getTomes = + fmap (foldl' index Map.empty . fmap fields . drop 1 . lines) . Text.readFile + where + fields = Text.splitOn "," + index tomes [file, tome, rank, head, domain] = + Map.insertWith Set.union tome (Set.singleton ) tomes + index tomes _ = tomes + +-} + +main :: IO () +main = getArgs >>= cli + where + cli [sourceMeta, sourceRoot, targetRoot] = + Text.readFile sourceMeta + >>= pure . parseOnly byTome + >>= either + die + (mapM_ (runX . compileTome sourceRoot targetRoot) . Map.toList) + cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/linearize.hs b/scripts/linearize.hs index 125d393..7ad3e0e 100755 --- a/scripts/linearize.hs +++ b/scripts/linearize.hs @@ -1,10 +1,9 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" import Data.Char (isUpper) import System.Environment (getArgs) -import System.FilePath ((</>)) import System.Script (syntax) -import Text.Editor (editAll) +import Text.Filter (xargs) linearize :: String -> String linearize "" = "" @@ -18,5 +17,5 @@ linearize (c:s) = c : linearize s main :: IO () main = getArgs >>= cli where - cli [target] = getContents >>= editAll linearize target + cli [target] = xargs (pure.linearize) target cli _ = syntax "TARGET_DIR" diff --git a/scripts/parallel-links.py b/scripts/parallel-links.py new file mode 100755 index 0000000..284dc5d --- /dev/null +++ b/scripts/parallel-links.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +import csv +import sys + +def EDdARow(columns): + return { + 'tome': columns[0], + 'article': columns[1], + 'head': columns[2], + } + +def LGERow(tome): + return lambda columns: { + 'id': columns[0], + 'tome': tome, + 'rank': columns[1], + 'head': columns[2] + } + +def unserializeMetadata(path, rowReader): + with open(path) as f: + inputFile = csv.reader(f) + articles = [] + header = True + for row in inputFile: + if header: + header = False + else: + yield rowReader(row) + +def concat(generators): + for g in generators: + for x in g: + yield x + +def naiveIndexBy(field, elements): + d = {} + for e in elements: + key = e[field] + if key in d: + d[key] = None + else: + d[key] = e + return d + +def growPrefixes(d, keys, maxLength): + for length in range(maxLength, 1, -1): + newGeneration = {} + for key in keys: + if len(key) == length: + newKey = key[:-1] + if newKey in newGeneration or newKey in d: + newGeneration[newKey] = None + else: + newGeneration[newKey] = d[key] + for key, value in newGeneration.items(): + if value is not None: + d[key] = value + keys.add(key) + elif key not in d: + d[key] = value + +def indexBy(field, elements, prefix=True): + d = {} + for e in elements: + key = e[field] + if key in d: + d[key] = None + else: + d[key] = e + if prefix: + keys = set(d.keys()) + growPrefixes(d, keys, max(map(len, keys))) + return d + +def headWords(head): + words = head.split() + if len(words) == 1: + return words + else: + return [w for w in map(lambda s: s.strip(',.'), words) if w.isupper()] + +def identify(head, haystack): + if head in haystack: + if haystack[head] is not None: + return {'type': 'exact', 'match': head, 'found': haystack[head]} + else: + return None + else: + prefix = head[:-1] + while len(prefix) > 0 and prefix not in haystack: + prefix = prefix[:-1] + if prefix in haystack and haystack[prefix] is not None: + return { + 'type': 'prefix', + 'match': head, + 
+
+def naiveGetArrows(source, target):
+    indexedSource = naiveIndexBy('head', source)
+    indexedTarget = naiveIndexBy('head', target)
+    for head, article in indexedSource.items():
+        if article is not None and head in indexedTarget and indexedTarget[head] is not None:
+            yield {
+                'source': article,
+                'target': indexedTarget[head]
+            }
+
+def getArrows(source, target):
+    for article in source:
+        heads = headWords(article['head'])
+        identified = map(lambda w: identify(w, target), heads)
+        entries = [e for e in identified if e is not None]
+        if len(entries) == 1:
+            yield {
+                'type': 'match',
+                'source': article,
+                'target': entries[0]
+            }
+        elif len(entries) > 1:
+            yield {
+                'type': 'ambiguity',
+                'source': article,
+                'target': entries
+            }
+
+# Heuristic filter over 'match' arrows (long-enough head word, exact match or
+# high-precision prefix); anything else falls through to None. Apparently
+# kept for interactive inspection, cf. the commented 'gold' line below.
+def interesting(arrow):
+    if arrow['type'] == 'match':
+        target = arrow['target']
+        return len(target['match']) > 3 and (target['type'] == 'exact' or target['precision'] > 0.8)
+
+#gold = [a for a in arrows if interesting(a)]
+
+def getMetadata(arrows, path=None):
+    output = sys.stdout if path is None else open(path, 'w')
+    toCsv = csv.writer(output, lineterminator='\n')
+    toCsv.writerow(
+        ['head', 'id', 'tomeEDdA', 'rankEDdA', 'tomeLGE', 'rankLGE']
+    )
+    for arrow in arrows:
+        toCsv.writerow([
+            arrow['target']['head'],
+            arrow['target']['id'],
+            arrow['source']['tome'],
+            arrow['source']['article'],
+            arrow['target']['tome'],
+            arrow['target']['rank']
+        ])
+
+if __name__ == '__main__':
+    edda = list(unserializeMetadata(sys.argv[1], EDdARow))
+    lge = list(concat([
+        unserializeMetadata(f'{sys.argv[2]}/T{T}/metadata.csv', LGERow(T))
+        for T in range(1, 32)
+    ]))
+    getMetadata(list(naiveGetArrows(edda, lge)), sys.argv[3])
diff --git a/scripts/parallel-metadata.hs b/scripts/parallel-metadata.hs
new file mode 100644
index 0000000..eb647e6
--- /dev/null
+++ b/scripts/parallel-metadata.hs
@@ -0,0 +1,15 @@
+-- Work-in-progress sketch: join the EDdA metadata with the EDdA/LGE links
+-- (see parallel-metadata.py for the working version).
+
+type Link = (String, String, Int, Int, Int, Int)
+--(headWord, uid, tomeEDdA, rankEDdA, tomeLGE, rankLGE)
+type Source_Meta = (Int, Int, String, Authors, Domains, Domains)
+--(T, article, head, author, ARTFL_domain, ENCCRE_domain)
+
+-- TODO: join :: [Source_Meta] -> [Link] -> ?
+-- TODO: lgeMeta :: Link -> Article
+-- lgeMeta (headWord, uid, tomeEDdA, rankEDdA, tomeLGE, rankLGE) = ...
+
+-- GHCi experiment kept for reference:
+-- Right edda <- (decode HasHeader <$> B.readFile "../EDdA/metadata.csv"
+--   :: IO (Either String (Data.Vector.Vector (String, String, Int, Int, Int, Int))))
+
+main :: IO ()
+main = undefined -- not implemented yet
diff --git a/scripts/parallel-metadata.py b/scripts/parallel-metadata.py
new file mode 100755
index 0000000..6f516a3
--- /dev/null
+++ b/scripts/parallel-metadata.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+import pandas
+from sys import argv
+
+def edda_parallel_line(row):
+    return ('edda_' + str(row.tomeEDdA) + '_' + str(row.article),
+            'EDdA/T' + str(row.tomeEDdA) + '/article' + str(row.article),
+            'EDdA',
+            row.tomeEDdA,
+            row.article,
+            row['head'],  # subscript, not row.head: .head is a pandas method
+            row.author,
+            row.ENCCRE_domain)
+
+def lge_parallel_line(row):
+    return ('lge_' + str(row.tomeLGE) + '_' + str(row.id),
+            'LGE/T' + str(row.tomeLGE) + '/' + str(row.id),
+            'LGE',
+            row.tomeLGE,
+            row.rankLGE,
+            row['head'],
+            '',
+            row.ENCCRE_domain)
+
+def merge(edda_meta, edda_lge_mapping):
+    return pandas.merge(
+        edda_meta,
+        edda_lge_mapping,
+        how='inner',
+        on='head')
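+
+# Sketch of the expected inputs (assumed from the columns used above):
+# edda_meta is the EDdA metadata CSV (with at least 'head', 'article',
+# 'author' and 'ENCCRE_domain' columns) and edda_lge_mapping the file
+# written by parallel-links.py ('head', 'id', 'tomeEDdA', 'rankEDdA',
+# 'tomeLGE', 'rankLGE'); the inner join on 'head' keeps only the articles
+# present in both books, which is exactly the parallel corpus.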
+
+def output_metadata(input_meta, row_mapper):
+    output = input_meta.apply(row_mapper, result_type='expand', axis=1)
+    output.columns = ['uid', 'path', 'book', 'tome', 'rank', 'head', 'authors', 'domains']
+    return output
+
+if __name__ == '__main__':
+    if len(argv) == 4:
+        parallel_meta = merge(pandas.read_csv(argv[1]), pandas.read_csv(argv[2]))
+        edda_output = f"{argv[3]}/edda_parallel_metadata.csv"
+        lge_output = f"{argv[3]}/lge_parallel_metadata.csv"
+        output_metadata(parallel_meta, edda_parallel_line).to_csv(edda_output, index=False)
+        output_metadata(parallel_meta, lge_parallel_line).to_csv(lge_output, index=False)
+    else:
+        print(f"Syntax: {argv[0]} EDdA_metadata EDdA_LGE_mapping OUTPUT_DIRECTORY")
diff --git a/scripts/reroot.hs b/scripts/reroot.hs
index fc5e715..1b67fc3 100755
--- a/scripts/reroot.hs
+++ b/scripts/reroot.hs
@@ -1,10 +1,12 @@
-#!/usr/bin/env -S runhaskell --ghc-arg="-i lib"
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
 {-# LANGUAGE OverloadedStrings #-}
 
-import Data.Text (Text)
+import Data.Text (Text, lines, unlines)
+import Data.Text.IO (interact)
+import Prelude hiding (interact, lines, unlines)
 import System.Environment (getArgs)
 import System.Script (syntax)
-import Text.Editor (editAll)
+import Text.Filter (xargs)
 
 wrap :: [Text] -> [Text]
 wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
@@ -12,5 +14,6 @@ wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"]
 main :: IO ()
 main = getArgs >>= run
   where
-    run [target] = getContents >>= editAll wrap target
+    run ["-"] = interact (unlines . wrap . lines)
+    run [target] = xargs (pure . wrap) target
     run _ = syntax "TARGET_DIRECTORY"
diff --git a/scripts/spacy-to-txm.hs b/scripts/spacy-to-txm.hs
new file mode 100755
index 0000000..1d8e05a
--- /dev/null
+++ b/scripts/spacy-to-txm.hs
@@ -0,0 +1,65 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE NamedFieldPuns #-}
+module Main where
+
+import Data.BIO (Tree(..), bio)
+import Data.Metadata
+  ( HasAuthors(..), Book(..), groupBy, readTsv, relativePath )
+import Data.Metadata.Paragraph (Paragraph(..))
+import Data.SpaCy (Annotation(..))
+import Data.Text as Text (unpack)
+import System.Environment (getArgs)
+import System.Exit (die)
+import System.FilePath ((</>), (<.>))
+import System.Script (syntax)
+import Text.TEI
+  ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title )
+import Text.XML.HXT.Core
+  ( (>>>), (|||), ($<), IOSArrow, XmlTree, arr, arrIO, arrIO0, attr, constL
+  , mkelem, mkText, runX, selem, txt, withIndent, writeDocument )
+
+to :: FilePath -> FilePath -> (Int, [Paragraph]) -> IO ()
+to source target = fmap (\_ -> ()) . runX . compileTome source target
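+
+-- Note: the dataset rows are grouped by tome and each group is compiled
+-- into one TEI document by an HXT arrow; 'runX' runs the arrow for its
+-- side effect (writeDocument) and 'fmap (\_ -> ())' discards the resulting
+-- node list.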
+
+compileTome :: FilePath -> FilePath -> (Int, [Paragraph]) -> IOSArrow b XmlTree
+compileTome source target (tomeNumber, metadata) =
+  tomeXml >>> writeDocument [withIndent True] output
+  where
+    strTome = show tomeNumber
+    output = target </> "T" <> strTome <.> "xml"
+    tomeXml = selem "/"
+      [ selem "teiCorpus"
+        [ corpusHeader EDdA strTome
+        , (paragraphFrom source $< constL metadata) ] ]
+
+paragraphFrom :: FilePath -> Paragraph -> IOSArrow b XmlTree
+paragraphFrom source paragraph@(Paragraph {headWord, paragraphId}) =
+  selem "TEI"
+    [ teiHeader
+        (selem "titleStmt" [title $ headWord <> " §" <> show paragraphId])
+        (publicationStmt
+          [ selem "p" [ txt "Annotated with SpaCy by project GEODE" ] ])
+        (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ])
+    , text "paragraph" paragraph (loadTsv input) ]
+  where
+    input = source </> relativePath paragraph <.> "tsv"
+    authorArrow = constL (Text.unpack <$> authors paragraph)
+
+loadTsv :: FilePath -> IOSArrow b XmlTree
+loadTsv input = arrIO0 (readTsv input) >>> ((arrIO die) ||| format)
+  where
+    format = toXml $< arr (bio namedEntity)
+    toXml (Root nodes) = selem "body" [ toXml $< constL nodes ]
+    toXml (Leaf (Annotation {lemma, pos, form})) =
+      mkelem "w" [ attr "lemma" (txt lemma)
+                 , attr "pos" (txt pos) ]
+             [ txt form ]
+    toXml (Node namedEntity nodes) =
+      mkelem "rs" [ attr "type" (txt namedEntity) ] [ toXml $< constL nodes ]
+
+main :: IO ()
+main = getArgs >>= cli
+  where
+    cli [datasetPath, source, target] = readTsv datasetPath
+      >>= either die (mapM_ (source `to` target) . groupBy tome)
+    cli _ = syntax "DATASET_TSV SOURCE_DIR TARGET_DIR"
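+
+-- Usage sketch (paths are illustrative):
+--   ./scripts/spacy-to-txm.hs dataset.tsv annotated/ TXM/
+-- DATASET_TSV lists the paragraphs with their metadata, SOURCE_DIR holds
+-- one SpaCy TSV per paragraph and TARGET_DIR receives one T<n>.xml
+-- document per tome.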
diff --git a/scripts/stanza-annotator.py b/scripts/stanza-annotator.py
deleted file mode 100755
index c6f4d5e..0000000
--- a/scripts/stanza-annotator.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import os.path
-import stanza
-import sys
-from xml.sax.saxutils import escape, quoteattr
-
-class Annotator:
-    def __init__(self, source, target):
-        self.source = source
-        self.target = target
-        self.model = stanza.Pipeline(lang='fr', processors='tokenize,pos,lemma')
-
-    def run(self):
-        for name in os.listdir(self.source):
-            if os.path.isdir(f'{self.source}/{name}'):
-                self.annotate_dir(name)
-
-    def annotate_dir(self, directory):
-        source_path = f'{self.source}/{directory}'
-        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
-        for name in os.listdir(source_path):
-            path = f'{source_path}/{name}'
-            relpath = f'{directory}/{name}'
-            if os.path.isdir(path):
-                self.annotate_dir(relpath)
-            elif os.path.isfile(path):
-                self.annotate_file(relpath)
-
-    def annotate_file(self, file):
-        with open(f'{self.source}/{file}', 'r') as source:
-            self.encode(f'{file[:-4]}.tei', self.model(source.read()))
-
-    def encode(self, file, annotation):
-        with open(f'{self.target}/{file}', 'w') as target:
-            print('<body>', file=target)
-            for sentence in annotation.sentences:
-                self.encode_sentence(sentence, target)
-            print('</body>', file=target)
-
-    def encode_sentence(self, sentence, target):
-        print('\t<s>', file=target)
-        for token in sentence.tokens:
-            self.encode_token(token, target)
-        print('\t</s>', file=target)
-
-    def encode_token(self, token, target):
-        form = escape(token.text)
-        lemma = quoteattr('+'.join(map(lambda w: w.lemma, token.words)))
-        upos = quoteattr('+'.join(map(lambda w: w.upos, token.words)))
-        print(f'\t\t<w lemma={lemma} pos={upos}>{form}</w>', file=target)
-
-if __name__ == '__main__':
-    Annotator(sys.argv[1], sys.argv[2]).run()
diff --git a/scripts/stanza-txt-annotator.py b/scripts/stanza-txt-annotator.py
new file mode 100755
index 0000000..5ebcbf0
--- /dev/null
+++ b/scripts/stanza-txt-annotator.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ').replace('\t', ' ')
+
+class Annotator:
+    def __init__(self, source, target):
+        self.source = source
+        self.target = target
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+            lang='fr',
+            processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            with open(f'{self.source}/{file}', 'r') as source:
+                document = source.read()
+            print("trying " + outputFile)
+            try:
+                with open(outputFile, 'w') as target:
+                    for paragraph in document.split('\n\n'):
+                        self.annotate_paragraph(paragraph, target)
+            except Exception:
+                # remove the partial output so the file is retried next run
+                os.unlink(outputFile)
+                sys.exit(1)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(paragraph)
+        if len(parsed.sentences) > 0:
+            print(f'# newpar id = {self.newpar()}', file=target)
+            for sentence in parsed.sentences:
+                sentence.add_comment(f'# sent_id = {self.newsent()}')
+                sentence.add_comment(f'# text = {oneLine(sentence)}')
+            print(CoNLL.doc2conll_text(parsed), file=target, end='')
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    Annotator(sys.argv[1], sys.argv[2]).run()
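+
+# Usage sketch: ./scripts/stanza-txt-annotator.py SOURCE_DIR TARGET_DIR
+# The SOURCE_DIR tree is mirrored under TARGET_DIR with one .conllu file per
+# input text; outputs that already exist are skipped, so an interrupted run
+# can simply be restarted.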
diff --git a/scripts/stanza-xml-annotator.py b/scripts/stanza-xml-annotator.py
new file mode 100755
index 0000000..b8ee15e
--- /dev/null
+++ b/scripts/stanza-xml-annotator.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ').replace('\t', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+            lang='fr',
+            processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            document = etree.parse(f'{self.source}/{file}')
+            with open(outputFile, 'w') as target:
+                for paragraph in document.xpath(self.xpath):
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        if len(parsed.sentences) > 0:
+            print(f'# newpar id = {self.newpar()}', file=target)
+            for sentence in parsed.sentences:
+                sentence.add_comment(f'# sent_id = {self.newsent()}')
+                sentence.add_comment(f'# text = {oneLine(sentence)}')
+            print(CoNLL.doc2conll_text(parsed), file=target, end='')
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = '/TEI/text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
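+
+# Usage sketch: ./scripts/stanza-xml-annotator.py SOURCE_DIR TARGET_DIR [XPATH]
+# The optional XPATH (default '/TEI/text/body/div1/p') selects the paragraph
+# elements; each one is flattened with itertext() before being passed to
+# stanza.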
diff --git a/scripts/xml-annotator.py.bak b/scripts/xml-annotator.py.bak
new file mode 100755
index 0000000..5ac322a
--- /dev/null
+++ b/scripts/xml-annotator.py.bak
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        #self.parser = etree.HTMLParser()
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+            lang='fr',
+            processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        outputFile = f'{self.target}/{file[:-4]}.conllu'
+        if not os.path.isfile(outputFile):
+            document = etree.parse(f'{self.source}/{file}')
+            #document = etree.parse(f'{self.source}/{file}', self.parser)
+            with open(outputFile, 'w') as target:
+                for paragraph in document.xpath(self.xpath):
+                    print(f'# newpar id = {self.newpar()}', file=target)
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        for sentence in parsed.sentences:
+            sentence.add_comment(f'# sent_id = {self.newsent()}')
+            sentence.add_comment(f'# text = {oneLine(sentence)}')
+        print(CoNLL.doc2conll_text(parsed), file=target)
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = '/TEI/text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
diff --git a/scripts/xml-annotator.py.remote b/scripts/xml-annotator.py.remote
new file mode 100755
index 0000000..1db6bf2
--- /dev/null
+++ b/scripts/xml-annotator.py.remote
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+from lxml import etree
+import os
+import os.path
+import stanza
+from stanza.utils.conll import CoNLL
+import sys
+
+def oneLine(sentence):
+    return sentence.text.replace('\n', ' ')
+
+class Annotator:
+    def __init__(self, source, target, xpath):
+        self.source = source
+        self.target = target
+        self.xpath = xpath
+        self.paragraphSerial = -1
+        self.sentenceSerial = -1
+        self.model = stanza.Pipeline(
+            lang='fr',
+            processors='tokenize,pos,lemma,depparse')
+
+    def run(self):
+        for name in os.listdir(self.source):
+            if os.path.isdir(f'{self.source}/{name}'):
+                self.annotate_dir(name)
+
+    def annotate_dir(self, directory):
+        source_path = f'{self.source}/{directory}'
+        os.makedirs(f'{self.target}/{directory}', exist_ok=True)
+        for name in os.listdir(source_path):
+            path = f'{source_path}/{name}'
+            relpath = f'{directory}/{name}'
+            if os.path.isdir(path):
+                self.annotate_dir(relpath)
+            elif os.path.isfile(path):
+                self.annotate_file(relpath)
+
+    def annotate_file(self, file):
+        with open(f'{self.source}/{file}', 'r') as source:
+            document = etree.parse(source)
+            with open(f'{self.target}/{file[:-4]}.conllu', 'w') as target:
+                for paragraph in document.findall(self.xpath):
+                    print(f'# newpar id = {self.newpar()}', file=target)
+                    self.annotate_paragraph(paragraph, target)
+
+    def annotate_paragraph(self, paragraph, target):
+        parsed = self.model(" ".join(paragraph.itertext()))
+        for sentence in parsed.sentences:
+            sentence.add_comment(f'# sent_id = {self.newsent()}')
+            sentence.add_comment(f'# text = {oneLine(sentence)}')
+        print(CoNLL.doc2conll_text(parsed), file=target)
+
+    def newpar(self):
+        self.paragraphSerial += 1
+        return self.paragraphSerial
+
+    def newsent(self):
+        self.sentenceSerial += 1
+        return self.sentenceSerial
+
+if __name__ == '__main__':
+    paragraphXPath = 'text/body/div1/p'
+    if len(sys.argv) > 3:
+        paragraphXPath = sys.argv[3]
+    Annotator(sys.argv[1], sys.argv[2], paragraphXPath).run()
--
GitLab