diff --git a/lib/haskell/Conllu/Tree.hs b/lib/haskell/Conllu/Tree.hs index 9969117b00daaf80f13df4553fc1f90a05d341ba..8d5967bb3a3a069fb51c5283b9c52e80740bfd1f 100644 --- a/lib/haskell/Conllu/Tree.hs +++ b/lib/haskell/Conllu/Tree.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE DeriveGeneric, OverloadedStrings, TypeFamilies #-} +{-# LANGUAGE DeriveGeneric, OverloadedLists, OverloadedStrings, TypeFamilies #-} module Conllu.Tree ( EP(..) , Feat(..) @@ -17,13 +17,12 @@ import qualified Conllu.Type as Conllu (AW, CW(..), Doc, ID(..), FORM, LEMMA, XP import qualified Conllu.UposTagset as Conllu (POS) import qualified Conllu.DeprelTagset as Conllu (EP) import Data.ByteString.Char8 as ByteString (pack) -import Data.Csv ((.=), ToField(..), ToNamedRecord(..), namedRecord) +import Data.Csv ((.=), DefaultOrdered(..), ToField(..), ToNamedRecord(..), namedRecord) import Data.Int (Int8) import Data.List (intercalate, partition) import Data.Map as Map (Map, empty, insert, toList) import Data.Serialize (Serialize(..)) import Data.Tree (Forest, Tree(..)) -import GEODE.Metadata (DefaultFields(..), HasDefaultHeader(..)) import GHC.Generics (Generic(..), K1(..), Rec0) data ID = @@ -120,8 +119,8 @@ instance ToNamedRecord IndexedWord where show ep : maybe [] id ((:) <$> _subdep <*> _rest) showDep rel@(Rel {_head}) = ":" `sep` (show _head:deprel rel) -instance HasDefaultHeader IndexedWord where - defaultFields = DefaultFields +instance DefaultOrdered IndexedWord where + headerOrder _ = [ "id", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps" , "misc" ] diff --git a/lib/haskell/GEODE/Metadata/ParagraphRecord.hs b/lib/haskell/GEODE/Metadata/ParagraphRecord.hs index f37aaf0384d5f1358a1a4bd590c499c21ef834a7..57519e9d89e47784c90a4bf29aa9ea0afafeff8b 100644 --- a/lib/haskell/GEODE/Metadata/ParagraphRecord.hs +++ b/lib/haskell/GEODE/Metadata/ParagraphRecord.hs @@ -6,19 +6,22 @@ module GEODE.Metadata.ParagraphRecord import Data.Aeson ((.=), FromJSON(..), ToJSON(..)) import Data.Aeson.KeyMap as KeyMap (singleton) import Data.Csv - ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord ) + ( (.:), DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord ) import GEODE.Metadata as Article - ( type (@)(..), ArticleRecord, DefaultFields(..), HasDefaultHeader(..) - , Record(..), ToJSONObject(..), uid, relativePath) + ( type (@)(..), ArticleRecord, Record(..), ToJSONObject(..)) import GHC.Generics (Generic) import System.FilePath ((<.>)) +import Text.Read (readEither) -newtype Paragraph = Paragraph { paragraph :: Int } deriving (Generic, Show) +newtype Paragraph = Paragraph { paragraph :: Int } deriving (Generic, Eq, Ord, Show) +instance DefaultOrdered Paragraph instance FromJSON Paragraph instance Record Paragraph where - uid = show . paragraph + toUID = show . paragraph + + fromUID = fmap Paragraph . readEither . takeWhile (/= '_') relativePath (Paragraph {paragraph}) extension = show paragraph <.> extension @@ -35,6 +38,3 @@ instance ToJSONObject Paragraph where toJSONObject (Paragraph {paragraph}) = KeyMap.singleton "paragraph" (toJSON paragraph) toJSONPairs (Paragraph {paragraph}) = "paragraph" .= paragraph - -instance HasDefaultHeader Paragraph where - defaultFields = DefaultFields ["paragraph"] diff --git a/lib/haskell/GEODE/Metadata/ProdigyMeta.hs b/lib/haskell/GEODE/Metadata/ProdigyMeta.hs index edb3fdd26f50a29edef213e3a0f27eaf50249fd0..60ce5111b48dc4a22563fd19c71bf7fe18b39d61 100644 --- a/lib/haskell/GEODE/Metadata/ProdigyMeta.hs +++ b/lib/haskell/GEODE/Metadata/ProdigyMeta.hs @@ -7,9 +7,9 @@ module GEODE.Metadata.ProdigyMeta import Data.Aeson ((.=), FromJSON(..), ToJSON(..)) import Data.Aeson.KeyMap as KeyMap (fromList) -import Data.Csv (FromNamedRecord(..), ToNamedRecord(..)) +import Data.Csv (DefaultOrdered(..), FromNamedRecord(..), ToNamedRecord(..)) import Data.Text (Text) -import GEODE.Metadata (type (@), DefaultFields(..), HasDefaultHeader(..), ToJSONObject(..)) +import GEODE.Metadata (type (@), ToJSONObject(..)) import GEODE.Metadata.ParagraphRecord (ParagraphRecord) import GHC.Generics (Generic) @@ -26,18 +26,15 @@ instance ToJSONObject ProdigyMeta where "totalParagraphs" .= totalParagraphs <> "headword" .= headword +instance DefaultOrdered ProdigyMeta instance ToNamedRecord ProdigyMeta instance FromNamedRecord ProdigyMeta -instance HasDefaultHeader ProdigyMeta where - defaultFields = DefaultFields ["totalParagraphs", "headword"] - newtype Classification = Classification { paragraphFunction :: Text } deriving Generic +instance DefaultOrdered Classification instance ToNamedRecord Classification instance FromNamedRecord Classification -instance HasDefaultHeader Classification where - defaultFields = DefaultFields ["paragraphFunction"] instance ToJSONObject Classification where toJSONObject (Classification {paragraphFunction}) = KeyMap.fromList diff --git a/lib/haskell/Text/TEI.hs b/lib/haskell/Text/TEI.hs index 558a61643279ec47e574039186a3d4571a0bdf1d..ef47dcd15e5ee5cfbb35874526bbffb6e220f104 100644 --- a/lib/haskell/Text/TEI.hs +++ b/lib/haskell/Text/TEI.hs @@ -9,10 +9,10 @@ module Text.TEI import Data.Text (unpack) import GEODE.Metadata - (Book(..), Contrastive(..), Has(..), PrimaryKey(..), formatList, uid) + (ArticleRecord(..), Work(..), Contrastive(..), Has(..), Record(..), formatList) import Text.XML.HXT.Core (ArrowXml, XmlTree, aelem, attr, mkelem, selem, txt) -corpusHeader :: ArrowXml a => Book -> String -> a b XmlTree +corpusHeader :: ArrowXml a => Work -> String -> a b XmlTree corpusHeader EDdA = eddaHeader corpusHeader LGE = lgeHeader corpusHeader Wikipedia = wikiHeader @@ -85,15 +85,15 @@ by :: ArrowXml a => String -> String -> [a b XmlTree] by resp orgName = [ selem "resp" [ txt (resp <> " by") ] , selem "orgName" [ txt orgName ] ] -type TXMText a = (Has Contrastive a, Has PrimaryKey a) +type TXMText a = (Has Contrastive a, Has ArticleRecord a) text :: (ArrowXml a, TXMText b) => String -> b -> a c XmlTree -> a c XmlTree text unitName txmText body = mkelem "text" metadata [ body, aelem "milestone" [ "unit" .= unitName ] ] where metadata = - [ "uid" .= uid txmText - , "book" .= (show . book $ get txmText) + [ "uid" .= toUID (get txmText :: ArticleRecord) + , "work" .= (show . work $ get txmText) , "author" .= (unpack . formatList . authors $ get txmText) , "domains" .= (unpack . formatList . domains $ get txmText) , "subCorpus" .= (unpack . formatList . subCorpus $ get txmText) ] diff --git a/scripts/conllu-to-lexicoscope.hs b/scripts/conllu-to-lexicoscope.hs deleted file mode 100755 index 1d2a76a0213a4889c165d4489b2a390054f8579f..0000000000000000000000000000000000000000 --- a/scripts/conllu-to-lexicoscope.hs +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" -{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} - -import Conllu.Parse (parseConllu) -import Conllu.Parse.Paragraph (Paragraph(..), byParagraph) -import Conllu.Print (printSent) -import Conllu.Type (Sent(..)) ---import Conllu.Type (Doc, Sent(..)) ---import Control.Monad.IO.Class (MonadIO(..)) -import Control.Monad.State (StateT(..), evalStateT) -import Data.List as List (intercalate) ---import Data.Map (toList) -import GEODE.Metadata (Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..), type(@), formatList, groupBy, readNamedTsv, relativePath, sortBy, uid) -import Data.Text as Text (unpack) -import System.Environment (getArgs) -import System.Exit (die) -import System.FilePath ((</>), (<.>)) -import System.Script (syntax, try) -import Text.XML.HXT.Core - ((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem - , txt, withIndent, writeDocument) -import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState) -import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState) - -type Article = PrimaryKey @ Entry @ Contrastive - -type Serial = (Int, Int) -type WithSerial a = StateT Serial IO a -type SerialArrow n a = IOStateArrow Serial n a ---type SerialArrow a = IOStateArrow Serial () a - -next :: Selector Serial Int -> SerialArrow n String -next intSelector = IOSLA getUpdate - where - getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state]) - field = theUserState >>> intSelector - -pId :: Selector Serial Int -pId = S {getS = fst, setS = \p (_, s) -> (p, s)} - -{- -sId :: Selector Serial Int -sId = S {getS = snd, setS = \s (p, _) -> (p, s)} --} - -{- -to :: FilePath -> FilePath -> (Int, [Article]) -> StateT Serial IO () -to source target (_, tome) = - mapM_ (processArticle source target) tome --} - -stateOfArrow :: IOStateArrow s () a -> StateT s IO () -stateOfArrow a = StateT $ \s0 -> do - (s1, _) <- runIOSLA a (initialState s0) () - pure ((), getS theUserState s1) - -to :: FilePath -> FilePath -> (Book, [Article]) -> WithSerial () -to source target (theBook, metadata) = - mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata - where - runArrow (tomeNumber, tomeArticles) = stateOfArrow $ - compileTome source target (theBook, tomeNumber, tomeArticles) - -compileTome :: - FilePath -> FilePath -> (Book, Int, [Article]) -> SerialArrow n XmlTree -compileTome source target (theBook, tomeNumber, metadata) = - tomeXml >>> writeDocument [withIndent True] output - where - strTome = show tomeNumber - output = target </> (show theBook) <> "_T" <> strTome <.> ".fr.xml" - tomeXml = selem "/" - [ selem "teiCorpus" - --[ selem "corpus" - [ articleFrom source $< constL metadata ] ] - -articleFrom :: FilePath -> Article -> SerialArrow n XmlTree -articleFrom source article = - selem "doc" - [ selem "meta" [metaFrom article] - , selem "text" [loadConllu input] ] - - --(formatParagraph <$> byParagraph dom) ] - where - input = source </> relativePath article "conllu" - -loadConllu :: FilePath -> SerialArrow n XmlTree -loadConllu input = - arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format) - where - debug msg = die $ "In file " <> input <> "\n" <> msg - format = formatParagraph $< arrL byParagraph - -metaFrom :: ArrowXml a => Article -> a n XmlTree -metaFrom article = txt $ - concatMap (++ "\n") - (List.intercalate "\t" <$> - [[] - ,["fileName", uid article] - ,["tome", show . tome $ get article] - ,["rank", show . rank $ get article] - ,["head", Text.unpack . headWord $ get article] - ,["author", Text.unpack . formatList . authors $ get article] - ,["domains", Text.unpack . formatList . domains $ get article] - ,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get article ] - ,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get article ]]) - -{- -formatSentence :: IOStateArrow Serial Sent XmlTree -formatSentence :: IOStateArrow Serial Sent XmlTree -formatSentence = - selem "s" [txt sentences] - where - sentences = dropWhile (== '\n') $ printSent (s {_meta = []}) --} - -formatParagraph :: Paragraph -> SerialArrow n XmlTree ---formatParagraph :: Paragraph -> SerialArrow XmlTree -formatParagraph (Paragraph sents) = - mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents) - -formatSentence :: Sent -> SerialArrow n XmlTree -formatSentence s = - selem "s" [txt sentences] - --mkelem "s" [attr "id" (txt . ('s':) $< next sId)] [txt sentences] - where - sentences = dropWhile (== '\n') $ printSent (s {_meta = []}) - -{- -processArticle :: FilePath -> FilePath -> Article -> StateT Serial IO () -processArticle source target article = ---processArticle source target a@(Article {rank}) = - liftIO (putStrLn ("processing " <> input) *> Prelude.readFile input) - >>= either (liftIO . die) (format article output) . parseConllu input - where - input = source </> relativePath article "conllu" - output = target </> uid article <.> ".fr.xml" - --input = source </> "article" ++ show rank <.> "conllu" - --output = target </> "article" ++ show rank <.> "xml" - -format :: Article -> FilePath -> Doc -> StateT Serial IO () -format _ _ [] = pure () -format article outputPath dom = StateT $ \s -> do - (newState, _) <- runIOSLA (xml >>> writeDocument [] outputPath) (initialState s) () - pure ((), getS theUserState newState) - where - xml = selem "/" - [selem "corpus" - [selem "doc" - [selem "meta" [metaFrom article] - ,selem "text" (formatParagraph <$> byParagraph dom)]]] --} - -main :: IO () -main = getArgs >>= cli - where - withSerial = (`evalStateT` (0, 0)) - books = groupBy (book.get) - cli [sourceMeta, sourceRoot, targetRoot] = - try (readNamedTsv sourceMeta) - >>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . books) - cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/fix-tsv.hs b/scripts/fix-tsv.hs index 749dbbd6a1462ae681482a2c396e9b59741a9d62..a3d17ca64038c1ff6f13f0d9973f87ff965f81b6 100755 --- a/scripts/fix-tsv.hs +++ b/scripts/fix-tsv.hs @@ -1,20 +1,20 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE OverloadedStrings #-} import Data.ByteString as BS (toStrict) -import Data.Csv (EncodeOptions(..), encodeWith, defaultEncodeOptions) +import Data.Csv (encodeWith) import Data.Text as Text (Text, splitOn) import Data.Text.Encoding as Text (decodeUtf8) +import GEODE.Metadata (toTSV) import System.Environment (getArgs) import System.Exit (die) import System.Script (syntax) import Text.Filter (xargs) fixTsv :: [Text] -> IO Text -fixTsv = fmap toTsv . mapM fixLine +fixTsv = fmap encode . mapM fixLine where - toTsv = decodeUtf8 . BS.toStrict . encodeWith - (defaultEncodeOptions { encDelimiter = toEnum $ fromEnum '\t' }) + encode = decodeUtf8 . BS.toStrict . encodeWith toTSV fixLine = escapeFormLemma . Text.splitOn "\t" escapeFormLemma [n, form, lemma, pos, ene] = pure (n, form, lemma, pos, ene) escapeFormLemma l = die $ show l diff --git a/scripts/merge.hs b/scripts/merge.hs index b42440c1704bfd3e61224562127d1fe4adf8f1c4..75cec17794e87fbffd550d2f5fe89ce8955e0ee5 100755 --- a/scripts/merge.hs +++ b/scripts/merge.hs @@ -1,35 +1,41 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-} -import Control.Monad.Except (ExceptT(..), runExceptT) -import Data.Foldable (toList) +import Control.Monad.Except (runExceptT) +import Data.Csv (DefaultOrdered(..)) import Data.Map (Map, (!?)) -import Data.Maybe (catMaybes) import Data.Text (Text, pack) -import GEODE.Metadata (Contrastive(..), Entry, Has(..), MultiText(..), ArticleRecord, type(@)(..), indexBy, readNamedTsv, tsvFile) +import Data.Vector (catMaybes) +import GEODE.Metadata + ( type(@)(..), ArticleRecord, Contrastive(..), Document(..), Entry, Has(..) + , MultiText(..), ReadTSV(..), WriteTSV(..), for, getHeader, indexBy) import System.Environment (getArgs) import System.Exit (die) import System.Script (syntax) -leftJoin :: forall a b c t. (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) => - (a -> Maybe b -> c) -> t a -> t b -> t c -leftJoin f left right = outputRow <$> left +leftJoin :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) => + (a -> Maybe b -> c) -> Document a -> Document b -> Document c +leftJoin f left right = Document + { header = getHeader (for :: c) + , rows = outputRow <$> rows left } where indexed :: Map ArticleRecord b - indexed = indexBy get right + indexed = indexBy get (rows right) outputRow leftLine = f leftLine (indexed !? get leftLine) -rightJoin :: (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) => - (Maybe a -> b -> c) -> t a -> t b -> t c +rightJoin :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) => + (Maybe a -> b -> c) -> Document a -> Document b -> Document c rightJoin f left right = leftJoin (flip f) right left -join :: forall a b c t. (Foldable t, Functor t, Has ArticleRecord a, Has ArticleRecord b) => - (a -> b -> c) -> t a -> t b -> [c] -join f left right = catMaybes . toList $ outputRow <$> left +join :: forall a b c. (Has ArticleRecord a, Has ArticleRecord b, DefaultOrdered c) => + (a -> b -> c) -> Document a -> Document b -> Document c +join f left right = Document + { header = getHeader (for :: c) + , rows = catMaybes $ outputRow <$> rows left } where indexed :: Map ArticleRecord b - indexed = indexBy get right + indexed = indexBy get (rows right) outputRow leftLine = f leftLine <$> (indexed !? get leftLine) @@ -48,12 +54,12 @@ main = getArgs >>= run where run [left, right, output] = runExceptT ((join merge) - <$> ExceptT (readNamedTsv left) - <*> ExceptT (readNamedTsv right) ) - >>= either die (tsvFile output) + <$> (readTSV left) + <*> (readTSV right) ) + >>= either die (writeTSV output) run [name, left, right, output] = runExceptT ((rightJoin (tag $ pack name)) - <$> ExceptT (readNamedTsv left) - <*> ExceptT (readNamedTsv right) ) - >>= either die (tsvFile output) + <$> (readTSV left) + <*> (readTSV right) ) + >>= either die (writeTSV output) run _ = syntax "PRIMARY_KEY.tsv INPUT_METADATA.tsv OUTPUT_METADATA.tsv" diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs index 03597cdbb80562c1c24f2bb4f47d5e8508ab762a..08b3036a0e7ca0addd34589de6e0db7bf6ef7299 100755 --- a/scripts/paragraphs.hs +++ b/scripts/paragraphs.hs @@ -3,14 +3,16 @@ import Data.List (dropWhileEnd) import Data.Text (Text, splitOn) import Data.Text.IO as Text (readFile, writeFile) +import Data.Vector as Vector (Vector, fromList) import GEODE.Metadata as Article - ( type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, tsvFile ) + ( type (@)(..), ArticleRecord, Document(..), Entry(..), ReadTSV(..) + , Record(..), WriteTSV(..), getHeader, for ) import GEODE.Metadata.ParagraphRecord (Paragraph(..)) import GEODE.Metadata.ProdigyMeta as Prodigy (ParagraphMeta, ProdigyMeta(..)) import System.Directory (createDirectoryIfMissing) import System.Environment (getArgs) import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory) -import System.Script (syntax, try) +import System.Script (syntax) import Text.Filter (Editable(..)) import Text.Filter.Linearize (linearize) @@ -38,11 +40,16 @@ to source target meta@(articleRecord :@: _) = do let outputPath = target </> relativePath paragraphRecord "txt" in paragraphMeta <$ Text.writeFile outputPath paragraphText +asDocument :: Vector [ParagraphMeta] -> Document ParagraphMeta +asDocument batches = Document + { header = getHeader (for :: ParagraphMeta) + , rows = Vector.fromList $ concat batches } + main :: IO () main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run where run [inputMeta, source, target] = - try (readNamedTsv inputMeta) - >>= mapM (source `to` target) - >>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat + readTSV inputMeta + >>= mapM (source `to` target) . rows + >>= writeTSV (takeDirectory target </> "files" <.> "tsv") . asDocument run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY" diff --git a/scripts/parallel-links.hs b/scripts/parallel-links.hs index 9a7b85e338ffce59243ac505c4c7723c72805347..2081d767da781379150012fbcac347fa9e3cbdb5 100755 --- a/scripts/parallel-links.hs +++ b/scripts/parallel-links.hs @@ -3,23 +3,26 @@ module Main where import Data.Char as Text (toLower) -import Data.Text as Text (map) +import Data.Map (Map) +import Data.Text as Text (Text, map) import GEODE.Metadata - ( Entry(..), Has(..), ArticleRecord(..), type(@), groupBy, readNamedTsv, tsvFile ) + ( type(@), ArticleRecord(..), Document(..), Entry(..), Has(..), ReadTSV(..) + , WriteTSV(..), groupBy ) +import Data.Vector as Vector (fromList) import System.Environment (getArgs) -import System.Exit (die) import System.Script (syntax) -type Line = ArticleRecord @ Entry +type Line = ArticleRecord @ Entry @ Map Text Text -findDiachronicPairs :: Foldable t => Bool -> t Line -> [Line] -findDiachronicPairs caseInsensitive = - concatMap snd . filter isPair . groupBy (normalize . headword . get) +findDiachronicPairs :: Bool -> Document Line -> Document Line +findDiachronicPairs caseInsensitive d@(Document {rows}) = + d { rows = Vector.fromList . concatMap snd $ filter isPair groups } where isPair = oneInEach . groupBy (work.get) . snd oneInEach = (&&) <$> twoBooks <*> oneByBook twoBooks = ((2 ==) . length) oneByBook = all ((1 ==) . length . snd) + groups = groupBy (normalize . headword . get) rows normalize | caseInsensitive = Text.map toLower | otherwise = id @@ -28,8 +31,7 @@ main :: IO () main = getArgs >>= popCaseInsensitive run where run caseInsensitive [input, output] = - readNamedTsv input - >>= either die (tsvFile output . findDiachronicPairs caseInsensitive) + readTSV input >>= writeTSV output . findDiachronicPairs caseInsensitive run _ _ = syntax "[-i] SOURCE_METADATA.tsv TARGET_METADATA.tsv" popCaseInsensitive f ("-i":args) = f True args popCaseInsensitive f args = f False args diff --git a/scripts/sample.hs b/scripts/sample.hs index 50a54ffbe315e4519548175060d470f847593d49..3145036d06b1ed86b411ed6e4bb58b530a8a088e 100755 --- a/scripts/sample.hs +++ b/scripts/sample.hs @@ -2,10 +2,10 @@ import Data.IntSet as IntSet (IntSet, delete, fromList, insert, member, toList) import Data.Text (Text) import Data.Vector as Vector ((!), Vector, fromList) -import GEODE.Metadata (readTsv, tsvLines) +import GEODE.Metadata (ReadTSV(..), WriteTSV(..)) import System.Environment (getArgs) import System.Random (randomRIO) -import System.Script (syntax, try) +import System.Script (syntax) sampleIndices :: Int -> (Int, Int) -> IO IntSet sampleIndices count (from, to) @@ -24,7 +24,7 @@ sampleIndices count (from, to) build 0 result = pure result build n set = build (n-1) =<< pickNewOne set -sample :: Int -> Vector a -> IO (Vector a) +sample :: Int -> Vector [Text] -> IO (Vector [Text]) sample count rows = Vector.fromList <$> sampledList where sampledList @@ -37,7 +37,5 @@ main :: IO () main = getArgs >>= run where run [size, inputMetadata] = - (try (readTsv inputMetadata) :: IO (Vector [Text])) - >>= sample (read size) - >>= tsvLines + readTSV inputMetadata >>= sample (read size) >>= writeTSV () run _ = syntax "SIZE INPUT_METADATA" diff --git a/scripts/subcorpus/get-files.hs b/scripts/subcorpus/get-files.hs index 1b0167d6dfdb937b56059bfc0cf3aa777c551173..c6a267b329f5ddc858eed43918863e57bf5c85f7 100755 --- a/scripts/subcorpus/get-files.hs +++ b/scripts/subcorpus/get-files.hs @@ -1,14 +1,13 @@ #!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" -import Data.Csv (FromNamedRecord) -import Data.Vector (Vector) -import GEODE.Metadata (ArticleRecord, Record(..), readNamedTsv) +import Control.Monad.Except (ExceptT, runExceptT) +import GEODE.Metadata (ArticleRecord, Document(..), ReadTSV(..), Record(..)) import GEODE.Metadata.ParagraphRecord (ParagraphRecord) import System.Directory (createDirectoryIfMissing, doesFileExist) import System.Environment (getArgs) import System.FilePath ((</>), takeDirectory) import System.Posix.Files (createLink) -import System.Script (syntax, try) +import System.Script (syntax) link :: Record a => String -> FilePath -> FilePath -> a -> IO () link extension sourceRoot targetRoot record = do @@ -24,17 +23,16 @@ link extension sourceRoot targetRoot record = do sourceFile = sourceRoot </> fileName targetFile = targetRoot </> fileName -type As a = a -> IO () +type As a = Document a -> IO () run :: [String] -> IO () run [files, extension, source, target] = - tsvData >>= either reTryAsArticles (mapM_ (mkLink :: As ParagraphRecord)) + runExceptT (readTSV files :: ExceptT String IO (Document ParagraphRecord)) + >>= either reTryAsArticles (mkLinks :: As ParagraphRecord) where - tsvData :: (FromNamedRecord a, Record a) => IO (Either String (Vector a)) - tsvData = readNamedTsv files - mkLink :: Record a => As a - mkLink = link extension source target - reTryAsArticles _ = try tsvData >>= (mapM_ (mkLink :: As ArticleRecord)) + mkLinks :: Record a => As a + mkLinks = mapM_ (link extension source target) . rows + reTryAsArticles _ = readTSV files >>= (mkLinks :: As ArticleRecord) run _ = syntax "FILES.tsv EXTENSION SOURCE_DIRECTORY TARGET_DIRECTORY" diff --git a/scripts/textometry/computeProfile.hs b/scripts/textometry/compute-profile.hs similarity index 80% rename from scripts/textometry/computeProfile.hs rename to scripts/textometry/compute-profile.hs index 51cfcbb3ddf0015462edcdb651fd84be15f62d4d..4ac3480f2683af28120cd433ca071c9f00c05296 100755 --- a/scripts/textometry/computeProfile.hs +++ b/scripts/textometry/compute-profile.hs @@ -7,22 +7,21 @@ import Conllu.Tree import Control.Applicative ((<**>)) import Control.Monad.IO.Class (liftIO) import Control.Monad.Reader (ReaderT, asks, runReaderT) -import Data.Csv (ToNamedRecord(..)) +import Data.Csv (DefaultOrdered(..), ToNamedRecord(..)) import Data.ByteString as ByteString (readFile) import Data.List (sort) import Data.Map as Map (lookup) import Data.Serialize (decode) import Data.Tree (Tree(..)) import GEODE.Metadata - ( type(@), ArticleRecord, DefaultFields(..), HasDefaultHeader(..), Record(..) - , WithDefaultHeader(..), glue, readNamedTsv, tsvLines ) -import GEODE.Metadata.TSV.Header (for, getHeader) + ( type(@), ArticleRecord, Document(..), ReadTSV(..), Record(..) + , WithDefaultHeader, WriteTSV(..), for, getHeader, glue ) import GHC.Generics (Generic) import Options.Applicative ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc , short, strOption ) import System.FilePath ((</>)) -import System.Script (try, warn) +import System.Script (warn) data Config = Config { inputRoot :: FilePath @@ -46,13 +45,12 @@ getConfig = execParser <> progDesc "A textometric tool to draw discursive profiles")) data Occurrence = Occurrence - { totalSize :: Int - , position :: Int - , size :: Int } deriving Generic + { position :: Int + , size :: Int + , totalSize :: Int } deriving Generic +instance DefaultOrdered Occurrence instance ToNamedRecord Occurrence -instance HasDefaultHeader Occurrence where - defaultFields = DefaultFields ["position", "size", "totalSize"] type Measure = ArticleRecord @ Occurrence @@ -67,7 +65,6 @@ profile articleRecord = do findOccurrences :: IndexedDocument -> [Occurrence] findOccurrences (IndexedDocument {_total, _sentences}) = groupOccurrences [] (_sentences >>= imperativeVerb) - --uncurry (Occurrence _total) <$> (_sentences >>= imperativeVerb) where groupOccurrences :: [(Int, Int)] -> [Int] -> [Occurrence] groupOccurrences stack [] = uncurry (Occurrence _total) <$> reverse stack @@ -92,7 +89,7 @@ imperativeVerb (IndexedSentence {_offset, _syntax}) = main :: IO () main = getConfig >>= runReaderT chain where - chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay - searchAndDisplay rows = do - liftIO $ tsvLines [getHeader (for :: Measure)] - mapM_ (\ar -> profile ar >>= liftIO . tsvLines) rows + chain = asks inputTsv >>= liftIO . readTSV >>= searchAndDisplay + searchAndDisplay (Document {rows}) = do + liftIO $ writeTSV () [getHeader (for :: Measure)] + mapM_ (\ar -> profile ar >>= liftIO . writeTSV ()) rows diff --git a/scripts/textometry/conllu-to-lexicoscope.hs b/scripts/textometry/conllu-to-lexicoscope.hs new file mode 100755 index 0000000000000000000000000000000000000000..0ceb48bd3c87ce0ff201c819bcff7c0deb4d2b4e --- /dev/null +++ b/scripts/textometry/conllu-to-lexicoscope.hs @@ -0,0 +1,111 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" +{-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} + +import Conllu.Parse (parseConllu) +import Conllu.Parse.Paragraph (Paragraph(..), byParagraph) +import Conllu.Print (printSent) +import Conllu.Type (Sent(..)) +import Control.Monad.State (StateT(..), evalStateT) +import Data.List as List (intercalate) +import GEODE.Metadata + ( type(@), ArticleRecord(..), Contrastive(..), Document(..), Entry(..), Has(..) + , MultiText(..), ReadTSV(..), Record(..), Work, formatList, groupBy, sortBy) +import Data.Text as Text (unpack) +import System.Environment (getArgs) +import System.Exit (die) +import System.FilePath ((</>), (<.>)) +import System.Script (syntax) +import Text.XML.HXT.Core + ((>>>), (|||), ($<), ArrowXml, IOSLA(..), IOStateArrow, XmlTree, arrIO, arrIO0, arrL, attr, constL, mkelem, selem + , txt, withIndent, writeDocument) +import Text.XML.HXT.Arrow.XmlState.TypeDefs (Selector(..), chgS, theUserState) +import Text.XML.HXT.Arrow.XmlState.RunIOStateArrow (initialState) + +type Row = ArticleRecord @ Entry @ Contrastive + +type Serial = (Int, Int) +type WithSerial a = StateT Serial IO a +type SerialArrow n a = IOStateArrow Serial n a + +next :: Selector Serial Int -> SerialArrow n String +next intSelector = IOSLA getUpdate + where + getUpdate state _ = pure (chgS field (+1) state, [show $ getS field state]) + field = theUserState >>> intSelector + +pId :: Selector Serial Int +pId = S {getS = fst, setS = \p (_, s) -> (p, s)} + +stateOfArrow :: IOStateArrow s () a -> StateT s IO () +stateOfArrow a = StateT $ \s0 -> do + (s1, _) <- runIOSLA a (initialState s0) () + pure ((), getS theUserState s1) + +to :: FilePath -> FilePath -> (Work, [Row]) -> WithSerial () +to source target (theWork, metadata) = + mapM_ runArrow . groupBy (volume.get) $ sortBy (article.get) metadata + where + runArrow (volumeNumber, articles) = stateOfArrow $ + compileTome source target (theWork, volumeNumber, articles) + +compileTome :: + FilePath -> FilePath -> (Work, Int, [Row]) -> SerialArrow n XmlTree +compileTome source target (theWork, volumeNumber, metadata) = + volumeXml >>> writeDocument [withIndent True] output + where + strTome = show volumeNumber + output = target </> (show theWork) <> "_T" <> strTome <.> ".fr.xml" + volumeXml = selem "/" + [ selem "teiCorpus" + --[ selem "corpus" + [ articleFrom source $< constL metadata ] ] + +articleFrom :: FilePath -> Row -> SerialArrow n XmlTree +articleFrom source row = + selem "doc" + [ selem "meta" [metaFrom row] + , selem "text" [loadConllu input] ] + where + input = source </> relativePath (get row :: ArticleRecord) "conllu" + +loadConllu :: FilePath -> SerialArrow n XmlTree +loadConllu input = + arrIO0 (parseConllu input <$> readFile input) >>> (arrIO debug ||| format) + where + debug msg = die $ "In file " <> input <> "\n" <> msg + format = formatParagraph $< arrL byParagraph + +metaFrom :: ArrowXml a => Row -> a n XmlTree +metaFrom row = txt $ + concatMap (++ "\n") + (List.intercalate "\t" <$> + [[] + ,["fileName", toUID (get row :: ArticleRecord)] + ,["work", show . work $ get row] + ,["volume", show . volume $ get row] + ,["article", show . article $ get row] + ,["head", Text.unpack . headword $ get row] + ,["author", Text.unpack . formatList . authors $ get row] + ,["domains", Text.unpack . formatList . domains $ get row] + ,["parallel", show . ("parallel" `elem`) . getList . subCorpus $ get row ] + ,["hydronym", show . ("hydronym" `elem`) . getList . subCorpus $ get row ]]) + +formatParagraph :: Paragraph -> SerialArrow n XmlTree +formatParagraph (Paragraph sents) = + mkelem "p" [attr "id" (txt . ('p':) $< next pId)] (formatSentence <$> sents) + +formatSentence :: Sent -> SerialArrow n XmlTree +formatSentence s = + selem "s" [txt sentences] + where + sentences = dropWhile (== '\n') $ printSent (s {_meta = []}) + +main :: IO () +main = getArgs >>= cli + where + withSerial = (`evalStateT` (0, 0)) + works = groupBy (work.get) + cli [sourceMeta, sourceRoot, targetRoot] = + readTSV sourceMeta + >>= (withSerial . mapM_ (sourceRoot `to` targetRoot) . works) . rows + cli _ = syntax "METADATA_CSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/conllu-to-txm.hs b/scripts/textometry/conllu-to-txm.hs similarity index 67% rename from scripts/conllu-to-txm.hs rename to scripts/textometry/conllu-to-txm.hs index 69266da9d81818a81107db434bce1358a95ef63a..298a203b6a062d5613242dad48610f9d41b6807a 100755 --- a/scripts/conllu-to-txm.hs +++ b/scripts/textometry/conllu-to-txm.hs @@ -1,17 +1,17 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE ExplicitNamespaces, NamedFieldPuns, OverloadedStrings #-} import Conllu.Parse (parseConllu) import Conllu.Type (AW, CW(..), ID(..), Sent(..)) import Control.Applicative ((<|>), liftA2) import GEODE.Metadata - ( Book, Contrastive(..), Entry(..), Has(..), MultiText(..), PrimaryKey(..) - , type(@), relativePath, groupBy, readNamedTsv, sortBy ) + ( type(@)(..), ArticleRecord(..), Contrastive(..), Document(..), Entry(..), Has(..) + , MultiText(..), ReadTSV(..), Record(..), Work, groupBy, sortBy ) import Data.Text as Text (breakOn, drop, pack, unpack, splitOn) import System.Environment (getArgs) import System.Exit (die) import System.FilePath ((</>), (<.>)) -import System.Script (syntax, try) +import System.Script (syntax) import Text.TEI ( corpusHeader, publicationStmt, sourceDesc, teiHeader, text, title ) import Text.XML.HXT.Core @@ -19,39 +19,39 @@ import Text.XML.HXT.Core , attr, constL, mkelem, mkText, runX, selem, txt, unlistA, withIndent , writeDocument ) -type Article = PrimaryKey @ Entry @ Contrastive +type Row = ArticleRecord @ Entry @ Contrastive -to :: FilePath -> FilePath -> (Book, [Article]) -> IO () -to source target (theBook, metadata) = - mapM_ runArrow . groupBy (tome.get) $ sortBy (rank.get) metadata +to :: FilePath -> FilePath -> (Work, [Row]) -> IO () +to source target (theWork, metadata) = + mapM_ runArrow . groupBy (volume.get) $ sortBy (article.get) metadata where - runArrow (tomeNumber, tomeArticles) = - runX $ compileTome source target (theBook, tomeNumber, tomeArticles) + runArrow (volumeNumber, articles) = + runX $ compileTome source target (theWork, volumeNumber, articles) compileTome :: - FilePath -> FilePath -> (Book, Int, [Article]) -> IOSArrow b XmlTree -compileTome source target (theBook, tomeNumber, metadata) = - tomeXml >>> writeDocument [withIndent True] output + FilePath -> FilePath -> (Work, Int, [Row]) -> IOSArrow b XmlTree +compileTome source target (theWork, volumeNumber, metadata) = + volumeXml >>> writeDocument [withIndent True] output where - strTome = show tomeNumber - output = target </> (show theBook) <> "_T" <> strTome <.> "xml" - tomeXml = selem "/" + strTome = show volumeNumber + output = target </> (show theWork) <> "_T" <> strTome <.> "xml" + volumeXml = selem "/" [ selem "teiCorpus" - [ corpusHeader theBook strTome + [ corpusHeader theWork strTome , (articleFrom source $< constL metadata) ] ] -articleFrom :: FilePath -> Article -> IOSArrow b XmlTree -articleFrom source article = +articleFrom :: FilePath -> Row -> IOSArrow b XmlTree +articleFrom source row = selem "TEI" [ teiHeader - (selem "titleStmt" [title . Text.unpack . headWord.get $ article]) + (selem "titleStmt" [title . Text.unpack . headword.get $ row]) (publicationStmt [ selem "p" [ txt "Annotated with Stanza by project GEODE" ] ]) (sourceDesc [ authorArrow >>> selem "author" [ mkText ] ]) - , text "article" article (loadConllu input) ] + , text "article" row (loadConllu input) ] where - input = source </> relativePath article "conllu" - authorArrow = constL (Text.unpack <$> (getList . authors $ get article)) + input = source </> relativePath (get row :: ArticleRecord) "conllu" + authorArrow = constL (Text.unpack <$> (getList . authors $ get row)) loadConllu :: FilePath -> IOSArrow b XmlTree loadConllu input = @@ -89,7 +89,7 @@ misc (CW {_misc}) = _misc >>= (fmap Text.unpack . getNer . Text.pack) main :: IO () main = getArgs >>= cli where - books = groupBy (book.get) + works = groupBy (work.get) cli [inputMeta, sourceRoot, targetRoot] = - try (readNamedTsv inputMeta) >>= mapM_ (sourceRoot `to` targetRoot) . books + readTSV inputMeta >>= mapM_ (sourceRoot `to` targetRoot) . works . rows cli _ = syntax "METADATA_TSV_FILE SOURCE_DIR TARGET_DIR" diff --git a/scripts/textometry/topNOUN.hs b/scripts/textometry/top-noun.hs similarity index 86% rename from scripts/textometry/topNOUN.hs rename to scripts/textometry/top-noun.hs index f2134f96a1c08b9ef62cae288446847e5087a97d..88bc3ece474cd830a1267a376e0a76e625e05033 100755 --- a/scripts/textometry/topNOUN.hs +++ b/scripts/textometry/top-noun.hs @@ -14,14 +14,13 @@ import Data.Maybe (listToMaybe) import Data.Serialize (decode) import Data.Tree (Tree(..)) import GEODE.Metadata - ( type(@), ArticleRecord, Record(..), WithDefaultHeader(..), glue - , readNamedTsv, tsvLines ) -import GEODE.Metadata.TSV.Header (for, getHeader) + ( type(@), ArticleRecord, Document(..), ReadTSV(..), Record(..) + , WithDefaultHeader(..), WriteTSV(..), for, getHeader, glue ) import Options.Applicative ( Parser, execParser, fullDesc, help, helper, info, long, metavar, progDesc , short, strOption ) import System.FilePath ((</>)) -import System.Script (try, warn) +import System.Script (warn) data Config = Config { inputRoot :: FilePath @@ -76,7 +75,7 @@ firstTopNOUN (IndexedSentence {_syntax}) = listToMaybe _syntax >>= fromTop main :: IO () main = getConfig >>= runReaderT chain where - chain = try (asks inputTsv >>= liftIO . readNamedTsv) >>= searchAndDisplay - searchAndDisplay rows = do - liftIO $ tsvLines [getHeader (for :: Result)] - mapM_ (\ar -> profile ar >>= liftIO . tsvLines . maybe [] (:[])) rows + chain = asks inputTsv >>= liftIO . readTSV >>= searchAndDisplay + searchAndDisplay (Document {rows}) = do + liftIO $ writeTSV () [getHeader (for :: Result)] + mapM_ (\ar -> profile ar >>= liftIO . writeTSV () . maybe [] (:[])) rows