diff --git a/.gitignore b/.gitignore index 0d20b6487c61e7d1bde93acf4a14b7a89083a16d..f082d1c69a0271087d6f7cf35a40f051f45fc40d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.pyc +*.o +*.hi diff --git a/lib/Conllu/Parse/Paragraph.hs b/lib/haskell/Conllu/Parse/Paragraph.hs similarity index 100% rename from lib/Conllu/Parse/Paragraph.hs rename to lib/haskell/Conllu/Parse/Paragraph.hs diff --git a/lib/Data/BIO.hs b/lib/haskell/Data/BIO.hs similarity index 100% rename from lib/Data/BIO.hs rename to lib/haskell/Data/BIO.hs diff --git a/lib/Data/Csv/DB.hs b/lib/haskell/Data/Csv/DB.hs similarity index 100% rename from lib/Data/Csv/DB.hs rename to lib/haskell/Data/Csv/DB.hs diff --git a/lib/Data/Metadata.hs b/lib/haskell/Data/Metadata.hs similarity index 100% rename from lib/Data/Metadata.hs rename to lib/haskell/Data/Metadata.hs diff --git a/lib/Data/Metadata/Article.hs b/lib/haskell/Data/Metadata/Article.hs similarity index 100% rename from lib/Data/Metadata/Article.hs rename to lib/haskell/Data/Metadata/Article.hs diff --git a/lib/Data/Metadata/Paragraph.hs b/lib/haskell/Data/Metadata/Paragraph.hs similarity index 100% rename from lib/Data/Metadata/Paragraph.hs rename to lib/haskell/Data/Metadata/Paragraph.hs diff --git a/lib/Data/Metadata/Projector.hs b/lib/haskell/Data/Metadata/Projector.hs similarity index 100% rename from lib/Data/Metadata/Projector.hs rename to lib/haskell/Data/Metadata/Projector.hs diff --git a/lib/Data/Metadata/Trie.hs b/lib/haskell/Data/Metadata/Trie.hs similarity index 100% rename from lib/Data/Metadata/Trie.hs rename to lib/haskell/Data/Metadata/Trie.hs diff --git a/lib/Data/Metadata/Types.hs b/lib/haskell/Data/Metadata/Types.hs similarity index 100% rename from lib/Data/Metadata/Types.hs rename to lib/haskell/Data/Metadata/Types.hs diff --git a/lib/Data/SpaCy.hs b/lib/haskell/Data/SpaCy.hs similarity index 100% rename from lib/Data/SpaCy.hs rename to lib/haskell/Data/SpaCy.hs diff --git a/lib/GEODE/Metadata/File.hs b/lib/haskell/GEODE/Metadata/File.hs similarity index 100% rename from lib/GEODE/Metadata/File.hs rename to lib/haskell/GEODE/Metadata/File.hs diff --git a/lib/GEODE/Metadata/ParagraphRecord.hs b/lib/haskell/GEODE/Metadata/ParagraphRecord.hs similarity index 100% rename from lib/GEODE/Metadata/ParagraphRecord.hs rename to lib/haskell/GEODE/Metadata/ParagraphRecord.hs diff --git a/lib/GEODE/Metadata/ProdigyMeta.hs b/lib/haskell/GEODE/Metadata/ProdigyMeta.hs similarity index 100% rename from lib/GEODE/Metadata/ProdigyMeta.hs rename to lib/haskell/GEODE/Metadata/ProdigyMeta.hs diff --git a/lib/System/Script.hs b/lib/haskell/System/Script.hs similarity index 100% rename from lib/System/Script.hs rename to lib/haskell/System/Script.hs diff --git a/lib/Text/Filter.hs b/lib/haskell/Text/Filter.hs similarity index 100% rename from lib/Text/Filter.hs rename to lib/haskell/Text/Filter.hs diff --git a/lib/Text/Filter/Linearize.hs b/lib/haskell/Text/Filter/Linearize.hs similarity index 100% rename from lib/Text/Filter/Linearize.hs rename to lib/haskell/Text/Filter/Linearize.hs diff --git a/lib/Text/TEI.hs b/lib/haskell/Text/TEI.hs similarity index 100% rename from lib/Text/TEI.hs rename to lib/haskell/Text/TEI.hs diff --git a/scripts/linearize.hs b/scripts/linearize.hs index c0b029fe0aefdaed71991f2190b86f3c90a0931a..e589ea5e8449878048513cafe8c96c30a82dfe8e 100755 --- a/scripts/linearize.hs +++ b/scripts/linearize.hs @@ -1,4 +1,4 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" import System.Environment (getArgs) import System.Script (syntax) diff --git a/scripts/merge.hs b/scripts/merge.hs index a625ef5b615dd7729343914b813c4f92710cdbe3..b42440c1704bfd3e61224562127d1fe4adf8f1c4 100755 --- a/scripts/merge.hs +++ b/scripts/merge.hs @@ -1,4 +1,4 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings, ScopedTypeVariables #-} import Control.Monad.Except (ExceptT(..), runExceptT) diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs index 2a4d0e041710f503b822e366c65a5a7021382831..03597cdbb80562c1c24f2bb4f47d5e8508ab762a 100755 --- a/scripts/paragraphs.hs +++ b/scripts/paragraphs.hs @@ -1,9 +1,10 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} import Data.List (dropWhileEnd) import Data.Text (Text, splitOn) import Data.Text.IO as Text (readFile, writeFile) -import GEODE.Metadata as Article (type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, relativePath, tsvFile) +import GEODE.Metadata as Article + ( type (@)(..), ArticleRecord, Entry(..), Record(..), readNamedTsv, tsvFile ) import GEODE.Metadata.ParagraphRecord (Paragraph(..)) import GEODE.Metadata.ProdigyMeta as Prodigy (ParagraphMeta, ProdigyMeta(..)) import System.Directory (createDirectoryIfMissing) diff --git a/scripts/parallel-links.hs b/scripts/parallel-links.hs index 3243c5a4285d3b5454c39c83fa4e523f0f47f0f4..9a7b85e338ffce59243ac505c4c7723c72805347 100755 --- a/scripts/parallel-links.hs +++ b/scripts/parallel-links.hs @@ -1,24 +1,24 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE ExplicitNamespaces #-} module Main where import Data.Char as Text (toLower) import Data.Text as Text (map) import GEODE.Metadata - ( Entry(..), Has(..), PrimaryKey(..), type(@), groupBy, readNamedTsv, tsvFile ) + ( Entry(..), Has(..), ArticleRecord(..), type(@), groupBy, readNamedTsv, tsvFile ) import System.Environment (getArgs) import System.Exit (die) import System.Script (syntax) -type Line = PrimaryKey @ Entry +type Line = ArticleRecord @ Entry -findPairs :: Foldable t => Bool -> t Line -> [Line] -findPairs caseInsensitive = - concatMap snd . filter isPair . groupBy (normalize . headWord . get) +findDiachronicPairs :: Foldable t => Bool -> t Line -> [Line] +findDiachronicPairs caseInsensitive = + concatMap snd . filter isPair . groupBy (normalize . headword . get) where - isPair = oneInEach . groupBy (book.get) . snd - oneInEach = (&&) <$> bothBooks <*> oneByBook - bothBooks = ((2 ==) . length) + isPair = oneInEach . groupBy (work.get) . snd + oneInEach = (&&) <$> twoBooks <*> oneByBook + twoBooks = ((2 ==) . length) oneByBook = all ((1 ==) . length . snd) normalize | caseInsensitive = Text.map toLower @@ -27,10 +27,9 @@ findPairs caseInsensitive = main :: IO () main = getArgs >>= popCaseInsensitive run where - run caseInsensitive [input, output] = readNamedTsv input - >>= either - die - (tsvFile output . findPairs caseInsensitive) + run caseInsensitive [input, output] = + readNamedTsv input + >>= either die (tsvFile output . findDiachronicPairs caseInsensitive) run _ _ = syntax "[-i] SOURCE_METADATA.tsv TARGET_METADATA.tsv" popCaseInsensitive f ("-i":args) = f True args popCaseInsensitive f args = f False args diff --git a/scripts/reroot.hs b/scripts/reroot.hs index 1b67fc31c5adb76a670b4a2d5dac322b29f05337..502d8962398fc8677cca91686c9f82e0d56e702d 100755 --- a/scripts/reroot.hs +++ b/scripts/reroot.hs @@ -1,19 +1,21 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE OverloadedStrings #-} -import Data.Text (Text, lines, unlines) +import Data.Text as Text (Text, cons, lines, pack, unlines) import Data.Text.IO (interact) import Prelude hiding (interact, lines, unlines) import System.Environment (getArgs) import System.Script (syntax) import Text.Filter (xargs) -wrap :: [Text] -> [Text] -wrap text = "<body>": (("\t"<>) <$> text) ++ ["</body>"] +wrap :: String -> [Text] -> [Text] +wrap name text = (mkTag False): (("\t"<>) <$> text) ++ [mkTag True] + where + mkTag close = "<" <> (if close then Text.cons '/' else id) (pack name) <> ">" main :: IO () main = getArgs >>= run where - run ["-"] = interact (unlines . wrap . lines) - run [target] = xargs (pure.wrap) target + run [tagName, "-"] = interact (unlines . wrap tagName . lines) + run [tagName, target] = xargs (pure.wrap tagName) target run _ = syntax "TARGET_DIRECTORY" diff --git a/scripts/sample.hs b/scripts/sample.hs new file mode 100755 index 0000000000000000000000000000000000000000..50a54ffbe315e4519548175060d470f847593d49 --- /dev/null +++ b/scripts/sample.hs @@ -0,0 +1,43 @@ +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" +import Data.IntSet as IntSet (IntSet, delete, fromList, insert, member, toList) +import Data.Text (Text) +import Data.Vector as Vector ((!), Vector, fromList) +import GEODE.Metadata (readTsv, tsvLines) +import System.Environment (getArgs) +import System.Random (randomRIO) +import System.Script (syntax, try) + +sampleIndices :: Int -> (Int, Int) -> IO IntSet +sampleIndices count (from, to) + | count >= size = pure $ everything + | otherwise = uncurry build $ + if adding then (count, mempty) else (size-count, everything) + where + size = to - from + 1 + adding = count <= size `div` 2 + everything = IntSet.fromList [from..to] + pickNewOne set = do + index <- randomRIO (from, to) + if index `member` set + then (if adding then pickNewOne set else pure (delete index set)) + else (if adding then pure (insert index set) else pickNewOne set) + build 0 result = pure result + build n set = build (n-1) =<< pickNewOne set + +sample :: Int -> Vector a -> IO (Vector a) +sample count rows = Vector.fromList <$> sampledList + where + sampledList + | null rows = pure [] + | otherwise = do + selected <- sampleIndices count (1, length rows-1) + pure $ (rows!) <$> (0:IntSet.toList selected) + +main :: IO () +main = getArgs >>= run + where + run [size, inputMetadata] = + (try (readTsv inputMetadata) :: IO (Vector [Text])) + >>= sample (read size) + >>= tsvLines + run _ = syntax "SIZE INPUT_METADATA" diff --git a/scripts/select.hs b/scripts/select.hs index fe4b70c9fa5c1f62381de09968aa1e83cabebe32..d9b313e68483015032bf5207341b660e7ba1eec6 100755 --- a/scripts/select.hs +++ b/scripts/select.hs @@ -1,4 +1,4 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" {-# LANGUAGE ExplicitNamespaces, OverloadedStrings #-} module Main where diff --git a/scripts/subcorpus/get-files.hs b/scripts/subcorpus/get-files.hs index a034604e4dcca1e74a32904330ec3df32b2bea53..1b0167d6dfdb937b56059bfc0cf3aa777c551173 100755 --- a/scripts/subcorpus/get-files.hs +++ b/scripts/subcorpus/get-files.hs @@ -1,4 +1,4 @@ -#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib" +#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib/haskell" import Data.Csv (FromNamedRecord) import Data.Vector (Vector)