Skip to content
Snippets Groups Projects
Commit 9abfcf76 authored by Alice Brenon's avatar Alice Brenon
Browse files

First draft building from memories of Mainate

parents
No related branches found
No related tags found
No related merge requests found
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module NLP.Attempts (
) where
import Control.Monad.Identity (Identity(..))
import Control.Monad.State (MonadState(..), State, modify, evalState)
import Data.Char (isPunctuation, isSpace)
import Data.Map as Map (Map, (!?), foldrWithKey, fromList, insertWith, member)
import qualified Data.Map as Map (empty)
import Data.Set as Set (fromList, insert, member)
import qualified Data.Set as Set (empty)
import System.IO (Handle, hPutStrLn)
import Text.Printf (printf)
newtype Stream a = Stream {
unStream :: [(a, Stream a)]
} deriving Show
empty :: Stream a
empty = Stream []
isEmpty :: Stream a -> Bool
isEmpty (Stream l) = null l
cons :: a -> Stream a -> Stream a
cons a s = Stream [(a, s)]
parallel :: Stream a -> Stream a -> Stream a
parallel (Stream a) (Stream b) = Stream $ a ++ b
parallelList :: [Stream a] -> Stream a
parallelList = foldr parallel empty
single :: a -> Stream a
single a = cons a empty
ofList :: [a] -> Stream a
ofList = foldr cons empty
where
cons a s = Stream [(a, s)]
keepAll :: (a -> Bool) -> Stream a -> Stream a
keepAll predicate (Stream l) = parallelList $ recFilter <$> l
where
recFilter (c, s)
| predicate c = Stream [(c, keepAll predicate s)]
| otherwise = keepAll predicate s
sendOver :: Show a => Handle -> Stream a -> IO ()
sendOver handle (Stream l) = mapM_ f l
where
f (a, s) = hPutStrLn handle (show a) *> sendOver handle s
fixOCR :: Stream Char -> Stream Char
fixOCR (Stream l) = parallelList $ variants <$> l
where
variants :: (Char, Stream Char) -> Stream Char
variants (c, s) = Stream [(v, fixOCR s) | v <- similar c]
similar 'a' = "ao"
similar 'o' = "ao"
similar 'i' = "i1l"
similar 'l' = "i1l"
similar '1' = "i1l"
similar c = [c]
eatSpaces :: Stream Char -> Stream Char
eatSpaces = keepAll (not . isSpace)
data Layer m a b = Layer {
delta :: (a, Stream a) -> m (Stream b)
, clean :: m (Maybe b)
}
run :: Monad m => Layer m a b -> Stream a -> m (Stream b)
run layer (Stream []) =
Stream . maybe [] (\flushed -> [(flushed, empty)]) <$> clean layer
run layer (Stream l) = parallelList <$> mapM (delta layer) l
push :: a -> State [a] ()
push = modify . (:)
flushWith :: Monoid a => (a -> b) -> State a b
flushWith f = state $ \l -> (f l, mempty)
saveAndRestore :: State s a -> State s a
saveAndRestore p = do
mem <- get
p <* put mem
tokenizer :: Stream Char -> Stream String
tokenizer input = evalState (f input) ""
where
f = run tokenize
tokenize :: Layer (State String) Char String
tokenize = Layer { delta = saveAndRestore . tryOne, clean }
clean = do
lastWord <- flush
pure $ if null lastWord then Nothing else Just lastWord
flush :: State String String
flush = flushWith reverse
tryOne :: (Char, Stream Char) -> State String (Stream String)
tryOne (c, s)
| isSpace c = cons <$> flush <*> f s
| isDash c = parallelList <$> mapM saveAndRestore [
(cons <$> flush <*> (cons "-" <$> f s))
, (push '-' *> f s)
, f s
]
| isPunctuation c = cons <$> flush <*> (cons [c] <$> f s)
| otherwise = push c *> f s
isDash = (`elem` "-—¬")
bruteTokenizer :: Stream Char -> Stream String
bruteTokenizer input = evalState (f input) ""
where
f = run tokenize
tokenize :: Layer (State String) Char String
tokenize = Layer { delta = saveAndRestore . tryOne, clean }
clean = do
lastWord <- flush
pure $ if null lastWord then Nothing else Just lastWord
flush :: State String String
flush = flushWith reverse
tryOne :: (Char, Stream Char) -> State String (Stream String)
tryOne (c, s)
| isSpace c = f s
| isDash c = parallelList <$> mapM saveAndRestore [
(cons <$> flush <*> (cons "-" <$> f s))
, (push '-' *> f s)
, f s
]
| otherwise = parallelList <$> mapM saveAndRestore [
(cons <$> flush <*> (push c *> f s))
, (push c *> f s)
]
isDash = (`elem` "-—¬")
frenchWords :: Stream String -> Stream String
frenchWords = runIdentity . filter
where
filter = run $ Layer {delta, clean = pure Nothing}
delta :: (String, Stream String) -> Identity (Stream String)
delta (w, s)
| Set.member w known = cons w <$> filter s
| otherwise = pure empty
known = foldrWithKey (\k _ -> Set.insert k) Set.empty dico
newtype Form = Form {
unForm :: String
}
data POS = N | Adj | V | D | Pct | Adv | Pr deriving Show
newtype Lemma = Lemma {
unLemma :: String
}
data LexicalUnit = LexicalUnit {
form :: Form
, pos :: POS
, lemma :: Lemma
}
instance Show Form where
show (Form s) = s
instance Show Lemma where
show (Lemma s) = s
instance Show LexicalUnit where
show (LexicalUnit {form, pos, lemma}) =
printf "%s (%s, %s)" (show form) (show pos) (show lemma)
dico :: Map String [LexicalUnit]
dico = indexBy (unForm . form) [
LexicalUnit (Form "le") D (Lemma "LE")
, LexicalUnit (Form "la") D (Lemma "LE")
, LexicalUnit (Form "chat") N (Lemma "CHAT")
, LexicalUnit (Form "petit") Adj (Lemma "PETIT")
, LexicalUnit (Form "raton-laveur") N (Lemma "RATON-LAVEUR")
, LexicalUnit (Form "peut") V (Lemma "POUVOIR")
, LexicalUnit (Form "-") Pct (Lemma "-")
, LexicalUnit (Form "être") V (Lemma "ÊTRE")
, LexicalUnit (Form "peut-être") Adv (Lemma "PEUT-ÊTRE")
, LexicalUnit (Form "il") Pr (Lemma "IL")
, LexicalUnit (Form "marche") N (Lemma "MARCHE")
, LexicalUnit (Form "marche") V (Lemma "MARCHER")
]
indexBy :: Ord k => (o -> k) -> [o] -> Map k [o]
indexBy key = foldr (\o -> insertWith (++) (key o) [o]) Map.empty
annotate :: Stream String -> Stream LexicalUnit
annotate = runIdentity . filter
where
filter = run $ Layer {delta, clean = pure Nothing}
delta :: (String, Stream String) -> Identity (Stream LexicalUnit)
delta (w, s) =
case dico !? w of
Just lexicalUnits ->
parallelList <$> mapM (\lU -> cons lU <$> filter s) lexicalUnits
_ -> pure empty
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment