Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
outillage
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Alice Brenon
outillage
Commits
33d70476
Commit
33d70476
authored
1 year ago
by
Alice Brenon
Browse files
Options
Downloads
Patches
Plain Diff
Improving existing scripts for prodigy + add a new one to get TSV from the trained output
parent
4cf05a96
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
scripts/ML/prodigy-corpus.hs
+10
-11
10 additions, 11 deletions
scripts/ML/prodigy-corpus.hs
scripts/ML/prodigy-jsonl-to-tsv.hs
+57
-0
57 additions, 0 deletions
scripts/ML/prodigy-jsonl-to-tsv.hs
scripts/paragraphs.hs
+22
-14
22 additions, 14 deletions
scripts/paragraphs.hs
with
89 additions
and
25 deletions
scripts/ML/prodigy-corpus.hs
+
10
−
11
View file @
33d70476
#!/
usr
/
bin
/
env
-
S
runhaskell
--ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveGeneric
, ExplicitNamespaces, OverloadedStrings
#-}
import
Data.Aeson
(
ToJSON
(
..
),
defaultOptions
,
encode
,
genericToEncoding
)
import
Data.ByteString.Lazy.Char8
as
ByteString
(
putStrLn
)
import
Data.Text
(
Text
)
import
Data.Text.IO
as
Text
(
readFile
)
import
GEODE.Metadata
(
readNamedTsv
)
import
GEODE.Metadata.File
(
relativePath
)
import
GEODE.Metadata.PrimaryKey.Paragraph
(
ParagraphPK
)
import
GEODE.Metadata
(
type
(
@
)(
..
),
Record
(
..
),
readNamedTsv
)
import
GEODE.Metadata.ProdigyMeta
(
ParagraphMeta
)
import
GHC.Generics
(
Generic
)
import
System.Environment
(
getArgs
)
import
System.FilePath
((
</>
))
...
...
@@ -14,21 +13,21 @@ import System.Script (syntax, try)
data
Paragraph
=
Paragraph
{
text
::
Text
,
meta
::
Paragraph
PK
}
deriving
Generic
,
meta
::
Paragraph
Meta
}
deriving
Generic
instance
ToJSON
Paragraph
where
toEncoding
=
genericToEncoding
defaultOptions
loadParagraph
::
FilePath
->
Paragraph
PK
->
IO
Paragraph
loadParagraph
source
meta
=
do
text
<-
Text
.
readFile
(
source
</>
relativePath
meta
"txt"
)
loadParagraph
::
FilePath
->
Paragraph
Meta
->
IO
Paragraph
loadParagraph
source
meta
@
(
paragraphRecord
:@:
_
)
=
do
text
<-
Text
.
readFile
(
source
</>
relativePath
paragraphRecord
"txt"
)
pure
$
Paragraph
{
text
,
meta
}
main
::
IO
()
main
=
getArgs
>>=
run
where
run
[
inputMeta
,
source
]
=
try
(
readNamedTsv
inputMeta
)
>>=
mapM_
(
prodigyText
source
)
try
(
readNamedTsv
inputMeta
)
>>=
mapM_
(
toJSON
source
)
run
_
=
syntax
"INPUT_METADATA SOURCE_DIRECTORY"
prodigyText
source
p
K
=
loadParagraph
source
p
K
>>=
ByteString
.
putStrLn
.
encode
toJSON
source
p
arMeta
=
loadParagraph
source
p
arMeta
>>=
ByteString
.
putStrLn
.
encode
This diff is collapsed.
Click to expand it.
scripts/ML/prodigy-jsonl-to-tsv.hs
0 → 100755
+
57
−
0
View file @
33d70476
#!/
usr
/
bin
/
env
-
S
runhaskell
--ghc-arg="-Wall" --ghc-arg="-i lib" --ghc-arg="-fprint-potential-instances"
{-# LANGUAGE DeriveGeneric, ExplicitNamespaces, OverloadedStrings #-}
import
Data.Aeson
((
.:
),
FromJSON
(
..
),
Value
(
..
),
encode
,
withArray
,
withText
,
eitherDecode
)
import
Data.Aeson.Types
(
prependFailure
,
typeMismatch
)
import
Data.ByteString.Lazy
as
BS
(
null
,
readFile
,
split
)
import
Data.ByteString.Lazy.Char8
as
BS
(
unpack
)
import
Data.Csv
(
ToNamedRecord
(
..
))
import
Data.Text
(
Text
)
import
Data.Vector
as
Vector
(
head
)
import
GEODE.Metadata
(
type
(
@
)(
..
),
DefaultFields
(
..
),
HasDefaultHeader
(
..
),
tsvFile
)
import
GEODE.Metadata.ProdigyMeta
(
ParagraphMeta
)
import
GHC.Generics
(
Generic
)
import
System.Environment
(
getArgs
)
import
System.Script
(
try
,
syntax
,
warn
)
newtype
Classification
=
Classification
{
paragraphFunction
::
Text
}
deriving
Generic
instance
ToNamedRecord
Classification
instance
HasDefaultHeader
Classification
where
defaultFields
=
DefaultFields
[
"paragraphFunction"
]
type
ClassifiedParagraph
=
ParagraphMeta
@
Classification
instance
{-# OVERLAPS #-}
FromJSON
(
Either
String
ClassifiedParagraph
)
where
parseJSON
o
@
(
Object
v
)
=
do
paragraphMeta
<-
v
.:
"meta"
>>=
parseJSON
(
fmap
(
paragraphMeta
:@:
))
<$>
(
v
.:
"accept"
>>=
parseClassification
)
where
parseClassification
=
withArray
"Classification"
singleValue
singleValue
a
|
not
$
Prelude
.
null
a
=
withText
"domain"
(
pure
.
Right
.
Classification
)
(
Vector
.
head
a
)
singleValue
_
=
pure
$
Left
(
"Looks like "
++
debug
++
" was not classified, ignoring for now"
)
debug
=
BS
.
unpack
$
encode
o
parseJSON
invalid
=
prependFailure
"parsing ClassifiedParagraph failed, "
(
typeMismatch
"Object"
invalid
)
logIgnored
::
[
Either
String
a
]
->
IO
[
a
]
logIgnored
=
foldr
keepRight
(
pure
[]
)
where
keepRight
(
Left
message
)
acc
=
warn
message
*>
acc
keepRight
(
Right
a
)
acc
=
(
a
:
)
<$>
acc
main
::
IO
()
main
=
getArgs
>>=
run
where
run
[
inputJSONL
,
outputTSV
]
=
try
(
jsonl
<$>
BS
.
readFile
inputJSONL
)
>>=
logIgnored
>>=
(
tsvFile
outputTSV
::
[
ClassifiedParagraph
]
->
IO
()
)
run
_
=
syntax
"INPUT_JSONL OUTPUT_TSV"
newline
=
10
jsonl
=
mapM
eitherDecode
.
filter
(
not
.
BS
.
null
)
.
BS
.
split
newline
This diff is collapsed.
Click to expand it.
scripts/paragraphs.hs
+
22
−
14
View file @
33d70476
#!/
usr
/
bin
/
env
-
S
runhaskell
--ghc-arg="-Wall" --ghc-arg="-i lib"
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE
ExplicitNamespaces,
OverloadedStrings #-}
import
Data.List
(
dropWhileEnd
)
import
Data.Text
(
Text
,
splitOn
)
import
Data.Text.IO
as
Text
(
readFile
,
writeFile
)
import
GEODE.Metadata.File
as
File
(
File
(
..
))
import
GEODE.Metadata
as
Article
(
PrimaryKey
,
readNamedTsv
,
relativePath
,
tsvFile
)
import
GEODE.Metadata.PrimaryKey.Paragraph
(
ParagraphPK
(
..
))
--import GEODE.Metadata.File as File (File(..))
import
GEODE.Metadata
as
Article
(
type
(
@
)(
..
),
ArticleRecord
,
Entry
(
..
),
Record
(
..
),
readNamedTsv
,
relativePath
,
tsvFile
)
import
GEODE.Metadata.ParagraphRecord
(
Paragraph
(
..
))
import
GEODE.Metadata.ProdigyMeta
as
Prodigy
(
ParagraphMeta
,
ProdigyMeta
(
..
))
import
System.Directory
(
createDirectoryIfMissing
)
import
System.Environment
(
getArgs
)
import
System.FilePath
((
</>
),
(
<.>
),
isPathSeparator
,
takeDirectory
)
...
...
@@ -17,17 +18,24 @@ articleParagraphs :: FilePath -> IO [Text]
articleParagraphs
=
fmap
(
fmap
(
leave
.
linearize
False
.
enter
)
.
splitOn
"
\n\n
"
)
.
Text
.
readFile
to
::
FilePath
->
FilePath
->
PrimaryKey
->
IO
[
ParagraphPK
]
to
source
target
article
=
do
createDirectoryIfMissing
True
(
target
</>
Article
.
relativePath
article
""
)
articleParagraphs
articlePath
>>=
mapM
create
.
number
withMeta
::
ArticleRecord
@
Entry
->
[
Text
]
->
[(
ParagraphMeta
,
Text
)]
withMeta
(
articleRecord
:@:
entry
)
paragraphs
=
zipWith
f
[
1
..
]
paragraphs
where
articlePath
=
source
</>
Article
.
relativePath
article
"txt"
number
=
zip
[
1
..
]
create
(
rank
,
paragraphText
)
=
let
paragraphPK
=
ParagraphPK
{
article
,
rank
}
outputPath
=
target
</>
File
.
relativePath
paragraphPK
"txt"
in
paragraphPK
<$
Text
.
writeFile
outputPath
paragraphText
prodigyMeta
=
ProdigyMeta
{
totalParagraphs
=
length
paragraphs
,
Prodigy
.
headword
=
Article
.
headword
entry
}
f
paragraph
paragraphText
=
(
articleRecord
:@:
Paragraph
{
paragraph
}
:@:
prodigyMeta
,
paragraphText
)
to
::
FilePath
->
FilePath
->
ArticleRecord
@
Entry
->
IO
[
ParagraphMeta
]
to
source
target
meta
@
(
articleRecord
:@:
_
)
=
do
createDirectoryIfMissing
True
(
target
</>
relativePath
articleRecord
""
)
articleParagraphs
articlePath
>>=
mapM
create
.
withMeta
meta
where
articlePath
=
source
</>
relativePath
articleRecord
"txt"
create
(
paragraphMeta
@
(
paragraphRecord
:@:
_
),
paragraphText
)
=
let
outputPath
=
target
</>
relativePath
paragraphRecord
"txt"
in
paragraphMeta
<$
Text
.
writeFile
outputPath
paragraphText
main
::
IO
()
main
=
(
fmap
(
dropWhileEnd
isPathSeparator
)
<$>
getArgs
)
>>=
run
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment