From eee529601791c604a7a4b73d8d6005d44c8cf87a Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Mon, 4 Sep 2023 20:06:31 +0200
Subject: [PATCH] Add source extractor for LGE using the newest syntax for
 ProcessingLGE and soprano

---
 scripts/LGE/extract-from-source.sh | 42 ++++++++++++++++++++++++++++++
 scripts/extract-from-source.sh     |  4 ++--
 2 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100755 scripts/LGE/extract-from-source.sh

diff --git a/scripts/LGE/extract-from-source.sh b/scripts/LGE/extract-from-source.sh
new file mode 100755
index 0000000..e0af410
--- /dev/null
+++ b/scripts/LGE/extract-from-source.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# Source extractor for LGE.
+#
+# Usage: extract-from-source.sh SOURCE_DIRECTORY TARGET_DIRECTORY
+#
+# Each tome (T1 to T31) is processed from within SOURCE_DIRECTORY by
+# LGEprepareVolume.sh then LGEencode.sh; what LGEencode.sh prints on its
+# standard error is gathered in /tmp/LGE.log.
+
+# "source" and brace ranges are bashisms: stick to POSIX constructs so the
+# script keeps working when /bin/sh is not bash.
+# die and PROG_NAME are presumably provided by lib.sh (not visible here).
+. "${0%/*}/../lib.sh"
+
+if [ "$#" != 2 ]
+then
+	die "${PROG_NAME} SOURCE_DIRECTORY TARGET_DIRECTORY"
+else
+	SOURCE="${1}"
+	TARGET="${2}"
+	[ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing one ALTO/LGE/T<T> directory with one ALTO-XML file per page)"
+	[ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)"
+fi
+
+# The working directory changes below: make TARGET absolute first so that a
+# relative path keeps designating the directory that was just checked.
+TARGET="$(cd "${TARGET}" && pwd)" || die "Could not resolve TARGET_DIRECTORY"
+
+LOG=/tmp/LGE.log
+# Empty the log once, then append: redirecting with ">" inside the loop only
+# kept the messages of the last tome.
+: > "${LOG}"
+
+cd "${SOURCE}" || die "Could not enter ${SOURCE}"
+T=1
+while [ "${T}" -le 31 ]
+do
+	LGEprepareVolume.sh "${T}" "${TARGET}" ALTO
+	LGEencode.sh "${TARGET}/ALTO/LGE/T${T}" -k Text --metadata --text-root "${TARGET}/Text" 2>> "${LOG}"
+	T=$((T + 1))
+done
diff --git a/scripts/extract-from-source.sh b/scripts/extract-from-source.sh
index 0275b82..ba74ce0 100755
--- a/scripts/extract-from-source.sh
+++ b/scripts/extract-from-source.sh
@@ -10,11 +10,11 @@ then
 else
 	SOURCE="${1}"
 	TARGET="${2}"
-	[ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing 1 .tei file per tome)"
+	[ -d "${SOURCE}" ] || die "SOURCE_DIRECTORY must be a directory (containing a folder for EDdA and one for LGE with the structure found in Source.squashfs)"
 	[ -d "${TARGET}" ] || die "TARGET_DIRECTORY must be a directory (where output will be generated)"
 fi
 
 FILES_TSV="${TARGET}/files.tsv"
 printf "book	tome	rank	headWord	name	page\n" > "${FILES_TSV}"
 ${BASE_DIR}/EDdA/extract-from-source.sh "${SOURCE}/EDdA/ARTFL" ${TARGET} >> "${FILES_TSV}"
-#${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
+${BASE_DIR}/LGE/extract-from-source.sh "${SOURCE}/LGE/BnF" ${TARGET} >> "${FILES_TSV}"
-- 
GitLab