#!/usr/bin/env bash
# extract-corpus.sh

# Assemble a corpus directory from a CSV metadata file.
#
# Usage: extract-corpus.sh INPUT_PATH SOURCE_TEXT_ARTICLES OUTPUT
#   INPUT_PATH            CSV metadata file. The first line is skipped as a
#                         header; on every other line the first field is
#                         ignored and the second comma-separated field is
#                         taken as a relative path.
#   SOURCE_TEXT_ARTICLES  Directory holding the source article files.
#   OUTPUT                Directory to create. An existing directory with that
#                         name is first renamed to OUTPUT.N, N being the
#                         first unused positive integer.
#
# Requires bash (process substitution, arrays).

# Rename an existing output directory $1 to "$1.N" so it is never overwritten.
rotate_existing_output() {
	local output="$1"
	local n=1

	[[ -d "${output}" ]] || return 0
	# -e rather than -d: a plain file named OUTPUT.N must block that slot too.
	while [[ -e "${output}.${n}" ]]; do
		n=$((n + 1))
	done
	mv -- "${output}" "${output}.${n}"
}

# Copy the files listed in metadata CSV $1 from source directory $2 into
# working directory $3. A row whose files are missing or cannot be copied
# produces a warning on stderr and is skipped; processing continues.
copy_listed_articles() {
	local metadata="$1"
	local source_dir="$2"
	local workdir="$3"
	local line relative_path source_prefix dest
	local -a matches

	# "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
	while IFS= read -r line || [[ -n "${line}" ]]; do
		line="${line#*,}"              # drop the first field
		relative_path="${line%%,*}"    # keep the second field only
		# An empty path would turn the pattern below into "dir/.*".
		[[ -n "${relative_path}" ]] || continue

		# The source location is the relative path minus its first component.
		source_prefix="${source_dir}/${relative_path#*/}"
		# The glob must stay OUTSIDE the quotes, otherwise it never expands.
		matches=("${source_prefix}."*)
		if [[ ! -e "${matches[0]}" ]]; then
			printf 'warning: no source file matches %s.*\n' "${source_prefix}" >&2
			continue
		fi

		# NOTE(review): the destination is created as a directory because the
		# glob can match several files, and cp requires a directory target in
		# that case -- confirm this is the intended output layout.
		dest="${workdir}/${relative_path}"
		if ! mkdir -p -- "${dest}" || ! cp -- "${matches[@]}" "${dest}"; then
			printf 'warning: could not copy %s.* to %s\n' "${source_prefix}" "${dest}" >&2
		fi
	done < <(tail -n +2 -- "${metadata}")
}

# Entry point. Returns 2 on bad usage, 1 on failure, 0 on success.
main() {
	local input_path="${1:-}"
	local source_dir="${2:-}"
	local output="${3:-}"
	local workdir

	if [[ -z "${input_path}" || -z "${source_dir}" || -z "${output}" ]]; then
		printf 'Usage: %s INPUT_PATH SOURCE_TEXT_ARTICLES OUTPUT\n' "${0##*/}" >&2
		return 2
	fi
	# Validate inputs BEFORE touching an existing output directory.
	if [[ ! -r "${input_path}" ]]; then
		printf 'error: cannot read metadata file: %s\n' "${input_path}" >&2
		return 1
	fi
	if [[ ! -d "${source_dir}" ]]; then
		printf 'error: source directory not found: %s\n' "${source_dir}" >&2
		return 1
	fi

	rotate_existing_output "${output}" || return 1

	# Build the result in a scratch directory and move it into place at the
	# end, so OUTPUT never appears half-populated.
	workdir=$(mktemp -d /tmp/parallel-corpus.XXXXXX) || return 1

	copy_listed_articles "${input_path}" "${source_dir}" "${workdir}"

	if ! mv -- "${workdir}" "${output}"; then
		rm -rf -- "${workdir:?}"
		return 1
	fi
	return 0
}

main "$@"