Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env bash
#
# Usage: <script> METADATA SOURCE TARGET
#
#   METADATA  comma-separated metadata file; it is read from its second
#             line on (the first line is skipped).
#   SOURCE    directory holding the input "<file>.xml" documents.
#   TARGET    directory receiving the rewritten "<file>.xml" documents.
#
# bash is required: the main loop reads its input through process
# substitution ("< <(...)"), which a plain POSIX sh does not provide.

METADATA="${1}"
# A single trailing slash is dropped so that "${SOURCE}/name" and
# "${TARGET}/name" never contain a doubled separator.
SOURCE="${2%/}"
TARGET="${3%/}"
#######################################
# Classify a domain field and print the class name (no trailing newline).
#
# Arguments:
#   $1 - domain field: either a single domain, or several domains
#        separated by "|" (optionally followed by spaces).
# Outputs (stdout):
#   geography       - a single domain starting with "Géographie", or a
#                     list whose entries all start with "Géographie"
#   other           - a single domain that does not start with "Géographie"
#   some_geography  - a list with at least one entry that does not start
#                     with "Géographie"
#
# NOTE(review): a list containing no "Géographie" entry at all is also
# reported as some_geography; this is kept from the original logic --
# confirm whether "other" was intended for that case.
#######################################
getClass()
{
  # Single domain (no separator at all).
  case "${1}" in
    *'|'*) ;;
    Géographie*) printf 'geography'; return;;
    *) printf 'other'; return;;
  esac

  # List of domains. "$1" is reused as the not-yet-inspected remainder of
  # the list: positional parameters are private to the function, so no
  # global variable of the caller is modified.
  while :
  do
    # Inspect the entry in front of the first separator.
    case "${1%%|*}" in
      Géographie*) ;;
      *) printf 'some_geography'; return;;
    esac

    case "${1}" in
      *'|'*)
        # Drop the inspected entry and its separator, then any spaces that
        # follow it. Splitting on "|" alone guarantees progress even when
        # the separator is not followed by a space.
        set -- "${1#*|}"
        set -- "${1#"${1%%[! ]*}"}"
        ;;
      *)
        # Last entry reached and every entry matched.
        printf 'geography'
        return
        ;;
    esac
  done
}
#######################################
# Build one annotated XML document per metadata record.
#
# Each record is a comma-separated line whose first five fields are used as
# file name, tome, rank, head and domain; any further fields are ignored.
# Fields are split on every comma, so quoted fields containing commas are
# not supported.
#
# Arguments:
#   $1 - metadata file; its first line is skipped
#   $2 - directory holding the input "<file>.xml" documents
#   $3 - directory receiving the output "<file>.xml" documents
# Outputs:
#   For each record, writes "<$3>/<file>.xml" made of the first line of the
#   input document, a metadata header, and the input document from its
#   third line on (its second line is dropped). Warnings go to stderr.
# Returns:
#   Non-zero when the metadata file cannot be read.
# Requires:
#   bash (local, process substitution) and the getClass function.
#######################################
processMetadata()
{
  local metadata="${1}" src_dir="${2}" dst_dir="${3}"
  local line file tome rank head_field domain class input output

  if [ ! -r "${metadata}" ]
  then
    printf '%s: cannot read metadata file: %s\n' "${0##*/}" "${metadata}" >&2
    return 1
  fi

  # -r keeps backslashes in the fields intact; the "|| [ -n ... ]" part
  # makes sure a last record without a trailing newline is still processed.
  while read -r line || [ -n "${line}" ]
  do
    # Nothing to do for blank lines.
    [ -n "${line}" ] || continue

    # Peel the leading fields off the record, one comma at a time.
    file="${line%%,*}";       line="${line#*,}"
    tome="${line%%,*}";       line="${line#*,}"
    rank="${line%%,*}";       line="${line#*,}"
    head_field="${line%%,*}"; line="${line#*,}"
    domain="${line%%,*}"

    class="$(getClass "${domain}")"
    input="${src_dir}/${file}.xml"
    output="${dst_dir}/${file}.xml"

    # Do not leave a metadata-only output behind when the input is missing.
    if [ ! -r "${input}" ]
    then
      printf '%s: skipping %s: cannot read %s\n' "${0##*/}" "${file}" "${input}" >&2
      continue
    fi

    {
      head -n 1 -- "${input}"
      printf '%s\n' \
        "<corpus><doc><meta>" \
        "fileName ${file}.txt" \
        "tome ${tome}" \
        "rank ${rank}" \
        "head ${head_field}" \
        "domain ${domain}" \
        "class ${class}"
      tail -n +3 -- "${input}"
    } > "${output}"
  done < <(tail -n +2 -- "${metadata}")
}

processMetadata "${METADATA}" "${SOURCE}" "${TARGET}"