@@ -6,35 +6,44 @@ cat $1 | wordseg-prep -u phone --gold gold.txt > prepared.txt
6
6
# compute statistics on the tokenized input text
7
7
cat $1 | wordseg-stats --json > stats.json
8
8
9
+ # display the statistics computed on the input text
10
+ echo " STATISTICS"
11
+ echo " =========="
12
+ echo
13
+
14
+ cat stats.json
15
+
16
+ echo
17
+ echo " TUTORIAL PART 1 (no training)"
18
+ echo " ============================="
19
+
20
+
9
21
# segment the prepared text with different algorithms (we show a few
10
22
# options for them, use --help to list all of them)
11
- # cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
12
- # cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
13
- # cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
14
- # cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
15
- # cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 > segmented.ag.txt
23
+ cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
24
+ cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
25
+ cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
26
+ cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
27
+ cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 > segmented.ag.txt
16
28
17
- # dibs must be provided with a training file
18
- # cat prepared.txt | wordseg-dibs -t gold $1 > segmented.dibs.txt
29
+ # dibs must be provided with word boundaries to do some preliminary training.
30
+ # Boundaries are then removed to generate the text to segment (as with
31
+ # wordseg-prep).
32
+ cat $1 | wordseg-dibs -t gold > segmented.dibs.txt
19
33
20
34
# evaluate them against the gold file
21
35
for algo in baseline tp puddle dpseg dibs ag
22
36
do
23
37
cat segmented.$algo.txt | wordseg-eval gold.txt -r prepared.txt > eval.$algo.txt
24
38
done
25
39
26
- # display the statistics computed on the input text
27
- echo " * Statistics"
28
- echo
29
- cat stats.json
30
40
31
41
# concatenate the evaluations in a table
32
- echo
33
- echo " * Evaluation"
42
+
34
43
echo
35
44
(
36
45
echo " score baseline tp puddle dpseg ag dibs"
37
- echo " ------------------ ------- ------- ------- ------- -------"
46
+ echo " ------------------ ------- ------- ------- ------- ------- ------- "
38
47
for i in $( seq 1 13)
39
48
do
40
49
awk -v i=$i ' NR==i {printf $0}; END {printf " "}' eval.baseline.txt
47
56
) | column -t
48
57
49
58
50
- # # REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
59
+ # ## REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
60
+ echo
61
+ echo
62
+ echo " TUTORIAL PART 2 (train on 80% of data)"
63
+ echo " ======================================"
64
+
51
65
52
66
# split the file into 80/20
53
- csplit $1 $(( $(wc -l < $1) * 8 / 10 + 1 ))
67
+ csplit --quiet $1 $(( $(wc -l < $1) * 8 / 10 + 1 ))
54
68
mv xx00 train_tagged.txt
55
69
mv xx01 test_tagged.txt
56
70
@@ -61,10 +75,10 @@ cat test_tagged.txt | wordseg-prep -u phone --gold gold_test.txt > prepared_test
61
75
# segment the prepared text with different algorithms -- NOTE train/test implemented for the following
62
76
cat prepared_test.txt | wordseg-tp -d ftp -t relative -T prepared_train.txt > segmented.tp.tt.txt
63
77
cat prepared_test.txt | wordseg-puddle -w 2 -T prepared_train.txt > segmented.puddle.tt.txt
64
- cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 -T prepared_train.txt > segmented.ag.tt.txt
78
+ cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 -T prepared_train.txt > segmented.ag.tt.txt
65
79
66
80
# dibs is provided with a training file in gold format for its parameter, plus the prepared train file
67
- cat prepared_test.txt | wordseg-dibs -t gold gold_train.txt -T prepared_train.txt > segmented.dibs.tt.txt
81
+ cat prepared_test.txt | wordseg-dibs -t gold -T train_tagged.txt > segmented.dibs.tt.txt
68
82
69
83
# evaluate them against the gold file
70
84
for algo in tp puddle dibs ag
74
88
75
89
# concatenate the evaluations in a table
76
90
echo
77
- echo " * Evaluation"
78
- echo
79
91
(
80
- echo " score tp puddle dibs ag "
92
+ echo " score tp puddle ag dibs "
81
93
echo " ------------------ ------- ------- ------- -------"
82
94
for i in $( seq 1 13)
83
95
do
84
96
awk -v i=$i ' NR==i {printf $0}; END {printf " "}' eval.tp.tt.txt
85
97
awk -v i=$i ' NR==i {printf $2}; END {printf " "}' eval.puddle.tt.txt
86
- awk -v i=$i ' NR==i {print $2}; END {printf " "}' eval.dibs.txt
87
- awk -v i=$i ' NR==i {printf $2}' eval.ag.tt.txt
98
+ awk -v i=$i ' NR==i {printf $2}; END {printf " "}' eval.ag.tt.txt
99
+ awk -v i=$i ' NR==i {print $2}' eval.dibs.tt.txt
88
100
done
89
101
) | column -t
90
-
0 commit comments