Skip to content

Commit 18c5240

Browse files
committed
updated tutorial
1 parent 63aa159 commit 18c5240

File tree

4 files changed

+38
-26
lines changed

4 files changed

+38
-26
lines changed

doc/tutorial.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
segmented_tp = tp.segment(prepared, threshold='relative')
3535
segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
3636
segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
37-
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')
37+
segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 10')
3838

3939
# we must provide a trained model to dibs (with stats on diphones)
4040
model_dibs = dibs.CorpusSummary(text)

doc/tutorial.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ Python tutorial
5656
---------------
5757

5858
The following script is located in ``../doc/tutorial.py``. It
59-
implements exactly the same process as the bash one:
59+
implements exactly the same process as the bash one (part 1 only):
6060

6161
.. literalinclude:: tutorial.py
6262
:language: python

doc/tutorial.sh

+35-24
Original file line numberDiff line numberDiff line change
@@ -6,35 +6,44 @@ cat $1 | wordseg-prep -u phone --gold gold.txt > prepared.txt
66
# compute statistics on the tokenized input text
77
cat $1 | wordseg-stats --json > stats.json
88

9+
# display the statistics computed on the input text
10+
echo "STATISTICS"
11+
echo "=========="
12+
echo
13+
14+
cat stats.json
15+
16+
echo
17+
echo "TUTORIAL PART 1 (no training)"
18+
echo "============================="
19+
20+
921
# segment the prepared text with different algorithms (we show a few
1022
# options for them, use --help to list all of them)
11-
#cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
12-
#cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
13-
#cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
14-
#cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
15-
#cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 > segmented.ag.txt
23+
cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
24+
cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
25+
cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
26+
cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
27+
cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 > segmented.ag.txt
1628

17-
# dibs must be provided with a training file
18-
#cat prepared.txt | wordseg-dibs -t gold $1 > segmented.dibs.txt
29+
# dibs must be provided with word boundaries to do some preliminary training.
30+
# Boundaries are then removed to generate the text to segment (as with
31+
# wordseg-prep).
32+
cat $1 | wordseg-dibs -t gold > segmented.dibs.txt
1933

2034
# evaluate them against the gold file
2135
for algo in baseline tp puddle dpseg dibs ag
2236
do
2337
cat segmented.$algo.txt | wordseg-eval gold.txt -r prepared.txt > eval.$algo.txt
2438
done
2539

26-
# display the statistics computed on the input text
27-
echo "* Statistics"
28-
echo
29-
cat stats.json
3040

3141
# concatenate the evaluations in a table
32-
echo
33-
echo "* Evaluation"
42+
3443
echo
3544
(
3645
echo "score baseline tp puddle dpseg ag dibs"
37-
echo "------------------ ------- ------- ------- ------- -------"
46+
echo "------------------ ------- ------- ------- ------- ------- -------"
3847
for i in $(seq 1 13)
3948
do
4049
awk -v i=$i 'NR==i {printf $0}; END {printf " "}' eval.baseline.txt
@@ -47,10 +56,15 @@ echo
4756
) | column -t
4857

4958

50-
## REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
59+
# ## REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
60+
echo
61+
echo
62+
echo "TUTORIAL PART 2 (train on 80% of data)"
63+
echo "======================================"
64+
5165

5266
# split the file into 80/20
53-
csplit $1 $(( $(wc -l < $1 ) * 8 / 10 + 1))
67+
csplit --quiet $1 $(( $(wc -l < $1 ) * 8 / 10 + 1))
5468
mv xx00 train_tagged.txt
5569
mv xx01 test_tagged.txt
5670

@@ -61,10 +75,10 @@ cat test_tagged.txt | wordseg-prep -u phone --gold gold_test.txt > prepared_test
6175
# segment the prepared text with different algorithms -- NOTE: train/test is implemented for the following algorithms
6276
cat prepared_test.txt | wordseg-tp -d ftp -t relative -T prepared_train.txt > segmented.tp.tt.txt
6377
cat prepared_test.txt | wordseg-puddle -w 2 -T prepared_train.txt > segmented.puddle.tt.txt
64-
cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 -T prepared_train.txt > segmented.ag.tt.txt
78+
cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 -T prepared_train.txt > segmented.ag.tt.txt
6579

6680
# dibs is trained (-t gold) on the tagged training file given with -T, which still contains the word boundaries
67-
cat prepared_test.txt | wordseg-dibs -t gold gold_train.txt -T prepared_train.txt > segmented.dibs.tt.txt
81+
cat prepared_test.txt | wordseg-dibs -t gold -T train_tagged.txt > segmented.dibs.tt.txt
6882

6983
# evaluate them against the gold file
7084
for algo in tp puddle dibs ag
@@ -74,17 +88,14 @@ done
7488

7589
# concatenate the evaluations in a table
7690
echo
77-
echo "* Evaluation"
78-
echo
7991
(
80-
echo "score tp puddle dibs ag"
92+
echo "score tp puddle ag dibs"
8193
echo "------------------ ------- ------- ------- -------"
8294
for i in $(seq 1 13)
8395
do
8496
awk -v i=$i 'NR==i {printf $0}; END {printf " "}' eval.tp.tt.txt
8597
awk -v i=$i 'NR==i {printf $2}; END {printf " "}' eval.puddle.tt.txt
86-
awk -v i=$i 'NR==i {print $2}; END {printf " "}' eval.dibs.txt
87-
awk -v i=$i 'NR==i {printf $2}' eval.ag.tt.txt
98+
awk -v i=$i 'NR==i {printf $2}; END {printf " "}' eval.ag.tt.txt
99+
awk -v i=$i 'NR==i {print $2}' eval.dibs.tt.txt
88100
done
89101
) | column -t
90-

test/test_tutorial.py

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pytest
88

99

10+
@pytest.mark.skipif(True, reason='skipping tutorial testing (too long)')
1011
@pytest.mark.parametrize('ext', ['py', 'sh'])
1112
def test_tutorial(tags, tmpdir, ext):
1213
tutorial_dir = os.path.abspath(

0 commit comments

Comments
 (0)