updated tutorial

mmmaat · mmmaat · commit 18c5240e025d · 2021-01-26T18:21:25.000+01:00
diff --git a/doc/tutorial.py b/doc/tutorial.py
@@ -34,7 +34,7 @@
 segmented_tp = tp.segment(prepared, threshold='relative')
 segmented_puddle = puddle.segment(prepared, njobs=4, window=2)
 segmented_dpseg = dpseg.segment(prepared, nfolds=1, args='--randseed 1')
-segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 100')
+segmented_ag = ag.segment(prepared, nruns=4, njobs=4, args='-n 10')
 
 # we must provide a trained model to dibs (with stats on diphones)
 model_dibs = dibs.CorpusSummary(text)
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
@@ -56,7 +56,7 @@ Python tutorial
 ---------------
 
 The following script is located in ``../doc/tutorial.py``. It
-implements exactly the same process as the bash one:
+implements exactly the same process as the bash one (part 1 only):
 
 .. literalinclude:: tutorial.py
    :language: python
diff --git a/doc/tutorial.sh b/doc/tutorial.sh
@@ -6,35 +6,44 @@ cat $1 | wordseg-prep -u phone --gold gold.txt > prepared.txt
 # compute statistics on the tokenized input text
 cat $1 | wordseg-stats --json > stats.json
 
+# display the statistics computed on the input text
+echo "STATISTICS"
+echo "=========="
+echo
+
+cat stats.json
+
+echo
+echo "TUTORIAL PART 1 (no training)"
+echo "============================="
+
+
 # segment the prepared text with different algorithms (we show few
 # options for them, use --help to list all of them)
-#cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
-#cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
-#cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
-#cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
-#cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 > segmented.ag.txt
+cat prepared.txt | wordseg-baseline -P 0.5 > segmented.baseline.txt
+cat prepared.txt | wordseg-tp -d ftp -t relative > segmented.tp.txt
+cat prepared.txt | wordseg-puddle -w 2 > segmented.puddle.txt
+cat prepared.txt | wordseg-dpseg -f 1 -r 1 > segmented.dpseg.txt
+cat prepared.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 > segmented.ag.txt
 
-# dibs must be provided with a training file
-#cat prepared.txt | wordseg-dibs -t gold $1 > segmented.dibs.txt
+# dibs must be provided with word boundaries to do some preliminary training.
+# Boundaries are then removed to generate the text to segment (as with
+# wordseg-prep).
+cat $1 | wordseg-dibs -t gold > segmented.dibs.txt
 
 # evaluate them against the gold file
 for algo in baseline tp puddle dpseg dibs ag
 do
     cat segmented.$algo.txt | wordseg-eval gold.txt -r prepared.txt > eval.$algo.txt
 done
 
-# display the statistics computed on the input text
-echo "* Statistics"
-echo
-cat stats.json
 
 # concatenate the evaluations in a table
-echo
-echo "* Evaluation"
+
 echo
 (
     echo "score baseline tp puddle dpseg ag dibs"
-    echo "------------------ ------- ------- ------- ------- -------"
+    echo "------------------ ------- ------- ------- ------- ------- -------"
     for i in $(seq 1 13)
     do
         awk -v i=$i 'NR==i {printf $0}; END {printf " "}' eval.baseline.txt
@@ -47,10 +56,15 @@ echo
 ) | column -t
 
 
-## REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
+# ## REPEAT THE WHOLE PROCESS, but training on the first 80% of the file
+echo
+echo
+echo "TUTORIAL PART 2 (train on 80% of data)"
+echo "======================================"
+
 
 # split the file into 80/20
-csplit $1 $(( $(wc -l < $1 ) * 8 / 10 + 1))
+csplit --quiet $1 $(( $(wc -l < $1 ) * 8 / 10 + 1))
 mv xx00 train_tagged.txt
 mv xx01 test_tagged.txt
 
@@ -61,10 +75,10 @@ cat test_tagged.txt | wordseg-prep -u phone --gold gold_test.txt > prepared_test
 # segment the prepared text with different algorithms -- NOTE train/test implemented for the following
 cat prepared_test.txt | wordseg-tp -d ftp -t relative -T prepared_train.txt > segmented.tp.tt.txt
 cat prepared_test.txt | wordseg-puddle -w 2 -T prepared_train.txt > segmented.puddle.tt.txt
-cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 100 -T prepared_train.txt > segmented.ag.tt.txt
+cat prepared_test.txt | wordseg-ag --nruns 4 --njobs 4 --niterations 10 -T prepared_train.txt > segmented.ag.tt.txt
 
 # dibs is provided with a training file in gold format for its parameter, plus the prepared train file
-cat prepared_test.txt | wordseg-dibs -t gold gold_train.txt -T prepared_train.txt > segmented.dibs.tt.txt
+cat prepared_test.txt | wordseg-dibs -t gold -T train_tagged.txt > segmented.dibs.tt.txt
 
 # evaluate them against the gold file
 for algo in tp puddle dibs ag
@@ -74,17 +88,14 @@ done
 
 # concatenate the evaluations in a table
 echo
-echo "* Evaluation"
-echo
 (
-    echo "score tp puddle dibs ag"
+    echo "score tp puddle ag dibs"
     echo "------------------ ------- ------- ------- -------"
     for i in $(seq 1 13)
     do
         awk -v i=$i 'NR==i {printf $0}; END {printf " "}' eval.tp.tt.txt
         awk -v i=$i 'NR==i {printf $2}; END {printf " "}' eval.puddle.tt.txt
-        awk -v i=$i 'NR==i {print $2}; END {printf " "}' eval.dibs.txt
-        awk -v i=$i 'NR==i {printf $2}' eval.ag.tt.txt
+        awk -v i=$i 'NR==i {printf $2}; END {printf " "}' eval.ag.tt.txt
+        awk -v i=$i 'NR==i {print $2}' eval.dibs.tt.txt
     done
 ) | column -t
-
diff --git a/test/test_tutorial.py b/test/test_tutorial.py
@@ -7,6 +7,7 @@
 import pytest
 
 
+@pytest.mark.skipif(True, reason='skipping tutorial testing (too long)')
 @pytest.mark.parametrize('ext', ['py', 'sh'])
 def test_tutorial(tags, tmpdir, ext):
     tutorial_dir = os.path.abspath(