rename suit to suite

isi-nlp · Aug 18, 2021 · 74b8879 · 74b8879
1 parent 6657b36
commit 74b8879
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 59 deletions.
diff --git a/data/get_data.sh b/data/get_data.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# Downloads the hin-eng and deu-eng parallel corpora and tokenizes them.
+# External tools used: cut, sed, unzip, sacremoses, mtdata, awkg.
+#
+# Strict mode:
+#   -e           exit as soon as a command fails
+#   -u           expanding an unset variable is an error
+#   -o pipefail  a pipeline fails when ANY stage fails, not only the last one;
+#                without it a crash in an early stage (e.g. paste or the
+#                unescape step) would go unnoticed and truncated output
+#                would be accepted as good data
+set -euo pipefail
+
+# Directory containing this script; falls back to $0 when BASH_SOURCE is
+# empty (script read from stdin) so that -u does not abort here.
+# NOTE(review): DIR is not referenced by the rest of the script as shown.
+DIR="$(dirname "${BASH_SOURCE[0]:-$0}")"
+#DIR="$(realpath "${DIR}")"    # Resolve its full path if need be
+
+# Print an error message to stderr and terminate the script.
+#   $1 - exit status to terminate with
+#   $2 - message to print
+log_exit() {
+  printf '%s\n' "$2" >&2
+  exit "$1"
+}
+
+# Note: use sacremoses 0.0.45 or newer;
+# I contributed nuktas and viramas for Indic languages.
+
+# Fail early when a required external tool is not available.
+for cmd in cut sed unzip sacremoses mtdata awkg; do
+  # `command -v` is a shell builtin; `which` is an external program that may
+  # itself be missing or behave differently across systems.
+  command -v "$cmd" > /dev/null 2>&1 ||
+    log_exit 1 "$cmd not found; please install $cmd and rerun me."
+done
+
+
+# Normalize and tokenize a raw text file with sacremoses.
+#   $1 - path of the raw input file (must exist)
+#   $2 - path of the tokenized output file (overwritten)
+# Exits the script with status 2 via log_exit when the input is missing or
+# the pipeline reports failure.
+function tokenize {
+    local raw=$1
+    local tok=$2
+    echo "tokenizing $raw --> $tok"
+    [[ -f "$raw" ]] || log_exit 2 "input file not found $raw"
+    #[[ -f $tok ]] && log_exit 2 "output file is not empty $tok"
+    # Write to a temporary name and rename only on success: callers skip any
+    # non-empty .tok file, so a half-written file left behind by a failed
+    # run would otherwise be mistaken for finished output.
+    html_unescape < "$raw" |
+        sacremoses normalize -q -d -p -c tokenize -a -x -p :web: > "$tok.tmp" ||
+        log_exit 2 "tokenization failed for $raw"
+    mv -- "$tok.tmp" "$tok"
+}
+
+# Filter: reads text on stdin, writes it to stdout with HTML entities decoded.
+html_unescape() {
+    # Entities that were split by a space after the ampersand ("& amp;") are
+    # first glued back together ("&amp;") so they can be recognized.
+    local rejoin_entities='s/\& (ge|le|gt|lt|amp|quot|apos|nbsp);/\&\1;/g'
+    # awkg then applies Python's html.unescape to each record (R0).
+    sed -E "$rejoin_entities" \
+        | awkg -b 'from html import unescape' 'print(unescape(R0))'
+}
+
+# Download the IITB v1.5 hin-eng data with mtdata and tokenize all splits.
+#   $1 - destination directory (created when missing)
+# A "$1/_GOOD" marker is written at the end; when it already exists the
+# function returns immediately without doing any work.
+function get_hin_eng {
+    local dest="$1"
+    local lang split link tok_file
+    [[ -e "$dest/_GOOD" ]] && return
+    [[ -d "$dest" ]] || mkdir -p "$dest"
+
+    # Download and filter only when no mtdata signature file is present.
+    [[ -f "$dest/mtdata.signature.txt" ]] || {
+        mtdata get --langs hin-eng --merge --out "$dest" \
+               --train IITBv1_5_train --test IITBv1_5_{dev,test}
+
+        mv "$dest/train.hin"{,.bak}
+        mv "$dest/train.eng"{,.bak}
+        # grep -E '^https?:[^ ]*$'
+        # exclude copy: keep only pairs whose two sides differ
+        paste "$dest"/train.{hin,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > "$dest/train.hin-eng"
+        cut -f1 "$dest/train.hin-eng" > "$dest/train.hin"
+        cut -f2 "$dest/train.hin-eng" > "$dest/train.eng"
+    }
+
+    # Expose dev/test sets as $dest/<split>.<lang>; the symlink target is
+    # relative to the directory that holds the link, i.e. $dest.
+    for lang in eng hin; do
+        for split in dev test; do
+            link="$dest/$split.$lang"
+            [[ -e "$link" ]] ||
+                ln -s "tests/IITBv1_5_${split}-hin_eng.$lang" "$link"
+        done
+    done
+
+    # Tokenize each split unless a non-empty .tok file is already there.
+    for split in dev test train; do
+        for lang in eng hin; do
+            tok_file="$dest/$split.$lang.tok"
+            [[ -s "$tok_file" ]] || tokenize "$dest/$split.$lang" "$tok_file"
+        done
+    done
+    touch "$dest/_GOOD"
+}
+
+
+# Download news_commentary_v14 deu-eng training data plus the newstest2018
+# (dev) and newstest2019 (test) sets with mtdata, then tokenize all splits.
+#   $1 - destination directory (created when missing)
+# A "$1/_GOOD" marker is written at the end; when it already exists the
+# function returns immediately without doing any work.
+function get_deu_eng {
+    local dest="$1"
+    local lang split name link tok_file
+    [[ -e "$dest/_GOOD" ]] && return
+    [[ -d "$dest" ]] || mkdir -p "$dest"
+
+    # Download and filter only when no mtdata signature file is present.
+    [[ -f "$dest/mtdata.signature.txt" ]] || {
+        mtdata get --langs deu-eng --merge --out "$dest" \
+            --train news_commentary_v14 --test newstest201{8,9}_deen
+
+        mv "$dest/train.deu"{,.bak}
+        mv "$dest/train.eng"{,.bak}
+        # grep -E '^https?:[^ ]*$'
+        # exclude copy: keep only pairs whose two sides differ
+        paste "$dest"/train.{deu,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > "$dest/train.deu-eng"
+        cut -f1 "$dest/train.deu-eng" > "$dest/train.deu"
+        cut -f2 "$dest/train.deu-eng" > "$dest/train.eng"
+    }
+
+    # Expose dev/test sets as $dest/<split>.<lang>; each printf line is a
+    # "<split> <test-set name>" pair, and the symlink target is relative to
+    # the directory that holds the link, i.e. $dest.
+    for lang in eng deu; do
+        printf "dev newstest2018_deen-deu_eng\ntest newstest2019_deen-deu_eng\n" |
+            while read -r split name; do
+                link="$dest/$split.$lang"
+                [[ -e "$link" ]] || ln -s "tests/$name.$lang" "$link"
+            done
+    done
+
+    # Tokenize each split unless a non-empty .tok file is already there.
+    for split in dev test train; do
+        for lang in eng deu; do
+            tok_file="$dest/$split.$lang.tok"
+            [[ -s "$tok_file" ]] || tokenize "$dest/$split.$lang" "$tok_file"
+        done
+    done
+    touch "$dest/_GOOD"
+}
+
+
+# Fetch and tokenize both language pairs. The destination directories are
+# given as relative paths, so they are created under the current working
+# directory.
+get_hin_eng hin-eng
+get_deu_eng deu-eng
+
+
+
+
+
+
+
+
diff --git a/experiments/pretrained/robertamt-xlmr-2layer.yml b/experiments/pretrained/robertamt-xlmr-2layer.yml
@@ -46,7 +46,7 @@ tester:
     beam_size: 4
     batch_size: 12000  # this is for 1 beam; effective_batch_size = batch_size / beam_size
     lp_alpha: 0.0     # length penalty
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en     # reference, unmodified -- not tokenized

diff --git a/experiments/pretrained/robertamt-xlmr.yml b/experiments/pretrained/robertamt-xlmr.yml
@@ -45,7 +45,7 @@ tester:
     beam_size: 4
     batch_size: 12000  # this is for 1 beam; effective_batch_size = batch_size / beam_size
     lp_alpha: 0.0     # length penalty
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en     # reference, unmodified -- not tokenized

diff --git a/experiments/sample-exp/conf.yml b/experiments/sample-exp/conf.yml
@@ -54,7 +54,7 @@ tester:
     lp_alpha: 0.0     # length penalty
     ensemble: 5
     max_len: 50
-  suit:
+  suite:
     valid:
     - experiments/sample-data/sampl.valid.fr.tok
     - experiments/sample-data/sampl.valid.en       # reference, unmodified -- not tokenized

diff --git a/experiments/spark-bigdataprep.yml b/experiments/spark-bigdataprep.yml
@@ -65,7 +65,7 @@ tester:
     lp_alpha: 0.0     # length penalty
     ensemble: 5
     max_len: 50
-  suit:
+  suite:
     valid:
     - experiments/sample-data/sampl.valid.fr.tok
     - experiments/sample-data/sampl.valid.en       # reference, unmodified -- not tokenized

diff --git a/experiments/transformer.base.yml b/experiments/transformer.base.yml
@@ -10,74 +10,49 @@ model_args: # model construction args
   tgt_vocab: 8000
   tied_emb: three-way  # choices: null, one-way, two-way, three-way
 model_type: tfmnmt  # model type. tfmnmt is the transformer NMT model
-optimizer:
-  name: adam
+optim:
   args:
     betas:
     - 0.9
     - 0.98
     eps: 1.0e-09
-    lr: 0.1
-
-schedule:
-  name: noam
-  args:
-    constant: 2
-    warmup: 8000
-    model_dim: 512
-
-criterion:
-  name: smooth_kld    #options "cross_entropy", "smooth_kld", "binary_cross_entropy", "triplet_loss"
-  args:
     label_smoothing: 0.1
-
+    lr: 0.2
+    warmup_steps: 8000
+    constant: 2
+  name: ADAM
 prep: # data preparation
   max_types: 8000  # maximum number of types in vocab ; if shared_vocab=false, set max_src_types and max_tgt_types separately instead of this one
   pieces: bpe   # choices: bpe, char, word, unigram  from google/sentencepiece
   shared_vocab: true  # true means same vocab for src and tgt, false means different vocabs
   src_len: 256   # longer sentences, decision is made as per 'truncate={true,false}'
   tgt_len: 256
-  train_src: data/train.src   # training data
-  train_tgt: data/train.tgt
   truncate: true   # what to do with longer sentences: if true truncate at src_len or tgt_len; if false filter away
-  valid_src: data/valid.src
-  valid_tgt: data/valid.tgt
+  train_src: experiments/sample-data/sampl.test.fr.tok
+  train_tgt: experiments/sample-data/sampl.test.en.tok
+  valid_src: experiments/sample-data/sampl.valid.fr.tok
+  valid_tgt: experiments/sample-data/sampl.valid.en.tok
   mono_src: []  # monolingual data for learning vocab or BPE
   mono_tgt: []
 tester:
   decoder:
-    tune:  # If this block is missing, then tuner will not be run, and some default values are picked from the code
-      trials: 6  # number of random trials, in addition to "suggested" values
-      tune_src: data/valid.src  # dataset for tuning
-      tune_ref: data/valid.tgt
-      beam_size: [1, 4, 8]    # pool of values for beam_size
-      ensemble: [1, 5, 10]
-      lp_alpha: [0.0, 0.4, 0.6, 1.0]
-      suggested:  # list of suggested values for beam_size, ensemble, lp_alpha
-        - 1, 1, 0.0
-        - 4, 1, 0.0
-        - 4, 1, 0.6
-        - 1, 5, 0.0
-        - 4, 5, 0.0
-        - 4, 5, 0.6
-        - 1, 10, 0.0
-        - 4, 10, 0.0
-        - 4, 10, 0.6
-  suit:  # suit of tests to run after the training
-    valid:  # name of test and list of src.tok, ref files (ref should be unmodified)
-    - data/valid.src
-    - data/valid.tgt
-  # in case we want to use external de tokenizer. interface:: $detokenizer < $out > out.detok
-  # by default it uses moses-tokenizer python wrapper to perl script
-  # detokenizer: cut -f1 | python -m rtg.tool.unicode_fix -l hi -d | perl scripts/indic-tok.perl -d
+    beam_size: 4
+    batch_size: 12000  # this is for 1 beam; effective_batch_size = batch_size / beam_size
+    lp_alpha: 0.0     # length penalty
+  suite:  # suite of tests to run after training
+    # name of test and list of src.tok, ref files (ref should be unmodified)
+    valid:
+      - experiments/sample-data/sampl.valid.fr.tok
+      - experiments/sample-data/sampl.valid.en     # reference, unmodified -- not tokenized
+    test:
+      - experiments/sample-data/sampl.test.fr.tok
+      - experiments/sample-data/sampl.test.en     # reference, unmodified -- not tokenized
 trainer:
   init_args:
     chunk_size: 10   # generation in chunks of time steps to reduce memory consumption
-    grad_accum: 1    # How many batches to accumulate gradients over
   batch_size: 4200   # not exceeding these many tokens (including paddings). in tensor2tensor it is mean batch size
   check_point: 1000  # how often to checkpoint?
   keep_models: 10   # how many checkpoints to keep on disk (small enough to save disk, large enough for checkpt averaging
   steps: 200000   # how many steps to train
-  keep_in_mem: True
 updated_at: '2019-03-09T21:15:33.707183'
 seed: 12345  # fix the manual seed of pytorch + cuda + numpy + python_stdlib RNGs.  Remove/comment this to disable
diff --git a/experiments/transformer.test.yml b/experiments/transformer.test.yml
@@ -64,7 +64,7 @@ tester:
         - 1, 10, 0.0
         - 4, 10, 0.0
         - 4, 10, 0.6
-  suit:
+  suite:
     valid:
     - data/valid.src
     - data/valid.tgt

diff --git a/experiments/wv_cbow.yml b/experiments/wv_cbow.yml
@@ -34,7 +34,7 @@ prep:
   valid_src: data/valid.src
   valid_tgt: data/valid.tgt
 tester:
-  suit:
+  suite:
     valid:
     - data/valid.src
     - data/valid.tgt

diff --git a/rtg/module/criterion.py b/rtg/module/criterion.py
@@ -58,7 +58,6 @@ def smooth_labels(labels, n_labels, smooth_rate, weight=None):
     return full
 
 
-
 def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=None, weight=None,
                         input_type='logits') -> Tensor:
     """
@@ -130,19 +129,17 @@ def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=
         raise ValueError(f'reduce={reduction} not supported')
 
 
-
 @register(kind=CRITERION, name="cross_entropy")
 class CrossEntropy(Criterion):
 
-    def __init__(self, pad_idx: int, label_smoothing=0., reducion='micro'):
+    def __init__(self, pad_idx: int, label_smoothing=0., reduction='micro'):
         super().__init__(input_type='logits', pad_idx=pad_idx)
         assert 0 <= label_smoothing <= 1
         self.label_smoothing = label_smoothing
-        assert reducion in ('micro', 'macro')
-        self.reduction = reducion
-        if reducion == 'macro':
+        assert reduction in ('micro', 'macro')
+        self.reduction = reduction
+        if reduction == 'macro':
             assert self.label_smoothing > 0., 'reduce=macro requires label_smoothing > 0'
-        #self.xent_loss = nn.CrossEntropyLoss(reduction='none')
 
     def forward(self, inputs, targets, mask_pad=True):
         # logits: [N x C] targets: [N]
@@ -158,7 +155,7 @@ def forward(self, inputs, targets, mask_pad=True):
                                        device=inputs.device)
             dense_targets.scatter_(1, targets.type(torch.int64), 1.0)
 
-        weight = self.get_weight(inputs, targets)
+        weight = self.get_weights(inputs, targets)
         loss = dense_cross_entropy(input=inputs, target=dense_targets, reduction=self.reduction,
                                    weight=weight, mask_out=mask_out, input_type=self.input_type)
         return loss