diff --git a/data/get_data.sh b/data/get_data.sh
new file mode 100755
index 0000000..46dc1f2
--- /dev/null
+++ b/data/get_data.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -e # exit on error
+
+DIR="$(dirname "${BASH_SOURCE[0]}")"  # Get the directory name
+#DIR="$(realpath "${DIR}")"           # Resolve its full path if need be
+
+log_exit() { echo "$2"; exit 1; }
+
+# note, use sacremoses 0.0.45 or newer;
+# I contributed nukthas and viramas for indic langs
+
+for cmd in cut sed unzip sacremoses mtdata awkg ; do
+    which $cmd &> /dev/null ||
+        log_exit 1 "$cmd not found; please install $cmd and rerun me."
+done
+
+
+function tokenize {
+    raw=$1
+    tok=$2
+    echo "tokenizing $raw --> $tok"
+    [[ -f $raw ]] || log_exit 2 "input file not found $raw"
+    #[[ -f $tok ]] && log_exit 2 "output file is not empty $tok"
+    cat $raw | html_unescape | sacremoses normalize -q -d -p -c tokenize -a -x -p :web: > $tok
+ }
+
+function html_unescape {
+    sed -E 's/\& (ge|le|gt|lt|amp|quot|apos|nbsp);/\&\1;/g' |
+        awkg -b 'from html import unescape' 'print(unescape(R0))'
+}
+
+function get_hin_eng {
+
+    dest="$1"
+    [[ -e $dest/_GOOD ]] && return
+    [[ -d $dest ]] || mkdir -p $dest
+
+    [[ -f $dest/mtdata.signature.txt ]] || {
+        mtdata get --langs hin-eng --merge --out $dest \
+            --train IITBv1_5_train --test IITBv1_5_{dev,test}
+
+        mv $dest/train.hin{,.bak}
+        mv $dest/train.eng{,.bak}
+        # grep -E '^https?:[^ ]*$'
+        # exclude copy
+        paste $dest/train.{hin,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > $dest/train.hin-eng
+        cut -f1 $dest/train.hin-eng > $dest/train.hin
+        cut -f2 $dest/train.hin-eng > $dest/train.eng
+    }
+
+    for lang in eng hin; do
+        for split in dev test; do
+            link=$dest/$split.$lang
+            [[ -e $link ]] ||
+                ln -s tests/IITBv1_5_$split-hin_eng.$lang $link
+        done
+    done
+
+    for split in dev test train; do
+        for lang in eng hin; do
+            tok_file=$dest/$split.$lang.tok
+            [[ -s $tok_file ]] || tokenize $dest/$split.$lang $tok_file
+        done
+    done
+    touch $dest/_GOOD
+}
+
+
+function get_deu_eng {
+    dest="$1"
+    [[ -e $dest/_GOOD ]] && return
+    [[ -d $dest ]] || mkdir -p $dest
+
+    [[ -f $dest/mtdata.signature.txt ]] || {
+        mtdata get --langs deu-eng --merge --out $dest \
+            --train news_commentary_v14 --test newstest201{8,9}_deen
+
+        mv $dest/train.deu{,.bak}
+        mv $dest/train.eng{,.bak}
+        # grep -E '^https?:[^ ]*$'
+        # exclude copy
+        paste $dest/train.{deu,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > $dest/train.deu-eng
+        cut -f1 $dest/train.deu-eng > $dest/train.deu
+        cut -f2 $dest/train.deu-eng > $dest/train.eng
+    }
+
+    for lang in eng deu; do
+        printf "dev newstest2018_deen-deu_eng\ntest newstest2019_deen-deu_eng\n" | \
+        while read split name; do
+            link=$dest/$split.$lang
+            [[ -e $link ]] || ln -s tests/$name.$lang $link
+        done
+    done
+    for split in dev test train; do
+        for lang in eng deu; do
+            tok_file=$dest/$split.$lang.tok
+            [[ -s $tok_file ]] || tokenize $dest/$split.$lang $tok_file
+        done
+    done
+    touch $dest/_GOOD
+}
+
+
+get_hin_eng hin-eng
+get_deu_eng deu-eng
+
+
+
+
+
+
+
+
diff --git a/experiments/pretrained/robertamt-xlmr-2layer.yml b/experiments/pretrained/robertamt-xlmr-2layer.yml
index 09b0e11..eea4268 100644
--- a/experiments/pretrained/robertamt-xlmr-2layer.yml
+++ b/experiments/pretrained/robertamt-xlmr-2layer.yml
@@ -46,7 +46,7 @@ tester:
     beam_size: 4
     batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
     lp_alpha: 0.0 # length penalty
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
diff --git a/experiments/pretrained/robertamt-xlmr.yml b/experiments/pretrained/robertamt-xlmr.yml
index f61d31d..815b898 100644
--- a/experiments/pretrained/robertamt-xlmr.yml
+++ b/experiments/pretrained/robertamt-xlmr.yml
@@ -45,7 +45,7 @@ tester:
     beam_size: 4
     batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
     lp_alpha: 0.0 # length penalty
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
diff --git a/experiments/sample-exp/conf.yml b/experiments/sample-exp/conf.yml
index 79a971e..0993dd4 100644
--- a/experiments/sample-exp/conf.yml
+++ b/experiments/sample-exp/conf.yml
@@ -54,7 +54,7 @@ tester:
     lp_alpha: 0.0 # length penalty
     ensemble: 5
     max_len: 50
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
diff --git a/experiments/spark-bigdataprep.yml b/experiments/spark-bigdataprep.yml
index 8fb1114..f530797 100644
--- a/experiments/spark-bigdataprep.yml
+++ b/experiments/spark-bigdataprep.yml
@@ -65,7 +65,7 @@ tester:
     lp_alpha: 0.0 # length penalty
     ensemble: 5
     max_len: 50
-  suit:
+  suite:
     valid:
       - experiments/sample-data/sampl.valid.fr.tok
       - experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
diff --git a/experiments/transformer.base.yml b/experiments/transformer.base.yml
index 4726030..4f34755 100644
--- a/experiments/transformer.base.yml
+++ b/experiments/transformer.base.yml
@@ -10,74 +10,49 @@ model_args: # model construction args
   tgt_vocab: 8000
   tied_emb: three-way # choices: null, one-way, two-way, three-way
 model_type: tfmnmt # model type. tfmnmt is the transformer NMT model
-optimizer:
-  name: adam
+optim:
   args:
     betas:
     - 0.9
     - 0.98
     eps: 1.0e-09
-    lr: 0.1
-
-schedule:
-  name: noam
-  args:
-    constant: 2
-    warmup: 8000
-    model_dim: 512
-
-criterion:
-  name: smooth_kld #options "cross_entropy", "smooth_kld", "binary_cross_entropy", "triplet_loss"
-  args:
     label_smoothing: 0.1
-
+    lr: 0.2
+    warmup_steps: 8000
+    constant: 2
+  name: ADAM
 prep: # data preparation
   max_types: 8000 # maximum number of types in vocab ; if shared_vocab=false, set max_src_types and max_tgt_types separately instead of this one
   pieces: bpe # choices: bpe, char, word, unigram from google/sentencepiece
   shared_vocab: true # true means same vocab for src and tgt, false means different vocabs
   src_len: 256 # longer sentences, decision is made as per 'truncate={true,false}'
   tgt_len: 256
-  train_src: data/train.src # training data
-  train_tgt: data/train.tgt
   truncate: true # what to do with longer sentences: if true truncate at src_len or tgt_len; if false filter away
-  valid_src: data/valid.src
-  valid_tgt: data/valid.tgt
+  train_src: experiments/sample-data/sampl.test.fr.tok
+  train_tgt: experiments/sample-data/sampl.test.en.tok
+  valid_src: experiments/sample-data/sampl.valid.fr.tok
+  valid_tgt: experiments/sample-data/sampl.valid.en.tok
   mono_src: [] # monolingual data for learning vocab or BPE
   mono_tgt: []
 tester:
   decoder:
-    tune: # If this block is missing, then tuner will not be run, and some default values are picked from the code
-      trials: 6 # number of random trials, in addition to "suggested" values
-      tune_src: data/valid.src # dataset for tuning
-      tune_ref: data/valid.tgt
-      beam_size: [1, 4, 8] # pool of values for beam_size
-      ensemble: [1, 5, 10]
-      lp_alpha: [0.0, 0.4, 0.6, 1.0]
-      suggested: # list of suggested values for beam_size, ensemble, lp_alpha
-        - 1, 1, 0.0
-        - 4, 1, 0.0
-        - 4, 1, 0.6
-        - 1, 5, 0.0
-        - 4, 5, 0.0
-        - 4, 5, 0.6
-        - 1, 10, 0.0
-        - 4, 10, 0.0
-        - 4, 10, 0.6
-  suit: # suit of tests to run after the training
-    valid: # name of test and list of src.tok, ref files (ref should be unmodified)
-      - data/valid.src
-      - data/valid.tgt
-    # in case we want to use external de tokenizer. interface:: $detokenizer < $out > out.detok
-    # by default it uses moses-tokenizer python wrapper to perl script
-    # detokenizer: cut -f1 | python -m rtg.tool.unicode_fix -l hi -d | perl scripts/indic-tok.perl -d
+    beam_size: 4
+    batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
+    lp_alpha: 0.0 # length penalty
+  suite: # suit of tests to run after the training
+    # name of test and list of src.tok, ref files (ref should be unmodified)
+    valid:
+      - experiments/sample-data/sampl.valid.fr.tok
+      - experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
+    test:
+      - experiments/sample-data/sampl.test.fr.tok
+      - experiments/sample-data/sampl.test.en # reference, unmodified -- not tokenized
 trainer:
   init_args:
     chunk_size: 10 # generation in chunks of time steps to reduce memory consumption
-    grad_accum: 1 # How many batches to accumulate gradients over
   batch_size: 4200 # not exceeding these many tokens (including paddings). in tensor2tensor it is mean batch size
   check_point: 1000 # how often to checkpoint?
   keep_models: 10 # how many checkpoints to keep on disk (small enough to save disk, large enough for checkpt averaging
   steps: 200000 # how many steps to train
-  keep_in_mem: True
 updated_at: '2019-03-09T21:15:33.707183'
 seed: 12345 # fix the manual seed of pytorch + cuda + numpy + python_stdlib RNGs. Remove/comment this to disable
\ No newline at end of file
diff --git a/experiments/transformer.test.yml b/experiments/transformer.test.yml
index b6ef762..e142c73 100644
--- a/experiments/transformer.test.yml
+++ b/experiments/transformer.test.yml
@@ -64,7 +64,7 @@ tester:
         - 1, 10, 0.0
         - 4, 10, 0.0
         - 4, 10, 0.6
-  suit:
+  suite:
     valid:
       - data/valid.src
      - data/valid.tgt
diff --git a/experiments/wv_cbow.yml b/experiments/wv_cbow.yml
index 554d36b..61a621e 100644
--- a/experiments/wv_cbow.yml
+++ b/experiments/wv_cbow.yml
@@ -34,7 +34,7 @@ prep:
   valid_src: data/valid.src
   valid_tgt: data/valid.tgt
 tester:
-  suit:
+  suite:
     valid:
       - data/valid.src
       - data/valid.tgt
diff --git a/rtg/module/criterion.py b/rtg/module/criterion.py
index 76f2cf2..8aa72b0 100644
--- a/rtg/module/criterion.py
+++ b/rtg/module/criterion.py
@@ -58,7 +58,6 @@ def smooth_labels(labels, n_labels, smooth_rate, weight=None):
     return full
 
 
-
 def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=None,
                         weight=None, input_type='logits') -> Tensor:
     """
@@ -130,19 +129,17 @@ def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=
         raise ValueError(f'reduce={reduction} not supported')
 
 
-
 @register(kind=CRITERION, name="cross_entropy")
 class CrossEntropy(Criterion):
 
-    def __init__(self, pad_idx: int, label_smoothing=0., reducion='micro'):
+    def __init__(self, pad_idx: int, label_smoothing=0., reduction='micro'):
         super().__init__(input_type='logits', pad_idx=pad_idx)
         assert 0 <= label_smoothing <= 1
         self.label_smoothing = label_smoothing
-        assert reducion in ('micro', 'macro')
-        self.reduction = reducion
-        if reducion == 'macro':
+        assert reduction in ('micro', 'macro')
+        self.reduction = reduction
+        if reduction == 'macro':
             assert self.label_smoothing > 0., 'reduce=macro requires label_smoothing > 0'
-        #self.xent_loss = nn.CrossEntropyLoss(reduction='none')
 
     def forward(self, inputs, targets, mask_pad=True):
         # logits: [N x C] targets: [N]
@@ -158,7 +155,7 @@ def forward(self, inputs, targets, mask_pad=True):
                                          device=inputs.device)
         dense_targets.scatter_(1, targets.type(torch.int64), 1.0)
 
-        weight = self.get_weight(inputs, targets)
+        weight = self.get_weights(inputs, targets)
         loss = dense_cross_entropy(input=inputs, target=dense_targets, reduction=self.reduction,
                                    weight=weight, mask_out=mask_out, input_type=self.input_type)
         return loss