Commit: rename suit to suite

thammegowda committed Aug 18, 2021
1 parent 6657b36 commit 74b8879
Showing 9 changed files with 144 additions and 59 deletions.
113 changes: 113 additions & 0 deletions data/get_data.sh
@@ -0,0 +1,113 @@
#!/usr/bin/env bash
set -e # exit on error

DIR="$(dirname "${BASH_SOURCE[0]}")" # Get the directory name
#DIR="$(realpath "${DIR}")" # Resolve its full path if need be

log_exit() { echo "$2"; exit "$1"; }  # print message ($2) and exit with the given status ($1)

# note, use sacremoses 0.0.45 or newer;
# I contributed nukthas and viramas for indic langs

for cmd in cut sed unzip sacremoses mtdata awkg ; do
    which $cmd &> /dev/null ||
        log_exit 1 "$cmd not found; please install $cmd and rerun me."
done


function tokenize {
    raw=$1
    tok=$2
    echo "tokenizing $raw --> $tok"
    [[ -f $raw ]] || log_exit 2 "input file not found $raw"
    #[[ -f $tok ]] && log_exit 2 "output file is not empty $tok"
    cat $raw | html_unescape | sacremoses normalize -q -d -p -c tokenize -a -x -p :web: > $tok
}

function html_unescape {
    sed -E 's/\& (ge|le|gt|lt|amp|quot|apos|nbsp);/\&\1;/g' |
        awkg -b 'from html import unescape' 'print(unescape(R0))'
}

function get_hin_eng {

    dest="$1"
    [[ -e $dest/_GOOD ]] && return
    [[ -d $dest ]] || mkdir -p $dest

    [[ -f $dest/mtdata.signature.txt ]] || {
        mtdata get --langs hin-eng --merge --out $dest \
            --train IITBv1_5_train --test IITBv1_5_{dev,test}

        mv $dest/train.hin{,.bak}
        mv $dest/train.eng{,.bak}
        # grep -E '^https?:[^ ]*$'
        # exclude copies: drop pairs where source == target
        paste $dest/train.{hin,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > $dest/train.hin-eng
        cut -f1 $dest/train.hin-eng > $dest/train.hin
        cut -f2 $dest/train.hin-eng > $dest/train.eng
    }

    for lang in eng hin; do
        for split in dev test; do
            link=$dest/$split.$lang
            [[ -e $link ]] ||
                ln -s tests/IITBv1_5_$split-hin_eng.$lang $link
        done
    done

    for split in dev test train; do
        for lang in eng hin; do
            tok_file=$dest/$split.$lang.tok
            [[ -s $tok_file ]] || tokenize $dest/$split.$lang $tok_file
        done
    done
    touch $dest/_GOOD
}


function get_deu_eng {
    dest="$1"
    [[ -e $dest/_GOOD ]] && return
    [[ -d $dest ]] || mkdir -p $dest

    [[ -f $dest/mtdata.signature.txt ]] || {
        mtdata get --langs deu-eng --merge --out $dest \
            --train news_commentary_v14 --test newstest201{8,9}_deen

        mv $dest/train.deu{,.bak}
        mv $dest/train.eng{,.bak}
        # grep -E '^https?:[^ ]*$'
        # exclude copies: drop pairs where source == target
        paste $dest/train.{deu,eng}.bak | awkg -F '\t' 'RET=R[0] != R[1]' > $dest/train.deu-eng
        cut -f1 $dest/train.deu-eng > $dest/train.deu
        cut -f2 $dest/train.deu-eng > $dest/train.eng
    }

    for lang in eng deu; do
        printf "dev newstest2018_deen-deu_eng\ntest newstest2019_deen-deu_eng\n" | \
            while read split name; do
                link=$dest/$split.$lang
                [[ -e $link ]] || ln -s tests/$name.$lang $link
            done
    done
    for split in dev test train; do
        for lang in eng deu; do
            tok_file=$dest/$split.$lang.tok
            [[ -s $tok_file ]] || tokenize $dest/$split.$lang $tok_file
        done
    done
    touch $dest/_GOOD
}


get_hin_eng hin-eng
get_deu_eng deu-eng








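Aside (not part of the commit): awkg programs are plain Python expressions, so the two data-cleaning steps in data/get_data.sh above -- the html_unescape helper and the copy-exclusion filter awkg -F '\t' 'RET=R[0] != R[1]' -- can be sketched as the standalone Python below. The function names and sample strings are illustrative assumptions, not RTG code.

#!/usr/bin/env python3
# Sketch of two helpers from data/get_data.sh (not part of the commit):
#  1) html_unescape: re-join space-broken entities ("& amp;" -> "&amp;") and unescape them
#  2) the paste | awkg 'RET=R[0] != R[1]' step: drop pairs whose source equals target
import re
import sys
from html import unescape

ENTITY_FIX = re.compile(r'& (ge|le|gt|lt|amp|quot|apos|nbsp);')

def html_unescape_line(line: str) -> str:
    # "Tom & amp; Jerry &gt; Spike" -> "Tom & Jerry > Spike"
    return unescape(ENTITY_FIX.sub(r'&\1;', line))

def exclude_copies(lines):
    # expects tab-separated "src<TAB>tgt" lines, as produced by `paste`
    for line in lines:
        cols = line.rstrip('\n').split('\t')
        if len(cols) >= 2 and cols[0] != cols[1]:
            yield line

if __name__ == '__main__':
    print(html_unescape_line('Tom & amp; Jerry &gt; Spike'))   # -> Tom & Jerry > Spike
    pairs = ['hello\thello\n', 'bonjour\thello\n']
    print(list(exclude_copies(pairs)))   # keeps only the pair whose two sides differ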
2 changes: 1 addition & 1 deletion experiments/pretrained/robertamt-xlmr-2layer.yml
@@ -46,7 +46,7 @@ tester:
beam_size: 4
batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
lp_alpha: 0.0 # length penalty
suit:
suite:
valid:
- experiments/sample-data/sampl.valid.fr.tok
- experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
2 changes: 1 addition & 1 deletion experiments/pretrained/robertamt-xlmr.yml
@@ -45,7 +45,7 @@ tester:
beam_size: 4
batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
lp_alpha: 0.0 # length penalty
suit:
suite:
valid:
- experiments/sample-data/sampl.valid.fr.tok
- experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
2 changes: 1 addition & 1 deletion experiments/sample-exp/conf.yml
@@ -54,7 +54,7 @@ tester:
lp_alpha: 0.0 # length penalty
ensemble: 5
max_len: 50
suit:
suite:
valid:
- experiments/sample-data/sampl.valid.fr.tok
- experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
2 changes: 1 addition & 1 deletion experiments/spark-bigdataprep.yml
@@ -65,7 +65,7 @@ tester:
lp_alpha: 0.0 # length penalty
ensemble: 5
max_len: 50
suit:
suite:
valid:
- experiments/sample-data/sampl.valid.fr.tok
- experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
65 changes: 20 additions & 45 deletions experiments/transformer.base.yml
@@ -10,74 +10,49 @@ model_args: # model construction args
tgt_vocab: 8000
tied_emb: three-way # choices: null, one-way, two-way, three-way
model_type: tfmnmt # model type. tfmnmt is the transformer NMT model
optimizer:
name: adam
optim:
args:
betas:
- 0.9
- 0.98
eps: 1.0e-09
lr: 0.1

schedule:
name: noam
args:
constant: 2
warmup: 8000
model_dim: 512

criterion:
name: smooth_kld #options "cross_entropy", "smooth_kld", "binary_cross_entropy", "triplet_loss"
args:
label_smoothing: 0.1

lr: 0.2
warmup_steps: 8000
constant: 2
name: ADAM
prep: # data preparation
max_types: 8000 # maximum number of types in vocab ; if shared_vocab=false, set max_src_types and max_tgt_types separately instead of this one
pieces: bpe # choices: bpe, char, word, unigram from google/sentencepiece
shared_vocab: true # true means same vocab for src and tgt, false means different vocabs
src_len: 256 # for longer sentences, the decision is made as per 'truncate={true,false}'
tgt_len: 256
train_src: data/train.src # training data
train_tgt: data/train.tgt
truncate: true # what to do with longer sentences: if true truncate at src_len or tgt_len; if false filter away
valid_src: data/valid.src
valid_tgt: data/valid.tgt
train_src: experiments/sample-data/sampl.test.fr.tok
train_tgt: experiments/sample-data/sampl.test.en.tok
valid_src: experiments/sample-data/sampl.valid.fr.tok
valid_tgt: experiments/sample-data/sampl.valid.en.tok
mono_src: [] # monolingual data for learning vocab or BPE
mono_tgt: []
tester:
decoder:
tune: # If this block is missing, then tuner will not be run, and some default values are picked from the code
trials: 6 # number of random trials, in addition to "suggested" values
tune_src: data/valid.src # dataset for tuning
tune_ref: data/valid.tgt
beam_size: [1, 4, 8] # pool of values for beam_size
ensemble: [1, 5, 10]
lp_alpha: [0.0, 0.4, 0.6, 1.0]
suggested: # list of suggested values for beam_size, ensemble, lp_alpha
- 1, 1, 0.0
- 4, 1, 0.0
- 4, 1, 0.6
- 1, 5, 0.0
- 4, 5, 0.0
- 4, 5, 0.6
- 1, 10, 0.0
- 4, 10, 0.0
- 4, 10, 0.6
suit: # suit of tests to run after the training
valid: # name of test and list of src.tok, ref files (ref should be unmodified)
- data/valid.src
- data/valid.tgt
# in case we want to use an external detokenizer; interface: $detokenizer < $out > out.detok
# by default it uses the moses-tokenizer Python wrapper to the Perl script
# detokenizer: cut -f1 | python -m rtg.tool.unicode_fix -l hi -d | perl scripts/indic-tok.perl -d
beam_size: 4
batch_size: 12000 # this is for 1 beam; effective_batch_size = batch_size / beam_size
lp_alpha: 0.0 # length penalty
suite: # suite of tests to run after the training
# name of test and list of src.tok, ref files (ref should be unmodified)
valid:
- experiments/sample-data/sampl.valid.fr.tok
- experiments/sample-data/sampl.valid.en # reference, unmodified -- not tokenized
test:
- experiments/sample-data/sampl.test.fr.tok
- experiments/sample-data/sampl.test.en # reference, unmodified -- not tokenized
trainer:
init_args:
chunk_size: 10 # generation in chunks of time steps to reduce memory consumption
grad_accum: 1 # How many batches to accumulate gradients over
batch_size: 4200 # not exceeding this many tokens (including padding); in tensor2tensor this is the mean batch size
check_point: 1000 # how often to checkpoint?
keep_models: 10 # how many checkpoints to keep on disk (small enough to save disk space, large enough for checkpoint averaging)
steps: 200000 # how many steps to train
keep_in_mem: True
updated_at: '2019-03-09T21:15:33.707183'
seed: 12345 # fix the manual seed of pytorch + cuda + numpy + python_stdlib RNGs. Remove/comment this to disable
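Aside (not part of the commit): the tune block and the decoder batch_size comment in transformer.base.yml above describe a small random search over decoding hyper-parameters and a token budget that shrinks with the beam. The standalone Python below only illustrates those two comments; it is not RTG's implementation, and the variable names and print format are assumptions.

# Illustration only (not RTG's code) of two comments in the config above:
#  - tune: 6 random trials from the beam_size/ensemble/lp_alpha pools, plus the
#    explicitly "suggested" (beam_size, ensemble, lp_alpha) triples
#  - decoder batching: effective_batch_size = batch_size / beam_size
import random

beam_pool = [1, 4, 8]
ensemble_pool = [1, 5, 10]
lp_alpha_pool = [0.0, 0.4, 0.6, 1.0]
suggested = [(1, 1, 0.0), (4, 1, 0.0), (4, 1, 0.6),
             (1, 5, 0.0), (4, 5, 0.0), (4, 5, 0.6),
             (1, 10, 0.0), (4, 10, 0.0), (4, 10, 0.6)]

random.seed(12345)
random_trials = {(random.choice(beam_pool), random.choice(ensemble_pool),
                  random.choice(lp_alpha_pool)) for _ in range(6)}
trials = list(dict.fromkeys(suggested + sorted(random_trials)))  # suggested first, no duplicates

batch_size = 12000  # tokens, stated for beam_size = 1
for beam, ensemble, lp_alpha in trials:
    effective_batch_size = batch_size // beam
    print(f'beam={beam} ensemble={ensemble} lp_alpha={lp_alpha} '
          f'-> ~{effective_batch_size} tokens per decoder batch')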
2 changes: 1 addition & 1 deletion experiments/transformer.test.yml
@@ -64,7 +64,7 @@ tester:
- 1, 10, 0.0
- 4, 10, 0.0
- 4, 10, 0.6
suit:
suite:
valid:
- data/valid.src
- data/valid.tgt
2 changes: 1 addition & 1 deletion experiments/wv_cbow.yml
@@ -34,7 +34,7 @@ prep:
valid_src: data/valid.src
valid_tgt: data/valid.tgt
tester:
suit:
suite:
valid:
- data/valid.src
- data/valid.tgt
13 changes: 5 additions & 8 deletions rtg/module/criterion.py
@@ -58,7 +58,6 @@ def smooth_labels(labels, n_labels, smooth_rate, weight=None):
    return full



def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=None, weight=None,
                        input_type='logits') -> Tensor:
    """
@@ -130,19 +129,17 @@ def dense_cross_entropy(input: Tensor, target: Tensor, reduction=None, mask_out=
    raise ValueError(f'reduce={reduction} not supported')



@register(kind=CRITERION, name="cross_entropy")
class CrossEntropy(Criterion):

    def __init__(self, pad_idx: int, label_smoothing=0., reducion='micro'):
    def __init__(self, pad_idx: int, label_smoothing=0., reduction='micro'):
        super().__init__(input_type='logits', pad_idx=pad_idx)
        assert 0 <= label_smoothing <= 1
        self.label_smoothing = label_smoothing
        assert reducion in ('micro', 'macro')
        self.reduction = reducion
        if reducion == 'macro':
        assert reduction in ('micro', 'macro')
        self.reduction = reduction
        if reduction == 'macro':
            assert self.label_smoothing > 0., 'reduce=macro requires label_smoothing > 0'
        #self.xent_loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, inputs, targets, mask_pad=True):
        # logits: [N x C] targets: [N]
@@ -158,7 +155,7 @@ def forward(self, inputs, targets, mask_pad=True):
                                     device=inputs.device)
        dense_targets.scatter_(1, targets.type(torch.int64), 1.0)

        weight = self.get_weight(inputs, targets)
        weight = self.get_weights(inputs, targets)
        loss = dense_cross_entropy(input=inputs, target=dense_targets, reduction=self.reduction,
                                   weight=weight, mask_out=mask_out, input_type=self.input_type)
        return loss
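Aside (not part of the commit): for context on the forward body shown above, a minimal standalone sketch of the dense one-hot target construction done with scatter_ before dense_cross_entropy is called. The shapes N=4, C=6 and the example class ids are made up.

# Minimal sketch (not RTG's code) of the dense-target construction seen in
# CrossEntropy.forward: inputs are [N x C] logits, targets are [N] class ids.
import torch

N, C = 4, 6                                   # 4 target tokens, vocabulary of 6 types
targets = torch.tensor([1, 3, 0, 5])          # gold class ids, shape [N]

dense_targets = torch.zeros(N, C)             # [N x C]
dense_targets.scatter_(1, targets.unsqueeze(1).to(torch.int64), 1.0)

print(dense_targets)
# tensor([[0., 1., 0., 0., 0., 0.],
#         [0., 0., 0., 1., 0., 0.],
#         [1., 0., 0., 0., 0., 0.],
#         [0., 0., 0., 0., 0., 1.]])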
