Commit 09bbec0
Rafael Valle committed May 3, 2018

    adding python files

1 parent 25a267d
15 files changed, +2134 −0 lines

audio_processing.py (+93)

import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time Fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C
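
A minimal usage sketch for these helpers (illustrative values, not part of the commit; griffin_lim additionally needs the STFT class from stft.py below):

import torch
from audio_processing import (window_sumsquare, dynamic_range_compression,
                              dynamic_range_decompression)

# Hann sum-square envelope for 10 frames at the default hop/window sizes;
# output length is n_fft + hop_length * (n_frames - 1) = 2600 samples
env = window_sumsquare('hann', n_frames=10)

# log-compression and its inverse round-trip (exact above the 1e-5 clip floor)
mag = torch.rand(80, 100) + 1e-2
restored = dynamic_range_decompression(dynamic_range_compression(mag))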

data_utils.py (+101)

import random
import torch
import torch.utils.data

import layers
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence


class TextMelLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts it to a sequence of symbol IDs
    3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams, shuffle=True):
        self.audiopaths_and_text = load_filepaths_and_text(
            audiopaths_and_text, hparams.sort_by_length)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        audio = load_wav_to_torch(filename, self.sampling_rate)
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextMelCollate():
    """Zero-pads model inputs and targets based on number of frames per step
    """
    def __init__(self, n_frames_per_step):
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Collates a training batch from normalized text and mel-spectrograms
        PARAMS
        ------
        batch: [text_normalized, mel_normalized]
        """
        # Right zero-pad all text sequences to max input length
        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)
        max_input_len = input_lengths[0]

        text_padded = torch.LongTensor(len(batch), max_input_len)
        text_padded.zero_()
        for i in range(len(ids_sorted_decreasing)):
            text = batch[ids_sorted_decreasing[i]][0]
            text_padded[i, :text.size(0)] = text

        # Right zero-pad mel-spec with extra single zero vector to mark the end
        num_mels = batch[0][1].size(0)
        max_target_len = max([x[1].size(1) for x in batch]) + 1
        if max_target_len % self.n_frames_per_step != 0:
            max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
            assert max_target_len % self.n_frames_per_step == 0

        # include mel padded and gate padded
        mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
        mel_padded.zero_()
        gate_padded = torch.FloatTensor(len(batch), max_target_len)
        gate_padded.zero_()
        output_lengths = torch.LongTensor(len(batch))
        for i in range(len(ids_sorted_decreasing)):
            mel = batch[ids_sorted_decreasing[i]][1]
            mel_padded[i, :, :mel.size(1)] = mel
            gate_padded[i, mel.size(1):] = 1
            output_lengths[i] = mel.size(1)

        return text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths
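
A minimal loading sketch (not part of the commit), assuming hparams.training_files points at an existing "path|text" filelist and the text package is importable:

from torch.utils.data import DataLoader
from hparams import create_hparams
from data_utils import TextMelLoader, TextMelCollate

hparams = create_hparams()
trainset = TextMelLoader(hparams.training_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
loader = DataLoader(trainset, batch_size=hparams.batch_size,
                    shuffle=False, collate_fn=collate_fn)
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = \
    next(iter(loader))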

distributed.py (+120)

import torch
import torch.distributed as dist
from torch.nn.modules import Module


def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat


def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)


'''
This version of DistributedDataParallel is designed to be used in conjunction
with the multiproc.py launcher included with this example. It assumes that your
run uses multiprocessing with one GPU per process, that the model is on the
correct device, and that torch.cuda.set_device has been used to set the device.

Parameters are broadcast to the other processes on initialization of
DistributedDataParallel, and will be allreduced at the finish of the backward
pass.
'''
class DistributedDataParallel(Module):

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        # fallback for PyTorch 0.3
        if not hasattr(dist, '_backend'):
            self.warn_on_half = True
        else:
            self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module

        for p in self.module.state_dict().values():
            if not torch.is_tensor(p):
                continue
            dist.broadcast(p, 0)

        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case." +
                              " This currently requires PyTorch built from top of tree master.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        for param in list(self.module.parameters()):
            def allreduce_hook(*unused):
                param._execution_engine.queue_callback(allreduce_params)
            if param.requires_grad:
                param.register_hook(allreduce_hook)

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

    def train(self, mode=True):
        # Clear the NCCL communicator and CUDA event cache of the default
        # group ID. These caches will be recreated on a later call. This is
        # currently a workaround for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
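
A sketch of the intended usage, one process per GPU as launched by multiproc.py (the --rank/--n_gpus flags and the toy Linear module are illustrative stand-ins, not repository code):

import argparse
import torch
import torch.distributed as dist
from distributed import DistributedDataParallel

parser = argparse.ArgumentParser()
parser.add_argument('--rank', type=int, default=0)
parser.add_argument('--n_gpus', type=int, default=1)
args, _ = parser.parse_known_args()

# every process must see the same init file for rendezvous
torch.cuda.set_device(args.rank)
dist.init_process_group(backend='nccl', init_method='file://distributed.dpt',
                        world_size=args.n_gpus, rank=args.rank)

model = DistributedDataParallel(torch.nn.Linear(10, 10).cuda())
out = model(torch.randn(4, 10).cuda())   # forward arms needs_reduction
out.sum().backward()                     # hooks average grads across ranks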

fp16_optimizer.py (+381)

(Large diff not rendered by default.)

hparams.py (+91)

import tensorflow as tf
from text import symbols


def create_hparams(hparams_string=None, verbose=False):
    """Create model hyperparameters. Parse non-default values from the given string."""

    hparams = tf.contrib.training.HParams(
        ################################
        # Experiment Parameters        #
        ################################
        epochs=500,
        iters_per_checkpoint=500,
        seed=1234,
        dynamic_loss_scaling=True,
        fp16_run=False,
        distributed_run=False,
        dist_backend="nccl",
        dist_url="file://distributed.dpt",
        cudnn_enabled=True,
        cudnn_benchmark=False,

        ################################
        # Data Parameters              #
        ################################
        training_files='ljs_audio_text_train_filelist.txt',
        validation_files='ljs_audio_text_val_filelist.txt',
        text_cleaners=['english_cleaners'],
        sort_by_length=False,

        ################################
        # Audio Parameters             #
        ################################
        max_wav_value=32768.0,
        sampling_rate=22050,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=None,  # if None, half the sampling rate

        ################################
        # Model Parameters             #
        ################################
        n_symbols=len(symbols),
        symbols_embedding_dim=512,

        # Encoder parameters
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,

        # Decoder parameters
        n_frames_per_step=1,
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.6,

        # Attention parameters
        attention_rnn_dim=1024,
        attention_dim=128,

        # Location Layer parameters
        attention_location_n_filters=32,
        attention_location_kernel_size=31,

        # Mel-post processing network parameters
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,

        ################################
        # Optimization Hyperparameters #
        ################################
        learning_rate=1e-3,
        weight_decay=1e-6,
        grad_clip_thresh=1,
        batch_size=48,
        mask_padding=False  # set model's padded outputs to padded values
    )

    if hparams_string:
        tf.logging.info('Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.logging.info('Final parsed hparams: %s', hparams.values())

    return hparams
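
Overriding defaults via a comma-separated "name=value" string, as train.py does with its --hparams flag (a sketch; requires TensorFlow 1.x for tf.contrib):

from hparams import create_hparams

hparams = create_hparams("batch_size=32,learning_rate=5e-4")
print(hparams.batch_size, hparams.learning_rate)   # 32 0.0005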

layers.py (+80)

import torch
from librosa.filters import mel as librosa_mel_fn
from audio_processing import dynamic_range_compression
from audio_processing import dynamic_range_decompression
from stft import STFT


class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)


class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal


class TacotronSTFT(torch.nn.Module):
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=None):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
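
A quick shape check on random audio in the documented [-1, 1] range (a sketch, not repository code):

import torch
from layers import TacotronSTFT

stft = TacotronSTFT()
wav = torch.rand(2, 22050) * 2 - 1   # two 1-second clips in [-1, 1]
mel = stft.mel_spectrogram(wav)
print(mel.shape)                     # (2, 80, 87) frames at hop_length=256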

logger.py (+48)

import random
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy


class Tacotron2Logger(SummaryWriter):
    def __init__(self, logdir):
        super(Tacotron2Logger, self).__init__(logdir)

    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
                     iteration):
        self.add_scalar("training.loss", reduced_loss, iteration)
        self.add_scalar("grad.norm", grad_norm, iteration)
        self.add_scalar("learning.rate", learning_rate, iteration)
        self.add_scalar("duration", duration, iteration)

    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
        self.add_scalar("validation.loss", reduced_loss, iteration)
        _, mel_outputs, gate_outputs, alignments = y_pred
        mel_targets, gate_targets = y

        # plot distribution of parameters
        for tag, value in model.named_parameters():
            tag = tag.replace('.', '/')
            self.add_histogram(tag, value.data.cpu().numpy(), iteration)

        # plot alignment, mel target and predicted, gate target and predicted
        idx = random.randint(0, alignments.size(0) - 1)
        self.add_image(
            "alignment",
            plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
            iteration)
        self.add_image(
            "mel_target",
            plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
            iteration)
        self.add_image(
            "mel_predicted",
            plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
            iteration)
        self.add_image(
            "gate",
            plot_gate_outputs_to_numpy(
                gate_targets[idx].data.cpu().numpy(),
                F.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
            iteration)

loss_function.py (+19)

from torch import nn


class Tacotron2Loss(nn.Module):
    def __init__(self):
        super(Tacotron2Loss, self).__init__()

    def forward(self, model_output, targets):
        mel_target, gate_target = targets[0], targets[1]
        mel_target.requires_grad = False
        gate_target.requires_grad = False
        gate_target = gate_target.view(-1, 1)

        mel_out, mel_out_postnet, gate_out, _ = model_output
        gate_out = gate_out.view(-1, 1)
        mel_loss = nn.MSELoss()(mel_out, mel_target) + \
            nn.MSELoss()(mel_out_postnet, mel_target)
        gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
        return mel_loss + gate_loss
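
A sanity check with dummy tensors shaped like TextMelCollate's outputs (a sketch; the None stands in for the alignments, which the loss ignores):

import torch
from loss_function import Tacotron2Loss

B, n_mels, T = 4, 80, 100
y_pred = (torch.randn(B, n_mels, T),   # mel_out
          torch.randn(B, n_mels, T),   # mel_out_postnet
          torch.randn(B, T),           # gate_out (logits)
          None)                        # alignments (unused by the loss)
targets = (torch.randn(B, n_mels, T), torch.zeros(B, T))
print(Tacotron2Loss()(y_pred, targets))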

loss_scaler.py (+132)

import torch


class LossScaler:

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        return False

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        pass

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward()


class DynamicLossScaler:

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        for p in params:
            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
                return True

        return False

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        inf_count = torch.sum(x.abs() == float('inf'))
        if inf_count > 0:
            return True
        nan_count = torch.sum(x != x)
        return nan_count > 0

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        if overflow:
            self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    @property
    def loss_scale(self):
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        scaled_loss = loss * self.loss_scale
        scaled_loss.backward()


##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
if __name__ == "__main__":
    import torch
    from torch.autograd import Variable
    # (if this lived in a separate file: from loss_scaler import DynamicLossScaler)

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
    x = Variable(torch.randn(N, D_in), requires_grad=False)
    y = Variable(torch.randn(N, D_out), requires_grad=False)

    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
    parameters = [w1, w2]

    learning_rate = 1e-6
    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
    loss_scaler = DynamicLossScaler()

    for t in range(500):
        y_pred = x.mm(w1).clamp(min=0).mm(w2)
        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))

        # Run backprop
        optimizer.zero_grad()
        loss.backward()

        # Check for overflow (called on the instance, not the class)
        has_overflow = loss_scaler.has_overflow(parameters)

        # If no overflow, unscale grads and update as usual
        if not has_overflow:
            for param in parameters:
                param.grad.data.mul_(1. / loss_scaler.loss_scale)
            optimizer.step()
        # Otherwise, don't do anything -- i.e., skip the iteration
        else:
            print('OVERFLOW!')

        # Update loss scale for next iteration
        loss_scaler.update_scale(has_overflow)
model.py (+541)

(Large diff not rendered by default.)

multiproc.py (+23)

import time
import torch
import sys
import subprocess

argslist = list(sys.argv)[1:]
num_gpus = torch.cuda.device_count()
argslist.append('--n_gpus={}'.format(num_gpus))
workers = []
job_id = time.strftime("%Y_%m_%d-%H%M%S")
argslist.append("--group_name=group_{}".format(job_id))

for i in range(num_gpus):
    argslist.append('--rank={}'.format(i))
    stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
                                      "w")
    print(argslist)
    p = subprocess.Popen([str(sys.executable)] + argslist, stdout=stdout)
    workers.append(p)
    argslist = argslist[:-1]

for p in workers:
    p.wait()
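
Typical invocation, following the launcher logic above: everything after multiproc.py is forwarded verbatim to one worker per visible GPU, with --n_gpus, --group_name, and --rank appended. Rank 0 prints to the console; the others log under logs/, which must already exist. The flags shown are illustrative:

python multiproc.py train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True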

plotting_utils.py (+61)

import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt
import numpy as np


def save_figure_to_numpy(fig):
    # save it to a numpy array.
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    return data


def plot_alignment_to_numpy(alignment, info=None):
    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment, aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data


def plot_spectrogram_to_numpy(spectrogram):
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data


def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
               color='green', marker='+', s=1, label='target')
    ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
               color='red', marker='.', s=1, label='predicted')

    plt.xlabel("Frames (Green target, Red predicted)")
    plt.ylabel("Gate State")
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data

stft.py (+140)

"""
BSD 3-Clause License

Copyright (c) 2017, Prem Seetharaman
All rights reserved.

* Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from audio_processing import window_sumsquare


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            # the window must fit inside the FFT frame before zero-padding
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(
            torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
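
An analysis/synthesis round trip; with the sum-square window correction the output should closely match the input away from the clip edges (a sketch, not repository code):

import torch
from stft import STFT

stft = STFT(filter_length=800, hop_length=200, win_length=800)
audio = torch.rand(1, 16000) * 2 - 1
magnitude, phase = stft.transform(audio)
reconstruction = stft.inverse(magnitude, phase).squeeze(1)
print(audio.shape, reconstruction.shape)   # both (1, 16000)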

train.py (+272)

import os
import time
import argparse
import math

import torch
from distributed import DistributedDataParallel
from torch.utils.data.distributed import DistributedSampler
from torch.nn import DataParallel
from torch.utils.data import DataLoader

from fp16_optimizer import FP16_Optimizer

from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams


def batchnorm_to_float(module):
    """Converts batch norm modules to FP32"""
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        module.float()
    for child in module.children():
        batchnorm_to_float(child)
    return module


def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
    rt /= num_gpus
    return rt


def init_distributed(hparams, n_gpus, rank, group_name):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing distributed")
    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    torch.distributed.init_process_group(
        backend=hparams.dist_backend, init_method=hparams.dist_url,
        world_size=n_gpus, rank=rank, group_name=group_name)

    print("Done initializing distributed")


def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None

    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn


def prepare_directories_and_logger(output_directory, log_directory, rank):
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        logger = Tacotron2Logger(os.path.join(output_directory, log_directory))
    else:
        logger = None
    return logger


def load_model(hparams):
    model = Tacotron2(hparams).cuda()
    model = batchnorm_to_float(model.half()) if hparams.fp16_run else model
    model = DistributedDataParallel(model) \
        if hparams.distributed_run else DataParallel(model)
    return model


def warm_start_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint_dict['state_dict'])
    return model


def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    learning_rate = checkpoint_dict['learning_rate']
    iteration = checkpoint_dict['iteration']
    print("Loaded checkpoint '{}' from iteration {}".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)


def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.module.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
                if distributed_run else loss.data[0]
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)

    model.train()
    return val_loss


def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    if hparams.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.module.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
                if hparams.distributed_run else loss.data[0]

            if hparams.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm(
                    model.module.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if hparams.fp16_run else False

            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))

                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration, iteration)

            if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
                reduced_val_loss = validate(
                    model, criterion, valset, iteration, hparams.batch_size,
                    n_gpus, collate_fn, logger, hparams.distributed_run, rank)

                if rank == 0:
                    print("Validation loss {}: {:9f}".format(
                        iteration, reduced_val_loss))
                    logger.log_validation(
                        reduced_val_loss, model, y, y_pred, iteration)
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output_directory', type=str,
                        help='directory to save checkpoints')
    parser.add_argument('-l', '--log_directory', type=str,
                        help='directory to save tensorboard logs')
    parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
                        required=False, help='checkpoint path')
    parser.add_argument('--warm_start', action='store_true',
                        help='load the model only (warm start)')
    parser.add_argument('--n_gpus', type=int, default=1,
                        required=False, help='number of gpus')
    parser.add_argument('--rank', type=int, default=0,
                        required=False, help='rank of current gpu')
    parser.add_argument('--group_name', type=str, default='group_name',
                        required=False, help='Distributed group name')
    parser.add_argument('--hparams', type=str,
                        required=False, help='comma separated name=value pairs')

    args = parser.parse_args()
    hparams = create_hparams(args.hparams)

    torch.backends.cudnn.enabled = hparams.cudnn_enabled
    torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

    print("FP16 Run:", hparams.fp16_run)
    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
    print("Distributed Run:", hparams.distributed_run)
    print("cuDNN Enabled:", hparams.cudnn_enabled)
    print("cuDNN Benchmark:", hparams.cudnn_benchmark)

    train(args.output_directory, args.log_directory, args.checkpoint_path,
          args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
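
Typical single-GPU launch, matching the argparse flags above (directory names are illustrative):

python train.py --output_directory=outdir --log_directory=logdir --hparams=batch_size=32

For multi-GPU runs, launch the same command through multiproc.py with distributed_run=True in --hparams.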

utils.py (+32)

import numpy as np
from scipy.io.wavfile import read
import torch


def get_mask_from_lengths(lengths):
    max_len = torch.max(lengths)
    ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).cuda()
    mask = (ids < lengths.unsqueeze(1)).byte()
    return mask


def load_wav_to_torch(full_path, sr):
    sampling_rate, data = read(full_path)
    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
        sr, sampling_rate, full_path)
    return torch.FloatTensor(data.astype(np.float32))


def load_filepaths_and_text(filename, sort_by_length, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]

    if sort_by_length:
        filepaths_and_text.sort(key=lambda x: len(x[1]))

    return filepaths_and_text


def to_gpu(x):
    # `async` became a reserved word in Python 3.7; PyTorch >= 0.4 accepts
    # the equivalent `non_blocking` keyword instead.
    x = x.contiguous().cuda(non_blocking=True)
    return torch.autograd.Variable(x)
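
A toy batch for get_mask_from_lengths (a sketch against the PyTorch 0.4-era API used throughout this commit; requires CUDA, since the function moves its index tensor to the GPU). mask[i, j] is 1 exactly where j < lengths[i]:

import torch
from utils import get_mask_from_lengths

lengths = torch.LongTensor([3, 1, 2]).cuda()
mask = get_mask_from_lengths(lengths)   # shape (3, 3), dtype uint8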
