1
+ import torch
1
2
from torch import nn
2
3
import torch .nn .functional as F
3
4
import numpy as np
4
-
5
+ import os . path
5
6
6
7
def new_size_conv(size, kernel, stride=1, padding=0):
    """Return the spatial size produced by a conv layer.

    Standard convolution arithmetic:
    floor((size + 2*padding - (kernel - 1) - 1) / stride + 1).

    Args:
        size: input spatial size
        kernel: convolution kernel size
        stride: convolution stride (default 1)
        padding: zero padding on each side (default 0)
    """
    effective = size + 2 * padding - (kernel - 1) - 1
    return np.floor(effective / stride + 1)
@@ -290,7 +291,85 @@ def forward(self, x):
290
291
291
292
return out
292
293
293
-
294
+
295
class audio_cnn_block(nn.Module):
    """One downsampling stage for the audio CNN classifiers.

    Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4, 4); the pooling stage
    reduces the temporal length by a factor of 4.

    Args:
        n_input: input channels
        n_out: output channels
        kernel_size: convolution kernel size (padding is fixed at 1)
    """

    def __init__(self, n_input, n_out, kernel_size):
        super(audio_cnn_block, self).__init__()
        stages = [
            nn.Conv1d(n_input, n_out, kernel_size, padding=1),
            nn.BatchNorm1d(n_out),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=4),
        ]
        # Same attribute name and Sequential indices as before, so
        # state_dict keys are unchanged.
        self.cnn_block = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the block to a (batch, channels, length) tensor."""
        return self.cnn_block(x)
314
+
315
+
316
class audio_tiny_cnn(nn.Module):
    """Template for convolutional audio classifiers.

    Three conv blocks, one hidden fully-connected layer, and a linear
    classification head.
    """

    def __init__(self, cnn_sizes, n_hidden, kernel_size, n_classes):
        """
        Args:
            cnn_sizes: list of five sizes; [0..3] are the channel widths of
                the three conv blocks, [4] is the flattened feature size
                feeding the fully connected layer.
            n_hidden: hidden units in the first fully connected layer
            kernel_size: convolution kernel size
            n_classes: number of speakers to classify
        """
        super(audio_tiny_cnn, self).__init__()
        self.down_path = nn.ModuleList()
        # Channel widths are taken pairwise from cnn_sizes[0..3],
        # producing blocks (0->1), (1->2), (2->3).
        for n_in, n_out in zip(cnn_sizes[:3], cnn_sizes[1:4]):
            self.down_path.append(audio_cnn_block(n_in, n_out, kernel_size))
        self.fc = nn.Sequential(
            nn.Linear(cnn_sizes[4], n_hidden),
            nn.ReLU(),
        )
        self.out = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        """Run the conv stack, flatten, and classify."""
        for block in self.down_path:
            x = block(x)
        flat = x.view(x.size(0), -1)
        return self.out(self.fc(flat))
349
+
350
+
351
def MFCC_cnn_classifier(n_classes):
    """Build a speaker classifier that ingests MFCC features.

    Args:
        n_classes: number of speakers the classifier distinguishes.

    Returns:
        An audio_tiny_cnn with 20 input channels and a 512-unit hidden layer.
    """
    in_size = 20
    n_hidden = 512
    sizes_list = [in_size, 2 * in_size, 4 * in_size, 8 * in_size, 8 * in_size]
    # BUG FIX: the function previously ignored its n_classes argument and
    # hard-coded n_classes=125; pass the caller's value through.
    return audio_tiny_cnn(cnn_sizes=sizes_list, n_hidden=n_hidden,
                          kernel_size=3, n_classes=n_classes)
360
+
361
+
362
def ft_cnn_classifer(n_classes):
    """Build a speaker classifier that ingests |FFT| features.

    Args:
        n_classes: number of speakers the classifier distinguishes.

    Returns:
        An audio_tiny_cnn with 94 input channels and a 512-unit hidden layer.
    """
    in_size = 94
    n_hidden = 512
    sizes_list = [in_size, in_size, 2 * in_size, 4 * in_size, 14 * 4 * in_size]
    # BUG FIX: the function previously ignored its n_classes argument and
    # hard-coded n_classes=125; pass the caller's value through.
    return audio_tiny_cnn(cnn_sizes=sizes_list, n_hidden=n_hidden,
                          kernel_size=7, n_classes=n_classes)
371
+
372
+
294
373
def weights_init (m ):
295
374
if isinstance (m , nn .Conv2d ):
296
375
nn .init .kaiming_normal_ (m .weight , mode = 'fan_out' , nonlinearity = 'relu' )
@@ -302,4 +381,30 @@ def weights_init(m):
302
381
elif isinstance (m , nn .Linear ):
303
382
nn .init .xavier_normal_ (m .weight .data )
304
383
nn .init .constant_ (m .bias , 0 )
305
-
384
+
385
def save_checkpoint(model=None, optimizer=None, epoch=None,
                    data_descriptor=None, loss=None, accuracy=None, path='./',
                    filename='checkpoint', ext='.pth.tar'):
    """Serialize training state to ``<path>/<filename><ext>``.

    Args:
        model: module whose state_dict is saved (must not be None)
        optimizer: optimizer whose state_dict is saved (must not be None)
        epoch: epoch number recorded in the checkpoint
        data_descriptor: free-form tag describing the dataset
        loss: loss value recorded in the checkpoint
        accuracy: accuracy value recorded in the checkpoint
        path: destination directory
        filename: base file name
        ext: file extension
    """
    state = {
        'epoch': epoch,
        # NOTE(review): str(model.type) is the repr of the bound .type
        # method; load_checkpoint compares against the same expression,
        # so it works as an architecture tag for identical module reprs.
        'arch': str(model.type),
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'loss': loss,
        'accuracy': accuracy,
        'dataset': data_descriptor,
    }
    # ROBUSTNESS FIX: os.path.join works whether or not `path` ends with a
    # separator; the old `path + filename + ext` concatenation silently
    # produced a wrong file name for e.g. path='ckpts'.
    torch.save(state, os.path.join(path, filename + ext))
398
+
399
+
400
def load_checkpoint(model=None, optimizer=None, checkpoint=None):
    """Restore model (and optionally optimizer) state from a checkpoint file.

    Args:
        model: module to load weights into; its str(model.type) tag must
            match the 'arch' recorded in the checkpoint.
        optimizer: optional optimizer whose state is restored when given.
        checkpoint: path to a file written by save_checkpoint.

    Raises:
        AssertionError: if the file is missing or the architecture tag
            does not match (kept as asserts for backward compatibility).
    """
    assert os.path.isfile(checkpoint), 'Checkpoint not found, aborting load'
    chpt = torch.load(checkpoint)
    # Typo fixed in the message ('arquitecture') and the stray whitespace
    # from a backslash continuation inside the literal removed.
    assert str(model.type) == chpt['arch'], \
        'Model architecture mismatch, aborting load'
    model.load_state_dict(chpt['state_dict'])
    if optimizer is not None:
        # BUG FIX: load_state_dict is a method; the original subscripted it
        # (optimizer.load_state_dict['optimizer']), which raises
        # "TypeError: 'method' object is not subscriptable" on every call.
        optimizer.load_state_dict(chpt['optimizer'])
    print('Successfully loaded checkpoint\nDataset: %s\nEpoch: %s\nLoss: %s'
          '\nAccuracy: %s' % (chpt['dataset'], chpt['epoch'], chpt['loss'],
                              chpt['accuracy']))
0 commit comments