-
Notifications
You must be signed in to change notification settings - Fork 278
numpy to keras ops #2126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
numpy to keras ops #2126
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,10 @@ | |
|
||
try: | ||
import tensorflow as tf | ||
import keras.ops as ops | ||
except ImportError: | ||
tf = None | ||
ops = None | ||
|
||
|
||
@keras_hub_export("keras_hub.layers.WhisperAudioConverter") | ||
|
@@ -84,6 +86,20 @@ def audio_shape(self): | |
"""Returns the preprocessed size of a single audio sample.""" | ||
return (self.max_audio_length, self.num_mels) | ||
|
||
|
||
|
||
def _get_rfftfreq_keras(self): # Inside the class definition | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is this comment supposed to mean? |
||
n = self.num_fft_bins | ||
d = 1.0 / self.sampling_rate | ||
|
||
if n % 2 == 0: | ||
freqs = ops.arange(0, n // 2 + 1, dtype=tf.float32) / (d * n) | ||
else: | ||
freqs = ops.arange(0, (n - 1) // 2 + 1, dtype=tf.float32) / (d * n) | ||
|
||
return freqs | ||
|
||
|
||
def _get_mel_filters(self): | ||
""" | ||
Adapted from Hugging Face | ||
|
@@ -92,25 +108,15 @@ def _get_mel_filters(self): | |
|
||
# TODO: Convert to TensorFlow ops (if possible). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should remove this TODO I think. |
||
|
||
dtype = np.float32 | ||
dtype = self.compute_dtype # Use the class's dtype | ||
# Initialize the weights | ||
weights = np.zeros( | ||
(self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype | ||
) | ||
|
||
weights = ops.zeros((self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype) | ||
# Center freqs of each FFT bin | ||
fftfreqs = np.fft.rfftfreq( | ||
n=self.num_fft_bins, d=1.0 / self.sampling_rate | ||
) | ||
|
||
fftfreqs = self._get_rfftfreq_keras() | ||
# 'Center freqs' of mel bands - uniformly spaced between limits | ||
min_mel = 0.0 | ||
max_mel = 45.245640471924965 | ||
|
||
mels = np.linspace(min_mel, max_mel, self.num_mels + 2) | ||
|
||
mels = np.asanyarray(mels) | ||
|
||
mels = ops.linspace(min_mel, max_mel, self.num_mels + 2) | ||
# Fill in the linear scale | ||
f_min = 0.0 | ||
f_sp = 200.0 / 3 | ||
|
@@ -119,33 +125,33 @@ def _get_mel_filters(self): | |
# And now the nonlinear scale | ||
min_log_hz = 1000.0 # beginning of log region (Hz) | ||
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) | ||
logstep = np.log(6.4) / 27.0 # step size for log region | ||
|
||
logstep = ops.log(6.4) / 27.0 # step size for log region | ||
# If we have vector data, vectorize | ||
log_t = mels >= min_log_mel | ||
freqs[log_t] = min_log_hz * np.exp( | ||
logstep * (mels[log_t] - min_log_mel) | ||
) | ||
|
||
freqs = ops.where(log_t, min_log_hz * ops.exp(logstep * (mels - min_log_mel)), freqs) # using tf.where for conditional replacement | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove the comment, and reformat to 80 chars |
||
mel_f = freqs | ||
|
||
fdiff = np.diff(mel_f) | ||
ramps = np.subtract.outer(mel_f, fftfreqs) | ||
fdiff = mel_f[1:] - mel_f[:-1] #keras diff. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove these comments |
||
ramps = ops.expand_dims(mel_f, axis=1) - fftfreqs #keras subtract outer | ||
|
||
weights_list = [] | ||
for i in range(self.num_mels): | ||
# lower and upper slopes for all bins | ||
lower = -ramps[i] / fdiff[i] | ||
upper = ramps[i + 2] / fdiff[i + 1] | ||
|
||
# .. then intersect them with each other and zero | ||
weights[i] = np.maximum(0, np.minimum(lower, upper)) | ||
weights_i = ops.maximum(0, ops.minimum(lower, upper)) | ||
weights_list.append(weights_i) | ||
|
||
weights = ops.stack(weights_list) | ||
|
||
# Slaney-style mel is scaled to be approx constant energy per channel | ||
enorm = 2.0 / (mel_f[2 : self.num_mels + 2] - mel_f[: self.num_mels]) | ||
weights *= enorm[:, np.newaxis] | ||
weights *= ops.expand_dims(enorm, axis=1) | ||
|
||
weights = np.transpose(weights) | ||
return tf.constant(weights, dtype=self.compute_dtype) | ||
weights = ops.transpose(weights) | ||
return weights | ||
|
||
def _extract_audio_features(self, audio): | ||
audio = tf.cast(audio, self.compute_dtype) | ||
|
@@ -242,4 +248,4 @@ def get_config(self): | |
"max_audio_length": self.max_audio_length, | ||
} | ||
) | ||
return config | ||
return config | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't remove the trailing newline. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No reason to import keras conditionally. We don't want to assume tf is installed, but we can assume keras is installed.