Skip to content

numpy to keras ops #2126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 33 additions & 27 deletions keras_hub/src/models/whisper/whisper_audio_converter.py
Original file line number Diff line number Diff line change
@@ -6,8 +6,10 @@

try:
import tensorflow as tf
import keras.ops as ops
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No reason to import keras conditionally. We don't want to assume tf is installed, but we can assume keras is installed.

except ImportError:
tf = None
ops = None


@keras_hub_export("keras_hub.layers.WhisperAudioConverter")
@@ -84,6 +86,20 @@ def audio_shape(self):
"""Returns the preprocessed size of a single audio sample."""
return (self.max_audio_length, self.num_mels)



def _get_rfftfreq_keras(self):
    """Return the center frequencies of the real-FFT bins.

    Backend-agnostic equivalent of ``np.fft.rfftfreq(n, d)`` with
    ``n = self.num_fft_bins`` and ``d = 1.0 / self.sampling_rate``.

    Returns:
        A 1-D float32 tensor of ``n // 2 + 1`` frequencies in Hz.
    """
    n = self.num_fft_bins
    d = 1.0 / self.sampling_rate
    # `np.fft.rfftfreq` yields n // 2 + 1 frequencies for both even and
    # odd n (for odd n, (n - 1) // 2 + 1 == n // 2 + 1), so no parity
    # branch is needed. Use a string dtype so the op stays
    # backend-agnostic instead of depending on `tf.float32`.
    return ops.arange(0, n // 2 + 1, dtype="float32") / (d * n)


def _get_mel_filters(self):
"""
Adapted from Hugging Face
@@ -92,25 +108,15 @@ def _get_mel_filters(self):

# TODO: Convert to TensorFlow ops (if possible).
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should remove this TODO I think.


dtype = np.float32
dtype = self.compute_dtype # Use the class's dtype
# Initialize the weights
weights = np.zeros(
(self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype
)

weights = ops.zeros(
    (self.num_mels, int(1 + self.num_fft_bins // 2)), dtype=dtype
)
# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(
n=self.num_fft_bins, d=1.0 / self.sampling_rate
)

fftfreqs = self._get_rfftfreq_keras()
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
max_mel = 45.245640471924965

mels = np.linspace(min_mel, max_mel, self.num_mels + 2)

mels = np.asanyarray(mels)

mels = ops.linspace(min_mel, max_mel, self.num_mels + 2)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
@@ -119,33 +125,33 @@ def _get_mel_filters(self):
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region

logstep = ops.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(
logstep * (mels[log_t] - min_log_mel)
)

freqs = ops.where(
    log_t,
    min_log_hz * ops.exp(logstep * (mels - min_log_mel)),
    freqs,
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the comment, and reformat to 80 chars

mel_f = freqs

fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
fdiff = mel_f[1:] - mel_f[:-1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove these comments

ramps = ops.expand_dims(mel_f, axis=1) - fftfreqs

weights_list = []
for i in range(self.num_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]

# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
weights_i = ops.maximum(0, ops.minimum(lower, upper))
weights_list.append(weights_i)

weights = ops.stack(weights_list)

# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : self.num_mels + 2] - mel_f[: self.num_mels])
weights *= enorm[:, np.newaxis]
weights *= ops.expand_dims(enorm, axis=1)

weights = np.transpose(weights)
return tf.constant(weights, dtype=self.compute_dtype)
weights = ops.transpose(weights)
return weights

def _extract_audio_features(self, audio):
audio = tf.cast(audio, self.compute_dtype)
@@ -242,4 +248,4 @@ def get_config(self):
"max_audio_length": self.max_audio_length,
}
)
return config
return config
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't remove trailing newline