From 2e25279e5382f591272f2b5d01286cc11d976f79 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 18 Jul 2025 19:57:20 +0000 Subject: [PATCH 01/27] Add torchcodec mock with wav loading and saving --- test/torchcodec/decoders.py | 17 +++++++++++++++++ test/torchcodec/encoders.py | 10 ++++++++++ 2 files changed, 27 insertions(+) create mode 100644 test/torchcodec/decoders.py create mode 100644 test/torchcodec/encoders.py diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py new file mode 100644 index 0000000000..94f2d8c8c1 --- /dev/null +++ b/test/torchcodec/decoders.py @@ -0,0 +1,17 @@ +import test.torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioDecoder: + def __init__(self, uri): + self.uri = uri + + def get_all_samples(self): + return wav_utils.load_wav(self.uri) + + +class AudioEncoder: + def __init__(self, data, sample_rate): + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py new file mode 100644 index 0000000000..5e9cc54968 --- /dev/null +++ b/test/torchcodec/encoders.py @@ -0,0 +1,10 @@ +import torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioEncoder: + def __init__(self, data, sample_rate): + print("BEING CALLED") + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) From a3002211592397a4a4aa507f7ebd0626bd125231 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 10:18:18 +0100 Subject: [PATCH 02/27] Let load and save rely on *_with_torchcodec --- src/torchaudio/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index e533cafe9d..1fde90b871 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -7,8 +7,6 @@ get_audio_backend as _get_audio_backend, info as _info, list_audio_backends as _list_audio_backends, - load, - save, set_audio_backend as _set_audio_backend, ) from ._torchcodec import load_with_torchcodec, save_with_torchcodec @@ -41,6 +39,13 @@ pass +def load(*args, **kwargs): + return load_with_torchcodec(*args, **kwargs) + +def save(*args, **kwargs): + return save_with_torchcodec(*args, **kwargs) + + __all__ = [ "AudioMetaData", "load", From 07e3b77f565d153ec3c8d6eb2cba3de93bd8c1dd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 13:49:53 +0100 Subject: [PATCH 03/27] install torchcodec in doc job --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92719d3abe1c206f8f3b0a6e3531a53e0ef30933 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 12 Aug 2025 19:53:00 +0000 Subject: [PATCH 04/27] Add docstring and arguments for load and save --- src/torchaudio/__init__.py | 177 ++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1fde90b871..ed4be65d6d 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -39,12 +39,181 @@ pass -def load(*args, **kwargs): - return load_with_torchcodec(*args, **kwargs) +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. -def save(*args, **kwargs): - return save_with_torchcodec(*args, **kwargs) + .. note:: + This function supports the same API as :func:`~torchaudio.load`, and + relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on + :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and + ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + This function supports the same API as :func:`~torchaudio.save`, and + relies on TorchCodec's encoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on + :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.save`, like ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by + are ignored by :func:`~torchaudio.save_with_torchcodec`. + + This function provides a TorchCodec-based alternative to torchaudio.save + with the same API. TorchCodec's AudioEncoder provides efficient encoding + with FFmpeg under the hood. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 4a98ee5f36552ead8e3cf6bf143f7b4484dd897c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 14:42:00 +0000 Subject: [PATCH 05/27] Revise docstring --- src/torchaudio/__init__.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ed4be65d6d..37d20a76aa 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -53,16 +53,13 @@ def load( .. note:: - This function supports the same API as :func:`~torchaudio.load`, and - relies on TorchCodec's decoding capabilities under the hood. It is + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioDecoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on - :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and - ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. Args: @@ -136,21 +133,14 @@ def save( .. note:: - This function supports the same API as :func:`~torchaudio.save`, and - relies on TorchCodec's encoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioEncoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on - :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.save`, like ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by - are ignored by :func:`~torchaudio.save_with_torchcodec`. - - This function provides a TorchCodec-based alternative to torchaudio.save - with the same API. TorchCodec's AudioEncoder provides efficient encoding - with FFmpeg under the hood. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. Args: uri (path-like object): From 7b02754b407e42cca822d3d2ce5e7eeb60d2b01f Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 15:13:14 +0000 Subject: [PATCH 06/27] Add typing imports --- src/torchaudio/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 37d20a76aa..60c8ceb7fe 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -1,4 +1,7 @@ from torchaudio._internal.module_utils import dropping_io_support, dropping_class_io_support +from typing import Union, BinaryIO, Optional, Tuple +import os +import torch # Initialize extension and backend first from . import _extension # noqa # usort: skip From 74edc0a8dbe942aae3f04924d1743f4da49800cb Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:00:40 +0000 Subject: [PATCH 07/27] Try ffmpeg>4 --- .github/scripts/unittest-linux/install.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..2163502b2e 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -86,8 +86,7 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -# On this CI, for whatever reason, we're only able to install ffmpeg 4. -conda install -y "ffmpeg<5" +conda install -y "ffmpeg>4" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 80f5eb7778afd5efc1a2c601583c84ffb5aa2401 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:22:24 +0000 Subject: [PATCH 08/27] Install conda deps before pip deps --- .github/scripts/unittest-linux/install.sh | 30 ++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 2163502b2e..6a347577d5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,20 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation -# 3. Install Test tools -printf "* Installing test tools\n" -conda install -y "ffmpeg>4" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -97,12 +84,27 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) + +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + + +# 2. Install torchaudio +conda install --quiet -y ninja cmake + +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. Install Test tools +printf "* Installing test tools\n" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" + # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 7f063a6ce08b442de93471f8891e88e65544e0b3 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 18:11:05 +0000 Subject: [PATCH 09/27] Add scipy hack for load and save --- src/torchaudio/__init__.py | 369 ++++++++++++++++++++----------------- 1 file changed, 203 insertions(+), 166 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 60c8ceb7fe..5910743607 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,6 +2,8 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch +from scipy.io import wavfile +import sys # Initialize extension and backend first from . import _extension # noqa # usort: skip @@ -41,172 +43,207 @@ except ImportError: pass - -def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. - """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - -def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, -) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. - - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) +# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack +# allows CI to build with ffmpeg4 and works around load/test bugginess. +if "pytest" in sys.modules: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + rate, data = wavfile.read(uri) + if data.ndim == 1: + data = data[:,None] + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset + num_frames] + if channels_first: + data = data.T + return data, rate + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ): + wavfile.write(uri, sample_rate, src.numpy()) +else: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 700c6c9b0a36efc2a8bdeb8c348a84707e67edff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:17:46 +0000 Subject: [PATCH 10/27] Only import scipy during testing --- .github/scripts/unittest-linux/install.sh | 1 - src/torchaudio/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 6a347577d5..e4fa67b1e5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -93,7 +93,6 @@ fi pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 5910743607..ca34b996cf 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,7 +2,6 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch -from scipy.io import wavfile import sys # Initialize extension and backend first @@ -46,6 +45,7 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. if "pytest" in sys.modules: + from scipy.io import wavfile def load( uri: Union[BinaryIO, str, os.PathLike], frame_offset: int = 0, From 6995b21ebacdb99f9952f6dead2b504284c63496 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:30 +0000 Subject: [PATCH 11/27] Revert "Install conda deps before pip deps" This reverts commit 80f5eb7778afd5efc1a2c601583c84ffb5aa2401. --- .github/scripts/unittest-linux/install.sh | 28 +++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index e4fa67b1e5..9f99fd1e98 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,19 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + +# 2. Install torchaudio +conda install --quiet -y ninja cmake +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. Install Test tools +printf "* Installing test tools\n" +conda install -y "ffmpeg>4" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -84,26 +96,12 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) - -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation - -# 3. Install Test tools -printf "* Installing test tools\n" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" - # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 4ab5993566d2109b53c92b9b494ea27be5a555b9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:35 +0000 Subject: [PATCH 12/27] Revert "Try ffmpeg>4" This reverts commit 74edc0a8dbe942aae3f04924d1743f4da49800cb. --- .github/scripts/unittest-linux/install.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 9f99fd1e98..15bf71e907 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -85,7 +85,8 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -conda install -y "ffmpeg>4" +# On this CI, for whatever reason, we're only able to install ffmpeg 4. +conda install -y "ffmpeg<5" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 43c460285b61eb4bc412005cad6536e3ac513a3b Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:53:21 +0000 Subject: [PATCH 13/27] Revert torchcodec installation changes --- .github/scripts/unittest-linux/install.sh | 1 + .github/workflows/build_docs.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 15bf71e907..a7ae9bfcf4 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -76,6 +76,7 @@ esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index f681e3b7ec..e92c556218 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From f74f00423ade5d7c2a1f426193533a0772a7d40e Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:00:05 +0000 Subject: [PATCH 14/27] Use existing wav_utils --- src/torchaudio/__init__.py | 24 +++++-------------- .../torchaudio/utils}/wav_utils.py | 0 .../common_utils/__init__.py | 2 +- 3 files changed, 7 insertions(+), 19 deletions(-) rename {test/torchaudio_unittest/common_utils => src/torchaudio/utils}/wav_utils.py (100%) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ca34b996cf..1ff3a530e4 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -45,28 +45,16 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. if "pytest" in sys.modules: - from scipy.io import wavfile + from torchaudio.utils import wav_utils def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, + uri: str, + normalize: bool = True, channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, ) -> Tuple[torch.Tensor, int]: - rate, data = wavfile.read(uri) - if data.ndim == 1: - data = data[:,None] - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset + num_frames] - if channels_first: - data = data.T - return data, rate + return wav_utils.load_wav(uri, normalize, channels_first) def save( - uri: Union[str, os.PathLike], + uri: str, src: torch.Tensor, sample_rate: int, channels_first: bool = True, @@ -77,7 +65,7 @@ def save( backend: Optional[str] = None, compression: Optional[Union[float, int]] = None, ): - wavfile.write(uri, sample_rate, src.numpy()) + wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) else: def load( uri: Union[BinaryIO, str, os.PathLike], diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py similarity index 100% rename from test/torchaudio_unittest/common_utils/wav_utils.py rename to src/torchaudio/utils/wav_utils.py diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 509d5208df..93ac7e0821 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: From 953fc6579960cb0339c41726e36e511aa31299c7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:55:08 +0000 Subject: [PATCH 15/27] Support frame_offset and num_frames in load hack --- src/torchaudio/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1ff3a530e4..592a2cbe6a 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -48,10 +48,18 @@ from torchaudio.utils import wav_utils def load( uri: str, + frame_offset: int = 0, + num_frames: int = -1, normalize: bool = True, channels_first: bool = True, ) -> Tuple[torch.Tensor, int]: - return wav_utils.load_wav(uri, normalize, channels_first) + data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset+num_frames] + if channels_first: + data = data.transpose(0, 1) + return data, sample_rate def save( uri: str, From dd3ff90799685c8a98565d959c9204fba1cd5097 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 01:03:46 +0000 Subject: [PATCH 16/27] Use rand instead of randn for test_save_channels_first --- test/torchaudio_unittest/test_load_save_torchcodec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 3edb4c423b..90fcc15689 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -227,9 +227,9 @@ def test_save_channels_first(channels_first): """Test channels_first parameter.""" # Create test data if channels_first: - waveform = torch.randn(2, 16000) # [channel, time] + waveform = torch.rand(2, 16000) # [channel, time] else: - waveform = torch.randn(16000, 2) # [time, channel] + waveform = torch.rand(16000, 2) # [time, channel] sample_rate = 16000 From c94e011ecc5a64f0a550034011157f6cdee34f2d Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 14:38:27 +0000 Subject: [PATCH 17/27] Remove pytest-aware code in src --- src/torchaudio/__init__.py | 364 +++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 198 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 592a2cbe6a..0c321c96d2 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -42,204 +42,172 @@ except ImportError: pass -# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack -# allows CI to build with ffmpeg4 and works around load/test bugginess. -if "pytest" in sys.modules: - from torchaudio.utils import wav_utils - def load( - uri: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - ) -> Tuple[torch.Tensor, int]: - data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset+num_frames] - if channels_first: - data = data.transpose(0, 1) - return data, sample_rate - - def save( - uri: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ): - wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) -else: - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. - """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - - def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. - - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) + +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. + """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From b622d8209299382dbd40d14adaa069cf217c0df4 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:08:06 +0000 Subject: [PATCH 18/27] Remove torchcodec version check --- .github/scripts/unittest-linux/install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..c8f47e63ab 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -88,7 +88,6 @@ pip install . -v --no-build-isolation printf "* Installing test tools\n" # On this CI, for whatever reason, we're only able to install ffmpeg 4. conda install -y "ffmpeg<5" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then From 93351a24194727341be4b203f6618c9baadbccc7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:58:18 +0000 Subject: [PATCH 19/27] Fix bugs in torchcodec mock --- test/conftest.py | 4 + .../common_utils/__init__.py | 2 +- .../common_utils/wav_utils.py | 92 +++++++++++++++++++ test/torchcodec/decoders.py | 17 ++-- test/torchcodec/encoders.py | 6 +- 5 files changed, 106 insertions(+), 15 deletions(-) create mode 100644 test/conftest.py create mode 100644 test/torchaudio_unittest/common_utils/wav_utils.py diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..35f7ae81ee --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,4 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).parent.resolve())) diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 93ac7e0821..509d5208df 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/test/torchaudio_unittest/common_utils/wav_utils.py new file mode 100644 index 0000000000..db15494dca --- /dev/null +++ b/test/torchaudio_unittest/common_utils/wav_utils.py @@ -0,0 +1,92 @@ +from typing import Optional + +import scipy.io.wavfile +import torch + + +def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype == torch.float32: + pass + elif tensor.dtype == torch.int32: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 2147483647.0 + tensor[tensor < 0] /= 2147483648.0 + elif tensor.dtype == torch.int16: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 32767.0 + tensor[tensor < 0] /= 32768.0 + elif tensor.dtype == torch.uint8: + tensor = tensor.to(torch.float32) - 128 + tensor[tensor > 0] /= 127.0 + tensor[tensor < 0] /= 128.0 + return tensor + + +def get_wav_data( + dtype: str, + num_channels: int, + *, + num_frames: Optional[int] = None, + normalize: bool = True, + channels_first: bool = True, +): + """Generate linear signal of the given dtype and num_channels + + Data range is + [-1.0, 1.0] for float32, + [-2147483648, 2147483647] for int32 + [-32768, 32767] for int16 + [0, 255] for uint8 + + num_frames allow to change the linear interpolation parameter. + Default values are 256 for uint8, else 1 << 16. + 1 << 16 as default is so that int16 value range is completely covered. + """ + dtype_ = getattr(torch, dtype) + + if num_frames is None: + if dtype == "uint8": + num_frames = 256 + else: + num_frames = 1 << 16 + + if dtype == "uint8": + base = torch.linspace(0, 255, num_frames, dtype=dtype_) + elif dtype == "int8": + base = torch.linspace(-128, 127, num_frames, dtype=dtype_) + elif dtype == "float32": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "float64": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "int32": + base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) + elif dtype == "int16": + base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) + else: + raise NotImplementedError(f"Unsupported dtype {dtype}") + data = base.repeat([num_channels, 1]) + if not channels_first: + data = data.transpose(1, 0) + if normalize: + data = normalize_wav(data) + return data + + +def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: + """Load wav file without torchaudio""" + sample_rate, data = scipy.io.wavfile.read(path) + data = torch.from_numpy(data.copy()) + if data.ndim == 1: + data = data.unsqueeze(1) + if normalize: + data = normalize_wav(data) + if channels_first: + data = data.transpose(1, 0) + return data, sample_rate + + +def save_wav(path, data, sample_rate, channels_first=True): + """Save wav file without torchaudio""" + if channels_first: + data = data.transpose(1, 0) + scipy.io.wavfile.write(path, sample_rate, data.numpy()) diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py index 94f2d8c8c1..8b2a7a3071 100644 --- a/test/torchcodec/decoders.py +++ b/test/torchcodec/decoders.py @@ -1,17 +1,12 @@ -import test.torchaudio_unittest.common_utils.wav_utils as wav_utils +import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioDecoder: def __init__(self, uri): self.uri = uri - - def get_all_samples(self): - return wav_utils.load_wav(self.uri) - - -class AudioEncoder: - def __init__(self, data, sample_rate): + data, sample_rate = wav_utils.load_wav(self.uri) + self.metadata = SimpleNamespace(sample_rate=sample_rate) self.data = data - self.sample_rate = sample_rate - def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + def get_all_samples(self): + return SimpleNamespace(data=self.data) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py index 5e9cc54968..cef6953824 100644 --- a/test/torchcodec/encoders.py +++ b/test/torchcodec/encoders.py @@ -1,10 +1,10 @@ import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioEncoder: def __init__(self, data, sample_rate): - print("BEING CALLED") self.data = data - self.sample_rate = sample_rate + self.metadata = SimpleNamespace(sample_rate=sample_rate) def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + return wav_utils.save_wav(uri, self.data, self.metadata.sample_rate) From 54071630c957e3eab5dc271f5e9bb5dd25e3d67c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:01:18 +0000 Subject: [PATCH 20/27] Skip test_load_save_torchcodec --- .../test_load_save_torchcodec.py | 152 +++++++++--------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 90fcc15689..28d316952e 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -12,6 +12,10 @@ from torchaudio import load_with_torchcodec, save_with_torchcodec from torchaudio_unittest.common_utils import get_asset_path +# Now, load/save_torchcodec are the same as torchaudio.load/save, so +# there is no need to test this. +pytest.skip() + def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" try: @@ -48,25 +52,25 @@ def test_basic_load(filename): # Skip problematic files on FFmpeg4 due to known compatibility issues if is_ffmpeg4() and filename != "sinewave.wav": pytest.skip("FFmpeg4 has known compatibility issues with some audio files") - + file_path = get_asset_path(*filename.split("/")) - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load(file_path) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec(file_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in decoders) torch.testing.assert_close(waveform_ta, waveform_tc) @@ -79,17 +83,17 @@ def test_basic_load(filename): def test_frame_offset_and_num_frames(frame_offset, num_frames): """Test frame_offset and num_frames parameters.""" file_path = get_asset_path("sinewave.wav") - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -98,21 +102,21 @@ def test_frame_offset_and_num_frames(frame_offset, num_frames): def test_channels_first(): """Test channels_first parameter.""" file_path = get_asset_path("sinewave.wav") # Use sinewave.wav for compatibility - + # Test channels_first=True (default) waveform_cf_true, sample_rate = load_with_torchcodec(file_path, channels_first=True) - + # Test channels_first=False waveform_cf_false, _ = load_with_torchcodec(file_path, channels_first=False) - + # Check that transpose relationship holds assert waveform_cf_true.shape == waveform_cf_false.transpose(0, 1).shape torch.testing.assert_close(waveform_cf_true, waveform_cf_false.transpose(0, 1)) - + # Compare with torchaudio waveform_ta_true, _ = torchaudio.load(file_path, channels_first=True) waveform_ta_false, _ = torchaudio.load(file_path, channels_first=False) - + assert waveform_cf_true.shape == waveform_ta_true.shape assert waveform_cf_false.shape == waveform_ta_false.shape torch.testing.assert_close(waveform_cf_true, waveform_ta_true) @@ -121,18 +125,18 @@ def test_channels_first(): def test_normalize_parameter_warning(): """Test that normalize=False produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="normalize=False.*ignored"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, normalize=False) - + # Result should still be float32 (normalized) assert waveform.dtype == torch.float32 def test_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, buffer_size=8192) @@ -141,7 +145,7 @@ def test_buffer_size_parameter_warning(): def test_backend_parameter_warning(): """Test that specifying backend produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, backend="ffmpeg") @@ -156,10 +160,10 @@ def test_invalid_file(): def test_format_parameter(): """Test that format parameter produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="format.*not supported"): waveform, sample_rate = load_with_torchcodec(file_path, format="wav") - + # Check basic properties assert waveform.dtype == torch.float32 assert sample_rate > 0 @@ -168,17 +172,17 @@ def test_format_parameter(): def test_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns() as warning_list: # This should produce multiple warnings waveform, sample_rate = load_with_torchcodec( - file_path, - normalize=False, - buffer_size=8192, + file_path, + normalize=False, + buffer_size=8192, backend="ffmpeg" ) - - + + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("normalize=False" in msg for msg in messages) @@ -194,30 +198,30 @@ def test_save_basic_save(filename): # Load a test file first file_path = get_asset_path(*filename.split("/")) waveform, sample_rate = torchaudio.load(file_path) - + with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in encoders) torch.testing.assert_close(waveform_ta, waveform_tc, atol=1e-3, rtol=1e-3) @@ -230,22 +234,22 @@ def test_save_channels_first(channels_first): waveform = torch.rand(2, 16000) # [channel, time] else: waveform = torch.rand(16000, 2) # [time, channel] - + sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate, channels_first=channels_first) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate, channels_first=channels_first) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -256,15 +260,15 @@ def test_save_compression_parameter(): """Test compression parameter (maps to bit_rate).""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test with compression (bit_rate) output_path = os.path.join(temp_dir, "output.wav") save_with_torchcodec(output_path, waveform, sample_rate, compression=128000) - + # Should not raise an error and file should exist assert os.path.exists(output_path) - + # Load back and check basic properties waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate @@ -275,13 +279,13 @@ def test_save_format_parameter_warning(): """Test that format parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="format.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, format="wav") - + # Should still work despite warning assert os.path.exists(output_path) @@ -290,13 +294,13 @@ def test_save_encoding_parameter_warning(): """Test that encoding parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="encoding.*not fully supported"): save_with_torchcodec(output_path, waveform, sample_rate, encoding="PCM_16") - + # Should still work despite warning assert os.path.exists(output_path) @@ -305,13 +309,13 @@ def test_save_bits_per_sample_parameter_warning(): """Test that bits_per_sample parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="bits_per_sample.*not directly supported"): save_with_torchcodec(output_path, waveform, sample_rate, bits_per_sample=16) - + # Should still work despite warning assert os.path.exists(output_path) @@ -320,13 +324,13 @@ def test_save_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, buffer_size=8192) - + # Should still work despite warning assert os.path.exists(output_path) @@ -335,13 +339,13 @@ def test_save_backend_parameter_warning(): """Test that specifying backend produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, backend="ffmpeg") - + # Should still work despite warning assert os.path.exists(output_path) @@ -350,16 +354,16 @@ def test_save_edge_cases(): """Test edge cases and error conditions.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # Test with very small waveform small_waveform = torch.randn(1, 10) save_with_torchcodec(output_path, small_waveform, sample_rate) waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate - + # Test with different sample rates for sr in [8000, 22050, 44100]: sr_path = os.path.join(temp_dir, f"output_{sr}.wav") @@ -372,19 +376,19 @@ def test_save_invalid_inputs(): """Test that invalid inputs raise appropriate errors.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # Test with invalid sample rate with pytest.raises(ValueError, match="sample_rate must be positive"): save_with_torchcodec(output_path, waveform, -1) - + # Test with invalid tensor dimensions with pytest.raises(ValueError, match="Expected 1D or 2D tensor"): invalid_waveform = torch.randn(1, 2, 16000) # 3D tensor save_with_torchcodec(output_path, invalid_waveform, sample_rate) - + # Test with non-tensor input with pytest.raises(ValueError, match="Expected src to be a torch.Tensor"): save_with_torchcodec(output_path, [1, 2, 3], sample_rate) @@ -394,14 +398,14 @@ def test_save_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns() as warning_list: save_with_torchcodec( - output_path, - waveform, + output_path, + waveform, sample_rate, format="wav", encoding="PCM_16", @@ -409,7 +413,7 @@ def test_save_multiple_warnings(): buffer_size=8192, backend="ffmpeg" ) - + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("format" in msg for msg in messages) @@ -417,7 +421,7 @@ def test_save_multiple_warnings(): assert any("bits_per_sample" in msg for msg in messages) assert any("buffer_size" in msg for msg in messages) assert any("backend" in msg for msg in messages) - + # Should still work despite warnings assert os.path.exists(output_path) @@ -426,17 +430,17 @@ def test_save_different_formats(): """Test saving to different audio formats.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test common formats formats = ["wav", "mp3", "flac"] - + for fmt in formats: output_path = os.path.join(temp_dir, f"output.{fmt}") try: save_with_torchcodec(output_path, waveform, sample_rate) assert os.path.exists(output_path) - + # Try to load back (may not work for all formats with all backends) try: waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) @@ -446,4 +450,4 @@ def test_save_different_formats(): pass except Exception as e: # Some formats might not be supported by torchcodec - pytest.skip(f"Format {fmt} not supported: {e}") \ No newline at end of file + pytest.skip(f"Format {fmt} not supported: {e}") From bd7eb5239badb3a4858c5820ff606bf691dcaeff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:33:48 +0000 Subject: [PATCH 21/27] Correct call to pytest skip --- test/torchaudio_unittest/test_load_save_torchcodec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 28d316952e..4a89123939 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -14,7 +14,7 @@ # Now, load/save_torchcodec are the same as torchaudio.load/save, so # there is no need to test this. -pytest.skip() +pytest.skip(allow_module_level=True) def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" From c3d0cc2bca81a9815e0592683347048562d33c16 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:57:21 +0000 Subject: [PATCH 22/27] Remove torchcodec installation --- .github/scripts/unittest-linux/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index c8f47e63ab..68ed032bbb 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" # 2. Install torchaudio From d10fc1925e38c5f1abec5753c5f11987e338e2e9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 15:57:04 +0000 Subject: [PATCH 23/27] Add torchcodec to build installation --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92fee5133bd585b43f96bcf3985a61806fee6f33 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 16:48:41 +0000 Subject: [PATCH 24/27] Remove redundant wav_utils --- src/torchaudio/utils/wav_utils.py | 92 ------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 src/torchaudio/utils/wav_utils.py diff --git a/src/torchaudio/utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py deleted file mode 100644 index db15494dca..0000000000 --- a/src/torchaudio/utils/wav_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional - -import scipy.io.wavfile -import torch - - -def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: - if tensor.dtype == torch.float32: - pass - elif tensor.dtype == torch.int32: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 2147483647.0 - tensor[tensor < 0] /= 2147483648.0 - elif tensor.dtype == torch.int16: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 32767.0 - tensor[tensor < 0] /= 32768.0 - elif tensor.dtype == torch.uint8: - tensor = tensor.to(torch.float32) - 128 - tensor[tensor > 0] /= 127.0 - tensor[tensor < 0] /= 128.0 - return tensor - - -def get_wav_data( - dtype: str, - num_channels: int, - *, - num_frames: Optional[int] = None, - normalize: bool = True, - channels_first: bool = True, -): - """Generate linear signal of the given dtype and num_channels - - Data range is - [-1.0, 1.0] for float32, - [-2147483648, 2147483647] for int32 - [-32768, 32767] for int16 - [0, 255] for uint8 - - num_frames allow to change the linear interpolation parameter. - Default values are 256 for uint8, else 1 << 16. - 1 << 16 as default is so that int16 value range is completely covered. - """ - dtype_ = getattr(torch, dtype) - - if num_frames is None: - if dtype == "uint8": - num_frames = 256 - else: - num_frames = 1 << 16 - - if dtype == "uint8": - base = torch.linspace(0, 255, num_frames, dtype=dtype_) - elif dtype == "int8": - base = torch.linspace(-128, 127, num_frames, dtype=dtype_) - elif dtype == "float32": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "float64": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "int32": - base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) - elif dtype == "int16": - base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) - else: - raise NotImplementedError(f"Unsupported dtype {dtype}") - data = base.repeat([num_channels, 1]) - if not channels_first: - data = data.transpose(1, 0) - if normalize: - data = normalize_wav(data) - return data - - -def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: - """Load wav file without torchaudio""" - sample_rate, data = scipy.io.wavfile.read(path) - data = torch.from_numpy(data.copy()) - if data.ndim == 1: - data = data.unsqueeze(1) - if normalize: - data = normalize_wav(data) - if channels_first: - data = data.transpose(1, 0) - return data, sample_rate - - -def save_wav(path, data, sample_rate, channels_first=True): - """Save wav file without torchaudio""" - if channels_first: - data = data.transpose(1, 0) - scipy.io.wavfile.write(path, sample_rate, data.numpy()) From 2646e59182bdbecf6f2edaccee71bdc208ff2ca8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Aug 2025 12:16:34 +0100 Subject: [PATCH 25/27] remove sys --- src/torchaudio/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 0c321c96d2..78f42e5cfb 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,7 +2,6 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch -import sys # Initialize extension and backend first from . import _extension # noqa # usort: skip From 6c43c04bfd3009ccd0461b1610e721b61c318c47 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Aug 2025 12:27:07 +0100 Subject: [PATCH 26/27] Add comments --- test/conftest.py | 19 +++++++++++++++++++ test/torchcodec/decoders.py | 3 +++ test/torchcodec/encoders.py | 3 +++ 3 files changed, 25 insertions(+) diff --git a/test/conftest.py b/test/conftest.py index 35f7ae81ee..3b16aab043 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,4 +1,23 @@ import sys from pathlib import Path +# Note: [TorchCodec test dependency mocking hack] +# We are adding the `test/` directory to the system path. This causes the +# `tests/torchcodec` folder to be importable, and in particular, this makes it +# possible to mock torchcodec utilities. E.g. executing: +# +# ``` +# from torchcodec.decoders import AudioDecoder +# ``` +# directly or indirectly when running the tests will effectively be loading the +# mocked `AudioDecoder` implemented in `test/torchcodec/decoders.py`, which +# relies on scipy instead of relying on torchcodec. +# +# So whenever `torchaudio.load()` is called from within the tests, it's the +# mocked scipy `AudioDecoder` that gets used. Ultimately, this allows us *not* +# to add torchcodec as a test dependency of torchaudio: we can just rely on +# scipy. +# +# This is VERY hacky and ideally we should implement a more robust way to mock +# torchcodec. sys.path.append(str(Path(__file__).parent.resolve())) diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py index 8b2a7a3071..0064be91d6 100644 --- a/test/torchcodec/decoders.py +++ b/test/torchcodec/decoders.py @@ -1,6 +1,9 @@ import torchaudio_unittest.common_utils.wav_utils as wav_utils from types import SimpleNamespace +# See corresponding [TorchCodec test dependency mocking hack] note in +# conftest.py + class AudioDecoder: def __init__(self, uri): self.uri = uri diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py index cef6953824..e6b0693018 100644 --- a/test/torchcodec/encoders.py +++ b/test/torchcodec/encoders.py @@ -1,6 +1,9 @@ import torchaudio_unittest.common_utils.wav_utils as wav_utils from types import SimpleNamespace +# See corresponding [TorchCodec test dependency mocking hack] note in +# conftest.py + class AudioEncoder: def __init__(self, data, sample_rate): self.data = data From 498ce4928bc7befb96e77970fc3e34e0bf6f9dac Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 18 Aug 2025 12:30:05 +0100 Subject: [PATCH 27/27] clarify comment --- test/torchaudio_unittest/test_load_save_torchcodec.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 4a89123939..9057e93811 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -12,8 +12,13 @@ from torchaudio import load_with_torchcodec, save_with_torchcodec from torchaudio_unittest.common_utils import get_asset_path -# Now, load/save_torchcodec are the same as torchaudio.load/save, so -# there is no need to test this. +# These tests were ran when `torchaudio.load()` and `torchaudio.save()` were +# still relying on their previous backends (ffmpeg, sox, soundfile). We needed +# to validate that the newly introduced `load_with_torchcodec()` and +# save_with_torchcodec() were matching their results. +# From 2.9, `load()` and `save()` now internally rely on `load_with_torchcodec()` and +# `save_with_torchcodec()` directly, so these tests are now redundant and we +# skip them unconditionally. pytest.skip(allow_module_level=True) def get_ffmpeg_version():